Source code for hicberg.plot

from os import getcwd
from os.path import join
from pathlib import Path

from itertools import product, combinations

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as plc
import matplotlib.gridspec as gridspec
from mpl_toolkits.axes_grid1 import make_axes_locatable

import cooler
import bioframe as bf
import pysam as ps

from hicberg.io import load_dictionary, load_cooler
from hicberg import logger


# Default file names of the intermediate files read by the plotting functions.
# Each name is joined to the output folder (`output_path / NAME`) before loading.
DIST_FRAG = "dist.frag.npy"  # per-chromosome entries whose lengths are used as fragment counts (plot_trans_ps)
XS = "xs.npy"  # per-chromosome x values of the pattern laws (plot_laws, plot_coverages)
COVERAGE = "coverage.npy"  # per-chromosome coverage tracks (plot_coverages)
D1D2 = "d1d2.npy"  # d1+d2 values histogrammed by plot_d1d2
UNCUTS = "uncuts.npy"  # per-chromosome "+-" event counts (plot_laws)
WEIRDS = "weirds.npy"  # per-chromosome "++/--" event counts (plot_laws)
LOOPS = "loops.npy"  # per-chromosome "-+" event counts (plot_laws)
TRANS_PS = "trans_ps.npy"  # NOTE(review): not referenced by the functions visible in this module
CLR = "unrescued_map.cool"  # contact map opened with load_cooler in plot_trans_ps
RESTRICTION_MAP = "restriction_map.npy"  # NOTE(review): not referenced here; plot_matrix takes the same name as a parameter default
DENSITY_MAP = "density_map.npy"  # density matrices keyed by chromosome couple (plot_density)


def plot_density(output_dir : str = None) -> None:
    """
    Draw one log10-scaled contact density heat map per chromosome couple.

    Parameters
    ----------
    output_dir : str, optional
        Folder that holds the density dictionary and receives the plots
        (one PDF per chromosome couple), by default None (current working directory).
    """

    output_path = Path(getcwd()) if output_dir is None else Path(output_dir)

    # The density maps are stored as a dictionary keyed by chromosome couple
    densities = load_dictionary(output_path / DENSITY_MAP)

    for couple, density in densities.items():

        palette = plt.get_cmap("seismic")
        # Non-finite cells (e.g. log10 of an empty cell) are painted black
        palette.set_bad(color="black")

        plt.figure(figsize=(10, 10))
        plt.imshow(np.log10(density), cmap=palette, vmin=-1, vmax=1)
        plt.title(f"Contact density for {couple}")
        plt.colorbar(fraction=0.046)

        target = output_path / f"density_{couple[0]}-{couple[1]}.pdf"
        plt.savefig(target, format="pdf")
        plt.close()

    logger.info(f"Saved plots of densities at : {output_path}")
def plot_benchmark(original_matrix : str = None, depleted_matrix : str = None, rescued_matrix : str = None, chromosomes : list[str] = None, output_dir : str = None) -> None:
    """
    Plot benchmark results (original, depleted and rescued matrices with the
    log10 ratio rescued / original). One plot per chromosome.

    Parameters
    ----------
    original_matrix : str, optional
        Path to the original matrix, relative to the output folder, by default None
    depleted_matrix : str, optional
        Path to the depleted matrix, relative to the output folder, by default None
    rescued_matrix : str, optional
        Path to the rescued matrix (re-attributed reads), relative to the output folder, by default None
    chromosomes : list[str], optional
        Chromosomes to plot. A single whitespace separated string is also accepted.
        By default None, in which case every chromosome of the original matrix is plotted.
    output_dir : str, optional
        Path to where to save plots, by default None (current working directory)

    Raises
    ------
    ValueError
        If one of the three matrix paths is not provided.
    FileNotFoundError
        If one of the three matrices does not exist.
    """

    output_path = Path(getcwd()) if output_dir is None else Path(output_dir)

    # Resolve against output_path (a Path), never against the raw output_dir,
    # which may be a plain string or None.
    matrix_paths = {}
    for label, file_name in (("Original", original_matrix), ("Depleted", depleted_matrix), ("Rescued", rescued_matrix)):
        if file_name is None:
            raise ValueError(f"{label} matrix path is required.")
        matrix_path = output_path / file_name
        if not matrix_path.is_file():
            raise FileNotFoundError(f"{label} matrix not found at {matrix_path}. Please provide a valid path.")
        matrix_paths[label] = matrix_path

    # Reload matrices
    original_clr = load_cooler(matrix_paths["Original"])
    depleted_clr = load_cooler(matrix_paths["Depleted"])
    rescued_clr = load_cooler(matrix_paths["Rescued"])

    if chromosomes is None:
        chromosomes = original_clr.chromnames
    elif isinstance(chromosomes, str):
        chromosomes = chromosomes.split()

    for chrm in chromosomes:

        ori_matrix = original_clr.matrix(balance=False).fetch(chrm)
        dep_matrix = depleted_clr.matrix(balance=False).fetch(chrm)
        res_matrix = rescued_clr.matrix(balance=False).fetch(chrm)

        # Cells that are empty in the original map get a ratio of 1 (log ratio of 0)
        ratio = np.divide(
            res_matrix,
            ori_matrix,
            out=np.ones(res_matrix.shape),
            where=ori_matrix != 0,
        )
        # Cells emptied by the rescue give log10(0) = -inf; silence the warning only
        with np.errstate(divide="ignore"):
            log_ratio = np.log10(ratio)

        # The three maps share the colour scale of the original one
        colour_max = np.max(ori_matrix ** 0.15)

        # TODO : Adjust log non log and exponent
        plt.figure(figsize=(10, 10))

        panels = (
            (221, ori_matrix, "Original"),
            (222, dep_matrix, "Depleted"),
            (223, res_matrix, "Rescued"),
        )
        for position, matrix, label in panels:
            plt.subplot(position)
            plt.imshow(matrix ** 0.15, cmap="afmhot_r", vmin=0, vmax=colour_max)
            plt.title(f"{label} map - {chrm}")

        plt.subplot(224)
        plt.imshow(log_ratio, cmap="bwr", vmin=-1, vmax=1)
        plt.title(f"Log ratio - {chrm}")
        plt.colorbar(fraction=0.046)

        plt.savefig(output_path / f"benchmark_{chrm}.pdf", format="pdf")
        plt.close()
def plot_d1d2(output_dir : str = None) -> None:
    """
    Plot the distribution of d1+d2 distances on log-log axes.

    Parameters
    ----------
    output_dir : str, optional
        Folder that holds the d1d2 file and receives the plot,
        by default None (current working directory).
    """

    output_path = Path(output_dir) if output_dir is not None else Path(getcwd())

    distances = load_dictionary(output_path / D1D2)

    # The number of bins equals the largest observed value
    counts, _ = np.histogram(distances, max(distances))

    plt.figure(figsize=(10, 10))
    plt.loglog(counts)
    plt.title("Log distribution of d1d2 distance")
    plt.xlabel("d1+d2")
    plt.ylabel("No. occurences")

    destination = output_path / f"d1d2.pdf"
    plt.savefig(destination, format="pdf")
    plt.close()

    logger.info(f"Saved plots of d1d2 at : {output_path}")
def plot_laws(output_dir : str = None) -> None:
    """
    Plot, for each chromosome, the weirds / uncuts / loops event counts
    against genomic distance on log-log axes.

    Parameters
    ----------
    output_dir : str, optional
        Folder that holds the law dictionaries and receives the plots
        (one PDF per chromosome), by default None (current working directory).
    """

    output_path = Path(output_dir) if output_dir is not None else Path(getcwd())

    # Reload the dictionaries; each one is indexed by chromosome
    xs = load_dictionary(output_path / XS)
    weirds = load_dictionary(output_path / WEIRDS)
    uncuts = load_dictionary(output_path / UNCUTS)
    loops = load_dictionary(output_path / LOOPS)

    series = ((weirds, "++/--"), (uncuts, "+-"), (loops, "-+"))

    for chromosome, distances in xs.items():

        plt.figure(figsize=(10, 10))
        for events, orientation in series:
            plt.loglog(distances, events[chromosome], "o", label=orientation)

        plt.title(f"Distribution of weirds, uncuts and loops events across {chromosome}")
        plt.xlabel("Logarithmic binned genomic distances")
        plt.ylabel("Number of events")
        plt.grid()
        plt.legend()

        destination = output_path / f"patterns_distribution_{chromosome}.pdf"
        plt.savefig(destination, format="pdf")
        plt.close()

    logger.info(f"Saved plots of patterns at : {output_path}")
def plot_trans_ps(output_dir : str = None) -> None:
    """
    Plot the pseudo P(s) heat map between chromosomes.

    For every ordered pair of chromosomes, the value is the total number of
    contacts found in the unrescued map between the two chromosomes, divided by
    four times the product of their fragment counts. The diagonal
    (a chromosome against itself) is masked with NaN.

    Parameters
    ----------
    output_dir : str, optional
        Folder that holds the fragment dictionary and the unrescued map and
        receives the plot, by default None (current working directory).
    """

    output_path = Path(getcwd()) if output_dir is None else Path(output_dir)

    # Reload inputs
    dist_frag = load_dictionary(output_path / DIST_FRAG)
    clr_unambiguous = load_cooler(output_path / CLR)

    chromosomes = list(dist_frag.keys())
    n_chromosomes = len(chromosomes)

    # The length of each entry is used as the number of fragments of the chromosome
    fragment_counts = {chrom: len(dist_frag.get(str(chrom))) for chrom in chromosomes}

    contacts = clr_unambiguous.matrix(balance=False)
    t_ps = np.zeros((n_chromosomes, n_chromosomes))

    for row, chrom_a in enumerate(chromosomes):
        for col, chrom_b in enumerate(chromosomes):
            all_interactions = contacts.fetch(chrom_a, chrom_b).sum()
            n_frags = fragment_counts[chrom_a] * fragment_counts[chrom_b]
            # NOTE(review): the factor 4 presumably stands for the four strand
            # orientation combinations of a read pair - confirm.
            t_ps[row, col] = np.divide(all_interactions, np.multiply(n_frags, 4))

    # Intra-chromosomal values are not part of the trans signal
    np.fill_diagonal(t_ps, np.nan)

    ticks = np.arange(n_chromosomes)

    plt.figure(figsize=(10, 10))
    plt.imshow(t_ps, cmap="Wistia", interpolation="None")
    plt.colorbar(fraction=0.046)
    plt.xticks(ticks, chromosomes, rotation="vertical")
    plt.yticks(ticks, chromosomes)
    plt.title("Pseudo P(s)")
    plt.savefig(output_path / "pseudo_ps.pdf", format="pdf")
    plt.close()

    logger.info(f"Saved pseudo P(s) of patterns at : {output_path}")
def plot_coverages(bins : int = 2000, output_dir : str = None ) -> None:
    """
    Plot the coverage track of every chromosome.

    Parameters
    ----------
    bins : int, optional
        Bin size in bp; only used in the plot title, by default 2000
    output_dir : str, optional
        Folder that holds the coverage dictionary and receives the plots
        (one PDF per chromosome), by default None (current working directory).
    """

    output_path = Path(output_dir) if output_dir is not None else Path(getcwd())

    # The keys of the xs dictionary define which chromosomes are drawn
    xs = load_dictionary(output_path / XS)
    coverage = load_dictionary(output_path / COVERAGE)

    for chromosome in xs.keys():

        track = coverage[chromosome]

        plt.figure()
        plt.plot(track, label="Covering smoothed")
        plt.title(f"Covering across {chromosome} - bins of {bins} bp")
        plt.xlabel(f"Bin number")
        plt.ylabel("Number of reads")
        plt.legend()
        plt.grid()

        destination = output_path / f"coverage_{chromosome}.pdf"
        plt.savefig(destination, format="pdf")
        plt.close()

    logger.info(f"Saved coverages at : {output_path}")
def plot_couple_repartition(forward_bam_file : str = "group2.1.rescued.bam", reverse_bam_file : str = "group2.2.rescued.bam", output_dir : str = None ) -> None:
    """
    Plot the distribution of the sizes of the sets of possible read couples.

    The two alignment files are read in lockstep; the size associated with a
    pair is the product of the ``XL`` tags of its forward and reverse reads.
    NOTE(review): ``XL`` presumably holds the number of candidate alignments of
    a read - confirm against the code that writes the tag.

    Parameters
    ----------
    forward_bam_file : str, optional
        Forward .bam alignment file, relative to the output folder, by default group2.1.rescued.bam
    reverse_bam_file : str, optional
        Reverse .bam alignment file, relative to the output folder, by default group2.2.rescued.bam
    output_dir : str, optional
        Path to the folder where to save the plot, by default None (current working directory)

    Raises
    ------
    ValueError
        If the alignment files contain no read pair.
    """

    output_path = Path(getcwd()) if output_dir is None else Path(output_dir)

    forward_path = output_path / forward_bam_file
    reverse_path = output_path / reverse_bam_file

    # Context managers guarantee both handles are closed, even on error
    with ps.AlignmentFile(forward_path, "rb") as forward_handler, ps.AlignmentFile(reverse_path, "rb") as reverse_handler:
        couple_sizes = [
            forward_read.get_tag("XL") * reverse_read.get_tag("XL")
            for forward_read, reverse_read in zip(forward_handler, reverse_handler)
        ]

    if not couple_sizes:
        raise ValueError(f"No read couple found in {forward_path} and {reverse_path}.")

    # One histogram bin per possible size up to the largest observed one
    counts, _ = np.histogram(couple_sizes, bins=max(couple_sizes))
    peak = counts.max()
    percentile_99 = np.percentile(couple_sizes, 99)

    markers = (
        (np.mean(couple_sizes), "red", "mean"),
        (np.median(couple_sizes), "green", "median"),
        (percentile_99, "purple", "99 percentile"),
    )

    plt.figure()
    for position, color, label in markers:
        plt.vlines(
            x=position,
            ymin=0,
            ymax=peak,
            color=color,
            label=label,
            linestyles="dashed",
        )

    plt.loglog(counts)
    # Cut the long tail one standard deviation after the 99th percentile
    plt.xlim((2, (percentile_99 + np.std(couple_sizes)).astype(int)))
    plt.xlabel("Size of the set of possible couple")
    plt.ylabel("Number of couples")
    plt.title("Distribution of set of potential couple sizes")
    plt.legend()
    plt.savefig(
        output_path / "Couple_sizes_distribution.pdf",
        format="pdf",
    )
    plt.close()

    logger.info(f"Saved couple size distribution at : {output_path}")
def plot_matrix(unrescued_matrix : str = "unrescued_map.cool", rescued_matrix : str = "rescued_map.cool", restriction_map : str = "restriction_map.npy", genome : str = "", vmin : float = 0.0, vmax : float = 3.5, bins : int = 2000, output_dir : str = None) -> None:
    """
    Plot, for each chromosome, the unrescued and rescued maps side by side
    with coverage and GC content tracks underneath.

    Parameters
    ----------
    unrescued_matrix : str, optional
        Unrescued map file, relative to the output folder, by default unrescued_map.cool
    rescued_matrix : str, optional
        Rescued map file, relative to the output folder, by default rescued_map.cool
    restriction_map : str, optional
        Restriction map file, relative to the output folder, by default restriction_map.npy.
        The file is loaded but not used by the plots.
    genome : str, optional
        Path to the genome (FASTA) used to compute the GC content, by default ""
    vmin : float, optional
        Inferior limit for the colorscale (log10 of raw counts), by default 0.0
    vmax : float, optional
        Superior limit for the colorscale (log10 of raw counts), by default 3.5
    bins : int, optional
        Kept for backward compatibility and currently unused: the bin size shown
        in the titles is read from the .cool files, by default 2000
    output_dir : str, optional
        Path to the folder where to save the plots, by default None (current working directory)
    """

    output_path = Path(getcwd()) if output_dir is None else Path(output_dir)

    # Get the matrices
    unrescued_matrix = load_cooler(output_path / unrescued_matrix)
    rescued_matrix = load_cooler(output_path / rescued_matrix)

    genome_file = bf.load_fasta(genome, engine="pysam")

    # NOTE(review): the restriction map is never used below; the load is kept
    # so that a missing file still fails exactly as before.
    restriction_map = load_dictionary(output_path / restriction_map)

    # Named bin_table so that the `bins` parameter is no longer shadowed
    bin_table = unrescued_matrix.bins()[:]
    gc_cov = bf.frac_gc(bin_table[["chrom", "start", "end"]], genome_file)

    for i in rescued_matrix.chromnames:

        # First and last bin index of the chromosome in the genome-wide bin table
        lower, upper = rescued_matrix.extent(str(i))

        # Fetch each map once per chromosome and reuse it for coverage and display
        unrescued_map = unrescued_matrix.matrix(balance=False).fetch(i)
        rescued_map = rescued_matrix.matrix(balance=False).fetch(i)

        # Coverage is the row sum of the lower triangle of the contact map
        coverage_unrescued = np.sum(np.tril(unrescued_map), axis=1)
        median_coverage = np.repeat(np.median(coverage_unrescued), coverage_unrescued.shape[0])
        coverage_rescued = np.sum(np.tril(rescued_map), axis=1)

        gc_track = list(gc_cov["GC"][lower:upper])

        # Empty cells give log10(0) = -inf; silence the warning only
        with np.errstate(divide="ignore"):
            log_unrescued = np.log10(unrescued_map)
            log_rescued = np.log10(rescued_map)

        # Plot the matrix
        fig = plt.figure(figsize=(20, 20))
        gs = gridspec.GridSpec(2, 2, height_ratios=[10, 1], width_ratios=[1, 1])

        # Unrescued map
        ax1 = plt.subplot(gs[0])
        divider1 = make_axes_locatable(ax1)
        cax1 = divider1.append_axes("right", size="5%", pad=0.1)
        im_unrescued = ax1.imshow(
            log_unrescued,
            vmin=vmin,
            vmax=vmax,
            cmap="afmhot_r",
        )
        fig.colorbar(im_unrescued, cax=cax1, label="corrected frequencies")
        ax1.set_title(
            f"Unrescued map of chromosome {i} \n binned at {int(unrescued_matrix.binsize / 1000 )}kb",
            loc="center",
        )

        # Synchronize rescued and unrescued parts
        ax2 = plt.subplot(gs[1], sharex=ax1, sharey=ax1)

        # Rescued map
        divider2 = make_axes_locatable(ax2)
        cax2 = divider2.append_axes("right", size="5%", pad=0.1)
        im_rescued = ax2.imshow(
            log_rescued,
            vmin=vmin,
            vmax=vmax,
            cmap="afmhot_r",
        )
        fig.colorbar(im_rescued, cax=cax2, label="corrected frequencies")
        ax2.set_title(
            f"Rescued map of chromosome {i} \n binned at {int(rescued_matrix.binsize / 1000 ) }kb",
            loc="center",
        )

        # Tracks below the unrescued map
        ax3 = divider1.append_axes("bottom", size="15%", pad=0.5, sharex=ax1)
        ax3.plot(coverage_unrescued)
        ax3.plot(median_coverage, linestyle='--', color='black')
        ax3.set_ylabel("Coverage")
        ax3.set_xticks([])
        ax3.set_title('Natural coverage')

        ax4 = divider1.append_axes("bottom", size="15%", pad=0.5, sharex=ax1)
        ax4.plot(gc_track, color="purple")
        ax4.set_ylabel("GC Content")

        # Tracks below the rescued map
        ax5 = divider2.append_axes("bottom", size="15%", pad=0.5, sharex=ax2)
        ax5.plot(coverage_unrescued, label="Before HiC-BERG")
        ax5.plot(coverage_rescued, label="After HiC-BERG")
        ax5.plot(median_coverage, linestyle='--', color='black')
        ax5.set_title('Enhanced coverage')
        ax5.set_xlim([0, len(unrescued_matrix.bins().fetch(str(i)))])
        ax5.set_ylabel("Coverage")
        ax5.legend(loc="center left", bbox_to_anchor=(1, 0.5))
        ax5.set_xticks([])

        ax6 = divider2.append_axes("bottom", size="15%", pad=0.5, sharex=ax2)
        ax6.plot(gc_track, color="purple")
        ax6.set_ylabel("GC Content")

        plt.savefig(
            output_path / f"chr_{i}.pdf",
            format="pdf",
        )
        plt.close()
def plot_pattern_reconstruction(table : pd.DataFrame = None, original_cool : str = None, rescued_cool : str = None, chromosome : str = None, threshold : float = 0.0, case : str = "", output_dir : str = None) -> None:
    """
    Create a plot of pattern reconstruction quality.

    The original map of the chromosome is drawn (counts raised to the power
    0.15, colour scale capped by the maximum of the rescued map), and the
    patterns of `table` are overlaid as circles coloured by their score.

    Parameters
    ----------
    table : pd.DataFrame, optional
        Table containing either true positives, false positives or false negatives patterns.
        The columns `start1`, `start2` and `score` are read. By default None,
        in which case only the map is drawn.
    original_cool : str, optional
        Path to the original matrix in .cool format, by default None
    rescued_cool : str, optional
        Path to the rescued matrix in .cool format, by default None
    chromosome : str, optional
        Selected chromosome, by default None
    threshold : float, optional
        Threshold for pattern score significance; only shown in the colorbar label, by default 0.0
    case : str, optional
        Mode to consider, either true positives, false positives or false negatives, by default ""
    output_dir : str, optional
        Path to save plots, by default None (current working directory)
    """

    output_path = Path(getcwd()) if output_dir is None else Path(output_dir)

    # Open the original cooler once and reuse it for the matrix and the bin size
    original_clr = load_cooler(original_cool)
    original_map = original_clr.matrix(balance=False).fetch(chromosome)
    rescued_map = load_cooler(rescued_cool).matrix(balance=False).fetch(chromosome)
    bin_size = original_clr.info['bin-size']

    fig, ax = plt.subplots()
    ax.set_title(f"Reconstructed pattern {chromosome}\n {case}")

    # Use imshow to add the first set of data to the plot
    img1 = ax.imshow(original_map ** 0.15, cmap='afmhot_r', vmin=0, vmax=np.max(rescued_map ** 0.15))

    # Create a divider for the existing axes instance
    divider = make_axes_locatable(ax)

    # Append axes to the right of the main axes and add the map colorbar
    cax1 = divider.append_axes("right", size="5%", pad=0.1)
    fig.colorbar(img1, cax=cax1)

    if table is not None:

        colormap = plt.colormaps['Blues']  # 'plasma' or 'viridis'
        norm = plc.Normalize(vmin=0.0, vmax=1.0)
        colors = colormap(table['score'])

        # Pattern coordinates are converted from genomic positions to bin indices
        ax.scatter(x=table['start1'] // bin_size, y=table['start2'] // bin_size, s=40, linewidth=2, color='none', edgecolors=colors)

        sm = plt.cm.ScalarMappable(norm=norm, cmap=colormap)

        # Append axes to the bottom of the main axes and add the score colorbar
        cax2 = divider.append_axes("bottom", size="5%", pad=0.4)
        cbar2 = fig.colorbar(sm, cax=cax2, orientation='horizontal')
        cbar2.set_label(f'Pattern score - threshold : {threshold}')

    fig.savefig(str(output_path / f"pattern_{case.replace(' ', '')}_{chromosome}.pdf"), format="pdf")

    # Release the figure, as the other plotting functions of this module do
    plt.close(fig)