Source code for hicberg.io

import logging
import uuid
from os import getcwd, mkdir
from os.path import join
from pathlib import Path

from glob import glob
from shutil import rmtree

import subprocess as sp

import numpy as np
import cooler
import pysam as ps

from hicberg import logger

def create_folder(sample_name : str = None, output_dir : str = None, force : bool = False) -> str:
    """
    Create the folder architecture to store results and intermediate files for the full HiC-BERG pipeline.

    Parameters
    ----------
    sample_name : str
        Name of the folder to be created.
    output_dir : str
        Path where the folder will be created.
    force : bool
        Set whether an existing folder has to be deleted before folder creation.

    Returns
    -------
    str
        Path of the created folder.
    """
    logger.info(f"Creating folder {sample_name} in {output_dir}")

    if sample_name is None:
        sample_name = "sample"

    if output_dir is None:
        folder_path = Path(getcwd(), sample_name)
    else:
        folder_path = Path(output_dir, sample_name)

    if folder_path.exists() and force:
        rmtree(folder_path)

    mkdir(folder_path)
    mkdir(folder_path / "index")
    mkdir(folder_path / "alignments")
    mkdir(folder_path / "statistics")
    mkdir(folder_path / "contacts")
    mkdir(folder_path / "contacts" / "matrices")
    mkdir(folder_path / "contacts" / "pairs")
    mkdir(folder_path / "plots")
    mkdir(folder_path / "plots" / "contact_maps")
    mkdir(folder_path / "plots" / "densities")
    mkdir(folder_path / "plots" / "ps")
    mkdir(folder_path / "plots" / "coverages")

    logger.info(f"Folder {sample_name} in {folder_path} created.")

    return folder_path.as_posix()
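A minimal usage sketch (the sample name and output directory below are hypothetical):

    # Create the result tree for a sample called "sample1"; force=True wipes
    # any previous run. Returns the created path as a POSIX string.
    folder = create_folder(sample_name="sample1", output_dir="/tmp/results", force=True)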
def build_pairs(bam_for : str = "group1.1.bam", bam_rev : str = "group1.2.bam", bam_for_rescued : str = "group2.1.rescued.bam", bam_rev_rescued : str = "group2.2.rescued.bam", mode : bool = False, output_dir : str = None) -> None:
    """
    Build pairs of reads from the aligned reads.

    Parameters
    ----------
    bam_for : str, optional
        Path to the forward .bam file used to build the .pairs equivalent file (non-rescued), by default "group1.1.bam"
    bam_rev : str, optional
        Path to the reverse .bam file used to build the .pairs equivalent file (non-rescued), by default "group1.2.bam"
    bam_for_rescued : str, optional
        Path to the forward .bam file used to build the .pairs equivalent file (rescued), by default "group2.1.rescued.bam"
    bam_rev_rescued : str, optional
        Path to the reverse .bam file used to build the .pairs equivalent file (rescued), by default "group2.2.rescued.bam"
    mode : bool, optional
        Choose whether the mode is rescued or non-rescued to build the associated .pairs file, by default False
    output_dir : str, optional
        Path where the alignment files (.sam) are stored, by default None
    """
    output_path = Path(output_dir)

    if not output_path.exists():
        raise ValueError(f"Output path {output_path} does not exist. Please provide an existing output path.")

    chromosome_sizes = load_dictionary(output_path / "chromosome_sizes.npy")

    if not mode:
        logger.info("Start building pairs file for unambiguously aligned reads")

        bam_for_path = output_path / bam_for
        bam_rev_path = output_path / bam_rev

        if not bam_for_path.exists():
            raise ValueError(f"Forward bam file {bam_for} not found")
        if not bam_rev_path.exists():
            raise ValueError(f"Reverse bam file {bam_rev} not found")

        bam_for_handler = ps.AlignmentFile(bam_for_path, "rb")
        bam_rev_handler = ps.AlignmentFile(bam_rev_path, "rb")

        with open(output_path / "group1.pairs", "w") as f_out:
            # Header: pairs format version, column layout, and chromosome sizes.
            f_out.write("## pairs format v1.0\n")
            f_out.write("#columns: readID chr1 pos1 chr2 pos2 strand1 strand2\n")
            for chromosome, size in chromosome_sizes.items():
                f_out.write(f"#chromsize: {chromosome} {size}\n")

            for forward_read, reverse_read in zip(bam_for_handler, bam_rev_handler):
                if forward_read.query_name != reverse_read.query_name:
                    raise ValueError("Forward and reverse reads do not match. Please check the bam files.")
                # Flags 0 and 256 are primary and secondary alignments on the forward strand.
                strand_for = "+" if forward_read.flag in (0, 256) else "-"
                strand_rev = "+" if reverse_read.flag in (0, 256) else "-"
                f_out.write(f"{forward_read.query_name}\t{forward_read.reference_name}\t{forward_read.pos}\t{reverse_read.reference_name}\t{reverse_read.pos}\t{strand_for}\t{strand_rev}\n")

        bam_for_handler.close()
        bam_rev_handler.close()

    else:
        logger.info("Start building pairs file for ambiguously aligned reads")

        bam_for_path = output_path / bam_for
        bam_rev_path = output_path / bam_rev
        bam_for_path_rescued = output_path / bam_for_rescued
        bam_rev_path_rescued = output_path / bam_rev_rescued

        if not bam_for_path.exists():
            raise ValueError(f"Forward bam file {bam_for} not found")
        if not bam_rev_path.exists():
            raise ValueError(f"Reverse bam file {bam_rev} not found")
        if not bam_for_path_rescued.exists():
            raise ValueError(f"Forward rescued bam file {bam_for_rescued} not found")
        if not bam_rev_path_rescued.exists():
            raise ValueError(f"Reverse rescued bam file {bam_rev_rescued} not found")

        bam_for_handler = ps.AlignmentFile(bam_for_path, "rb")
        bam_rev_handler = ps.AlignmentFile(bam_rev_path, "rb")
        bam_for_handler_rescued = ps.AlignmentFile(bam_for_path_rescued, "rb")
        bam_rev_handler_rescued = ps.AlignmentFile(bam_rev_path_rescued, "rb")

        with open(output_path / "all_group.pairs", "w") as f_out:
            f_out.write("## pairs format v1.0\n")
            f_out.write("#columns: readID chr1 pos1 chr2 pos2 strand1 strand2\n")
            for chromosome, size in chromosome_sizes.items():
                f_out.write(f"#chromsize: {chromosome} {size}\n")

            # Write unambiguously aligned pairs first, then the rescued pairs.
            for handlers in ((bam_for_handler, bam_rev_handler), (bam_for_handler_rescued, bam_rev_handler_rescued)):
                for forward_read, reverse_read in zip(*handlers):
                    if forward_read.query_name != reverse_read.query_name:
                        raise ValueError("Forward and reverse reads do not match. Please check the bam files.")
                    strand_for = "+" if forward_read.flag in (0, 256) else "-"
                    strand_rev = "+" if reverse_read.flag in (0, 256) else "-"
                    f_out.write(f"{forward_read.query_name}\t{forward_read.reference_name}\t{forward_read.pos}\t{reverse_read.reference_name}\t{reverse_read.pos}\t{strand_for}\t{strand_rev}\n")

        bam_for_handler.close()
        bam_rev_handler.close()
        bam_for_handler_rescued.close()
        bam_rev_handler_rescued.close()

    logger.info(f"Pairs file successfully created in {output_path}")
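For reference, the resulting .pairs file begins with the format header, one #chromsize line per chromosome, then one tab-separated record per read pair. A short excerpt with made-up read names and coordinates:

    ## pairs format v1.0
    #columns: readID chr1 pos1 chr2 pos2 strand1 strand2
    #chromsize: chr1 230218
    #chromsize: chr2 813184
    read_001	chr1	1024	chr2	56842	+	-
    read_002	chr2	77301	chr2	80112	+	+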
def build_matrix(bins : str = "fragments_fixed_sizes.txt", pairs : str = "group1.pairs", mode : bool = False, balance : bool = False, cpus : int = 8, output_dir : str = None) -> None:
    """
    Take a table of bins and a .pairs file and build a matrix in .cool format.

    Parameters
    ----------
    bins : str
        Path to the bin file, by default "fragments_fixed_sizes.txt"
    pairs : str, optional
        Path to the pairs file, by default "group1.pairs"
    mode : bool, optional
        Choose whether the mode is rescued or non-rescued to build the associated .cool file, by default False
    balance : bool, optional
        Set whether or not to balance the matrix, by default False
    cpus : int, optional
        Number of cpus to use for balancing, by default 8
    output_dir : str, optional
        Path to the folder where to save the cooler matrix file, by default None
    """
    output_path = Path(output_dir)

    if not output_path.exists():
        raise ValueError(f"Output path {output_path} does not exist. Please provide an existing output path.")

    pairs_path = output_path / pairs

    if not pairs_path.is_file():
        raise ValueError(f"Pairs file {pairs_path} not found. Please provide an existing pairs file.")

    bins_path = output_path / bins

    if not bins_path.is_file():
        raise ValueError(f"Bins file {bins_path} not found. Please provide an existing bins file.")

    if not mode:
        cool_path = output_path / "unrescued_map.cool"
    else:
        pairs_path = output_path / "all_group.pairs"
        cool_path = output_path / "rescued_map.cool"

    # Column indexes (1-based) follow the layout written by build_pairs:
    # readID chr1 pos1 chr2 pos2 strand1 strand2
    cooler_cmd = f"cooler cload pairs --zero-based -c1 2 -p1 3 -c2 4 -p2 5 {bins_path} {pairs_path} {cool_path}"
    balance_cmd = f"cooler balance --nproc {cpus} {cool_path}"

    sp.run(cooler_cmd, shell=True)

    if balance:
        sp.run(balance_cmd, shell=True)

    logger.info(f"Cooler matrix successfully created in {output_path}")
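A minimal usage sketch of the non-rescued path, assuming group1.pairs and the fragments_fixed_sizes.txt bin table already sit in the output folder (paths hypothetical):

    # Builds /tmp/results/sample1/unrescued_map.cool, then balances it in place.
    build_matrix(bins="fragments_fixed_sizes.txt", pairs="group1.pairs",
                 mode=False, balance=True, cpus=4, output_dir="/tmp/results/sample1")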
def load_dictionary(dictionary : str = None) -> dict:
    """
    Load a dictionary saved in numpy (.npy) format.

    Parameters
    ----------
    dictionary : str, optional
        Path to the dictionary saved in numpy (.npy) format, by default None

    Returns
    -------
    dict
        Python native dictionary
    """
    try:
        # A dict saved with np.save is wrapped in a 0-d object array; .item() unwraps it.
        return np.load(dictionary, allow_pickle=True).item()
    except ValueError:
        # Fall back to the raw array when the file does not wrap a single object.
        return np.load(dictionary, allow_pickle=True)
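The .npy dictionaries consumed by the pipeline are plain Python dicts wrapped by numpy. A round-trip sketch with a hypothetical file name:

    import numpy as np

    # np.save wraps the dict in a 0-d object array; load_dictionary unwraps it.
    np.save("/tmp/chromosome_sizes.npy", {"chr1": 230218, "chr2": 813184})
    sizes = load_dictionary("/tmp/chromosome_sizes.npy")
    # sizes == {"chr1": 230218, "chr2": 813184}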
def load_cooler(matrix : str = None) -> cooler.Cooler:
    """
    Load a cooler matrix.

    Parameters
    ----------
    matrix : str, optional
        Path to a cooler matrix, by default None

    Returns
    -------
    cooler.Cooler
        Cooler matrix object
    """
    # Accept both str and pathlib.Path inputs; cooler.Cooler expects a string URI.
    return cooler.Cooler(Path(matrix).as_posix())
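A usage sketch (matrix path and chromosome name hypothetical):

    clr = load_cooler("/tmp/results/sample1/unrescued_map.cool")
    # Fetch the raw (unbalanced) contact matrix of one chromosome as a numpy array.
    mat = clr.matrix(balance=False).fetch("chr1")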
def merge_predictions(output_dir : str = None, clean : bool = True, stage : str = "prediction", cpus : int = 1) -> None:
    """
    Merge predictions of all chunks of ambiguous reads predictions.

    Parameters
    ----------
    output_dir : str, optional
        Path to the folder where to save the fused alignment file, by default None
    clean : bool, optional
        Set whether or not to remove temporary chunks, by default True
    stage : str, optional
        Stage of the merging, by default "prediction". Can be "prediction", "classification" or "benchmark"
    cpus : int, optional
        Number of cpus to use for the merging, by default 1
    """
    logger.info("Start merging predictions")

    if output_dir is None:
        output_path = Path(getcwd())
    else:
        output_path = Path(output_dir)

    if stage == "prediction":
        forward_alignment_chunk_files = sorted(glob(str(output_path / "forward_*_predicted.bam")))
        reverse_alignment_chunk_files = sorted(glob(str(output_path / "reverse_*_predicted.bam")))

        forward_merge_cmd = f"samtools merge -f -n --threads {cpus} {output_path / 'group2.1.rescued.bam'} {' '.join(forward_alignment_chunk_files)}"
        reverse_merge_cmd = f"samtools merge -f -n --threads {cpus} {output_path / 'group2.2.rescued.bam'} {' '.join(reverse_alignment_chunk_files)}"

        # Launch merges
        sp.run(forward_merge_cmd, shell=True)
        sp.run(reverse_merge_cmd, shell=True)

        if clean:
            for forward_chunk, reverse_chunk in zip(forward_alignment_chunk_files, reverse_alignment_chunk_files):
                Path(forward_chunk).unlink()
                Path(reverse_chunk).unlink()

    elif stage == "classification":
        forward_unaligned_chunk_files = sorted(glob(str(output_path / "group_*_0.1.bam")))
        reverse_unaligned_chunk_files = sorted(glob(str(output_path / "group_*_0.2.bam")))
        forward_aligned_chunk_files = sorted(glob(str(output_path / "group_*_1.1.bam")))
        reverse_aligned_chunk_files = sorted(glob(str(output_path / "group_*_1.2.bam")))
        forward_multi_aligned_chunk_files = sorted(glob(str(output_path / "group_*_2.1.bam")))
        reverse_multi_aligned_chunk_files = sorted(glob(str(output_path / "group_*_2.2.bam")))

        forward_group0_merge_cmd = f"samtools merge -f -n --threads {cpus} {output_path / 'group0.1.bam'} {' '.join(forward_unaligned_chunk_files)}"
        reverse_group0_merge_cmd = f"samtools merge -f -n --threads {cpus} {output_path / 'group0.2.bam'} {' '.join(reverse_unaligned_chunk_files)}"
        forward_group1_merge_cmd = f"samtools merge -f -n --threads {cpus} {output_path / 'group1.1.bam'} {' '.join(forward_aligned_chunk_files)}"
        reverse_group1_merge_cmd = f"samtools merge -f -n --threads {cpus} {output_path / 'group1.2.bam'} {' '.join(reverse_aligned_chunk_files)}"
        forward_group2_merge_cmd = f"samtools merge -f -n --threads {cpus} {output_path / 'group2.1.bam'} {' '.join(forward_multi_aligned_chunk_files)}"
        reverse_group2_merge_cmd = f"samtools merge -f -n --threads {cpus} {output_path / 'group2.2.bam'} {' '.join(reverse_multi_aligned_chunk_files)}"

        # Unaligned reads
        sp.run(forward_group0_merge_cmd, shell=True)
        sp.run(reverse_group0_merge_cmd, shell=True)
        # Reads aligned once
        sp.run(forward_group1_merge_cmd, shell=True)
        sp.run(reverse_group1_merge_cmd, shell=True)
        # Multi-aligned reads
        sp.run(forward_group2_merge_cmd, shell=True)
        sp.run(reverse_group2_merge_cmd, shell=True)

        logger.info(f"Groups successfully merged in {output_path}")

        if clean:
            for forward_chunk, reverse_chunk in zip(forward_unaligned_chunk_files, reverse_unaligned_chunk_files):
                Path(forward_chunk).unlink()
                Path(reverse_chunk).unlink()
            for forward_chunk, reverse_chunk in zip(forward_aligned_chunk_files, reverse_aligned_chunk_files):
                Path(forward_chunk).unlink()
                Path(reverse_chunk).unlink()
            for forward_chunk, reverse_chunk in zip(forward_multi_aligned_chunk_files, reverse_multi_aligned_chunk_files):
                Path(forward_chunk).unlink()
                Path(reverse_chunk).unlink()

    elif stage == "benchmark":
        forward_out_chunk_files = sorted(glob(str(output_path / "chunk_for_*.out.bam")))
        reverse_out_chunk_files = sorted(glob(str(output_path / "chunk_rev_*.out.bam")))

        forward_out_merge_cmd = f"samtools merge -f -n --threads {cpus} {output_path / 'group1.1.out.bam'} {' '.join(forward_out_chunk_files)}"
        reverse_out_merge_cmd = f"samtools merge -f -n --threads {cpus} {output_path / 'group1.2.out.bam'} {' '.join(reverse_out_chunk_files)}"

        # Out reads
        sp.run(forward_out_merge_cmd, shell=True)
        sp.run(reverse_out_merge_cmd, shell=True)

        if clean:
            for forward_chunk, reverse_chunk in zip(forward_out_chunk_files, reverse_out_chunk_files):
                Path(forward_chunk).unlink()
                Path(reverse_chunk).unlink()

        logger.info(f"Out reads successfully merged in {output_path}")
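A usage sketch, assuming chunked prediction files such as forward_0_predicted.bam already sit in the output folder (path hypothetical):

    # Fuses forward_*_predicted.bam / reverse_*_predicted.bam into
    # group2.1.rescued.bam / group2.2.rescued.bam, then deletes the chunks.
    merge_predictions(output_dir="/tmp/results/sample1", clean=True, stage="prediction", cpus=4)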
def tidy_folder(output_dir : str = None) -> None:
    """
    Tidy all the files in the output folder.

    Parameters
    ----------
    output_dir : str, optional
        Path to the folder where to save the fused alignment file, by default None
    """
    if output_dir is None:
        output_path = Path(getcwd())
    else:
        output_path = Path(output_dir)

    # Tidy folder: dispatch top-level files by extension.
    files = [p for p in output_path.glob("*")]

    for file in files:
        if Path(file).suffix == ".bt2l":
            Path(file).rename(output_path / "index" / Path(file).name)
        elif Path(file).suffix == ".bam":
            Path(file).rename(output_path / "alignments" / Path(file).name)
        elif Path(file).suffix in [".npy", ".bed", ".bedgraph", ".bw"]:
            Path(file).rename(output_path / "statistics" / Path(file).name)
        elif Path(file).suffix == ".pairs":
            Path(file).rename(output_path / "contacts" / "pairs" / Path(file).name)
        elif Path(file).suffix == ".cool":
            Path(file).rename(output_path / "contacts" / "matrices" / Path(file).name)
        elif Path(file).suffix in [".pdf", ".svg"]:
            Path(file).rename(output_path / "plots" / Path(file).name)

    # Tidy plots: dispatch figures into their dedicated subfolders by name.
    plot_files = [p for p in (output_path / "plots").glob("*.pdf")]

    for file in plot_files:
        if "density" in Path(file).name:
            Path(file).rename(output_path / "plots" / "densities" / Path(file).name)
        elif "coverage" in Path(file).name:
            Path(file).rename(output_path / "plots" / "coverages" / Path(file).name)
        elif "patterns" in Path(file).name:
            Path(file).rename(output_path / "plots" / "ps" / Path(file).name)
        elif Path(file).name.startswith("chr"):
            Path(file).rename(output_path / "plots" / "contact_maps" / Path(file).name)
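Taken together, a rough end-to-end sketch of this module's role in the pipeline (paths hypothetical; the alignment, classification and prediction steps provided elsewhere in HiC-BERG are assumed to have run in between):

    out = create_folder(sample_name="sample1", output_dir="/tmp/results", force=True)
    # ... alignment, read classification and prediction steps run here ...
    build_pairs(mode=True, output_dir=out)
    build_matrix(mode=True, balance=True, output_dir=out)
    tidy_folder(output_dir=out)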