Source code for biscot.Misc

from Bio import SeqIO
from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from collections import defaultdict

import coloredlogs
import os
import logging
import sys


# Keep a handle on the builtin before the module-level ``print`` below shadows it.
_orig_print = print


def print(*args, **kwargs):
    """
    Unbuffered print function

    Forwards its arguments to the builtin ``print`` and asks for the output
    stream to be flushed, unless the caller passes its own ``flush`` value.

    :param args: Objects to print, forwarded unchanged to the builtin ``print``
    :param kwargs: Keyword arguments forwarded to the builtin ``print``
        (``flush`` defaults to True when not given)
    """
    # A hard-coded ``flush=True`` next to ``**kwargs`` raises
    # "TypeError: got multiple values for keyword argument 'flush'" as soon as
    # a caller supplies ``flush`` itself, so only fill it in when it is absent.
    kwargs.setdefault("flush", True)
    _orig_print(*args, **kwargs)
def check_path(file_paths):
    """
    Checks if the files given as input exist

    :param file_paths: List of file paths
    :type file_paths: list(str)
    :raises FileNotFoundError: If a file is not found; the message names the
        first missing path
    """
    for file_path in file_paths:
        if not os.path.exists(file_path):
            # The placeholder must actually be filled in, otherwise the user
            # only sees a literal "%s file not found." with no path.
            raise FileNotFoundError("%s file not found." % (file_path,))
def setup_logging(debug):
    """
    Configures the logging streams

    Overrides some of the default ``coloredlogs`` field and level styles,
    calls ``logging.basicConfig`` with ``biscot.log`` as file name, attaches a
    stdout stream handler to the root logger and finally installs
    ``coloredlogs`` on that same logger.

    :param debug: True to activate debug logs
    :type debug: bool
    """
    # Single definition of the record layout shared by every handler below.
    record_format = "[%(asctime)s]\t[%(filename)9.9s - %(lineno)03d]\t[%(levelname)5.5s]\t%(message)s"

    coloredlogs.DEFAULT_FIELD_STYLES.update(
        {
            "asctime": {"color": "cyan", "bright": True},
            "filename": {"color": "yellow", "bright": True},
            "lineno": {"color": "yellow", "bright": True},
            "levelname": {"bold": True},
        }
    )
    coloredlogs.DEFAULT_LEVEL_STYLES.update(
        {
            "INFO": {"color": "green", "bright": True},
            "DEBUG": {"color": "magenta", "bright": True},
        }
    )

    # ``basicConfig`` takes the numeric constant, ``coloredlogs.install`` is
    # given the level name, so both spellings are selected together.
    if debug:
        level, numeric_level = "DEBUG", logging.DEBUG
    else:
        level, numeric_level = "INFO", logging.INFO

    logging.basicConfig(
        format=record_format, level=numeric_level, filename="biscot.log"
    )

    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(logging.Formatter(record_format))

    root_logger = logging.getLogger()
    root_logger.addHandler(stdout_handler)

    coloredlogs.install(level=level, logger=root_logger, fmt=record_format)
def load_contigs(contigs_sequence_dict, contigs_path):
    """
    Extracts contig sequences from a FASTA file

    The dict given as input is filled in place, using the record id as key and
    the record sequence (as a plain string) as value. Nothing is returned.

    :param contigs_sequence_dict: Dict that will contain contigs FASTA sequence
    :type contigs_sequence_dict: dict(str: str)
    :param contigs_path: Path to a contigs FASTA file
    :type contigs_path: str
    """
    logging.info("Loading contigs fasta file")

    # Open the file in a context manager so the handle is closed once parsing
    # is over (or if parsing fails) instead of being left to the garbage collector.
    with open(contigs_path) as contigs_file:
        for record in SeqIO.parse(contigs_file, "fasta"):
            contigs_sequence_dict[record.id] = str(record.seq)
def agp_to_fasta(contigs_sequence_dict, agp_path, output_file):
    """
    Parses an AGP file and thanks to a dict containing contigs sequence, transforms it into a scaffolds FASTA File

    Only "W" (contig) and "N" (gap) lines contribute to the scaffolds; lines of
    any other type are ignored, as are blank lines and lines starting with "#".
    Scaffolds are written from the longest to the shortest.

    :param contigs_sequence_dict: Dict containing contigs FASTA sequence
    :type contigs_sequence_dict: dict(str: str)
    :param agp_path: Path to an AGP file
    :type agp_path: str
    :param output_file: Path to an output FASTA file
    :type output_file: str
    :raises KeyError: If a "W" line references a contig absent from contigs_sequence_dict
    """
    logging.info("Converting agp file to fasta")

    # Pieces of each scaffold are accumulated in a list and joined once at the
    # end: repeatedly extending a long string with += copies it over and over.
    scaffolds_parts = defaultdict(list)

    with open(agp_path) as agp:
        for line in agp:
            line = line.rstrip("\n")

            # Blank lines and "#" comment/header lines have no fifth column
            # and would otherwise fail with an IndexError below.
            if not line.strip() or line.startswith("#"):
                continue

            fields = line.split("\t")
            seq_type = fields[4]

            if seq_type == "W":
                scaffold_name, contig_name, contig_start, contig_end, orientation = (
                    fields[0],
                    fields[5],
                    int(fields[6]),
                    int(fields[7]),
                    fields[8],
                )

                # NOTE(review): columns 7 and 8 are used directly as Python
                # slice bounds (0-based start, exclusive end) - confirm that
                # the tool writing this AGP uses that convention.
                contig_part = contigs_sequence_dict[contig_name][
                    contig_start:contig_end
                ]

                # Anything other than "+" is treated as the reverse strand.
                if orientation != "+":
                    contig_part = str(
                        Seq(contig_part, generic_dna).reverse_complement()
                    )
                scaffolds_parts[scaffold_name].append(contig_part)

            elif seq_type == "N":
                scaffold_name, gap_size = fields[0], int(fields[5])
                scaffolds_parts[scaffold_name].append("N" * gap_size)

    scaffolds_sequence = [
        (scaffold_name, "".join(parts))
        for scaffold_name, parts in scaffolds_parts.items()
    ]
    # Stable sort: scaffolds of equal length keep their order of appearance.
    sorted_scaffolds = sorted(
        scaffolds_sequence, key=lambda d: len(d[1]), reverse=True
    )

    with open(output_file, "w") as out:
        for scaffold, sequence in sorted_scaffolds:
            record = SeqRecord(Seq(sequence, generic_dna), id=scaffold)
            out.write(record.format("fasta"))