Source code for mavis.constants

"""
module responsible for small utility functions and constants used throughout the structural_variant package
"""
import argparse
import re

from Bio.Alphabet import Gapped
from Bio.Alphabet.IUPAC import ambiguous_dna
from Bio.Data.IUPACData import ambiguous_dna_values
from Bio.Seq import Seq
from tab import cast_boolean, cast_null


class MavisNamespace(argparse.Namespace):
    """
    Namespace to hold module constants

    In addition to plain attribute access, the namespace supports dict-like access
    (``nspace['thing']``, ``keys()``, ``values()``, ``items()``) and keeps two private
    lookups: ``_types`` (attribute name -> cast function) and ``_defns``
    (attribute name -> definition text)

    Example:
        >>> nspace = MavisNamespace(thing=1, otherthing=2)
        >>> nspace.thing
        1
        >>> nspace.otherthing
        2
    """
    # names used for internal bookkeeping; they may not be used as attribute names
    reserved_attr = ['_types', '_defns']

    def __init__(self, **kwargs):
        """
        Args:
            kwargs: the attribute names and values to store on the namespace

        Raises:
            AttributeError: one of the given names is a reserved attribute
        """
        for k in kwargs:
            if k in MavisNamespace.reserved_attr:
                raise AttributeError('reserved attribute {} cannot be used'.format(k))
        self._defns = {}
        self._types = {}
        argparse.Namespace.__init__(self, **kwargs)
        # record the type of every public attribute so that it can be looked up via type()
        for attr, value in kwargs.items():
            if not attr.startswith('_'):
                self._set_type(attr, type(value))

    def items(self):
        """
        get the attribute (key, value) pairs as a list

        Example:
            >>> MavisNamespace(thing=1, otherthing=2).items()
            [('thing', 1), ('otherthing', 2)]
        """
        return [(k, self[k]) for k in self.keys()]

    def __getitem__(self, key):
        return getattr(self, key)

    def __setitem__(self, key, val):
        if key in MavisNamespace.reserved_attr:
            raise AttributeError('reserved attribute {} cannot be used'.format(key))
        self.__dict__[key] = val

    def flatten(self):
        """
        returns the namespace (minus types and definitions) as a dictionary

        Example:
            >>> MavisNamespace(thing=1, otherthing=2).flatten()
            {'thing': 1, 'otherthing': 2}
        """
        return dict(self.items())

    def get(self, key, *pos):
        """
        get an attribute, return a default (if given) if the attribute does not exist

        Raises:
            TypeError: more than one default value was given
            AttributeError: the attribute does not exist and a default was not given

        Example:
            >>> nspace = MavisNamespace(thing=1, otherthing=2)
            >>> nspace.get('thing', 2)
            1
            >>> nspace.get('nonexistent_thing', 2)
            2
            >>> nspace.get('nonexistent_thing')
            Traceback (most recent call last):
            ....
        """
        if len(pos) > 1:
            raise TypeError('too many arguments. get takes a single \'default\' value argument')
        try:
            return self[key]
        except AttributeError:
            if pos:
                return pos[0]
            raise

    def keys(self):
        """
        get the attribute keys as a list

        Example:
            >>> MavisNamespace(thing=1, otherthing=2).keys()
            ['thing', 'otherthing']
        """
        # names starting with an underscore are treated as private and are not reported
        return [k for k in self.__dict__ if not k.startswith('_')]

    def values(self):
        """
        get the attribute values as a list

        Example:
            >>> MavisNamespace(thing=1, otherthing=2).values()
            [1, 2]
        """
        return [self[k] for k in self.keys()]

    def enforce(self, value):
        """
        checks that the current namespace has a given value

        Returns:
            the input value

        Raises:
            KeyError: the value did not exist

        Example:
            >>> nspace = MavisNamespace(thing=1, otherthing=2)
            >>> nspace.enforce(1)
            1
            >>> nspace.enforce(3)
            Traceback (most recent call last):
            ....
        """
        if value not in self.values():
            raise KeyError('value {0} is not a valid member'.format(value), self.values())
        return value

    def reverse(self, value):
        """
        for a given value, return the associated key

        Args:
            value: the value to get the key/attribute name for

        Raises:
            KeyError: the value is not unique
            KeyError: the value is not assigned

        Example:
            >>> nspace = MavisNamespace(thing=1, otherthing=2)
            >>> nspace.reverse(1)
            'thing'
        """
        result = [key for key in self.keys() if self[key] == value]
        if len(result) > 1:
            raise KeyError('could not reverse, the mapping is not unique', value, result)
        elif not result:
            raise KeyError('input value is not assigned to a key', value)
        return result[0]

    def __iter__(self):
        return iter(self.keys())

    def _set_type(self, attr, cast_type):
        # booleans are registered with cast_boolean (imported from tab) instead of the builtin bool
        # NOTE(review): presumably because bool() of any non-empty string is True - confirm
        if cast_type is bool:
            self._types[attr] = cast_boolean
        else:
            self._types[attr] = cast_type

    def type(self, attr):
        """
        returns the type

        Raises:
            KeyError: no type has been recorded for the attribute

        Example:
            >>> nspace = MavisNamespace(thing=1, otherthing=2)
            >>> nspace.type('thing')
            <class 'int'>
        """
        return self._types[attr]

    def define(self, attr, *pos):
        """
        Get the definition of a given attribute or return a default (when given) if the attribute does not exist

        Returns:
            str: definition for the attribute

        Raises:
            TypeError: more than one default value was given
            KeyError: the attribute does not exist and a default was not given

        Example:
            >>> nspace = MavisNamespace()
            >>> nspace.add('thing', 1, defn='I am a thing')
            >>> nspace.add('otherthing', 2)
            >>> nspace.define('thing')
            'I am a thing'
            >>> nspace.define('otherthing')
            Traceback (most recent call last):
            ....
            >>> nspace.define('otherthing', 'I am some other thing')
            'I am some other thing'
        """
        if len(pos) > 1:
            raise TypeError('too many arguments. define takes a single \'default\' value argument')
        try:
            return self._defns[attr]
        except KeyError:
            if pos:
                return pos[0]
            raise

    def add(self, attr, *pos, **kwargs):
        """
        Add an attribute to the name space. Optionally include cast_type and definition

        Args:
            attr (str): name of the attribute to add
            value: the value of the attribute (defaults to the attribute name)
            cast_type: the function used to cast to the attribute type (defaults to the type of the value)
            defn (str): the definition of the attribute

        Raises:
            TypeError: too many positional arguments, an argument given both by position
                and by keyword, or an unrecognized keyword argument

        Example:
            >>> nspace = MavisNamespace()
            >>> nspace.add('thing', 1, int, 'I am a thing')
            >>> nspace = MavisNamespace()
            >>> nspace.add('thing', 1, int)
            >>> nspace = MavisNamespace()
            >>> nspace.add('thing', 1)
            >>> nspace = MavisNamespace()
            >>> nspace.add('thing', value=1, cast_type=int, defn='I am a thing')
        """
        # positional arguments are accepted in this order
        arg_names = ['value', 'cast_type', 'defn']
        if len(pos) > len(arg_names):
            raise TypeError('add() takes 3 positional arguments but more were given')
        # fold the positional arguments into kwargs so that both call styles share one code path
        for value, argname in zip(pos, arg_names):
            if argname in kwargs:
                raise TypeError('add() got multiple values for argument {}'.format(repr(argname)))
            kwargs[argname] = value

        value = kwargs.pop('value', attr)
        self[attr] = value

        if 'cast_type' not in kwargs:
            self._set_type(attr, type(value))
        else:
            self._types[attr] = kwargs.pop('cast_type')

        if 'defn' in kwargs:
            self._defns[attr] = kwargs.pop('defn')

        # anything left over was not a recognized argument
        if kwargs:
            raise TypeError('invalid arguments: {}'.format(kwargs.keys()))

    def __call__(self, value):
        # allows the namespace itself to be used as a cast/validation function
        try:
            return self.enforce(value)
        except KeyError:
            raise TypeError('Invalid value {}. Cannot cast to type {}. Must be a valid member: {}'.format(
                repr(value), self.__class__.__name__, self.values()))
def float_fraction(num):
    """
    cast input to a float and check that it lies between 0 and 1 (inclusive)

    Args:
        num: input to cast

    Returns:
        float: the input cast to a float

    Raises:
        argparse.ArgumentTypeError: if the input cannot be parsed as a float or the
            number is not between 0 and 1 (this includes NaN)
        TypeError: propagated from float() when the input is neither a string nor a number
    """
    try:
        num = float(num)
    except ValueError:
        raise argparse.ArgumentTypeError('Must be a value between 0 and 1')
    # chained comparison rather than `num < 0 or num > 1`: every comparison against NaN
    # is False, so the negated range check is the form that rejects NaN
    if not 0 <= num <= 1:
        raise argparse.ArgumentTypeError('Must be a value between 0 and 1')
    return num
def nullable_int(num):
    """
    casts input to an int if not an accepted null value.

    The input is first handed to cast_null; whatever that returns is passed back
    unchanged. Only when cast_null raises a TypeError is the input cast with int()

    See :func:`tab.tab.cast_null`
    """
    try:
        null_value = cast_null(num)
    except TypeError:
        # not recognized as a null value, fall through to the integer cast
        pass
    else:
        return null_value
    return int(num)
# NOTE(review): presumably the name of the stamp written when a stage finishes - confirm against callers
COMPLETE_STAMP = 'MAVIS.COMPLETE'

SUBCOMMAND = MavisNamespace(
    ANNOTATE='annotate',
    VALIDATE='validate',
    PIPELINE='pipeline',
    CLUSTER='cluster',
    PAIR='pairing',
    SUMMARY='summary',
    CHECKER='checker',
    CONFIG='config',
    CONVERT='convert',
    OVERLAY='overlay'
)
""":class:`MavisNamespace`: holds controlled vocabulary for allowed pipeline stage values

- annotate
- validate
- pipeline
- cluster
- pairing
- summary
- checker
- config
- convert
- overlay
"""

CODON_SIZE = 3
""":class:`int`: the number of bases making up a codon"""
def reverse_complement(s):
    """
    wrapper for the Bio.Seq reverse_complement method

    Args:
        s (str): the input DNA sequence

    Returns:
        :class:`str`: the reverse complement of the input sequence

    Raises:
        ValueError: the input contains anything other than ASCII letters

    Warning:
        assumes the input is a DNA sequence

    Example:
        >>> reverse_complement('ATCCGGT')
        'ACCGGAT'
    """
    input_string = str(s)
    # \Z anchors at the very end of the string; '$' would also match just before a
    # trailing newline and let 'ATCG\n' through the letters-only check
    if not re.match(r'^[A-Za-z]*\Z', input_string):
        raise ValueError('unexpected sequence format. cannot reverse complement', input_string)
    return str(Seq(input_string, DNA_ALPHABET).reverse_complement())
def translate(s, reading_frame=0):
    """
    given a DNA sequence, translates it and returns the protein amino acid sequence

    Bases before the reading frame offset are skipped and any trailing bases that do
    not make up a complete codon are dropped before translating

    Args:
        s (str): the input DNA sequence
        reading_frame (int): where to start translating the sequence

    Returns:
        str: the amino acid sequence
    """
    offset = reading_frame % CODON_SIZE
    coding = s[offset:]
    # number of left-over bases after the last complete codon
    overhang = len(coding) % CODON_SIZE
    if overhang:
        coding = coding[:-overhang]
    return str(Seq(coding, DNA_ALPHABET).translate())
GAP = '-'

ORIENT = MavisNamespace(LEFT='L', RIGHT='R', NS='?')
""":class:`MavisNamespace`: holds controlled vocabulary for allowed orientation values

- ``LEFT``: left with respect to the positive/forward strand
- ``RIGHT``: right with respect to the positive/forward strand
- ``NS``: orientation is not specified
"""
# NOTE: these helpers are stored as regular attributes, so MavisNamespace.keys()/values()
# (which report every non-underscore attribute) include them as well
setattr(ORIENT, 'expand', lambda x: [ORIENT.LEFT, ORIENT.RIGHT] if x == ORIENT.NS else [x])
setattr(ORIENT, 'compare', lambda x, y: True if ORIENT.NS in [x, y] else (x == y))

PROTOCOL = MavisNamespace(GENOME='genome', TRANS='transcriptome')
""":class:`MavisNamespace`: holds controlled vocabulary for allowed protocol values

- ``GENOME``: genome
- ``TRANS``: transcriptome
"""

DISEASE_STATUS = MavisNamespace(DISEASED='diseased', NORMAL='normal')
""":class:`MavisNamespace`: holds controlled vocabulary for allowed disease status

- ``DISEASED``: diseased
- ``NORMAL``: normal
"""

STRAND = MavisNamespace(POS='+', NEG='-', NS='?')
""":class:`MavisNamespace`: holds controlled vocabulary for allowed strand values

- ``POS``: the positive/forward strand
- ``NEG``: the negative/reverse strand
- ``NS``: strand is not specified
"""
# NOTE: as for ORIENT, these helpers also show up in STRAND.keys()/values()
setattr(STRAND, 'expand', lambda x: [STRAND.POS, STRAND.NEG] if x == STRAND.NS else [x])
setattr(STRAND, 'compare', lambda x, y: True if STRAND.NS in [x, y] else (x == y))

SVTYPE = MavisNamespace(
    DEL='deletion',
    TRANS='translocation',
    ITRANS='inverted translocation',
    INV='inversion',
    INS='insertion',
    DUP='duplication'
)
""":class:`MavisNamespace`: holds controlled vocabulary for acceptable structural variant classifications

- ``DEL``: deletion
- ``TRANS``: translocation
- ``ITRANS``: inverted translocation
- ``INV``: inversion
- ``INS``: insertion
- ``DUP``: duplication
"""

CIGAR = MavisNamespace(M=0, I=1, D=2, N=3, S=4, H=5, P=6, X=8, EQ=7)
""":class:`MavisNamespace`: Enum-like. For readable cigar values

- ``M``: alignment match (can be a sequence match or mismatch)
- ``I``: insertion to the reference
- ``D``: deletion from the reference
- ``N``: skipped region from the reference
- ``S``: soft clipping (clipped sequences present in SEQ)
- ``H``: hard clipping (clipped sequences NOT present in SEQ)
- ``P``: padding (silent deletion from padded reference)
- ``EQ``: sequence match (=)
- ``X``: sequence mismatch

note: descriptions are taken from the `samfile documentation <https://samtools.github.io/hts-specs/SAMv1.pdf>`_
"""

NA_MAPPING_QUALITY = 255
""":class:`int`: mapping quality value to indicate mapping was not performed/calculated"""

PYSAM_READ_FLAGS = MavisNamespace(
    REVERSE=16,
    MATE_REVERSE=32,
    UNMAPPED=4,
    MATE_UNMAPPED=8,
    FIRST_IN_PAIR=64,
    LAST_IN_PAIR=128,
    SECONDARY=256,
    MULTIMAP=1,
    SUPPLEMENTARY=2048,
    TARGETED_ALIGNMENT='ta',
    RECOMPUTED_CIGAR='rc',
    BLAT_RANK='br',
    BLAT_SCORE='bs',
    BLAT_ALIGNMENTS='ba',
    BLAT_PERCENT_IDENTITY='bi',
    BLAT_PMS='bp'
)
""":class:`MavisNamespace`: Enum-like. For readable PYSAM flag constants

- ``MULTIMAP``: template having multiple segments in sequencing
- ``UNMAPPED``: segment unmapped
- ``MATE_UNMAPPED``: next segment in the template unmapped
- ``REVERSE``: SEQ being reverse complemented
- ``MATE_REVERSE``: SEQ of the next segment in the template being reverse complemented
- ``FIRST_IN_PAIR``: the first segment in the template
- ``LAST_IN_PAIR``: the last segment in the template
- ``SECONDARY``: secondary alignment
- ``SUPPLEMENTARY``: supplementary alignment

note: descriptions are taken from the `samfile documentation <https://samtools.github.io/hts-specs/SAMv1.pdf>`_
"""

# read paired, read mapped in proper pair, mate reverse strand, first in pair


def _match_ambiguous_dna(x, y):
    """
    check whether two (possibly ambiguous) DNA characters are compatible, i.e. whether
    the sets of bases they stand for share at least one member. The comparison is
    case-insensitive

    Args:
        x (str): the first character
        y (str): the second character

    Returns:
        bool: True if the characters can represent the same base

    Example:
        >>> _match_ambiguous_dna('A', 'N')
        True
        >>> _match_ambiguous_dna('A', 'T')
        False
        >>> _match_ambiguous_dna('A', 'A')
        True
    """
    x = x.upper()
    y = y.upper()
    # characters without an entry in the ambiguity table stand only for themselves
    xset = set(ambiguous_dna_values.get(x, x))
    yset = set(ambiguous_dna_values.get(y, y))
    return bool(xset & yset)


# `alphabet` is kept as a second module-level name for the same object
DNA_ALPHABET = alphabet = Gapped(ambiguous_dna, GAP)
DNA_ALPHABET.match = _match_ambiguous_dna

FLAGS = MavisNamespace(LQ='LOWQUAL')

READ_PAIR_TYPE = MavisNamespace(RR='RR', LL='LL', RL='RL', LR='LR')

CALL_METHOD = MavisNamespace(CONTIG='contig', SPLIT='split reads', FLANK='flanking reads', SPAN='spanning reads', INPUT='input')
""":class:`MavisNamespace`: holds controlled vocabulary for allowed call methods

- ``CONTIG``: a contig was assembled and aligned across the breakpoints
- ``SPLIT``: the event was called by :term:`split read`
- ``FLANK``: the event was called by :term:`flanking read pair`
- ``SPAN``: the event was called by :term:`spanning read`
- ``INPUT``: input
"""

GENE_PRODUCT_TYPE = MavisNamespace(SENSE='sense', ANTI_SENSE='anti-sense')
""":class:`MavisNamespace`: controlled vocabulary for gene products

- ``SENSE``: the gene product is a sense fusion
- ``ANTI_SENSE``: the gene product is anti-sense
"""

PRIME = MavisNamespace(FIVE=5, THREE=3)
""":class:`MavisNamespace`: holds controlled vocabulary

- ``FIVE``: five prime
- ``THREE``: three prime
"""

START_AA = 'M'
""":class:`str`: The amino acid expected to start translation
"""

STOP_AA = '*'
""":class:`str`: The amino acid expected to end translation
"""

GIEMSA_STAIN = MavisNamespace(
    GNEG='gneg',
    GPOS50='gpos50',
    GPOS75='gpos75',
    GPOS25='gpos25',
    GPOS100='gpos100',
    ACEN='acen',
    GVAR='gvar',
    STALK='stalk'
)
""":class:`MavisNamespace`: holds controlled vocabulary relating to stains of chromosome bands"""

# content related to tabbed files for input/output
# ensure that we don't have to change ALL the code when we update column names
# NOTE: the order of the entries below is significant, sort_columns() uses it as the column order

COLUMNS = MavisNamespace(
    tracking_id='tracking_id',
    library='library',
    cluster_id='cluster_id',
    cluster_size='cluster_size',
    validation_id='validation_id',
    annotation_id='annotation_id',
    product_id='product_id',
    event_type='event_type',
    pairing='pairing',
    inferred_pairing='inferred_pairing',
    gene1='gene1',
    gene1_direction='gene1_direction',
    gene2='gene2',
    gene2_direction='gene2_direction',
    gene1_aliases='gene1_aliases',
    gene2_aliases='gene2_aliases',
    gene_product_type='gene_product_type',
    transcript1='transcript1',
    transcript2='transcript2',
    fusion_splicing_pattern='fusion_splicing_pattern',
    fusion_cdna_coding_start='fusion_cdna_coding_start',
    fusion_cdna_coding_end='fusion_cdna_coding_end',
    fusion_mapped_domains='fusion_mapped_domains',
    fusion_sequence_fasta_id='fusion_sequence_fasta_id',
    fusion_sequence_fasta_file='fusion_sequence_fasta_file',
    fusion_protein_hgvs='fusion_protein_hgvs',
    annotation_figure='annotation_figure',
    annotation_figure_legend='annotation_figure_legend',
    genes_encompassed='genes_encompassed',
    genes_overlapping_break1='genes_overlapping_break1',
    genes_overlapping_break2='genes_overlapping_break2',
    genes_proximal_to_break1='genes_proximal_to_break1',
    genes_proximal_to_break2='genes_proximal_to_break2',
    break1_chromosome='break1_chromosome',
    break1_position_start='break1_position_start',
    break1_position_end='break1_position_end',
    break1_orientation='break1_orientation',
    exon_last_5prime='exon_last_5prime',
    exon_first_3prime='exon_first_3prime',
    break1_strand='break1_strand',
    break1_seq='break1_seq',
    break2_chromosome='break2_chromosome',
    break2_position_start='break2_position_start',
    break2_position_end='break2_position_end',
    break2_orientation='break2_orientation',
    break2_strand='break2_strand',
    break2_seq='break2_seq',
    opposing_strands='opposing_strands',
    stranded='stranded',
    protocol='protocol',
    disease_status='disease_status',
    tools='tools',
    call_method='call_method',
    break1_ewindow='break1_ewindow',
    break1_ewindow_count='break1_ewindow_count',
    break1_ewindow_practical_coverage='break1_ewindow_practical_coverage',
    break1_homologous_seq='break1_homologous_seq',
    break1_split_read_names='break1_split_read_names',
    break1_split_reads='break1_split_reads',
    break1_split_reads_forced='break1_split_reads_forced',
    break2_ewindow='break2_ewindow',
    break2_ewindow_count='break2_ewindow_count',
    break2_ewindow_practical_coverage='break2_ewindow_practical_coverage',
    break2_homologous_seq='break2_homologous_seq',
    break2_split_read_names='break2_split_read_names',
    break2_split_reads='break2_split_reads',
    break2_split_reads_forced='break2_split_reads_forced',
    contig_alignment_query_consumption='contig_alignment_query_consumption',
    contig_alignment_score='contig_alignment_score',
    contig_alignment_query_name='contig_alignment_query_name',
    contig_read_depth='contig_read_depth',
    contig_break1_read_depth='contig_break1_read_depth',
    contig_break2_read_depth='contig_break2_read_depth',
    contig_blat_rank='contig_blat_rank',
    contig_build_score='contig_build_score',
    contig_remap_score='contig_remap_score',
    contig_remap_coverage='contig_remap_coverage',
    contig_remapped_read_names='contig_remapped_read_names',
    contig_remapped_reads='contig_remapped_reads',
    contig_seq='contig_seq',
    contig_strand_specific='contig_strand_specific',
    contigs_aligned='contigs_aligned',
    contigs_assembled='contigs_assembled',
    spanning_reads='spanning_reads',
    spanning_read_names='spanning_read_names',
    flanking_median_fragment_size='flanking_median_fragment_size',
    flanking_pairs='flanking_pairs',
    flanking_pairs_compatible='flanking_pairs_compatible',
    flanking_pairs_read_names='flanking_pairs_read_names',
    flanking_pairs_compatible_read_names='flanking_pairs_compatible_read_names',
    flanking_stdev_fragment_size='flanking_stdev_fragment_size',
    linking_split_read_names='linking_split_read_names',
    linking_split_reads='linking_split_reads',
    raw_break1_half_mapped_reads='raw_break1_half_mapped_reads',
    raw_break1_split_reads='raw_break1_split_reads',
    raw_break2_half_mapped_reads='raw_break2_half_mapped_reads',
    raw_break2_split_reads='raw_break2_split_reads',
    raw_flanking_pairs='raw_flanking_pairs',
    raw_spanning_reads='raw_spanning_reads',
    untemplated_seq='untemplated_seq',
    filter_comment='filter_comment',
    cdna_synon='cdna_synon',
    protein_synon='protein_synon',
    supplementary_call='supplementary_call',
    net_size='net_size',
    repeat_count='repeat_count',
    assumed_untemplated='assumed_untemplated'
)
""":class:`MavisNamespace`: Column names for i/o files used throughout the pipeline

- :term:`annotation_figure_legend`
- :term:`annotation_figure`
- :term:`annotation_id`
- :term:`break1_chromosome`
- :term:`break1_ewindow_count`
- :term:`break1_ewindow_practical_coverage`
- :term:`break1_ewindow`
- :term:`break1_homologous_seq`
- :term:`break1_orientation`
- :term:`break1_position_end`
- :term:`break1_position_start`
- :term:`break1_seq`
- :term:`break1_split_reads_forced`
- :term:`break1_split_reads`
- :term:`break1_strand`
- :term:`break2_chromosome`
- :term:`break2_ewindow_count`
- :term:`break2_ewindow_practical_coverage`
- :term:`break2_ewindow`
- :term:`break2_homologous_seq`
- :term:`break2_orientation`
- :term:`break2_position_end`
- :term:`break2_position_start`
- :term:`break2_seq`
- :term:`break2_split_reads_forced`
- :term:`break2_split_reads`
- :term:`break2_strand`
- :term:`call_method`
- :term:`cdna_synon`
- :term:`cluster_id`
- :term:`cluster_size`
- :term:`contig_alignment_cigar`
- :term:`contig_alignment_query_name`
- :term:`contig_alignment_reference_start`
- :term:`contig_alignment_score`
- :term:`contig_build_score`
- :term:`contig_remap_coverage`
- :term:`contig_remap_score`
- :term:`contig_remapped_read_names`
- :term:`contig_remapped_reads`
- :term:`contig_seq`
- :term:`contig_strand_specific`
- :term:`contigs_aligned`
- :term:`contigs_assembled`
- :term:`event_type`
- :term:`flanking_median_fragment_size`
- :term:`flanking_pairs_compatible`
- :term:`flanking_pairs`
- :term:`flanking_stdev_fragment_size`
- :term:`fusion_cdna_coding_end`
- :term:`fusion_cdna_coding_start`
- :term:`fusion_mapped_domains`
- :term:`fusion_sequence_fasta_file`
- :term:`fusion_sequence_fasta_id`
- :term:`fusion_splicing_pattern`
- :term:`fusion_protein_hgvs`
- :term:`gene1_aliases`
- :term:`gene1_direction`
- :term:`gene1`
- :term:`gene2_aliases`
- :term:`gene2_direction`
- :term:`gene2`
- :term:`gene_product_type`
- :term:`genes_encompassed`
- :term:`genes_overlapping_break1`
- :term:`genes_overlapping_break2`
- :term:`genes_proximal_to_break1`
- :term:`genes_proximal_to_break2`
- :term:`inferred_pairing`
- :term:`library`
- :term:`linking_split_reads`
- :term:`net_size`
- :term:`opposing_strands`
- :term:`pairing`
- :term:`product_id`
- :term:`protein_synon`
- :term:`protocol`
- :term:`raw_break1_split_reads`
- :term:`raw_break2_split_reads`
- :term:`raw_flanking_pairs`
- :term:`raw_spanning_reads`
- :term:`spanning_read_names`
- :term:`spanning_reads`
- :term:`stranded`
- :term:`tools`
- :term:`tracking_id`
- :term:`transcript1`
- :term:`transcript2`
- :term:`supplementary_call`
- :term:`untemplated_seq`
- :term:`validation_id`
- :term:`repeat_count`
- :term:`assumed_untemplated`
"""
def sort_columns(input_columns):
    """
    sort column names into output order

    Names that are values of COLUMNS come first, in the order given by
    ``COLUMNS.values()``; all remaining names follow in default sorted order

    Args:
        input_columns: the column names to sort (iterated over twice)

    Returns:
        list: the sorted column names
    """
    rank = {name: position for position, name in enumerate(COLUMNS.values())}
    known = [col for col in input_columns if col in rank]
    known.sort(key=rank.__getitem__)
    unknown = sorted(col for col in input_columns if col not in rank)
    return known + unknown