Source code for mavis.annotate.variant

import itertools
import json
from shortuuid import uuid

from .fusion import determine_prime, FusionTranscript
from .genomic import IntergenicRegion
from ..breakpoint import Breakpoint, BreakpointPair
from ..constants import COLUMNS, GENE_PRODUCT_TYPE, PROTOCOL, STOP_AA, STRAND, SVTYPE
from ..error import NotSpecifiedError
from ..interval import Interval
from ..util import devnull


class Annotation(BreakpointPair):
    """
    a fusion of two transcripts created by the associated breakpoint_pair
    will also hold the other annotations for overlapping and encompassed and nearest genes
    """

    def __init__(self, bpp, transcript1=None, transcript2=None, proximity=5000, data=None, **kwargs):
        """
        Holds a breakpoint call and a set of transcripts, other information is gathered relative to these

        Args:
            bpp (BreakpointPair): the breakpoint pair call. Will be adjusted and then stored based on the transcripts
            transcript1 (Transcript): transcript at the first breakpoint
            transcript2 (Transcript): transcript at the second breakpoint
            proximity (int): maximum absolute distance for a gene to be kept as proximal to a breakpoint.
                If None, only the nearest gene(s) are kept (see :func:`add_gene`)
            data (dict): optional dictionary to hold related attributes
            **kwargs: additional data attributes stored on the annotation (merged into ``self.data``)

        Raises:
            TypeError: if the same key is given in both ``data`` and ``kwargs``
        """
        # narrow the breakpoint windows by the transcripts being used for annotation
        window1 = bpp.break1 if transcript1 is None else bpp.break1 & transcript1
        b1 = Breakpoint(
            bpp.break1.chr, window1[0], window1[1], strand=bpp.break1.strand, orient=bpp.break1.orient)

        window2 = bpp.break2 if transcript2 is None else bpp.break2 & transcript2
        b2 = Breakpoint(
            bpp.break2.chr, window2[0], window2[1], strand=bpp.break2.strand, orient=bpp.break2.orient)

        BreakpointPair.__init__(
            self, b1, b2,
            opposing_strands=bpp.opposing_strands,
            stranded=bpp.stranded,
            untemplated_seq=bpp.untemplated_seq
        )
        self.data.update(bpp.data)
        if data is not None:
            conflicts = set(kwargs.keys()) & set(data.keys())
            # data is merged before the conflict is reported (original ordering kept)
            self.data.update(data)
            if conflicts:
                raise TypeError('got multiple values for data elements:', conflicts)
        self.data.update(kwargs)

        self.transcript1 = transcript1
        self.transcript2 = transcript2

        self.encompassed_genes = set()
        self.genes_proximal_to_break1 = set()  # set of (gene, signed distance) tuples
        self.genes_proximal_to_break2 = set()  # set of (gene, signed distance) tuples
        self.genes_overlapping_break1 = set()
        self.genes_overlapping_break2 = set()

        SVTYPE.enforce(self.event_type)
        PROTOCOL.enforce(self.protocol)
        self.proximity = proximity
        self.fusion = None

    def _filter_proximal(self, proximal):
        """
        reduce a non-empty set of (gene, distance) tuples to those allowed by ``self.proximity``

        When ``self.proximity`` is None only the gene(s) at the minimum absolute distance are kept,
        otherwise every gene within ``self.proximity`` (inclusive) is kept.
        """
        if self.proximity is None:
            nearest = min(abs(dist) for _, dist in proximal)
            return {(gene, dist) for gene, dist in proximal if abs(dist) == nearest}
        return {(gene, dist) for gene, dist in proximal if abs(dist) <= self.proximity}

    def add_gene(self, input_gene):
        """
        adds a input_gene to the current set of annotations. Checks which set it should be added to

        Args:
            input_gene (Gene): the gene being added

        Raises:
            AttributeError: if the gene is not on the chromosome of either breakpoint
        """
        if input_gene.chr not in [self.break1.chr, self.break2.chr]:
            raise AttributeError('cannot add input_gene not on the same chromosome as either breakpoint')

        if not self.interchromosomal:
            try:
                encompassment = Interval(self.break1.end + 1, self.break2.start - 1)
                if input_gene in encompassment:
                    self.encompassed_genes.add(input_gene)
            except AttributeError:
                # NOTE(review): presumably raised by Interval when there is no space between the
                # breakpoints (start > end) - confirm against Interval
                pass

        # NOTE(review): transcript1/transcript2 default to None in __init__, in which case the
        # reference_object lookups below raise AttributeError
        if Interval.overlaps(input_gene, self.break1) and input_gene.chr == self.break1.chr \
                and input_gene != self.transcript1.reference_object:
            self.genes_overlapping_break1.add(input_gene)
        if Interval.overlaps(input_gene, self.break2) and input_gene.chr == self.break2.chr \
                and input_gene != self.transcript2.reference_object:
            self.genes_overlapping_break2.add(input_gene)

        # genes already accounted for are never reported as proximal
        if input_gene in self.genes_overlapping_break1 or input_gene in self.genes_overlapping_break2 or \
                input_gene in self.encompassed_genes or input_gene == self.transcript1.reference_object or \
                input_gene == self.transcript2.reference_object:
            return

        dist1 = Interval.dist(input_gene, self.break1)
        dist2 = Interval.dist(input_gene, self.break2)

        if self.interchromosomal:
            if input_gene.chr == self.break1.chr:
                self.genes_proximal_to_break1.add((input_gene, dist1))
            elif input_gene.chr == self.break2.chr:
                self.genes_proximal_to_break2.add((input_gene, dist2))
        else:
            # only genes outside of the region between the breakpoints are recorded
            if dist1 < 0:
                self.genes_proximal_to_break1.add((input_gene, dist1))
            if dist2 > 0:
                self.genes_proximal_to_break2.add((input_gene, dist2))

        if self.genes_proximal_to_break1:
            self.genes_proximal_to_break1 = self._filter_proximal(self.genes_proximal_to_break1)
        if self.genes_proximal_to_break2:
            self.genes_proximal_to_break2 = self._filter_proximal(self.genes_proximal_to_break2)

    def flatten(self):
        """
        generates a dictionary of the annotation information as strings

        Returns:
            :class:`dict` of :class:`str` by :class:`str`: dictionary of attribute names and values
        """
        row = BreakpointPair.flatten(self)
        row.update({
            # the proximal gene columns stay first so the key order of the row is unchanged
            COLUMNS.genes_proximal_to_break1: ';'.join(
                sorted(['{}({})'.format(x[0].name, x[1]) for x in self.genes_proximal_to_break1])),
            COLUMNS.genes_proximal_to_break2: ';'.join(
                sorted(['{}({})'.format(x[0].name, x[1]) for x in self.genes_proximal_to_break2])),
            COLUMNS.gene1_direction: None,
            COLUMNS.gene2_direction: None,
            COLUMNS.gene_product_type: None,
            COLUMNS.gene1: None,
            COLUMNS.gene2: None,
            COLUMNS.transcript1: '{}:{}_{}{}'.format(
                self.transcript1.reference_object,
                self.transcript1.start,
                self.transcript1.end,
                self.transcript1.get_strand()),
            COLUMNS.transcript2: '{}:{}_{}{}'.format(
                self.transcript2.reference_object,
                self.transcript2.start,
                self.transcript2.end,
                self.transcript2.get_strand()),
            COLUMNS.genes_encompassed: ';'.join(sorted([x.name for x in self.encompassed_genes])),
            COLUMNS.genes_overlapping_break1: ';'.join(sorted([x.name for x in self.genes_overlapping_break1])),
            COLUMNS.genes_overlapping_break2: ';'.join(sorted([x.name for x in self.genes_overlapping_break2])),
            COLUMNS.event_type: self.event_type,
            COLUMNS.gene1_aliases: None,
            COLUMNS.gene2_aliases: None
        })
        # objects without a 'gene' attribute keep the region-style transcript label set above
        if hasattr(self.transcript1, 'gene'):
            row[COLUMNS.gene1] = self.transcript1.gene.name
            row[COLUMNS.transcript1] = self.transcript1.name
            if self.transcript1.gene.aliases:
                row[COLUMNS.gene1_aliases] = ';'.join(sorted(self.transcript1.gene.aliases))
            try:
                row[COLUMNS.gene1_direction] = str(determine_prime(self.transcript1, self.break1))
            except NotSpecifiedError:
                pass
        if hasattr(self.transcript2, 'gene'):
            row[COLUMNS.gene2] = self.transcript2.gene.name
            row[COLUMNS.transcript2] = self.transcript2.name
            if self.transcript2.gene.aliases:
                row[COLUMNS.gene2_aliases] = ';'.join(sorted(self.transcript2.gene.aliases))
            try:
                row[COLUMNS.gene2_direction] = str(determine_prime(self.transcript2, self.break2))
                # the product type is only set when both directions could be determined
                if row[COLUMNS.gene1_direction] is not None:
                    if row[COLUMNS.gene1_direction] == row[COLUMNS.gene2_direction]:
                        row[COLUMNS.gene_product_type] = GENE_PRODUCT_TYPE.ANTI_SENSE
                    else:
                        row[COLUMNS.gene_product_type] = GENE_PRODUCT_TYPE.SENSE
            except NotSpecifiedError:
                pass
        return row

    def single_transcript(self):
        """
        Returns:
            bool: True if both breakpoints are annotated with the same (truthy) transcript
        """
        return bool(self.transcript1 == self.transcript2 and self.transcript1)
def flatten_fusion_translation(translation):
    """
    for a given fusion product (translation) gather the information to be output to the tabbed files

    Args:
        translation (Translation): the translation which is on the fusion transcript

    Returns:
        dict: the dictionary of column names to values. The mapped domains are stored as a JSON string
    """
    row = {
        COLUMNS.fusion_cdna_coding_start: translation.start,
        COLUMNS.fusion_cdna_coding_end: translation.end,
    }

    mapped_domains = []
    for domain in translation.domains:
        matches, total = domain.score_region_mapping()
        mapped_domains.append({
            'name': domain.name,
            'sequences': domain.get_seqs(),
            'regions': [
                {'start': region.start, 'end': region.end}
                for region in sorted(domain.regions, key=lambda region: region.start)
            ],
            # percentage of matched positions, rounded to a whole number
            'mapping_quality': round(matches * 100 / total, 0),
            'matches': matches
        })

    row[COLUMNS.fusion_mapped_domains] = json.dumps(mapped_domains)
    return row
class IndelCall:
    """
    a single insertion/deletion/duplication call derived from comparing a mutant sequence
    against its reference sequence
    """

    def __init__(self, refseq, mutseq):
        """
        Given two sequences, assuming there exists a single difference between the two,
        call an indel which accounts for the change

        Args:
            refseq (str): the reference sequence
            mutseq (str): the mutant sequence

        Attributes set:
            last_aligned: 1-based position of the last reference position matching from the start (-1 if none)
            next_aligned: 1-based position of the first reference position matching from the end (-1 if none)
            ins_seq / del_seq: the inserted and deleted sequences
            is_dup: True when the inserted sequence repeats the reference sequence preceding it
        """
        ref_length = len(refseq)
        comparable = min(ref_length, len(mutseq))

        self.last_aligned = 0
        self.next_aligned = ref_length + 1
        self.ref_seq = refseq
        self.mut_seq = mutseq
        self.is_dup = False

        # extend the shared prefix as far as possible
        while self.last_aligned < comparable and refseq[self.last_aligned] == mutseq[self.last_aligned]:
            self.last_aligned += 1

        # extend the shared suffix as far as possible
        shared_suffix = 0
        while shared_suffix < comparable and refseq[-1 - shared_suffix] == mutseq[-1 - shared_suffix]:
            shared_suffix += 1
        self.next_aligned = ref_length + 1 - shared_suffix

        # the whole mutant sequence aligns to the end of the reference: discard the prefix alignment
        if ref_length - self.next_aligned + 1 == len(mutseq):
            self.last_aligned = 0

        deleted = max(self.next_aligned - self.last_aligned - 1, 0)
        inserted = max(len(mutseq) + deleted - ref_length, 0)
        self.ins_seq = mutseq[self.last_aligned:self.last_aligned + inserted]
        self.del_seq = refseq[self.last_aligned:self.next_aligned - 1]

        # -1 flags "nothing aligned" on that side
        if self.last_aligned == 0:
            self.last_aligned = -1
        if self.next_aligned > ref_length:
            self.next_aligned = -1

        # check if the inserted sequence is actually a duplication of the preceding sequence
        if self.ins_seq and self.next_aligned <= self.last_aligned:
            self.is_dup = self.ref_seq[self.next_aligned - 1:self.last_aligned] == self.ins_seq

    def hgvs_protein_notation(self):
        """
        returns the HGVS protein notation for an indel call

        Returns:
            str: the notation, or None when there is nothing to report (synonymous variant,
            insertion before the protein start, or a change past the end of the reference)

        Raises:
            NotImplementedError: for a duplication that also deletes sequence
        """
        ref_length = len(self.ref_seq)
        if not self.ins_seq and not self.del_seq:  # synonymous variant
            return None
        if not self.del_seq and self.last_aligned < 1:  # insertion before protein start
            return None
        if self.last_aligned >= ref_length:  # mutation after end of ref sequence
            return None

        if self.del_seq:
            # report the first and last deleted positions rather than the flanking ones
            first = max(1, self.last_aligned + 1)
            final = self.next_aligned - 1
        else:
            first = self.last_aligned
            final = self.next_aligned
        # only observable for deletions: without a deletion a negative end position always
        # takes the frameshift branch below, which does not use it
        if final < 0:
            final = ref_length

        if self.is_dup:
            if self.del_seq:
                raise NotImplementedError('duplication/deletion no supported', self)
            return 'p.{}{}_{}{}dup{}'.format(
                self.ref_seq[self.next_aligned - 1], self.next_aligned,
                self.ref_seq[self.last_aligned - 1], self.last_aligned,
                self.ins_seq)

        parts = ['p.{}{}'.format(self.ref_seq[first - 1], first)]
        unaligned_end = self.next_aligned < 0 or self.next_aligned >= ref_length
        if unaligned_end and self.last_aligned < len(self.mut_seq):
            # frameshift: first changed residue, then the offset of the next stop codon if it is not immediate
            parts.append('{}fs'.format(self.mut_seq[self.last_aligned]))
            first_stop = next(
                (offset for offset, aa in enumerate(self.mut_seq[self.last_aligned:]) if aa == STOP_AA),
                None)
            if first_stop:
                parts.append('*{}'.format(first_stop + 1))
        else:
            if final != first:
                parts.append('_{}{}'.format(self.ref_seq[final - 1], final))
            if self.del_seq:
                parts.append('del{}'.format(self.del_seq))
            if self.ins_seq:
                parts.append('ins{}'.format(self.ins_seq))
        return ''.join(parts)

    def __str__(self):
        """lists every attribute (sorted by name) with its repr"""
        fields = ', '.join('{}={!r}'.format(name, value) for name, value in sorted(self.__dict__.items()))
        return 'IndelCall({})'.format(fields)
def call_protein_indel(ref_translation, fusion_translation, reference_genome=None):
    """
    compare the fusion protein/aa sequence to the reference protein/aa sequence and
    return an hgvs notation indel call

    Args:
        ref_translation (Translation): the reference protein/translation
        fusion_translation (Translation): the fusion protein/translation
        reference_genome: the reference genome object used to fetch the reference translation AA sequence

    Returns:
        str: the :term:`HGVS` protein indel notation prefixed by the reference name,
        or None if no notation could be produced
    """
    reference_aa = ref_translation.get_aa_seq(reference_genome)
    notation = IndelCall(reference_aa, fusion_translation.get_aa_seq()).hgvs_protein_notation()
    if not notation:
        return None

    # unnamed translations borrow the name of the closest named parent object
    source = ref_translation
    label = source.name
    while label is None:
        source = source.reference_object
        label = source.name
    return '{}:{}'.format(label, notation)
def _record_exon_side(region, exon_number, fusion_transcript, five_prime_exons, three_prime_exons):
    """append exon_number to the 5' or 3' list depending on which side of the breaks region falls"""
    if region.end <= fusion_transcript.break1:
        five_prime_exons.append(exon_number)
    elif region.start >= fusion_transcript.break2:
        three_prime_exons.append(exon_number)
    else:
        raise AssertionError(
            'exon should not be mapped if not within a break region',
            region,
            fusion_transcript.break1,
            fusion_transcript.break2
        )


def flatten_fusion_transcript(spliced_fusion_transcript):
    """
    gather the exon/splicing information of a spliced fusion transcript to be output to the tabbed files

    Args:
        spliced_fusion_transcript: the spliced transcript of a fusion. Its ``unspliced_transcript``
            supplies the breakpoint positions and the mapping of fusion exons to their source exons

    Returns:
        dict: the last 5' source exon number, the first 3' source exon number and the splice type by column name

    Raises:
        AssertionError: if a mapped exon lies between the two breaks
        IndexError: if no exon was assigned to the 5' or to the 3' side
    """
    row = {}
    five_prime_exons = []
    three_prime_exons = []
    fusion_transcript = spliced_fusion_transcript.unspliced_transcript

    for ex in spliced_fusion_transcript.exons:
        try:
            src_exon = fusion_transcript.exon_mapping[ex.position]
            number = src_exon.transcript.exon_number(src_exon)
            _record_exon_side(ex, number, fusion_transcript, five_prime_exons, three_prime_exons)
        except KeyError:  # novel exon
            # fall back to every mapped exon that the novel exon overlaps
            for us_exon, src_exon in sorted(fusion_transcript.exon_mapping.items()):
                if Interval.overlaps(ex, us_exon):
                    number = src_exon.transcript.exon_number(src_exon)
                    _record_exon_side(us_exon, number, fusion_transcript, five_prime_exons, three_prime_exons)

    row[COLUMNS.exon_last_5prime] = five_prime_exons[-1]
    row[COLUMNS.exon_first_3prime] = three_prime_exons[0]
    row[COLUMNS.fusion_splicing_pattern] = spliced_fusion_transcript.splicing_pattern.splice_type
    return row
def overlapping_transcripts(ref_ann, breakpoint):
    """
    collect the reference transcripts which overlap a breakpoint and are compatible with its strand

    Args:
        ref_ann (:class:`dict` of :class:`list` of :any:`Gene` by :class:`str`): the reference list of genes split by chromosome
        breakpoint (Breakpoint): the breakpoint in question

    Returns:
        :class:`set` of :any:`PreTranscript`: the possible transcripts
    """
    matches = set()
    for gene in ref_ann.get(breakpoint.chr, []):
        for candidate in gene.transcripts:
            # strands only conflict when both are specified and they differ
            strand_conflict = (
                breakpoint.strand != STRAND.NS
                and candidate.get_strand() != STRAND.NS
                and candidate.get_strand() != breakpoint.strand
            )
            if not strand_conflict and Interval.overlaps(breakpoint, candidate):
                matches.add(candidate)
    return matches
def _intergenic_regions(breakpoint, intervals, strand):
    """build the intergenic regions covering the parts of breakpoint not covered by intervals on strand"""
    if len(intervals) == 0:
        # nothing overlaps: the whole breakpoint window is intergenic
        return [IntergenicRegion(breakpoint.chr, breakpoint.start, breakpoint.end, strand)]

    regions = []
    first = intervals[0]
    last = intervals[-1]
    if breakpoint.start < first.start:  # window starts before the first interval
        regions.append(IntergenicRegion(breakpoint.chr, breakpoint[0], first[0] - 1, strand))
    if breakpoint.end > last.end:  # window ends after the last interval
        regions.append(IntergenicRegion(breakpoint.chr, last[1] + 1, breakpoint[1], strand))

    # gaps between consecutive intervals
    for prev, curr in zip(intervals, intervals[1:]):
        try:
            regions.append(IntergenicRegion(breakpoint.chr, prev[1] + 1, curr[0] - 1, strand))
        except AttributeError:
            # NOTE(review): presumably raised when the intervals are adjacent and there is no
            # gap to represent - confirm against IntergenicRegion/Interval
            pass
    return regions


def _gather_breakpoint_annotations(ref_ann, breakpoint):
    """
    Args:
        ref_ann (:class:`dict` of :class:`list` of :class:`Gene` by :class:`str`): the reference annotations split
            into lists of genes by chromosome
        breakpoint (Breakpoint): the breakpoint annotations are to be gathered for

    Returns:
        tuple: tuple contains

            - :class:`list` of (:class:`PreTranscript` or :class:`IntergenicRegion`): transcripts or intergenic
              regions overlapping the breakpoint on the positive strand
            - :class:`list` of (:class:`PreTranscript` or :class:`IntergenicRegion`): transcripts or intergenic
              regions overlapping the breakpoint on the negative strand

        Both lists are sorted by position

    Raises:
        AssertionError: if either strand group ends up empty

    .. todo::

        Support for setting the transcript in the annotation when the breakpoint is just ahead of the transcript
        and the transcript would be 3'. Then assuming the splicing model takes the 2nd exon onward
    """
    pos_overlapping_transcripts = []
    neg_overlapping_transcripts = []
    for gene in ref_ann.get(breakpoint.chr, []):
        for t in gene.transcripts:
            if Interval.overlaps(t, breakpoint):
                # a transcript may be added to both groups if it compares equal to both strands
                if STRAND.compare(t.get_strand(), STRAND.POS):
                    pos_overlapping_transcripts.append(t)
                if STRAND.compare(t.get_strand(), STRAND.NEG):
                    neg_overlapping_transcripts.append(t)

    pos_intervals = Interval.min_nonoverlapping(*pos_overlapping_transcripts)
    neg_intervals = Interval.min_nonoverlapping(*neg_overlapping_transcripts)

    pos_overlapping_transcripts.extend(_intergenic_regions(breakpoint, pos_intervals, STRAND.POS))
    neg_overlapping_transcripts.extend(_intergenic_regions(breakpoint, neg_intervals, STRAND.NEG))

    if len(pos_overlapping_transcripts) == 0:
        raise AssertionError('neither strand group should ever be empty', pos_overlapping_transcripts)
    if len(neg_overlapping_transcripts) == 0:
        raise AssertionError('neither strand group should ever be empty', neg_overlapping_transcripts)
    return (
        sorted(pos_overlapping_transcripts, key=lambda x: x.position),
        sorted(neg_overlapping_transcripts, key=lambda x: x.position))


def _gather_annotations(ref, bp, proximity=None):
    """
    each annotation is defined by the annotations selected at the breakpoints
    the other annotations are given relative to this
    the annotation at the breakpoint can be a transcript or an intergenic region

    Args:
        ref (:class:`dict` of :class:`list` of :any:`Gene` by :class:`str`): the list of reference genes hashed
            by chromosomes
        bp (BreakpointPair): the breakpoint pair we wish to annotate
        proximity (int): passed to each :class:`Annotation` to limit which genes are reported as proximal

    Returns:
        :class:`list` of :class:`Annotation`: The annotations
    """
    annotations = {}
    break1_pos, break1_neg = _gather_breakpoint_annotations(ref, bp.break1)
    break2_pos, break2_neg = _gather_breakpoint_annotations(ref, bp.break2)

    combinations = []

    if bp.stranded:
        # the strand of each breakpoint selects exactly one group per breakpoint
        break1_group = break1_pos if bp.break1.strand == STRAND.POS else break1_neg
        break2_group = break2_pos if bp.break2.strand == STRAND.POS else break2_neg
        combinations.extend(itertools.product(break1_group, break2_group))
    else:
        # single transcript starts: anything with a gene found at both breakpoints pairs with itself
        for t in (set(break1_pos) | set(break1_neg)) & (set(break2_pos) | set(break2_neg)):
            try:
                t.gene
            except AttributeError:
                pass
            else:
                combinations.append((t, t))
        if bp.opposing_strands:
            combinations.extend(itertools.product(break1_pos, break2_neg))
            combinations.extend(itertools.product(break1_neg, break2_pos))
        else:
            combinations.extend(itertools.product(break1_pos, break2_pos))
            combinations.extend(itertools.product(break1_neg, break2_neg))

    same = set()  # transcripts that annotate both breakpoints of the pair
    for a1, a2 in combinations:
        if (a1, a2) in annotations:  # ignore duplicates
            continue
        try:
            # skip pairs of different transcripts from the same gene
            if a1.gene == a2.gene and a1 != a2:
                continue
        except AttributeError:
            pass

        if a1 == a2 and hasattr(a1, 'exons'):
            same.add(a1)

        # narrow a copy of the breakpoint pair to the selected annotations
        b1_itvl = bp.break1 & a1
        b2_itvl = bp.break2 & a2
        bpp = BreakpointPair.copy(bp)
        bpp.break1.start = b1_itvl[0]
        bpp.break1.end = b1_itvl[1]
        bpp.break2.start = b2_itvl[0]
        bpp.break2.end = b2_itvl[1]

        a = Annotation(bpp, a1, a2, proximity=proximity)

        for gene in ref.get(bp.break1.chr, []):
            a.add_gene(gene)
        if bp.interchromosomal:
            for gene in ref.get(bp.break2.chr, []):
                a.add_gene(gene)
        annotations[(a1, a2)] = a

    # remove any inter-gene/inter-region annotations where a same transcript was found
    return [
        ann for (a1, a2), ann in annotations.items()
        if not ((a1 in same or a2 in same) and a1 != a2)
    ]
def choose_more_annotated(ann_list):
    """
    for a given set of annotations if there are annotations which contain transcripts and
    annotations that are simply intergenic regions, discard the intergenic region annotations

    similarly if there are annotations where both breakpoints fall in a transcript and
    annotations where one or more breakpoints lands in an intergenic region, discard those
    that land in the intergenic region

    Args:
        ann_list (list of :class:`Annotation`): list of input annotations

    Warning:
        input annotations are assumed to be the same event (the same validation_id)
        the logic used would not apply to different events

    Returns:
        list of :class:`Annotation`: the filtered list
    """
    # index = number of breakpoints which landed in a transcript
    by_transcript_count = ([], [], [])
    for ann in ann_list:
        intergenic_ends = sum(
            isinstance(end, IntergenicRegion) for end in (ann.transcript1, ann.transcript2))
        by_transcript_count[2 - intergenic_ends].append(ann)

    intergenic, one_transcript, two_transcript = by_transcript_count
    # keep only the most informative non-empty group
    return two_transcript or one_transcript or intergenic
def choose_transcripts_by_priority(ann_list):
    """
    for each set of annotations with the same combinations of genes, choose the
    annotation with the most "best_transcripts" or most "alphanumeric" choices
    of transcript. When annotations tie, the first one in the input order is kept

    Annotations where neither breakpoint has a gene are all kept

    Args:
        ann_list (list of :class:`Annotation`): input annotations

    Warning:
        input annotations are assumed to be the same event (the same validation_id)
        the logic used would not apply to different events

    Returns:
        list of :class:`Annotation`: the filtered list
    """
    annotations_by_gene_combination = {}
    for ann in ann_list:
        # objects without a 'gene' attribute are grouped under None
        try:
            gene1 = ann.transcript1.gene
        except AttributeError:
            gene1 = None
        try:
            gene2 = ann.transcript2.gene
        except AttributeError:
            gene2 = None
        annotations_by_gene_combination.setdefault((gene1, gene2), []).append(ann)

    filtered_annotations = []
    for (gene1, gene2), sublist in annotations_by_gene_combination.items():
        if gene1 is None and gene2 is None:
            filtered_annotations.extend(sublist)
        elif gene2 is None:
            filtered_annotations.append(
                min(sublist, key=lambda a: gene1.transcript_priority(a.transcript1)))
        elif gene1 is None:
            filtered_annotations.append(
                min(sublist, key=lambda a: gene2.transcript_priority(a.transcript2)))
        else:
            # lowest combined priority wins, ties broken by transcript1 then transcript2
            filtered_annotations.append(min(sublist, key=lambda a: (
                gene1.transcript_priority(a.transcript1) + gene2.transcript_priority(a.transcript2),
                gene1.transcript_priority(a.transcript1),
                gene2.transcript_priority(a.transcript2)
            )))
    return filtered_annotations
def annotate_events(
    bpps,
    annotations,
    reference_genome,
    max_proximity=5000,
    min_orf_size=200,
    min_domain_mapping_match=0.95,
    max_orf_cap=3,
    log=devnull,
    filters=None
):
    """
    gather, filter and label the annotations of each breakpoint pair and try to build their fusion products

    Args:
        bpps (list of :class:`~mavis.breakpoint.BreakpointPair`): list of events
        annotations: reference annotations
        reference_genome (dict of string by string): dictionary of reference sequences by name
        max_proximity (int): see :term:`max_proximity`
        min_orf_size (int): see :term:`min_orf_size`
        min_domain_mapping_match (float): see :term:`min_domain_mapping_match`
        max_orf_cap (int): see :term:`max_orf_cap`
        log (callable): callable function to take in strings and time_stamp args
        filters (list of callable): list of functions taking in a list and returning a list for filtering.
            Defaults to :func:`choose_more_annotated` followed by :func:`choose_transcripts_by_priority`

    Returns:
        list of :class:`Annotation`: list of the putative annotations
    """
    if filters is None:
        filters = [choose_more_annotated, choose_transcripts_by_priority]

    results = []
    total = len(bpps)
    for event_number, bpp in enumerate(bpps, start=1):
        log('({} of {}) gathering annotations for'.format(event_number, total), bpp)
        # events without a validation id are given a fresh one
        bpp.data[COLUMNS.validation_id] = bpp.data.get(COLUMNS.validation_id, str(uuid()))

        ann_list = _gather_annotations(annotations, bpp, proximity=max_proximity)
        for annotation_filter in filters:
            ann_list = annotation_filter(ann_list)
        results.extend(ann_list)

        for ann_number, ann in enumerate(ann_list, start=1):
            ann.data[COLUMNS.annotation_id] = '{}-a{}'.format(ann.validation_id, ann_number)

            if ann.untemplated_seq is not None:
                ann.data[COLUMNS.assumed_untemplated] = False
            elif len(ann.break1) == 1 and len(ann.break2) == 1 and ann.event_type != SVTYPE.INS:
                # exact (non-insertion) call with no sequence given: assume nothing untemplated
                ann.untemplated_seq = ''
                ann.data[COLUMNS.assumed_untemplated] = True

            # try building the fusion product
            try:
                ann.fusion = FusionTranscript.build(
                    ann, reference_genome,
                    min_orf_size=min_orf_size,
                    max_orf_cap=max_orf_cap,
                    min_domain_mapping_match=min_domain_mapping_match
                )
            except (NotSpecifiedError, AttributeError, NotImplementedError):
                # NotSpecifiedError: shouldn't build fusions for non-specific calls anyway
                # AttributeError: transcript1/2 are intergenic ranges and not actual transcripts
                # NotImplementedError: anti-sense fusions will throw this error
                pass
            except KeyError as err:
                log('warning. could not build fusion product', repr(err))
        log('generated', len(ann_list), 'annotations', time_stamp=False)
    return results