Source code for mavis.annotate.fusion

from .genomic import Exon, Transcript, PreTranscript
from .protein import calculate_orf, Domain, Translation
from ..breakpoint import Breakpoint
from ..constants import ORIENT, PRIME, PROTOCOL, reverse_complement, STRAND, SVTYPE
from ..error import NotSpecifiedError
from ..interval import Interval, IntervalMapping


[docs]def determine_prime(transcript, breakpoint): """ determine the side of the transcript 5' or 3' which is 'kept' given the breakpoint Args: transcript (Transcript): the transcript breakpoint (Breakpoint): the breakpoint Returns: PRIME: 5' or 3' Raises: AttributeError: if the orientation of the breakpoint or the strand of the transcript is not specified """ if transcript.get_strand() == STRAND.POS: if breakpoint.orient == ORIENT.LEFT: return PRIME.FIVE elif breakpoint.orient == ORIENT.RIGHT: return PRIME.THREE else: raise NotSpecifiedError('cannot determine_prime if the orient of the breakpoint has not been specified') elif transcript.get_strand() == STRAND.NEG: if breakpoint.orient == ORIENT.LEFT: return PRIME.THREE elif breakpoint.orient == ORIENT.RIGHT: return PRIME.FIVE else: raise NotSpecifiedError('cannot determine_prime if the orient of the breakpoint has not been specified') else: raise NotSpecifiedError('cannot determine prime if the strand of the transcript has not been specified')
[docs]class FusionTranscript(PreTranscript): """ FusionTranscript is a PreTranscript built from two parent PreTranscripts. It has most of the same functionality as a regular PreTranscript except that it will not have a parent gene and retains a mapping of the new exons to the exons in the PreTranscript they originated from Additionally the FusionTranscript is always constructed on the positive strand. The preferred way to construct a FusionTranscript is through the build method. """ def __init__(self): self.exon_mapping = {} self.exons = [] self.seq = None self.spliced_transcripts = [] self.position = None self.strand = STRAND.POS # always built on the positive strand self.reference_object = None self.name = None # interval to interval mapping of transcript coord to their original genome coord self.mapping_to_genome = IntervalMapping() self.mapping_to_chrs = dict() # keeps track of what chromosome per interval self.break1 = None # first breakpoint position in the fusion transcript self.break2 = None # second breakpoint position in the fusion transcript
[docs] def exon_number(self, exon): """ Args: exon (Exon): the exon to be numbered Returns: int: the number of the exon in the original transcript (prior to fusion) """ old_exon = self.exon_mapping[exon.position] return old_exon.transcript.exon_number(old_exon)
[docs] def map_region_to_genome(self, chr, interval_on_fusion, genome_interval, flipped=False): self.mapping_to_genome.add(interval_on_fusion, genome_interval, flipped) self.mapping_to_chrs[Interval(interval_on_fusion[0], interval_on_fusion[1])] = chr
@classmethod def _build_single_gene_inversion(cls, ann, reference_genome, min_orf_size, max_orf_cap, min_domain_mapping_match): """ builds a fusion transcript for a single gene inversion. Note that this is an incomplete fusion transcript and still requires translations and domain information to be added """ assert ann.event_type == SVTYPE.INV assert ann.transcript1 == ann.transcript2 fusion_pre_transcript = FusionTranscript() # pull the exons from either size of the breakpoints window if ann.break1.orient == ORIENT.RIGHT: window = Interval(ann.break1.end, ann.break2.end - 1) else: window = Interval(ann.break1.end + 1, ann.break2.end) window_seq = reference_genome[ann.break1.chr].seq[window.start - 1:window.end] # now create 'pseudo-deletion' breakpoints b1 = Breakpoint(ann.break1.chr, window.start - 1, orient=ORIENT.LEFT) b2 = Breakpoint(ann.break2.chr, window.end + 1, orient=ORIENT.RIGHT) seq1, ex1 = cls._pull_exons(ann.transcript1, b1, reference_genome[b1.chr].seq) seq2, ex2 = cls._pull_exons(ann.transcript2, b2, reference_genome[b2.chr].seq) useq = ann.untemplated_seq if ann.transcript1.get_strand() == STRAND.POS: window_seq = reverse_complement(window_seq) # b/c inversion should be opposite fusion_pre_transcript.map_region_to_genome(b1.chr, (1, len(seq1)), (ann.transcript1.start, b1.start)) fusion_pre_transcript.map_region_to_genome( b1.chr, (len(seq1) + len(useq) + len(window_seq) + 1, len(seq1) + len(useq) + len(window_seq) + len(seq2)), (b2.start, ann.transcript1.end) ) else: useq = reverse_complement(useq) # exon sequence will already be revcomp from pull exons method seq1, seq2 = seq2, seq1 ex1, ex2 = ex2, ex1 fusion_pre_transcript.map_region_to_genome(b1.chr, (1, len(seq1)), (b2.start, ann.transcript1.end), True) fusion_pre_transcript.map_region_to_genome( b1.chr, (len(seq1) + len(useq) + len(window_seq) + 1, len(seq1) + len(useq) + len(window_seq) + len(seq2)), (ann.transcript1.start, b1.start), True ) fusion_pre_transcript.seq = seq1 fusion_pre_transcript.break1 = len(seq1) fusion_pre_transcript.break2 = len(seq1) + len(useq) + len(window_seq) + 1 if ann.break1.orient == ORIENT.LEFT: fusion_pre_transcript.seq += useq + window_seq fusion_pre_transcript.map_region_to_genome( b1.chr, (len(seq1) + len(useq) + 1, len(seq1) + len(useq) + len(window_seq)), window, True if ann.transcript1.get_strand() == STRAND.POS else False ) else: fusion_pre_transcript.seq += window_seq + useq fusion_pre_transcript.map_region_to_genome( b1.chr, (len(seq1) + 1, len(seq1) + len(window_seq)), window, True if ann.transcript1.get_strand() == STRAND.POS else False ) fusion_pre_transcript.last_five_prime_exon = ex1[-1][0] fusion_pre_transcript.first_three_prime_exon = ex2[0][0] for ex, old_ex in ex1: fusion_pre_transcript.exons.append(ex) ex.reference_object = fusion_pre_transcript fusion_pre_transcript.exon_mapping[ex.position] = old_ex offset = len(fusion_pre_transcript.seq) if ann.protocol == PROTOCOL.TRANS: fusion_pre_transcript.exons[-1].end_splice_site.intact = False ex2[0][0].start_splice_site.intact = False novel_exon_start = fusion_pre_transcript.exons[-1].end + 1 novel_exon_end = offset + ex2[0][0].start - 1 if novel_exon_end >= novel_exon_start: # create a novel exon exon = Exon( novel_exon_start, novel_exon_end, fusion_pre_transcript, intact_start_splice=False, intact_end_splice=False ) fusion_pre_transcript.exons.append(exon) fusion_pre_transcript.seq += seq2 for ex, old_ex in ex2: exon = Exon( ex.start + offset, ex.end + offset, fusion_pre_transcript, intact_start_splice=ex.start_splice_site.intact, intact_end_splice=ex.end_splice_site.intact ) fusion_pre_transcript.exons.append(exon) fusion_pre_transcript.exon_mapping[exon.position] = old_ex return fusion_pre_transcript @classmethod def _build_single_gene_duplication(cls, ann, reference_genome, min_orf_size, max_orf_cap, min_domain_mapping_match): """ builds a fusion transcript for a single gene duplication. Note that this is an incomplete fusion transcript and still requires translations and domain information to be added """ assert ann.event_type == SVTYPE.DUP assert ann.transcript1 == ann.transcript2 fusion_pre_transcript = FusionTranscript() seq1, ex1 = cls._pull_exons(ann.transcript1, ann.break1, reference_genome[ann.break1.chr].seq) seq2, ex2 = cls._pull_exons(ann.transcript2, ann.break2, reference_genome[ann.break2.chr].seq) useq = ann.untemplated_seq front = Interval(ann.transcript1.start, ann.break2.start) back = Interval(ann.break1.start, ann.transcript1.end) flipped = False if ann.transcript1.get_strand() == STRAND.NEG: seq1, seq2 = (seq2, seq1) ex1, ex2 = (ex2, ex1) front, back = (back, front) useq = reverse_complement(useq) flipped = True fusion_pre_transcript.map_region_to_genome(ann.break1.chr, Interval(1, len(seq2)), front, flipped) fusion_pre_transcript.map_region_to_genome( ann.break1.chr, Interval(len(seq2) + len(useq) + 1, len(seq1) + len(seq2) + len(useq)), back, flipped ) fusion_pre_transcript.seq = seq2 + useq fusion_pre_transcript.break1 = len(seq2) fusion_pre_transcript.break2 = len(seq2) + len(useq) + 1 for ex, old_ex in ex2: fusion_pre_transcript.exons.append(ex) ex.reference_object = fusion_pre_transcript fusion_pre_transcript.exon_mapping[ex.position] = old_ex fusion_pre_transcript.last_five_prime_exon = ex2[-1][0] fusion_pre_transcript.first_three_prime_exon = ex1[0][0] offset = len(fusion_pre_transcript.seq) if ann.protocol == PROTOCOL.TRANS: fusion_pre_transcript.exons[-1].end_splice_site.intact = False ex1[0][0].start_splice_site.intact = False novel_exon_start = fusion_pre_transcript.exons[-1].end + 1 novel_exon_end = offset + ex1[0][0].start - 1 if novel_exon_end >= novel_exon_start: # create a novel exon e = Exon( novel_exon_start, novel_exon_end, fusion_pre_transcript, intact_start_splice=False, intact_end_splice=False ) fusion_pre_transcript.exons.append(e) for ex, old_ex in ex1: e = Exon( ex.start + offset, ex.end + offset, fusion_pre_transcript, intact_start_splice=ex.start_splice_site.intact, intact_end_splice=ex.end_splice_site.intact ) fusion_pre_transcript.exons.append(e) fusion_pre_transcript.exon_mapping[e.position] = old_ex fusion_pre_transcript.seq += seq1 return fusion_pre_transcript
[docs] @classmethod def build(cls, ann, reference_genome, min_orf_size=None, max_orf_cap=None, min_domain_mapping_match=None): """ Args: ann (Annotation): the annotation object we want to build a FusionTranscript for reference_genome (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence by template/chr name Returns: FusionTranscript: the newly built fusion transcript """ if not ann.transcript1 or not ann.transcript2: raise NotSpecifiedError('cannot produce fusion transcript for non-annotated fusions') elif not ann.event_type and ann.transcript1 == ann.transcript2: raise NotSpecifiedError('event_type must be specified to produce a fusion transcript') elif ann.untemplated_seq is None: raise NotSpecifiedError( 'cannot build a fusion transcript where the untemplated sequence has not been specified') elif len(ann.break1) > 1 or len(ann.break2) > 1: raise NotSpecifiedError('cannot build fusion transcripts for non-specific breakpoints') fusion_pre_transcript = FusionTranscript() if ann.transcript1 == ann.transcript2 and ann.event_type not in [SVTYPE.DEL, SVTYPE.INS]: # single transcript events are special cases if the breakpoints face each other # as is the case for duplications and inversions if ann.event_type == SVTYPE.DUP: fusion_pre_transcript = cls._build_single_gene_duplication( ann, reference_genome, min_orf_size, max_orf_cap, min_domain_mapping_match) elif ann.event_type == SVTYPE.INV: fusion_pre_transcript = cls._build_single_gene_inversion( ann, reference_genome, min_orf_size, max_orf_cap, min_domain_mapping_match) else: raise AttributeError('unrecognized event type') else: t1 = determine_prime(ann.transcript1, ann.break1) t2 = determine_prime(ann.transcript2, ann.break2) if t1 == t2: raise NotImplementedError('do not produce fusion transcript for anti-sense fusions') seq1, ex1 = cls._pull_exons(ann.transcript1, ann.break1, reference_genome[ann.break1.chr].seq) seq2, ex2 = cls._pull_exons(ann.transcript2, ann.break2, reference_genome[ann.break2.chr].seq) useq = ann.untemplated_seq if t1 == PRIME.FIVE: fusion_pre_transcript.break1 = len(seq1) fusion_pre_transcript.break2 = len(seq1) + len(useq) + 1 if ann.transcript1.strand == STRAND.NEG: useq = reverse_complement(useq) fusion_pre_transcript.map_region_to_genome( ann.break1.chr, (1, len(seq1)), (ann.break1.start, ann.transcript1.end), True ) else: fusion_pre_transcript.map_region_to_genome( ann.break1.chr, (1, len(seq1)), (ann.transcript1.start, ann.break1.start) ) if ann.transcript2.strand == STRAND.NEG: # t2 is the 3' transcript fusion_pre_transcript.map_region_to_genome( ann.break2.chr, (len(seq1) + len(useq) + 1, len(seq1) + len(useq) + len(seq2)), (ann.transcript2.start, ann.break2.start), True ) else: fusion_pre_transcript.map_region_to_genome( ann.break2.chr, (len(seq1) + len(useq) + 1, len(seq1) + len(useq) + len(seq2)), (ann.break2.start, ann.transcript2.end) ) else: fusion_pre_transcript.break1 = len(seq2) fusion_pre_transcript.break2 = len(seq2) + len(useq) + 1 if ann.transcript2.strand == STRAND.NEG: useq = reverse_complement(useq) fusion_pre_transcript.map_region_to_genome( ann.break2.chr, (1, len(seq2)), (ann.break2.start, ann.transcript2.end), True ) else: fusion_pre_transcript.map_region_to_genome( ann.break2.chr, (1, len(seq2)), (ann.transcript2.start, ann.break2.start) ) if ann.transcript1.strand == STRAND.NEG: # t1 is the 3' transcript fusion_pre_transcript.map_region_to_genome( ann.break1.chr, (len(seq2) + len(useq) + 1, len(seq1) + len(useq) + len(seq2)), (ann.transcript1.start, ann.break1.start) ) else: fusion_pre_transcript.map_region_to_genome( ann.break1.chr, (len(seq2) + len(useq) + 1, len(seq1) + len(useq) + len(seq2)), (ann.break1.start, ann.transcript1.end) ) seq1, seq2 = seq2, seq1 ex1, ex2 = ex2, ex1 fusion_pre_transcript.seq = seq1 + useq for ex, old_ex in ex1: fusion_pre_transcript.exons.append(ex) ex.reference_object = fusion_pre_transcript fusion_pre_transcript.exon_mapping[ex.position] = old_ex offset = len(fusion_pre_transcript.seq) if ann.protocol == PROTOCOL.TRANS: fusion_pre_transcript.exons[-1].end_splice_site.intact = False ex2[0][0].start_splice_site.intact = False novel_exon_start = fusion_pre_transcript.exons[-1].end + 1 novel_exon_end = offset + ex2[0][0].start - 1 if novel_exon_end >= novel_exon_start: # create a novel exon e = Exon( novel_exon_start, novel_exon_end, fusion_pre_transcript, intact_start_splice=False, intact_end_splice=False ) fusion_pre_transcript.exons.append(e) for ex, old_ex in ex2: e = Exon( ex.start + offset, ex.end + offset, fusion_pre_transcript, intact_start_splice=ex.start_splice_site.intact, intact_end_splice=ex.end_splice_site.intact ) fusion_pre_transcript.exons.append(e) fusion_pre_transcript.exon_mapping[e.position] = old_ex fusion_pre_transcript.seq += seq2 fusion_pre_transcript.position = Interval(1, len(fusion_pre_transcript.seq)) # add all splice variants for spl_patt in fusion_pre_transcript.generate_splicing_patterns(): fusion_spl_tx = Transcript(fusion_pre_transcript, spl_patt) fusion_pre_transcript.spliced_transcripts.append(fusion_spl_tx) # calculate the putative open reading frames orfs = calculate_orf(fusion_spl_tx.get_seq(), min_orf_size=min_orf_size) # limit the length to either only the longest ORF or anything longer than the input translations min_orf_length = max([len(o) for o in orfs] + [min_orf_size if min_orf_size else 0]) for ref_tx in [ann.transcript1, ann.transcript2]: for tlx in ref_tx.translations: min_orf_length = min(min_orf_length, len(tlx)) # filter the orfs based on size orfs = [o for o in orfs if len(o) >= min_orf_length] # if there are still too many filter to reasonable number if max_orf_cap and len(orfs) > max_orf_cap: # limit the number of orfs returned orfs = sorted(orfs, key=lambda x: len(x), reverse=True) orfs = orfs[0:max_orf_cap] # create the translations for orf in orfs: translation = Translation( orf.start - fusion_spl_tx.start + 1, orf.end - fusion_spl_tx.start + 1, fusion_spl_tx) fusion_spl_tx.translations.append(translation) # remap the domains from the original translations to the current translations for fusion_spl_tx in fusion_pre_transcript.spliced_transcripts: for new_tl in fusion_spl_tx.translations: aa_seq = new_tl.get_aa_seq() assert aa_seq[0] == 'M' translations = ann.transcript1.translations[:] if ann.transcript1 != ann.transcript2: translations += ann.transcript2.translations for translation in translations: for dom in translation.domains: try: match, total, regions = dom.align_seq(aa_seq, reference_genome) if min_domain_mapping_match is None or match / total >= min_domain_mapping_match: new_dom = Domain(dom.name, regions, new_tl) new_tl.domains.append(new_dom) except UserWarning: pass return fusion_pre_transcript
[docs] def get_seq(self, reference_genome=None, ignore_cache=False): return PreTranscript.get_seq(self)
[docs] def get_cdna_seq(self, splicing_pattern, reference_genome=None, ignore_cache=False): """ Args: splicing_pattern (:class:`list` of :class:`int`): the list of splicing positions reference_genome (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference seq by template/chr name Returns: str: the spliced cDNA seq """ return PreTranscript.get_cdna_seq(self, splicing_pattern)
@classmethod def _pull_exons(cls, transcript, breakpoint, reference_sequence): """ given a transcript and breakpoint returns the exons and sequence expected the exons are returned in the order wrt to strand (i.e. reversed from a genomic sort if they are on the negative strand). The positions of the exons are wrt to the sequence being returned (starts at 1). """ if len(breakpoint) > 1: raise AttributeError('cannot pull exons on non-specific breakpoints') new_exons = [] s = '' exons = sorted(transcript.exons, key=lambda x: x.start) if breakpoint.orient == ORIENT.LEFT: # five prime for i, exon in enumerate(exons): intact_start_splice = True intact_end_splice = True if breakpoint.start < exon.start: # =====----|----|---- if i > 0: # add intron temp = reference_sequence[exons[i - 1].end:breakpoint.start] s += temp break else: if i > 0: # add intron temp = reference_sequence[exons[i - 1].end:exon.start - 1] s += temp if breakpoint.start <= exon.end_splice_site.end: intact_end_splice = False if breakpoint.start <= exon.start_splice_site.end: intact_start_splice = False t = min(breakpoint.start, exon.end) e = Exon( len(s) + 1, len(s) + t - exon.start + 1, intact_start_splice=intact_start_splice, intact_end_splice=intact_end_splice, strand=STRAND.POS ) temp = reference_sequence[exon.start - 1:t] s += temp new_exons.append((e, exon)) elif breakpoint.orient == ORIENT.RIGHT: # three prime for i, exon in enumerate(exons): intact_start_splice = True intact_end_splice = True if breakpoint.start < exon.start: # --==|====|==== if i > 0: # add last intron t = max(exons[i - 1].end + 1, breakpoint.start) temp = reference_sequence[t - 1:exon.start - 1] s += temp if Interval.overlaps(breakpoint, exon.start_splice_site): intact_start_splice = False # add the exon e = Exon( len(s) + 1, len(s) + len(exon), intact_start_splice=intact_start_splice, intact_end_splice=intact_end_splice, strand=STRAND.POS ) temp = reference_sequence[exon.start - 1:exon.end] assert len(temp) == len(e) s += temp new_exons.append((e, exon)) elif breakpoint.start <= exon.end: # --|-====|==== intact_start_splice = False temp = reference_sequence[breakpoint.start - 1:exon.end] if Interval.overlaps(breakpoint, exon.end_splice_site): intact_end_splice = False # add the exon e = Exon( len(s) + 1, len(s) + len(temp), intact_start_splice=intact_start_splice, intact_end_splice=intact_end_splice, strand=STRAND.POS ) s += temp new_exons.append((e, exon)) else: raise NotSpecifiedError('breakpoint orientation must be specified to pull exons') if transcript.get_strand() == STRAND.NEG: # reverse complement the sequence and reverse the exons temp = new_exons new_exons = [] for ex, old_exon in temp[::-1]: e = Exon( len(s) - ex.end + 1, len(s) - ex.start + 1, intact_start_splice=ex.end_splice_site.intact, intact_end_splice=ex.start_splice_site.intact, strand=STRAND.POS ) new_exons.append((e, old_exon)) s = reverse_complement(s) elif transcript.get_strand() != STRAND.POS: raise NotSpecifiedError('transcript strand must be specified to pull exons') return str(s), new_exons