Source code for mavis.annotate.splicing

import itertools

from .base import BioInterval
from .constants import ACCEPTOR_SEQ, DONOR_SEQ, SPLICE_SITE_RADIUS, SPLICE_SITE_TYPE, SPLICE_TYPE
from ..constants import reverse_complement, STRAND
from ..interval import Interval


class SplicingPattern(list):
    """
    A list of splice sites to be joined together, tagged with the splice
    type (``splice_type``) that the combination represents.
    """

    def __init__(self, *args, splice_type=SPLICE_TYPE.NORMAL):
        list.__init__(self, *args)
        self.splice_type = splice_type

    def __str__(self):
        # each member is rendered as D<pos> or A<pos>; a trailing '*' marks a
        # site that is not intact
        labels = [
            '{}{}{}'.format(
                'D' if member.type == SPLICE_SITE_TYPE.DONOR else 'A',
                member.pos,
                '' if member.intact else '*'
            )
            for member in self
        ]
        return '[{}]'.format(', '.join(labels))

    @classmethod
    def classify(cls, pattern, original_sites):
        """
        Decide which splice type a pattern represents by looking at which of
        the original sites it leaves unused.

        Args:
            pattern: the splice sites used by the pattern; must contain an even
                number of sites (sorted here and read as donor/acceptor pairs)
            original_sites: all of the sites the pattern was chosen from

        Returns:
            one of the ``SPLICE_TYPE`` values NORMAL, SKIP, MULTI_SKIP, RETAIN,
            MULTI_RETAIN or COMPLEX

        Raises:
            AssertionError: if the pattern, or any group of unused sites, has
                an odd number of members
        """
        ordered = sorted(pattern)
        assert len(ordered) % 2 == 0

        skipped_sites = []  # unused sites lying inside a donor -> acceptor jump
        retained_sites = []  # unused sites lying outside of every jump

        for donor, acceptor in zip(ordered[0::2], ordered[1::2]):
            # original sites strictly between this donor and its acceptor
            inside = [site for site in original_sites if site > donor and site < acceptor]
            assert len(inside) % 2 == 0
            skipped_sites.extend(inside)

        for acceptor, donor in zip(ordered[1::2], ordered[2::2]):
            # original sites strictly between an acceptor and the next donor
            inside = [site for site in original_sites if site > acceptor and site < donor]
            assert len(inside) % 2 == 0
            retained_sites.extend(inside)

        if ordered:
            # unused sites before the first donor or after the last acceptor
            leading = [site for site in original_sites if site < ordered[0]]
            assert len(leading) % 2 == 0
            retained_sites.extend(leading)

            trailing = [site for site in original_sites if site > ordered[-1]]
            retained_sites.extend(trailing)
            assert len(trailing) % 2 == 0

        # neighbouring entries of the retained list whose positions are more
        # than one apart each count towards the retained total
        retained_count = sum(
            1
            for left, right in zip(retained_sites, retained_sites[1:])
            if abs(left.pos - right.pos) > 1
        )
        skipped_count = len(skipped_sites) // 2

        if retained_count + skipped_count == 0:
            return SPLICE_TYPE.NORMAL
        if retained_count == 0:
            return SPLICE_TYPE.MULTI_SKIP if skipped_count > 1 else SPLICE_TYPE.SKIP
        if skipped_count == 0:
            return SPLICE_TYPE.MULTI_RETAIN if retained_count > 1 else SPLICE_TYPE.RETAIN
        return SPLICE_TYPE.COMPLEX

    @classmethod
    def generate_patterns(cls, sites, is_reverse=False):
        """
        Build every combination of intact splice sites that can be connected
        as a splicing pattern.

        Args:
            sites: the candidate splice sites
            is_reverse (bool): when True the sites are walked in descending order

        Returns:
            :class:`list` of :class:`SplicingPattern`: one pattern per
            combination, each tagged with its classified splice type; a single
            empty pattern when there is nothing to combine

        see :ref:`theory - predicting splicing patterns <theory-predicting-splicing-patterns>`
        """
        if not sites:
            return [SplicingPattern()]
        ordered_sites = sorted(sites, reverse=is_reverse)

        # collapse the intact sites into runs of consecutive sites of the same type
        runs = [
            list(members)
            for _, members in itertools.groupby(
                (site for site in ordered_sites if site.intact),
                key=lambda site: site.type
            )
        ]

        # drop a leading run of acceptors and a trailing run of donors
        if runs and runs[0][0].type == SPLICE_SITE_TYPE.ACCEPTOR:
            runs = runs[1:]
        if runs and runs[-1][0].type == SPLICE_SITE_TYPE.DONOR:
            runs = runs[:-1]

        if not runs:
            return [SplicingPattern()]

        # pick one site from every run, in every possible way
        return [
            SplicingPattern(combo, splice_type=cls.classify(combo, ordered_sites))
            for combo in itertools.product(*runs)
        ]
class SpliceSite(BioInterval):
    """
    An interval around a splice position (``pos``), carrying the site type
    and whether the site is intact.
    """

    def __init__(self, ref, pos, site_type, intact=True, start=None, end=None, strand=None, seq=None):
        """
        Args:
            ref: the reference object; asked for its strand when a bound has to
                be computed and no strand is given
            pos (int): the splice position, must lie within [start, end]
            site_type: the type of site, validated with ``SPLICE_SITE_TYPE.enforce``
            intact (bool): stored as-is on the instance
            start (int): interval start; defaults to a window around ``pos``
            end (int): interval end; defaults to a window around ``pos``
            strand: the strand of the site
            seq (str): the sequence of the site

        Raises:
            AssertionError: if ``pos`` falls outside of the interval
        """
        if start is None or end is None:
            self.strand = strand if strand else ref.get_strand()
            # the default window covers 2 * SPLICE_SITE_RADIUS positions and is
            # shifted by one depending on the strand / site type combination
            if self.strand == STRAND.NEG:
                extends_left = site_type == SPLICE_SITE_TYPE.DONOR
            else:
                extends_left = site_type == SPLICE_SITE_TYPE.ACCEPTOR

            if extends_left:
                default_start = pos - SPLICE_SITE_RADIUS
                default_end = pos + SPLICE_SITE_RADIUS - 1
            else:
                default_start = pos - SPLICE_SITE_RADIUS + 1
                default_end = pos + SPLICE_SITE_RADIUS

            # only fill in whichever bound the caller left out
            if start is None:
                start = default_start
            if end is None:
                end = default_end

        BioInterval.__init__(self, ref, start, end, seq=seq, strand=strand)
        assert pos <= self.end and pos >= self.start
        self.pos = pos
        self.intact = intact
        self.type = SPLICE_SITE_TYPE.enforce(site_type)

    def __or__(self, other):
        """Delegate to ``Interval.__or__`` for combining with another interval."""
        return Interval.__or__(self, other)

    def __repr__(self):
        reference = self.reference_object
        # show the reference by name when it has one, otherwise the object itself
        label = getattr(reference, 'name', reference)
        seq_part = ', seq=' + self.seq if self.seq else ''
        return '{}(type={}, {}:{}({}-{}){}, strand={})'.format(
            self.__class__.__name__,
            SPLICE_SITE_TYPE.reverse(self.type),
            label,
            self.pos,
            self.start,
            self.end,
            seq_part,
            self.get_strand())
def predict_splice_sites(input_sequence, is_reverse=False):
    """
    Scan a sequence for the expected donor and acceptor sequence patterns and
    report each hit as a putative splice site.

    Args:
        input_sequence (str): input sequence with respect to the positive/forward strand
        is_reverse (bool): True when the sequence is transcribed on the reverse strand

    Return:
        list of SpliceSite: the putative splice sites, donors first then
        acceptors, in 1-based coordinates relative to ``input_sequence``
    """
    sequence = reverse_complement(input_sequence) if is_reverse else input_sequence

    def site_from_match(match, splice_type):
        # the patterns carry two capture groups: the bases on either side of the site
        prefix, suffix = match.group(1), match.group(2)
        return SpliceSite(
            None,
            start=match.start() + 1,  # regex offsets are 0-based, sites are 1-based
            end=match.end(),
            pos=match.start() + len(prefix),
            seq=prefix + suffix,
            site_type=splice_type,
            strand=STRAND.POS)

    def scan(patterns, splice_type):
        # keep only the first site reported at any given position
        found = []
        seen_positions = set()
        for regex in patterns:
            for match in regex.finditer(sequence):
                candidate = site_from_match(match, splice_type)
                if candidate.pos not in seen_positions:
                    seen_positions.add(candidate.pos)
                    found.append(candidate)
        return found

    sites = scan(DONOR_SEQ, SPLICE_SITE_TYPE.DONOR) + scan(ACCEPTOR_SEQ, SPLICE_SITE_TYPE.ACCEPTOR)
    if not is_reverse:
        return sites

    length = len(sequence)

    def mirror(site):
        # express a site found on the reverse complement in forward coordinates
        flipped_start = length - site.end + 1
        return SpliceSite(
            None,
            start=flipped_start,
            end=length - site.start + 1,
            seq=reverse_complement(site.seq),
            strand=STRAND.NEG,
            pos=flipped_start + (site.end - site.pos),
            site_type=site.type)

    return [mirror(site) for site in sites]