import itertools
from .base import Evidence
from ..align import SplitAlignment, call_read_events
from ..bam import cigar as _cigar
from ..annotate.variant import overlapping_transcripts
from ..breakpoint import Breakpoint
from ..constants import ORIENT, PROTOCOL, STRAND, SVTYPE, CIGAR
from ..interval import Interval
class GenomeEvidence(Evidence):
    """
    Evidence for a breakpoint pair under the genome protocol.

    Construction sets ``protocol`` to ``PROTOCOL.GENOME`` and pre-computes the read
    collection windows (``outer_window*``, ``inner_window*`` and, for putative insertions
    or duplications, ``compatible_window*``) for both breakpoints.
    """

    def __init__(self, *pos, **kwargs):
        """
        All positional and keyword arguments are passed through unchanged to ``Evidence.__init__``
        """
        Evidence.__init__(self, *pos, **kwargs)
        self.protocol = PROTOCOL.GENOME

        self.outer_window1 = self.generate_window(self.break1)
        self.outer_window2 = self.generate_window(self.break2)

        # the inner windows extend one read length (plus the call error) to either side of
        # each breakpoint, and never start before position 1
        self.inner_window1 = Interval(
            max(self.break1.start - self.call_error - self.read_length + 1, 1),
            self.break1.end + self.call_error + self.read_length - 1,
        )
        self.inner_window2 = Interval(
            max(self.break2.start - self.call_error - self.read_length + 1, 1),
            self.break2.end + self.call_error + self.read_length - 1,
        )

        # pick the orientations used for the 'compatible' windows; when neither case
        # applies the compatible windows are not assigned here
        if SVTYPE.INS in self.putative_event_types():
            union_size = len(self.break1 | self.break2)
            # only when the combined interval is longer than each breakpoint on its own
            if union_size <= len(self.break1) or union_size <= len(self.break2):
                return
            first_orient, second_orient = ORIENT.RIGHT, ORIENT.LEFT
        elif SVTYPE.DUP in self.putative_event_types():
            first_orient, second_orient = ORIENT.LEFT, ORIENT.RIGHT
        else:
            return

        flipped1 = Breakpoint(
            self.break1.chr, self.break1.start, self.break1.end, orient=first_orient, strand=self.break1.strand)
        flipped2 = Breakpoint(
            self.break2.chr, self.break2.start, self.break2.end, orient=second_orient, strand=self.break2.strand)
        self.compatible_window1 = self.generate_window(flipped1)
        self.compatible_window2 = self.generate_window(flipped2)

    def generate_window(self, breakpoint):
        """
        computes the range in which supporting reads should be searched for a breakpoint,
        using the ``max_expected_fragment_size``, ``call_error`` and ``read_length``
        settings of this evidence object

        Args:
            breakpoint (Breakpoint): the breakpoint we are generating the evidence window for

        Returns:
            Interval: the range where reads should be read from the bam looking for evidence for this event
        """
        # default: a full fragment (plus the call error) on both sides
        lower = breakpoint.start - self.max_expected_fragment_size - self.call_error + 1
        upper = breakpoint.end + self.max_expected_fragment_size + self.call_error - 1

        # an oriented breakpoint only gets a read length (plus the call error) on one side
        orientation = breakpoint.orient
        if orientation == ORIENT.LEFT:
            upper = breakpoint.end + self.call_error + self.read_length - 1
        elif orientation == ORIENT.RIGHT:
            lower = breakpoint.start - self.call_error - self.read_length + 1

        # neither bound may fall below position 1
        return Interval(max(1, lower), max(upper, 1))

    def compute_fragment_size(self, read, mate=None):
        """
        Args:
            read: the read whose ``template_length`` is used
            mate: not used by this implementation

        Returns:
            Interval: built from the absolute value of the template length of the read
        """
        fragment_size = abs(read.template_length)
        return Interval(fragment_size)
class TranscriptomeEvidence(Evidence):
    """
    Evidence for a breakpoint pair under the transcriptome protocol.

    Window and distance calculations take the transcripts overlapping either breakpoint
    into account, so that positions are measured along the transcript (cdna) coordinates
    where a transcript model is available.
    """

    def __init__(self, annotations, *pos, **kwargs):
        """
        Args:
            annotations: the reference annotations, passed to ``overlapping_transcripts``
                together with each breakpoint
            *pos: passed through to ``Evidence.__init__``
            **kwargs: passed through to ``Evidence.__init__``
        """
        Evidence.__init__(self, *pos, **kwargs)
        # set the transcriptome specific overrides
        if self.trans_min_mapping_quality is not None:
            self.min_mapping_quality = self.trans_min_mapping_quality
        if self.trans_fetch_reads_limit is not None:
            self.fetch_reads_limit = self.trans_fetch_reads_limit
        self.protocol = PROTOCOL.TRANS

        # get the list of overlapping transcripts
        self.overlapping_transcripts = (
            overlapping_transcripts(annotations, self.break1) | overlapping_transcripts(annotations, self.break2)
        )

        self.outer_window1 = self.generate_window(self.break1)
        self.outer_window2 = self.generate_window(self.break2)
        tgt = self.call_error + self.read_length - 1
        self.inner_window1 = (
            self.traverse(self.break1.end, tgt, ORIENT.RIGHT) | self.traverse(self.break1.start, tgt, ORIENT.LEFT)
        )
        self.inner_window2 = (
            self.traverse(self.break2.start, tgt, ORIENT.LEFT) | self.traverse(self.break2.end, tgt, ORIENT.RIGHT)
        )

        # orientations for the 'compatible' windows. These are only computed for putative
        # insertions (when the combined breakpoint interval is longer than each breakpoint
        # on its own) and for putative duplications
        compatible_orients = None
        if SVTYPE.INS in self.putative_event_types():
            comb = len(self.break1 | self.break2)
            if comb > len(self.break1) and comb > len(self.break2):
                compatible_orients = (ORIENT.RIGHT, ORIENT.LEFT)
        elif SVTYPE.DUP in self.putative_event_types():
            compatible_orients = (ORIENT.LEFT, ORIENT.RIGHT)

        if compatible_orients is not None:
            orient1, orient2 = compatible_orients
            compt_break1 = Breakpoint(
                self.break1.chr, self.break1.start, self.break1.end, orient=orient1, strand=self.break1.strand)
            compt_break2 = Breakpoint(
                self.break2.chr, self.break2.start, self.break2.end, orient=orient2, strand=self.break2.strand)
            self.compatible_window1 = self.generate_window(compt_break1)
            self.compatible_window2 = self.generate_window(compt_break2)

    def traverse(self, start, distance, direction, strand=STRAND.NS, chrom=None):
        """
        given some genomic position and a distance. Uses the overlapping transcripts to
        compute all possible genomic end positions at that distance if intronic
        positions are ignored

        Args:
            start (int): the genomic start position
            distance (int): the amount of exonic/intergenic units to traverse
            direction (ORIENT): the direction wrt to the positive/forward reference strand to traverse
            strand (STRAND): only transcripts passing ``STRAND.compare`` against this strand are used
            chrom (str): when given, only transcripts on this chromosome are used

        Returns:
            Interval: built from every end position that differs from the plain genomic
            traversal; when there is none, from the plain genomic end position alone
        """
        transcripts = self._select_transcripts(chrom, strand)
        is_left = direction == ORIENT.LEFT
        stick_direction = ORIENT.LEFT if is_left else ORIENT.RIGHT
        genomic_end_positions = set()
        # the end position when no transcript model is applied
        normal_end = GenomeEvidence.traverse(start, distance, direction).start

        for transcript in itertools.chain.from_iterable(pre_transcript.transcripts for pre_transcript in transcripts):
            # skip transcripts that lie entirely in the opposite direction from the start
            reference_object = transcript.reference_object
            if (is_left and start < reference_object.start) or (not is_left and start > reference_object.end):
                continue
            # convert the start to cdna coordinates
            cdna_start, start_shift = transcript.convert_genomic_to_nearest_cdna(
                start, stick_direction=stick_direction, allow_outside=True)
            if abs(start_shift) > distance:  # entirely within an intron
                continue
            # moving left on the genome moves forward along a reverse-strand transcript
            if transcript.is_reverse:
                if is_left:
                    cdna_end = cdna_start + (distance - start_shift)
                else:
                    cdna_end = cdna_start - (distance + start_shift)
            else:
                if is_left:
                    cdna_end = cdna_start - (distance - start_shift)
                else:
                    cdna_end = cdna_start + (distance + start_shift)
            # presumably cdna coordinates have no position 0, so values at or below zero
            # are moved one further out -- TODO confirm against convert_cdna_to_genomic
            if cdna_end <= 0:
                cdna_end -= 1
            # convert the cdna end back to genomic coordinates
            genomic_end = transcript.convert_cdna_to_genomic(cdna_end)
            if genomic_end == normal_end:
                continue
            genomic_end_positions.add(genomic_end)

        if not genomic_end_positions:
            genomic_end_positions.add(normal_end)
        return Interval.from_iterable(genomic_end_positions)

    def compute_fragment_size(self, read, mate):
        """
        Args:
            read: one read of the pair
            mate: the other read of the pair

        Returns:
            Interval: ``Interval(0)`` when the reads have different reference names. Otherwise
            the result of :meth:`distance` between the start of the read with the smaller
            ``reference_start`` and the ``reference_end`` of the other, with both bounds
            increased by one
        """
        # order the pair by reference start position
        if read.reference_start > mate.reference_start:
            read, mate = mate, read
        if read.reference_name == mate.reference_name:
            start, end = self.distance(read.reference_start + 1, mate.reference_end, chrom=read.reference_name)
            return Interval(start + 1, end + 1)
        return Interval(0)

    def _select_transcripts(self, chrom=None, strand=STRAND.NS):
        """
        Returns:
            list: the overlapping transcripts on the given chromosome (any chromosome when
            chrom is None) whose strand passes ``STRAND.compare`` against the given strand
        """
        return [
            transcript
            for transcript in self.overlapping_transcripts
            if (chrom is None or transcript.get_chr() == chrom) and STRAND.compare(transcript.strand, strand)
        ]

    def distance(self, start, end, strand=STRAND.NS, chrom=None):
        """
        give the current list of transcripts, computes the putative exonic/intergenic distance
        given two genomic positions. Intronic positions are ignored

        Each candidate distance is classified by how many of the two positions needed a
        non-zero shift to reach the nearest cdna position: none (exonic), one (mixed) or
        both (inter). The first non-empty class, in that order, is returned. When no
        transcript yields a distance the plain ``Evidence.distance`` is returned.

        Args:
            start (int): the first genomic position
            end (int): the second genomic position
            strand (STRAND): only transcripts passing ``STRAND.compare`` against this strand are used
            chrom (str): when given, only transcripts on this chromosome are used

        Returns:
            Interval: the range of putative distances
        """
        exonic = []
        mixed = []
        inter = []
        transcripts = self._select_transcripts(chrom, strand)
        # try to calculate assuming the positions are exonic
        for transcript in itertools.chain.from_iterable(t.transcripts for t in transcripts):
            # skip transcripts where this test against the start/end interval is falsy
            # (presumably no overlap with the transcript position -- verify Interval.__and__)
            if not transcript.reference_object.position & Interval(start, end):
                continue
            cdna_start, start_shift = transcript.convert_genomic_to_nearest_cdna(start)
            cdna_end, end_shift = transcript.convert_genomic_to_nearest_cdna(end)
            dist = abs(cdna_end - cdna_start) + abs(start_shift) + abs(end_shift)
            if cdna_start == cdna_end:
                # both positions resolve to the same cdna position: only the shifts differ
                dist = abs(start_shift - end_shift)
            if start_shift and end_shift:
                inter.append(dist)
            elif start_shift or end_shift:
                mixed.append(dist)
            else:
                exonic.append(dist)
        if exonic:
            return Interval.from_iterable(exonic)
        elif mixed:
            return Interval.from_iterable(mixed)
        elif inter:
            return Interval.from_iterable(inter)
        return Evidence.distance(start, end)

    def generate_window(self, breakpoint):
        """
        given some input breakpoint uses the current evidence setting to determine an
        appropriate window/range of where one should search for supporting reads

        The genome window is computed first; the amount it extends to either side of the
        breakpoint is then traversed along the overlapping transcripts instead.

        Args:
            breakpoint (Breakpoint): the breakpoint we are generating the evidence window for

        Returns:
            Interval: the range where reads should be read from the bam looking for evidence for this event
        """
        window = GenomeEvidence.generate_window(self, breakpoint)
        tgt_left = Evidence.distance(window.start, breakpoint.start)  # amount to expand to the left
        tgt_right = Evidence.distance(breakpoint.end, window.end)  # amount to expand to the right
        window1 = self.traverse(
            breakpoint.start, tgt_left.end, ORIENT.LEFT, strand=breakpoint.strand, chrom=breakpoint.chr)
        window2 = self.traverse(
            breakpoint.end, tgt_right.end, ORIENT.RIGHT, strand=breakpoint.strand, chrom=breakpoint.chr)
        return window1 | window2

    def min_cds_shift(self, pos, strand=STRAND.NS, chrom=None):
        """
        Args:
            pos (int): the genomic position
            strand (STRAND): only transcripts passing ``STRAND.compare`` against this strand are used
            chrom (str): when given, only transcripts on this chromosome are used

        Returns:
            int: the exon boundary (start or end) of the selected transcripts which is closest
            to pos. Note that this is the boundary position itself, not the offset from pos

        Raises:
            ValueError: when the selected transcripts have no exons (``min`` of an empty set)
        """
        exon_boundaries = set()
        for transcript in self._select_transcripts(chrom, strand):
            for exon in transcript.exons:
                exon_boundaries.update({exon.start, exon.end})
        return min(exon_boundaries, key=lambda x: abs(x - pos))

    def exon_boundary_shift_cigar(self, read):
        """
        given an input read, converts deletions to N when the deletion matches the exon boundaries. Also shifts alignments
        to correspond to the exon boundaries where possible

        Args:
            read: the read; its ``reference_start``, ``reference_name`` and ``cigar`` are used

        Returns:
            the result of ``_cigar.join`` on the adjusted list of (state, length) tuples
        """
        # collapsed transcript model
        exon_ends = {
            exon.end
            for transcript in self._select_transcripts(read.reference_name)
            for exon in transcript.exons
        }
        refseq = self.reference_genome[read.reference_name].seq

        # read the cigar once: if read is a pysam read the property builds a new list on
        # every access -- TODO confirm the read type
        cigar = read.cigar
        last_index = len(cigar) - 1
        reference_pos = read.reference_start
        new_cigar = []

        for i, (state, freq) in enumerate(cigar):
            # shift to coincide with exon boundaries if possible
            if new_cigar and i < last_index and exon_ends and state in {CIGAR.D, CIGAR.N}:
                next_state = cigar[i + 1][0]
                prev_state, prev_freq = new_cigar[-1]
                # compare deletions surrounded by exact alignments. Indels at exon boundaries will
                # be aligned the same as genome indels
                if next_state == prev_state == CIGAR.EQ:
                    nearest_end_boundary = min(exon_ends, key=lambda x: abs(x - reference_pos - 1))
                    prev_alignment_seq = refseq[reference_pos - prev_freq:reference_pos]
                    next_reference_pos = reference_pos + freq
                    next_alignment_seq = refseq[max(reference_pos, next_reference_pos - prev_freq):next_reference_pos]
                    # count how many bases at the end of the preceding match are repeated at
                    # the end of the deleted sequence; the deletion can move left that far
                    shift = 0
                    for prev_base, next_base in zip(prev_alignment_seq[::-1], next_alignment_seq[::-1]):
                        if prev_base != next_base:
                            break
                        shift += 1
                    # do not move past the nearest exon end
                    shift = min(shift, reference_pos - nearest_end_boundary)
                    if shift > 0:
                        # take the shifted bases off the preceding match and re-add them as a
                        # match following the deletion
                        if shift == prev_freq:
                            del new_cigar[-1]
                        else:
                            new_cigar[-1] = (prev_state, prev_freq - shift)
                        new_cigar.extend([(state, freq), (CIGAR.EQ, shift)])
                        reference_pos += freq
                        continue
            if state in _cigar.REFERENCE_ALIGNED_STATES:
                reference_pos += freq
            new_cigar.append((state, freq))

        # mark intron deletions as N instead of D
        reference_pos = read.reference_start
        for i, (state, freq) in enumerate(new_cigar):
            if state == CIGAR.D:
                # NOTE(review): no chrom is given here, so transcripts are not restricted to
                # the chromosome of the read as they are above -- confirm this is intended
                dist = self.distance(reference_pos, reference_pos + freq + 1)
                # a transcript distance of 1 across the deletion means the flanking
                # positions are adjacent once intronic positions are ignored
                if dist.start == 1:
                    state = CIGAR.N
            if state in _cigar.REFERENCE_ALIGNED_STATES:
                reference_pos += freq
            new_cigar[i] = (state, freq)
        return _cigar.join(new_cigar)

    def standardize_read(self, read):
        """
        applies ``Evidence.standardize_read`` and then replaces the cigar of the read with
        the result of :meth:`exon_boundary_shift_cigar`

        Returns:
            the standardized read
        """
        read = Evidence.standardize_read(self, read)
        read.cigar = self.exon_boundary_shift_cigar(read)
        return read