from __future__ import division
from copy import copy as _copy
from .constants import CIGAR, COLUMNS, DNA_ALPHABET, ORIENT, reverse_complement, STRAND, SVTYPE
from .error import InvalidRearrangement, NotSpecifiedError
from .interval import Interval
[docs]class Breakpoint(Interval):
"""
class for storing information about a SV breakpoint
coordinates are given as 1-indexed
"""
@property
def key(self):
return (self.chr, self.start, self.end, self.orient, self.strand)
def __init__(self, chr, start, end=None, orient=ORIENT.NS, strand=STRAND.NS, seq=None):
"""
Args:
chr (str): the chromosome
start (int): the genomic position of the breakpoint
end (int): if the breakpoint is uncertain (a range) then specify the end of the range here
orient (ORIENT): the orientation (which side is retained at the break)
strand (STRAND): the strand
seq (str): the seq
Examples:
>>> Breakpoint('1', 1, 2)
>>> Breakpoint('1', 1)
>>> Breakpoint('1', 1, 2, 'R', )
>>> Breakpoint('1', 1, orient='R')
"""
from .annotate.base import ReferenceName
Interval.__init__(self, start, end)
self.orient = ORIENT.enforce(orient)
self.chr = ReferenceName(chr)
self.strand = STRAND.enforce(strand)
self.seq = seq
def __repr__(self):
strand = '' if self.strand == STRAND.NS else self.strand
orient = '' if self.orient == ORIENT.NS else self.orient
return 'Breakpoint({0}:{1}{2}{3}{4})'.format(
self.chr,
self.start,
'-' + str(self.end) if self.end != self.start else '',
orient,
strand
)
def __eq__(self, other):
if not hasattr(other, 'key'):
return False
return self.key == other.key
def __hash__(self):
return hash(self.key)
[docs] def to_dict(self):
return {
'chr': self.chr,
'start': self.start,
'end': self.end,
'strand': self.strand,
'seq': self.seq,
'orientation': self.orient,
'type': self.__class__.__name__
}
[docs]class BreakpointPair:
"""
"""
def __getattr__(self, attr):
data = object.__getattribute__(self, 'data')
try:
return data[COLUMNS[attr]]
except (KeyError, AttributeError):
try:
return data[attr]
except KeyError:
pass
raise AttributeError(attr)
def __getitem__(self, index):
try:
index = int(index)
except ValueError:
raise IndexError('index input accessor must be an integer', index)
if index == 0:
return self.break1
elif index == 1:
return self.break2
raise IndexError('index input accessor is out of bounds: 1 or 2 only', index)
def __hash__(self):
return hash((self.break1, self.break2, self.opposing_strands, self.stranded, self.untemplated_seq))
def __eq__(self, other):
for attr in ['break1', 'break2', 'opposing_strands', 'stranded', 'untemplated_seq']:
if not hasattr(other, attr):
return False
elif getattr(self, attr) != getattr(other, attr):
return False
return True
def __lt__(self, other):
if self.break1.key < other.break1.key:
return True
elif self.break1.key > other.break1.key:
return False
elif self.break2.key < other.break2.key:
return True
elif self.break2.key > other.break2.key:
return False
elif self.untemplated_seq == other.untemplated_seq:
return False
elif self.untemplated_seq is None:
return False
elif other.untemplated_seq is None:
return True
return self.untemplated_seq < other.untemplated_seq
@property
def interchromosomal(self):
""":class:`bool`: True if the breakpoints are on different chromosomes, False otherwise"""
if self.break1.chr == self.break2.chr:
return False
return True
[docs] def copy(self):
temp = _copy(self)
temp.data = {}
temp.data.update(self.data)
temp.break1 = _copy(self.break1)
temp.break2 = _copy(self.break2)
return temp
def __init__(self, b1, b2, stranded=False, opposing_strands=None, untemplated_seq=None, data=None, **kwargs):
"""
Args:
b1 (Breakpoint): the first breakpoint
b2 (Breakpoint): the second breakpoint
stranded (bool): if not stranded then +/- is equivalent to -/+
opposing_strands (bool): are the strands at the breakpoint opposite? i.e. +/- instead of +/+
untemplated_seq (str): seq between the breakpoints that is not part of either breakpoint
data (dict): optional dictionary of attributes associated with this pair
Note:
untemplated_seq should always be given wrt to the positive/forward reference strand
Example:
>>> BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 9999), opposing_strands=True)
>>> BreakpointPair(Breakpoint('1', 1, strand='+'), Breakpoint('1', 9999, strand='-'))
"""
if b1.key[:3] > b2.key[:3]:
self.break1 = b2
self.break2 = b1
else:
self.break1 = b1
self.break2 = b2
self.stranded = stranded
self.opposing_strands = opposing_strands
if self.break1.orient != ORIENT.NS and self.break2.orient != ORIENT.NS:
if self.opposing_strands is not None:
if (self.break1.orient == self.break2.orient and not self.opposing_strands) \
or (self.break1.orient != self.break2.orient and self.opposing_strands):
raise InvalidRearrangement(
'invalid breakpoint pair cannot form a valid combination', b1, b2, self.opposing_strands)
else:
self.opposing_strands = self.break1.orient == self.break2.orient
# between break1 and break2 not in either
self.untemplated_seq = untemplated_seq
self.data = {}
if data is not None:
self.data.update(data)
conflicts = set(data.keys()) & set(kwargs.keys())
if conflicts:
raise TypeError('data got multiple values for data elements:', conflicts)
self.data.update(kwargs)
if self.break1.strand != STRAND.NS and self.break2.strand != STRAND.NS:
opposing = self.break1.strand != self.break2.strand
if self.opposing_strands is None:
self.opposing_strands = opposing
elif self.opposing_strands != opposing:
raise AssertionError(
'conflict in input arguments, opposing_strands must agree with input breakpoints'
' when the strand has been specified'
)
if self.opposing_strands is None:
raise NotSpecifiedError('must specify if opposing_strands')
if self.stranded and STRAND.NS in [self.break1.strand, self.break2.strand]:
raise NotSpecifiedError('if stranded is specified, breakpoint strands cannot be \'not specified\'')
# try classifying to make sure it's a valid combination
BreakpointPair.classify(self)
def __str__(self):
return 'BPP({}, {}{}{})'.format(
str(self.break1),
str(self.break2),
', opposing={}'.format(self.opposing_strands) if not self.stranded else '',
', seq=' + repr(self.untemplated_seq) if self.untemplated_seq is not None else ''
)
[docs] def flatten(self):
"""
returns the key-value self for the breakpoint self information as
can be written directly as a tab row
"""
row = {}
row.update(self.data)
temp = {
COLUMNS.break1_chromosome: self.break1.chr,
COLUMNS.break1_position_start: self.break1.start,
COLUMNS.break1_position_end: self.break1.end,
COLUMNS.break1_orientation: self.break1.orient,
COLUMNS.break1_strand: self.break1.strand,
COLUMNS.break2_chromosome: self.break2.chr,
COLUMNS.break2_position_start: self.break2.start,
COLUMNS.break2_position_end: self.break2.end,
COLUMNS.break2_orientation: self.break2.orient,
COLUMNS.break2_strand: self.break2.strand,
COLUMNS.opposing_strands: self.opposing_strands,
COLUMNS.stranded: self.stranded,
COLUMNS.untemplated_seq: self.untemplated_seq,
COLUMNS.break1_seq: self.break1.seq,
COLUMNS.break2_seq: self.break2.seq
}
for col in temp:
temp[col] = str(temp[col])
row.update(temp)
return row
[docs] @classmethod
def classify(cls, pair, distance=None):
"""
uses the chr, orientations and strands to determine the
possible structural_variant types that this pair could support
Args:
pair (BreakpointPair): the pair to classify
distance (callable): if defined, will be passed to net size to use in narrowing the list of putative types (del vs ins)
Returns:
:class:`list` of :any:`SVTYPE`: a list of possible SVTYPE
Example:
>>> bpp = BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 9999), opposing_strands=True)
>>> BreakpointPair.classify(bpp)
['inversion']
>>> bpp = BreakpointPair(Breakpoint('1', 1, orient='L'), Breakpoint('1', 9999, orient='R'), opposing_strands=False)
>>> BreakpointPair.classify(bpp)
{'deletion', 'insertion'}
see :ref:`related theory documentation <theory-classifying-events>`
"""
if not pair.interchromosomal: # intrachromosomal
if pair.opposing_strands:
if (pair.break1.orient == ORIENT.LEFT and pair.break2.orient == ORIENT.RIGHT) \
or (pair.break1.orient == ORIENT.RIGHT and pair.break2.orient == ORIENT.LEFT):
raise InvalidRearrangement(pair)
return {SVTYPE.INV}
else:
if (pair.break1.orient == ORIENT.LEFT and pair.break2.orient == ORIENT.LEFT) \
or (pair.break1.orient == ORIENT.RIGHT and pair.break2.orient == ORIENT.RIGHT):
raise InvalidRearrangement(pair)
elif pair.break1.orient == ORIENT.LEFT or pair.break2.orient == ORIENT.RIGHT:
if distance:
try:
if pair.net_size(distance).start > 0:
return {SVTYPE.INS}
elif pair.net_size(distance).end < 0:
return {SVTYPE.DEL}
except ValueError:
pass
elif len(pair.break1) == 1 and len(pair.break2) == 1 and abs(pair.break1.start - pair.break2.start) < 2:
if pair.untemplated_seq == '':
return set()
return {SVTYPE.INS}
elif pair.untemplated_seq == '':
return {SVTYPE.DEL}
return {SVTYPE.DEL, SVTYPE.INS}
elif pair.break1.orient == ORIENT.RIGHT or pair.break2.orient == ORIENT.LEFT:
return {SVTYPE.DUP}
else: # interchromosomal
if pair.opposing_strands:
if (pair.break1.orient == ORIENT.LEFT and pair.break2.orient == ORIENT.RIGHT) \
or (pair.break1.orient == ORIENT.RIGHT and pair.break2.orient == ORIENT.LEFT):
raise InvalidRearrangement(pair)
return {SVTYPE.ITRANS}
else:
if (pair.break1.orient == ORIENT.LEFT and pair.break2.orient == ORIENT.LEFT) \
or (pair.break1.orient == ORIENT.RIGHT and pair.break2.orient == ORIENT.RIGHT):
raise InvalidRearrangement(pair)
return {SVTYPE.TRANS}
[docs] def net_size(self, distance=lambda x, y: Interval(abs(x - y))):
"""
Returns the size of the event for a given pair. Mainly applicable to indels
"""
if self.untemplated_seq is None or len(self.break1) > 1 or len(self.break2) > 1:
raise ValueError('cannot determine net size of a non-specific breakpoint pair')
if self.interchromosomal:
return Interval(0)
size = Interval(len(self.untemplated_seq))
min_dist, max_dist = distance(self.break1.start, self.break2.start)
# if it is a duplication the net change is the untemplated_seq as well as the duplicated region
if self.break1.orient == ORIENT.RIGHT and self.break2.orient == ORIENT.LEFT:
size.start += min_dist + 1
size.end += max_dist + 1
elif not self.opposing_strands:
size.start -= (max_dist - 1)
size.end -= (min_dist - 1)
return size
@property
def is_putative_indel(self):
if self.interchromosomal or self.opposing_strands or self.break1.orient == ORIENT.RIGHT:
return False
return True
[docs] def breakpoint_sequence_homology(self, reference_genome):
"""
for a given set of breakpoints matches the sequence opposite the partner breakpoint
this sequence comparison is done with reference to a reference genome and does not
use novel or untemplated sequence in the comparison. For this reason, insertions
will never return any homologous sequence
::
small duplication event CTT => CTTCTT
GATACATTTCTTCTTGAAAA reference
---------<========== first breakpoint
===========>-------- second breakpoint
---------CT-CT------ first break homology
-------TT-TT-------- second break homology
Args:
reference_genome (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence by template/chr name
Returns:
tuple:
- :class:`str` - homologous sequence at the first breakpoint
- :class:`str` - homologous sequence at the second breakpoint
Raises:
AttributeError: for non specific breakpoints
"""
b1_refseq = reference_genome[self.break1.chr].seq
b2_refseq = reference_genome[self.break2.chr].seq
if len(self.break1) > 1 or len(self.break2) > 1:
raise AttributeError('cannot call shared sequence for non-specific breakpoints')
# find for the first breakpoint
pos1 = self.break1.start - 1
pos2 = self.break2.start - 1
shift1 = -1 if self.break1.orient == ORIENT.LEFT else 1
shift2 = -1 if self.break2.orient == ORIENT.RIGHT else 1
pos2 += shift2
first_seq = []
while all([
pos1 >= 0,
pos1 < len(b1_refseq),
pos2 >= 0,
pos2 < len(b2_refseq),
self.interchromosomal or pos2 > self.break1.start - 1,
self.interchromosomal or pos1 < self.break2.start - 1
]):
break1_bp = b1_refseq[pos1] if self.break1.strand == STRAND.POS else reverse_complement(b1_refseq[pos1])
break2_bp = b2_refseq[pos2] if self.break2.strand == STRAND.POS else reverse_complement(b2_refseq[pos2])
if DNA_ALPHABET.match(break1_bp, break2_bp):
first_seq.append(b1_refseq[pos1])
else:
break
pos1 += shift1
pos2 += shift2
if shift1 < 0:
first_seq.reverse()
# now go over the second breakpoint
pos1 = self.break1.start - 1 # reset the start points
pos2 = self.break2.start - 1
shift1 *= -1 # flip the directions
shift2 *= -1
pos1 += shift1
second_seq = []
while all([
pos1 >= 0,
pos1 < len(b1_refseq),
pos2 >= 0,
pos2 < len(b2_refseq),
self.interchromosomal or pos2 > self.break1.start - 1,
self.interchromosomal or pos1 < self.break2.start - 1
]):
break1_bp = b1_refseq[pos1] if self.break1.strand == STRAND.POS else reverse_complement(b1_refseq[pos1])
break2_bp = b2_refseq[pos2] if self.break2.strand == STRAND.POS else reverse_complement(b2_refseq[pos2])
if DNA_ALPHABET.match(break1_bp, break2_bp):
second_seq.append(b2_refseq[pos2])
else:
break
pos1 += shift1
pos2 += shift2
if shift1 < 0:
second_seq.reverse()
return ''.join(first_seq).upper(), ''.join(second_seq).upper()
[docs] def get_bed_repesentation(self):
bed = []
if self.interchromosomal:
bed.append((self.break1.chr, self.break1.start, self.break1.end, self.data.get(COLUMNS.cluster_id, None)))
bed.append((self.break2.chr, self.break2.start, self.break2.end, self.data.get(COLUMNS.cluster_id, None)))
else:
bed.append((self.break1.chr, self.break1.start, self.break2.end, self.data.get(COLUMNS.cluster_id, None)))
return bed