from .constants import PAIRING_STATE
from ..breakpoint import Breakpoint, BreakpointPair
from ..constants import CALL_METHOD, COLUMNS, DISEASE_STATUS, PROTOCOL, SVTYPE
from ..interval import Interval
from ..pairing.pairing import pair_by_distance, product_key
from ..util import get_connected_components


def filter_by_annotations(bpp_list, best_transcripts):
"""
Args:
bpp_list (list of BreakpointPair): list of pairs to filter
best_transcripts (:class `dict` of :any:`Transcript` by :class:`str`): the best transcripts of the annotations
based on their names
"""
strings = []
for bpp in bpp_list:
for attr in ['gene1', 'gene2', 'transcript1', 'transcript2']:
if bpp.data[attr] is not None:
strings.append(bpp.data[attr])
string_ranks = {s: i for i, s in enumerate(sorted(strings))}
string_ranks[None] = len(strings)
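    # rank pairs by: having a coding fusion cdna (longer coding regions preferred), use of
    # the best transcripts, number of missing transcript annotations, and finally the
    # gene/transcript name ranks so that ties are broken deterministically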
def sort_key(bpp):
if bpp.fusion_cdna_coding_start is None:
result = [1, 0]
else:
result = [0, -1 * (int(bpp.fusion_cdna_coding_end) - int(bpp.fusion_cdna_coding_start))]
result.extend([
0 if bpp.transcript1 in best_transcripts else 1,
0 if bpp.transcript2 in best_transcripts else 1,
sum([bpp.transcript1 is None, bpp.transcript2 is None]),
string_ranks[bpp.gene1], string_ranks[bpp.gene2],
string_ranks[bpp.transcript1], string_ranks[bpp.transcript2]
])
return tuple(result)
bpp_list = sorted(bpp_list, key=sort_key)
result = []
removed = []
for bpp in bpp_list:
if sort_key(bpp) == sort_key(bpp_list[0]):
result.append(bpp)
else:
removed.append(bpp)
return result, removed


def filter_by_call_method(bpp_list):
"""
    Filters a set of breakpoint pairs to return the calls with the most evidence.
Prefers contig evidence over spanning over split over flanking, etc.
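
    Example:
        Illustrative usage only; assumes ``calls`` is a list of :class:`BreakpointPair`
        objects annotated with the evidence count columns (e.g. contig_remapped_reads,
        break1_split_reads, flanking_pairs)::

            best_calls, dropped_calls = filter_by_call_method(calls)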
"""
    # evidence columns used for ranking, ordered so that contig evidence outranks spanning,
    # split read and then flanking evidence (higher counts are better)
def sort_key(bpp):
key = [bpp.data.get(col, 0) if bpp.data.get(col, 0) is not None else 0 for col in [
'contig_remapped_reads',
'contig_alignment_score',
'spanning_reads',
'break1_split_reads',
'break2_split_reads',
'linking_split_reads',
'flanking_pairs'
]]
return tuple(key)
    if not bpp_list:
        return bpp_list, []
bpp_list = sorted(bpp_list, key=sort_key, reverse=True)
# filter to the top ranked method
result = []
removed = []
for bpp in bpp_list:
if sort_key(bpp) == sort_key(bpp_list[0]):
result.append(bpp)
else:
removed.append(bpp)
return result, removed


def group_events(events):
"""
    Group a set of equivalent events into a single event and join their data attributes
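
    Example:
        Illustrative only; assumes ``event1`` and ``event2`` are calls of the same event
        (matching chromosomes, orientations, strands and opposing_strands)::

            merged = group_events([event1, event2])
            # merged.break1 spans the outermost start/end of the grouped break1 positions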
"""
# take the outer regions of the breakpoints
first = events[0]
new_bpp = BreakpointPair(
Breakpoint(
first.break1.chr,
min([b.break1.start for b in events]),
max([b.break1.end for b in events]),
orient=first.break1.orient,
strand=first.break1.strand),
Breakpoint(
first.break2.chr,
min([b.break2.start for b in events]),
max([b.break2.end for b in events]),
orient=first.break2.orient,
strand=first.break2.strand),
opposing_strands=first.opposing_strands,
stranded=first.stranded
)
for bpp in events:
if any([
bpp.break1.chr != new_bpp.break1.chr,
bpp.break2.chr != new_bpp.break2.chr,
bpp.break1.orient != new_bpp.break1.orient,
bpp.break2.orient != new_bpp.break2.orient,
bpp.opposing_strands != new_bpp.opposing_strands,
bpp.break1.strand != new_bpp.break1.strand,
bpp.break2.strand != new_bpp.break2.strand
]):
raise AssertionError('cannot group events differing on key elements', bpp, new_bpp)
    # Note: some attributes should not be lost when they differ between events, so the values are joined for now
    # the evidence counts might be better summarized as a max rather than a join
for col in [
COLUMNS.contig_seq, COLUMNS.call_method,
COLUMNS.break1_split_reads, COLUMNS.break2_split_reads, COLUMNS.contig_alignment_score,
COLUMNS.spanning_reads, COLUMNS.flanking_pairs, COLUMNS.tools,
COLUMNS.product_id, COLUMNS.event_type, COLUMNS.annotation_id,
COLUMNS.pairing, COLUMNS.annotation_figure,
        COLUMNS.contig_remapped_reads,
COLUMNS.tracking_id
]:
new_data = sorted(list({bpp.data[col] for bpp in events}))
new_bpp.data[col] = new_data[0] if len(new_data) == 1 else ';'.join([str(v) for v in new_data])
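    # the untemplated sequence is only kept when all of the grouped events agree on it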
untemplated_seq = {bpp.untemplated_seq for bpp in events}
if len(untemplated_seq) == 1:
new_bpp.untemplated_seq = list(untemplated_seq)[0]
return new_bpp


def group_by_distance(calls, distances):
"""
    Groups a set of calls based on their proximity. Returns a new list of calls where close calls have
    been merged, along with the list of original calls that were merged away
"""
mapping = {}
for call in calls:
mapping.setdefault(product_key(call), []).append(call)
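    # pair_by_distance and get_connected_components operate on product keys, so the
    # original calls are indexed by key here to recover them when merging each component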
pairing = pair_by_distance(calls, distances, against_self=True)
# merge all the 'close-enough' pairs
grouped_calls = []
removed_calls = []
for component in get_connected_components(pairing):
if len(component) == 1:
grouped_calls.extend(mapping[component.pop()])
else:
pairs = []
for key in component:
pairs.extend(mapping[key])
grouped_calls.append(group_events(pairs))
removed_calls.extend(pairs)
return grouped_calls, removed_calls


def annotate_dgv(bpps, dgv_regions_by_reference_name, distance=0):
"""
    Given a list of breakpoint pairs and a DGV reference, annotate the events where both breakpoints
    fall within the set distance of a DGV region

    Args:
        bpps (list): the list of BreakpointPair objects
        dgv_regions_by_reference_name (dict): the dgv reference regions loaded by load_masking_regions
        distance (int): the maximum distance allowed between a breakpoint and a matching dgv region
"""
for chrom in dgv_regions_by_reference_name:
dgv_regions_by_reference_name[chrom] = sorted(dgv_regions_by_reference_name[chrom], key=lambda x: x.start)
lowest_resolution = max([len(b.break1) for b in bpps]) # only need start res
# only look at the bpps that dgv events could pair to, Intrachromosomal
for bpp in [b for b in bpps if not b.interchromosomal and b.break1.chr in dgv_regions_by_reference_name]:
for dgv_region in dgv_regions_by_reference_name[bpp.break1.chr]:
dist = abs(Interval.dist(Interval(dgv_region.start), bpp.break1))
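            # regions are sorted by start position, so once the region start is already
            # farther from break1 than the widest breakpoint interval plus the allowed
            # distance, the remaining regions on this chromosome are skipped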
if dist > lowest_resolution + distance:
break
elif dist > distance or abs(Interval.dist(Interval(dgv_region.end), bpp.break2)) > distance:
continue
refname = dgv_region.reference_object
try:
refname = dgv_region.reference_object.name
except AttributeError:
pass
bpp.data['dgv'] = '{}({}:{}-{})'.format(dgv_region.name, refname, dgv_region.start, dgv_region.end)


def get_pairing_state(current_protocol, current_disease_state, other_protocol, other_disease_state, is_matched=False, inferred_is_matched=False):
"""
    Given two libraries, return the appropriate descriptor for their matched state
Args:
current_protocol (PROTOCOL): the protocol of the current library
current_disease_state (DISEASE_STATUS): the disease status of the current library
        other_protocol (PROTOCOL): the protocol of the library being compared to
        other_disease_state (DISEASE_STATUS): the disease status of the library being compared to
        is_matched (bool): True if the libraries are paired
        inferred_is_matched (bool): True if the libraries are paired by an inferred match (used when the protocols differ)
Returns:
(PAIRING_STATE): descriptor of the pairing of the two libraries
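
    Example:
        Illustrative only; an event found in both a diseased genome library and its matched
        normal genome library is classified as germline::

            state = get_pairing_state(
                PROTOCOL.GENOME, DISEASE_STATUS.DISEASED,
                PROTOCOL.GENOME, DISEASE_STATUS.NORMAL,
                is_matched=True)
            assert state == PAIRING_STATE.GERMLINE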
"""
PROTOCOL.enforce(current_protocol)
PROTOCOL.enforce(other_protocol)
DISEASE_STATUS.enforce(current_disease_state)
DISEASE_STATUS.enforce(other_disease_state)
curr = (current_protocol, current_disease_state)
other = (other_protocol, other_disease_state)
dg = (PROTOCOL.GENOME, DISEASE_STATUS.DISEASED)
dt = (PROTOCOL.TRANS, DISEASE_STATUS.DISEASED)
ng = (PROTOCOL.GENOME, DISEASE_STATUS.NORMAL)
if curr == dg and other == ng:
return PAIRING_STATE.GERMLINE if is_matched else PAIRING_STATE.SOMATIC
elif curr == dg and other == dt:
return PAIRING_STATE.EXP if inferred_is_matched else PAIRING_STATE.NO_EXP
elif curr == dt and other == dg:
return PAIRING_STATE.GENOMIC if inferred_is_matched else PAIRING_STATE.NO_GENOMIC
elif curr == dt and other == ng:
return PAIRING_STATE.GERMLINE if inferred_is_matched else PAIRING_STATE.SOMATIC
elif curr == ng and other == dt:
return PAIRING_STATE.EXP if inferred_is_matched else PAIRING_STATE.NO_EXP
else:
if current_protocol == other_protocol:
return PAIRING_STATE.MATCH if is_matched else PAIRING_STATE.NO_MATCH
else:
return PAIRING_STATE.MATCH if inferred_is_matched else PAIRING_STATE.NO_MATCH


def filter_by_evidence(
bpps,
filter_min_remapped_reads=5,
filter_min_spanning_reads=5,
filter_min_flanking_reads=10,
filter_min_split_reads=5,
filter_min_linking_split_reads=1
):
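    """
    Filter breakpoint pair calls based on the amount of evidence supporting the method
    used to call them (contig, spanning, split read or flanking pair evidence)

    Returns:
        tuple: the calls that passed the evidence thresholds and the calls that were removed
    """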
filtered = []
removed = []
for bpp in bpps:
if bpp.call_method == CALL_METHOD.CONTIG:
# inherently the breakpoints have been linked
if int(bpp.contig_remapped_reads) < filter_min_remapped_reads:
removed.append(bpp)
continue
elif bpp.call_method == CALL_METHOD.SPAN:
if bpp.spanning_reads < filter_min_spanning_reads:
removed.append(bpp)
continue
elif bpp.call_method == CALL_METHOD.SPLIT:
linking_split_reads = bpp.linking_split_reads
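            # for insertion events the flanking read pairs also support the link between
            # the two breakpoints, so they are counted toward the linking evidence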
if bpp.event_type == SVTYPE.INS:
linking_split_reads += bpp.flanking_pairs
if any([
bpp.break1_split_reads + bpp.break1_split_reads_forced < filter_min_split_reads,
bpp.break2_split_reads + bpp.break2_split_reads_forced < filter_min_split_reads,
linking_split_reads < filter_min_linking_split_reads,
bpp.break1_split_reads < 1,
bpp.break2_split_reads < 1
]):
removed.append(bpp)
continue
elif bpp.call_method == CALL_METHOD.FLANK:
if bpp.flanking_pairs < filter_min_flanking_reads:
removed.append(bpp)
continue
elif bpp.call_method != CALL_METHOD.INPUT:
raise AssertionError('unexpected value for call_method: {}'.format(
bpp.call_method))
filtered.append(bpp)
return filtered, removed