from functools import partial
import os
import re
import time
import tab
from .constants import DEFAULTS, HOMOPOLYMER_MIN_LENGTH
from .summary import annotate_dgv, filter_by_annotations, filter_by_call_method, filter_by_evidence, get_pairing_state, group_by_distance
from ..constants import CALL_METHOD, COLUMNS, PROTOCOL, SVTYPE
from ..pairing.constants import DEFAULTS as PAIRING_DEFAULTS
from ..util import generate_complete_stamp, log, output_tabbed_file, read_inputs, soft_cast
def soft_cast_null(value):
    """Attempt to cast a value to its null representation, falling back to the input.

    Thin wrapper around ``tab.cast_null`` that tolerates values the cast
    rejects (``tab.cast_null`` raises TypeError for non-null-like input).

    Args:
        value: the value to attempt to cast

    Returns:
        the null-cast result when ``tab.cast_null`` accepts the value,
        otherwise the original value unchanged
    """
    try:
        return tab.cast_null(value)
    except TypeError:
        # not a recognized null placeholder; pass the value through untouched
        return value
def main(
    inputs, output, annotations,
    dgv_annotation=None,
    filter_cdna_synon=DEFAULTS.filter_cdna_synon,
    filter_protein_synon=DEFAULTS.filter_protein_synon,
    filter_min_remapped_reads=DEFAULTS.filter_min_remapped_reads,
    filter_min_spanning_reads=DEFAULTS.filter_min_spanning_reads,
    filter_min_flanking_reads=DEFAULTS.filter_min_flanking_reads,
    filter_min_split_reads=DEFAULTS.filter_min_split_reads,
    filter_trans_homopolymers=DEFAULTS.filter_trans_homopolymers,
    filter_min_linking_split_reads=DEFAULTS.filter_min_linking_split_reads,
    flanking_call_distance=PAIRING_DEFAULTS.flanking_call_distance,
    split_call_distance=PAIRING_DEFAULTS.split_call_distance,
    contig_call_distance=PAIRING_DEFAULTS.contig_call_distance,
    spanning_call_distance=PAIRING_DEFAULTS.spanning_call_distance,
    start_time=None,
    **kwargs
):
    """Filter, collapse and summarize annotated structural variant calls.

    Reads annotated breakpoint-pair calls, applies synonymous/homopolymer and
    minimum-evidence filters, collapses equivalent calls per library, adds
    cross-library pairing-state columns, and writes per-library summary files
    plus a file of all filtered-out pairs.

    Args:
        inputs (list): input file paths consumed by ``read_inputs``
        output (str): directory the summary files are written to
        annotations (dict): mapping of chromosome name to a list of gene
            annotation objects (each with a ``transcripts`` attribute)
        dgv_annotation: DGV reference used to annotate known variants; skipped
            when falsy
        filter_cdna_synon (bool): drop events flagged as synonymous cdna
        filter_protein_synon (bool): drop events flagged as synonymous protein
        filter_min_remapped_reads (int): evidence threshold, see
            ``filter_by_evidence``
        filter_min_spanning_reads (int): evidence threshold
        filter_min_flanking_reads (int): evidence threshold
        filter_min_split_reads (int): evidence threshold
        filter_trans_homopolymers (bool): drop transcriptome indel/dup events
            inside homopolymer repeats
        filter_min_linking_split_reads (int): evidence threshold
        flanking_call_distance (int): max distance to group flanking calls
        split_call_distance (int): max distance to group split-read calls
        contig_call_distance (int): max distance to group contig calls
        spanning_call_distance (int): max distance to group spanning calls
        start_time (int): unix timestamp recorded in the completion stamp;
            defaults to the time of the call
        **kwargs: accepted and ignored for interface compatibility
    """
    # BUG FIX: the previous default ``start_time=int(time.time())`` was
    # evaluated once at module import, so the stamp reflected import time
    # rather than when this function actually ran.
    if start_time is None:
        start_time = int(time.time())
    # pairing threshold parameters to be defined in config file
    distances = {
        CALL_METHOD.FLANK: flanking_call_distance,
        CALL_METHOD.SPLIT: split_call_distance,
        CALL_METHOD.CONTIG: contig_call_distance,
        CALL_METHOD.SPAN: spanning_call_distance
    }
    bpps = []
    bpps.extend(read_inputs(
        inputs,
        require=[
            COLUMNS.event_type,
            COLUMNS.fusion_cdna_coding_end,
            COLUMNS.fusion_cdna_coding_start,
            COLUMNS.fusion_splicing_pattern,
            COLUMNS.fusion_mapped_domains,
            COLUMNS.gene1,
            COLUMNS.gene1_direction,
            COLUMNS.gene2,
            COLUMNS.gene2_direction,
            COLUMNS.gene_product_type,
            COLUMNS.genes_encompassed,
            COLUMNS.library,
            COLUMNS.protocol,
            COLUMNS.transcript1,
            COLUMNS.transcript2,
            COLUMNS.untemplated_seq,
            COLUMNS.tools,
            COLUMNS.exon_last_5prime,
            COLUMNS.exon_first_3prime,
            COLUMNS.disease_status
        ],
        # optional columns default to None; call_method defaults to INPUT since
        # rows without one were called upstream of this pipeline stage
        add_default={**{k: None for k in [
            COLUMNS.contig_remapped_reads,
            COLUMNS.contig_seq,
            COLUMNS.break1_split_reads,
            COLUMNS.break1_split_reads_forced,
            COLUMNS.break2_split_reads,
            COLUMNS.break2_split_reads_forced,
            COLUMNS.linking_split_reads,
            COLUMNS.flanking_pairs,
            COLUMNS.contigs_aligned,
            COLUMNS.contigs_assembled,
            COLUMNS.contig_alignment_score,
            COLUMNS.contig_remap_score,
            COLUMNS.spanning_reads,
            COLUMNS.annotation_figure,
            COLUMNS.gene1_aliases,
            COLUMNS.gene2_aliases,
            COLUMNS.protein_synon,
            COLUMNS.cdna_synon,
            COLUMNS.net_size,
            COLUMNS.tracking_id,
            COLUMNS.assumed_untemplated,
            'dgv',
            'summary_pairing']
        }, COLUMNS.call_method: CALL_METHOD.INPUT},
        expand_strand=False, expand_orient=False, expand_svtype=False,
        cast={
            COLUMNS.break1_split_reads: partial(soft_cast, cast_type=int),
            COLUMNS.break2_split_reads: partial(soft_cast, cast_type=int),
            COLUMNS.contig_remapped_reads: partial(soft_cast, cast_type=int),
            COLUMNS.spanning_reads: partial(soft_cast, cast_type=int),
            COLUMNS.break1_split_reads_forced: partial(soft_cast, cast_type=int),
            COLUMNS.break2_split_reads_forced: partial(soft_cast, cast_type=int),
            COLUMNS.flanking_pairs: partial(soft_cast, cast_type=int),
            COLUMNS.linking_split_reads: partial(soft_cast, cast_type=int),
            COLUMNS.protein_synon: soft_cast_null,
            COLUMNS.cdna_synon: soft_cast_null
        }
    ))
    # load all transcripts, indexed by name (the chromosome key is not needed
    # here, so iterate values only)
    reference_transcripts = dict()
    best_transcripts = dict()
    for genes in annotations.values():
        for gene in genes:
            for transcript in gene.transcripts:
                reference_transcripts[transcript.name] = transcript
                if transcript.is_best_transcript:
                    best_transcripts[transcript.name] = transcript
    filtered_pairs = []
    # filter by synonymous and RNA homopolymers
    if filter_cdna_synon or filter_protein_synon or filter_trans_homopolymers:
        temp = []
        for bpp in bpps:
            if filter_protein_synon and bpp.protein_synon:
                bpp.data[COLUMNS.filter_comment] = 'synonymous protein'
                filtered_pairs.append(bpp)
                continue
            elif filter_cdna_synon and bpp.cdna_synon:
                bpp.data[COLUMNS.filter_comment] = 'synonymous cdna'
                filtered_pairs.append(bpp)
                continue
            elif bpp.protocol == PROTOCOL.TRANS and bpp.data.get(COLUMNS.repeat_count, None) and bpp.event_type in [SVTYPE.DUP, SVTYPE.INS, SVTYPE.DEL]:
                # a transcriptome event in a repeat region
                match = re.match(r'^(-?\d+)-(-?\d+)$', str(bpp.data[COLUMNS.net_size]))
                if match:
                    netsize_min = abs(int(match.group(1)))
                    netsize_max = abs(int(match.group(2)))
                    # drop single-base indels inside long homopolymers unless
                    # the event is also paired against a genome library
                    if all([
                        int(bpp.repeat_count) >= HOMOPOLYMER_MIN_LENGTH,
                        netsize_min == netsize_max and netsize_min == 1,
                        PROTOCOL.GENOME not in bpp.data.get(COLUMNS.pairing, '')
                    ]):
                        bpp.data[COLUMNS.filter_comment] = 'homopolymer filter'
                        filtered_pairs.append(bpp)
                        continue
            temp.append(bpp)
        bpps = temp
    # filter based on minimum evidence levels
    bpps, filtered = filter_by_evidence(
        bpps, filter_min_remapped_reads=filter_min_remapped_reads,
        filter_min_spanning_reads=filter_min_spanning_reads,
        filter_min_flanking_reads=filter_min_flanking_reads,
        filter_min_split_reads=filter_min_split_reads,
        filter_min_linking_split_reads=filter_min_linking_split_reads
    )
    for pair in filtered:
        pair.data[COLUMNS.filter_comment] = 'low evidence'
        filtered_pairs.append(pair)
    bpps_by_library = {}  # split the input pairs by library
    libraries = {}
    for bpp in bpps:
        bpps_by_library.setdefault(bpp.library, []).append(bpp)
        libraries[bpp.library] = (bpp.protocol, bpp.disease_status)
    # collapse identical calls with different call methods
    for library in bpps_by_library:
        uncollapsed = dict()
        for bpp in bpps_by_library[library]:
            # the bpp itself participates in the key (via its __eq__/__hash__)
            # along with the fusion/transcript annotations
            group = (
                bpp,
                bpp.transcript1,
                bpp.transcript2,
                bpp.fusion_sequence_fasta_id,
                bpp.fusion_splicing_pattern,
                bpp.fusion_cdna_coding_start,
                bpp.fusion_cdna_coding_end
            )
            uncollapsed.setdefault(group, []).append(bpp)
        collapsed = []
        for bpp_set in uncollapsed.values():
            result, removed = filter_by_call_method(bpp_set)
            collapsed.extend(result)
            for bpp in removed:
                bpp.data[COLUMNS.filter_comment] = 'collapsed into another call'
                filtered_pairs.append(bpp)
        bpps_by_library[library] = collapsed
    # collapse similar annotations for breakpoints with the same call position
    for library in bpps_by_library:
        uncollapsed = dict()
        for bpp in bpps_by_library[library]:
            uncollapsed.setdefault(bpp, []).append(bpp)
        collapsed = []
        for bpp_set in uncollapsed.values():
            result, removed = filter_by_annotations(bpp_set, best_transcripts)
            collapsed.extend(result)
            for bpp in removed:
                bpp.data[COLUMNS.filter_comment] = 'collapsed into another call'
                filtered_pairs.append(bpp)
        bpps_by_library[library] = collapsed
    # group close split read calls with identical annotations
    for library in bpps_by_library:
        uncollapsed = dict()
        for bpp in bpps_by_library[library]:
            uncollapsed.setdefault((
                bpp.event_type,
                bpp.break1.chr, bpp.break2.chr,
                bpp.break1.orient, bpp.break2.orient,
                bpp.opposing_strands,
                bpp.break1.strand, bpp.break2.strand,
                bpp.transcript1 if bpp.gene1 else None,
                bpp.transcript2 if bpp.gene2 else None,
                bpp.fusion_sequence_fasta_id,  # id is a hash of the sequence
                bpp.fusion_cdna_coding_start,
                bpp.fusion_cdna_coding_end
            ), []).append(bpp)
        collapsed = []
        for bpp_set in uncollapsed.values():
            # partition once instead of scanning the set twice
            split_calls = []
            other_calls = []
            for bpp in bpp_set:
                (split_calls if bpp.call_method == CALL_METHOD.SPLIT else other_calls).append(bpp)
            collapsed.extend(other_calls)
            grouped, removed = group_by_distance(split_calls, distances)
            collapsed.extend(grouped)
            for bpp in removed:
                bpp.data[COLUMNS.filter_comment] = 'collapsed into another call'
                filtered_pairs.append(bpp)
        bpps_by_library[library] = collapsed
    # TODO: give an evidence score to the events based on call method and evidence levels
    # TODO: report the pairings so that germline and somatic etc can be determined properly
    output_columns = {
        COLUMNS.annotation_id,
        COLUMNS.break1_chromosome,
        COLUMNS.break1_homologous_seq,
        COLUMNS.break1_orientation,
        COLUMNS.break1_position_end,
        COLUMNS.break1_position_start,
        COLUMNS.break2_chromosome,
        COLUMNS.break2_homologous_seq,
        COLUMNS.break2_orientation,
        COLUMNS.break2_position_end,
        COLUMNS.break2_position_start,
        COLUMNS.contig_seq,
        COLUMNS.event_type,
        COLUMNS.fusion_cdna_coding_end,
        COLUMNS.fusion_cdna_coding_start,
        COLUMNS.fusion_protein_hgvs,
        COLUMNS.fusion_mapped_domains,
        COLUMNS.gene1,
        COLUMNS.gene1_direction,
        COLUMNS.gene2,
        COLUMNS.gene2_direction,
        COLUMNS.gene_product_type,
        COLUMNS.genes_encompassed,
        COLUMNS.library,
        COLUMNS.protocol,
        COLUMNS.transcript1,
        COLUMNS.transcript2,
        COLUMNS.untemplated_seq,
        COLUMNS.tools,
        COLUMNS.break1_strand,
        COLUMNS.break2_strand,
        COLUMNS.gene1_aliases,
        COLUMNS.gene2_aliases,
        COLUMNS.annotation_figure,
        COLUMNS.exon_last_5prime,
        COLUMNS.exon_first_3prime,
        # For debugging
        COLUMNS.call_method,
        COLUMNS.flanking_pairs,
        COLUMNS.break1_split_reads,
        COLUMNS.break2_split_reads,
        COLUMNS.linking_split_reads,
        COLUMNS.contig_alignment_score,
        COLUMNS.spanning_reads,
        COLUMNS.contig_remapped_reads,
        COLUMNS.tracking_id,
        COLUMNS.supplementary_call,
        COLUMNS.protein_synon,
        COLUMNS.cdna_synon,
        COLUMNS.net_size,
        COLUMNS.assumed_untemplated,
        'dgv'}
    rows = []
    for lib in bpps_by_library:
        log('annotating dgv for', lib)
        if dgv_annotation:
            annotate_dgv(bpps_by_library[lib], dgv_annotation, distance=10)  # TODO make distance a parameter
        log('adding pairing states for', lib)
        for row in bpps_by_library[lib]:
            # in case no pairing was done, add default (applicable to single library summaries)
            row.data.setdefault(COLUMNS.inferred_pairing, '')
            row.data.setdefault(COLUMNS.pairing, '')
            row.data.setdefault(COLUMNS.library, lib)
            # filter pairing ids based on what is still kept?
            paired_libraries = {p.split('_')[0] for p in row.pairing.split(';')}
            inferred_paired_libraries = {p.split('_')[0] for p in row.inferred_pairing.split(';')}
            for other_lib, (other_protocol, other_disease_state) in libraries.items():
                column_name = '{}_{}_{}'.format(other_lib, other_disease_state, other_protocol)
                if other_lib != row.library:
                    pairing_state = get_pairing_state(
                        *libraries[row.library],
                        other_protocol=other_protocol, other_disease_state=other_disease_state,
                        is_matched=other_lib in paired_libraries,
                        inferred_is_matched=other_lib in inferred_paired_libraries)
                else:
                    pairing_state = 'Not Applicable'
                row.data[column_name] = pairing_state
                output_columns.add(column_name)
            rows.append(row.flatten())
    fname = os.path.join(
        output,
        'mavis_summary_{}.tab'.format('_'.join(sorted(libraries)))
    )
    output_tabbed_file(rows, fname, header=output_columns)
    log('wrote {} structural variants to {}'.format(len(rows), fname))
    output_tabbed_file(filtered_pairs, os.path.join(output, 'filtered_pairs.tab'))
    # output by library non-synon protein-product
    for lib in bpps_by_library:
        filename = os.path.join(output, 'mavis_summary_{}_non-synonymous_coding_variants.tab'.format(lib))
        lib_rows = []
        for row in rows:
            if all([
                not row.get(COLUMNS.protein_synon, ''),
                not row.get(COLUMNS.cdna_synon, ''),
                str(row.get(COLUMNS.fusion_cdna_coding_start, None)) != 'None',
                row[COLUMNS.library] == lib,
                str(row.get(COLUMNS.supplementary_call, False)) != 'True'
            ]):
                lib_rows.append(row)
        output_tabbed_file(lib_rows, filename, header=output_columns)
    generate_complete_stamp(output, log, start_time=start_time)