Coverage for scoring.py : 30%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import math
2from elfragmentador import annotate, constants
3from elfragmentador import isoforms
4from numpy import float64
5from typing import Any, Dict, List, Tuple, Union
8def get_site_localizing_ions(
9 seq: str, mod: List[str], aas: List[str]
10) -> Union[
11 Tuple[Dict[str, Dict[Any, Any]], Dict[str, Dict[str, float64]]],
12 Tuple[Dict[str, Dict[str, float64]], Dict[str, Dict[str, float64]]],
13]:
14 """
15 get_site_localizing_ions
17 [extended_summary]
19 Parameters
20 ----------
21 seq : str
22 [description]
23 mod : List[str]
24 [description]
25 aas : List[str]
26 [description]
28 Returns
29 -------
30 Tuple[
31 Dict[str, Dict[str, float64]],
32 Dict[str, Dict[str, float64]]]
34 Returns two dictionaries whose keys are isoform sequences and the values are
35 dictionaries of ion:mass pairs.
37 The first one contins only the ions unique to each sequence and the second contains
38 all the ions for each modified sequence
40 Example
41 -------
42 >>> seq = "MY[PHOSPHO]PTMIDE"
43 >>> mods_list = ["PHOSPHO"]
44 >>> aas_list = ["YST"]
45 >>> sli = get_site_localizing_ions(seq, mods_list, aas_list)
46 >>> sorted(sli[0].keys())
47 ['MYPT[PHOSPHO]MIDE', 'MY[PHOSPHO]PTMIDE']
48 >>> sorted(sli[0]['MYPT[PHOSPHO]MIDE'].keys())
49 ['z1b2', 'z1b3', 'z1y5', 'z1y6', 'z2b2', 'z2b3', 'z2y5', 'z2y6', 'z3b2', 'z3b3', 'z3y5', 'z3y6']
50 >>> sorted(sli[1].keys())
51 ['MYPT[PHOSPHO]MIDE', 'MY[PHOSPHO]PTMIDE']
52 >>> sorted(sli[1]['MYPT[PHOSPHO]MIDE'].keys())
53 ['z1b1', 'z1b2', 'z1b3', ..., 'z1y6', 'z1y7', 'z2b1', 'z2b2', 'z2b3', 'z2b4', 'z2b5', 'z2b6', 'z2b7', 'z2y1', 'z2y2', 'z2y3', 'z2y4', 'z2y5', 'z2y6', 'z2y7', 'z3b1', 'z3b2', 'z3b3', 'z3b4', 'z3b5', 'z3b6', 'z3b7', 'z3y1', 'z3y2', 'z3y3', 'z3y4', 'z3y5', 'z3y6', 'z3y7']
54 >>> sli[1]['MYPT[PHOSPHO]MIDE']['z1y6']
55 785.278700167
56 >>> out = get_site_localizing_ions(seq, mods_list, aas_list)
57 >>> [{k: len(x[k]) for k in sorted(x)} for x in out] # Show the length of every sub-item
58 [{'MYPT[PHOSPHO]MIDE': 12, 'MY[PHOSPHO]PTMIDE': 12}, {'MYPT[PHOSPHO]MIDE': 42, 'MY[PHOSPHO]PTMIDE': 42}]
59 """
60 mod_isoforms = isoforms.get_mod_isoforms(seq, mod, aas)
61 mod_isoform_ions = {k: annotate.get_peptide_ions(k) for k in mod_isoforms}
63 filtered_out_dict = {k: {} for k in mod_isoforms}
64 out_dict = {k: {} for k in mod_isoforms}
65 for ion in mod_isoform_ions[mod_isoforms[0]]:
66 unique_vals = list(
67 set([round(v[ion], 10) for k, v in mod_isoform_ions.items()])
68 )
70 for mi in mod_isoforms:
71 out_dict[mi].update({ion: mod_isoform_ions[mi][ion]})
72 if len(unique_vals) > 1:
73 filtered_out_dict[mi].update({ion: mod_isoform_ions[mi][ion]})
75 return filtered_out_dict, out_dict
78# TODO consider if this has to be a public API
79def calc_ascore(
80 seq: str,
81 mod: List[str],
82 aas: List[str],
83 mzs: Union[List[float64], List[float]],
84 ints: Union[List[float], List[int]],
85) -> Dict[str, Union[float, int]]:
86 WINDOW_SIZE = 100 # daltons
87 # BINNING = WINDOW_SIZE / (constants.TOLERANCE_FTMS / (1e6) )# aka, how many peaks can fit in a 100da window
88 # Theoretically this would be correct but leads to really high scores...
89 BINNING = 100
90 N_PER_WINDOW = list(range(1, 11, 1))
92 sli, all_ions = get_site_localizing_ions(seq, mod, aas)
93 max_mz = math.ceil(max(mzs) / WINDOW_SIZE) * WINDOW_SIZE
94 norm_spectra = {str(x): [] for x in N_PER_WINDOW}
96 for range_start in range(0, max_mz, WINDOW_SIZE):
97 curr_mzs_pairs = [
98 [m, i]
99 for m, i in zip(mzs, ints)
100 if m <= range_start + WINDOW_SIZE and m > range_start
101 ]
102 if len(curr_mzs_pairs) == 0:
103 continue
105 # from : https://stackoverflow.com/questions/13070461/
106 sorted_indices = sorted(
107 range(len(curr_mzs_pairs)), key=lambda x: curr_mzs_pairs[x][1]
108 )
109 for n in N_PER_WINDOW:
110 keep_indices = sorted_indices[-n:]
111 norm_spectra[str(n)].extend([curr_mzs_pairs[x][0] for x in keep_indices])
113 # First pass finding best depth
114 scores = {k: [] for k in norm_spectra}
115 for norm_depth, norm_depth_spectra in norm_spectra.items():
116 prior = int(norm_depth) / BINNING
117 norm_depth_spectra_ints = [1 for _ in range(len(norm_depth_spectra))]
118 tmp_scores = _calculate_scores_dict(
119 norm_depth_spectra, norm_depth_spectra_ints, all_ions, prior
120 )
121 scores[norm_depth].extend(list(tmp_scores.values()))
123 try:
124 # define best as highest delta score
125 deltascores = {k: sorted(v)[-1] - sorted(v)[-2] for k, v in scores.items()}
126 # define best as highest cumulative score
127 # deltascores = {k: sum(v) for k, v in scores.items()}
128 except IndexError:
129 return {seq: max([x[0] for x in scores.values()]), "": 0}
131 deltascores
132 max_deltascores = max(deltascores.values())
133 best_depth = [k for k, v in deltascores.items() if v == max_deltascores][0]
134 best_depth
136 # second pass finding score
137 prior = int(best_depth) / BINNING
138 norm_depth_spectra = norm_spectra[best_depth]
139 norm_depth_spectra_ints = [1 for _ in range(len(norm_depth_spectra))]
140 scores = {k: None for k in sli}
142 scores = _calculate_scores_dict(
143 norm_depth_spectra, norm_depth_spectra_ints, sli, prior
144 )
145 return scores
148def calc_delta_ascore(
149 seq: str,
150 mod: List[str],
151 aas: List[str],
152 mzs: Union[List[float64], List[float]],
153 ints: Union[List[float], List[int]],
154) -> float:
155 ascores = calc_ascore(seq, mod, aas, mzs, ints)
156 seq_score = ascores.pop(seq)
157 return seq_score - max(ascores.values())
160def _calculate_scores_dict(
161 mzs: Union[List[float64], List[float]],
162 ints: List[int],
163 ions_dict: Dict[str, Dict[str, float64]],
164 prior: float,
165) -> Dict[str, float]:
166 EPSILON = 1e-31
168 scores = {k: None for k in ions_dict}
169 for isoform, theo_peaks_isoform in ions_dict.items():
170 num_tot_peaks = len(theo_peaks_isoform)
171 matched_peaks = sum(
172 annotate.annotate_peaks2(
173 theo_peaks_isoform,
174 mzs=mzs,
175 intensities=ints,
176 ).values()
177 )
178 unmatched_peaks = num_tot_peaks - matched_peaks
179 prob = (prior ** matched_peaks) * ((1 - prior) ** unmatched_peaks)
180 score = -10 * math.log10(prob + EPSILON)
181 scores[isoform] = score
182 return scores