Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import logging 

2 

3from collections import namedtuple 

4import pandas as pd 

5from elfragmentador import constants, annotate 

6from pandas.core.frame import DataFrame 

7from torch import Tensor 

8from typing import Dict, List, Optional, Sequence, Union 

9 

# Pair of parallel per-residue encodings: `aas` holds amino-acid indices
# (from constants.ALPHABET) and `mods` holds the matching modification
# indices (0 = unmodified) — see encode_mod_seq below.
SequencePair = namedtuple("SequencePair", "aas, mods")

11 

12 

def encode_mod_seq(seq: str) -> SequencePair:
    """
    Encodes a peptide sequence to a numeric vector

    Both returned lists are exactly ``constants.MAX_TENSOR_SEQUENCE`` long:
    shorter sequences are zero-padded, longer ones are clipped (with a
    warning).

    Args:
        seq: Peptide sequence, optionally with explicit termini and
            modifications (e.g. "_AAIFVVAR_").

    Returns:
        SequencePair: ``aas`` are amino-acid indices, ``mods`` are the
        matching modification indices (0 = unmodified).

    Raises:
        ValueError

    Example
    =======
    >>> samp_seq = "_AAIFVVAR_"
    >>> print(constants.MAX_TENSOR_SEQUENCE)
    32
    >>> out = encode_mod_seq(samp_seq)
    >>> out
    SequencePair(aas=[23, 1, 1, 8, 5, 19, 19, 1, 15, ..., 0], mods=[0, 0, 0, 0,..., 0, 0])
    >>> len(out)
    2
    >>> [len(x) for x in out]
    [32, 32]
    """
    max_len = constants.MAX_TENSOR_SEQUENCE
    seq_out = [0] * max_len
    mod_out = [0] * max_len

    try:
        split_seq = list(annotate.peptide_parser(seq, solve_aliases=True))
        seq_out_i = [constants.ALPHABET[x[:1]] for x in split_seq]
        mod_out_i = [
            constants.MOD_PEPTIDE_ALIASES[x] if len(x) > 1 else 0 for x in split_seq
        ]
        mod_out_i = [constants.MOD_INDICES.get(x, 0) for x in mod_out_i]
        if len(seq_out_i) > max_len:
            logging.warning(
                f"Length of the encoded sequence is more than the one allowed {constants.MAX_SEQUENCE}."
                f" Sequence={seq}, the remainder will be clipped"
            )
            # BUG FIX: slice-assigning a longer right-hand side below would
            # *grow* seq_out/mod_out past MAX_TENSOR_SEQUENCE instead of
            # clipping; truncate explicitly so the warning's promise (and
            # the documented output lengths) hold.
            seq_out_i = seq_out_i[:max_len]
            mod_out_i = mod_out_i[:max_len]

        seq_out[: len(seq_out_i)] = seq_out_i
        mod_out[: len(mod_out_i)] = mod_out_i
    except ValueError as e:
        # NOTE(review): the parser/lookup ValueError is re-raised with a
        # "too long" message even though other parse failures can also raise
        # ValueError — the original seq and error are logged first for triage.
        logging.error(seq)
        logging.error(e)
        raise ValueError(
            f"Sequence provided is longer than the supported length of {constants.MAX_SEQUENCE}"
        )

    return SequencePair(seq_out, mod_out)

59 

60 

def clip_explicit_terminus(seq):
    """Remove explicit terminus

    Works on strings and on lists of single-token strings (as produced
    internally by ``decode_mod_seq``).

    Args:
        seq: Sequence to be stripped from explicit termini

    Returns:
        Same as sequence input but removing explicit
        n and c termini

    Examples:
        >>> clip_explicit_terminus("PEPTIDEPINK")
        'PEPTIDEPINK'
        >>> clip_explicit_terminus("nPEPTIDEPINKc")
        'PEPTIDEPINK'
        >>> clip_explicit_terminus("n[ACETYL]PEPTIDEPINKc")
        'n[ACETYL]PEPTIDEPINK'
        >>> clip_explicit_terminus("")
        ''
    """
    # Robustness fix: guard the length so empty or single-element input no
    # longer raises IndexError on seq[0]/seq[1].
    if len(seq) >= 2 and seq[0] == "n" and not seq[1].startswith("["):
        # Drop a bare explicit n-terminus; a modified one ("n[...]") is
        # kept because the modification belongs to the terminus.
        seq = seq[1:]

    if len(seq) >= 1 and seq[-1] == "c":
        seq = seq[:-1]

    return seq

87 

88 

def decode_mod_seq(
    seq_encoding: List[int],
    mod_encoding: Optional[List[int]] = None,
    clip_explicit_term=True,
) -> str:
    """Decode numeric amino-acid/modification vectors back into a sequence.

    Args:
        seq_encoding: Amino-acid indices; decoding stops at the first 0
            (treated as padding).
        mod_encoding: Matching modification indices; defaults to all-zero
            (no modifications) when omitted.
        clip_explicit_term: When True, explicit n/c termini are stripped
            via ``clip_explicit_terminus``.

    Returns:
        The decoded peptide sequence as a string.
    """
    if mod_encoding is None:
        mod_encoding = [0] * len(seq_encoding)

    tokens = []
    for position, aa_index in enumerate(seq_encoding):
        if aa_index == 0:
            # 0 marks padding; everything after it is ignored.
            break

        tokens.append(constants.ALPHABET_S[aa_index])
        mod_index = mod_encoding[position]
        if mod_index != 0:
            tokens.append(f"[{constants.MOD_INDICES_S[mod_index]}]")

    if clip_explicit_term:
        tokens = clip_explicit_terminus(tokens)

    return "".join(tokens)

110 

111 

def get_fragment_encoding_labels(
    annotated_peaks: Optional[Union[Dict[str, int], Dict[str, float]]] = None
) -> Union[List[Union[int, float]], List[int], List[str]]:
    """
    Gets either the labels or a sequence that encodes a spectra

    When ``annotated_peaks`` is None, returns the list of fragment-ion
    label strings; otherwise returns, in the same label order, the value
    looked up for each label (0 when the label is absent from the dict).

    Examples
    ========
    >>> get_fragment_encoding_labels()
    ['z1b1', 'z1y1', ..., 'z3b29', 'z3y29']
    >>> get_fragment_encoding_labels({'z1y2': 100, 'z2y2': 52})
    [0, 0, 0, 100, ..., 0, 52, ...]
    """

    # TODO just redefine this to use the constant keys for fragments ...
    encoding = []
    ion_encoding_iterables = {
        "ION_TYPE": "".join(sorted(constants.ION_TYPES)),
        "CHARGE": [f"z{z}" for z in range(1, constants.MAX_FRAG_CHARGE + 1)],
        "POSITION": list(range(1, constants.MAX_ION + 1)),
    }

    # TODO implement neutral losses ... if needed
    # NOTE(review): the loop variable names (charge/pos/ion) are only accurate
    # if constants.ION_ENCODING_NESTING == ("CHARGE", "POSITION", "ION_TYPE");
    # the key below is always assembled as charge+ion+position regardless of
    # which iterable each loop is actually bound to — confirm the constant
    # matches this assumption before reordering anything here.
    for charge in ion_encoding_iterables[constants.ION_ENCODING_NESTING[0]]:
        for pos in ion_encoding_iterables[constants.ION_ENCODING_NESTING[1]]:
            for ion in ion_encoding_iterables[constants.ION_ENCODING_NESTING[2]]:
                key = f"{charge}{ion}{pos}"
                if annotated_peaks is None:
                    encoding.append(key)
                else:
                    encoding.append(annotated_peaks.get(key, 0))

    return encoding

145 

146 

147def decode_fragment_tensor( 

148 sequence: str, 

149 tensor: Union[List[int], Tensor], 

150) -> DataFrame: 

151 """ 

152 Returns a data frame with annotations from sequence 

153 and a tensor encoding a spectra 

154 

155 Example 

156 ======= 

157 >>> import torch 

158 >>> foo = decode_fragment_tensor("AAACK", torch.arange(0, (constants.NUM_FRAG_EMBEDINGS))) 

159 >>> foo.head() 

160 Fragment Mass Intensity 

161 0 z1b1 72.044390 0.0 

162 1 z1y1 147.112804 1.0 

163 2 z1b2 143.081504 2.0 

164 3 z1y2 307.143453 3.0 

165 4 z1b3 214.118618 4.0 

166 >>> # import matplotlib.pyplot as plt 

167 >>> # plt.vlines(foo['Mass'], 0, foo['Intensity']) 

168 >>> # plt.show() 

169 """ 

170 key_list = constants.FRAG_EMBEDING_LABELS 

171 fragment_ions = annotate.get_peptide_ions(sequence) 

172 masses = [fragment_ions.get(k, 0) for k in key_list] 

173 intensities = [float(x) for x in tensor] 

174 

175 assert len(intensities) == len(masses), logging.error( 

176 f"Int {len(intensities)}: \n{intensities}\n\nmasses {len(masses)}: \n{masses}" 

177 ) 

178 

179 out_dict = {"Fragment": key_list, "Mass": masses, "Intensity": intensities} 

180 out_df = pd.DataFrame(out_dict) 

181 out_df = out_df[out_df["Mass"] != 0].copy() 

182 

183 return out_df