Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

"""
Defines the constants that are used in the rest of the project.

Such as the masses of amino acids, supported modifications, length of the encodings,
maximum length supported, labels and order of the encoded ions ...

Greatly inspired/copied from:
https://github.com/kusterlab/prosit/blob/master/prosit/constants.py

And released under an Apache 2.0 license
"""

12 

# Split fraction used for validation.
# NOTE(review): whether 0.8 is the training or the validation share is not
# visible from this file -- confirm against the code that consumes it.
VAL_SPLIT = 0.8

# Per-analyzer tolerances as bare numbers (units are given in TOLERANCE below)
TOLERANCE_FTMS = 25
TOLERANCE_ITMS = 0.35
TOLERANCE_TRIPLETOF = 0.5

# Per-analyzer tolerances as (value, unit) pairs.
# NOTE(review): the TripleTOF entry (50 ppm) does not match
# TOLERANCE_TRIPLETOF (0.5) above; both values are kept as found -- confirm
# which one is intended.
TOLERANCE = {
    "FTMS": (TOLERANCE_FTMS, "ppm"),
    "ITMS": (TOLERANCE_ITMS, "da"),
    "TripleTOF": (50, "ppm"),
}

20 

# Supported precursor charge states
CHARGES = [1, 2, 3, 4, 5, 6]
DEFAULT_MAX_CHARGE = max(CHARGES)

# Highest fragment-ion charge that gets encoded
MAX_FRAG_CHARGE = 3

# Longest peptide sequence supported
MAX_SEQUENCE = 30
# Two extra positions on top of the sequence length.
# NOTE(review): presumably for the "n"/"c" terminal placeholders -- confirm.
MAX_TENSOR_SEQUENCE = MAX_SEQUENCE + 2
# A sequence of length L has L - 1 fragmentation positions
MAX_ION = MAX_SEQUENCE - 1

# Encoded ion series, kept in sorted order (["b", "y"])
ION_TYPES = sorted(["y", "b"])

# Neutral losses; the empty string stands for "no loss"
NLOSSES = ["", "H2O", "NH3"]

# Ion series grouped into two sets by direction
FORWARD = {"a", "b", "c"}
BACKWARD = {"x", "y", "z"}

34 

# Atomic elements
PROTON = 1.007276467
ELECTRON = 0.00054858
H = 1.007825035
C = 12.0
O = 15.99491463
N = 14.003074

# Tiny molecules (sums of the atomic masses above)
N_TERMINUS = H
C_TERMINUS = O + H
CO = C + O
CHO = C + H + O
NH2 = N + H * 2
H2O = H * 2 + O
NH3 = N + H * 3

# Mass of each supported neutral loss, keyed by its label
NEUTRAL_LOSS = {"NH3": NH3, "H2O": H2O}

# Mass offset for each ion series, relative to a base of 0
ION_OFFSET = {
    "a": 0 - CHO,
    "b": 0 - H,
    "c": 0 + NH2,
    "x": 0 + CO - H,
    "y": 0 + H,
    "z": 0 - NH2,
}

62 

# Amino acids
# Modifications use high caps PSI-MS name
# Maps each modification name to the mass it adds
MODIFICATION = {
    "CARBAMIDOMETHYL": 57.0214637236,  # Carbamidomethylation (CAM)
    "ACETYL": 42.010565,  # Acetylation
    "DEAMIDATED": 0.984016,  # Deamidation
    "OXIDATION": 15.99491,  # Oxidation
    "PHOSPHO": 79.966331,  # Phosphorylation
    "METHYL": 14.015650,  # Methylation
    "DIMETHYL": 28.031300,  # Dimethylation
    "TRIMETHYL": 42.046950,  # Trimethylation
    "FORMYL": 27.994915,  # Formylation
    "GG": 114.042927,  # GlyGly ubiquitinylation residue
    "LRGG": 383.228103,  # LeuArgGlyGly ubiquitinylation residue
    "NITRO": 44.985078,  # Oxidation to nitro
    "BIOTINYL": 226.077598,  # Biotinylation
}

# Maps each variable modification to the residues it can sit on, written as
# a string with one character per residue ("n" is the n-terminus placeholder).
# CARBAMIDOMETHYL and LRGG have no entry here.
VARIABLE_MODS = {
    "ACETYL": "Kn",  # Acetylation
    "BIOTINYL": "K",  # Biotinylation
    "DEAMIDATED": "RNQ",  # Deamidation
    "OXIDATION": "MP",  # Oxidation
    "PHOSPHO": "STY",  # Phosphorylation
    "METHYL": "KR",  # Methylation
    "DIMETHYL": "KR",  # Dimethylation
    "TRIMETHYL": "K",  # Trimethylation
    "FORMYL": "K",  # Formylation
    "GG": "K",  # GlyGly ubiquitinylation residue
    "NITRO": "Y",  # Oxidation to nitro
}

# Integer code of every modification, numbered from 1 in declaration order.
# Index 0 is not assigned here.
# NOTE(review): 0 is presumably reserved for "no modification"/padding --
# confirm against the encoder.
MOD_INDICES = {name: index for index, name in enumerate(MODIFICATION, start=1)}
# {'CARBAMIDOMETHYL': 1, 'ACETYL': 2, 'DEAMIDATED': 3, ...

# Inverse lookup: integer code -> modification name
MOD_INDICES_S = {index: name for name, index in MOD_INDICES.items()}
# {1: 'CARBAMIDOMETHYL', 2: 'ACETYL', 3: 'DEAMIDATED', ...

100 

# Residue masses, keyed by one-letter code
AMINO_ACID = {
    "G": 57.021464,
    "R": 156.101111,
    "V": 99.068414,
    "P": 97.052764,
    "S": 87.032028,
    "U": 150.95363,
    "L": 113.084064,
    "M": 131.040485,
    "Q": 128.058578,
    "N": 114.042927,
    "Y": 163.063329,
    "E": 129.042593,
    # Cysteine always carries the carbamidomethyl mass
    "C": 103.009185 + MODIFICATION["CARBAMIDOMETHYL"],
    "F": 147.068414,
    "I": 113.084064,
    "A": 71.037114,
    "T": 101.047679,
    "W": 186.079313,
    "H": 137.058912,
    "D": 115.026943,
    "K": 128.094963,
    "n": N_TERMINUS,  # Placeholder to have n terminal modifications
    "c": C_TERMINUS,  # Placeholder to have c terminal modifications
}

AMINO_ACID_SET = set(AMINO_ACID)

# Integer code of every residue, numbered from 1 in sorted order.  Upper-case
# letters sort before lower-case ones, so the "c"/"n" placeholders come last.
ALPHABET = {aa: index for index, aa in enumerate(sorted(AMINO_ACID), start=1)}
# {'A': 1, 'C': 2, ..., 'W': 20, 'Y': 21, 'c': 22, 'n': 23}

# Inverse lookup: integer code -> residue
ALPHABET_S = {index: aa for aa, index in ALPHABET.items()}
# {1: 'A', 2: 'C', ..., 20: 'W', 21: 'Y', 22: 'c', 23: 'n'}

AAS_NUM = len(ALPHABET)

135 

# Maps every accepted spelling of a modified residue to its modification
# name ("" means that nothing is added on top of the residue mass).
MOD_PEPTIDE_ALIASES = {
    "C[160]": "",  # This makes it so it assumes it is always modified
    "C[+57]": "",  # This makes it so it assumes it is always modified
    "M(ox)": "OXIDATION",
    "M[OXIDATION]": "OXIDATION",
    "P[OXIDATION]": "OXIDATION",  # Hydroxylation of proline
    "S[PHOSPHO]": "PHOSPHO",
    "Y[PHOSPHO]": "PHOSPHO",
    "S[PHOS]": "PHOSPHO",
    "T[PHOSPHO]": "PHOSPHO",
    "T[PHOS]": "PHOSPHO",
    "K[Acetyl]": "ACETYL",
    "K[GlyGly]": "GG",
    "K[156]": "FORMYL",  # or "DIMETHYL",
    "P[113]": "OXIDATION",  # aka hydroxylation
    "R[157]": "DEAMIDATED",  # aka citrullinated
    "n[43]": "ACETYL",  # n-terminal acetylation
    "n[ACETYL]": "ACETYL",  # n-terminal acetylation
}

# Adds the canonical names to the aliases, like K[GG]
for mod_name, mod_aminoacids in VARIABLE_MODS.items():
    for mod_aa in mod_aminoacids:
        MOD_PEPTIDE_ALIASES[f"{mod_aa}[{mod_name}]"] = mod_name

# This generates aliases like T[+80], M[+15.99], M[+15.9949]
# (one group of aliases per rounding precision and modification)
int_aliases = []
for rounding_term in [0, 2, 4]:
    for mod_name, mod_aminoacids in VARIABLE_MODS.items():
        mass_diff = (
            f"[+{round(MODIFICATION[mod_name], rounding_term):.{rounding_term}f}]"
        )
        int_aliases.append({mod_aa + mass_diff: mod_name for mod_aa in mod_aminoacids})

# Mass-difference spelling -> modification name, e.g. "T[+80]" -> "PHOSPHO".
# The groups are applied in reverse so that, when two modifications round to
# the same label (e.g. K[+42] for ACETYL and TRIMETHYL), the one listed first
# in VARIABLE_MODS is written last and therefore wins.
MASS_DIFF_ALIASES = {}
for alias_group in reversed(int_aliases):
    MASS_DIFF_ALIASES.update(alias_group)

# Inverse map: canonical spelling -> mass-difference spelling,
# e.g. "T[PHOSPHO]" -> "T[+80]".  The last label seen for each name is kept.
MASS_DIFF_ALIASES_I = {
    alias[0] + f"[{mod_name}]": alias for alias, mod_name in MASS_DIFF_ALIASES.items()
}
# Cysteine is treated as always carbamidomethylated, so the bare residue
# is spelled with its mass difference ...
MASS_DIFF_ALIASES_I["C"] = "C[+57]"
# ... while every other bare residue maps to itself.  setdefault keeps the
# "C" entry above; a blanket update with {aa: aa} would silently reset it
# to plain "C".
for aa in AMINO_ACID:
    MASS_DIFF_ALIASES_I.setdefault(aa, aa)

MOD_PEPTIDE_ALIASES.update(MASS_DIFF_ALIASES)

# This generates aliases like T[181] (residue mass + modification mass,
# rounded to an integer).  Applied in reverse for the same reason as above.
# NOTE(review): this also rewrites the hand-written "K[156]" entry, since
# both FORMYL and DIMETHYL round to 156 on K and DIMETHYL is written last.
int_aliases = [
    {
        mod_aa + f"[{round(MODIFICATION[mod_name] + AMINO_ACID[mod_aa])}]": mod_name
        for mod_aa in mod_aminoacids
    }
    for mod_name, mod_aminoacids in VARIABLE_MODS.items()
]
for alias_group in reversed(int_aliases):
    MOD_PEPTIDE_ALIASES.update(alias_group)

# Mass of every plain residue plus every aliased (modified) residue.
# An alias's first character is always the residue it modifies.
MOD_AA_MASSES = AMINO_ACID.copy()
MOD_AA_MASSES.update(
    {
        alias: AMINO_ACID[alias[0]] + MODIFICATION.get(mod_name, 0)
        for alias, mod_name in MOD_PEPTIDE_ALIASES.items()
    }
)

# Drop the temporaries so they are not mistaken for module constants
del int_aliases, alias_group, rounding_term, mass_diff
del mod_name, mod_aminoacids, mod_aa, aa

198 

199 

# Order in which the label loops below are nested (outermost first)
ION_ENCODING_NESTING = ["CHARGE", "POSITION", "ION_TYPE"]
# Values each nesting level iterates over
ION_ENCODING_ITERABLES = {
    # Joined into one string ("by"); iterating it yields one character per
    # ion type, so this relies on single-character ion names.
    "ION_TYPE": "".join(sorted(ION_TYPES)),
    "CHARGE": [f"z{z}" for z in range(1, MAX_FRAG_CHARGE + 1)],
    "POSITION": list(range(1, MAX_ION + 1)),
}
# Labels of the encoded fragment ions, e.g. "z1b1", "z1y1", "z1b2", ...
FRAG_EMBEDING_LABELS = []

# TODO implement neutral losses ... if needed
# The loop variables are deliberately left at module level: they are removed
# by the `del charge` / `del pos` / `del ion` / `del key` statements further
# down the file, which would fail if these names did not exist.
for charge in ION_ENCODING_ITERABLES[ION_ENCODING_NESTING[0]]:
    for pos in ION_ENCODING_ITERABLES[ION_ENCODING_NESTING[1]]:
        for ion in ION_ENCODING_ITERABLES[ION_ENCODING_NESTING[2]]:
            key = f"{charge}{ion}{pos}"
            FRAG_EMBEDING_LABELS.append(key)

NUM_FRAG_EMBEDINGS = len(FRAG_EMBEDING_LABELS)

216 

217 

# Retention-time reference peptides: sequence -> vendor kit and iRT value
IRT_PEPTIDES = {
    "LGGNEQVTR": {"vendor": "biognosys", "irt": -24.92},
    "GAGSSEPVTGLDAK": {"vendor": "biognosys", "irt": 0},
    "VEATFGVDESNAK": {"vendor": "biognosys", "irt": 12.39},
    "YILAGVENSK": {"vendor": "biognosys", "irt": 19.79},
    "TPVISGGPYEYR": {"vendor": "biognosys", "irt": 28.71},
    "TPVITGAPYEYR": {"vendor": "biognosys", "irt": 33.38},
    "DGLDAASYYAPVR": {"vendor": "biognosys", "irt": 42.26},
    "ADVTPADFSEWSK": {"vendor": "biognosys", "irt": 54.62},
    "GTFIIDPGGVIR": {"vendor": "biognosys", "irt": 70.52},
    "GTFIIDPAAVIR": {"vendor": "biognosys", "irt": 87.23},
    "LFLQFGAQGSPFLK": {"vendor": "biognosys", "irt": 100},
    "HEHISSDYAGK": {"vendor": "procal", "irt": -36.83},
    "IGYDHGHIEHK": {"vendor": "procal", "irt": -33.5},
    "TFAHTESHISK": {"vendor": "procal", "irt": -33.32},
    "ISLGEHEGGGK": {"vendor": "procal", "irt": -18.54},
    "YVGDSYDSSAK": {"vendor": "procal", "irt": -16.87},
    "FGTGTYAGGEK": {"vendor": "procal", "irt": -9.35},
    "LSSGYDGTSYK": {"vendor": "procal", "irt": -8.82},
    "TASGVGGFSTK": {"vendor": "procal", "irt": -4.18},
    "LTSGDFGEDSK": {"vendor": "procal", "irt": -3.76},
    "AGDEALGDTYK": {"vendor": "procal", "irt": -3.52},
    "SYASDFGSSAK": {"vendor": "procal", "irt": 1.79},
    "LYSYYSSTESK": {"vendor": "procal", "irt": 6.39},
    "FASDTSDEAFK": {"vendor": "procal", "irt": 7.2},
    "LTDTFADDDTK": {"vendor": "procal", "irt": 8.25},
    "LYTGAGYDEVK": {"vendor": "procal", "irt": 10.53},
    "TLIAYDDSSTK": {"vendor": "procal", "irt": 14.98},
    "TASEFDSAIAQDK": {"vendor": "procal", "irt": 17.84},
    "HDLDYGIDSYK": {"vendor": "procal", "irt": 19.86},
    "FLASSEGGFTK": {"vendor": "procal", "irt": 20.88},
    "HTAYSDFLSDK": {"vendor": "procal", "irt": 25.9},
    "FVGTEYDGLAK": {"vendor": "procal", "irt": 26.82},
    "YALDSYSLSSK": {"vendor": "procal", "irt": 32},
    "YYGTIEDTEFK": {"vendor": "procal", "irt": 33.73},
    "GFLDYESTGAK": {"vendor": "procal", "irt": 35.9},
    "HLTGLTFDTYK": {"vendor": "procal", "irt": 36.5},
    "YFGYTSDTFGK": {"vendor": "procal", "irt": 41.42},
    "HDTVFGSYLYK": {"vendor": "procal", "irt": 41.42},
    "FSYDGFEEDYK": {"vendor": "procal", "irt": 44.22},
    "ALFSSITDSEK": {"vendor": "procal", "irt": 44.88},
    "LYLSEYDTIGK": {"vendor": "procal", "irt": 48.16},
    "HFALFSTDVTK": {"vendor": "procal", "irt": 50.41},
    "VSGFSDISIYK": {"vendor": "procal", "irt": 51.67},
    "GSGGFTEFDLK": {"vendor": "procal", "irt": 51.97},
    "TFTGTTDSFFK": {"vendor": "procal", "irt": 52.2},
    "TFGTETFDTFK": {"vendor": "procal", "irt": 54.53},
    "YTSFYGAYFEK": {"vendor": "procal", "irt": 56.65},
    "LTDELLSEYYK": {"vendor": "procal", "irt": 57.66},
    "ASDLLSGYYIK": {"vendor": "procal", "irt": 57.68},
    "YGFSSEDIFTK": {"vendor": "procal", "irt": 57.77},
    "HTYDDEFFTFK": {"vendor": "procal", "irt": 58.44},
    "FLFTGYDTSVK": {"vendor": "procal", "irt": 61.07},
    "GLSDYLVSTVK": {"vendor": "procal", "irt": 61.34},
    "VYAETLSGFIK": {"vendor": "procal", "irt": 62.57},
    "GLFYGGYEFTK": {"vendor": "procal", "irt": 62.96},
    "GSTDDGFIILK": {"vendor": "procal", "irt": 63.07},
    "TSIDSFIDSYK": {"vendor": "procal", "irt": 63.51},
    "TLLLDAEGFEK": {"vendor": "procal", "irt": 65.49},
    "GFVIDDGLITK": {"vendor": "procal", "irt": 66.46},
    "GFEYSIDYFSK": {"vendor": "procal", "irt": 66.9},
    "GIFGAFTDDYK": {"vendor": "procal", "irt": 71.49},
    "LEIYTDFDAIK": {"vendor": "procal", "irt": 71.99},
    "FTEGGILDLYK": {"vendor": "procal", "irt": 72.95},
    "LLFSYSSGFVK": {"vendor": "procal", "irt": 73.23},
    "STFFSFGDVGK": {"vendor": "procal", "irt": 74.29},
    "LTAYFEDLELK": {"vendor": "procal", "irt": 75.09},
    "VDTFLDGFSVK": {"vendor": "procal", "irt": 76.57},
    "GASDFLSFAVK": {"vendor": "procal", "irt": 77.42},
    "GEDLDFIYVVK": {"vendor": "procal", "irt": 79.62},
    "VSSIFFDTFDK": {"vendor": "procal", "irt": 82.28},
    "SILDYVSLVEKK": {"vendor": "procal", "irt": 83.05},
    "VYGYELTSLFK": {"vendor": "procal", "irt": 87.89},
    "GGFFSFGDLTK": {"vendor": "procal", "irt": 88.04},
    "YDTAIDFGLFK": {"vendor": "procal", "irt": 89.4},
    "IVLFELEGITK": {"vendor": "procal", "irt": 94.97},
    "GIEDYYIFFAK": {"vendor": "procal", "irt": 95.37},
    "SILDYVSLVEK": {"vendor": "procal", "irt": 96.26},
    "AFSDEFSYFFK": {"vendor": "procal", "irt": 99.13},
    "AFLYEIIDIGK": {"vendor": "procal", "irt": 99.61},
}

299 

300 

# Remove the loop variables left at module level by the nested loops that
# fill FRAG_EMBEDING_LABELS, so they are not exported as constants.
del charge
del pos
del ion
del key

305 

if __name__ == "__main__":
    # This is implemented so the constants can be printed if needed running this file directly
    # Names starting with "_" are skipped.  The globals are copied into a
    # separate dict first because the loop below binds `k` and `v` at module
    # level, which would otherwise modify globals() while it is being iterated.
    my_vars = {k: v for k, v in globals().items() if not k.startswith("_")}
    for k, v in my_vars.items():
        print(f"\n>>> {k} {type(v)} = {v}")