Coverage for denofo/converter/convert.py: 84%

108 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-04-09 15:27 +0200

1import pickle 

2import re 

3import warnings 

4from pathlib import Path 

5from Bio import SeqIO 

6from denofo.models import DeNovoGeneAnnotation 

7from denofo.utils.helpers import get_short_repr, get_model_from_short_repr 

8 

9 

def convert_to_pickle(
    dnga_model: DeNovoGeneAnnotation, outf: Path | None = None
) -> bytes:
    """
    Serialize a DeNovoGeneAnnotation model with pickle.

    The model is pickled exactly once. The resulting bytes are returned and,
    if ``outf`` is given, the very same bytes are also written to that file.
    Serialization happens before the output file is opened, so a model that
    cannot be pickled does not leave an empty or truncated file behind.

    :param dnga_model: DeNovoGeneAnnotation model to convert
    :type dnga_model: DeNovoGeneAnnotation
    :param outf: output file path, defaults to None (nothing is written)
    :type outf: Path, optional
    :return: pickled model
    :rtype: bytes
    """
    # Pickle first: any pickling error surfaces before the file is touched.
    pickled = pickle.dumps(dnga_model)
    if outf:
        with open(outf, "wb") as outfile:
            outfile.write(pickled)
    return pickled

27 

28 

def load_from_pickle(pkl_file: Path) -> DeNovoGeneAnnotation:
    """
    Read a pickle file and return the object stored in it.

    The file is opened in binary mode and handed to ``pickle.load``; whatever
    object was pickled is returned as-is (no validation is performed here).

    NOTE: unpickling can execute arbitrary code, so only load files that come
    from a trusted source.

    :param pkl_file: pickle file path
    :type pkl_file: Path
    :return: DeNovoGeneAnnotation model
    :rtype: DeNovoGeneAnnotation
    """
    with open(pkl_file, "rb") as handle:
        model = pickle.load(handle)
    return model

40 

41 

def convert_to_json(dnga_model: DeNovoGeneAnnotation, outf: Path | None = None) -> str:
    """
    Serialize a DeNovoGeneAnnotation model to a JSON string.

    The JSON is produced by the model's ``model_dump_json`` with ``None``
    values excluded, field aliases used as keys and an indentation of 2.
    When ``outf`` is given the string is additionally written to that file
    (text mode, platform default encoding).

    :param dnga_model: DeNovoGeneAnnotation model to convert
    :type dnga_model: DeNovoGeneAnnotation
    :param outf: output file path, defaults to None (nothing is written)
    :type outf: Path, optional
    :return: JSON string
    :rtype: str
    """
    serialized = dnga_model.model_dump_json(
        indent=2, by_alias=True, exclude_none=True
    )
    if not outf:
        return serialized

    with open(outf, "w") as handle:
        handle.write(serialized)
    return serialized

58 

59 

def load_from_json(json_file: Path) -> DeNovoGeneAnnotation:
    """
    Build a DeNovoGeneAnnotation model from the contents of a JSON file.

    The whole file is read as text (platform default encoding) and passed to
    ``DeNovoGeneAnnotation.model_validate_json``.

    :param json_file: JSON file path
    :type json_file: Path
    :return: DeNovoGeneAnnotation model
    :rtype: DeNovoGeneAnnotation
    """
    with open(json_file, "r") as handle:
        raw_json = handle.read()
    return DeNovoGeneAnnotation.model_validate_json(raw_json)

71 

72 

def load_from_fasta(
    fasta_file: Path, identifiers: set[str] | None = None
) -> DeNovoGeneAnnotation:
    """
    Extract the denofo annotation from FASTA headers and decode it.

    Every record (or only those whose id is in ``identifiers``, if given) is
    searched for a ``denofo:"<short string>"`` tag in its description. All
    distinct short strings are collected; exactly one must be found.

    :param fasta_file: FASTA file path
    :type fasta_file: Path
    :param identifiers: record ids to restrict the search to, defaults to None
    :type identifiers: set[str], optional
    :raises ValueError: if no annotation, or more than one distinct
        annotation, is found
    :return: DeNovoGeneAnnotation model
    :rtype: DeNovoGeneAnnotation
    """
    # Captures the text after `denofo:` + opening quote, lazily, up to the
    # first quote that is followed by a space or the end of the description.
    annotation_re = re.compile(r'denofo:["\'](.+?)(?=["\'](?: |$))')
    scope_note = " with given identifiers" if identifiers else ""
    found = set()

    with open(fasta_file, "r") as handle:
        for rec in SeqIO.parse(handle, "fasta"):
            if not identifiers or rec.id in identifiers:
                hit = annotation_re.search(rec.description)
                if hit is not None:
                    found.add(hit.group(1))

    if not found:
        raise ValueError(
            f"No denofo annotation found in the FASTA file{scope_note}."
        )
    if len(found) > 1:
        raise ValueError(
            f"Multiple different denofo annotations found in the FASTA file"
            f"{scope_note}.\n"
            f"The following annotations were found: {found}\n"
        )

    (only_annotation,) = found
    return decode_short_str(only_annotation)

111 

112 

def encode_short_str(dnga_model: DeNovoGeneAnnotation) -> str:
    """
    Return the short string representation of a DeNovoGeneAnnotation model.

    Thin wrapper that delegates to ``get_short_repr``.

    :param dnga_model: DeNovoGeneAnnotation model to encode
    :type dnga_model: DeNovoGeneAnnotation
    :return: short representation string
    :rtype: str
    """
    return get_short_repr(dnga_model)

124 

125 

def decode_short_str(short_str: str) -> DeNovoGeneAnnotation:
    """
    Rebuild a DeNovoGeneAnnotation model from its short string representation.

    Thin wrapper that delegates to ``get_model_from_short_repr`` with
    ``DeNovoGeneAnnotation`` as the target model class.

    :param short_str: short representation string
    :type short_str: str
    :return: DeNovoGeneAnnotation model
    :rtype: DeNovoGeneAnnotation
    """
    model = get_model_from_short_repr(short_str, DeNovoGeneAnnotation)
    return model

136 

137 

def annotate_fasta(
    dnga_model: DeNovoGeneAnnotation,
    fasta_file: Path,
    outf: Path | None = None,
    identifiers: set[str] | None = None,
) -> str:
    """
    Add the model's short string to the headers of a FASTA file.

    The tag `` denofo:"<short string>"`` is appended to the description of
    every record, or only of records whose id is in ``identifiers`` if that
    set is given; all other records are kept unchanged. If ``outf`` is given,
    all records are written there via ``SeqIO.write``. The annotated FASTA is
    always returned as a string as well; in that string each record is
    rendered as a header line plus one (unwrapped) sequence line, and
    consecutive records are separated by an empty line.

    :param dnga_model: DeNovoGeneAnnotation model to annotate with
    :type dnga_model: DeNovoGeneAnnotation
    :param fasta_file: input FASTA file path
    :type fasta_file: Path
    :param outf: output file path, defaults to None (nothing is written)
    :type outf: Path, optional
    :param identifiers: record ids to annotate, defaults to None (all records)
    :type identifiers: set[str], optional
    :return: annotated FASTA string
    :rtype: str
    """
    # attributeName:value according to ncbi or ensemble standards
    tag = f' denofo:"{encode_short_str(dnga_model)}"'

    with open(fasta_file, "r") as handle:
        records = list(SeqIO.parse(handle, "fasta"))

    for rec in records:
        if not identifiers or rec.id in identifiers:
            rec.description = rec.description + tag

    if outf:
        with open(outf, "w") as out_handle:
            SeqIO.write(records, out_handle, "fasta")

    entries = [f">{rec.description}\n{rec.seq}\n" for rec in records]
    return "\n".join(entries)

174 

175 

def annotate_gff(
    dnga_model: DeNovoGeneAnnotation,
    gff_file: Path,
    outf: Path | None = None,
    feature: str = "gene",
    identifiers: set[str] | None = None,
) -> str:
    """
    Annotate a GFF file with the model short string representation.

    For every line whose feature column (column 3) equals ``feature`` the
    attribute ``denofo "<short string>";`` is appended to the attributes
    column (column 9). If ``identifiers`` is given, only lines whose
    attributes column contains at least one of the identifiers (plain
    substring match) are annotated. Comment/directive lines (starting with
    ``#``), blank lines and all non-matching lines are copied unchanged.
    Every output line is terminated by exactly one newline.

    Lines with only 8 columns (no attributes column) trigger a warning. They
    are copied unchanged when ``identifiers`` is given (there is nothing to
    match against); otherwise an empty attributes column is added so the line
    can be annotated if its feature matches.

    :param dnga_model: DeNovoGeneAnnotation model to annotate with
    :type dnga_model: DeNovoGeneAnnotation
    :param gff_file: GFF file path
    :type gff_file: Path
    :param outf: output file path, defaults to None (nothing is written)
    :type outf: Path, optional
    :param feature: feature to annotate, defaults to "gene"
    :type feature: str, optional
    :param identifiers: identifiers to filter, defaults to None
    :type identifiers: set[str], optional
    :raises ValueError: if a non-comment, non-blank line has fewer than 8 or
        more than 9 tab-separated columns
    :return: annotated GFF string (always returned, also when outf is given)
    :rtype: str
    """
    short_str = encode_short_str(dnga_model)
    # 'attrName "attrValue";' according to gff3/gtf standards (see e.g. ensembl)
    annotation = f'denofo "{short_str}";'
    out_lines = []

    with open(gff_file, "r") as infile:
        for raw_line in infile:
            # Drop the line's own newline; one is re-added on output, so
            # copied lines are not followed by a spurious blank line.
            line = raw_line.rstrip("\n")

            # Comments/directives and blank lines carry no feature record.
            if line.startswith("#") or not line.strip():
                out_lines.append(line)
                continue

            fields = line.strip().split("\t")
            if len(fields) < 8 or len(fields) > 9:
                raise ValueError(
                    f"GFF file has {len(fields)} columns. "
                    "Only 8 or 9 columns are allowed."
                )
            if len(fields) == 8:
                warnings.warn(
                    "The line has only 8 columns, if identifiers were "
                    "given, this line will be ignored. Otherwise, the line will be "
                    "annotated if feature matches.\n"
                    f"line: {line}"
                )
                if identifiers:
                    out_lines.append(line)
                    continue
                # Add the missing attributes column so fields[8] exists below
                # (previously this path raised IndexError).
                fields.append("")

            if fields[2] != feature:
                out_lines.append(line)
                continue
            if identifiers and not any(ident in fields[8] for ident in identifiers):
                out_lines.append(line)
                continue

            # Separate from existing attributes with a space ("; " style).
            prefix = " " if fields[8] else ""
            fields[8] += f"{prefix}{annotation}"
            out_lines.append("\t".join(fields))

    outstr = "".join(f"{out_line}\n" for out_line in out_lines)

    if outf:
        with open(outf, "w") as outfile:
            outfile.write(outstr)

    return outstr

241 

242 

def load_from_gff(
    gff_file: Path, feature: str = "gene", identifiers: set[str] | None = None
) -> DeNovoGeneAnnotation:
    """
    Extract the denofo annotation from a GFF file and decode it.

    Only lines with at least 9 tab-separated columns whose feature column
    (column 3) equals ``feature`` are considered; lines starting with ``#``
    are skipped. If ``identifiers`` is given, the attributes column (column 9)
    must contain at least one of them (plain substring match). The
    ``denofo "<short string>";`` attribute is collected from every remaining
    line, and exactly one distinct short string must be found.

    :param gff_file: GFF file path
    :type gff_file: Path
    :param feature: feature to load, defaults to "gene"
    :type feature: str, optional
    :param identifiers: identifiers to filter, defaults to None
    :type identifiers: set[str], optional
    :raises ValueError: if no annotation, or more than one distinct
        annotation, is found
    :return: DeNovoGeneAnnotation model
    :rtype: DeNovoGeneAnnotation
    """
    # Captures the text after `denofo ` + opening quote, lazily, up to the
    # first quote that is directly followed by a semicolon.
    annotation_re = re.compile(r'denofo ["\'](.+?)(?=["\'];)')
    found = set()

    with open(gff_file, "r") as handle:
        for raw in handle:
            if raw.startswith("#"):
                continue
            columns = raw.strip().split("\t")
            if len(columns) < 9 or columns[2] != feature:
                continue
            attributes = columns[8]
            if identifiers and not any(ident in attributes for ident in identifiers):
                continue
            hit = annotation_re.search(attributes)
            if hit is not None:
                found.add(hit.group(1))

    if not found:
        raise ValueError("No denofo annotation found in the GFF file.")
    if len(found) > 1:
        raise ValueError(
            "Multiple different denofo annotations found in the GFF file.\n"
            f"The following annotations were found: {found}\n"
        )

    return decode_short_str(found.pop())