Coverage for denofo/converter/convert.py: 84%
108 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-04-09 15:27 +0200
« prev ^ index » next coverage.py v7.6.12, created at 2025-04-09 15:27 +0200
1import pickle
2import re
3import warnings
4from pathlib import Path
5from Bio import SeqIO
6from denofo.models import DeNovoGeneAnnotation
7from denofo.utils.helpers import get_short_repr, get_model_from_short_repr
def convert_to_pickle(
    dnga_model: DeNovoGeneAnnotation, outf: Path | None = None
) -> bytes:
    """
    Convert a DeNovoGeneAnnotation model to its pickled representation.

    The model is serialized exactly once; the resulting bytes are written to
    ``outf`` (when given) and are also returned, so the file content and the
    return value are always identical.

    :param dnga_model: DeNovoGeneAnnotation model to convert
    :type dnga_model: DeNovoGeneAnnotation
    :param outf: output file path; no file is written if None, defaults to None
    :type outf: Path, optional
    :return: pickled model
    :rtype: bytes
    """
    pickled = pickle.dumps(dnga_model)
    if outf:
        # Reuse the bytes produced above instead of pickling a second time.
        with open(outf, "wb") as outfile:
            outfile.write(pickled)
    return pickled
def load_from_pickle(pkl_file: Path) -> DeNovoGeneAnnotation:
    """
    Load a DeNovoGeneAnnotation model from a pickle file.

    .. warning::
        Unpickling can execute arbitrary code embedded in the file. Only load
        pickle files that come from a trusted source; prefer the JSON format
        for data exchanged with third parties.

    :param pkl_file: pickle file path
    :type pkl_file: Path
    :return: the object stored in the pickle file (not validated here)
    :rtype: DeNovoGeneAnnotation
    """
    with open(pkl_file, "rb") as infile:
        # SECURITY: pickle.load trusts the file content completely.
        return pickle.load(infile)
def convert_to_json(dnga_model: DeNovoGeneAnnotation, outf: Path | None = None) -> str:
    """
    Convert a DeNovoGeneAnnotation model to a JSON string.

    The model is dumped with ``exclude_none=True``, ``by_alias=True`` and an
    indentation of 2. If ``outf`` is given, the JSON string is additionally
    written to that file, always encoded as UTF-8.

    :param dnga_model: DeNovoGeneAnnotation model to convert
    :type dnga_model: DeNovoGeneAnnotation
    :param outf: output file path; no file is written if None, defaults to None
    :type outf: Path, optional
    :return: JSON string
    :rtype: str
    """
    json_str = dnga_model.model_dump_json(exclude_none=True, by_alias=True, indent=2)
    if outf:
        # Explicit UTF-8: the platform default encoding may be unable to
        # represent non-ASCII characters contained in the JSON text.
        with open(outf, "w", encoding="utf-8") as outfile:
            outfile.write(json_str)
    return json_str
def load_from_json(json_file: Path) -> DeNovoGeneAnnotation:
    """
    Load a DeNovoGeneAnnotation model from a JSON file.

    The file is read as UTF-8 and its content is validated via
    ``DeNovoGeneAnnotation.model_validate_json``.

    :param json_file: JSON file path
    :type json_file: Path
    :return: DeNovoGeneAnnotation model
    :rtype: DeNovoGeneAnnotation
    """
    # Explicit UTF-8 so reading does not depend on the platform default encoding.
    with open(json_file, "r", encoding="utf-8") as infile:
        return DeNovoGeneAnnotation.model_validate_json(infile.read())
def load_from_fasta(
    fasta_file: Path, identifiers: set[str] | None = None
) -> DeNovoGeneAnnotation:
    """
    Build a DeNovoGeneAnnotation model from the denofo tag in a FASTA file.

    Record descriptions are searched for a ``denofo:"<short string>"`` tag.
    All considered records must carry the same short string, which is then
    decoded into the model.

    :param fasta_file: FASTA file path
    :type fasta_file: Path
    :param identifiers: if given, only records whose id is in this set are
        considered, defaults to None
    :type identifiers: set[str], optional
    :raises ValueError: if no annotation or several different annotations
        are found
    :return: DeNovoGeneAnnotation model
    :rtype: DeNovoGeneAnnotation
    """
    # regex to match until first space outside of quotes or end of string
    tag_pattern = re.compile(r'denofo:["\'](.+?)(?=["\'](?: |$))')
    id_note = " with given identifiers" if identifiers else ""

    with open(fasta_file, "r") as handle:
        found = {
            hit.group(1)
            for entry in SeqIO.parse(handle, "fasta")
            if (not identifiers or entry.id in identifiers)
            and (hit := tag_pattern.search(entry.description))
        }

    if not found:
        raise ValueError(
            "No denofo annotation found in the FASTA file"
            f"{id_note}."
        )
    if len(found) > 1:
        raise ValueError(
            "Multiple different denofo annotations found in the FASTA file"
            f"{id_note}.\n"
            f"The following annotations were found: {found}\n"
        )

    (only_short_str,) = found
    return decode_short_str(only_short_str)
def encode_short_str(dnga_model: DeNovoGeneAnnotation) -> str:
    """
    Produce the short representation string of a DeNovoGeneAnnotation model.

    Thin wrapper that delegates to ``get_short_repr``.

    :param dnga_model: DeNovoGeneAnnotation model to encode
    :type dnga_model: DeNovoGeneAnnotation
    :return: short representation string
    :rtype: str
    """
    return get_short_repr(dnga_model)
def decode_short_str(short_str: str) -> DeNovoGeneAnnotation:
    """
    Rebuild a DeNovoGeneAnnotation model from its short representation string.

    Thin wrapper that delegates to ``get_model_from_short_repr`` with
    ``DeNovoGeneAnnotation`` as the target model class.

    :param short_str: short representation string
    :type short_str: str
    :return: DeNovoGeneAnnotation model
    :rtype: DeNovoGeneAnnotation
    """
    target_model = DeNovoGeneAnnotation
    decoded = get_model_from_short_repr(short_str, target_model)
    return decoded
def annotate_fasta(
    dnga_model: DeNovoGeneAnnotation,
    fasta_file: Path,
    outf: Path | None = None,
    identifiers: set[str] | None = None,
) -> str:
    """
    Annotate a FASTA file with the model short string representation.

    A ``denofo:"<short string>"`` tag is appended to the description of every
    selected record. Records that are not selected are kept unchanged, so the
    output always contains all records of the input file.

    :param dnga_model: DeNovoGeneAnnotation model to annotate with
    :type dnga_model: DeNovoGeneAnnotation
    :param fasta_file: FASTA file path to annotate
    :type fasta_file: Path
    :param outf: output file path; no file is written if None, defaults to None
    :type outf: Path, optional
    :param identifiers: if given, only records whose id is in this set are
        annotated; otherwise all records are annotated, defaults to None
    :type identifiers: set[str], optional
    :return: annotated FASTA string (always returned, also when outf is given)
    :rtype: str
    """
    short_str = encode_short_str(dnga_model)
    # attributeName:value according to ncbi or ensemble standards
    tag = f' denofo:"{short_str}"'
    seqs = []

    with open(fasta_file, "r") as infile:
        for record in SeqIO.parse(infile, "fasta"):
            if not identifiers or record.id in identifiers:
                record.description += tag
            seqs.append(record)

    if outf:
        with open(outf, "w") as outfile:
            SeqIO.write(seqs, outfile, "fasta")

    # Each entry already ends with a newline, so the entries are concatenated
    # without a separator; joining with "\n" put a blank line between records.
    return "".join(f">{record.description}\n{record.seq}\n" for record in seqs)
def annotate_gff(
    dnga_model: DeNovoGeneAnnotation,
    gff_file: Path,
    outf: Path | None = None,
    feature: str = "gene",
    identifiers: set[str] | None = None,
) -> str:
    """
    Annotate a GFF file with the model short string representation.

    A ``denofo "<short string>";`` attribute is appended to the attributes
    column (9th column) of every selected line. All other lines (comments,
    blank lines, non-matching features, non-matching identifiers) are passed
    through unchanged. Every output line ends with exactly one newline.

    :param dnga_model: DeNovoGeneAnnotation model to annotate with
    :type dnga_model: DeNovoGeneAnnotation
    :param gff_file: GFF file path
    :type gff_file: Path
    :param outf: output file path; no file is written if None, defaults to None
    :type outf: Path, optional
    :param feature: feature type (3rd column) to annotate, defaults to "gene"
    :type feature: str, optional
    :param identifiers: if given, only lines whose attributes column contains
        at least one of these identifiers (substring match) are annotated,
        defaults to None
    :type identifiers: set[str], optional
    :raises ValueError: if a data line has fewer than 8 or more than 9 columns
    :return: annotated GFF string (always returned, also when outf is given)
    :rtype: str
    """
    short_str = encode_short_str(dnga_model)
    out_lines = []

    with open(gff_file, "r") as infile:
        for line in infile:
            # Drop the line's own newline; exactly one is re-added on output.
            content = line.rstrip("\n")
            stripped = content.strip()
            if line.startswith("#") or not stripped:
                # Comments/directives and blank lines are passed through.
                out_lines.append(content)
                continue
            fields = stripped.split("\t")
            if len(fields) < 8 or len(fields) > 9:
                raise ValueError(
                    f"GFF file has {len(fields)} columns. "
                    "Only 8 or 9 columns are allowed."
                )
            if len(fields) == 8:
                warnings.warn(
                    "The line has only 8 columns, if identifiers were "
                    "given, this line will be ignored. Otherwise, the line will be "
                    "annotated if feature matches.\n"
                    f"line: {line}"
                )
                if identifiers:
                    out_lines.append(content)
                    continue
                # Add the missing (empty) attributes column so the line can
                # be annotated below instead of failing on fields[8].
                fields.append("")
            if fields[2] != feature:
                out_lines.append(content)
                continue
            if identifiers and not any(ident in fields[8] for ident in identifiers):
                out_lines.append(content)
                continue

            prefix = " " if fields[8] else ""
            fields[8] += f'{prefix}denofo "{short_str}";'
            # "; " and 'attrName "attrValue";' according to gff3/gtf standards (see e.g. ensembl)
            out_lines.append("\t".join(fields))

    outstr = "".join(f"{entry}\n" for entry in out_lines)

    if outf:
        with open(outf, "w") as outfile:
            outfile.write(outstr)

    return outstr
def load_from_gff(
    gff_file: Path, feature: str = "gene", identifiers: set[str] | None = None
) -> DeNovoGeneAnnotation:
    """
    Build a DeNovoGeneAnnotation model from the denofo attribute in a GFF file.

    Comment lines, lines with fewer than 9 columns and lines of another
    feature type are skipped. The attributes column of the remaining lines is
    searched for ``denofo "<short string>";``. All matches must carry the same
    short string, which is then decoded into the model.

    :param gff_file: GFF file path
    :type gff_file: Path
    :param feature: feature type (3rd column) to load, defaults to "gene"
    :type feature: str, optional
    :param identifiers: if given, only lines whose attributes column contains
        at least one of these identifiers (substring match) are considered,
        defaults to None
    :type identifiers: set[str], optional
    :raises ValueError: if no annotation or several different annotations
        are found
    :return: DeNovoGeneAnnotation model
    :rtype: DeNovoGeneAnnotation
    """
    # regex to match until first semicolon outside of quotes
    tag_pattern = re.compile(r'denofo ["\'](.+?)(?=["\'];)')
    found = set()

    with open(gff_file, "r") as handle:
        for raw in handle:
            if raw.startswith("#"):
                continue
            columns = raw.strip().split("\t")
            if not (len(columns) >= 9 and columns[2] == feature):
                continue
            attributes = columns[8]
            if identifiers and all(ident not in attributes for ident in identifiers):
                continue
            hit = tag_pattern.search(attributes)
            if hit is not None:
                found.add(hit.group(1))

    if not found:
        raise ValueError("No denofo annotation found in the GFF file.")
    if len(found) > 1:
        raise ValueError(
            "Multiple different denofo annotations found in the GFF file.\n"
            f"The following annotations were found: {found}\n"
        )

    (only_short_str,) = found
    return decode_short_str(only_short_str)