Coverage for denofo/converter/converter_cli.py: 83%
75 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-04-09 15:27 +0200
« prev ^ index » next coverage.py v7.6.12, created at 2025-04-09 15:27 +0200
1import argparse
2import warnings
3from pathlib import Path
4from denofo.utils.ncbiTaxDBcheck import check_NCBI_taxDB
5from denofo.utils.helpers import infer_format_from_extension
6from denofo.converter.convert import (
7 convert_to_pickle,
8 load_from_pickle,
9 convert_to_json,
10 load_from_json,
11 load_from_fasta,
12 annotate_fasta,
13 load_from_gff,
14 annotate_gff,
15 decode_short_str,
16 encode_short_str,
17)
def _build_parser() -> argparse.ArgumentParser:
    """
    Build the argument parser for the converter command-line interface.

    Every option uses a backspace character as metavar; presumably this
    hides the placeholder name in the generated help text (NOTE: confirm).

    :return: The configured argument parser.
    """
    parser = argparse.ArgumentParser(
        description=("Convert a de DeNoFo annotation to a different file format.")
    )
    parser.add_argument(
        "-i",
        "--input",
        type=str,
        required=True,
        help=(
            "The path and name of the input file. "
            "This file has to contain the DeNoFo annotation to be converted "
            "to another file format. "
            "If you want to annotate sequences in a FASTA/GFF file, please "
            "provide the FASTA/GFF file to annotate as --add_input parameter."
        ),
        metavar="\b",
    )
    parser.add_argument(
        "-if",
        "--input_format",
        type=str,
        choices=["dngf", "pickle", "fasta", "shortstr", "gff"],
        default="",  # infer from file extension
        help=(
            "The format of the input file. If not provided, the format is "
            "inferred based on the file extension."
        ),
        metavar="\b",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        default="",  # stdout
        help=(
            "Select the destination for the converted file "
            "(optional). If no output file is specified, the output will be "
            "printed to the console. "
            "If you want to annotate sequences in a FASTA/GFF file, please "
            "select the FASTA/GFF file to annotate as --add_input parameter "
            "and either another file location as --output to keep "
            "the original FASTA/GFF file unchanged (recommended) or select "
            "the same file as --output, which overwrites the original "
            "FASTA/GFF file."
        ),
        metavar="\b",
    )
    parser.add_argument(
        "-of",
        "--output_format",
        type=str,
        choices=["dngf", "pickle", "fasta", "shortstr", "gff"],
        help=(
            "The format of the output file. If not provided, the format is "
            "inferred based on the file extension. output_format is required if "
            "--output is empty (i.e. output is printed to console) or output "
            "file has no known file extension."
        ),
        metavar="\b",
    )
    parser.add_argument(
        "-a",
        "--add_input",
        type=str,
        help=(
            "Select an additional input file if required by the output "
            "format (only for FASTA or GFF). "
            "If the --output file is the same as the --add_input file, "
            "the original FASTA/GFF file will be overwritten with the "
            "annotated version of the file. It is recommended to select a "
            "different --output file to keep the original FASTA/GFF file unchanged."
        ),
        metavar="\b",
    )
    parser.add_argument(
        "-ids",
        "--identifiers",
        type=str,
        help=(
            "Optional file containing sequence identifiers for annotation/"
            "extraction in/from a FASTA/GFF file. "
            "If not provided, all FASTA/GFF entries will be considered. "
            "The file should contain one identifier per line. "
            "In FASTA format, the identifiers are matched with sequence IDs at "
            "the beginning of the fasta headers. In GFF format, existence of "
            "given identifiers is checked in the 9th attributes column."
        ),
        metavar="\b",
    )
    parser.add_argument(
        "-f",
        "--feature",
        type=str,
        default="gene",
        help=(
            "Specify the feature type for GFF annotation/extraction. "
            "Only sequences with this feature type are considered "
            "(default: gene)."
        ),
        metavar="\b",
    )
    return parser


def _read_identifiers(identifiers_file):
    """
    Read sequence identifiers (one per line) from a file.

    Blank lines are skipped so that an empty string never ends up in the
    identifier set.

    :param identifiers_file: Path of the identifiers file as given on the
        command line, or a falsy value if the option was not used.
    :return: A set of stripped identifiers, or None if no file was given.
    :raises ValueError: If the given path is not an existing file.
    """
    if not identifiers_file:
        return None
    if not Path(identifiers_file).is_file():
        raise ValueError(f"Identifiers file not found at {identifiers_file}.")
    with open(identifiers_file, "r") as infile:
        stripped_lines = (line.strip() for line in infile)
        return {identifier for identifier in stripped_lines if identifier}


def main() -> None:
    """
    The main function of the program including argument parsing.

    Parses the command line, loads the annotation from the input file in
    the given (or inferred) input format and writes it in the requested
    output format to the output file, or prints it to the console when no
    output file is given.

    :raises ValueError: If the input or output format is missing and cannot
        be inferred from the file extension, if no output format is given
        while printing to the console, if FASTA/GFF output is requested
        without --add_input, or if the identifiers file does not exist.
    """
    # argument parsing and pre-processing
    args = _build_parser().parse_args()

    # Check the NCBI Taxonomy Database
    check_NCBI_taxDB()

    # Installed only after the database check, as in the original order.
    warnings.filterwarnings("ignore")

    input_path = Path(args.input)
    input_format = args.input_format or infer_format_from_extension(input_path)
    if not input_format:
        raise ValueError(
            "Format couldn't be inferred from the input file extension. "
            "Please provide the input format."
        )

    output_format = args.output_format
    if not args.output:
        # Printing to the console: there is no extension to infer from.
        output_path = None
        if not output_format:
            raise ValueError(
                "Output format is required if the output is printed to the "
                "console (empty output parameter)."
            )
    else:
        output_path = Path(args.output)
        if not output_format:
            output_format = infer_format_from_extension(output_path)
        if not output_format:
            raise ValueError(
                "Format couldn't be inferred from the output file extension. "
                "Please provide the output format."
            )

    if output_format in ("fasta", "gff") and not args.add_input:
        raise ValueError(
            f"Please provide an additional input file (--add_input parameter) for "
            f"conversion to {output_format} format."
        )

    # process identifiers
    identifiers = _read_identifiers(args.identifiers)

    # load the model
    if input_format == "dngf":
        dnga_model = load_from_json(input_path)
    elif input_format == "pickle":
        dnga_model = load_from_pickle(input_path)
    elif input_format == "shortstr":
        # Only the first line of the file is decoded.
        with open(input_path, "r") as infile:
            short_str = infile.readline().strip()
        dnga_model = decode_short_str(short_str)
    elif input_format == "fasta":
        dnga_model = load_from_fasta(input_path, identifiers)
    elif input_format == "gff":
        dnga_model = load_from_gff(input_path, args.feature, identifiers)

    # convert the model to output format
    to_file = output_path is not None
    if output_format == "dngf":
        if to_file:
            convert_to_json(dnga_model, output_path)
        else:
            print(convert_to_json(dnga_model))
    elif output_format == "pickle":
        if to_file:
            convert_to_pickle(dnga_model, output_path)
        else:
            print(convert_to_pickle(dnga_model))
    elif output_format == "fasta":
        fasta_path = Path(args.add_input)
        if to_file:
            annotate_fasta(dnga_model, fasta_path, output_path, identifiers)
        else:
            # Keep the output slot empty explicitly so that the identifiers
            # are not passed in the position of the output path.
            print(annotate_fasta(dnga_model, fasta_path, None, identifiers))
    elif output_format == "gff":
        gff_path = Path(args.add_input)
        if to_file:
            annotate_gff(
                dnga_model,
                gff_path,
                output_path,
                args.feature,
                identifiers,
            )
        else:
            print(annotate_gff(dnga_model, gff_path, None, args.feature, identifiers))
    elif output_format == "shortstr":
        if to_file:
            with open(output_path, "w") as outfile:
                outfile.write(encode_short_str(dnga_model))
        else:
            print(encode_short_str(dnga_model))
# Script entry point: run the converter CLI only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()