Coverage for denofo/converter/converter_cli.py: 83%

75 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-04-09 15:27 +0200

1import argparse 

2import warnings 

3from pathlib import Path 

4from denofo.utils.ncbiTaxDBcheck import check_NCBI_taxDB 

5from denofo.utils.helpers import infer_format_from_extension 

6from denofo.converter.convert import ( 

7 convert_to_pickle, 

8 load_from_pickle, 

9 convert_to_json, 

10 load_from_json, 

11 load_from_fasta, 

12 annotate_fasta, 

13 load_from_gff, 

14 annotate_gff, 

15 decode_short_str, 

16 encode_short_str, 

17) 

18 

19 

def _build_parser():
    """
    Build the argument parser for the DeNoFo converter command line.

    :return: parser configured with the input/output file and format options,
        the additional input file, the identifiers file and the GFF feature.
    :rtype: argparse.ArgumentParser
    """
    formats = ["dngf", "pickle", "fasta", "shortstr", "gff"]

    parser = argparse.ArgumentParser(
        description=("Convert a DeNoFo annotation to a different file format.")
    )
    # NOTE: every option uses a backspace character as metavar, presumably to
    # suppress the placeholder name after the option in the help output.
    parser.add_argument(
        "-i",
        "--input",
        type=str,
        required=True,
        help=(
            "The path and name of the input file. "
            "This file has to contain the DeNoFo annotation to be converted "
            "to another file format. "
            "If you want to annotate sequences in a FASTA/GFF file, please "
            "provide the FASTA/GFF file to annotate as --add_input parameter."
        ),
        metavar="\b",
    )
    parser.add_argument(
        "-if",
        "--input_format",
        type=str,
        choices=formats,
        default="",  # infer from file extension
        help=(
            "The format of the input file. If not provided, the format is "
            "inferred based on the file extension."
        ),
        metavar="\b",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        default="",  # stdout
        help=(
            "Select the destination for the converted file "
            "(optional). If no output file is specified, the output will be "
            "printed to the console. "
            "If you want to annotate sequences in a FASTA/GFF file, please "
            "select the FASTA/GFF file to annotate as --add_input parameter "
            "and either another file location as --output to keep "
            "the original FASTA/GFF file unchanged (recommended) or select "
            "the same file as --output, which overwrites the original "
            "FASTA/GFF file."
        ),
        metavar="\b",
    )
    parser.add_argument(
        "-of",
        "--output_format",
        type=str,
        choices=formats,
        help=(
            "The format of the output file. If not provided, the format is "
            "inferred based on the file extension. output_format is required if "
            "--output is empty (i.e. output is printed to console) or output "
            "file has no known file extension."
        ),
        metavar="\b",
    )
    parser.add_argument(
        "-a",
        "--add_input",
        type=str,
        help=(
            "Select an additional input file if required by the output "
            "format (only for FASTA or GFF). "
            "If the --output file is the same as the --add_input file, "
            "the original FASTA/GFF file will be overwritten with the "
            "annotated version of the file. It is recommended to select a "
            "different --output file to keep the original FASTA/GFF file unchanged."
        ),
        metavar="\b",
    )
    parser.add_argument(
        "-ids",
        "--identifiers",
        type=str,
        help=(
            "Optional file containing sequence identifiers for annotation/"
            "extraction in/from a FASTA/GFF file. "
            "If not provided, all FASTA/GFF entries will be considered. "
            "The file should contain one identifier per line. "
            "In FASTA format, the identifiers are matched with sequence IDs at "
            "the beginning of the fasta headers. In GFF format, existence of "
            "given identifiers is checked in the 9th attributes column."
        ),
        metavar="\b",
    )
    parser.add_argument(
        "-f",
        "--feature",
        type=str,
        default="gene",
        help=(
            "Specify the feature type for GFF annotation/extraction. "
            "Only sequences with this feature type are considered "
            "(default: gene)."
        ),
        metavar="\b",
    )
    return parser


def _read_identifiers(identifiers_file):
    """
    Read sequence identifiers from a file with one identifier per line.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped, so an empty string never ends up in the identifier set.

    :param identifiers_file: path of the identifiers file.
    :type identifiers_file: str
    :return: the distinct identifiers found in the file.
    :rtype: set
    :raises ValueError: if the path does not point to an existing file.
    """
    if not Path(identifiers_file).is_file():
        raise ValueError(f"Identifiers file not found at {identifiers_file}.")

    with open(identifiers_file, "r") as infile:
        stripped_lines = (line.strip() for line in infile)
        return {identifier for identifier in stripped_lines if identifier}


def main():
    """
    The main function of the program including argument parsing.

    Parses the command line, loads the annotation model from the input file
    in the given (or inferred) input format and converts it to the requested
    output format. The result is written to the output file if one was
    given and printed to the console otherwise.

    :raises ValueError: if the input or output format is missing and cannot
        be inferred from the file extension, if no output format is given
        while printing to the console, if FASTA/GFF output is requested
        without an additional input file, if the identifiers file does not
        exist, or if the input or output format is not supported.
    """
    parser = _build_parser()

    # argument parsing and pre-processing
    args = parser.parse_args()

    # Check the NCBI Taxonomy Database
    check_NCBI_taxDB()

    warnings.filterwarnings("ignore")

    if not args.input_format:
        args.input_format = infer_format_from_extension(Path(args.input))
        if not args.input_format:
            raise ValueError(
                "Format couldn't be inferred from the input file extension. "
                "Please provide the input format."
            )
    input_path = Path(args.input)

    # An empty --output means "print to the console". Keep that case as None
    # instead of building a Path, because Path("") is "." and would be truthy.
    if args.output:
        if not args.output_format:
            args.output_format = infer_format_from_extension(Path(args.output))
            if not args.output_format:
                raise ValueError(
                    "Format couldn't be inferred from the output file extension. "
                    "Please provide the output format."
                )
        output_path = Path(args.output)
    else:
        if not args.output_format:
            raise ValueError(
                "Output format is required if the output is printed to the "
                "console (empty output parameter)."
            )
        output_path = None

    if args.output_format in ("fasta", "gff") and not args.add_input:
        raise ValueError(
            f"Please provide an additional input file (--add_input parameter) for "
            f"conversion to {args.output_format} format."
        )

    # process identifiers
    identifiers = _read_identifiers(args.identifiers) if args.identifiers else None

    # load the model
    if args.input_format == "dngf":
        dnga_model = load_from_json(input_path)
    elif args.input_format == "pickle":
        dnga_model = load_from_pickle(input_path)
    elif args.input_format == "shortstr":
        # only the first line of the file is decoded
        with open(input_path, "r") as infile:
            short_str = infile.readline().strip()
        dnga_model = decode_short_str(short_str)
    elif args.input_format == "fasta":
        dnga_model = load_from_fasta(input_path, identifiers)
    elif args.input_format == "gff":
        dnga_model = load_from_gff(input_path, args.feature, identifiers)
    else:
        # Fail with a clear message instead of hitting an unbound dnga_model.
        raise ValueError(f"Unsupported input format: {args.input_format!r}.")

    # convert the model to output format
    if args.output_format == "dngf":
        if output_path is not None:
            convert_to_json(dnga_model, output_path)
        else:
            print(convert_to_json(dnga_model))
    elif args.output_format == "pickle":
        if output_path is not None:
            convert_to_pickle(dnga_model, output_path)
        else:
            print(convert_to_pickle(dnga_model))
    elif args.output_format == "fasta":
        if output_path is not None:
            annotate_fasta(
                dnga_model, Path(args.add_input), output_path, identifiers
            )
        else:
            # NOTE(review): the output slot is passed explicitly as None so
            # that identifiers stays in the same position as in the
            # file-output call above and in the annotate_gff call below;
            # confirm against the annotate_fasta signature.
            print(annotate_fasta(dnga_model, Path(args.add_input), None, identifiers))
    elif args.output_format == "gff":
        if output_path is not None:
            annotate_gff(
                dnga_model,
                Path(args.add_input),
                output_path,
                args.feature,
                identifiers,
            )
        else:
            print(
                annotate_gff(
                    dnga_model, Path(args.add_input), None, args.feature, identifiers
                )
            )
    elif args.output_format == "shortstr":
        if output_path is not None:
            with open(output_path, "w") as outfile:
                outfile.write(encode_short_str(dnga_model))
        else:
            print(encode_short_str(dnga_model))
    else:
        # Previously an unhandled format produced no output and no error.
        raise ValueError(f"Unsupported output format: {args.output_format!r}.")

230 

231 

if __name__ == "__main__":
    # Script entry point: run the converter CLI when executed directly.
    main()