Coverage for src/topsim_cli.py: 0%

26 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-04 17:47 -0700

1#! /usr/bin/env python3 

2 

3""" 

4Usage: 

5 topsim-cli <query> [options] [<file>] 

6 

7 topsim-cli --help 

8 

9 

10Options: 

11 -I Case-sensitive matching. 

12 -k <k> Maximum number of search results. [default: 1] 

13 --tie Include all the results with the same similarity of the "k"-th result. May return more than "k" results. 

14 

15 -s, --search Search the query within each line rather than against the whole line, by preferring partial matching of the line. 

16 Tversky similarity is used instead of Jaccard similarity. 

17 -e <e> Parameter for Tversky similarity. [default: 0.001] 

18 

19 --mapping=<mapping> Map each string to a set of either "gram"s or "word"s. [default: gram] 

20 --numgrams=<numgrams> Number of characters for each gram when mapping by "gram". [default: 2] 

21 

22 --quiet Do not print additional information to standard error. 

23""" # noqa: E501 

24 

25import os 

26import sys 

27from functools import partial 

28from typing import cast 

29 

30from docopt import ParsedOptions, docopt 

31from extratools_core.debug import peakmem, stopwatch 

32 

33from topsim import TopSim 

34from topsim.localtyping import Output 

35 

# Parse the command line against the usage text held in the module docstring.
argv: ParsedOptions = docopt(cast("str", __doc__))

# Diagnostics go to standard error unless --quiet was given, in which case
# they are written to the null device and therefore discarded.
if argv["--quiet"]:
    _diagnostic_stream = open(os.devnull, 'w', encoding='utf-8')
else:
    _diagnostic_stream = sys.stderr

# print() variant used for all progress / resource messages, keeping them
# separate from the search results printed on standard output.
print2 = partial(print, file=_diagnostic_stream)

45 

46 

def print_resource_usage() -> None:
    """Print the stopwatch reading and the peak-memory figure via ``print2``.

    The stopwatch value is multiplied by 1000 and labelled "ms"; the
    peak-memory value is divided by 1024 three times and labelled "KB".
    Both are shown with two decimal places.
    """
    elapsed_ms = stopwatch() * 1_000
    # NOTE(review): three divisions by 1024 do not obviously match the "KB"
    # label below -- confirm the unit returned by peakmem() and align either
    # the divisor or the label.
    scaled_peak = peakmem() / 1024 / 1024 / 1024
    print2(f"{elapsed_ms:.2f} ms | {scaled_peak:.2f} KB")

49 

50 

# First stopwatch call of the script; presumably this starts the timer whose
# reading print_resource_usage() reports later -- confirm against
# extratools_core.debug.
stopwatch()

# Load the candidate strings, one per input line, with the trailing line
# terminator characters ('\r' / '\n') stripped. Input comes from <file> when
# it was given on the command line, otherwise from standard input.
if argv["<file>"]:
    # Use a context manager so the file handle is closed as soon as the lines
    # are read, instead of being left open for the rest of the run.
    # NOTE(review): the file is still decoded with the platform default
    # encoding, exactly as before; consider an explicit encoding.
    with open(argv["<file>"]) as input_file:
        s_raw_strs = [line.rstrip('\r\n') for line in input_file]
else:
    # sys.stdin is owned by the interpreter, so it is deliberately not closed.
    s_raw_strs = [line.rstrip('\r\n') for line in sys.stdin]

58 

# Announce the indexing phase; end=" " keeps the resource-usage figures
# printed afterwards on the same line.
print2("Indexing...", end=" ")

# Options for the index, taken from the -I, --mapping and --numgrams
# command-line options respectively.
index_options = {
    "case_sensitive": argv["-I"],
    "mapping": argv["--mapping"],
    "num_grams": int(argv["--numgrams"]),
}
ts = TopSim(s_raw_strs, **index_options)

print_resource_usage()

69 

# Announce the search phase; end=" " keeps the resource-usage figures
# printed afterwards on the same line.
print2("Searching...", end=" ")

query = argv["<query>"]
max_results = int(argv["-k"])
include_ties = argv["--tie"]
# -s/--search selects Tversky similarity; otherwise Jaccard is used.
similarity_name = "tversky" if argv["--search"] else "jaccard"
tversky_e = float(argv["-e"])

r_best: Output = ts.search(
    query,
    k=max_results,
    tie=include_ties,
    sim_func=similarity_name,
    e=tversky_e,
)

print_resource_usage()

# Emit an empty diagnostic line after the progress messages.
print2()

83 

# Each search result pairs a similarity with the indices of the input lines
# that achieved it. Print every matching line followed by a tab and the
# similarity formatted with the ".4" format spec.
for similarity, line_indices in r_best:
    for line_index in line_indices:
        print(s_raw_strs[line_index], f"{similarity:.4}", sep="\t")

87 

88 

def run():
    """Do nothing.

    Placeholder function for the pyproject.toml requirement of scripts.
    """