Coverage for src/topsim_cli.py: 0%
26 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-04 17:47 -0700
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-04 17:47 -0700
1#! /usr/bin/env python3
3"""
4Usage:
5 topsim-cli <query> [options] [<file>]
7 topsim-cli --help
10Options:
11 -I Case-sensitive matching.
12 -k <k> Maximum number of search results. [default: 1]
13 --tie Include all the results with the same similarity of the "k"-th result. May return more than "k" results.
15 -s, --search Search the query within each line rather than against the whole line, by preferring partial matching of the line.
16 Tversky similarity is used instead of Jaccard similarity.
17 -e <e> Parameter for Tversky similarity. [default: 0.001]
19 --mapping=<mapping> Map each string to a set of either "gram"s or "word"s. [default: gram]
20 --numgrams=<numgrams> Number of characters for each gram when mapping by "gram". [default: 2]
22 --quiet Do not print additional information to standard error.
23""" # noqa: E501
25import os
26import sys
27from functools import partial
28from typing import cast
30from docopt import ParsedOptions, docopt
31from extratools_core.debug import peakmem, stopwatch
33from topsim import TopSim
34from topsim.localtyping import Output
# Parse the command line against the usage text held in the module docstring.
argv: ParsedOptions = docopt(cast("str", __doc__))

# print2 is print() bound to a diagnostics stream: standard error normally,
# or the null device when --quiet is set so the messages are discarded.
print2 = partial(
    print,
    file=(
        sys.stderr if not argv["--quiet"]
        else open(os.devnull, 'w', encoding='utf-8')
    ),
)
def print_resource_usage() -> None:
    """Report timing and peak memory on a single line through ``print2``.

    The time is ``stopwatch()`` multiplied by 1,000 and labelled "ms"; the
    memory figure is ``peakmem()`` divided by 1024 three times and labelled
    "KB". Both are printed with two decimal places.
    """
    # NOTE(review): dividing by 1024 three times while labelling the result
    # "KB" looks inconsistent -- confirm the unit peakmem() returns and then
    # correct either the divisor or the label.
    elapsed_ms = stopwatch() * 1_000
    peak_scaled = peakmem() / 1024 / 1024 / 1024
    print2(f"{elapsed_ms:.2f} ms | {peak_scaled:.2f} KB")
# First stopwatch() call of the script; the later calls are made inside
# print_resource_usage().
# NOTE(review): presumably this starts the timer -- confirm against
# extratools_core.debug.
stopwatch()


# Load every input line with its trailing CR/LF characters removed.
if argv["<file>"]:
    # Use a context manager so the input file is closed as soon as it has
    # been read, rather than whenever the garbage collector gets to it.
    with open(argv["<file>"]) as input_file:
        s_raw_strs = [line.rstrip('\r\n') for line in input_file]
else:
    # Standard input is not owned by this script, so it is left open.
    s_raw_strs = [line.rstrip('\r\n') for line in sys.stdin]

print2("Indexing...", end=" ")

# Build the index over the raw lines using the command-line options.
ts = TopSim(
    s_raw_strs,
    case_sensitive=argv["-I"],
    mapping=argv["--mapping"],
    num_grams=int(argv["--numgrams"]),
)

print_resource_usage()

print2("Searching...", end=" ")

# --search selects Tversky similarity; otherwise Jaccard similarity is used.
r_best: Output = ts.search(
    argv["<query>"],
    k=int(argv["-k"]),
    tie=argv["--tie"],
    sim_func="tversky" if argv["--search"] else "jaccard",
    e=float(argv["-e"]),
)

print_resource_usage()

# Blank diagnostics line separating the progress output from the results.
print2()

# Each result pairs a similarity with indexes into s_raw_strs; print one
# tab-separated "<line>\t<similarity>" row per matching line.
for sim, lns in r_best:
    for ln in lns:
        print(f"{s_raw_strs[ln]}\t{sim:.4}")
89# Placeholder function for pyproject.toml requirement of scripts
def run() -> None:
    """Do nothing; placeholder entry point for the pyproject.toml scripts table.

    The statements at module top level do the command-line work when the
    module is imported, so this function has no work of its own.
    """