Coverage for src/topsim/topsim.py: 54%
28 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-04 16:06 -0700
1import re
2from collections.abc import Iterable
3from functools import partial
4from re import Pattern
6from extratools_core.dict import inverted_index
7from extratools_core.str import str_to_grams
9from . import setsimilarity
10from .best import find_best
11from .grammap import apply_gram_map, create_gram_map, update_gram_map
12from .localtyping import Output, StringSet
# Word separator: one or more underscores or non-word characters. Underscore is
# listed explicitly because ``\W`` alone treats it as a word character.
re_split: Pattern[str] = re.compile(r"[_\W]+", re.UNICODE)


def str_to_words(s: str) -> Iterable[str]:
    """Lazily yield the words of *s*, split on ``re_split``.

    Splitting produces empty strings when *s* is empty or starts/ends with a
    separator; those are dropped, so only non-empty words are yielded.

    Args:
        s: Text to split into words.

    Returns:
        A generator over the non-empty words, in order of appearance.
    """
    return (w for w in re_split.split(s) if w)
class TopSim:
    """Similarity search of a query string against a fixed set of strings.

    Each candidate string is tokenized (into grams or words), translated
    through a gram map, and stored in an inverted index at construction time.
    ``search`` tokenizes the query the same way and delegates to ``find_best``.
    """

    # Supported ``sim_func`` names -> name of the matching upper-bound function
    # in ``setsimilarity``. Kept as names so an unknown ``sim_func`` can be
    # rejected before any other work is done.
    _UPBOUND_FUNC_NAMES = {
        "overlap": "overlap_upbound",
        "jaccard": "jaccard_upbound",
        "tversky": "tversky_upbound",
    }

    def __init__(
        self,
        s_raw_strs: Iterable[str],
        *,
        case_sensitive: bool = False,
        mapping: str = "gram",
        num_grams: int = 2,
    ) -> None:
        """Tokenize and index the candidate strings.

        Args:
            s_raw_strs: Candidate strings to search against.
            case_sensitive: When false (the default), every string is
                lowercased before it is tokenized.
            mapping: ``"gram"`` tokenizes with ``str_to_grams`` using
                ``n=num_grams`` and a NUL character as padding; ``"word"``
                tokenizes with ``str_to_words``.
            num_grams: Gram size handed to ``str_to_grams``; not used when
                ``mapping`` is ``"word"``.

        Raises:
            KeyError: If ``mapping`` is neither ``"gram"`` nor ``"word"``.
        """
        # Resolve the tokenizer once, up front: an unsupported mapping fails
        # here (even when there are no candidate strings) and nothing is
        # rebuilt per tokenized string.
        if mapping == "gram":
            tokenize = partial(str_to_grams, n=num_grams, pad="\0")
        elif mapping == "word":
            tokenize = str_to_words
        else:
            raise KeyError(mapping)

        if case_sensitive:
            self._str2set_func = tokenize
        else:
            self._str2set_func = lambda s: tokenize(s.lower())

        # Materialize the tokens: they are consumed twice below (once to build
        # the gram map, once to translate each candidate through it).
        token_lists = [list(self._str2set_func(raw_str)) for raw_str in s_raw_strs]

        # Gram map built from all candidate tokens; also used (and updated) by
        # search() for query tokens.
        self.gramMap = create_gram_map(token_lists)

        # Candidates after translation through the gram map, in input order.
        self.sStrs = [
            apply_gram_map(self.gramMap, tokens) for tokens in token_lists
        ]

        # Inverted index over the translated candidates.
        self.sIndex = inverted_index(self.sStrs)

    def search(
        self,
        r_raw_str: str,
        *,
        k: int = 1,
        tie: bool = False,
        sim_func: str = "jaccard",
        e: float = 1 / 1000,
    ) -> Output:
        """Find the indexed strings most similar to ``r_raw_str``.

        Args:
            r_raw_str: Query string; tokenized exactly like the candidates.
            k: Forwarded to ``find_best`` (presumably the number of best
                matches to return -- confirm against ``find_best``).
            tie: Forwarded to ``find_best`` (presumably whether ties are
                included -- confirm against ``find_best``).
            sim_func: One of ``"overlap"``, ``"jaccard"`` or ``"tversky"``;
                selects the upper-bound function passed to ``find_best``.
            e: Only used when ``sim_func`` is ``"tversky"``: assigned to the
                module attribute ``setsimilarity.e``.

        Returns:
            The result of ``find_best`` for the query.

        Raises:
            KeyError: If ``sim_func`` is not a supported name.
        """
        # Validate first so an unsupported sim_func fails before the gram map
        # is touched.
        upbound_name = self._UPBOUND_FUNC_NAMES[sim_func]
        upbound_func = getattr(setsimilarity, upbound_name)

        query_tokens = list(self._str2set_func(r_raw_str))
        # The gram map is updated with the query tokens before translating
        # them (presumably so unseen tokens get an entry -- confirm against
        # update_gram_map).
        update_gram_map(self.gramMap, query_tokens)
        r_str: StringSet = apply_gram_map(self.gramMap, query_tokens)

        if sim_func == "tversky":
            # NOTE(review): this writes module-level state in setsimilarity,
            # so the value is shared by every TopSim instance and persists
            # after this call.
            setsimilarity.e = e

        return find_best(
            r_str,
            self.sStrs,
            self.sIndex,
            k=k,
            tie=tie,
            upbound_func=upbound_func,
        )