Coverage for src/topsim/topsim.py: 54%

28 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-04 16:06 -0700

1import re 

2from collections.abc import Iterable 

3from functools import partial 

4from re import Pattern 

5 

6from extratools_core.dict import inverted_index 

7from extratools_core.str import str_to_grams 

8 

9from . import setsimilarity 

10from .best import find_best 

11from .grammap import apply_gram_map, create_gram_map, update_gram_map 

12from .localtyping import Output, StringSet 

13 

# Word separator: one or more characters that are either an underscore or a
# non-word character. ``str`` patterns are Unicode-aware by default in
# Python 3, so no explicit flag is needed.
re_split: Pattern[str] = re.compile(r"[_\W]+")


def str_to_words(s: str) -> Iterable[str]:
    """Lazily yield the non-empty words of ``s``.

    ``s`` is split on ``re_split``; the empty strings that ``split`` produces
    when ``s`` is empty or starts/ends with a separator are dropped.
    """
    return (word for word in re_split.split(s) if word)

19 

20 

class TopSim:
    """Similarity search over a fixed collection of strings.

    Every indexed string is tokenized (character n-grams or words), the
    tokens are translated through a shared gram map, and an inverted index
    is built over the mapped strings. ``search`` tokenizes a query the same
    way and delegates the lookup to ``find_best``.
    """

    def __init__(
        self,
        s_raw_strs: Iterable[str],
        *,
        case_sensitive: bool = False,
        mapping: str = "gram",
        num_grams: int = 2,
    ) -> None:
        """Tokenize and index ``s_raw_strs``.

        Args:
            s_raw_strs: The strings to index.
            case_sensitive: When false, strings are lower-cased before
                tokenization.
            mapping: ``"gram"`` to tokenize with ``str_to_grams`` or
                ``"word"`` to tokenize with ``str_to_words``.
            num_grams: Passed to ``str_to_grams`` as ``n``; only used when
                ``mapping`` is ``"gram"``.

        Raises:
            KeyError: If ``mapping`` is neither ``"gram"`` nor ``"word"``.
        """
        # Resolve the tokenizer once, up front: an unknown mapping fails
        # here (even for an empty corpus) instead of on first use, and the
        # partial is not rebuilt for every string that gets tokenized.
        if mapping == "gram":
            tokenize = partial(str_to_grams, n=num_grams, pad="\0")
        elif mapping == "word":
            tokenize = str_to_words
        else:
            raise KeyError(mapping)

        self._str2set_func = lambda s: tokenize(
            s if case_sensitive else s.lower()
        )

        # Materialize each token stream: it is consumed twice below (once to
        # build the gram map, once to apply it).
        s_raw_str_sets = [
            list(self._str2set_func(s_raw_str))
            for s_raw_str in s_raw_strs
        ]
        self.gramMap = create_gram_map(s_raw_str_sets)

        self.sStrs = [
            apply_gram_map(self.gramMap, s_raw_str_set)
            for s_raw_str_set in s_raw_str_sets
        ]
        self.sIndex = inverted_index(self.sStrs)

    def search(
        self,
        r_raw_str: str,
        *,
        k: int = 1,
        tie: bool = False,
        sim_func: str = "jaccard",
        e: float = 1 / 1000,
    ) -> Output:
        """Search the indexed strings for those most similar to ``r_raw_str``.

        Args:
            r_raw_str: The query string; tokenized exactly like the indexed
                strings.
            k: Forwarded to ``find_best`` (presumably the number of results
                to return -- confirm against ``find_best``).
            tie: Forwarded to ``find_best`` (presumably whether tied results
                are kept -- confirm against ``find_best``).
            sim_func: ``"overlap"``, ``"jaccard"`` or ``"tversky"``; selects
                the matching ``*_upbound`` function from ``setsimilarity``.
            e: Assigned to the module attribute ``setsimilarity.e`` when
                ``sim_func`` is ``"tversky"``; ignored otherwise.

        Returns:
            The result of ``find_best`` for the mapped query.

        Raises:
            KeyError: If ``sim_func`` is not one of the supported names.
        """
        # Validate sim_func before touching any state so that a bad value
        # does not leave the gram map updated by a query that never ran.
        upbound_func = {
            "overlap": setsimilarity.overlap_upbound,
            "jaccard": setsimilarity.jaccard_upbound,
            "tversky": setsimilarity.tversky_upbound,
        }[sim_func]

        r_raw_str_set = list(self._str2set_func(r_raw_str))
        # NOTE: this mutates the shared gram map with the query's tokens.
        update_gram_map(self.gramMap, r_raw_str_set)

        r_str: StringSet = apply_gram_map(self.gramMap, r_raw_str_set)

        if sim_func == "tversky":
            # NOTE(review): module-level state on setsimilarity, so the value
            # is visible to every TopSim instance in the process.
            setsimilarity.e = e

        return find_best(
            r_str,
            self.sStrs,
            self.sIndex,
            k=k,
            tie=tie,
            upbound_func=upbound_func,
        )