Coverage for src/topsim/best.py: 18%
38 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-04 17:54 -0700
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-04 17:54 -0700
1from collections import defaultdict
2from collections.abc import Callable
3from heapq import heappop, heappush
5from extratools_core.set import add_to_set
7from .localtyping import Index, Output, StringSet
8from .setsimilarity import check_sim
def find_best(
    r_str: StringSet,
    s_strs: list[StringSet],
    s_index: Index,
    *,
    k: int,
    tie: bool,
    upbound_func: Callable[[int, int, int, int, int], float],
) -> Output:
    """Collect the ``k`` best-scoring candidates for ``r_str``.

    Candidates are discovered by looking up each item of ``r_str`` in
    ``s_index`` and are scored with ``check_sim``.  Results are grouped by
    similarity value so that ties can be kept or trimmed as a unit.

    Args:
        r_str: The query; it is enumerated item by item and its length is
            passed to ``upbound_func``.
        s_strs: Candidate collection, indexed by the ids found in
            ``s_index``.
        s_index: Maps an item to an iterable of ``(ln, p)`` pairs, where
            ``ln`` indexes ``s_strs`` and ``p[0]`` is forwarded (plus one)
            to ``check_sim`` -- presumably the item's position inside that
            candidate; confirm against the index builder.
        k: Number of results wanted.  A non-positive value yields an empty
            result.
        tie: When true, every candidate sharing the lowest kept similarity
            is retained even if that exceeds ``k``; when false the lowest
            group is truncated so that exactly ``k`` candidates remain.
        upbound_func: Returns an upper bound on achievable similarity; used
            here to stop scanning once the bound drops below the current
            worst kept similarity.

    Returns:
        A list of ``(similarity, [candidate ids])`` pairs sorted by
        similarity, highest first.
    """
    # Nothing can be kept for k <= 0.  Without this guard the first accepted
    # candidate is evicted immediately, leaving the heap empty, and the
    # ``sim_heap[0]`` read below raises IndexError.
    if k <= 0:
        return []

    worst_sim: float = 0.0
    total_num: int = 0

    # Min-heap of the distinct similarity values currently kept; the
    # matching candidate ids live in ``sim_map``.
    sim_heap: list[float] = []
    sim_map: dict[float, list[int]] = defaultdict(list)

    # Candidate ids that have already been scored.
    ln_set: set[int] = set()
    r_len: int = len(r_str)
    for i, item in enumerate(r_str):
        # Stop as soon as the bound for this position falls below the
        # current worst kept similarity.
        if upbound_func(r_len, i + 1, r_len - (i + 1) + 1, 1, 1) < worst_sim:
            break

        for ln, p in s_index[item]:
            # Skip candidates seen before (add_to_set is expected to return
            # a falsy value when ``ln`` was already present).
            if not add_to_set(ln_set, ln):
                continue

            curr_sim: float | None = check_sim(
                worst_sim,
                upbound_func,
                r_str,
                s_strs[ln],
                i + 1,
                p[0] + 1,
                1,
            )
            if curr_sim is None:
                continue

            # Each distinct similarity value is pushed onto the heap once.
            if curr_sim not in sim_map:
                heappush(sim_heap, curr_sim)
            sim_map[curr_sim].append(ln)
            total_num += 1

            if total_num > k:
                curr_worst_sim: float = sim_heap[0]
                curr_worst_num: int = len(sim_map[curr_worst_sim])

                if total_num - curr_worst_num >= k:
                    # The whole lowest group is surplus: drop it.
                    del sim_map[curr_worst_sim]
                    total_num -= curr_worst_num
                    heappop(sim_heap)
                elif not tie:
                    # Trim only the overflow from the lowest group so that
                    # exactly k candidates remain.
                    del sim_map[curr_worst_sim][curr_worst_num - (total_num - k):]
                    total_num = k

            # Once k results are held, the smallest kept similarity becomes
            # the threshold for further candidates.
            if total_num >= k:
                worst_sim = sim_heap[0]

    return sorted(sim_map.items(), key=lambda x: x[0], reverse=True)