Coverage for src/topsim/setsimilarity.py: 28%
32 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-04 17:54 -0700
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-04 17:54 -0700
1from collections.abc import Callable
3from extratools_core.math import safediv
5from .localtyping import StringSet
__numStepForBound = 10


def check_sim(
    bound: float,
    upbound_func: Callable[[int, int, int, int, int], float],
    a: StringSet,
    b: StringSet,
    i: int = 0,
    j: int = 0,
    count: int = 0,
) -> float | None:
    """Walk ``a`` and ``b`` in lockstep and return their similarity, or ``None``.

    The two inputs are scanned with a merge-style walk: the cursor on the
    smaller current element advances, and when both elements are equal both
    cursors advance and the match counter grows. This only finds every common
    element if both inputs are sorted ascending -- assumed here, confirm
    against the definition of ``StringSet``.

    Args:
        bound: Minimum acceptable similarity.
        upbound_func: Called as ``upbound_func(len(a), i, len(b), j, count)``;
            its result is compared against ``bound`` during the walk and is
            returned as the final similarity once the walk ends.
        a: First indexable collection.
        b: Second indexable collection.
        i: Starting cursor into ``a``.
        j: Starting cursor into ``b``.
        count: Number of matches already found before the starting cursors.

    Returns:
        The value of ``upbound_func`` once one input is exhausted, or ``None``
        as soon as ``bound`` exceeds the value of ``upbound_func``.
    """
    size_a = len(a)
    size_b = len(b)

    # Reject before looking at any element if the bound is already out of reach.
    if bound > upbound_func(size_a, i, size_b, j, count):
        return None

    steps_taken = 0
    while i < size_a and j < size_b:
        left = a[i]
        right = b[j]

        if left < right:
            i += 1
        elif left > right:
            j += 1
        else:
            # Common element: consume it from both sides.
            i += 1
            j += 1
            count += 1

        steps_taken += 1
        if steps_taken % __numStepForBound:
            # Only re-evaluate the bound every __numStepForBound steps.
            continue
        if bound > upbound_func(size_a, i, size_b, j, count):
            return None

    similarity = upbound_func(size_a, i, size_b, j, count)
    if bound > similarity:
        return None
    return similarity
def overlap_upbound(a_len: int, a_passed: int, b_len: int, b_passed: int, count: int) -> float:
    """Return the largest overlap still reachable from the current position.

    Args:
        a_len: Total length of the first collection.
        a_passed: Number of its elements already consumed.
        b_len: Total length of the second collection.
        b_passed: Number of its elements already consumed.
        count: Matches found so far.

    Returns:
        ``count`` plus the smaller of the two remaining lengths, since each
        further match needs one unconsumed element from each side.
    """
    a_remaining = a_len - a_passed
    b_remaining = b_len - b_passed
    return count + min(a_remaining, b_remaining)
def jaccard_upbound(a_len: int, a_passed: int, b_len: int, b_passed: int, count: int) -> float:
    """Return an upper bound on the Jaccard similarity, capped at 1.0.

    The best-case overlap from ``overlap_upbound`` is divided by the matching
    union size ``a_len + b_len - overlap``.

    Args:
        a_len: Total length of the first collection.
        a_passed: Number of its elements already consumed.
        b_len: Total length of the second collection.
        b_passed: Number of its elements already consumed.
        count: Matches found so far.

    Returns:
        ``min(1.0, safediv(overlap, union))``. The result for a zero union
        (both collections empty) is whatever ``safediv`` yields -- not visible
        from this file, confirm against ``extratools_core.math``.
    """
    # Arguments are passed explicitly: forwarding ``**locals()`` breaks as soon
    # as any local is bound before the call or a parameter is renamed.
    max_count = overlap_upbound(a_len, a_passed, b_len, b_passed, count)
    union_size = a_len + b_len - max_count
    return min(1.0, safediv(max_count, union_size))
# Weight applied to b's length in tversky_upbound's denominator; a's length
# gets the complementary weight (1 - e).
# Assume all sets are shorter than 1 / e
e = 1 / 1000


def tversky_upbound(a_len: int, a_passed: int, b_len: int, b_passed: int, count: int) -> float:
    """Return an upper bound on the asymmetric (Tversky-style) similarity, capped at 1.0.

    The best-case overlap from ``overlap_upbound`` is divided by the weighted
    length ``(1 - e) * a_len + e * b_len``, where ``e`` is the module-level
    weight.

    Args:
        a_len: Total length of the first collection.
        a_passed: Number of its elements already consumed.
        b_len: Total length of the second collection.
        b_passed: Number of its elements already consumed.
        count: Matches found so far.

    Returns:
        ``min(1.0, safediv(overlap, weighted_length))``. The result for a zero
        denominator is whatever ``safediv`` yields -- not visible from this
        file, confirm against ``extratools_core.math``.
    """
    # Arguments are passed explicitly: forwarding ``**locals()`` breaks as soon
    # as any local is bound before the call or a parameter is renamed.
    max_count = overlap_upbound(a_len, a_passed, b_len, b_passed, count)
    weighted_len = (1 - e) * a_len + e * b_len
    return min(1.0, safediv(max_count, weighted_len))