Coverage for src/topsim/setsimilarity.py: 28%

32 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-04 17:54 -0700

1from collections.abc import Callable 

2 

3from extratools_core.math import safediv 

4 

5from .localtyping import StringSet 

6 

# Number of loop iterations check_sim performs between re-evaluations of
# upbound_func inside its scan loop (used there as `step % __numStepForBound`).
__numStepForBound = 10

8 

9 

def check_sim(
    bound: float,
    upbound_func: Callable[[int, int, int, int, int], float],
    a: StringSet,
    b: StringSet,
    i: int = 0,
    j: int = 0,
    count: int = 0,
) -> float | None:
    """Scan ``a`` and ``b`` in lockstep and return their score, or ``None``.

    Starting at positions ``i`` (in ``a``) and ``j`` (in ``b``) with ``count``
    matches already found, the elements ``a[i]`` and ``b[j]`` are compared:
    the side holding the smaller element advances, and when neither is smaller
    both advance and ``count`` grows by one.
    NOTE(review): this presumably relies on both inputs being sorted in
    ascending order -- confirm against the callers.

    ``upbound_func(len(a), i, len(b), j, count)`` is evaluated before the
    scan, every ``__numStepForBound`` iterations during it, and once after
    it. Whenever ``bound`` is greater than the value it returns, the scan is
    abandoned.

    Args:
        bound: Threshold compared against the values from ``upbound_func``.
        upbound_func: Callable taking
            ``(a_len, a_passed, b_len, b_passed, count)`` and returning a
            float.
        a: First indexable, sized collection.
        b: Second indexable, sized collection.
        i: Start position in ``a``.
        j: Start position in ``b``.
        count: Number of matches already counted.

    Returns:
        The value of ``upbound_func`` after the scan has finished, or ``None``
        if ``bound`` exceeded any evaluated value (including the last one).
    """
    len_a, len_b = len(a), len(b)

    # Reject up front when even the initial bound is below the threshold.
    if upbound_func(len_a, i, len_b, j, count) < bound:
        return None

    steps_taken = 0
    while i < len_a and j < len_b:
        left, right = a[i], b[j]

        if left < right:
            i += 1
        elif left > right:
            j += 1
        else:
            # Neither side is smaller: count a match and move both cursors.
            i, j, count = i + 1, j + 1, count + 1

        steps_taken += 1
        if steps_taken % __numStepForBound:
            # Not yet at a multiple of the check interval.
            continue
        if upbound_func(len_a, i, len_b, j, count) < bound:
            return None

    final_value = upbound_func(len_a, i, len_b, j, count)
    if final_value < bound:
        return None
    return final_value

44 

45 

def overlap_upbound(a_len: int, a_passed: int, b_len: int, b_passed: int, count: int) -> float:
    """Return ``count`` plus the smaller of the two unscanned remainders.

    Args:
        a_len: Total length of the first collection.
        a_passed: Number of its elements already passed.
        b_len: Total length of the second collection.
        b_passed: Number of its elements already passed.
        count: Matches counted so far.

    Returns:
        ``count + min(a_len - a_passed, b_len - b_passed)``.
    """
    a_remaining = a_len - a_passed
    b_remaining = b_len - b_passed
    # At most one further match per element left on the shorter remaining side.
    return count + min(a_remaining, b_remaining)

48 

49 

def jaccard_upbound(a_len: int, a_passed: int, b_len: int, b_passed: int, count: int) -> float:
    """Return a Jaccard-style upper bound for the current scan state.

    The largest match count still reachable is taken from
    ``overlap_upbound``; the result is that count divided by
    ``a_len + b_len - max_count``, capped at ``1.0``.

    Args:
        a_len: Total length of the first collection.
        a_passed: Number of its elements already passed.
        b_len: Total length of the second collection.
        b_passed: Number of its elements already passed.
        count: Matches counted so far.

    Returns:
        ``min(1.0, safediv(max_count, a_len + b_len - max_count))``.
        NOTE(review): the result for a zero denominator is whatever
        ``safediv`` yields -- confirm in ``extratools_core.math``.
    """
    # Forward the arguments explicitly. The previous `**locals()` call only
    # worked while the parameters were the sole local names, so introducing
    # any local above the call would have raised a TypeError.
    max_count = overlap_upbound(a_len, a_passed, b_len, b_passed, count)
    return min(
        1.0,
        safediv(max_count, a_len + b_len - max_count),
    )

56 

57 

# Weight applied to b_len in the denominator of tversky_upbound; a_len is
# weighted by (1 - e).
# Assume all sets are shorter than 1 / e
e = 1 / 1000

60 

61 

def tversky_upbound(a_len: int, a_passed: int, b_len: int, b_passed: int, count: int) -> float:
    """Return a Tversky-style upper bound for the current scan state.

    The largest match count still reachable (from ``overlap_upbound``) is
    divided by ``(1 - e) * a_len + e * b_len``, where ``e`` is the
    module-level weight, and the result is capped at ``1.0``.

    Args:
        a_len: Total length of the first collection.
        a_passed: Number of its elements already passed.
        b_len: Total length of the second collection.
        b_passed: Number of its elements already passed.
        count: Matches counted so far.

    Returns:
        ``min(1.0, safediv(max_count, (1 - e) * a_len + e * b_len))``.
        NOTE(review): the result for a zero denominator is whatever
        ``safediv`` yields -- confirm in ``extratools_core.math``.
    """
    # Forward the arguments explicitly instead of `**locals()`, which depends
    # on the function never gaining another local name before the call.
    max_count = overlap_upbound(a_len, a_passed, b_len, b_passed, count)
    weighted_len = (1 - e) * a_len + e * b_len
    return min(
        1.0,
        safediv(max_count, weighted_len),
    )

69 )