Coverage for src/extratools_core/seq/subseq.py: 0%

55 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-05 23:54 -0700

1from collections.abc import Callable, Iterable, Sequence 

2from functools import cache 

3from itertools import chain, combinations 

4 

5from . import iter_to_seq 

6 

7 

8def enumerate_subseqs[T](seq: Iterable[T]) -> Iterable[Sequence[T]]: 

9 seq = iter_to_seq(seq) 

10 seq_len: int = len(seq) 

11 

12 for i in range(seq_len): 

13 for j in range(i + 1, seq_len + 1 if i > 0 else seq_len): 

14 yield seq[i:j] 

15 

16 

17def enumerate_subseqs_with_gaps[T](seq: Iterable[T]) -> Iterable[Sequence[T]]: 

18 seq = iter_to_seq(seq) 

19 

20 for i in range(1, len(seq)): 

21 yield from combinations(seq, i) 

22 

23 

24def best_subseq[T]( 

25 a: Iterable[T], 

26 score_func: Callable[[Iterable[T]], float], 

27) -> Sequence[T]: 

28 return max( 

29 chain([[]], enumerate_subseqs(a)), 

30 key=score_func, 

31 ) 

32 

33 

34def best_subseq_with_gaps[T]( 

35 a: Iterable[T], 

36 score_func: Callable[[Iterable[T]], float], 

37) -> Sequence[T]: 

38 return max( 

39 chain([[]], enumerate_subseqs_with_gaps(a)), 

40 key=score_func, 

41 ) 

42 

43 

44def common_subseq[T](a: Iterable[T], b: Iterable[T]) -> Iterable[T]: 

45 @cache 

46 # Find the start pos in `a` for longest common subseq aligned from right to left 

47 # between `a[:alen]` and `b[:blen]` 

48 def align_rec(alen: int, blen: int) -> int: 

49 if alen == 0 or blen == 0 or aseq[alen - 1] != bseq[blen - 1]: 

50 return alen 

51 

52 return align_rec(alen - 1, blen - 1) 

53 

54 aseq: Sequence[T] = iter_to_seq(a) 

55 bseq: Sequence[T] = iter_to_seq(b) 

56 

57 for k in range(*max( 

58 ( 

59 (align_rec(i, j), i) 

60 for i in range(len(aseq) + 1) 

61 for j in range(len(bseq) + 1) 

62 ), 

63 key=lambda x: x[1] - x[0], 

64 )): 

65 yield aseq[k] 

66 

67 

68def is_subseq[T](a: Iterable[T], b: Iterable[T]) -> bool: 

69 aseq: Sequence[T] = iter_to_seq(a) 

70 bseq: Sequence[T] = iter_to_seq(b) 

71 

72 if len(aseq) > len(bseq): 

73 return False 

74 

75 return any( 

76 aseq == bseq[j:j + len(aseq)] 

77 for j in range(len(bseq) - len(aseq) + 1) 

78 ) 

79 

80 

81def common_subseq_with_gaps[T](a: Iterable[T], b: Iterable[T]) -> Iterable[T]: 

82 alignment: tuple[Iterable[T | None], Iterable[T | None]] = align(a, b) 

83 

84 return ( 

85 x 

86 for x, y in zip( 

87 *alignment, 

88 strict=True, 

89 ) 

90 if x is not None and y is not None 

91 ) 

92 

93 

94def is_subseq_with_gaps[T](a: Iterable[T], b: Iterable[T]) -> bool: 

95 alignment: tuple[Iterable[T | None], Iterable[T | None]] = align(a, b) 

96 

97 return all( 

98 y is not None 

99 for y in alignment[1] 

100 ) 

101 

102 

103def align[T]( 

104 a: Iterable[T], 

105 b: Iterable[T], 

106 *, 

107 default: T | None = None, 

108) -> tuple[Iterable[T | None], Iterable[T | None]]: 

109 def merge( 

110 prev: tuple[int, tuple[Sequence[T | None], Sequence[T | None]]], 

111 curr: tuple[T | None, T | None], 

112 ) -> tuple[int, tuple[Sequence[T | None], Sequence[T | None]]]: 

113 prev_matches: int 

114 u: Sequence[T | None] 

115 v: Sequence[T | None] 

116 prev_matches, (u, v) = prev 

117 

118 x: T | None 

119 y: T | None 

120 x, y = curr 

121 

122 return (prev_matches + 1) if x == y else prev_matches, ([*u, x], [*v, y]) 

123 

124 @cache 

125 def align_rec(alen: int, blen: int) -> tuple[ 

126 int, 

127 tuple[Sequence[T | None], Sequence[T | None]], 

128 ]: 

129 if alen == 0: 

130 return 0, ( 

131 [default] * blen, bseq[:blen], 

132 ) 

133 if blen == 0: 

134 return 0, ( 

135 aseq[:alen], [default] * alen, 

136 ) 

137 

138 return max( 

139 ( 

140 merge(align_rec(alen - 1, blen), (aseq[alen - 1], default)), 

141 merge(align_rec(alen, blen - 1), (default, bseq[blen - 1])), 

142 merge(align_rec(alen - 1, blen - 1), (aseq[alen - 1], bseq[blen - 1])), 

143 ), 

144 key=lambda x: x[0], 

145 ) 

146 

147 aseq: Sequence[T] = iter_to_seq(a) 

148 bseq: Sequence[T] = iter_to_seq(b) 

149 

150 return align_rec(len(aseq), len(bseq))[1]