Coverage for src/seqrule/rulesets/dna.py: 52%

134 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-02-27 10:39 -0600

1""" 

2DNA sequence rules. 

3 

4This module implements sequence rules for DNA sequences, with support for: 

5- Base patterns and structural constraints 

6- Motif detection and analysis 

7- Secondary structure prediction 

8- Methylation patterns 

9- Common sequence elements (promoters, binding sites) 

10- GC content and skew analysis 

11- Codon usage patterns 

12 

13Common use cases: 

14- Validating PCR primers 

15- Checking promoter sequences 

16- Analyzing CpG islands 

17- Detecting binding motifs 

18- Validating gene sequences 

19""" 

20 

21import re 

22from dataclasses import dataclass 

23from enum import Enum 

24from typing import Callable, Optional 

25 

26from ..core import AbstractObject, Sequence 

27from ..dsl import DSLRule 

28 

29 

class BaseType(Enum):
    """Chemical classifications a DNA base can belong to.

    The members form three complementary pairs; the trailing comment on
    each member lists the bases that fall into that class.
    """

    # Ring structure
    PURINE = "purine"  # A, G
    PYRIMIDINE = "pyrimidine"  # C, T

    # Base-pairing strength
    STRONG = "strong"  # G, C (3 H-bonds)
    WEAK = "weak"  # A, T (2 H-bonds)

    # Functional group
    AMINO = "amino"  # A, C (amino group)
    KETO = "keto"  # G, T (keto group)

39 

40 

class MethylationState(Enum):
    """Possible methylation annotations for a nucleotide."""

    UNMETHYLATED = "unmethylated"
    METHYLATED = "methylated"
    HYDROXYMETHYLATED = "hydroxymethylated"  # 5-hmC
    UNKNOWN = "unknown"  # state has not been determined

48 

49 

@dataclass
class StructuralElement:
    """Numeric properties describing a DNA structural element.

    Attributes:
        melting_temp: Melting temperature in Celsius.
        gc_content: GC content as a fraction.
        stability: Free energy (ΔG) in kcal/mol.
        complexity: Sequence complexity score.
    """

    melting_temp: float
    gc_content: float
    stability: float
    complexity: float

58 

59 

class Nucleotide(AbstractObject):
    """
    A single DNA base together with its derived classifications.

    Properties (passed to AbstractObject.__init__ as keyword arguments):
        base: The nucleotide base (A, T, G, C)
        types: Set of BaseType members the base belongs to
        methylation: Methylation state, stored as the enum's string value
        position: Position in sequence (0-based), or None when not supplied
    """

    def __init__(
        self,
        base: str,
        methylation: MethylationState = MethylationState.UNKNOWN,
        position: Optional[int] = None,
    ):
        """
        Validate the base, derive its type set, and store the properties.

        Raises:
            ValueError: If base is not one of "A", "T", "G", "C".
        """
        valid_bases = ("A", "T", "G", "C")
        if base not in valid_bases:
            raise ValueError(f"Invalid base: {base}. Must be one of A, T, G, C")

        # Each classification paired with the bases that belong to it.
        membership = (
            (BaseType.PURINE, ("A", "G")),
            (BaseType.PYRIMIDINE, ("C", "T")),
            (BaseType.STRONG, ("G", "C")),
            (BaseType.WEAK, ("A", "T")),
            (BaseType.AMINO, ("A", "C")),
            (BaseType.KETO, ("G", "T")),
        )
        types = {kind for kind, members in membership if base in members}

        super().__init__(
            base=base, types=types, methylation=methylation.value, position=position
        )

    def __repr__(self) -> str:
        base = self.properties.get("base")
        return f"Nucleotide({base})"

102 

103 

def nucleotide_base_is(base: str) -> Callable[[AbstractObject], bool]:
    """Build a predicate that is true when an object's "base" entry equals base."""

    def has_base(obj: AbstractObject) -> bool:
        return obj["base"] == base

    return has_base

107 

108 

def nucleotide_type_is(base_type: BaseType) -> Callable[[AbstractObject], bool]:
    """Build a predicate that is true when base_type is among an object's "types"."""

    def has_type(obj: AbstractObject) -> bool:
        return base_type in obj["types"]

    return has_type

112 

113 

def create_no_consecutive_rule(count: int) -> DSLRule:
    """
    Creates a rule forbidding 'count' consecutive identical bases.

    The rule passes while every run of identical bases is shorter than
    ``count`` and fails as soon as any run reaches ``count`` bases. An
    empty sequence always passes.

    Args:
        count: Length of a same-base run that is no longer acceptable.

    Returns:
        A DSLRule wrapping the check, described as
        "no {count} consecutive identical bases".

    Example:
        no_poly_a = create_no_consecutive_rule(4)  # No poly-A stretches
    """

    def check_consecutive(seq: Sequence) -> bool:
        run_length = 0
        previous = object()  # sentinel that never compares equal to a base
        for nucleotide in seq:
            base = nucleotide["base"]
            if base == previous:
                run_length += 1
            else:
                previous, run_length = base, 1
            # A run of exactly `count` already violates the rule; the
            # earlier `> count` comparison let such runs through.
            if run_length >= count:
                return False
        return True

    return DSLRule(check_consecutive, f"no {count} consecutive identical bases")

138 

139 

def create_motif_rule(
    motif: str, max_mismatches: int = 0, allow_iupac: bool = True
) -> DSLRule:
    """
    Creates a rule requiring a specific sequence motif.

    With ``max_mismatches <= 0`` the motif is upper-cased, IUPAC codes are
    expanded to character classes (when ``allow_iupac`` is true) and the
    result is searched for as a regular expression.

    With ``max_mismatches > 0`` the motif is treated as a fixed-length
    template: the rule passes if some window of the sequence, as long as
    the motif, differs from it in at most ``max_mismatches`` positions.
    In this mode every motif character stands for exactly one position;
    regex syntax in the motif is not interpreted.

    Args:
        motif: The sequence motif (can include IUPAC codes if allow_iupac=True)
        max_mismatches: Maximum allowed mismatches
        allow_iupac: Whether to interpret IUPAC ambiguity codes

    Returns:
        A DSLRule wrapping the check, described as "contains motif {motif}".

    Example:
        tata_box = create_motif_rule("TATAAA")
        cpg_site = create_motif_rule("CG")
        near_tata = create_motif_rule("TATAAA", max_mismatches=1)
    """
    iupac_bases = {
        "R": "AG",  # Purine
        "Y": "CT",  # Pyrimidine
        "S": "GC",  # Strong
        "W": "AT",  # Weak
        "K": "GT",  # Keto
        "M": "AC",  # Amino
        "B": "CGT",  # Not A
        "D": "AGT",  # Not C
        "H": "ACT",  # Not G
        "V": "ACG",  # Not T
        "N": "ACGT",  # Any
    }
    template = motif.upper()
    description = f"contains motif {motif}"

    if max_mismatches <= 0:
        # Exact matching: unchanged regex search, so motifs that already
        # relied on regex syntax keep working.
        if allow_iupac:
            pattern = "".join(
                f"[{iupac_bases[b]}]" if b in iupac_bases else b for b in template
            )
        else:
            pattern = template
        regex = re.compile(pattern)

        def check_motif(seq: Sequence) -> bool:
            sequence = "".join(n["base"] for n in seq)
            return bool(regex.search(sequence))

        return DSLRule(check_motif, description)

    # Approximate matching: allowed[i] holds the bases accepted at motif
    # position i (several for an IUPAC code, otherwise the character itself).
    allowed = [
        set(iupac_bases[b]) if allow_iupac and b in iupac_bases else {b}
        for b in template
    ]
    width = len(allowed)

    def check_motif_with_mismatches(seq: Sequence) -> bool:
        sequence = "".join(n["base"] for n in seq)
        # Only windows that fit entirely inside the sequence are candidates.
        for start in range(len(sequence) - width + 1):
            mismatches = 0
            for offset, choices in enumerate(allowed):
                if sequence[start + offset] not in choices:
                    mismatches += 1
                    if mismatches > max_mismatches:
                        break
            else:
                # Loop finished without exceeding the mismatch budget.
                return True
        return False

    return DSLRule(check_motif_with_mismatches, description)

181 

182 

def create_gc_content_rule(min_percent: float, max_percent: float) -> DSLRule:
    """
    Build a rule that bounds the percentage of G and C bases.

    The rule passes when the GC percentage lies in the inclusive range
    [min_percent, max_percent]. An empty sequence fails the rule.

    Example:
        cpg_island = create_gc_content_rule(min_percent=60, max_percent=100)
    """
    description = f"GC content between {min_percent}% and {max_percent}%"

    def check_gc_content(seq: Sequence) -> bool:
        if not seq:
            return False
        total = len(seq)
        gc_total = 0
        for nucleotide in seq:
            if nucleotide["base"] in ("G", "C"):
                gc_total += 1
        percentage = (gc_total / total) * 100
        return min_percent <= percentage <= max_percent

    return DSLRule(check_gc_content, description)

201 

202 

def create_gc_skew_rule(window_size: int, threshold: float) -> DSLRule:
    """
    Build a rule limiting GC skew, (G-C)/(G+C), over every sliding window.

    The rule fails if any window of ``window_size`` bases has an absolute
    skew greater than ``threshold``. Windows containing neither G nor C
    are skipped, and sequences shorter than one window pass.

    Example:
        ori_finder = create_gc_skew_rule(window_size=1000, threshold=0.2)
    """

    def check_gc_skew(seq: Sequence) -> bool:
        total = len(seq)
        if total < window_size:
            return True

        for start in range(total - window_size + 1):
            g_total = 0
            c_total = 0
            for nucleotide in seq[start : start + window_size]:
                if nucleotide["base"] == "G":
                    g_total += 1
                elif nucleotide["base"] == "C":
                    c_total += 1

            gc_total = g_total + c_total
            if gc_total == 0:
                # Skew is undefined without any G or C in the window.
                continue
            if abs((g_total - c_total) / gc_total) > threshold:
                return False
        return True

    return DSLRule(check_gc_skew, f"GC skew <= {threshold} in {window_size}bp windows")

227 

228 

def create_methylation_rule(pattern: str = "CG") -> DSLRule:
    """
    Creates a rule checking methylation patterns.

    ``pattern`` is interpreted as a regular expression over the base
    string. The rule fails if any nucleotide inside a (non-overlapping)
    match is explicitly UNMETHYLATED; every other methylation state,
    including UNKNOWN, is accepted. A sequence with no match passes.

    The pattern is compiled when the rule is created, so an invalid
    pattern raises ``re.error`` here rather than on first use.

    Args:
        pattern: Regular expression identifying the sites to check.

    Returns:
        A DSLRule wrapping the check, described as
        "methylated {pattern} sites".

    Example:
        cpg_methylation = create_methylation_rule("CG")
    """
    site_regex = re.compile(pattern)
    unmethylated = MethylationState.UNMETHYLATED.value

    def check_methylation(seq: Sequence) -> bool:
        bases = "".join(n["base"] for n in seq)
        for match in site_regex.finditer(bases):
            # Use the actual match span: len(pattern) is only the site
            # length for literal patterns, not for e.g. "C[ACT]G".
            for index in range(match.start(), match.end()):
                if seq[index]["methylation"] == unmethylated:
                    return False
        return True

    return DSLRule(check_methylation, f"methylated {pattern} sites")

249 

250 

def create_complementary_rule(other_seq: Sequence) -> DSLRule:
    """
    Build a rule that passes when a sequence is the base-by-base complement
    of ``other_seq`` (A<->T, G<->C, compared position by position).

    Sequences whose length differs from the reference fail the rule.

    Example:
        is_complement = create_complementary_rule(forward_strand)
    """
    complement = {"A": "T", "T": "A", "G": "C", "C": "G"}

    def check_complementary(seq: Sequence) -> bool:
        if len(seq) != len(other_seq):
            return False
        for candidate, reference in zip(seq, other_seq):
            pairs_up = candidate["base"] == complement[reference["base"]]
            if not pairs_up:
                return False
        return True

    return DSLRule(check_complementary, "is complementary to reference sequence")

268 

269 

def create_complexity_rule(min_complexity: float) -> DSLRule:
    """
    Build a rule requiring a minimum sequence complexity.

    Complexity is the number of distinct k-mers divided by the number of
    k-mers that could occur, min(4**k, number of k-mer windows), with
    k = min(len(seq), 5). Empty sequences, sequences shorter than 3 bases
    and sequences made of a single repeated base score 0.0.

    Example:
        sufficient_complexity = create_complexity_rule(min_complexity=0.8)
    """

    def calculate_complexity(seq: Sequence) -> float:
        if not seq:
            return 0.0

        bases = "".join(n["base"] for n in seq)

        # Very short sequences and homopolymers count as zero complexity.
        if len(bases) < 3 or len(set(bases)) == 1:
            return 0.0

        k = min(len(seq), 5)
        seen_kmers = set()
        for start in range(len(bases) - k + 1):
            seen_kmers.add(bases[start : start + k])

        # Cannot see more distinct k-mers than windows, nor more than 4**k.
        max_kmers = min(4**k, len(seq) - k + 1)
        return len(seen_kmers) / max_kmers

    def check_complexity(seq: Sequence) -> bool:
        score = calculate_complexity(seq)
        return score >= min_complexity

    return DSLRule(check_complexity, f"sequence complexity >= {min_complexity}")

303 

304 

# Ready-made rule sets for common DNA sequence elements
promoter_rules = [
    create_motif_rule("TATAAA", max_mismatches=1),  # TATA box
    create_gc_content_rule(40, 60),
]

cpg_island_rules = [
    create_gc_content_rule(60, 100),
    create_motif_rule("CG"),
    create_complexity_rule(0.8),
]

primer_rules = [
    create_gc_content_rule(40, 60),
    create_no_consecutive_rule(4),
]

# Example sequences
tata_box = list(map(Nucleotide, "TATAAA"))
cpg_site = [
    Nucleotide(base, methylation=MethylationState.METHYLATED) for base in "CG"
]
primer = list(map(Nucleotide, "ATCGATCGATCG"))