Coverage for src/seqrule/rulesets/dna.py: 52%
134 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-27 10:39 -0600
« prev ^ index » next coverage.py v7.6.12, created at 2025-02-27 10:39 -0600
1"""
2DNA sequence rules.
4This module implements sequence rules for DNA sequences, with support for:
5- Base patterns and structural constraints
6- Motif detection and analysis
7- Secondary structure prediction
8- Methylation patterns
9- Common sequence elements (promoters, binding sites)
10- GC content and skew analysis
11- Codon usage patterns
13Common use cases:
14- Validating PCR primers
15- Checking promoter sequences
16- Analyzing CpG islands
17- Detecting binding motifs
18- Validating gene sequences
19"""
21import re
22from dataclasses import dataclass
23from enum import Enum
24from typing import Callable, Optional
26from ..core import AbstractObject, Sequence
27from ..dsl import DSLRule
class BaseType(Enum):
    """Chemical classifications that a DNA base can belong to."""

    # Ring structure
    PURINE = "purine"  # A, G
    PYRIMIDINE = "pyrimidine"  # C, T
    # Pairing strength
    STRONG = "strong"  # G, C (3 H-bonds)
    WEAK = "weak"  # A, T (2 H-bonds)
    # Functional group
    AMINO = "amino"  # A, C (amino group)
    KETO = "keto"  # G, T (keto group)
class MethylationState(Enum):
    """Possible methylation states recorded for a nucleotide."""

    UNMETHYLATED = "unmethylated"
    METHYLATED = "methylated"
    HYDROXYMETHYLATED = "hydroxymethylated"  # 5-hmC
    UNKNOWN = "unknown"  # state was not determined
@dataclass
class StructuralElement:
    """Numeric properties describing a DNA structural element."""

    # Melting temperature in Celsius
    melting_temp: float
    # GC content as fraction
    gc_content: float
    # Free energy (ΔG) in kcal/mol
    stability: float
    # Sequence complexity score
    complexity: float
class Nucleotide(AbstractObject):
    """
    A single DNA nucleotide and its derived properties.

    Properties passed to the AbstractObject base:
        base: The nucleotide base (A, T, G, C)
        types: Set of BaseType classifications the base belongs to
        methylation: Methylation state, stored as the enum's string value
        position: Position in sequence (0-based), or None if not given
    """

    def __init__(
        self,
        base: str,
        methylation: MethylationState = MethylationState.UNKNOWN,
        position: Optional[int] = None,
    ):
        """
        Validate the base, classify it, and store the properties.

        Raises:
            ValueError: If base is not one of A, T, G, C.
        """
        if base not in ("A", "T", "G", "C"):
            raise ValueError(f"Invalid base: {base}. Must be one of A, T, G, C")

        # Every base belongs to exactly one class in each of the three pairs.
        memberships = (
            (BaseType.PURINE, ("A", "G")),
            (BaseType.PYRIMIDINE, ("C", "T")),
            (BaseType.STRONG, ("G", "C")),
            (BaseType.WEAK, ("A", "T")),
            (BaseType.AMINO, ("A", "C")),
            (BaseType.KETO, ("G", "T")),
        )
        types = {kind for kind, members in memberships if base in members}

        super().__init__(
            base=base,
            types=types,
            methylation=methylation.value,
            position=position,
        )

    def __repr__(self) -> str:
        """Return a short representation showing only the base."""
        letter = self.properties.get("base")
        return f"Nucleotide({letter})"
def nucleotide_base_is(base: str) -> Callable[[AbstractObject], bool]:
    """Build a predicate that is true when an object's "base" equals `base`."""

    def has_base(obj):
        return obj["base"] == base

    return has_base
def nucleotide_type_is(base_type: BaseType) -> Callable[[AbstractObject], bool]:
    """Build a predicate that is true when `base_type` is in an object's "types"."""

    def has_type(obj):
        return base_type in obj["types"]

    return has_type
def create_no_consecutive_rule(count: int) -> DSLRule:
    """
    Creates a rule forbidding 'count' (or more) consecutive identical bases.

    A sequence passes only if its longest run of one repeated base is
    shorter than 'count'. An empty sequence always passes.

    Args:
        count: Shortest run length that is rejected; must be at least 1.

    Raises:
        ValueError: If count is less than 1.

    Example:
        no_poly_a = create_no_consecutive_rule(4)  # No poly-A stretches
    """
    if count < 1:
        raise ValueError(f"count must be at least 1, got {count}")

    def check_consecutive(seq: Sequence) -> bool:
        # Single pass: track the current run and fail as soon as it reaches
        # the forbidden length.
        run_base = None
        run_length = 0
        for nucleotide in seq:
            base = nucleotide["base"]
            if run_length and base == run_base:
                run_length += 1
            else:
                run_base, run_length = base, 1
            if run_length >= count:
                return False
        return True

    return DSLRule(check_consecutive, f"no {count} consecutive identical bases")
def create_motif_rule(
    motif: str, max_mismatches: int = 0, allow_iupac: bool = True
) -> DSLRule:
    """
    Creates a rule requiring a specific sequence motif.

    With max_mismatches == 0 (or negative) the upper-cased motif is compiled
    as a regular expression and searched for in the sequence, so regex syntax
    in the motif is honoured. With max_mismatches > 0 every character of the
    motif is treated as one position and the sequence is scanned window by
    window, accepting the first window that differs from the motif in at most
    max_mismatches positions; regex syntax is not interpreted in that mode.

    Args:
        motif: The sequence motif (can include IUPAC codes if allow_iupac=True)
        max_mismatches: Maximum allowed mismatches
        allow_iupac: Whether to interpret IUPAC ambiguity codes

    Example:
        tata_box = create_motif_rule("TATAAA")
        cpg_site = create_motif_rule("CG")
        loose_tata = create_motif_rule("TATAAA", max_mismatches=1)
    """
    iupac_members = {
        "R": "AG",  # Purine
        "Y": "CT",  # Pyrimidine
        "S": "GC",  # Strong
        "W": "AT",  # Weak
        "K": "GT",  # Keto
        "M": "AC",  # Amino
        "B": "CGT",  # Not A
        "D": "AGT",  # Not C
        "H": "ACT",  # Not G
        "V": "ACG",  # Not T
        "N": "ACGT",  # Any
    }
    symbols = motif.upper()

    if max_mismatches <= 0:
        # Exact matching: expand ambiguity codes into character classes and
        # let the regex engine do the search.
        if allow_iupac:
            pattern = "".join(
                f"[{iupac_members[s]}]" if s in iupac_members else s
                for s in symbols
            )
        else:
            pattern = symbols
        regex = re.compile(pattern)

        def check_motif(seq: Sequence) -> bool:
            sequence = "".join(n["base"] for n in seq)
            return bool(regex.search(sequence))

    else:
        # Approximate matching: one set of acceptable bases per motif position.
        allowed = [
            frozenset(iupac_members.get(s, s) if allow_iupac else s)
            for s in symbols
        ]
        width = len(allowed)

        def check_motif(seq: Sequence) -> bool:
            bases = [n["base"] for n in seq]
            for start in range(len(bases) - width + 1):
                mismatches = 0
                for offset, choices in enumerate(allowed):
                    if bases[start + offset] not in choices:
                        mismatches += 1
                        if mismatches > max_mismatches:
                            break
                else:
                    # Window stayed within the mismatch budget.
                    return True
            return False

    return DSLRule(check_motif, f"contains motif {motif}")
def create_gc_content_rule(min_percent: float, max_percent: float) -> DSLRule:
    """
    Creates a rule requiring GC content within a percentage range.

    Both bounds are inclusive and expressed in percent (0-100). An empty
    sequence never satisfies the rule.

    Example:
        cpg_island = create_gc_content_rule(min_percent=60, max_percent=100)
    """

    def check_gc_content(seq: Sequence) -> bool:
        if not seq:
            return False
        strong_total = 0
        for nucleotide in seq:
            if nucleotide["base"] in ("G", "C"):
                strong_total += 1
        percent = (strong_total / len(seq)) * 100
        return min_percent <= percent <= max_percent

    description = f"GC content between {min_percent}% and {max_percent}%"
    return DSLRule(check_gc_content, description)
def create_gc_skew_rule(window_size: int, threshold: float) -> DSLRule:
    """
    Creates a rule checking GC skew [(G-C)/(G+C)] in sliding windows.

    The rule fails if any window of window_size consecutive bases has an
    absolute skew greater than threshold. Windows containing neither G nor C
    are skipped. Sequences shorter than one window, and non-positive window
    sizes, pass because there is nothing to inspect.

    Args:
        window_size: Number of bases per window
        threshold: Largest absolute skew a window may have

    Example:
        ori_finder = create_gc_skew_rule(window_size=1000, threshold=0.2)
    """

    def check_gc_skew(seq: Sequence) -> bool:
        total = len(seq)
        if window_size <= 0 or total < window_size:
            return True

        bases = [n["base"] for n in seq]
        # Count the first window once, then update the counts incrementally
        # as the window slides, instead of recounting every window.
        g_count = bases[:window_size].count("G")
        c_count = bases[:window_size].count("C")

        start = 0
        while True:
            gc_total = g_count + c_count
            if gc_total and abs((g_count - c_count) / gc_total) > threshold:
                return False

            end = start + window_size
            if end >= total:
                return True

            leaving, entering = bases[start], bases[end]
            g_count += (entering == "G") - (leaving == "G")
            c_count += (entering == "C") - (leaving == "C")
            start += 1

    return DSLRule(check_gc_skew, f"GC skew <= {threshold} in {window_size}bp windows")
def create_methylation_rule(pattern: str = "CG") -> DSLRule:
    """
    Creates a rule checking methylation patterns.

    The pattern is interpreted as a regular expression over the base string.
    The rule fails if any nucleotide inside a (non-overlapping) match is
    explicitly unmethylated; every other methylation value, including
    "unknown", passes. A sequence with no match passes.

    Args:
        pattern: Regular expression identifying the sites to check

    Example:
        cpg_methylation = create_methylation_rule("CG")
    """
    site_regex = re.compile(pattern)
    unmethylated = MethylationState.UNMETHYLATED.value

    def check_methylation(seq: Sequence) -> bool:
        bases = "".join(n["base"] for n in seq)
        for match in site_regex.finditer(bases):
            # Use the span of the actual match: the pattern's text length is
            # not the match length once the pattern contains regex syntax.
            for index in range(match.start(), match.end()):
                if seq[index]["methylation"] == unmethylated:
                    return False
        return True

    return DSLRule(check_methylation, f"methylated {pattern} sites")
def create_complementary_rule(other_seq: Sequence) -> DSLRule:
    """
    Creates a rule requiring the sequence to be complementary to another.

    Sequences are compared position by position (A<->T, G<->C); sequences of
    different lengths are never complementary.

    Example:
        is_complement = create_complementary_rule(forward_strand)
    """
    pairing = {"A": "T", "T": "A", "G": "C", "C": "G"}

    def check_complementary(seq: Sequence) -> bool:
        if len(seq) != len(other_seq):
            return False
        for candidate, reference in zip(seq, other_seq):
            if candidate["base"] != pairing[reference["base"]]:
                return False
        return True

    return DSLRule(check_complementary, "is complementary to reference sequence")
def create_complexity_rule(min_complexity: float) -> DSLRule:
    """
    Creates a rule checking sequence complexity.

    Complexity is the number of distinct k-mers observed divided by the
    largest number that could be observed, with k = min(sequence length, 5).
    Empty sequences, sequences under 3 bases and single-base repeats score 0.

    Example:
        sufficient_complexity = create_complexity_rule(min_complexity=0.8)
    """

    def calculate_complexity(seq: Sequence) -> float:
        if not seq:
            return 0.0

        text = "".join(item["base"] for item in seq)
        # Too short to judge, or a homopolymer: treat as zero complexity.
        if len(text) < 3 or len(set(text)) == 1:
            return 0.0

        # Linguistic complexity: unique k-mers / possible k-mers.
        k = min(len(seq), 5)
        observed = set()
        for start in range(len(text) - k + 1):
            observed.add(text[start : start + k])
        possible = min(4**k, len(seq) - k + 1)
        return len(observed) / possible

    def check_complexity(seq: Sequence) -> bool:
        score = calculate_complexity(seq)
        return score >= min_complexity

    return DSLRule(check_complexity, f"sequence complexity >= {min_complexity}")
# Ready-made rule sets for common DNA sequence checks

# Promoter: TATA box motif plus moderate GC content
promoter_rules = [
    create_motif_rule("TATAAA", max_mismatches=1),  # TATA box
    create_gc_content_rule(min_percent=40, max_percent=60),
]

# CpG island: GC-rich, contains a CG site, and is not low-complexity
cpg_island_rules = [
    create_gc_content_rule(min_percent=60, max_percent=100),
    create_motif_rule("CG"),
    create_complexity_rule(min_complexity=0.8),
]

# PCR primer: moderate GC content and no runs of identical bases
primer_rules = [
    create_gc_content_rule(min_percent=40, max_percent=60),
    create_no_consecutive_rule(4),
]
# Example sequences, each a list of Nucleotide objects
tata_box = list(map(Nucleotide, "TATAAA"))
# A CG dinucleotide with both bases marked as methylated
cpg_site = [
    Nucleotide(letter, methylation=MethylationState.METHYLATED) for letter in "CG"
]
primer = list(map(Nucleotide, "ATCGATCGATCG"))