Module xi_covutils.primers.infer
Functions and classes to guess primers from sequences.
Expand source code
"""
Functions and classes to guess primers from sequences.
"""
from abc import abstractmethod
from dataclasses import dataclass
from enum import Enum
import sys
from typing import Literal, Protocol, Tuple, Union, cast, runtime_checkable
import click
import numpy as np
import pandas as pd
from typing_extensions import override
from xi_covutils.seqs.seq_collection import (
BioAlphabet,
BioSeq,
SequenceCollection
)
class NucleicSequenceEnd(Enum):
    """
    Identifies one of the two termini of a nucleic acid sequence.
    """
    # The 5' terminus, i.e. the start of the sequence as written.
    FIVEPRIME = 0
    # The 3' terminus, i.e. the end of the sequence as written.
    THREEPRIME = 1
# IUPAC nucleotide codes, ordered by the number of bases each one stands for.
deg_indexes = list("ACGTMRWSYKVHDBN")
# The same codes ordered by bitmask: position i holds the code whose base set
# has the value i + 1 when A=1, C=2, G=4 and T=8.
rev_indexes = list("ACMGRSVTWYHKDBN")
# Maps each IUPAC code (in ``deg_indexes`` order) to the array of plain bases
# it represents.
deg_bases = pd.Series(
    [
        np.array(list(expansion))
        for expansion in (
            "A", "C", "G", "T",
            "AC", "AG", "AT",
            "CG", "CT", "GT",
            "ACG", "ACT",
            "AGT", "CGT",
            "ACGT",
        )
    ],
    deg_indexes
)
@dataclass
class CountResult:
    """
    Outcome of comparing one primer against a collection of sequences, as
    returned by PrimerMatchCounter.count.
    """
    # The primer in the orientation that was compared; PrimerMatchCounter.count
    # stores the reverse complement here when the 3' end was checked.
    primer: BioSeq
    # Number of sequences counted as matching the primer.
    matches: int
    # Total number of sequences minus ``matches``.
    mismatches: int
class PrimerMatchCounter:
    """
    Count matches and mismatches of a primer to a collection of sequences.

    A primer position is compatible with a sequence position when the two
    IUPAC codes share at least one base, so degenerate codes are accepted on
    both sides.
    """
    @staticmethod
    def _degeneracy_matrix() -> pd.DataFrame:
        """
        Builds the IUPAC membership table.

        Returns:
            pd.DataFrame: One row per IUPAC code (indexed in ``rev_indexes``
                order) and one 0/1 column per base (A, C, G, T).
        """
        return pd.DataFrame(
            [
                # Bit 0 of the code number stands for A, bit 1 for C, bit 2
                # for G and bit 3 for T; reversing the binary text puts the
                # A bit in the first column.
                [int(bit) for bit in reversed(f"{code:04b}")]
                for code in range(1, 16)
            ],
            columns=list("ACGT"),
            index=rev_indexes
        )

    def _shared_bases(
        self,
        sequences: SequenceCollection,
        primer: BioSeq,
        end: NucleicSequenceEnd
    ) -> Tuple[BioSeq, list]:
        """
        Computes, for every sequence, how many bases each position shares with
        the primer.

        Args:
            sequences (SequenceCollection): A Sequence collection
            primer (BioSeq): A primer to check
            end (NucleicSequenceEnd): The end of the sequence to check the
                primer.

        Returns:
            Tuple[BioSeq, list]: The primer in the orientation used for the
                comparison (reverse complemented for the 3' end) and a list
                with one numpy array per sequence. Each array holds, for every
                primer position, the number of bases shared by the primer
                code and the sequence code (0 means a mismatch).
        """
        if end == NucleicSequenceEnd.THREEPRIME:
            primer = primer.reverse_complement()
        primer_codes = list(primer.sequence)
        extractor = EndSequenceExtractor.build(
            end=end,
            subsequence_length=len(primer_codes)
        )
        deg_matrix = self._degeneracy_matrix()
        primer_matrix = deg_matrix.loc[primer_codes, :].to_numpy()
        # NOTE(review): every sequence is assumed to be at least as long as
        # the primer and to contain only upper case IUPAC codes - confirm
        # against callers.
        shared = [
            (
                deg_matrix
                .loc[list(extractor.subsequence(s).sequence), :]
                .to_numpy() *
                primer_matrix
            )
            .sum(axis=1)
            for s in sequences
        ]
        return primer, shared

    def count(
        self,
        sequences: SequenceCollection,
        primer: BioSeq,
        end: NucleicSequenceEnd
    ) -> CountResult:
        """
        Count matches and mismatches of a primer to a collection of sequences.

        A sequence counts as exactly one match when every primer position is
        compatible with it, and as one mismatch otherwise.

        Args:
            sequences (SequenceCollection): A Sequence collection
            primer (BioSeq): A primer to check
            end (NucleicSequenceEnd): The end of the sequence to check the
                primer.

        Returns:
            CountResult: The result of the counts. Its primer is the reverse
                complement of the given one when the 3' end is checked.
        """
        oriented_primer, shared = self._shared_bases(sequences, primer, end)
        # Each sequence contributes at most 1: multiplying the per-position
        # counts would score a sequence several times when degenerate codes
        # overlap (e.g. R against R shares two bases).
        matches = sum(
            1
            for per_position in shared
            if bool((per_position > 0).all())
        )
        mismatches = len(shared) - matches
        return CountResult(oriented_primer, matches, mismatches)

    def histogram(
        self,
        sequences: SequenceCollection,
        primer: BioSeq,
        end: NucleicSequenceEnd
    ) -> pd.Series:
        """
        Makes a histogram of mismatches of a primer to a collection of
        sequences.

        Args:
            sequences (SequenceCollection): A Sequence collection
            primer (BioSeq): A primer to check
            end (NucleicSequenceEnd): The end of the sequence to check the
                primer.

        Returns:
            pd.Series: Indexed by number of mismatched positions; each value
                is the number of sequences with that many mismatches.
        """
        _, shared = self._shared_bases(sequences, primer, end)
        mismatches = [
            np.count_nonzero(per_position == 0)
            for per_position in shared
        ]
        return pd.Series(mismatches).value_counts()
class PrimerGuesser:
    """
    Guess primer sequences from a collection of sequences.
    The primer guessed will start at the 5' end or at the 3' end.

    The extractor is stored in the ``sequence_estractor`` attribute; the
    spelling is kept for backward compatibility.
    """
    def __init__(
        self,
        end: NucleicSequenceEnd,
        primer_length: int,
        min_frq: float = 0.01
    ):
        """
        Creates a new guesser.

        Args:
            end (NucleicSequenceEnd): The end of the sequences to look at.
            primer_length (int): Number of positions of the guessed primer.
            min_frq (float): Minimum frequency for a base to be included in
                a primer position. Must be in range [0, 1).

        Raises:
            ValueError: If min_frq is outside of [0, 1).
        """
        if min_frq < 0 or min_frq >= 1:
            raise ValueError(f"min_frq should be in range [0, 1): {min_frq}")
        self.end = end
        self.primer_length = primer_length
        self.min_frq = min_frq
        self.sequence_estractor = EndSequenceExtractor.build(end, primer_length)

    def guess(self, sequences: SequenceCollection) -> BioSeq:
        """
        Guess a primer Sequence.

        Every position gets the IUPAC code covering all bases whose frequency
        is at least ``min_frq``.

        Args:
            sequences (SequenceCollection): A collection of sequences.

        Returns:
            BioSeq: A BioSeq sequence representing the primer. It is reverse
                complemented when guessing from the 3' end.
        """
        frequencies = (
            self
            .inspect(sequences)
            # The bit weights below rely on exactly these four columns, in
            # this order.
            .reindex(columns=list("ACGT"), fill_value=0)
        )
        kept = (frequencies >= self.min_frq) & (frequencies > 0)
        # A=1, C=2, G=4, T=8: the sum is the bitmask of the kept bases.
        codes = kept.to_numpy().astype(int) @ np.array([1, 2, 4, 8])
        # rev_indexes is ordered by bitmask - 1. An empty mask (0) maps to
        # index -1, which is "N", the same outcome the previous
        # implementation produced.
        seq = "".join(rev_indexes[int(code) - 1] for code in codes)
        primer = BioSeq("primer", seq, BioAlphabet.DNA)
        if self.end == NucleicSequenceEnd.THREEPRIME:
            return primer.reverse_complement()
        return primer

    def inspect(self, sequences: SequenceCollection) -> pd.DataFrame:
        """
        Computes bases frequencies for each position of the guessed primer.

        Degenerate codes found in the sequences add one count to each of the
        bases they stand for.

        Args:
            sequences (SequenceCollection): A collection of sequences.

        Returns:
            pd.DataFrame: A Dataframe with the frequencies of each base at
                each position. Rows are positions; columns are always A, C,
                G and T, including bases that were never observed.
        """
        seqs = [
            list(self.sequence_estractor.subsequence(s).sequence)
            for s in sequences
        ]
        positions = pd.DataFrame(seqs)
        counts = positions.apply(
            lambda column:
                pd.Series(
                    np.concatenate(deg_bases[column.tolist()].to_numpy())
                    .flatten()
                )
                .value_counts(),
            axis=0
        )
        result = (
            counts
            # Without this, a base absent from every position would have no
            # column at all and guess() could not weight the columns.
            .reindex(list("ACGT"))
            .fillna(0)
            .transpose()
            .apply(lambda x: x / sum(x), axis=1)
        )
        return cast(pd.DataFrame, result)
@runtime_checkable
class EndSequenceExtractor(Protocol):
    """
    Interface of the objects that cut a subsequence out of one end of a
    BioSeq.
    """
    @abstractmethod
    def subsequence(self, seq: BioSeq) -> BioSeq:
        """
        Extract a subsequence from a BioSeq.
        """

    @staticmethod
    def build(
        end: NucleicSequenceEnd,
        subsequence_length: int
    ) -> "EndSequenceExtractor":
        """
        Builds a 5' sequence extractor or a 3' sequence extractor.

        Args:
            end (NucleicSequenceEnd): The end the extractor takes the
                subsequence from.
            subsequence_length (int): Length handed to the extractor.

        Returns:
            EndSequenceExtractor: A FivePrimeSequenceExtractor when ``end`` is
                FIVEPRIME, a ThreePrimeSequenceExtractor for any other value.
        """
        extractor_class = (
            FivePrimeSequenceExtractor
            if end == NucleicSequenceEnd.FIVEPRIME
            else ThreePrimeSequenceExtractor
        )
        return extractor_class(subsequence_length)
class FivePrimeSequenceExtractor(EndSequenceExtractor):
    """
    A 5' sequence extractor: keeps the leading part of a sequence.
    """
    def __init__(self, subsequence_length:int):
        # Number of leading characters to keep.
        self.subsequence_length = subsequence_length

    @override
    def subsequence(self, seq: BioSeq) -> BioSeq:
        """
        Returns a new BioSeq holding the first ``subsequence_length``
        characters of ``seq``, with the same identifier and alphabet.
        """
        leading = seq.sequence[:self.subsequence_length]
        return BioSeq(seq.identifier, leading, seq.alphabet)
class ThreePrimeSequenceExtractor(EndSequenceExtractor):
    """
    A 3' sequence extractor: keeps the trailing part of a sequence.
    """
    def __init__(self, subsequence_length:int):
        # Number of trailing characters to keep.
        self.subsequence_length = subsequence_length

    @override
    def subsequence(self, seq: BioSeq) -> BioSeq:
        """
        Returns a new BioSeq holding the last ``subsequence_length``
        characters of ``seq``, with the same identifier and alphabet.

        A length of zero yields an empty sequence.
        """
        length = self.subsequence_length
        # ``sequence[-0:]`` is ``sequence[0:]``, i.e. the WHOLE sequence, so
        # the zero length case needs its own empty slice.
        if length:
            new_seq = seq.sequence[-length:]
        else:
            new_seq = seq.sequence[:0]
        return BioSeq(seq.identifier, new_seq, seq.alphabet)
# Type alias for the sequence file formats accepted by the commands below.
SeqType = Union[Literal["fasta"], Literal["fastq"]]
@click.command()
@click.argument(
    'sequence_file',
    type=click.Path(exists=True),
)
@click.argument(
    'outfile',
    type=click.Path(),
)
@click.option(
    "--seq-type",
    default = "fasta",
    help="Input file type. Default: fasta",
    type=click.Choice(["fasta", "fastq"], case_sensitive=False)
)
@click.option(
    "--end",
    default = "both",
    help="Sequence end to search for primers. Default: both",
    type=click.Choice(["5", "3", "both"], case_sensitive=False)
)
@click.option(
    "--length",
    default = 20,
    help="Primer length",
    type=click.INT
)
@click.option(
    "--min-frq",
    default = 0.01,
    help="Minimum frequency for a base to be included in the primer sequence.",
    type=click.FLOAT
)
# pylint: disable=too-many-arguments
def guess(
    sequence_file: str,
    outfile:str,
    seq_type: str = "fasta",
    end:str = "both",
    length:int = 20,
    min_frq:float = 0.01
):
    """
    Guess primers from sequence.
    """
    # The docstring above is shown by click as the command help, so the
    # details live in comments:
    #   sequence_file: path of the fasta/fastq file to read.
    #   outfile: path of the fasta file that receives the guessed primers.
    #   seq_type: "fasta" or "fastq" (case insensitive).
    #   end: "5", "3" or "both"; one primer is written per requested end.
    #   length / min_frq: forwarded to PrimerGuesser.
    seq_type = seq_type.lower()
    if seq_type == "fasta":
        seqs = SequenceCollection.from_fasta(sequence_file)
    elif seq_type == "fastq":
        seqs = SequenceCollection.from_fastq(sequence_file)
    else:
        sys.exit("Invalid sequence type")
    ends_by_option = {
        "5": [NucleicSequenceEnd.FIVEPRIME],
        "3": [NucleicSequenceEnd.THREEPRIME],
        "both": [NucleicSequenceEnd.FIVEPRIME, NucleicSequenceEnd.THREEPRIME],
    }
    seq_end = ends_by_option.get(end.lower())
    if seq_end is None:
        # This used to report "Invalid sequence type", which pointed the user
        # at the wrong option.
        sys.exit("Invalid sequence end")
    result = []
    for c_end in seq_end:
        guesser = PrimerGuesser(c_end, length, min_frq)
        primer = guesser.guess(seqs)
        # Tag each primer with the end it was guessed from.
        primer.identifier = f"{primer.identifier}.{c_end}"
        result.append(primer)
    result_seq = SequenceCollection(result)
    result_seq.to_fasta(outfile)
@click.command()
@click.argument(
    'sequence_file',
    type=click.Path(exists=True),
)
@click.argument(
    'outfile',
    type=click.Path(),
)
@click.argument("primer")
# Only a single end can be summarised in one histogram. The option used to
# default to (and offer) "both", which the body always rejected, so the
# command failed unless --end was given explicitly.
@click.option(
    "--end",
    default = "5",
    help="Sequence end to search for primers. Default: 5",
    type=click.Choice(["5", "3"], case_sensitive=False)
)
@click.option(
    "--seq-type",
    default = "fasta",
    help="Input file type. Default: fasta",
    type=click.Choice(["fasta", "fastq"], case_sensitive=False)
)
# pylint: disable=too-many-arguments
def mismatch_histogram(
    sequence_file: str,
    outfile:str,
    primer:str,
    seq_type: str = "fasta",
    end:str = "5",
):
    """
    Makes a histogram of mismatches of guessed primers.
    """
    # The docstring above is shown by click as the command help, so the
    # details live in comments:
    #   sequence_file: path of the fasta/fastq file to read.
    #   outfile: path of the headerless CSV (mismatch count, sequences).
    #   primer: primer sequence to compare against the sequence end.
    #   seq_type: "fasta" or "fastq" (case insensitive).
    #   end: "5" or "3".
    seq_type = seq_type.lower()
    if seq_type == "fasta":
        seqs = SequenceCollection.from_fasta(sequence_file)
    elif seq_type == "fastq":
        seqs = SequenceCollection.from_fastq(sequence_file)
    else:
        sys.exit("Invalid sequence type")
    seq_end = end.lower()
    if seq_end == "5":
        seq_end = NucleicSequenceEnd.FIVEPRIME
    elif seq_end == "3":
        seq_end = NucleicSequenceEnd.THREEPRIME
    else:
        sys.exit("Invalid sequence end")
    c_primer = BioSeq(f"primer.{seq_end}", primer, BioAlphabet.DNA)
    counter = PrimerMatchCounter()
    counts = counter.histogram(seqs, c_primer, seq_end)
    # Order rows by number of mismatches rather than by frequency.
    counts = counts.sort_index()
    counts.to_csv(outfile, header = False)
@click.command()
@click.argument(
    'sequence_file',
    type=click.Path(exists=True),
)
@click.argument(
    'outfile',
    type=click.Path(),
)
@click.argument("primer")
@click.option(
    "--seq-type",
    default = "fasta",
    help="Input file type. Default: fasta",
    type=click.Choice(["fasta", "fastq"], case_sensitive=False)
)
@click.option(
    "--end",
    default = "both",
    help="Sequence end to search for primers. Default: both",
    type=click.Choice(["5", "3", "both"], case_sensitive=False)
)
# pylint: disable=too-many-arguments
def count_matches(
    sequence_file: str,
    outfile:str,
    primer:str,
    seq_type: str = "fasta",
    end:str = "5",
):
    """
    Count matches and mismatches of guessed primers.
    """
    # The docstring above is shown by click as the command help, so the
    # details live in comments:
    #   sequence_file: path of the fasta/fastq file to read.
    #   outfile: path of the CSV that receives one row per checked end.
    #   primer: primer sequence to compare against the sequence ends.
    #   seq_type: "fasta" or "fastq" (case insensitive).
    #   end: "5", "3" or "both".
    seq_type = seq_type.lower()
    if seq_type == "fasta":
        seqs = SequenceCollection.from_fasta(sequence_file)
    elif seq_type == "fastq":
        seqs = SequenceCollection.from_fastq(sequence_file)
    else:
        sys.exit("Invalid sequence type")
    ends_by_option = {
        "5": [NucleicSequenceEnd.FIVEPRIME],
        "3": [NucleicSequenceEnd.THREEPRIME],
        "both": [NucleicSequenceEnd.FIVEPRIME, NucleicSequenceEnd.THREEPRIME],
    }
    seq_end = ends_by_option.get(end.lower())
    if seq_end is None:
        # This used to report "Invalid sequence type", which pointed the user
        # at the wrong option.
        sys.exit("Invalid sequence end")
    counter = PrimerMatchCounter()
    result: list[CountResult] = [
        counter.count(
            seqs,
            BioSeq(f"primer.{c_end}", primer, BioAlphabet.DNA),
            c_end
        )
        for c_end in seq_end
    ]
    pd.DataFrame(
        [
            (
                res.primer.identifier,
                res.primer.sequence,
                res.matches,
                res.mismatches
            )
            for res in result
        ],
        columns = ["Identifier", "sequence", "matches", "mismatches"]
    ).to_csv(outfile)
@click.command()
@click.argument(
    'sequence_file',
    type=click.Path(exists=True),
)
@click.argument(
    'outfile',
    type=click.Path(),
)
@click.option(
    "--seq-type",
    default = "fasta",
    help="Input file type. Default: fasta",
    type=click.Choice(["fasta", "fastq"], case_sensitive=False)
)
@click.option(
    "--end",
    default = "5",
    help="Sequence end to search for primers. Default: 5",
    type=click.Choice(["5", "3"], case_sensitive=False)
)
@click.option(
    "--length",
    default = 20,
    help="Primer length",
    type=click.INT
)
def inspect(
    sequence_file: str,
    outfile:str,
    seq_type: str = "fasta",
    end:str = "5",
    length:int = 20
):
    """
    Compute base frequencies at each position of a sequence end.
    """
    # The docstring above is shown by click as the command help (it used to
    # be a copy of the one of the guess command), so the details live in
    # comments:
    #   sequence_file: path of the fasta/fastq file to read.
    #   outfile: path of the CSV that receives the frequency table.
    #   seq_type: "fasta" or "fastq" (case insensitive).
    #   end: "5" or "3".
    #   length: number of positions to inspect.
    seq_type = seq_type.lower()
    if seq_type == "fasta":
        seqs = SequenceCollection.from_fasta(sequence_file)
    elif seq_type == "fastq":
        seqs = SequenceCollection.from_fastq(sequence_file)
    else:
        sys.exit("Invalid sequence type")
    seq_end = end.lower()
    if seq_end == "5":
        seq_end = NucleicSequenceEnd.FIVEPRIME
    elif seq_end == "3":
        seq_end = NucleicSequenceEnd.THREEPRIME
    else:
        sys.exit("Invalid sequence end")
    guesser = PrimerGuesser(seq_end, length)
    freqs = guesser.inspect(seqs)
    freqs.to_csv(outfile)
Classes
class CountResult (primer: BioSeq, matches: int, mismatches: int)
-
CountResult(primer: xi_covutils.seqs.seq_collection.BioSeq, matches: int, mismatches: int)
Expand source code
@dataclass class CountResult: primer: BioSeq matches: int mismatches: int
Class variables
var matches : int
var mismatches : int
var primer : BioSeq
class EndSequenceExtractor (*args, **kwargs)
-
End sequence extractor abstract class.
Expand source code
@runtime_checkable class EndSequenceExtractor(Protocol): """ End sequence extractor abstract class. """ @abstractmethod def subsequence(self, seq: BioSeq) -> BioSeq: """ Extract a subsequence from a BioSeq. """ @staticmethod def build( end: NucleicSequenceEnd, subsequence_length: int ) -> "EndSequenceExtractor": """ Builds a 5' sequence extractor or a 3' sequence extractor. Args: end (NucleicSequenceEnd): _description_ subsequence_length (int): _description_ Returns: EndSequenceExtractor: _description_ """ if end == NucleicSequenceEnd.FIVEPRIME: return FivePrimeSequenceExtractor(subsequence_length) return ThreePrimeSequenceExtractor(subsequence_length)
Ancestors
- typing.Protocol
- typing.Generic
Subclasses
Static methods
def build(end: NucleicSequenceEnd, subsequence_length: int) ‑> EndSequenceExtractor
-
Builds a 5' sequence extractor or a 3' sequence extractor.
Args
end
:NucleicSequenceEnd
- The end of the sequence (5' or 3') the extractor takes the subsequence from.
subsequence_length
:int
- The length of the subsequence to extract.
Returns
EndSequenceExtractor
- A FivePrimeSequenceExtractor for the 5' end; otherwise a ThreePrimeSequenceExtractor.
Expand source code
@staticmethod def build( end: NucleicSequenceEnd, subsequence_length: int ) -> "EndSequenceExtractor": """ Builds a 5' sequence extractor or a 3' sequence extractor. Args: end (NucleicSequenceEnd): _description_ subsequence_length (int): _description_ Returns: EndSequenceExtractor: _description_ """ if end == NucleicSequenceEnd.FIVEPRIME: return FivePrimeSequenceExtractor(subsequence_length) return ThreePrimeSequenceExtractor(subsequence_length)
Methods
def subsequence(self, seq: BioSeq) ‑> BioSeq
-
Extract a subsequence from a BioSeq.
Expand source code
@abstractmethod def subsequence(self, seq: BioSeq) -> BioSeq: """ Extract a subsequence from a BioSeq. """
class FivePrimeSequenceExtractor (subsequence_length: int)
-
A 5' sequence extractor.
Expand source code
class FivePrimeSequenceExtractor(EndSequenceExtractor): """ A 5's sequence extractor. """ def __init__(self, subsequence_length:int): self.subsequence_length = subsequence_length @override def subsequence(self, seq: BioSeq) -> BioSeq: new_seq = seq.sequence[0:self.subsequence_length] return BioSeq(seq.identifier, new_seq, seq.alphabet)
Ancestors
- EndSequenceExtractor
- typing.Protocol
- typing.Generic
Methods
def subsequence(self, seq: BioSeq) ‑> BioSeq
-
Extract a subsequence from a BioSeq.
Expand source code
@override def subsequence(self, seq: BioSeq) -> BioSeq: new_seq = seq.sequence[0:self.subsequence_length] return BioSeq(seq.identifier, new_seq, seq.alphabet)
class NucleicSequenceEnd (value, names=None, *, module=None, qualname=None, type=None, start=1)
-
The termini of a DNA sequence.
Expand source code
class NucleicSequenceEnd(Enum): """ The termini of a DNA sequence. """ FIVEPRIME = 0 THREEPRIME = 1
Ancestors
- enum.Enum
Class variables
var FIVEPRIME
var THREEPRIME
class PrimerGuesser (end: NucleicSequenceEnd, primer_length: int, min_frq: float = 0.01)
-
Guess primer sequences from a collection of sequences.
The guessed primer will start at the 5' end or at the 3' end.
Expand source code
class PrimerGuesser: """ Guess primer sequences from a collection of sequences. The primer guessed will start a 5' end or 3' end. """ def __init__( self, end: NucleicSequenceEnd, primer_length:int, min_frq:float = 0.01 ): self.end = end self.primer_length = primer_length self.sequence_estractor = EndSequenceExtractor.build(end, primer_length) if min_frq <0 or min_frq >=1: raise ValueError(f"min_frq should be in range [0, 1): {min_frq}") self.min_frq = min_frq def guess(self, sequences: SequenceCollection) -> BioSeq: """ Guess a primer Sequence. Args: sequences (SequenceCollection): A collection of sequences. Returns: BioSeq: A BioSeq sequence representing the primer. """ bases_df = ( self .inspect(sequences) .apply(lambda x: x.mask(x<self.min_frq, 0)) .apply(lambda x: x.mask(x>0, 1), axis=1) .apply(lambda x: x * np.array([1, 2, 4, 8]), axis=1) .apply(lambda x: x.sum()-1, axis=1) .apply(lambda x: rev_indexes[int(x)]) .tolist() ) seq = "".join(bases_df) if self.end == NucleicSequenceEnd.THREEPRIME: return BioSeq("primer", seq, BioAlphabet.DNA).reverse_complement() return BioSeq("primer", seq, BioAlphabet.DNA) def inspect(self, sequences: SequenceCollection) -> pd.DataFrame: """ Computes bases frequencies for each position of the guessed primer. Args: sequences (SequenceCollection): A collection of sequences. Returns: pd.DataFrame: A Dataframe with the frequencies of each base at each position. """ seqs = [ list(self.sequence_estractor.subsequence(s).sequence) for s in sequences ] df = ( pd.DataFrame( seqs ) ) result = ( df.apply( lambda x: pd.Series( np.concatenate(deg_bases[x.tolist()].to_numpy()) .flatten() ) .value_counts(), axis=0 ) .fillna(0) .transpose() .apply(lambda x: x / sum(x), axis=1) ) return cast(pd.DataFrame, result)
Methods
def guess(self, sequences: SequenceCollection) ‑> BioSeq
-
Guess a primer Sequence.
Args
sequences
:SequenceCollection
- A collection of sequences.
Returns
BioSeq
- A BioSeq sequence representing the primer.
Expand source code
def guess(self, sequences: SequenceCollection) -> BioSeq: """ Guess a primer Sequence. Args: sequences (SequenceCollection): A collection of sequences. Returns: BioSeq: A BioSeq sequence representing the primer. """ bases_df = ( self .inspect(sequences) .apply(lambda x: x.mask(x<self.min_frq, 0)) .apply(lambda x: x.mask(x>0, 1), axis=1) .apply(lambda x: x * np.array([1, 2, 4, 8]), axis=1) .apply(lambda x: x.sum()-1, axis=1) .apply(lambda x: rev_indexes[int(x)]) .tolist() ) seq = "".join(bases_df) if self.end == NucleicSequenceEnd.THREEPRIME: return BioSeq("primer", seq, BioAlphabet.DNA).reverse_complement() return BioSeq("primer", seq, BioAlphabet.DNA)
def inspect(self, sequences: SequenceCollection) ‑> pandas.core.frame.DataFrame
-
Computes bases frequencies for each position of the guessed primer.
Args
sequences
:SequenceCollection
- A collection of sequences.
Returns
pd.DataFrame
- A Dataframe with the frequencies of each base at each position.
Expand source code
def inspect(self, sequences: SequenceCollection) -> pd.DataFrame: """ Computes bases frequencies for each position of the guessed primer. Args: sequences (SequenceCollection): A collection of sequences. Returns: pd.DataFrame: A Dataframe with the frequencies of each base at each position. """ seqs = [ list(self.sequence_estractor.subsequence(s).sequence) for s in sequences ] df = ( pd.DataFrame( seqs ) ) result = ( df.apply( lambda x: pd.Series( np.concatenate(deg_bases[x.tolist()].to_numpy()) .flatten() ) .value_counts(), axis=0 ) .fillna(0) .transpose() .apply(lambda x: x / sum(x), axis=1) ) return cast(pd.DataFrame, result)
class PrimerMatchCounter
-
Count matches and mismatches of a primer to a collection of sequences.
Expand source code
class PrimerMatchCounter: """ Count matches and mismatches of a primer to a collection of sequences. """ def count( self, sequences:SequenceCollection, primer:BioSeq, end: NucleicSequenceEnd ) -> CountResult: """ Count matches and mismatches of a primer to a collection of sequences. Args: sequences (SequenceCollection): A Sequence collection primer (BioSeq): A primer to check end (NucleicSequenceEnd): The end of the sequence to check the primer. Returns: CountResult: The result of the counts. """ if end == NucleicSequenceEnd.THREEPRIME: primer = primer.reverse_complement() guess_seq = list(primer.sequence) sequence_extractor = EndSequenceExtractor.build( subsequence_length = len(guess_seq), end=end ) seqs = [ list(sequence_extractor.subsequence(s).sequence) for s in sequences ] deg_matrix = pd.DataFrame( [ reversed([int(x) for x in (f"{i+1:04b}")]) for i in range(15) ], columns = list("ACGT"), index = rev_indexes ) guessed_matrix = deg_matrix.loc[guess_seq, :].to_numpy() matches = sum( ( deg_matrix .loc[seq, :] .to_numpy() * guessed_matrix ) .sum(axis=1) .prod() for seq in seqs ) mismatches = len(sequences) - matches return CountResult(primer, matches, mismatches) def histogram( self, sequences:SequenceCollection, primer:BioSeq, end: NucleicSequenceEnd ) -> pd.Series: """ Makes an histogram of mismatches of a primer to a collection of sequences. Args: sequences (SequenceCollection): A Sequence collection primer (BioSeq): A primer to check end (NucleicSequenceEnd): The end of the sequence to check the primer. Returns: CountResult: The result of the counts. 
""" if end == NucleicSequenceEnd.THREEPRIME: primer = primer.reverse_complement() guess_seq = list(primer.sequence) sequence_extractor = EndSequenceExtractor.build( subsequence_length = len(guess_seq), end=end ) seqs = [ list(sequence_extractor.subsequence(s).sequence) for s in sequences ] deg_matrix = pd.DataFrame( [ reversed([int(x) for x in (f"{i+1:04b}")]) for i in range(15) ], columns = list("ACGT"), index = rev_indexes ) guessed_matrix = deg_matrix.loc[guess_seq, :].to_numpy() mismatches = [ np.count_nonzero( ( deg_matrix .loc[seq, :] .to_numpy() * guessed_matrix ) .sum(axis=1) == 0 ) for seq in seqs ] return pd.Series(mismatches).value_counts()
Methods
def count(self, sequences: SequenceCollection, primer: BioSeq, end: NucleicSequenceEnd) ‑> CountResult
-
Count matches and mismatches of a primer to a collection of sequences.
Args
sequences
:SequenceCollection
- A Sequence collection
primer
:BioSeq
- A primer to check
end
:NucleicSequenceEnd
- The end of the sequence to check the primer.
Returns
CountResult
- The result of the counts.
Expand source code
def count( self, sequences:SequenceCollection, primer:BioSeq, end: NucleicSequenceEnd ) -> CountResult: """ Count matches and mismatches of a primer to a collection of sequences. Args: sequences (SequenceCollection): A Sequence collection primer (BioSeq): A primer to check end (NucleicSequenceEnd): The end of the sequence to check the primer. Returns: CountResult: The result of the counts. """ if end == NucleicSequenceEnd.THREEPRIME: primer = primer.reverse_complement() guess_seq = list(primer.sequence) sequence_extractor = EndSequenceExtractor.build( subsequence_length = len(guess_seq), end=end ) seqs = [ list(sequence_extractor.subsequence(s).sequence) for s in sequences ] deg_matrix = pd.DataFrame( [ reversed([int(x) for x in (f"{i+1:04b}")]) for i in range(15) ], columns = list("ACGT"), index = rev_indexes ) guessed_matrix = deg_matrix.loc[guess_seq, :].to_numpy() matches = sum( ( deg_matrix .loc[seq, :] .to_numpy() * guessed_matrix ) .sum(axis=1) .prod() for seq in seqs ) mismatches = len(sequences) - matches return CountResult(primer, matches, mismatches)
def histogram(self, sequences: SequenceCollection, primer: BioSeq, end: NucleicSequenceEnd) ‑> pandas.core.series.Series
-
Makes a histogram of mismatches of a primer to a collection of sequences.
Args
sequences
:SequenceCollection
- A Sequence collection
primer
:BioSeq
- A primer to check
end
:NucleicSequenceEnd
- The end of the sequence to check the primer.
Returns
pd.Series
- The number of sequences found for each number of mismatched positions.
Expand source code
def histogram( self, sequences:SequenceCollection, primer:BioSeq, end: NucleicSequenceEnd ) -> pd.Series: """ Makes an histogram of mismatches of a primer to a collection of sequences. Args: sequences (SequenceCollection): A Sequence collection primer (BioSeq): A primer to check end (NucleicSequenceEnd): The end of the sequence to check the primer. Returns: CountResult: The result of the counts. """ if end == NucleicSequenceEnd.THREEPRIME: primer = primer.reverse_complement() guess_seq = list(primer.sequence) sequence_extractor = EndSequenceExtractor.build( subsequence_length = len(guess_seq), end=end ) seqs = [ list(sequence_extractor.subsequence(s).sequence) for s in sequences ] deg_matrix = pd.DataFrame( [ reversed([int(x) for x in (f"{i+1:04b}")]) for i in range(15) ], columns = list("ACGT"), index = rev_indexes ) guessed_matrix = deg_matrix.loc[guess_seq, :].to_numpy() mismatches = [ np.count_nonzero( ( deg_matrix .loc[seq, :] .to_numpy() * guessed_matrix ) .sum(axis=1) == 0 ) for seq in seqs ] return pd.Series(mismatches).value_counts()
class ThreePrimeSequenceExtractor (subsequence_length: int)
-
A 3' sequence extractor.
Expand source code
class ThreePrimeSequenceExtractor(EndSequenceExtractor): """ A 3's sequence extractor. """ def __init__(self, subsequence_length:int): self.subsequence_length = subsequence_length @override def subsequence(self, seq: BioSeq) -> BioSeq: new_seq = seq.sequence[-self.subsequence_length:] return BioSeq(seq.identifier, new_seq, seq.alphabet)
Ancestors
- EndSequenceExtractor
- typing.Protocol
- typing.Generic
Methods
def subsequence(self, seq: BioSeq) ‑> BioSeq
-
Extract a subsequence from a BioSeq.
Expand source code
@override def subsequence(self, seq: BioSeq) -> BioSeq: new_seq = seq.sequence[-self.subsequence_length:] return BioSeq(seq.identifier, new_seq, seq.alphabet)