Module xi_covutils.seqs.seq_collection

Sequence collections

Expand source code
"""
Sequence collections
"""
from dataclasses import dataclass
from enum import Enum
from typing import (
  Callable, Dict, Iterable, Iterator, List, Optional, TypeVar, cast
)
from abc import abstractmethod
import warnings

from Bio.Seq import reverse_complement, reverse_complement_rna

from xi_covutils import fastq, msa

class BioAlphabet(Enum):
  """
  Enumeration of the biomolecule alphabets a sequence can be written in.
  """
  # Nucleotide alphabets.
  DNA = 0
  RNA = 1
  # Amino acid alphabet.
  PROTEIN = 2
  # Used when the kind of molecule is not known.
  UNKNOWN = 3

@dataclass
class BioSeq:
  """
  A generic biological sequence with an identifier.

  Attributes:
    identifier (str): The identifier of the sequence.
    sequence (str): The content of the sequence.
    alphabet (BioAlphabet): The alphabet of the sequence.
  """
  identifier: str
  sequence: str
  alphabet: BioAlphabet
  def __eq__(self, other: object) -> bool:
    """
    Compares two sequences by identifier, sequence and alphabet.

    Args:
      other (object): The object to compare with.

    Returns:
      bool: True if other is a BioSeq with the same three fields.
    """
    if not isinstance(other, BioSeq):
      return False
    return (
      self.identifier == other.identifier and
      self.sequence == other.sequence and
      self.alphabet == other.alphabet
    )
  @staticmethod
  def unknownBioSeq(identifier:str, sequence:str) -> "BioSeq":
    """
    Creates a sequence with the UNKNOWN alphabet.

    Args:
      identifier (str): The identifier of the sequence.
      sequence (str): The content of the sequence.

    Returns:
      BioSeq: The new sequence.
    """
    return BioSeq(identifier, sequence, BioAlphabet.UNKNOWN)
  def reverse_complement(self) -> "BioSeq":
    """
    Builds the reverse complement of a nucleotide sequence.

    Returns:
      BioSeq: A new sequence with the same identifier and alphabet.

    Raises:
      ValueError: If the alphabet is PROTEIN or UNKNOWN.
    """
    # Inside this method, `reverse_complement` and `reverse_complement_rna`
    # are the module level functions imported from Bio.Seq.
    if self.alphabet == BioAlphabet.DNA:
      return BioSeq(
        identifier=self.identifier,
        sequence=str(reverse_complement(self.sequence)),
        alphabet=self.alphabet
      )
    if self.alphabet == BioAlphabet.RNA:
      # RNA must use the RNA specific function, so that the result is
      # written with U instead of T.
      return BioSeq(
        identifier=self.identifier,
        sequence=str(reverse_complement_rna(self.sequence)),
        alphabet=self.alphabet
      )
    if self.alphabet == BioAlphabet.PROTEIN:
      raise ValueError(
        "Protein sequences cannot have reverse complement sequence."
      )
    raise ValueError(
      "Unknown type sequences cannot have reverse complement sequence."
    )


# Generic type of the values produced by the `apply` and `fold` methods.
T = TypeVar("T")
class NonConsumableSeqCol:
  """
  Declares the operations of a sequence collection that has a length and
  accepts new sequences.
  """
  # NOTE: this class does not use ABCMeta by itself, so the abstract
  # markers are only enforced in subclasses whose metaclass is ABCMeta.
  @abstractmethod
  def __len__(self):
    """
    Gives the number of sequences in the collection.
    """
  @abstractmethod
  def append(self, seq: BioSeq):
    """
    Adds a sequence to the collection.

    Args:
      seq (BioSeq): The sequence to add.
    """

class AbstractIterableSeqCol(Iterator[BioSeq]):
  """
  A Collection of ordered biological sequences.

  The collection is its own iterator: `__iter__` restarts the traversal
  of `self.bioseqs` and `__next__` advances it. The transforming methods
  return lazy results that iterate over this collection when consumed.
  """
  def __init__(self):
    self.bioseqs = []
    self.iterator = None
  def __iter__(self) -> Iterator[BioSeq]:
    """
    Restarts the traversal of the stored sequences.

    Returns:
      Iterator[BioSeq]: This same object.
    """
    self.iterator = iter(self.bioseqs)
    return self
  def __next__(self) -> BioSeq:
    """
    Gives the next sequence of the current traversal.

    Returns:
      BioSeq: The next sequence.

    Raises:
      StopIteration: When there are no more sequences.
    """
    # Start the traversal on demand, so next() also works when iter() was
    # not called before. An assert is not used because it is removed when
    # python runs with -O.
    if self.iterator is None:
      self.iterator = iter(self.bioseqs)
    return next(self.iterator)
  def map(self, func: Callable[[BioSeq], BioSeq]) -> "ConsumableSeqCol":
    """
    Transforms every sequence with a function.

    Args:
      func (Callable[[BioSeq], BioSeq]): The transforming function.

    Returns:
      ConsumableSeqCol: A lazy collection with the transformed sequences.
    """
    return ConsumableSeqCol(
      func(bioseq) for bioseq in self
    )
  def map_with_index(self, func: Callable[[int, BioSeq], BioSeq]) -> "ConsumableSeqCol":
    """
    Transforms every sequence with a function that also receives the
    position of the sequence, starting at zero.

    Args:
      func (Callable[[int, BioSeq], BioSeq]): The transforming function.

    Returns:
      ConsumableSeqCol: A lazy collection with the transformed sequences.
    """
    return ConsumableSeqCol(
      func(i, bioseq) for i, bioseq in enumerate(self)
    )
  def filter(self, func: Callable[[BioSeq], bool]) -> "ConsumableSeqCol":
    """
    Keeps the sequences accepted by a predicate.

    Args:
      func (Callable[[BioSeq], bool]): The predicate function.

    Returns:
      ConsumableSeqCol: A lazy collection with the accepted sequences.
    """
    return ConsumableSeqCol(
      bioseq for bioseq in self
      if func(bioseq)
    )
  def apply(self, func: Callable[[BioSeq], T]) -> Iterable[T]:
    """
    Computes a value for every sequence.

    Args:
      func (Callable[[BioSeq], T]): The function to evaluate.

    Returns:
      Iterable[T]: A lazy iterable with the computed values.
    """
    return (func(bioseq) for bioseq in self)
  def fold(self, func: Callable[[T, BioSeq], T], initial: T) -> T:
    """
    Combines all sequences into a single value, from first to last.

    Args:
      func (Callable[[T, BioSeq], T]): Receives the accumulated value and
        a sequence, and returns the new accumulated value.
      initial (T): The starting accumulated value.

    Returns:
      T: The final accumulated value, or initial if there are no sequences.
    """
    acc = initial
    for bioseq in self:
      acc = func(acc, bioseq)
    return acc
  def reduce(
    self,
    func: Callable[["SequenceCollection", BioSeq], "SequenceCollection"]
  ) -> "SequenceCollection":
    """
    Folds the sequences into a SequenceCollection that starts empty.

    Args:
      func (Callable): Receives the accumulated collection and a sequence,
        and returns the new accumulated collection.

    Returns:
      SequenceCollection: The final accumulated collection.
    """
    result = self.fold(func, SequenceCollection([]))
    return result
  def flat_map(self, func: Callable[[BioSeq], Iterable["SequenceCollection"]]) -> "ConsumableSeqCol":
    """
    Expands every sequence into several sequences and flattens the result.

    Args:
      func (Callable): Receives a sequence and returns an iterable of
        collections of sequences.

    Returns:
      ConsumableSeqCol: A lazy collection with all the sequences found in
        the collections returned by func, in order.
    """
    return ConsumableSeqCol(
      bioseq2
      for bioseq in self
      for bio_iter in func(bioseq)
      for bioseq2 in bio_iter
    )

class SequenceCollection(
  AbstractIterableSeqCol,
  NonConsumableSeqCol
):
  """
  A Collection of ordered biological sequences.

  Sequences are stored in a list, must have unique identifiers and must
  all share the same alphabet.
  """
  def __init__(self, seqs: List[BioSeq]):
    """
    Creates a collection from a list of sequences.

    Args:
      seqs (List[BioSeq]): The sequences to store. The list is kept by
        reference, it is not copied.

    Raises:
      ValueError: If two sequences share an identifier or if the
        sequences do not all have the same alphabet.
    """
    self.bioseqs: List[BioSeq] = seqs
    # Maps each identifier to the position of its sequence in bioseqs.
    self.indexes: Dict[str, int] = {
      bs.identifier: i
      for i, bs in enumerate(self.bioseqs)
    }
    # This has to be an annotation. Assigning Optional[...] itself would
    # store a typing object as the iterator.
    self.iterator: Optional[Iterator[BioSeq]] = None
    # Repeated identifiers collapse into a single key of indexes.
    if len(self.bioseqs) != len(self.indexes):
      print(f"There are {len(self.bioseqs)} sequences and {len(self.indexes)} indexes")
      print(self.indexes)
      raise ValueError("Sequences has repeated identifiers")
    if len({s.alphabet for s in self.bioseqs}) > 1:
      raise ValueError("Sequences differ in alphabet")
  def append(self, seq: BioSeq):
    """
    Adds a sequence at the end of the collection.

    Args:
      seq (BioSeq): The sequence to add.

    Returns:
      SequenceCollection: This same collection, to allow chaining.

    Raises:
      ValueError: If the identifier is already present or the alphabet
        differs from the alphabet of the stored sequences.
    """
    if seq.identifier in self.indexes:
      raise ValueError("Sequence has repeated identifier")
    if len(self) > 0 and seq.alphabet != self.bioseqs[0].alphabet:
      raise ValueError("New sequence has different alphabet")
    # The length before appending is the zero based position of the new
    # sequence, consistent with the indexes built in __init__.
    self.indexes[seq.identifier] = len(self.bioseqs)
    self.bioseqs.append(seq)
    return self
  def __len__(self) -> int:
    """
    Gives the number of sequences in the collection.
    """
    return len(self.bioseqs)
  def __eq__(self, other: object) -> bool:
    """
    Two collections are equal if they have the same sequences in the
    same order.
    """
    if not isinstance(other, SequenceCollection):
      return False
    if not len(self) == len(other):
      return False
    return all(
      s1 == s2 for s1, s2 in zip(
        self.bioseqs,
        other.bioseqs
      )
    )
  def __str__(self) -> str:
    """
    Gives a short description with the size of the collection.
    """
    return f"SequenceCollection[Size:{len(self)}]"
  def __repr__(self) -> str:
    """
    Represents the collection using at most its first 20 sequences.

    A warning is issued when the collection has more sequences than
    that, because the representation is truncated.
    """
    max_seqs_to_repr = 20
    # NOTE(review): the DeprecationWarning category is kept as it was for
    # compatibility, although this is not a deprecation.
    if len(self) > max_seqs_to_repr:
      warnings.warn(
        "SequenceCollection too large to be represented. Using 20",
        DeprecationWarning
      )
    seqs = [
      repr(s)
      for s in self.bioseqs[:max_seqs_to_repr]
    ]
    return f"SequenceCollection({repr(seqs)})"
  @staticmethod
  def from_fasta(
    fasta_file:str,
    alphabet: BioAlphabet = BioAlphabet.UNKNOWN
  ) -> "SequenceCollection":
    """
    Generates A Sequence Collection from a fasta file.

    Args:
      fasta_file (str): The input fasta file.
      alphabet (BioAlphabet, optional): The alphabet of the sequences.
        Defaults to BioAlphabet.UNKNOWN.

    Returns:
      SequenceCollection: The resulting sequence collection.
    """
    seqs = [
      BioSeq(sid, seq, alphabet)
      for (sid, seq) in msa.read_msa(fasta_file)
    ]
    return SequenceCollection(seqs)
  @staticmethod
  def from_fastq(
    fastq_file:str,
    alphabet: BioAlphabet = BioAlphabet.UNKNOWN
  ) -> "SequenceCollection":
    """
    Generates A Sequence Collection from a fastq file.

    Args:
      fastq_file (str): The input fastq file.
      alphabet (BioAlphabet, optional): The alphabet of the sequences.
        Defaults to BioAlphabet.UNKNOWN.

    Returns:
      SequenceCollection: The resulting sequence collection.
    """
    reader = fastq.FastqReader()
    seqs = [
      BioSeq(entry.identifier, entry.sequence, alphabet)
      for entry in reader.read_fastq_from_file(fastq_file)
    ]
    return SequenceCollection(seqs)
  def to_fasta(self, outfile:str):
    """
    Exports the sequence collection to a fasta file.

    Args:
      outfile (str): The output file.
    """
    seqs = [(x.identifier, x.sequence) for x in self]
    msa.write_msa(seqs, outfile)


class ConsumableSeqCol(AbstractIterableSeqCol):
  """
  A collection of sequences backed by any iterable.

  When the iterable can be traversed only once, as a generator, the
  collection can be traversed only once too.
  """
  def __init__(self, data:Iterable[BioSeq]):
    """
    Wraps an iterable of sequences.

    Args:
      data (Iterable[BioSeq]): The source of the sequences.
    """
    self.iterator = None
    self.bioseqs = data
  def collect(self) -> "SequenceCollection":
    """
    Reads all the sequences into a SequenceCollection.

    Returns:
      SequenceCollection: A collection with all the sequences.
    """
    return SequenceCollection([bioseq for bioseq in self])


def filter_by_identifier(allowed_identifiers:set[str]) -> Callable[[BioSeq], bool]:
  """
  Builds a predicate that accepts sequences with an allowed identifier.

  Args:
    allowed_identifiers (set[str]): The identifiers to accept.

  Returns:
    Callable[[BioSeq], bool]: A function that gives True when the
      identifier of a sequence is in allowed_identifiers.
  """
  def _has_allowed_identifier(bioseq:BioSeq) -> bool:
    return bioseq.identifier in allowed_identifiers
  return _has_allowed_identifier

Functions

def filter_by_identifier(allowed_identifiers: set) ‑> Callable[[BioSeq], bool]
Expand source code
def filter_by_identifier(allowed_identifiers:set[str]) -> Callable[[BioSeq], bool]:
  """
  Builds a predicate that accepts sequences with an allowed identifier.

  Args:
    allowed_identifiers (set[str]): The identifiers to accept.

  Returns:
    Callable[[BioSeq], bool]: A function that gives True when the
      identifier of a sequence is in allowed_identifiers.
  """
  def _has_allowed_identifier(bioseq:BioSeq) -> bool:
    return bioseq.identifier in allowed_identifiers
  return _has_allowed_identifier

Classes

class AbstractIterableSeqCol

A Collection of ordered biological sequences.

Expand source code
class AbstractIterableSeqCol(Iterator[BioSeq]):
  """
  A Collection of ordered biological sequences.

  The collection is its own iterator: `__iter__` restarts the traversal
  of `self.bioseqs` and `__next__` advances it. The transforming methods
  return lazy results that iterate over this collection when consumed.
  """
  def __init__(self):
    self.bioseqs = []
    self.iterator = None
  def __iter__(self) -> Iterator[BioSeq]:
    """
    Restarts the traversal of the stored sequences.

    Returns:
      Iterator[BioSeq]: This same object.
    """
    self.iterator = iter(self.bioseqs)
    return self
  def __next__(self) -> BioSeq:
    """
    Gives the next sequence of the current traversal.

    Returns:
      BioSeq: The next sequence.

    Raises:
      StopIteration: When there are no more sequences.
    """
    # Start the traversal on demand, so next() also works when iter() was
    # not called before. An assert is not used because it is removed when
    # python runs with -O.
    if self.iterator is None:
      self.iterator = iter(self.bioseqs)
    return next(self.iterator)
  def map(self, func: Callable[[BioSeq], BioSeq]) -> "ConsumableSeqCol":
    """
    Transforms every sequence with a function.

    Args:
      func (Callable[[BioSeq], BioSeq]): The transforming function.

    Returns:
      ConsumableSeqCol: A lazy collection with the transformed sequences.
    """
    return ConsumableSeqCol(
      func(bioseq) for bioseq in self
    )
  def map_with_index(self, func: Callable[[int, BioSeq], BioSeq]) -> "ConsumableSeqCol":
    """
    Transforms every sequence with a function that also receives the
    position of the sequence, starting at zero.

    Args:
      func (Callable[[int, BioSeq], BioSeq]): The transforming function.

    Returns:
      ConsumableSeqCol: A lazy collection with the transformed sequences.
    """
    return ConsumableSeqCol(
      func(i, bioseq) for i, bioseq in enumerate(self)
    )
  def filter(self, func: Callable[[BioSeq], bool]) -> "ConsumableSeqCol":
    """
    Keeps the sequences accepted by a predicate.

    Args:
      func (Callable[[BioSeq], bool]): The predicate function.

    Returns:
      ConsumableSeqCol: A lazy collection with the accepted sequences.
    """
    return ConsumableSeqCol(
      bioseq for bioseq in self
      if func(bioseq)
    )
  def apply(self, func: Callable[[BioSeq], T]) -> Iterable[T]:
    """
    Computes a value for every sequence.

    Args:
      func (Callable[[BioSeq], T]): The function to evaluate.

    Returns:
      Iterable[T]: A lazy iterable with the computed values.
    """
    return (func(bioseq) for bioseq in self)
  def fold(self, func: Callable[[T, BioSeq], T], initial: T) -> T:
    """
    Combines all sequences into a single value, from first to last.

    Args:
      func (Callable[[T, BioSeq], T]): Receives the accumulated value and
        a sequence, and returns the new accumulated value.
      initial (T): The starting accumulated value.

    Returns:
      T: The final accumulated value, or initial if there are no sequences.
    """
    acc = initial
    for bioseq in self:
      acc = func(acc, bioseq)
    return acc
  def reduce(
    self,
    func: Callable[["SequenceCollection", BioSeq], "SequenceCollection"]
  ) -> "SequenceCollection":
    """
    Folds the sequences into a SequenceCollection that starts empty.

    Args:
      func (Callable): Receives the accumulated collection and a sequence,
        and returns the new accumulated collection.

    Returns:
      SequenceCollection: The final accumulated collection.
    """
    result = self.fold(func, SequenceCollection([]))
    return result
  def flat_map(self, func: Callable[[BioSeq], Iterable["SequenceCollection"]]) -> "ConsumableSeqCol":
    """
    Expands every sequence into several sequences and flattens the result.

    Args:
      func (Callable): Receives a sequence and returns an iterable of
        collections of sequences.

    Returns:
      ConsumableSeqCol: A lazy collection with all the sequences found in
        the collections returned by func, in order.
    """
    return ConsumableSeqCol(
      bioseq2
      for bioseq in self
      for bio_iter in func(bioseq)
      for bioseq2 in bio_iter
    )

Ancestors

  • collections.abc.Iterator
  • collections.abc.Iterable
  • typing.Generic

Subclasses

Methods

def apply(self, func: Callable[[BioSeq], ~T]) ‑> Iterable[~T]
Expand source code
def apply(self, func: Callable[[BioSeq], T]) -> Iterable[T]:
  """
  Computes a value for every sequence.

  Args:
    func (Callable[[BioSeq], T]): The function to evaluate.

  Returns:
    Iterable[T]: A lazy iterable with the computed values.
  """
  values = (func(seq) for seq in self)
  return values
def filter(self, func: Callable[[BioSeq], bool]) ‑> ConsumableSeqCol
Expand source code
def filter(self, func: Callable[[BioSeq], bool]) -> "ConsumableSeqCol":
  """
  Keeps the sequences accepted by a predicate.

  Args:
    func (Callable[[BioSeq], bool]): The predicate function.

  Returns:
    ConsumableSeqCol: A lazy collection with the accepted sequences.
  """
  accepted = (seq for seq in self if func(seq))
  return ConsumableSeqCol(accepted)
def flat_map(self, func: Callable[[BioSeq], Iterable[ForwardRef('SequenceCollection')]]) ‑> ConsumableSeqCol
Expand source code
def flat_map(self, func: Callable[[BioSeq], Iterable["SequenceCollection"]]) -> "ConsumableSeqCol":
  """
  Expands every sequence into several sequences and flattens the result.

  Args:
    func (Callable): Receives a sequence and returns an iterable of
      collections of sequences.

  Returns:
    ConsumableSeqCol: A lazy collection with all the sequences found in
      the collections returned by func, in order.
  """
  flattened = (
    inner_seq
    for seq in self
    for seq_col in func(seq)
    for inner_seq in seq_col
  )
  return ConsumableSeqCol(flattened)
def fold(self, func: Callable[[~T, BioSeq], ~T], initial: ~T) ‑> ~T
Expand source code
def fold(self, func: Callable[[T, BioSeq], T], initial: T) -> T:
  """
  Combines all sequences into a single value, from first to last.

  Args:
    func (Callable[[T, BioSeq], T]): Receives the accumulated value and a
      sequence, and returns the new accumulated value.
    initial (T): The starting accumulated value.

  Returns:
    T: The final accumulated value, or initial if there are no sequences.
  """
  result = initial
  for seq in self:
    result = func(result, seq)
  return result
def map(self, func: Callable[[BioSeq], BioSeq]) ‑> ConsumableSeqCol
Expand source code
def map(self, func: Callable[[BioSeq], BioSeq]) -> "ConsumableSeqCol":
  """
  Transforms every sequence with a function.

  Args:
    func (Callable[[BioSeq], BioSeq]): The transforming function.

  Returns:
    ConsumableSeqCol: A lazy collection with the transformed sequences.
  """
  transformed = (func(seq) for seq in self)
  return ConsumableSeqCol(transformed)
def map_with_index(self, func: Callable[[int, BioSeq], BioSeq]) ‑> ConsumableSeqCol
Expand source code
def map_with_index(self, func: Callable[[int, BioSeq], BioSeq]) -> "ConsumableSeqCol":
  """
  Transforms every sequence with a function that also receives the
  position of the sequence, starting at zero.

  Args:
    func (Callable[[int, BioSeq], BioSeq]): The transforming function.

  Returns:
    ConsumableSeqCol: A lazy collection with the transformed sequences.
  """
  transformed = (func(position, seq) for position, seq in enumerate(self))
  return ConsumableSeqCol(transformed)
def reduce(self, func: Callable[[ForwardRef('SequenceCollection'), BioSeq], ForwardRef('SequenceCollection')]) ‑> SequenceCollection
Expand source code
def reduce(
  self,
  func: Callable[["SequenceCollection", BioSeq], "SequenceCollection"]
) -> "SequenceCollection":
  """
  Folds the sequences into a SequenceCollection that starts empty.

  Args:
    func (Callable): Receives the accumulated collection and a sequence,
      and returns the new accumulated collection.

  Returns:
    SequenceCollection: The final accumulated collection.
  """
  empty_collection = SequenceCollection([])
  return self.fold(func, empty_collection)
class BioAlphabet (value, names=None, *, module=None, qualname=None, type=None, start=1)

Represents alphabets of biomolecules.

Expand source code
class BioAlphabet(Enum):
  """
  Enumeration of the biomolecule alphabets a sequence can be written in.
  """
  # Nucleotide alphabets.
  DNA = 0
  RNA = 1
  # Amino acid alphabet.
  PROTEIN = 2
  # Used when the kind of molecule is not known.
  UNKNOWN = 3

Ancestors

  • enum.Enum

Class variables

var DNA
var PROTEIN
var RNA
var UNKNOWN
class BioSeq (identifier: str, sequence: str, alphabet: BioAlphabet)

A generic biological sequence with an identifier.

Expand source code
@dataclass
class BioSeq:
  """
  A generic biological sequence with an identifier.

  Attributes:
    identifier (str): The identifier of the sequence.
    sequence (str): The content of the sequence.
    alphabet (BioAlphabet): The alphabet of the sequence.
  """
  identifier: str
  sequence: str
  alphabet: BioAlphabet
  def __eq__(self, other: object) -> bool:
    """
    Compares two sequences by identifier, sequence and alphabet.

    Args:
      other (object): The object to compare with.

    Returns:
      bool: True if other is a BioSeq with the same three fields.
    """
    if not isinstance(other, BioSeq):
      return False
    return (
      self.identifier == other.identifier and
      self.sequence == other.sequence and
      self.alphabet == other.alphabet
    )
  @staticmethod
  def unknownBioSeq(identifier:str, sequence:str) -> "BioSeq":
    """
    Creates a sequence with the UNKNOWN alphabet.

    Args:
      identifier (str): The identifier of the sequence.
      sequence (str): The content of the sequence.

    Returns:
      BioSeq: The new sequence.
    """
    return BioSeq(identifier, sequence, BioAlphabet.UNKNOWN)
  def reverse_complement(self) -> "BioSeq":
    """
    Builds the reverse complement of a nucleotide sequence.

    Returns:
      BioSeq: A new sequence with the same identifier and alphabet.

    Raises:
      ValueError: If the alphabet is PROTEIN or UNKNOWN.
    """
    # Inside this method, `reverse_complement` and `reverse_complement_rna`
    # are the module level functions imported from Bio.Seq.
    if self.alphabet == BioAlphabet.DNA:
      return BioSeq(
        identifier=self.identifier,
        sequence=str(reverse_complement(self.sequence)),
        alphabet=self.alphabet
      )
    if self.alphabet == BioAlphabet.RNA:
      # RNA must use the RNA specific function, so that the result is
      # written with U instead of T.
      return BioSeq(
        identifier=self.identifier,
        sequence=str(reverse_complement_rna(self.sequence)),
        alphabet=self.alphabet
      )
    if self.alphabet == BioAlphabet.PROTEIN:
      raise ValueError(
        "Protein sequences cannot have reverse complement sequence."
      )
    raise ValueError(
      "Unknown type sequences cannot have reverse complement sequence."
    )

Class variables

var alphabet : BioAlphabet
var identifier : str
var sequence : str

Static methods

def unknownBioSeq(identifier: str, sequence: str) ‑> BioSeq
Expand source code
@staticmethod
def unknownBioSeq(identifier:str, sequence:str) -> "BioSeq":
  """
  Creates a sequence with the UNKNOWN alphabet.

  Args:
    identifier (str): The identifier of the sequence.
    sequence (str): The content of the sequence.

  Returns:
    BioSeq: The new sequence.
  """
  return BioSeq(
    identifier=identifier,
    sequence=sequence,
    alphabet=BioAlphabet.UNKNOWN
  )

Methods

def reverse_complement(self) ‑> BioSeq
Expand source code
def reverse_complement(self) -> "BioSeq":
  """
  Builds the reverse complement of a nucleotide sequence.

  Returns:
    BioSeq: A new sequence with the same identifier and alphabet.

  Raises:
    ValueError: If the alphabet is PROTEIN or UNKNOWN.
  """
  # Inside this method, `reverse_complement` and `reverse_complement_rna`
  # are the module level functions imported from Bio.Seq.
  if self.alphabet == BioAlphabet.DNA:
    return BioSeq(
      identifier=self.identifier,
      sequence=str(reverse_complement(self.sequence)),
      alphabet=self.alphabet
    )
  if self.alphabet == BioAlphabet.RNA:
    # RNA must use the RNA specific function, so that the result is
    # written with U instead of T.
    return BioSeq(
      identifier=self.identifier,
      sequence=str(reverse_complement_rna(self.sequence)),
      alphabet=self.alphabet
    )
  if self.alphabet == BioAlphabet.PROTEIN:
    raise ValueError(
      "Protein sequences cannot have reverse complement sequence."
    )
  raise ValueError(
    "Unknown type sequences cannot have reverse complement sequence."
  )
class ConsumableSeqCol (data: Iterable[BioSeq])

A Collection of ordered biological sequences.

Expand source code
class ConsumableSeqCol(AbstractIterableSeqCol):
  """
  A collection of sequences backed by any iterable.

  When the iterable can be traversed only once, as a generator, the
  collection can be traversed only once too.
  """
  def __init__(self, data:Iterable[BioSeq]):
    """
    Wraps an iterable of sequences.

    Args:
      data (Iterable[BioSeq]): The source of the sequences.
    """
    self.iterator = None
    self.bioseqs = data
  def collect(self) -> "SequenceCollection":
    """
    Reads all the sequences into a SequenceCollection.

    Returns:
      SequenceCollection: A collection with all the sequences.
    """
    return SequenceCollection([bioseq for bioseq in self])

Ancestors

Methods

def collect(self) ‑> SequenceCollection
Expand source code
def collect(self) -> "SequenceCollection":
  """
  Reads all the sequences into a SequenceCollection.

  Returns:
    SequenceCollection: A collection with all the sequences.
  """
  return SequenceCollection([bioseq for bioseq in self])
class NonConsumableSeqCol
Expand source code
class NonConsumableSeqCol:
  """
  Declares the operations of a sequence collection that has a length and
  accepts new sequences.
  """
  # NOTE: this class does not use ABCMeta by itself, so the abstract
  # markers are only enforced in subclasses whose metaclass is ABCMeta.
  @abstractmethod
  def __len__(self):
    """
    Gives the number of sequences in the collection.
    """
  @abstractmethod
  def append(self, seq: BioSeq):
    """
    Adds a sequence to the collection.

    Args:
      seq (BioSeq): The sequence to add.
    """

Subclasses

Methods

def append(self, seq: BioSeq)
Expand source code
@abstractmethod
def append(self, seq: BioSeq):
  """
  Adds a sequence to the collection.

  Args:
    seq (BioSeq): The sequence to add.
  """
class SequenceCollection (seqs: List[BioSeq])

A Collection of ordered biological sequences.

Expand source code
class SequenceCollection(
  AbstractIterableSeqCol,
  NonConsumableSeqCol
):
  """
  A Collection of ordered biological sequences.

  Sequences are stored in a list, must have unique identifiers and must
  all share the same alphabet.
  """
  def __init__(self, seqs: List[BioSeq]):
    """
    Creates a collection from a list of sequences.

    Args:
      seqs (List[BioSeq]): The sequences to store. The list is kept by
        reference, it is not copied.

    Raises:
      ValueError: If two sequences share an identifier or if the
        sequences do not all have the same alphabet.
    """
    self.bioseqs: List[BioSeq] = seqs
    # Maps each identifier to the position of its sequence in bioseqs.
    self.indexes: Dict[str, int] = {
      bs.identifier: i
      for i, bs in enumerate(self.bioseqs)
    }
    # This has to be an annotation. Assigning Optional[...] itself would
    # store a typing object as the iterator.
    self.iterator: Optional[Iterator[BioSeq]] = None
    # Repeated identifiers collapse into a single key of indexes.
    if len(self.bioseqs) != len(self.indexes):
      print(f"There are {len(self.bioseqs)} sequences and {len(self.indexes)} indexes")
      print(self.indexes)
      raise ValueError("Sequences has repeated identifiers")
    if len({s.alphabet for s in self.bioseqs}) > 1:
      raise ValueError("Sequences differ in alphabet")
  def append(self, seq: BioSeq):
    """
    Adds a sequence at the end of the collection.

    Args:
      seq (BioSeq): The sequence to add.

    Returns:
      SequenceCollection: This same collection, to allow chaining.

    Raises:
      ValueError: If the identifier is already present or the alphabet
        differs from the alphabet of the stored sequences.
    """
    if seq.identifier in self.indexes:
      raise ValueError("Sequence has repeated identifier")
    if len(self) > 0 and seq.alphabet != self.bioseqs[0].alphabet:
      raise ValueError("New sequence has different alphabet")
    # The length before appending is the zero based position of the new
    # sequence, consistent with the indexes built in __init__.
    self.indexes[seq.identifier] = len(self.bioseqs)
    self.bioseqs.append(seq)
    return self
  def __len__(self) -> int:
    """
    Gives the number of sequences in the collection.
    """
    return len(self.bioseqs)
  def __eq__(self, other: object) -> bool:
    """
    Two collections are equal if they have the same sequences in the
    same order.
    """
    if not isinstance(other, SequenceCollection):
      return False
    if not len(self) == len(other):
      return False
    return all(
      s1 == s2 for s1, s2 in zip(
        self.bioseqs,
        other.bioseqs
      )
    )
  def __str__(self) -> str:
    """
    Gives a short description with the size of the collection.
    """
    return f"SequenceCollection[Size:{len(self)}]"
  def __repr__(self) -> str:
    """
    Represents the collection using at most its first 20 sequences.

    A warning is issued when the collection has more sequences than
    that, because the representation is truncated.
    """
    max_seqs_to_repr = 20
    # NOTE(review): the DeprecationWarning category is kept as it was for
    # compatibility, although this is not a deprecation.
    if len(self) > max_seqs_to_repr:
      warnings.warn(
        "SequenceCollection too large to be represented. Using 20",
        DeprecationWarning
      )
    seqs = [
      repr(s)
      for s in self.bioseqs[:max_seqs_to_repr]
    ]
    return f"SequenceCollection({repr(seqs)})"
  @staticmethod
  def from_fasta(
    fasta_file:str,
    alphabet: BioAlphabet = BioAlphabet.UNKNOWN
  ) -> "SequenceCollection":
    """
    Generates A Sequence Collection from a fasta file.

    Args:
      fasta_file (str): The input fasta file.
      alphabet (BioAlphabet, optional): The alphabet of the sequences.
        Defaults to BioAlphabet.UNKNOWN.

    Returns:
      SequenceCollection: The resulting sequence collection.
    """
    seqs = [
      BioSeq(sid, seq, alphabet)
      for (sid, seq) in msa.read_msa(fasta_file)
    ]
    return SequenceCollection(seqs)
  @staticmethod
  def from_fastq(
    fastq_file:str,
    alphabet: BioAlphabet = BioAlphabet.UNKNOWN
  ) -> "SequenceCollection":
    """
    Generates A Sequence Collection from a fastq file.

    Args:
      fastq_file (str): The input fastq file.
      alphabet (BioAlphabet, optional): The alphabet of the sequences.
        Defaults to BioAlphabet.UNKNOWN.

    Returns:
      SequenceCollection: The resulting sequence collection.
    """
    reader = fastq.FastqReader()
    seqs = [
      BioSeq(entry.identifier, entry.sequence, alphabet)
      for entry in reader.read_fastq_from_file(fastq_file)
    ]
    return SequenceCollection(seqs)
  def to_fasta(self, outfile:str):
    """
    Exports the sequence collection to a fasta file.

    Args:
      outfile (str): The output file.
    """
    seqs = [(x.identifier, x.sequence) for x in self]
    msa.write_msa(seqs, outfile)

Ancestors

Static methods

def from_fasta(fasta_file: str, alphabet: BioAlphabet = BioAlphabet.UNKNOWN) ‑> SequenceCollection

Generates A Sequence Collection from a fasta file.

Args

fasta_file : str
The input fasta file.
alphabet : BioAlphabet, optional
The alphabet of the sequences. Defaults to BioAlphabet.UNKNOWN.

Returns

SequenceCollection
The resulting sequence collection.
Expand source code
@staticmethod
def from_fasta(
  fasta_file:str,
  alphabet: BioAlphabet = BioAlphabet.UNKNOWN
) -> "SequenceCollection":
  """
  Builds a sequence collection with the content of a fasta file.

  Args:
    fasta_file (str): The input fasta file.
    alphabet (BioAlphabet, optional): The alphabet given to every
      sequence. Defaults to BioAlphabet.UNKNOWN.

  Returns:
    SequenceCollection: The resulting sequence collection.
  """
  records = msa.read_msa(fasta_file)
  return SequenceCollection(
    [
      BioSeq(identifier, sequence, alphabet)
      for (identifier, sequence) in records
    ]
  )
def from_fastq(fastq_file: str, alphabet: BioAlphabet = BioAlphabet.UNKNOWN) ‑> SequenceCollection

Generates A Sequence Collection from a fastq file.

Args

fastq_file : str
The input fastq file.
alphabet : BioAlphabet, optional
The alphabet of the sequences. Defaults to BioAlphabet.UNKNOWN.

Returns

SequenceCollection
The resulting sequence collection.
Expand source code
@staticmethod
def from_fastq(
  fastq_file:str,
  alphabet: BioAlphabet = BioAlphabet.UNKNOWN
) -> "SequenceCollection":
  """
  Builds a sequence collection with the content of a fastq file.

  Args:
    fastq_file (str): The input fastq file.
    alphabet (BioAlphabet, optional): The alphabet given to every
      sequence. Defaults to BioAlphabet.UNKNOWN.

  Returns:
    SequenceCollection: The resulting sequence collection.
  """
  entries = fastq.FastqReader().read_fastq_from_file(fastq_file)
  return SequenceCollection(
    [
      BioSeq(record.identifier, record.sequence, alphabet)
      for record in entries
    ]
  )

Methods

def append(self, seq: BioSeq)
Expand source code
def append(self, seq: BioSeq):
  """
  Adds a sequence at the end of the collection.

  Args:
    seq (BioSeq): The sequence to add.

  Returns:
    SequenceCollection: This same collection, to allow chaining.

  Raises:
    ValueError: If the identifier is already present or the alphabet
      differs from the alphabet of the stored sequences.
  """
  if seq.identifier in self.indexes:
    raise ValueError("Sequence has repeated identifier")
  if len(self) > 0 and seq.alphabet != self.bioseqs[0].alphabet:
    raise ValueError("New sequence has different alphabet")
  # The length before appending is the zero based position of the new
  # sequence, consistent with the indexes built in __init__.
  self.indexes[seq.identifier] = len(self.bioseqs)
  self.bioseqs.append(seq)
  return self
def to_fasta(self, outfile: str)

Exports the sequence collection to a fasta file.

Args

outfile : str
The output file.
Expand source code
def to_fasta(self, outfile:str):
  """
  Writes the sequences of the collection to a fasta file.

  Args:
    outfile (str): The output file.
  """
  records = [
    (bioseq.identifier, bioseq.sequence)
    for bioseq in self
  ]
  msa.write_msa(records, outfile)