Module xi_covutils.seqmapper

Sequence Mapper

Expand source code
"""
Sequence Mapper
"""
from typing import Optional
from Bio.Align import Alignment, PairwiseAligner

from xi_covutils.msa import default_aligner

class SequenceMapper:
  """
  Maps positions between two aligned sequences.

  Positions are 1-based, both for alignment columns and for the residues of
  each ungapped sequence. The gap character is "-". Looking up a position that
  is aligned to a gap, or that has no entry in the mapping, yields None.

  A mapper can be created directly from two already aligned sequences with
  `from_aligned_sequences`, or by chaining `with_sequences`, optionally
  `with_aligner` / `with_default_aligner`, and `build`.

  ```python
  # Example:
  seq1 = "-TGT-G"
  seq2 = "AT-TGG"
  mapper = SequenceMapper.from_aligned_sequences(seq1, seq2)
  aln_positions = list(range(1,7))
  seq1_positions = list(range(1,5))
  seq2_positions = list(range(1,6))
  aln_to_first = [mapper.from_aln_to_first(x) for x in aln_positions]
  assert aln_to_first == [None, 1, 2, 3, None, 4]
  aln_to_second = [mapper.from_aln_to_second(x) for x in aln_positions]
  assert aln_to_second == [1, 2, None, 3, 4, 5]
  first_to_aln = [mapper.from_first_to_aln(x) for x in seq1_positions]
  assert first_to_aln == [2, 3, 4, 6]
  first_to_second = [mapper.from_first_to_second(x) for x in seq1_positions]
  assert first_to_second == [2, None, 3, 5]
  second_to_aln = [mapper.from_second_to_aln(x) for x in seq2_positions]
  assert second_to_aln == [1, 2, 4, 5, 6]
  second_to_first = [mapper.from_second_to_first(x) for x in seq2_positions]
  assert second_to_first == [None, 1, 3, None, 4]
  ```
  """
  def __init__(self):
    # Keys used in this class: "first", "second" (ungapped sequences) and
    # "first_aligned", "second_aligned" (gapped rows of the alignment).
    self.storage: dict[str, str] = {}
    self.aligner: Optional[PairwiseAligner] = None
    # Sequence position -> (alignment column, position in the other sequence).
    # The second element is None when the residue is aligned to a gap.
    self.seq1_mapping: dict[int, tuple[int, Optional[int]]] = {}
    self.seq2_mapping: dict[int, tuple[int, Optional[int]]] = {}
    # Alignment column -> (position in first, position in second); either
    # element is None when that sequence has a gap in the column.
    self.aln_mapping: dict[int, tuple[Optional[int], Optional[int]]] = {}

  def with_sequences(self, first:str, second:str) -> "SequenceMapper":
    """
    Sets the first and second sequences for alignment.

    Args:
      first (str): The first sequence to be aligned.
      second (str): The second sequence to be aligned.

    Returns:
      SequenceMapper: The updated SequenceMapper object.
    """
    self.storage["first"] = first
    self.storage["second"] = second
    return self

  def with_default_aligner(self) -> "SequenceMapper":
    """
    Sets the default aligner for the SequenceMapper object.

    Returns:
      SequenceMapper: The updated SequenceMapper object with the default
        aligner.
    """
    self.aligner = default_aligner()
    return self

  def with_aligner(self, aligner:PairwiseAligner) -> "SequenceMapper":
    """
    Sets a custom aligner for the SequenceMapper object.

    Args:
      aligner (PairwiseAligner): A custom aligner to be used for sequence
        alignment.

    Returns:
      SequenceMapper: The updated SequenceMapper object with the custom aligner.
    """
    self.aligner = aligner
    return self

  def build(self) -> "SequenceMapper":
    """
    Aligns the sequences using the assigned aligner and creates mappings.

    If no aligner was assigned, the default aligner is created and stored.
    Only the first alignment returned by the aligner is used.

    Returns:
      SequenceMapper: The updated SequenceMapper object with created mappings.

    Raises:
      ValueError: Raised if one or more sequences are missing or if there is an
        error creating alignment.
    """
    aligner = self.aligner
    if aligner is None:
      aligner = default_aligner()
      self.aligner = aligner
    first = self.storage.get("first")
    second = self.storage.get("second")
    if not first or not second:
      raise ValueError("One or more sequences are missing")
    alns = aligner.align(first, second)
    alignment:Alignment = alns[0]
    first_aligned = alignment[0, :]
    second_aligned = alignment[1, :]
    if (
      not isinstance(first_aligned, str)
      or not isinstance(second_aligned, str)
    ):
      raise ValueError("There was an error creating alignment")
    self.storage["first_aligned"] = first_aligned
    self.storage["second_aligned"] = second_aligned
    # Without this step the lookup tables stay empty and every from_*_to_*
    # query on a built mapper returns None.
    return self._create_mappings(self)

  @staticmethod
  def from_aligned_sequences(
      first_aligned:str,
      second_aligned:str
    ) -> "SequenceMapper":
    """
    Creates a SequenceMapper object from already aligned sequences.

    The ungapped sequences are derived by removing every "-" character.

    Args:
      first_aligned (str): The first aligned sequence.
      second_aligned (str): The second aligned sequence.

    Returns:
      SequenceMapper: A new SequenceMapper object created from the
        aligned sequences.

    Raises:
      ValueError: Raised if the sequences have different lengths.
    """
    if len(first_aligned) != len(second_aligned):
      raise ValueError("Sequences must have the same length")
    mapper = SequenceMapper()
    mapper.storage["first_aligned"] = first_aligned
    mapper.storage["second_aligned"] = second_aligned
    mapper.storage["first"] = first_aligned.replace("-", "")
    mapper.storage["second"] = second_aligned.replace("-", "")
    mapper = SequenceMapper._create_mappings(mapper)
    return mapper

  @staticmethod
  def _create_mappings(mapper:"SequenceMapper") -> "SequenceMapper":
    """
    Fills the three lookup tables of a mapper from its aligned sequences.

    Reads `storage["first_aligned"]` and `storage["second_aligned"]`, walks
    them column by column and replaces `aln_mapping`, `seq1_mapping` and
    `seq2_mapping` on the given mapper.

    Args:
      mapper (SequenceMapper): The mapper whose tables are rebuilt.

    Returns:
      SequenceMapper: The same mapper object that was passed in.
    """
    first_aligned = mapper.storage["first_aligned"]
    second_aligned = mapper.storage["second_aligned"]
    seq1p = 0
    seq2p = 0
    aln_mapping = {}
    seq1_mapping = {}
    seq2_mapping = {}
    columns = enumerate(zip(first_aligned, second_aligned), start=1)
    for column, (char1, char2) in columns:
      # A residue advances the counter of its own sequence; a gap leaves the
      # counter untouched and contributes None for this column.
      pos1 = None
      pos2 = None
      if char1 != "-":
        seq1p += 1
        pos1 = seq1p
      if char2 != "-":
        seq2p += 1
        pos2 = seq2p
      aln_mapping[column] = (pos1, pos2)
      if pos1 is not None:
        seq1_mapping[pos1] = (column, pos2)
      if pos2 is not None:
        seq2_mapping[pos2] = (column, pos1)
    mapper.aln_mapping = aln_mapping
    mapper.seq1_mapping = seq1_mapping
    mapper.seq2_mapping = seq2_mapping
    return mapper

  def from_first_to_aln(self, position:int) -> Optional[int]:
    """
    Maps a position from the first sequence to the aligned position.

    Args:
      position (int): The position in the first sequence.

    Returns:
      Optional[int]: The corresponding aligned position or None if the
        position is not found.
    """
    result = self.seq1_mapping.get(position)
    if result is None:
      return None
    return result[0]

  def from_first_to_second(self, position:int) -> Optional[int]:
    """
    Maps a position from the first sequence to the corresponding position in the
      second sequence.

    Args:
      position (int): The position in the first sequence.

    Returns:
      Optional[int]: The corresponding position in the second sequence or
        None if the position is not found or is aligned to a gap.
    """
    result = self.seq1_mapping.get(position)
    if result is None:
      return None
    return result[1]

  def from_second_to_aln(self, position:int) -> Optional[int]:
    """
    Maps a position from the second sequence to the aligned position.

    Args:
      position (int): The position in the second sequence.

    Returns:
      Optional[int]: The corresponding aligned position or None if the
        position is not found.
    """
    result = self.seq2_mapping.get(position)
    if result is None:
      return None
    return result[0]

  def from_second_to_first(self, position:int) -> Optional[int]:
    """
    Maps a position from the second sequence to the corresponding position in
      the first sequence.

    Args:
      position (int): The position in the second sequence.

    Returns:
      Optional[int]: The corresponding position in the first sequence or
        None if the position is not found or is aligned to a gap.
    """
    result = self.seq2_mapping.get(position)
    if result is None:
      return None
    return result[1]

  def from_aln_to_first(self, position:int) -> Optional[int]:
    """
    Maps a position from the alignment to the corresponding position in the
      first sequence.

    Args:
      position (int): The position in the alignment.

    Returns:
      Optional[int]: The corresponding position in the first sequence or
        None if the position is not found or the column holds a gap there.
    """
    result = self.aln_mapping.get(position)
    if result is None:
      return None
    return result[0]

  def from_aln_to_second(self, position:int) -> Optional[int]:
    """
    Maps a position from the alignment to the corresponding position in the
      second sequence.

    Args:
      position (int): The position in the alignment.

    Returns:
      Optional[int]: The corresponding position in the second sequence or
        None if the position is not found or the column holds a gap there.
    """
    result = self.aln_mapping.get(position)
    if result is None:
      return None
    return result[1]

Classes

class SequenceMapper

Maps positions between two aligned sequences.

# Example:
seq1 = "-TGT-G"
seq2 = "AT-TGG"
mapper = SequenceMapper.from_aligned_sequences(seq1, seq2)
aln_positions = list(range(1,7))
seq1_positions = list(range(1,5))
seq2_positions = list(range(1,6))
aln_to_first = [mapper.from_aln_to_first(x) for x in aln_positions]
assert aln_to_first == [None, 1, 2, 3, None, 4]
aln_to_second = [mapper.from_aln_to_second(x) for x in aln_positions]
assert aln_to_second == [1, 2, None, 3, 4, 5]
first_to_aln = [mapper.from_first_to_aln(x) for x in seq1_positions]
assert first_to_aln == [2, 3, 4, 6]
first_to_second = [mapper.from_first_to_second(x) for x in seq1_positions]
assert first_to_second == [2, None, 3, 5]
second_to_aln = [mapper.from_second_to_aln(x) for x in seq2_positions]
assert second_to_aln == [1, 2, 4, 5, 6]
second_to_first = [mapper.from_second_to_first(x) for x in seq2_positions]
assert second_to_first == [None, 1, 3, None, 4]
Expand source code
class SequenceMapper:
  """
  Maps positions between two aligned sequences.

  Positions are 1-based, both for alignment columns and for the residues of
  each ungapped sequence. The gap character is "-". Looking up a position that
  is aligned to a gap, or that has no entry in the mapping, yields None.

  A mapper can be created directly from two already aligned sequences with
  `from_aligned_sequences`, or by chaining `with_sequences`, optionally
  `with_aligner` / `with_default_aligner`, and `build`.

  ```python
  # Example:
  seq1 = "-TGT-G"
  seq2 = "AT-TGG"
  mapper = SequenceMapper.from_aligned_sequences(seq1, seq2)
  aln_positions = list(range(1,7))
  seq1_positions = list(range(1,5))
  seq2_positions = list(range(1,6))
  aln_to_first = [mapper.from_aln_to_first(x) for x in aln_positions]
  assert aln_to_first == [None, 1, 2, 3, None, 4]
  aln_to_second = [mapper.from_aln_to_second(x) for x in aln_positions]
  assert aln_to_second == [1, 2, None, 3, 4, 5]
  first_to_aln = [mapper.from_first_to_aln(x) for x in seq1_positions]
  assert first_to_aln == [2, 3, 4, 6]
  first_to_second = [mapper.from_first_to_second(x) for x in seq1_positions]
  assert first_to_second == [2, None, 3, 5]
  second_to_aln = [mapper.from_second_to_aln(x) for x in seq2_positions]
  assert second_to_aln == [1, 2, 4, 5, 6]
  second_to_first = [mapper.from_second_to_first(x) for x in seq2_positions]
  assert second_to_first == [None, 1, 3, None, 4]
  ```
  """
  def __init__(self):
    # Keys used in this class: "first", "second" (ungapped sequences) and
    # "first_aligned", "second_aligned" (gapped rows of the alignment).
    self.storage: dict[str, str] = {}
    self.aligner: Optional[PairwiseAligner] = None
    # Sequence position -> (alignment column, position in the other sequence).
    # The second element is None when the residue is aligned to a gap.
    self.seq1_mapping: dict[int, tuple[int, Optional[int]]] = {}
    self.seq2_mapping: dict[int, tuple[int, Optional[int]]] = {}
    # Alignment column -> (position in first, position in second); either
    # element is None when that sequence has a gap in the column.
    self.aln_mapping: dict[int, tuple[Optional[int], Optional[int]]] = {}

  def with_sequences(self, first:str, second:str) -> "SequenceMapper":
    """
    Sets the first and second sequences for alignment.

    Args:
      first (str): The first sequence to be aligned.
      second (str): The second sequence to be aligned.

    Returns:
      SequenceMapper: The updated SequenceMapper object.
    """
    self.storage["first"] = first
    self.storage["second"] = second
    return self

  def with_default_aligner(self) -> "SequenceMapper":
    """
    Sets the default aligner for the SequenceMapper object.

    Returns:
      SequenceMapper: The updated SequenceMapper object with the default
        aligner.
    """
    self.aligner = default_aligner()
    return self

  def with_aligner(self, aligner:PairwiseAligner) -> "SequenceMapper":
    """
    Sets a custom aligner for the SequenceMapper object.

    Args:
      aligner (PairwiseAligner): A custom aligner to be used for sequence
        alignment.

    Returns:
      SequenceMapper: The updated SequenceMapper object with the custom aligner.
    """
    self.aligner = aligner
    return self

  def build(self) -> "SequenceMapper":
    """
    Aligns the sequences using the assigned aligner and creates mappings.

    If no aligner was assigned, the default aligner is created and stored.
    Only the first alignment returned by the aligner is used.

    Returns:
      SequenceMapper: The updated SequenceMapper object with created mappings.

    Raises:
      ValueError: Raised if one or more sequences are missing or if there is an
        error creating alignment.
    """
    aligner = self.aligner
    if aligner is None:
      aligner = default_aligner()
      self.aligner = aligner
    first = self.storage.get("first")
    second = self.storage.get("second")
    if not first or not second:
      raise ValueError("One or more sequences are missing")
    alns = aligner.align(first, second)
    alignment:Alignment = alns[0]
    first_aligned = alignment[0, :]
    second_aligned = alignment[1, :]
    if (
      not isinstance(first_aligned, str)
      or not isinstance(second_aligned, str)
    ):
      raise ValueError("There was an error creating alignment")
    self.storage["first_aligned"] = first_aligned
    self.storage["second_aligned"] = second_aligned
    # Without this step the lookup tables stay empty and every from_*_to_*
    # query on a built mapper returns None.
    return self._create_mappings(self)

  @staticmethod
  def from_aligned_sequences(
      first_aligned:str,
      second_aligned:str
    ) -> "SequenceMapper":
    """
    Creates a SequenceMapper object from already aligned sequences.

    The ungapped sequences are derived by removing every "-" character.

    Args:
      first_aligned (str): The first aligned sequence.
      second_aligned (str): The second aligned sequence.

    Returns:
      SequenceMapper: A new SequenceMapper object created from the
        aligned sequences.

    Raises:
      ValueError: Raised if the sequences have different lengths.
    """
    if len(first_aligned) != len(second_aligned):
      raise ValueError("Sequences must have the same length")
    mapper = SequenceMapper()
    mapper.storage["first_aligned"] = first_aligned
    mapper.storage["second_aligned"] = second_aligned
    mapper.storage["first"] = first_aligned.replace("-", "")
    mapper.storage["second"] = second_aligned.replace("-", "")
    mapper = SequenceMapper._create_mappings(mapper)
    return mapper

  @staticmethod
  def _create_mappings(mapper:"SequenceMapper") -> "SequenceMapper":
    """
    Fills the three lookup tables of a mapper from its aligned sequences.

    Reads `storage["first_aligned"]` and `storage["second_aligned"]`, walks
    them column by column and replaces `aln_mapping`, `seq1_mapping` and
    `seq2_mapping` on the given mapper.

    Args:
      mapper (SequenceMapper): The mapper whose tables are rebuilt.

    Returns:
      SequenceMapper: The same mapper object that was passed in.
    """
    first_aligned = mapper.storage["first_aligned"]
    second_aligned = mapper.storage["second_aligned"]
    seq1p = 0
    seq2p = 0
    aln_mapping = {}
    seq1_mapping = {}
    seq2_mapping = {}
    columns = enumerate(zip(first_aligned, second_aligned), start=1)
    for column, (char1, char2) in columns:
      # A residue advances the counter of its own sequence; a gap leaves the
      # counter untouched and contributes None for this column.
      pos1 = None
      pos2 = None
      if char1 != "-":
        seq1p += 1
        pos1 = seq1p
      if char2 != "-":
        seq2p += 1
        pos2 = seq2p
      aln_mapping[column] = (pos1, pos2)
      if pos1 is not None:
        seq1_mapping[pos1] = (column, pos2)
      if pos2 is not None:
        seq2_mapping[pos2] = (column, pos1)
    mapper.aln_mapping = aln_mapping
    mapper.seq1_mapping = seq1_mapping
    mapper.seq2_mapping = seq2_mapping
    return mapper

  def from_first_to_aln(self, position:int) -> Optional[int]:
    """
    Maps a position from the first sequence to the aligned position.

    Args:
      position (int): The position in the first sequence.

    Returns:
      Optional[int]: The corresponding aligned position or None if the
        position is not found.
    """
    result = self.seq1_mapping.get(position)
    if result is None:
      return None
    return result[0]

  def from_first_to_second(self, position:int) -> Optional[int]:
    """
    Maps a position from the first sequence to the corresponding position in the
      second sequence.

    Args:
      position (int): The position in the first sequence.

    Returns:
      Optional[int]: The corresponding position in the second sequence or
        None if the position is not found or is aligned to a gap.
    """
    result = self.seq1_mapping.get(position)
    if result is None:
      return None
    return result[1]

  def from_second_to_aln(self, position:int) -> Optional[int]:
    """
    Maps a position from the second sequence to the aligned position.

    Args:
      position (int): The position in the second sequence.

    Returns:
      Optional[int]: The corresponding aligned position or None if the
        position is not found.
    """
    result = self.seq2_mapping.get(position)
    if result is None:
      return None
    return result[0]

  def from_second_to_first(self, position:int) -> Optional[int]:
    """
    Maps a position from the second sequence to the corresponding position in
      the first sequence.

    Args:
      position (int): The position in the second sequence.

    Returns:
      Optional[int]: The corresponding position in the first sequence or
        None if the position is not found or is aligned to a gap.
    """
    result = self.seq2_mapping.get(position)
    if result is None:
      return None
    return result[1]

  def from_aln_to_first(self, position:int) -> Optional[int]:
    """
    Maps a position from the alignment to the corresponding position in the
      first sequence.

    Args:
      position (int): The position in the alignment.

    Returns:
      Optional[int]: The corresponding position in the first sequence or
        None if the position is not found or the column holds a gap there.
    """
    result = self.aln_mapping.get(position)
    if result is None:
      return None
    return result[0]

  def from_aln_to_second(self, position:int) -> Optional[int]:
    """
    Maps a position from the alignment to the corresponding position in the
      second sequence.

    Args:
      position (int): The position in the alignment.

    Returns:
      Optional[int]: The corresponding position in the second sequence or
        None if the position is not found or the column holds a gap there.
    """
    result = self.aln_mapping.get(position)
    if result is None:
      return None
    return result[1]

Static methods

def from_aligned_sequences(first_aligned: str, second_aligned: str) ‑> SequenceMapper

Creates a SequenceMapper object from already aligned sequences.

Args

first_aligned : str
The first aligned sequence.
second_aligned : str
The second aligned sequence.

Returns

SequenceMapper
A new SequenceMapper object created from the aligned sequences.

Raises

ValueError: Raised if the sequences have different lengths.

Expand source code
@staticmethod
def from_aligned_sequences(
    first_aligned:str,
    second_aligned:str
  ) -> "SequenceMapper":
  """
  Builds a ready-to-query SequenceMapper from two gapped sequences.

  Both rows are stored as given, the ungapped sequences are obtained by
  stripping every "-" character, and the position mappings are created
  before the mapper is returned.

  Args:
    first_aligned (str): First row of the alignment, gaps written as "-".
    second_aligned (str): Second row of the alignment, gaps written as "-".

  Returns:
    SequenceMapper: A new mapper populated from the two rows.

  Throws:
    ValueError: Raised when the two rows differ in length.
  """
  same_length = len(first_aligned) == len(second_aligned)
  if not same_length:
    raise ValueError("Sequences must have the same length")
  new_mapper = SequenceMapper()
  new_mapper.storage.update({
    "first_aligned": first_aligned,
    "second_aligned": second_aligned,
    "first": first_aligned.replace("-", ""),
    "second": second_aligned.replace("-", ""),
  })
  return SequenceMapper._create_mappings(new_mapper)

Methods

def build(self) ‑> SequenceMapper

Aligns the sequences using the assigned aligner and creates mappings.

Returns

SequenceMapper
The updated SequenceMapper object with created mappings.

Raises

ValueError: Raised if one or more sequences are missing or if there is an error creating alignment.

Expand source code
def build(self) -> "SequenceMapper":
  """
  Aligns the sequences using the assigned aligner and creates mappings.

  If no aligner was assigned, the default aligner is created and stored.
  Only the first alignment returned by the aligner is used. Its two rows are
  saved under the "first_aligned" and "second_aligned" storage keys and the
  position mappings are then generated from them.

  Returns:
    SequenceMapper: The updated SequenceMapper object with created mappings.

  Raises:
    ValueError: Raised if one or more sequences are missing or if there is an
      error creating alignment.
  """
  aligner = self.aligner
  if aligner is None:
    aligner = default_aligner()
    self.aligner = aligner
  first = self.storage.get("first")
  second = self.storage.get("second")
  if not first or not second:
    raise ValueError("One or more sequences are missing")
  alns = aligner.align(first, second)
  alignment:Alignment = alns[0]
  first_aligned = alignment[0, :]
  second_aligned = alignment[1, :]
  if (
    not isinstance(first_aligned, str)
    or not isinstance(second_aligned, str)
  ):
    raise ValueError("There was an error creating alignment")
  self.storage["first_aligned"] = first_aligned
  self.storage["second_aligned"] = second_aligned
  # Without this step the lookup tables stay empty and every from_*_to_*
  # query on a built mapper returns None.
  return self._create_mappings(self)
def from_aln_to_first(self, position: int) ‑> Optional[int]

Maps a position from the alignment to the corresponding position in the first sequence.

Args

position : int
The position in the alignment.

Returns

Optional[int]
The corresponding position in the first sequence or None if the position is not found.
Expand source code
def from_aln_to_first(self, position:int) -> Optional[int]:
  """
  Translates an alignment column into a position of the first sequence.

  Args:
    position (int): Alignment column to look up.

  Returns:
    Optional[int]: The first element of the `aln_mapping` entry for that
      column, or None when the column has no entry.
  """
  entry = self.aln_mapping.get(position)
  return entry[0] if entry else None
def from_aln_to_second(self, position: int) ‑> Optional[int]

Maps a position from the alignment to the corresponding position in the second sequence.

Args

position : int
The position in the alignment.

Returns

Optional[int]
The corresponding position in the second sequence or None if the position is not found.
Expand source code
def from_aln_to_second(self, position:int) -> Optional[int]:
  """
  Translates an alignment column into a position of the second sequence.

  Args:
    position (int): Alignment column to look up.

  Returns:
    Optional[int]: The second element of the `aln_mapping` entry for that
      column, or None when the column has no entry.
  """
  # A missing column is treated like a column with no residue on either row.
  entry = self.aln_mapping.get(position) or (None, None)
  return entry[1]
def from_first_to_aln(self, position: int) ‑> Optional[int]

Maps a position from the first sequence to the aligned position.

Args

position : int
The position in the first sequence.

Returns

Optional[int]
The corresponding aligned position or None if the position is not found.
Expand source code
def from_first_to_aln(self, position:int) -> Optional[int]:
  """
  Translates a position of the first sequence into an alignment column.

  Args:
    position (int): Position in the first sequence to look up.

  Returns:
    Optional[int]: The first element of the `seq1_mapping` entry for that
      position, or None when the position has no entry.
  """
  entry = self.seq1_mapping.get(position)
  if entry:
    return entry[0]
  return None
def from_first_to_second(self, position: int) ‑> Optional[int]

Maps a position from the first sequence to the corresponding position in the second sequence.

Args

position : int
The position in the first sequence.

Returns

Optional[int]
The corresponding position in the second sequence or None if the position is not found.
Expand source code
def from_first_to_second(self, position:int) -> Optional[int]:
  """
  Translates a position of the first sequence into a position of the second.

  Args:
    position (int): Position in the first sequence to look up.

  Returns:
    Optional[int]: The second element of the `seq1_mapping` entry for that
      position, or None when the position has no entry.
  """
  if (entry := self.seq1_mapping.get(position)):
    return entry[1]
  return None
def from_second_to_aln(self, position: int) ‑> Optional[int]

Maps a position from the second sequence to the aligned position.

Args

position : int
The position in the second sequence.

Returns

Optional[int]
The corresponding aligned position or None if the position is not found.
Expand source code
def from_second_to_aln(self, position:int) -> Optional[int]:
  """
  Translates a position of the second sequence into an alignment column.

  Args:
    position (int): Position in the second sequence to look up.

  Returns:
    Optional[int]: The first element of the `seq2_mapping` entry for that
      position, or None when the position has no entry.
  """
  found = self.seq2_mapping.get(position)
  return None if not found else found[0]
def from_second_to_first(self, position: int) ‑> Optional[int]

Maps a position from the second sequence to the corresponding position in the first sequence.

Args

position : int
The position in the second sequence.

Returns

Optional[int]
The corresponding position in the first sequence or None if the position is not found.
Expand source code
def from_second_to_first(self, position:int) -> Optional[int]:
  """
  Translates a position of the second sequence into a position of the first.

  Args:
    position (int): Position in the second sequence to look up.

  Returns:
    Optional[int]: The second element of the `seq2_mapping` entry for that
      position, or None when the position has no entry.
  """
  # A missing position falls back to a placeholder pair whose slots are None.
  found = self.seq2_mapping.get(position) or (None, None)
  return found[1]
def with_aligner(self, aligner: Bio.Align.PairwiseAligner) ‑> SequenceMapper

Sets a custom aligner for the SequenceMapper object.

Args

aligner : PairwiseAligner
A custom aligner to be used for sequence alignment.

Returns

SequenceMapper
The updated SequenceMapper object with the custom aligner.
Expand source code
def with_aligner(self, aligner:PairwiseAligner) -> "SequenceMapper":
  """
  Stores the given aligner on this mapper, replacing any previous one.

  Args:
    aligner (PairwiseAligner): Aligner object to keep in `self.aligner`.

  Returns:
    SequenceMapper: This same object, so that calls can be chained.
  """
  self.aligner = aligner
  return self
def with_default_aligner(self) ‑> SequenceMapper

Sets the default aligner for the SequenceMapper object.

Returns

SequenceMapper
The updated SequenceMapper object with the default aligner.
Expand source code
def with_default_aligner(self) -> "SequenceMapper":
  """
  Stores a freshly created default aligner on this mapper.

  The aligner is whatever `default_aligner()` returns at call time.

  Returns:
    SequenceMapper: This same object, so that calls can be chained.
  """
  fresh_aligner = default_aligner()
  self.aligner = fresh_aligner
  return self
def with_sequences(self, first: str, second: str) ‑> SequenceMapper

Sets the first and second sequences for alignment.

Args

first : str
The first sequence to be aligned.
second : str
The second sequence to be aligned.

Returns

SequenceMapper
The updated SequenceMapper object.
Expand source code
def with_sequences(self, first:str, second:str) -> "SequenceMapper":
  """
  Records the two sequences under the "first" and "second" storage keys.

  Args:
    first (str): Sequence stored under the "first" key.
    second (str): Sequence stored under the "second" key.

  Returns:
    SequenceMapper: This same object, so that calls can be chained.
  """
  for key, sequence in (("first", first), ("second", second)):
    self.storage[key] = sequence
  return self