Module xi_covutils.read_results

Read results from covariation files

Expand source code
"""
  Read results from covariation files
"""
from re import match
from typing import Iterable, Optional

from Bio import SeqIO


def _single_cov_line_reader(accepting_pattern):
  def wrapper(infile):
    with open(infile, 'r', encoding="utf-8") as file_handle:
      for line in file_handle:
        c_match = match(accepting_pattern, line.strip())
        if c_match:
          yield (
            c_match.group("i"),
            c_match.group("j"),
            c_match.group("score")
          )
  return wrapper


def from_gauss_dca(infile:str) -> dict[tuple[int, int], float]:
  """
  Reads the results of a covariation file from Gauss DCA.

  Every accepted line has the form "i j score", with the three fields
  separated by single spaces. Lines that do not have that form are skipped.

  Args:
    infile (str): A string with the path of the input file.

  Returns:
    dict[tuple[int, int], float]: A dictionary with the index pairs (i, j) as
      keys and the scores as values. The indices are kept as they appear in
      the file (Gauss DCA output is expected to have i<=j, starting at 1).
  """
  # The hyphen is placed first in the character class so it is a literal.
  # Written as "+-." it forms a range that also accepts the "," character.
  gauss_pattern = r"(?P<i>[0-9]+) (?P<j>[0-9]+) (?P<score>[-+.e0-9]+)$"
  gauss_reader = _single_cov_line_reader(gauss_pattern)
  return {
    (int(i), int(j)): float(v)
    for i, j, v in gauss_reader(infile)
  }

def from_mitos_mi(infile:str) -> dict[tuple[int, int], float]:
  """
  Reads the results of a covariation file from MIToS MI.

  Every accepted line has the form "i,j,score,value", with four comma
  separated fields. Only the third field is kept as the score. Lines that do
  not have that form are skipped.

  Args:
    infile (str): A string with the path of the input file.

  Returns:
    dict[tuple[int, int], float]: A dictionary with the index pairs (i, j) as
      keys and the scores as values. The indices are kept as they appear in
      the file (MIToS output is expected to have i<=j, starting at 1).
  """
  # The hyphen is placed first in both character classes so it is a literal;
  # written as "+-." it forms a range that also accepts ",", the field
  # separator. The score also accepts "e" so that values written in
  # scientific notation are not silently skipped.
  mitos_mi_pattern = (
    r"(?P<i>[0-9]+),(?P<j>[0-9]+),"
    r"(?P<score>[-+.e0-9]+),[-+.e0-9]+$"
  )
  mi_reader = _single_cov_line_reader(mitos_mi_pattern)
  return {
    (int(i), int(j)): float(v)
    for i, j, v in mi_reader(infile)
  }

def from_ccmpred(infile:str) -> dict[tuple[int, int], float]:
  """
  Reads the results of a covariation file from CCMPRED.

  The file is read as a matrix with one row per line and whitespace separated
  values. Only the diagonal and the values above it are parsed and kept.

  Args:
    infile (str): A string with the path of the input file.

  Returns:
    dict[tuple[int, int], float]: A dictionary with tuples of indices (i, j),
      where i<=j, as keys and the scores as values. The indices start at 1.
  """
  upper_triangle = {}
  with open(infile, 'r', encoding='utf-8') as handle:
    for row, line in enumerate(handle, start=1):
      for col, field in enumerate(line.split(), start=1):
        # Values below the diagonal are never converted to float.
        if row <= col:
          upper_triangle[(row, col)] = float(field)
  return upper_triangle

def remap_paired(
    cov_data:dict[tuple[int, int], float],
    msa_file:Optional[str],
    chain_a_len:int,
    chain_a_id:str="1",
    chain_b_id:str="2"
  ) -> dict[tuple[tuple[str, int], tuple[str, int]], float]:
  """
  Splits the positions of a paired MSA covariation result into two chains.

  Every index of cov_data is turned into a tuple (chain id, position). Indices
  up to the length of the first chain keep their value and get chain_a_id.
  The remaining indices are shifted by that length and get chain_b_id.

  Args:
    cov_data (dict[tuple[int, int], float]): Input covariation data of a paired
      MSA.
    msa_file (Optional[str]): The path of a paired MSA in fasta format. When it
      is given, the length of the first chain is the number of characters
      other than "-" found in the first chain_a_len columns of the first
      sequence. Otherwise chain_a_len is used as it is.
    chain_a_len (int): The number of columns in the MSA that corresponds to the
      first protein of the paired MSA.
    chain_a_id (str): The chain identifier of the first protein of the paired
      MSA.
    chain_b_id (str): The chain identifier of the second protein of the paired
      MSA.

  Returns:
    dict[tuple[tuple[str, int], tuple[str, int]], float]: The covariation
      scores with the mapped positions as keys.
  """
  boundary = chain_a_len
  if msa_file:
    reference = next(SeqIO.parse(msa_file, "fasta"))
    chain_a_columns = str(reference.seq)[:chain_a_len]
    # Gap characters of the reference are not counted as chain positions.
    boundary = len(chain_a_columns) - chain_a_columns.count("-")

  def _with_chain(index: int) ->  tuple[str, int]:
    if index > boundary:
      return (chain_b_id, index-boundary)
    return (chain_a_id, index)

  remapped = {}
  for (first, second), score in cov_data.items():
    remapped[(_with_chain(first), _with_chain(second))] = score
  return remapped

def to_tuple_positions(
    cov_data:dict[tuple[int, int], float],
    chain_id:str
  ) -> dict[tuple[tuple[str, int], tuple[str,int]], float]:
  """
  Adds a chain identifier to every position of the covariation data.

  Each key (i, j) becomes ((chain_id, i), (chain_id, j)), which is the same
  key layout that remap_paired returns.

  Args:
    cov_data (dict[tuple[int, int], float]): The covariation scores as a dict
      with pairs of indices as keys and scores as values.
    chain_id (str): A chain identifier.

  Returns:
    dict[tuple[tuple[str, int], tuple[str, int]], float]: The covariation data
      with the included chain identifiers.
  """
  tupled = {}
  for pair, score in cov_data.items():
    first, second = pair
    tupled[((chain_id, first), (chain_id, second))] = score
  return tupled

def remap_tuple_positions(
    cov_data:dict[tuple[tuple[str, int], tuple[str, int]], float],
    mapping:dict[str,dict[int,int]]
  ) -> dict[tuple[tuple[str, int], tuple[str, int]], float]:
  """
  Translates the positions of cov_data using a mapping for each chain.

  Pairs in which any of the two positions is absent from the mapping of its
  chain are left out of the result.

  Args:
    cov_data (dict[tuple[tuple[str, int], tuple[str, int]], float]): A dict with
      keys of the form ((chain_a, index_a), (chain_b, index_b)) and scores as
      values.
    mapping (dict[str, dict[int, int]]): A dict with chain ids as keys. Each
      value is a dict that maps from old positions to new positions.

  Returns:
    dict[tuple[tuple[str, int], tuple[str, int]], float]: The covariation data
      with the mapped positions.
  """
  remapped = {}
  for ((chain_1, pos_1), (chain_2, pos_2)), score in cov_data.items():
    # The second chain is only looked up when the first position is mapped.
    if pos_1 not in mapping[chain_1]:
      continue
    if pos_2 not in mapping[chain_2]:
      continue
    new_first = (chain_1, mapping[chain_1][pos_1])
    new_second = (chain_2, mapping[chain_2][pos_2])
    remapped[(new_first, new_second)] = score
  return remapped

def remove_trivial_tuple(
    cov_data:dict[tuple[tuple[str, int], tuple[str, int]], float],
    min_pos_dif:int=5
  ) -> dict[tuple[tuple[str, int], tuple[str, int]], float]:
  """
  Removes residue pairs of the same chain that are too close in sequence.

  A pair is removed when both positions belong to the same chain and they are
  less than min_pos_dif positions apart. Pairs of different chains are always
  kept.

  Args:
    cov_data (dict[tuple[tuple[str, int], tuple[str, int]], float]): The input
      covariation data, with keys of the form
      ((chain_a, index_a), (chain_b, index_b)) and scores as values.
    min_pos_dif (int): Minimum distance in sequence that two residues of the
      same chain should have to be included.

  Returns:
    dict[tuple[tuple[str, int], tuple[str, int]], float]: The covariation data
      without the trivial pairs.
  """
  kept = {}
  for ((chain_1, pos_1), (chain_2, pos_2)), score in cov_data.items():
    if chain_1 != chain_2 or abs(pos_2 - pos_1) >= min_pos_dif:
      kept[((chain_1, pos_1), (chain_2, pos_2))] = score
  return kept

def intra_covariation(
    cov_data: dict[tuple[tuple[str, int], tuple[str, int]], float]
  ) -> dict[str, dict[tuple[tuple[str, int], tuple[str, int]], float]]:
  """
  Extracts the intra-chain residue pairs from paired covariation data.

  Every chain id found in cov_data gets an entry in the result, even when it
  has no intra-chain pairs. In that case its value is an empty dict.

  Args:
    cov_data (dict[tuple[tuple[str, int], tuple[str, int]], float]): The paired
      covariation data.

  Returns:
    dict[str, dict[tuple[tuple[str, int], tuple[str, int]], float]]: A dict
      with chain ids as keys. Each value is the subset of cov_data whose
      residue pairs belong to that chain.
  """
  by_chain = {}
  for ((chain_1, pos_1), (chain_2, pos_2)), score in cov_data.items():
    # Register both chains so that each one is present in the output.
    by_chain.setdefault(chain_1, {})
    by_chain.setdefault(chain_2, {})
    if chain_1 != chain_2:
      continue
    by_chain[chain_1][((chain_1, pos_1), (chain_2, pos_2))] = score
  return by_chain

def merge(
    cov_data_iter: Iterable[
      dict[tuple[tuple[str, int], tuple[str, int]], float]
    ]
  ) -> dict[tuple[tuple[str, int], tuple[str, int]], float]:
  """
  Merges several covariation data dicts into a single one.

  When the same pair is present in more than one element, the score of the
  last element that contains it is kept.

  Args:
    cov_data_iter (Iterable[dict[tuple[tuple[str, int], tuple[str, int]],
      float]]): An iterable where each element is covariation data.

  Returns:
    dict[tuple[tuple[str, int], tuple[str, int]], float]: Merged covariation
      data.
  """
  merged = {}
  for cov in cov_data_iter:
    merged.update(cov.items())
  return merged

def inter_covariation(
    cov_data:dict[tuple[tuple[str, int], tuple[str,int]], float]
  ) -> dict[
    tuple[str, str],
    dict[tuple[tuple[str, int], tuple[str,int]],float]
  ]:
  """
  Extracts the inter-chain residue pairs from paired covariation data.

  Args:
    cov_data (dict[tuple[tuple[str, int], tuple[str, int]], float]): The
      covariation data from a paired MSA.

  Returns:
    dict[tuple[str, str], dict[tuple[tuple[str, int], tuple[str, int]], float]]:
      A dict with tuples of two chain ids as keys. Each value is the subset of
      cov_data whose residue pairs connect those two chains. The chain ids of
      the keys are sorted, and the two positions of each residue pair are
      ordered by chain id in the same way.
  """
  by_chain_pair = {}
  for ((chain_1, pos_1), (chain_2, pos_2)), score in cov_data.items():
    if chain_1 == chain_2:
      continue
    low, high = (chain_1, pos_1), (chain_2, pos_2)
    # Put the position of the chain that sorts first in the first place.
    if chain_2 < chain_1:
      low, high = high, low
    by_chain_pair.setdefault((low[0], high[0]), {})[(low, high)] = score
  return by_chain_pair

Functions

def from_ccmpred(infile: str) ‑> dict

Reads the results of a covariation file from CCMPRED.

Args

infile (str):A string with the path of the input file.

Returns

dict[tuple[int, int], float]
Returns a dictionary with tuples of indices (i,j), where i<=j, as keys and scores as values. The indices i,j start at 1.
Expand source code
def from_ccmpred(infile:str) -> dict[tuple[int, int], float]:
  """
  Reads the results of a covariation file from CCMPRED.

  The file is read as a matrix with one row per line and whitespace separated
  values. Only the diagonal and the values above it are parsed and kept.

  Args:
    infile (str): A string with the path of the input file.

  Returns:
    dict[tuple[int, int], float]: A dictionary with tuples of indices (i, j),
      where i<=j, as keys and the scores as values. The indices start at 1.
  """
  upper_triangle = {}
  with open(infile, 'r', encoding='utf-8') as handle:
    for row, line in enumerate(handle, start=1):
      for col, field in enumerate(line.split(), start=1):
        # Values below the diagonal are never converted to float.
        if row <= col:
          upper_triangle[(row, col)] = float(field)
  return upper_triangle
def from_gauss_dca(infile: str) ‑> dict

Reads the results of a covariation file from Gauss DCA.

Args

infile : str
A string with the path of the input file.

Returns

dict[tuple[int, int], float]
Returns a dictionary with tuples of indices (i,j), where i<=j, as keys and scores as values. The indices i,j start at 1.
Expand source code
def from_gauss_dca(infile:str) -> dict[tuple[int, int], float]:
  """
  Reads the results of a covariation file from Gauss DCA.

  Every accepted line has the form "i j score", with the three fields
  separated by single spaces. Lines that do not have that form are skipped.

  Args:
    infile (str): A string with the path of the input file.

  Returns:
    dict[tuple[int, int], float]: A dictionary with the index pairs (i, j) as
      keys and the scores as values. The indices are kept as they appear in
      the file (Gauss DCA output is expected to have i<=j, starting at 1).
  """
  # The hyphen is placed first in the character class so it is a literal.
  # Written as "+-." it forms a range that also accepts the "," character.
  gauss_pattern = r"(?P<i>[0-9]+) (?P<j>[0-9]+) (?P<score>[-+.e0-9]+)$"
  gauss_reader = _single_cov_line_reader(gauss_pattern)
  return {
    (int(i), int(j)): float(v)
    for i, j, v in gauss_reader(infile)
  }
def from_mitos_mi(infile: str) ‑> dict

Reads the results of a covariation file from MIToS MI.

Args

infile : str
A string with the path of the input file.

Returns

dict[tuple[int, int], float]
Returns a dictionary with tuples of indices (i,j), where i<=j, as keys and scores as values. The indices i,j start at 1.

Expand source code
def from_mitos_mi(infile:str) -> dict[tuple[int, int], float]:
  """
  Reads the results of a covariation file from MIToS MI.

  Every accepted line has the form "i,j,score,value", with four comma
  separated fields. Only the third field is kept as the score. Lines that do
  not have that form are skipped.

  Args:
    infile (str): A string with the path of the input file.

  Returns:
    dict[tuple[int, int], float]: A dictionary with the index pairs (i, j) as
      keys and the scores as values. The indices are kept as they appear in
      the file (MIToS output is expected to have i<=j, starting at 1).
  """
  # The hyphen is placed first in both character classes so it is a literal;
  # written as "+-." it forms a range that also accepts ",", the field
  # separator. The score also accepts "e" so that values written in
  # scientific notation are not silently skipped.
  mitos_mi_pattern = (
    r"(?P<i>[0-9]+),(?P<j>[0-9]+),"
    r"(?P<score>[-+.e0-9]+),[-+.e0-9]+$"
  )
  mi_reader = _single_cov_line_reader(mitos_mi_pattern)
  return {
    (int(i), int(j)): float(v)
    for i, j, v in mi_reader(infile)
  }
def inter_covariation(cov_data: dict) ‑> dict

Extract inter-chain interactions from paired covariation data.

Args

cov_data : dict[tuple[tuple[str, int], tuple[str, int]], float]
The covariation data from a paired MSA.

Returns

dict[tuple[str, str], dict[tuple[tuple[str, int], tuple[str, int]], float]]: Returns a new dict which chain id tuples are keys and values are subsets of cov_data that correspond to inter chain residue pairs. Chain ids in tuple keys are sorted lexicographically.

Expand source code
def inter_covariation(
    cov_data:dict[tuple[tuple[str, int], tuple[str,int]], float]
  ) -> dict[
    tuple[str, str],
    dict[tuple[tuple[str, int], tuple[str,int]],float]
  ]:
  """
  Extracts the inter-chain residue pairs from paired covariation data.

  Args:
    cov_data (dict[tuple[tuple[str, int], tuple[str, int]], float]): The
      covariation data from a paired MSA.

  Returns:
    dict[tuple[str, str], dict[tuple[tuple[str, int], tuple[str, int]], float]]:
      A dict with tuples of two chain ids as keys. Each value is the subset of
      cov_data whose residue pairs connect those two chains. The chain ids of
      the keys are sorted, and the two positions of each residue pair are
      ordered by chain id in the same way.
  """
  by_chain_pair = {}
  for ((chain_1, pos_1), (chain_2, pos_2)), score in cov_data.items():
    if chain_1 == chain_2:
      continue
    low, high = (chain_1, pos_1), (chain_2, pos_2)
    # Put the position of the chain that sorts first in the first place.
    if chain_2 < chain_1:
      low, high = high, low
    by_chain_pair.setdefault((low[0], high[0]), {})[(low, high)] = score
  return by_chain_pair
def intra_covariation(cov_data: dict) ‑> dict

Extract intra-chain interactions from paired covariation data.

Returns a new dict which chain ids are keys and values are subsets of cov_data that correspond to intra chain residue pairs.

Args

cov_data : dict[tuple[tuple[str, int], tuple[str, int]], float]
The paired covariation data.

Returns

dict[str, dict[tuple[tuple[str, int], tuple[str, int]], float]]
The intra chain covariation data for all chains.
Expand source code
def intra_covariation(
    cov_data: dict[tuple[tuple[str, int], tuple[str, int]], float]
  ) -> dict[str, dict[tuple[tuple[str, int], tuple[str, int]], float]]:
  """
  Extracts the intra-chain residue pairs from paired covariation data.

  Every chain id found in cov_data gets an entry in the result, even when it
  has no intra-chain pairs. In that case its value is an empty dict.

  Args:
    cov_data (dict[tuple[tuple[str, int], tuple[str, int]], float]): The paired
      covariation data.

  Returns:
    dict[str, dict[tuple[tuple[str, int], tuple[str, int]], float]]: A dict
      with chain ids as keys. Each value is the subset of cov_data whose
      residue pairs belong to that chain.
  """
  by_chain = {}
  for ((chain_1, pos_1), (chain_2, pos_2)), score in cov_data.items():
    # Register both chains so that each one is present in the output.
    by_chain.setdefault(chain_1, {})
    by_chain.setdefault(chain_2, {})
    if chain_1 != chain_2:
      continue
    by_chain[chain_1][((chain_1, pos_1), (chain_2, pos_2))] = score
  return by_chain
def merge(cov_data_iter: Iterable[dict[tuple[tuple[str, int], tuple[str, int]], float]]) ‑> dict

Merge covariation data.

Args

cov_data_iter (Iterable[dict[tuple[tuple[str, int], tuple[str, int]], float]]): An iterable where each element is covariation data.

Returns

dict[tuple[tuple[str, int], tuple[str, int]], float]
Merged covariation data.
Expand source code
def merge(
    cov_data_iter: Iterable[
      dict[tuple[tuple[str, int], tuple[str, int]], float]
    ]
  ) -> dict[tuple[tuple[str, int], tuple[str, int]], float]:
  """
  Merges several covariation data dicts into a single one.

  When the same pair is present in more than one element, the score of the
  last element that contains it is kept.

  Args:
    cov_data_iter (Iterable[dict[tuple[tuple[str, int], tuple[str, int]],
      float]]): An iterable where each element is covariation data.

  Returns:
    dict[tuple[tuple[str, int], tuple[str, int]], float]: Merged covariation
      data.
  """
  merged = {}
  for cov in cov_data_iter:
    merged.update(cov.items())
  return merged
def remap_paired(cov_data: dict, msa_file: Optional[str], chain_a_len: int, chain_a_id: str = '1', chain_b_id: str = '2') ‑> dict

Remaps the positions of the covariation scores of a paired MSA to each individual ungapped chain sequence.

Args

cov_data : dict[tuple[int, int], float]
Input covariation data of a paired MSA.
msa_file : str
A paired input MSA data.
chain_a_len : int
The number of columns in the MSA that corresponds to the first protein of the paired MSA.
chain_a_id : str
The Chain identifier of the first protein of the paired MSA.
chain_b_id : str
The Chain identifier of the second protein of the paired MSA.

Returns

dict[tuple[tuple[str, int], tuple[str, int]], float]
The mapped positions of the covariation scores.
Expand source code
def remap_paired(
    cov_data:dict[tuple[int, int], float],
    msa_file:Optional[str],
    chain_a_len:int,
    chain_a_id:str="1",
    chain_b_id:str="2"
  ) -> dict[tuple[tuple[str, int], tuple[str, int]], float]:
  """
  Splits the positions of a paired MSA covariation result into two chains.

  Every index of cov_data is turned into a tuple (chain id, position). Indices
  up to the length of the first chain keep their value and get chain_a_id.
  The remaining indices are shifted by that length and get chain_b_id.

  Args:
    cov_data (dict[tuple[int, int], float]): Input covariation data of a paired
      MSA.
    msa_file (Optional[str]): The path of a paired MSA in fasta format. When it
      is given, the length of the first chain is the number of characters
      other than "-" found in the first chain_a_len columns of the first
      sequence. Otherwise chain_a_len is used as it is.
    chain_a_len (int): The number of columns in the MSA that corresponds to the
      first protein of the paired MSA.
    chain_a_id (str): The chain identifier of the first protein of the paired
      MSA.
    chain_b_id (str): The chain identifier of the second protein of the paired
      MSA.

  Returns:
    dict[tuple[tuple[str, int], tuple[str, int]], float]: The covariation
      scores with the mapped positions as keys.
  """
  boundary = chain_a_len
  if msa_file:
    reference = next(SeqIO.parse(msa_file, "fasta"))
    chain_a_columns = str(reference.seq)[:chain_a_len]
    # Gap characters of the reference are not counted as chain positions.
    boundary = len(chain_a_columns) - chain_a_columns.count("-")

  def _with_chain(index: int) ->  tuple[str, int]:
    if index > boundary:
      return (chain_b_id, index-boundary)
    return (chain_a_id, index)

  remapped = {}
  for (first, second), score in cov_data.items():
    remapped[(_with_chain(first), _with_chain(second))] = score
  return remapped
def remap_tuple_positions(cov_data: dict, mapping: dict) ‑> dict

Remaps the positions of cov_data. Cov_data should be represented as a dict with keys of the form ((chain_a, index_a), (chain_b, index_b)) and scores as values.

Args

cov_data : dict[tuple[tuple[str, int], tuple[str, int]], float]
A dict with covariation scores.
mapping : dict[str, dict[int, int]]
A dict to map the positions, the dict should have chain ids as keys, and values should be dicts that maps from old positions to new positions.

Returns

dict[tuple[tuple[str, int], tuple[str, int]], float]
The covariation data with the mapped positions.
Expand source code
def remap_tuple_positions(
    cov_data:dict[tuple[tuple[str, int], tuple[str, int]], float],
    mapping:dict[str,dict[int,int]]
  ) -> dict[tuple[tuple[str, int], tuple[str, int]], float]:
  """
  Translates the positions of cov_data using a mapping for each chain.

  Pairs in which any of the two positions is absent from the mapping of its
  chain are left out of the result.

  Args:
    cov_data (dict[tuple[tuple[str, int], tuple[str, int]], float]): A dict with
      keys of the form ((chain_a, index_a), (chain_b, index_b)) and scores as
      values.
    mapping (dict[str, dict[int, int]]): A dict with chain ids as keys. Each
      value is a dict that maps from old positions to new positions.

  Returns:
    dict[tuple[tuple[str, int], tuple[str, int]], float]: The covariation data
      with the mapped positions.
  """
  remapped = {}
  for ((chain_1, pos_1), (chain_2, pos_2)), score in cov_data.items():
    # The second chain is only looked up when the first position is mapped.
    if pos_1 not in mapping[chain_1]:
      continue
    if pos_2 not in mapping[chain_2]:
      continue
    new_first = (chain_1, mapping[chain_1][pos_1])
    new_second = (chain_2, mapping[chain_2][pos_2])
    remapped[(new_first, new_second)] = score
  return remapped
def remove_trivial_tuple(cov_data: dict, min_pos_dif: int = 5) ‑> dict

Removes from the covariation data the residue pairs of the same chain that are less than min_pos_dif (five by default) positions apart in sequence.

Covariation data is assumed to be a dict with keys of the form ((chain_a, index_a), (chain_b, index_b)) and scores as values.

Args

cov_data : dict[tuple[tuple[str, int], tuple[str, int]], float]
The input covariation data.
min_pos_dif : int
Minimum distance that two residues should have to be included.

Returns

dict[tuple[tuple[str, int], tuple[str, int]], float]
The covariation data without the trivials pairs.
Expand source code
def remove_trivial_tuple(
    cov_data:dict[tuple[tuple[str, int], tuple[str, int]], float],
    min_pos_dif:int=5
  ) -> dict[tuple[tuple[str, int], tuple[str, int]], float]:
  """
  Removes residue pairs of the same chain that are too close in sequence.

  A pair is removed when both positions belong to the same chain and they are
  less than min_pos_dif positions apart. Pairs of different chains are always
  kept.

  Args:
    cov_data (dict[tuple[tuple[str, int], tuple[str, int]], float]): The input
      covariation data, with keys of the form
      ((chain_a, index_a), (chain_b, index_b)) and scores as values.
    min_pos_dif (int): Minimum distance in sequence that two residues of the
      same chain should have to be included.

  Returns:
    dict[tuple[tuple[str, int], tuple[str, int]], float]: The covariation data
      without the trivial pairs.
  """
  kept = {}
  for ((chain_1, pos_1), (chain_2, pos_2)), score in cov_data.items():
    if chain_1 != chain_2 or abs(pos_2 - pos_1) >= min_pos_dif:
      kept[((chain_1, pos_1), (chain_2, pos_2))] = score
  return kept
def to_tuple_positions(cov_data: dict, chain_id: str) ‑> dict

Converts simple positions to tuples.

Converts the indexes of covariation results to tupled positions that include the protein chain id. This makes the results compatible with the output of the remap_paired function.

Args

cov_data : dict[tuple[int, int], float]
The results of covariation scores, as a dict with indices as keys and scores as values.
chain_id : str
A chain identifier.

Returns

dict[tuple[tuple[str, int], tuple[str, int]], float]
The covariation data with the included chain identifiers.
Expand source code
def to_tuple_positions(
    cov_data:dict[tuple[int, int], float],
    chain_id:str
  ) -> dict[tuple[tuple[str, int], tuple[str,int]], float]:
  """
  Adds a chain identifier to every position of the covariation data.

  Each key (i, j) becomes ((chain_id, i), (chain_id, j)), which is the same
  key layout that remap_paired returns.

  Args:
    cov_data (dict[tuple[int, int], float]): The covariation scores as a dict
      with pairs of indices as keys and scores as values.
    chain_id (str): A chain identifier.

  Returns:
    dict[tuple[tuple[str, int], tuple[str, int]], float]: The covariation data
      with the included chain identifiers.
  """
  tupled = {}
  for pair, score in cov_data.items():
    first, second = pair
    tupled[((chain_id, first), (chain_id, second))] = score
  return tupled