Module xi_covutils.conservation

Computes conservation for a collection of protein sequences.

Expand source code
"""
Computes conservation for a collection of protein sequences.
"""

import math
from collections import defaultdict
from typing import Dict, List, Optional
from enum import Enum
import matplotlib.pyplot as plt
import click
from xi_covutils.clustering import hobohm1
from xi_covutils.matrices.common import RESIDUE_ORDER_MAP
from xi_covutils.matrices.blosum_45 import BLOSUM_45_BG
from xi_covutils.matrices.blosum_50 import BLOSUM_50_BG
from xi_covutils.matrices.blosum_62 import BLOSUM_62_BG
from xi_covutils.matrices.blosum_80 import BLOSUM_80_BG
from xi_covutils.matrices.blosum_90 import BLOSUM_90_BG
from xi_covutils.msa._msa import gap_content_by_column, read_msa

def _replace_non_standard_aa(seq_data: List[str]) -> List[str]:
  allowed_chars = set("QWERTYIPASDFGHKLCVNM-")
  return [
    "".join(
      [
        c if c in allowed_chars else "-"
        for c in seq.upper()
      ]
    )
    for seq in seq_data
  ]

def _count_table(
    sequences: List[str],
    clustering_id=None
  ) -> Dict[int, Dict[str, float]]:
  counts:Dict[int, Dict[str, float]] = defaultdict(
    lambda: defaultdict(lambda: 0)
  )
  if clustering_id:
    clusters = hobohm1(
      sequences=sequences,
      identity_cutoff=float(clustering_id)/100
    )
    weigths = {
      i:float(1) / len(c.indexes)
      for c in clusters
      for i in c.indexes
    }
  else:
    weigths = defaultdict(lambda: 1)
  for i, seq in enumerate(sequences):
    for col, char in enumerate(seq):
      char = char.upper()
      if char != "-":
        counts[col][char] += weigths[i]
  return counts

def _counts_by_column(
    count_table: Dict[int, Dict[str, float]]
  ) -> Dict[int, float]:
  return {
    col: sum(chars.values())
    for col, chars in count_table.items()
  }

def _frequency_table(
    count_table: Dict[int, Dict[str, float]]
  ) -> Dict[int, Dict[str, float]]:
  """
  Turns per-column residue counts into relative frequencies.

  Args:
    count_table (Dict[int, Dict[str, float]]): Per-column residue counts.

  Returns:
    Dict[int, Dict[str, float]]: For every column, each residue count divided
      by the total count of that column.
  """
  totals = _counts_by_column(count_table)
  table = {}
  for column, residue_counts in count_table.items():
    column_total = totals[column]
    table[column] = {
      residue: float(count) / column_total
      for residue, count in residue_counts.items()
    }
  return table

class BackgroundFreq(Enum):
  """
  Types of amino acid background frequency.

  Each member selects the background frequency vector returned by
  get_background_frequencies.
  """
  # No substitution matrix: mapped to a vector of twenty 1.0 values.
  UNIFORM = 1
  # Background frequencies taken from BLOSUM_45_BG.
  BLOSUM45 = 2
  # Background frequencies taken from BLOSUM_50_BG.
  BLOSUM50 = 3
  # Background frequencies taken from BLOSUM_62_BG.
  BLOSUM62 = 4
  # Background frequencies taken from BLOSUM_80_BG.
  BLOSUM80 = 5
  # Background frequencies taken from BLOSUM_90_BG.
  BLOSUM90 = 6

def get_background_frequencies(mat: BackgroundFreq) -> Optional[List[float]]:
  """
  Looks up the background frequency vector for a BackgroundFreq value.

  Args:
    mat (BackgroundFreq): The kind of background frequencies requested.

  Returns:
    Optional[List[float]]: For BackgroundFreq.UNIFORM a list of twenty 1.0
      values; for a BLOSUM member the matching BLOSUM_*_BG vector; None when
      mat is not one of the known members.
  """
  known_backgrounds = (
    (BackgroundFreq.UNIFORM, [1.0] * 20),
    (BackgroundFreq.BLOSUM45, BLOSUM_45_BG),
    (BackgroundFreq.BLOSUM50, BLOSUM_50_BG),
    (BackgroundFreq.BLOSUM62, BLOSUM_62_BG),
    (BackgroundFreq.BLOSUM80, BLOSUM_80_BG),
    (BackgroundFreq.BLOSUM90, BLOSUM_90_BG),
  )
  for kind, vector in known_backgrounds:
    if mat == kind:
      return vector
  return None

def entropy(
    seq_data: List[str],
    background_frq: BackgroundFreq = BackgroundFreq.UNIFORM,
    clustering_id: Optional[float] = None,
    max_diff: bool = True
  ) -> List[Optional[float]]:
  """
  Computes a per-column conservation score for a collection of proteins.

  With BackgroundFreq.UNIFORM the score is the Shannon entropy of the column.
  With any BLOSUM background it is the Kullback-Leibler divergence of the
  column frequencies relative to that background. Base 2 logarithms are used,
  so the scores are in bits. Non-standard symbols are treated as gaps and
  gaps are not counted.

  Args:
    seq_data (List[str]): Is a collection of protein sequences. The number of
      columns is taken from the first sequence.
    background_frq (BackgroundFreq, optional): Which background frequencies
      are used to correct the entropy values. Defaults to
      BackgroundFreq.UNIFORM, which applies no correction.
    clustering_id (float, optional): A percentage used to cluster sequences
      before the calculation, so clustered sequences share their weight.
      Defaults to None, no clustering is done.
    max_diff (bool): Only used with BackgroundFreq.UNIFORM. When True, the
      value returned is the maximum entropy (log2(20)) minus the actual
      entropy, so bigger values mean more conserved columns. Defaults to
      True.

  Returns:
    List[Optional[float]]: One score per column, in bits. Columns without any
      counted residue get None. An empty list is returned when seq_data is
      empty or no background frequencies are available.
  """
  if not seq_data:
    return []
  cleaned = _replace_non_standard_aa(seq_data)
  bg_vector = get_background_frequencies(background_frq)
  if not bg_vector:
    return []
  background = {
    residue: bg_vector[position]
    for residue, position in RESIDUE_ORDER_MAP.items()
  }
  is_uniform = background_frq == BackgroundFreq.UNIFORM
  # Shannon entropy is the negated sum; the divergence keeps its sign.
  sign = -1 if is_uniform else 1
  column_count = len(cleaned[0])
  freqs = _frequency_table(
    _count_table(
      sequences=cleaned,
      clustering_id=clustering_id
    )
  )
  scores: List[Optional[float]] = []
  for column in range(column_count):
    if column not in freqs:
      # Nothing but gaps in this column: the score is undefined.
      scores.append(None)
      continue
    total = sum(
      frq * math.log2(frq / background[residue])
      for residue, frq in freqs[column].items()
    )
    scores.append(sign * float(total))
  if max_diff and is_uniform:
    upper_bound = math.log2(20)
    scores = [
      score if score is None else (upper_bound - score)
      for score in scores
    ]
  return scores

# pylint: disable=too-many-locals
def plot_conservation(
    sequences: List[str],
    background_frq: BackgroundFreq = BackgroundFreq.UNIFORM,
    clustering_id: Optional[float] = None,
    outfile: str = "plot_conservation.png",
    with_caption: bool = True
  ):
  """
  Plots conservation and gap frequency by column and saves it to a file.

  Conservation is computed with entropy() and drawn as a red line on the left
  axis. Gap frequency (blue line) and markers for columns that have no
  conservation value (green dots) are drawn on a secondary axis.

  Args:
    sequences (List[str]): Is a collection of protein sequences.
    background_frq (BackgroundFreq, optional): Is a BackgroundFreq enum value,
      indicating which background frequencies are used to correct the entropy
      values. Defaults to BackgroundFreq.UNIFORM, which applies no correction.
    clustering_id (float, optional): A percentage to make a clustering of
      sequences previous to the calculation of Entropy. Defaults to None, no
      clustering is done.
    outfile (str): The output file.
    with_caption (bool): If True, a description of how conservation was
      calculated is appended to the x axis label. Defaults to True.
  """
  def _shannon_caption() -> str:
    return (
      "Conservation is calculated as the difference of Maximum Shannon "
      "Entropy\n"
      "for the amino acid alphabet and the actual Shannon Entropy."
    )
  def _blosum_caption(perc: int) -> str:
    return (
      "Conservation is calculated as the Shannon Entropy relative to \n"
      "(Kullback-Leibler) amino acid background frequencies derived from the \n"
      f"BLOSUM {str(perc)} substitution matrix."
    )
  def _get_xlabel(
      with_caption:bool,
      bg_frq: BackgroundFreq,
      clustering_id: Optional[float]
    ):
    base_label = "Amino acid position"
    if not with_caption:
      return base_label
    blosum_levels = {
      BackgroundFreq.BLOSUM45: 45,
      BackgroundFreq.BLOSUM50: 50,
      BackgroundFreq.BLOSUM62: 62,
      BackgroundFreq.BLOSUM80: 80,
      BackgroundFreq.BLOSUM90: 90,
    }
    xlabel = ""
    if bg_frq == BackgroundFreq.UNIFORM:
      xlabel = _shannon_caption()
    elif bg_frq in blosum_levels:
      xlabel = _blosum_caption(blosum_levels[bg_frq])
    if clustering_id:
      xlabel = (
        f"{xlabel}\nSequences where clustered at {clustering_id} % identity."
      )
    return f"{base_label}\n\n{xlabel}"
  data = entropy(
    seq_data = sequences,
    background_frq = background_frq,
    clustering_id = clustering_id
  )
  fig, host = plt.subplots(figsize=(16,9))
  try:
    axes2 = host.twinx()
    host.set_ylabel("Conservation (bits)", fontsize = 16)
    host.set_title("Conservation Entropy", fontsize = 18)
    host.set_xlabel(
      _get_xlabel(with_caption, background_frq, clustering_id),
      fontsize=16
    )
    cons_plot = host.plot(
      list(range(1, len(data)+1)),
      data,
      label = "Conservation",
      color = "red"
    )
    host.set_xlim(0, len(data)+1)
    defined_values = [x for x in data if x is not None]
    if background_frq == BackgroundFreq.UNIFORM:
      host.set_ylim(0, math.log2(20)+.1)
    elif defined_values:
      host.set_ylim(min(defined_values)-0.1, max(defined_values)+0.1)
    # When no column has a defined value the automatic limits are kept;
    # calling min()/max() on an empty sequence would raise ValueError.
    undefined_positions = [i+1 for i, x in enumerate(data) if x is None]
    scatter_plot = []
    if undefined_positions:
      scatter_plot = [
        axes2.scatter(
          undefined_positions,
          [1] * len(undefined_positions),
          label = "All gap columns",
          color = "green"
        )
      ]
    gap_frq = gap_content_by_column(sequences)
    gap_plot = axes2.plot(
      list(range(1, len(gap_frq)+1)),
      gap_frq,
      label = "Gap frequency",
      color = "blue"
    )
    # Pass exactly one label per handle: the scatter entry is only present
    # when at least one column has no conservation value.
    handles = cons_plot + gap_plot + scatter_plot
    labels = ["Conservation", "Gap frequency"]
    if scatter_plot:
      labels.append("All Gap Columns")
    host.legend(handles, labels)
    fig.tight_layout()
    fig.savefig(outfile)
  finally:
    # pyplot keeps a reference to every figure it creates until it is closed.
    plt.close(fig)

@click.command()
@click.argument('filename', default=None)
@click.option('--blosum', default=None, type=int)
@click.option('--clustering', default=None)
@click.option('--maxdiff/--no-maxdiff', default=True)
def calculate_conservation(filename, blosum, clustering, maxdiff):
  """
  Calculates conservation of a protein MSA and prints it by column.

  Args:
      filename (str): the input fasta MSA.
      blosum (int): BLOSUM matrix (45, 50, 62, 80 or 90) whose background
        frequencies correct the entropy. If not given, no correction is made.
      clustering: identity percentage used to weight sequences by clustering.
        If not given, no clustering weighting is done.
      maxdiff (bool): without --blosum, report maximum entropy minus Shannon
        entropy instead of plain Shannon entropy.
  """
  print("# Conservation")
  print(f"# Input filename: {filename}")
  if clustering:
    print(f"# Clustering weighting at {clustering}% id.")
  else:
    print("# No clustering weighting.")
  records = read_msa(filename, msa_format="fasta")
  # --blosum is parsed as an int by click, so it can be looked up directly.
  # An untyped option would arrive as a string and never match these keys.
  blosum_backgrounds = {
    45: BackgroundFreq.BLOSUM45,
    50: BackgroundFreq.BLOSUM50,
    62: BackgroundFreq.BLOSUM62,
    80: BackgroundFreq.BLOSUM80,
    90: BackgroundFreq.BLOSUM90,
  }
  if blosum is None:
    bg_frq = BackgroundFreq.UNIFORM
    if maxdiff:
      print("# Values = Max_Entropy - Shannon_Entropy")
  else:
    bg_frq = blosum_backgrounds.get(blosum)
    if bg_frq is not None:
      print(f"# Corrected with BLOSUM {blosum}")
  sequences = [
    seq for _, seq in records
  ]
  if bg_frq is None:
    print("# Error: No background frequencies.")
    return
  cons = entropy(
    sequences,
    background_frq=bg_frq,
    clustering_id=clustering,
    max_diff = maxdiff
  )
  gap_frq = gap_content_by_column(sequences)
  print("# Columns:")
  # The third column holds the gap frequency of the column, not an entropy.
  print("# Position, Conservation, Gap frequency")
  for i, (con, gap) in enumerate(zip(cons, gap_frq)):
    print(f"{i+1}, {con}, {gap}")

@click.command()
@click.argument('filename', default=None)
@click.option('--blosum', default=None, type=int)
@click.option('--clustering', default=None)
@click.option('--outfile', default="conservation_plot.png")
@click.option('--with-caption/--with-no-caption', default=True)
def conservation_plot(filename, blosum, clustering, outfile, with_caption):
  """
  Plots the conservation of a protein MSA into an image file.

  Args:
      filename (str): the input fasta MSA.
      blosum (int): BLOSUM matrix (45, 50, 62, 80 or 90) whose background
        frequencies correct the entropy. If not given, no correction is made.
      clustering: identity percentage used to weight sequences by clustering.
        If not given, no clustering weighting is done.
      outfile (str): the output image file.
      with_caption (bool): add a description of the calculation to the plot.
  """
  print("# Conservation")
  print(f"# Input filename: {filename}")
  if clustering:
    print(f"# Clustering weighting at {clustering}% id.")
  else:
    print("# No clustering weighting.")
  records = read_msa(filename, msa_format="fasta")
  # --blosum is parsed as an int by click, so it can be looked up directly.
  # An untyped option would arrive as a string and never match these keys.
  blosum_backgrounds = {
    45: BackgroundFreq.BLOSUM45,
    50: BackgroundFreq.BLOSUM50,
    62: BackgroundFreq.BLOSUM62,
    80: BackgroundFreq.BLOSUM80,
    90: BackgroundFreq.BLOSUM90,
  }
  if blosum is None:
    bg_frq = BackgroundFreq.UNIFORM
  else:
    bg_frq = blosum_backgrounds.get(blosum)
    if bg_frq is not None:
      print(f"# Corrected with BLOSUM {blosum}")
  sequences = [
    seq for _, seq in records
  ]
  if bg_frq is None:
    print("# Error: No background frequencies.")
    return
  plot_conservation(
    sequences = sequences,
    background_frq = bg_frq,
    clustering_id=clustering,
    outfile = outfile,
    with_caption = with_caption
  )

Functions

def entropy(seq_data: List[str], background_frq: BackgroundFreq = BackgroundFreq.UNIFORM, clustering_id: Optional[float] = None, max_diff: bool = True) ‑> List[Optional[float]]

Computes Shannon entropy for a collection of proteins. The calculation of Shannon entropy and Kullback-Leibler divergence for entropy correction is made using base 2 logarithms. Therefore, the scores are in bits.

Args

seq_data : List[str]
Is a collection of protein sequences.
background_frq : BackgroundFreq, optional
Is a BackgroundFreq enum value, indicating which substitution matrix is used to correct the entropy values. Defaults to BackgroundFreq.UNIFORM, which does not apply any correction to the Shannon entropy.
clustering_id : float, optional
A percentage to make a clustering of sequences previous to the calculation of Entropy. Defaults to None, no clustering is done.
max_diff : bool
Only used with background_frq = BackgroundFreq.UNIFORM. When True, the values returned are the difference between the maximum entropy and the actual entropy, so that bigger values mean more conserved columns. Defaults to True.

Returns

List[float]
A list of conservation scores in bits, one per column. Columns without any counted residue get None.
Expand source code
def entropy(
    seq_data: List[str],
    background_frq: BackgroundFreq = BackgroundFreq.UNIFORM,
    clustering_id: Optional[float] = None,
    max_diff: bool = True
  ) -> List[Optional[float]]:
  """
  Computes a per-column conservation score for a collection of proteins.

  With BackgroundFreq.UNIFORM the score is the Shannon entropy of the column.
  With any BLOSUM background it is the Kullback-Leibler divergence of the
  column frequencies relative to that background. Base 2 logarithms are used,
  so the scores are in bits. Non-standard symbols are treated as gaps and
  gaps are not counted.

  Args:
    seq_data (List[str]): Is a collection of protein sequences. The number of
      columns is taken from the first sequence.
    background_frq (BackgroundFreq, optional): Which background frequencies
      are used to correct the entropy values. Defaults to
      BackgroundFreq.UNIFORM, which applies no correction.
    clustering_id (float, optional): A percentage used to cluster sequences
      before the calculation, so clustered sequences share their weight.
      Defaults to None, no clustering is done.
    max_diff (bool): Only used with BackgroundFreq.UNIFORM. When True, the
      value returned is the maximum entropy (log2(20)) minus the actual
      entropy, so bigger values mean more conserved columns. Defaults to
      True.

  Returns:
    List[Optional[float]]: One score per column, in bits. Columns without any
      counted residue get None. An empty list is returned when seq_data is
      empty or no background frequencies are available.
  """
  if not seq_data:
    return []
  cleaned = _replace_non_standard_aa(seq_data)
  bg_vector = get_background_frequencies(background_frq)
  if not bg_vector:
    return []
  background = {
    residue: bg_vector[position]
    for residue, position in RESIDUE_ORDER_MAP.items()
  }
  is_uniform = background_frq == BackgroundFreq.UNIFORM
  # Shannon entropy is the negated sum; the divergence keeps its sign.
  sign = -1 if is_uniform else 1
  column_count = len(cleaned[0])
  freqs = _frequency_table(
    _count_table(
      sequences=cleaned,
      clustering_id=clustering_id
    )
  )
  scores: List[Optional[float]] = []
  for column in range(column_count):
    if column not in freqs:
      # Nothing but gaps in this column: the score is undefined.
      scores.append(None)
      continue
    total = sum(
      frq * math.log2(frq / background[residue])
      for residue, frq in freqs[column].items()
    )
    scores.append(sign * float(total))
  if max_diff and is_uniform:
    upper_bound = math.log2(20)
    scores = [
      score if score is None else (upper_bound - score)
      for score in scores
    ]
  return scores
def get_background_frequencies(mat: BackgroundFreq) ‑> Optional[List[float]]

Retrieves amino acid background frequency vectors given a substitution matrix.

Args

mat : BackgroundFreq
A substitution matrix.

Returns

List[float]
A list with the background frequencies for each amino acid, according to different substitution matrices.
Expand source code
def get_background_frequencies(mat: BackgroundFreq) -> Optional[List[float]]:
  """
  Looks up the background frequency vector for a BackgroundFreq value.

  Args:
    mat (BackgroundFreq): The kind of background frequencies requested.

  Returns:
    Optional[List[float]]: For BackgroundFreq.UNIFORM a list of twenty 1.0
      values; for a BLOSUM member the matching BLOSUM_*_BG vector; None when
      mat is not one of the known members.
  """
  known_backgrounds = (
    (BackgroundFreq.UNIFORM, [1.0] * 20),
    (BackgroundFreq.BLOSUM45, BLOSUM_45_BG),
    (BackgroundFreq.BLOSUM50, BLOSUM_50_BG),
    (BackgroundFreq.BLOSUM62, BLOSUM_62_BG),
    (BackgroundFreq.BLOSUM80, BLOSUM_80_BG),
    (BackgroundFreq.BLOSUM90, BLOSUM_90_BG),
  )
  for kind, vector in known_backgrounds:
    if mat == kind:
      return vector
  return None
def plot_conservation(sequences: List[str], background_frq: BackgroundFreq = BackgroundFreq.UNIFORM, clustering_id: Optional[float] = None, outfile: str = 'plot_conservation.png', with_caption: bool = True)

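Generates a plot of conservation and gap frequency by column for a collection of protein sequences and saves it to a file.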

Args

sequences : List[str]
Is a collection of protein sequences.
background_frq : BackgroundFreq, optional
Is a BackgroundFreq enum value,
indicating which substitution matrix is used to correct the entropy
values. Defaults to BackgroundFreq.Uniform, that does not do any
correction of Shannon Entropy,
clustering_id : float, optional
A percentage to make a clustering of
sequences previous to the calculation of Entropy. Defaults to None, no
clustering is done.
outfile : str
The output file.
Expand source code
def plot_conservation(
    sequences: List[str],
    background_frq: BackgroundFreq = BackgroundFreq.UNIFORM,
    clustering_id: Optional[float] = None,
    outfile: str = "plot_conservation.png",
    with_caption: bool = True
  ):
  """
  Plots conservation and gap frequency by column and saves it to a file.

  Conservation is computed with entropy() and drawn as a red line on the left
  axis. Gap frequency (blue line) and markers for columns that have no
  conservation value (green dots) are drawn on a secondary axis.

  Args:
    sequences (List[str]): Is a collection of protein sequences.
    background_frq (BackgroundFreq, optional): Is a BackgroundFreq enum value,
      indicating which background frequencies are used to correct the entropy
      values. Defaults to BackgroundFreq.UNIFORM, which applies no correction.
    clustering_id (float, optional): A percentage to make a clustering of
      sequences previous to the calculation of Entropy. Defaults to None, no
      clustering is done.
    outfile (str): The output file.
    with_caption (bool): If True, a description of how conservation was
      calculated is appended to the x axis label. Defaults to True.
  """
  def _shannon_caption() -> str:
    return (
      "Conservation is calculated as the difference of Maximum Shannon "
      "Entropy\n"
      "for the amino acid alphabet and the actual Shannon Entropy."
    )
  def _blosum_caption(perc: int) -> str:
    return (
      "Conservation is calculated as the Shannon Entropy relative to \n"
      "(Kullback-Leibler) amino acid background frequencies derived from the \n"
      f"BLOSUM {str(perc)} substitution matrix."
    )
  def _get_xlabel(
      with_caption:bool,
      bg_frq: BackgroundFreq,
      clustering_id: Optional[float]
    ):
    base_label = "Amino acid position"
    if not with_caption:
      return base_label
    blosum_levels = {
      BackgroundFreq.BLOSUM45: 45,
      BackgroundFreq.BLOSUM50: 50,
      BackgroundFreq.BLOSUM62: 62,
      BackgroundFreq.BLOSUM80: 80,
      BackgroundFreq.BLOSUM90: 90,
    }
    xlabel = ""
    if bg_frq == BackgroundFreq.UNIFORM:
      xlabel = _shannon_caption()
    elif bg_frq in blosum_levels:
      xlabel = _blosum_caption(blosum_levels[bg_frq])
    if clustering_id:
      xlabel = (
        f"{xlabel}\nSequences where clustered at {clustering_id} % identity."
      )
    return f"{base_label}\n\n{xlabel}"
  data = entropy(
    seq_data = sequences,
    background_frq = background_frq,
    clustering_id = clustering_id
  )
  fig, host = plt.subplots(figsize=(16,9))
  try:
    axes2 = host.twinx()
    host.set_ylabel("Conservation (bits)", fontsize = 16)
    host.set_title("Conservation Entropy", fontsize = 18)
    host.set_xlabel(
      _get_xlabel(with_caption, background_frq, clustering_id),
      fontsize=16
    )
    cons_plot = host.plot(
      list(range(1, len(data)+1)),
      data,
      label = "Conservation",
      color = "red"
    )
    host.set_xlim(0, len(data)+1)
    defined_values = [x for x in data if x is not None]
    if background_frq == BackgroundFreq.UNIFORM:
      host.set_ylim(0, math.log2(20)+.1)
    elif defined_values:
      host.set_ylim(min(defined_values)-0.1, max(defined_values)+0.1)
    # When no column has a defined value the automatic limits are kept;
    # calling min()/max() on an empty sequence would raise ValueError.
    undefined_positions = [i+1 for i, x in enumerate(data) if x is None]
    scatter_plot = []
    if undefined_positions:
      scatter_plot = [
        axes2.scatter(
          undefined_positions,
          [1] * len(undefined_positions),
          label = "All gap columns",
          color = "green"
        )
      ]
    gap_frq = gap_content_by_column(sequences)
    gap_plot = axes2.plot(
      list(range(1, len(gap_frq)+1)),
      gap_frq,
      label = "Gap frequency",
      color = "blue"
    )
    # Pass exactly one label per handle: the scatter entry is only present
    # when at least one column has no conservation value.
    handles = cons_plot + gap_plot + scatter_plot
    labels = ["Conservation", "Gap frequency"]
    if scatter_plot:
      labels.append("All Gap Columns")
    host.legend(handles, labels)
    fig.tight_layout()
    fig.savefig(outfile)
  finally:
    # pyplot keeps a reference to every figure it creates until it is closed.
    plt.close(fig)

Classes

class BackgroundFreq (value, names=None, *, module=None, qualname=None, type=None, start=1)

Types of amino acid background frequency.

Expand source code
class BackgroundFreq(Enum):
  """
  Types of amino acid background frequency.

  Each member selects the background frequency vector returned by
  get_background_frequencies.
  """
  # No substitution matrix: mapped to a vector of twenty 1.0 values.
  UNIFORM = 1
  # Background frequencies taken from BLOSUM_45_BG.
  BLOSUM45 = 2
  # Background frequencies taken from BLOSUM_50_BG.
  BLOSUM50 = 3
  # Background frequencies taken from BLOSUM_62_BG.
  BLOSUM62 = 4
  # Background frequencies taken from BLOSUM_80_BG.
  BLOSUM80 = 5
  # Background frequencies taken from BLOSUM_90_BG.
  BLOSUM90 = 6

Ancestors

  • enum.Enum

Class variables

var BLOSUM45
var BLOSUM50
var BLOSUM62
var BLOSUM80
var BLOSUM90
var UNIFORM