Module xi_covutils.taxonomy

Taxonomy module

Function and classes to work with taxonomy.

Expand source code
"""
Taxonomy module

Function and classes to work with taxonomy.
"""
from collections import defaultdict
from functools import reduce
from typing import Optional, cast

import pandas as pd

class TaxonomyTree:
  """
  Taxonomy Tree class.
  """
  def __init__(self, children_to_parent_mapping:dict[int, int]):
    self.mapping = children_to_parent_mapping
    self.reverse_mapping = self._build_reverse_mapping(self.mapping)

  @staticmethod
  def _build_reverse_mapping(mapping:dict[int, int]) -> dict[int, list[int]]:
    result = defaultdict(list[int])
    for child, parent in mapping.items():
      result[parent].append(child)
    return result

  def get_parent(self, taxid:int) -> tuple[bool, Optional[int]]:
    """
    Get the parent of a Node.

    Args:
      taxid (int): A taxid.

    Returns:
      tuple[bool, Optional[int]]: If given taxid is in the taxonomy tree.
        The first value is True. If the given node has a parent return its
        taxid, if the node is the root, i.e. do no have a parent, returns None.
    """
    parent = self.mapping.get(taxid)
    if parent is None:
      return (False, None)
    if parent == taxid:
      return (True, None)
    return (True, parent)

  def valid_taxid(self, taxid:int) -> bool:
    """
    Checks if a given taxid, belongs to the Taxonomy Tree.

    Args:
      taxid (int): A taxid.

    Returns:
      bool: True if the given taxid is in the Taxonomy Tree.
    """
    return taxid in self.mapping or taxid in self.mapping.values()

  def is_parent(self, taxid:int) -> bool:
    """
    Checks if a given taxid, is a parent of another node in the
    Taxonomy Tree.

    Args:
      taxid (int): A taxid.

    Returns:
      bool: True if the given taxid is a parent node.
    """
    return taxid in self.mapping

  def retrieve_lineage(self, taxid:int) -> list[int]:
    """
    Retrieves the complete lineage from the root to the given
    taxid.

    Args:
      taxid (int): A taxid.

    Returns:
      list[int]: A list of taxids representing the complete lineage.
    """
    lineage = [taxid]
    node = taxid
    while True:
      parent, child = self.get_parent(node)
      if not parent:
        break
      if not child:
        break
      lineage.insert(0, child)
      node = child
    return lineage

  def common_lineage(self, taxids:list[int]) -> list[int]:
    """
    Gets the common lineage from the root for a group of nodes.

    Args:
      taxids (list[int]): A groups of nodes taxids.

    Returns:
      list[int]: The common lineage of all given nodes.
    """
    def _max_common_lineage(lin1: list[int], lin2:list[int]) -> list[int]:
      if lin1 == []:
        return lin2
      if lin2 == []:
        return lin1
      return [
        a
        for a, b in zip(lin1, lin2)
        if a == b
      ]
    all_lineages = [
      self.retrieve_lineage(node)
      for node in taxids
    ]
    return reduce(_max_common_lineage, all_lineages, [])

  def get_children(self, taxid:int) -> list[int]:
    """
    Get the children nodes of a given node in the Taxonomy Tree.

    Args:
      taxid (int): A taxid Node.

    Returns:
      list[int]: A list of the children nodes.
    """
    return self.reverse_mapping.get(taxid, [])

  def get_subtree_nodes(self, taxid:int) -> list[int]:
    """
    Gets all nodes in a subtree starting from a given node including the node
    itself.

    Args:
      taxid (int): A taxid Node.

    Returns:
      list[int]: A list of the children Nodes.
    """
    result = []
    to_visit = [taxid]
    while to_visit:
      c_id = to_visit.pop()
      if not self.valid_taxid(c_id):
        continue
      result.append(c_id)
      c_children = self.get_children(c_id)
      for child in c_children:
        if not child in result:
          to_visit.append(child)
    return result

  def lineage_contains(self, query: int, subject:set[int]) -> Optional[int]:
    """
    Check if the lineage of a taxid contains any of the given suject taxids.
    Checks from terminal leaf to root of the lineage. The first element that
    is in subject is returned.

    Args:
      query (int): a Query Tax Id.
      subject (set[int]): A set of taxids.

    Returns:
      Optional[int]: A first taxid in the lineage that is found in subject.
    """
    lineage = self.retrieve_lineage(query)
    for c_id in reversed(lineage):
      if c_id in subject:
        return c_id
    return None

def create_children_to_parent_mapping(
    taxonomy_nodes_file:str
  ) -> dict[int, int]:
  """
  Create a dictionary mapping children taxids to their parents
  reading a NCBI taxonomy nodes dump file.
  Processing this file can be time consuming, is better to cache this function
  results to reuse it.

  https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz

  Args:
    taxonomy_nodes_file (str): the path to the taxonomy nodes dump file.

  Returns:
    dict[int, int]: A dict from child taxids to their parents.
  """
  mapping = (
    pd
      .read_csv(
        taxonomy_nodes_file,
        sep="\t",
        header = None
      )
      .iloc[:, [0, 2]]
      .rename(
        columns={0:"taxid", 2:"parent"}
      )
      .set_index(
        "taxid"
      )
      ["parent"]
      .to_dict()
  )
  mapping = cast(dict[int, int], mapping)
  return mapping

def get_species_level_taxids(
    taxonomy_nodes_file: str
  ) -> list[int]:
  """
  Retrieves all taxids from species level taxonomy
  reading a NCBI taxonomy nodes dump file.
  Processing this file can be time consuming, is better to cache this function
  results to reuse it.

  https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz

  Args:
    taxonomy_nodes_file (str): the path to the taxonomy nodes dump file.

  Returns:
    list[int]: A list with all species level taxids.
  """
  species = (
    pd
      .read_csv(
        taxonomy_nodes_file,
        sep="\t",
        header=None
      )
      .iloc[:, [0, 4]]
      .rename(
        columns = {0:"taxid", 4:"rank"}
      )
      .query("rank=='species'")
      .taxid
      .to_list()
  )
  return species

def read_taxonomy_names_dump_file(
    taxonomy_namesdump_file:str
  ) -> pd.DataFrame:
  """
  Read the taxonomy names dump file into Pandas DataFrame.

  Args:
    taxonomy_namesdump_file (str): The taxonomy dump file.

  Returns:
    pd.DataFrame: A pandas DataFrame
  """
  data = (
    pd.read_csv(
      taxonomy_namesdump_file,
      sep="\t",
      header=None
    )
    .drop([1, 3, 5, 7], axis=1)
    .rename(
      columns={
        0:"taxid",
        2:"name",
        4:"unique_name",
        6:"category_name"
      }
    )
  )
  return data

def create_names_to_taxid_mapping(
    taxonomy_namesdump_file:str
  ) -> dict[str, int]:
  """
  Generates a mappping from names, and unique names to the taxid.
  Reads the NCBI taxonomy names dump file.
  Processing this file can be time consuming, is better to cache this function
  results to reuse it.

  https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz

  Returns:
    dict[str, int]: The mapping from names and unique names to taxid.
  """
  result:dict[str, int] = {}
  data = read_taxonomy_names_dump_file(taxonomy_namesdump_file)
  for _, row in data.iterrows():
    taxid:int = int(row["taxid"])
    name:str = str(row["name"])
    unique:str = str(row["unique_name"])
    if name != 'nan':
      result[name] = taxid
    if unique != 'nan':
      result[unique] = taxid
  return result

def create_taxid_to_names_mapping(
    taxonomy_namesdump_file:str
  ) -> dict[int, str]:
  """
  Generates a mappping from taxid to scientific names.
  Reads the NCBI taxonomy names dump file.
  Processing this file can be time consuming, is better to cache this function
  results to reuse it.

  https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz

  Returns:
    dict[int, str]: The mapping from taxid to scientific names.
  """
  result:dict[int, str] = {}
  data = read_taxonomy_names_dump_file(taxonomy_namesdump_file)
  for _, row in data.iterrows():
    if row["category_name"] != "scientific name":
      continue
    taxid:int = int(row["taxid"])
    name:str = str(row["name"])
    result[taxid] = name
  return result

Functions

def create_children_to_parent_mapping(taxonomy_nodes_file: str) ‑> dict

Create a dictionary mapping children taxids to their parents reading a NCBI taxonomy nodes dump file. Processing this file can be time consuming, is better to cache this function results to reuse it.

https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz

Args

taxonomy_nodes_file : str
the path to the taxonomy nodes dump file.

Returns

dict[int, int]
A dict from child taxids to their parents.
Expand source code
def create_children_to_parent_mapping(
    taxonomy_nodes_file:str
  ) -> dict[int, int]:
  """
  Create a dictionary mapping children taxids to their parents
  reading a NCBI taxonomy nodes dump file.
  Processing this file can be time consuming, is better to cache this function
  results to reuse it.

  https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz

  Args:
    taxonomy_nodes_file (str): the path to the taxonomy nodes dump file.

  Returns:
    dict[int, int]: A dict from child taxids to their parents.
  """
  mapping = (
    pd
      .read_csv(
        taxonomy_nodes_file,
        sep="\t",
        header = None
      )
      .iloc[:, [0, 2]]
      .rename(
        columns={0:"taxid", 2:"parent"}
      )
      .set_index(
        "taxid"
      )
      ["parent"]
      .to_dict()
  )
  mapping = cast(dict[int, int], mapping)
  return mapping
def create_names_to_taxid_mapping(taxonomy_namesdump_file: str) ‑> dict

Generates a mappping from names, and unique names to the taxid. Reads the NCBI taxonomy names dump file. Processing this file can be time consuming, is better to cache this function results to reuse it.

https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz

Returns

dict[str, int]
The mapping from names and unique names to taxid.
Expand source code
def create_names_to_taxid_mapping(
    taxonomy_namesdump_file:str
  ) -> dict[str, int]:
  """
  Generates a mappping from names, and unique names to the taxid.
  Reads the NCBI taxonomy names dump file.
  Processing this file can be time consuming, is better to cache this function
  results to reuse it.

  https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz

  Returns:
    dict[str, int]: The mapping from names and unique names to taxid.
  """
  result:dict[str, int] = {}
  data = read_taxonomy_names_dump_file(taxonomy_namesdump_file)
  for _, row in data.iterrows():
    taxid:int = int(row["taxid"])
    name:str = str(row["name"])
    unique:str = str(row["unique_name"])
    if name != 'nan':
      result[name] = taxid
    if unique != 'nan':
      result[unique] = taxid
  return result
def create_taxid_to_names_mapping(taxonomy_namesdump_file: str) ‑> dict

Generates a mappping from taxid to scientific names. Reads the NCBI taxonomy names dump file. Processing this file can be time consuming, is better to cache this function results to reuse it.

https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz

Returns

dict[int, str]
The mapping from taxid to scientific names.
Expand source code
def create_taxid_to_names_mapping(
    taxonomy_namesdump_file:str
  ) -> dict[int, str]:
  """
  Generates a mappping from taxid to scientific names.
  Reads the NCBI taxonomy names dump file.
  Processing this file can be time consuming, is better to cache this function
  results to reuse it.

  https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz

  Returns:
    dict[int, str]: The mapping from taxid to scientific names.
  """
  result:dict[int, str] = {}
  data = read_taxonomy_names_dump_file(taxonomy_namesdump_file)
  for _, row in data.iterrows():
    if row["category_name"] != "scientific name":
      continue
    taxid:int = int(row["taxid"])
    name:str = str(row["name"])
    result[taxid] = name
  return result
def get_species_level_taxids(taxonomy_nodes_file: str) ‑> list

Retrieves all taxids from species level taxonomy reading a NCBI taxonomy nodes dump file. Processing this file can be time consuming, is better to cache this function results to reuse it.

https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz

Args

taxonomy_nodes_file : str
the path to the taxonomy nodes dump file.

Returns

list[int]
A list with all species level taxids.
Expand source code
def get_species_level_taxids(
    taxonomy_nodes_file: str
  ) -> list[int]:
  """
  Retrieves all taxids from species level taxonomy
  reading a NCBI taxonomy nodes dump file.
  Processing this file can be time consuming, is better to cache this function
  results to reuse it.

  https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz

  Args:
    taxonomy_nodes_file (str): the path to the taxonomy nodes dump file.

  Returns:
    list[int]: A list with all species level taxids.
  """
  species = (
    pd
      .read_csv(
        taxonomy_nodes_file,
        sep="\t",
        header=None
      )
      .iloc[:, [0, 4]]
      .rename(
        columns = {0:"taxid", 4:"rank"}
      )
      .query("rank=='species'")
      .taxid
      .to_list()
  )
  return species
def read_taxonomy_names_dump_file(taxonomy_namesdump_file: str) ‑> pandas.core.frame.DataFrame

Read the taxonomy names dump file into Pandas DataFrame.

Args

taxonomy_namesdump_file : str
The taxonomy dump file.

Returns

pd.DataFrame
A pandas DataFrame
Expand source code
def read_taxonomy_names_dump_file(
    taxonomy_namesdump_file:str
  ) -> pd.DataFrame:
  """
  Read the taxonomy names dump file into Pandas DataFrame.

  Args:
    taxonomy_namesdump_file (str): The taxonomy dump file.

  Returns:
    pd.DataFrame: A pandas DataFrame
  """
  data = (
    pd.read_csv(
      taxonomy_namesdump_file,
      sep="\t",
      header=None
    )
    .drop([1, 3, 5, 7], axis=1)
    .rename(
      columns={
        0:"taxid",
        2:"name",
        4:"unique_name",
        6:"category_name"
      }
    )
  )
  return data

Classes

class TaxonomyTree (children_to_parent_mapping: dict)

Taxonomy Tree class.

Expand source code
class TaxonomyTree:
  """
  Taxonomy Tree class.
  """
  def __init__(self, children_to_parent_mapping:dict[int, int]):
    self.mapping = children_to_parent_mapping
    self.reverse_mapping = self._build_reverse_mapping(self.mapping)

  @staticmethod
  def _build_reverse_mapping(mapping:dict[int, int]) -> dict[int, list[int]]:
    result = defaultdict(list[int])
    for child, parent in mapping.items():
      result[parent].append(child)
    return result

  def get_parent(self, taxid:int) -> tuple[bool, Optional[int]]:
    """
    Get the parent of a Node.

    Args:
      taxid (int): A taxid.

    Returns:
      tuple[bool, Optional[int]]: If given taxid is in the taxonomy tree.
        The first value is True. If the given node has a parent return its
        taxid, if the node is the root, i.e. do no have a parent, returns None.
    """
    parent = self.mapping.get(taxid)
    if parent is None:
      return (False, None)
    if parent == taxid:
      return (True, None)
    return (True, parent)

  def valid_taxid(self, taxid:int) -> bool:
    """
    Checks if a given taxid, belongs to the Taxonomy Tree.

    Args:
      taxid (int): A taxid.

    Returns:
      bool: True if the given taxid is in the Taxonomy Tree.
    """
    return taxid in self.mapping or taxid in self.mapping.values()

  def is_parent(self, taxid:int) -> bool:
    """
    Checks if a given taxid, is a parent of another node in the
    Taxonomy Tree.

    Args:
      taxid (int): A taxid.

    Returns:
      bool: True if the given taxid is a parent node.
    """
    return taxid in self.mapping

  def retrieve_lineage(self, taxid:int) -> list[int]:
    """
    Retrieves the complete lineage from the root to the given
    taxid.

    Args:
      taxid (int): A taxid.

    Returns:
      list[int]: A list of taxids representing the complete lineage.
    """
    lineage = [taxid]
    node = taxid
    while True:
      parent, child = self.get_parent(node)
      if not parent:
        break
      if not child:
        break
      lineage.insert(0, child)
      node = child
    return lineage

  def common_lineage(self, taxids:list[int]) -> list[int]:
    """
    Gets the common lineage from the root for a group of nodes.

    Args:
      taxids (list[int]): A groups of nodes taxids.

    Returns:
      list[int]: The common lineage of all given nodes.
    """
    def _max_common_lineage(lin1: list[int], lin2:list[int]) -> list[int]:
      if lin1 == []:
        return lin2
      if lin2 == []:
        return lin1
      return [
        a
        for a, b in zip(lin1, lin2)
        if a == b
      ]
    all_lineages = [
      self.retrieve_lineage(node)
      for node in taxids
    ]
    return reduce(_max_common_lineage, all_lineages, [])

  def get_children(self, taxid:int) -> list[int]:
    """
    Get the children nodes of a given node in the Taxonomy Tree.

    Args:
      taxid (int): A taxid Node.

    Returns:
      list[int]: A list of the children nodes.
    """
    return self.reverse_mapping.get(taxid, [])

  def get_subtree_nodes(self, taxid:int) -> list[int]:
    """
    Gets all nodes in a subtree starting from a given node including the node
    itself.

    Args:
      taxid (int): A taxid Node.

    Returns:
      list[int]: A list of the children Nodes.
    """
    result = []
    to_visit = [taxid]
    while to_visit:
      c_id = to_visit.pop()
      if not self.valid_taxid(c_id):
        continue
      result.append(c_id)
      c_children = self.get_children(c_id)
      for child in c_children:
        if not child in result:
          to_visit.append(child)
    return result

  def lineage_contains(self, query: int, subject:set[int]) -> Optional[int]:
    """
    Check if the lineage of a taxid contains any of the given suject taxids.
    Checks from terminal leaf to root of the lineage. The first element that
    is in subject is returned.

    Args:
      query (int): a Query Tax Id.
      subject (set[int]): A set of taxids.

    Returns:
      Optional[int]: A first taxid in the lineage that is found in subject.
    """
    lineage = self.retrieve_lineage(query)
    for c_id in reversed(lineage):
      if c_id in subject:
        return c_id
    return None

Methods

def common_lineage(self, taxids: list) ‑> list

Gets the common lineage from the root for a group of nodes.

Args

taxids : list[int]
A groups of nodes taxids.

Returns

list[int]
The common lineage of all given nodes.
Expand source code
def common_lineage(self, taxids:list[int]) -> list[int]:
  """
  Gets the common lineage from the root for a group of nodes.

  Args:
    taxids (list[int]): A groups of nodes taxids.

  Returns:
    list[int]: The common lineage of all given nodes.
  """
  def _max_common_lineage(lin1: list[int], lin2:list[int]) -> list[int]:
    if lin1 == []:
      return lin2
    if lin2 == []:
      return lin1
    return [
      a
      for a, b in zip(lin1, lin2)
      if a == b
    ]
  all_lineages = [
    self.retrieve_lineage(node)
    for node in taxids
  ]
  return reduce(_max_common_lineage, all_lineages, [])
def get_children(self, taxid: int) ‑> list

Get the children nodes of a given node in the Taxonomy Tree.

Args

taxid : int
A taxid Node.

Returns

list[int]
A list of the children nodes.
Expand source code
def get_children(self, taxid:int) -> list[int]:
  """
  Get the children nodes of a given node in the Taxonomy Tree.

  Args:
    taxid (int): A taxid Node.

  Returns:
    list[int]: A list of the children nodes.
  """
  return self.reverse_mapping.get(taxid, [])
def get_parent(self, taxid: int) ‑> tuple

Get the parent of a Node.

Args

taxid : int
A taxid.

Returns

tuple[bool, Optional[int]]
If given taxid is in the taxonomy tree. The first value is True. If the given node has a parent return its taxid, if the node is the root, i.e. do no have a parent, returns None.
Expand source code
def get_parent(self, taxid:int) -> tuple[bool, Optional[int]]:
  """
  Get the parent of a Node.

  Args:
    taxid (int): A taxid.

  Returns:
    tuple[bool, Optional[int]]: If given taxid is in the taxonomy tree.
      The first value is True. If the given node has a parent return its
      taxid, if the node is the root, i.e. do no have a parent, returns None.
  """
  parent = self.mapping.get(taxid)
  if parent is None:
    return (False, None)
  if parent == taxid:
    return (True, None)
  return (True, parent)
def get_subtree_nodes(self, taxid: int) ‑> list

Gets all nodes in a subtree starting from a given node including the node itself.

Args

taxid : int
A taxid Node.

Returns

list[int]
A list of the children Nodes.
Expand source code
def get_subtree_nodes(self, taxid:int) -> list[int]:
  """
  Gets all nodes in a subtree starting from a given node including the node
  itself.

  Args:
    taxid (int): A taxid Node.

  Returns:
    list[int]: A list of the children Nodes.
  """
  result = []
  to_visit = [taxid]
  while to_visit:
    c_id = to_visit.pop()
    if not self.valid_taxid(c_id):
      continue
    result.append(c_id)
    c_children = self.get_children(c_id)
    for child in c_children:
      if not child in result:
        to_visit.append(child)
  return result
def is_parent(self, taxid: int) ‑> bool

Checks if a given taxid, is a parent of another node in the Taxonomy Tree.

Args

taxid : int
A taxid.

Returns

bool
True if the given taxid is a parent node.
Expand source code
def is_parent(self, taxid:int) -> bool:
  """
  Checks if a given taxid, is a parent of another node in the
  Taxonomy Tree.

  Args:
    taxid (int): A taxid.

  Returns:
    bool: True if the given taxid is a parent node.
  """
  return taxid in self.mapping
def lineage_contains(self, query: int, subject: set) ‑> Optional[int]

Check if the lineage of a taxid contains any of the given suject taxids. Checks from terminal leaf to root of the lineage. The first element that is in subject is returned.

Args

query : int
a Query Tax Id.
subject : set[int]
A set of taxids.

Returns

Optional[int]
A first taxid in the lineage that is found in subject.
Expand source code
def lineage_contains(self, query: int, subject:set[int]) -> Optional[int]:
  """
  Check if the lineage of a taxid contains any of the given suject taxids.
  Checks from terminal leaf to root of the lineage. The first element that
  is in subject is returned.

  Args:
    query (int): a Query Tax Id.
    subject (set[int]): A set of taxids.

  Returns:
    Optional[int]: A first taxid in the lineage that is found in subject.
  """
  lineage = self.retrieve_lineage(query)
  for c_id in reversed(lineage):
    if c_id in subject:
      return c_id
  return None
def retrieve_lineage(self, taxid: int) ‑> list

Retrieves the complete lineage from the root to the given taxid.

Args

taxid : int
A taxid.

Returns

list[int]
A list of taxids representing the complete lineage.
Expand source code
def retrieve_lineage(self, taxid:int) -> list[int]:
  """
  Retrieves the complete lineage from the root to the given
  taxid.

  Args:
    taxid (int): A taxid.

  Returns:
    list[int]: A list of taxids representing the complete lineage.
  """
  lineage = [taxid]
  node = taxid
  while True:
    parent, child = self.get_parent(node)
    if not parent:
      break
    if not child:
      break
    lineage.insert(0, child)
    node = child
  return lineage
def valid_taxid(self, taxid: int) ‑> bool

Checks if a given taxid, belongs to the Taxonomy Tree.

Args

taxid : int
A taxid.

Returns

bool
True if the given taxid is in the Taxonomy Tree.
Expand source code
def valid_taxid(self, taxid:int) -> bool:
  """
  Checks if a given taxid, belongs to the Taxonomy Tree.

  Args:
    taxid (int): A taxid.

  Returns:
    bool: True if the given taxid is in the Taxonomy Tree.
  """
  return taxid in self.mapping or taxid in self.mapping.values()