Module xi_covutils.clustering
Clustering functions
Expand source code
"""
Clustering functions
"""
from collections import defaultdict
from typing import List, Optional
from xi_covutils.identity import IdentityCalculator
from xi_covutils.msa import gapstrip_sequences
class Cluster(): # pylint: disable=too-few-public-methods
    """
    Plain container describing one cluster of sequences.

    Instances start empty; the clustering functions of this module fill in
    the fields as sequences are assigned.
    """

    def __init__(self):
        # Sequence chosen to stand for the whole cluster (None until set).
        self.representative:Optional[str] = None
        # Position of the representative in the clustered input (None until set).
        self.representative_index:Optional[int] = None
        # Member sequences, in the order they were added.
        self.sequences = []
        # Input positions of the member sequences.
        self.indexes = []
        # Number of member sequences.
        self.nseq = 0

    def __repr__(self):
        members = ', '.join(self.sequences)
        return f"Cluster:[{self.nseq}][{self.representative}] {members}"
# pylint: disable=too-many-locals
def hobohm1(
    sequences:list[str],
    identity_cutoff:float=0.62,
    use_gapstrip:bool=False,
    use_c_extension:bool=True,
    max_clusters:float=float('inf')
) -> list[Cluster]:
    """
    Performs a sequence clustering using Hobohm algorithm 1.

    Implementation of clustering algorithm 1 published in:
    Hobohm U, Scharf M, Schneider R, Sander C. Selection of representative
    protein data sets.
    Protein Sci. 1992;1(3):409-17.
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2142204/pdf/1304348.pdf

    Sequences are visited in input order. Each one is compared against the
    representative of every existing cluster, in cluster creation order, and
    joins the first cluster whose representative reaches the identity cutoff.
    If none does, the sequence starts a new cluster and becomes its
    representative.

    Args:
        sequences (list[str]): Input sequences.
        identity_cutoff (float, optional): A float between 0 and 1, used as
            cutoff for including a new sequence in a cluster.
            Defaults to 0.62.
        use_gapstrip (bool, optional): If True, columns of all sequences that
            are gaps in every sequence are removed. This does not affect the
            results and may improve the performance. Defaults to False.
        use_c_extension (bool, optional): If True, C language extension is
            used to compute sequence identity. If False, pure python
            implementation is used. Defaults to True.
        max_clusters (float, optional): The max number of clusters to return.
            Processing stops as soon as this many clusters exist, so the
            remaining input sequences are not assigned to any cluster.
            Defaults to float('inf').

    Returns:
        list[Cluster]: A list of clusters, in creation order.
    """
    select: List[Cluster] = []
    sequences = gapstrip_sequences(sequences) if use_gapstrip else sequences
    if use_c_extension:
        ic = IdentityCalculator()
        id_function = ic.identity_fraction
    else:
        id_function = sequence_identity
    for i, seq in enumerate(sequences):
        # Checked before touching the sequence so that a non-positive
        # max_clusters yields no clusters at all, instead of one.
        if len(select) >= max_clusters:
            break
        for clu in select:
            representative = clu.representative
            if not representative:
                continue
            identity = id_function(seq, representative)
            # Compare against None explicitly: an identity of 0.0 is a valid
            # result and must still be tested against the cutoff.
            # NOTE(review): assumes id_function may return None when it has
            # no result -- confirm against IdentityCalculator.
            if identity is not None and identity >= identity_cutoff:
                clu.sequences.append(seq)
                clu.indexes.append(i)
                clu.nseq += 1
                break
        else:
            # No representative was close enough: seq founds a new cluster.
            cluster = Cluster()
            cluster.representative = seq
            cluster.representative_index = i
            cluster.sequences.append(seq)
            cluster.indexes.append(i)
            cluster.nseq = 1
            select.append(cluster)
    return select
def sequence_identity(seq1:str, seq2:str) -> float:
    """
    Return the fraction of identical positions between two aligned sequences.

    The comparison is case sensitive. Positions where both sequences carry a
    gap character ("-" or ".") are left out of the calculation entirely.
    If no position is left to compare, the result is 0.0.

    Args:
        seq1 (str): A sequence.
        seq2 (str): A sequence.

    Raises:
        ValueError: If sequences have different lengths.

    Returns:
        float: A value between 0.0 (dissimilar sequences) and 1.0 (identical
            sequences).
    """
    if len(seq1) != len(seq2):
        raise ValueError("Sequence length is not equal")
    gap_chars = ("-", ".")
    compared = 0
    matches = 0
    for left, right in zip(seq1, seq2):
        if left in gap_chars and right in gap_chars:
            # Column is a gap in both sequences: not counted at all.
            continue
        compared += 1
        if left == right:
            matches += 1
    # max() keeps the division defined when every column was skipped.
    return float(matches) / max(1, compared)
def _build_kmers(sequence, kmer_size=3):
return {sequence[x:y] for x, y in zip(
range(len(sequence)), range(kmer_size, len(sequence)+1))}
def _build_kmer_map(kmers):
result = defaultdict(set)
for seq_id, kmer_set in kmers.items():
for kmer in kmer_set:
result[kmer].add(seq_id)
return result
def _closest_kmer(kmer_set, exclude, kmers_map, seq_map, include):
index_counter = defaultdict(int)
max_index = None
max_count = 0
for kmer in kmer_set:
seq_indexes = (kmers_map[kmer]-{exclude}) & include
for s_index in seq_indexes:
index_counter[s_index] += 1
if index_counter[s_index] > max_count:
max_count = index_counter[s_index]
max_index = s_index
if max_index is not None:
return (
max_index,
float(index_counter[max_index])
/ min(len(kmer_set), len(seq_map[max_index]))
)
return None
def kmer_clustering(
    sequences: List[str],
    kmer_length:int=3,
    identity_cutoff:float=0.62
) -> list[Cluster]:
    """
    Makes a simple sequence clustering based on kmer content.

    The kmer identity of two sequences is the number of equal kmers divided
    by the number of kmers of the sequence that has fewer kmers.
    Repeated kmers in a sequence are counted once.

    Sequences are visited in input order. Each one is compared with the
    already clustered sequence that shares the most kmers with it; if their
    kmer identity reaches the cutoff it joins that sequence's cluster,
    otherwise it starts a new cluster and becomes its representative.

    Args:
        sequences (List[str]): A list of sequences.
        kmer_length (int, optional): The Kmer length. Defaults to 3.
        identity_cutoff (float, optional): Kmer cutoff value for sequences in
            the same cluster. Defaults to 0.62.

    Returns:
        list[Cluster]: A list of clusters, in creation order. For every
            cluster, `sequences` and `indexes` list its members and their
            positions in the input.
    """
    sequences_map = dict(enumerate(sequences))
    seq_map = {
        index: _build_kmers(seq, kmer_length)
        for index, seq in sequences_map.items()
    }
    kmer_map = _build_kmer_map(seq_map)
    # Maps every already processed sequence index to the cluster holding it.
    cluster_map = {}
    result: List[Cluster] = []
    for index, kmers in seq_map.items():
        # Only already clustered sequences are eligible, so a returned index
        # is always a key of cluster_map.
        closest = _closest_kmer(
            kmers, index, kmer_map, seq_map, cluster_map.keys()
        )
        if closest is not None and closest[1] >= identity_cutoff:
            cluster = cluster_map[closest[0]]
        else:
            cluster = Cluster()
            cluster.representative = sequences_map[index]
            cluster.representative_index = index
            result.append(cluster)
        cluster.sequences.append(sequences_map[index])
        # Record member positions too, as hobohm1 does for its clusters.
        cluster.indexes.append(index)
        cluster.nseq += 1
        cluster_map[index] = cluster
    return result
Functions
def hobohm1(sequences: list, identity_cutoff: float = 0.62, use_gapstrip: bool = False, use_c_extension: bool = True, max_clusters: float = inf) ‑> list
-
Performs a sequence clustering using Hobohm algorithm 1.
Implementation of clustering algorithm 1 published in: Hobohm U, Scharf M, Schneider R, Sander C. Selection of representative protein data sets. Protein Sci. 1992;1(3):409-17. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2142204/pdf/1304348.pdf
Args
sequences
:list[str]
- Input sequences.
identity_cutoff
:float
, optional- A float between 0 and 1, used as cutoff for including a new sequence in a cluster. Defaults to 0.62.
use_gapstrip
:bool
, optional- If True, columns of all sequences that are gaps in every sequence are removed. This does not affect the results and may improve the performance. Defaults to False.
use_c_extension
:bool
, optional- If true, C language extension is used to compute sequence identity. If False, pure python implementation is used. Defaults to True.
max_clusters
:float
, optional- The max number of clusters to return. Defaults to float('inf').
Returns
list[Cluster]
- A list of clusters.
Expand source code
def hobohm1( sequences:list[str], identity_cutoff:float=0.62, use_gapstrip:bool=False, use_c_extension:bool=True, max_clusters:float=float('inf') ) -> list[Cluster] : """ Performs a sequence clustering using Hobohm algorithm 1. Implementation of clusternig algorithm 1 published in: Hobohm U, Scharf M, Schneider R, Sander C. Selection of representative protein data sets. Protein Sci. 1992;1(3):409-17. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2142204/pdf/1304348.pdf Args: sequences (list[str]): Input sequences. identity_cutoff (float, optional): A float between 0 and 1, used as cutoff for including a new sequence in a cluster. . Defaults to 0.62. use_gapstrip (bool, optional): If True, columns of all sequences that are gaps in every sequence are removed. This not affect the results and may improve the performance. Defaults to False. use_c_extension (bool, optional): If true, C language extension is used to compute sequence identity. If False, pure python implementation is used. Defaults to True. max_clusters (float, optional): The max number of clusters to return. Defaults to float('inf'). Returns: list[Cluster]: A list of clusters. 
""" select: List[Cluster] = [] sequences = gapstrip_sequences(sequences) if use_gapstrip else sequences if use_c_extension: ic = IdentityCalculator() id_function = ic.identity_fraction else: id_function = sequence_identity for i, seq in enumerate(sequences): should_add_new_cluster = True add_to_cluster = None for clu in select: representative = clu.representative if not representative: continue identity = id_function(seq, representative) if identity and identity >= identity_cutoff: should_add_new_cluster = False add_to_cluster = clu break if should_add_new_cluster: cluster = Cluster() cluster.representative = seq cluster.representative_index = i cluster.sequences.append(seq) cluster.indexes.append(i) cluster.nseq = 1 select.append(cluster) if add_to_cluster: add_to_cluster.sequences.append(seq) add_to_cluster.indexes.append(i) add_to_cluster.nseq += 1 if len(select) >= max_clusters: break return select
def kmer_clustering(sequences: List[str], kmer_length: int = 3, identity_cutoff: float = 0.62) ‑> list
-
Makes a simple sequence clustering based on kmer content.
The kmer identity of two sequences is the number of equal kmers divided by the number of kmers of the sequence that has fewer kmers. Repeated kmers in a sequence are counted once.
Args
sequences
:List[str]
- A list of sequences.
kmer_length
:int
, optional- The Kmer length. Defaults to 3.
identity_cutoff
:float
, optional- Kmer cutoff value for sequences in the same cluster. Defaults to 0.62.
Returns
list[Cluster]
- A list of clusters.
Expand source code
def kmer_clustering( sequences: List[str], kmer_length:int=3, identity_cutoff:float=0.62 ) -> list[Cluster]: """ Makes a simple sequence clustering bases on kmer content. The kmers identity of two sequence is the number of equal kmers divided by the number of kmers of the sequence that has fewer kmers. Repeated kmers in a sequences are counted once. Args: sequences (List[str]): A list of sequences. kmer_length (int, optional): The Kmer length. Defaults to 3. identity_cutoff (float, optional): Kmer cutoff value for sequences in the same cluster. Defaults to 0.62. Returns: list[Cluster]: A list of clusters. """ sequences_map = dict(enumerate(sequences)) seq_map = { x: _build_kmers(seq, kmer_length) for x, seq in sequences_map.items() } kmer_map = _build_kmer_map(seq_map) cluster_map = defaultdict(Cluster) for i, kmers in seq_map.items(): closest = _closest_kmer( kmers, i, kmer_map, seq_map, cluster_map.keys() ) if ( closest is not None and closest[0] in cluster_map and closest[1] >= identity_cutoff ): cluster_map[closest[0]].sequences.append(sequences_map[i]) cluster_map[closest[0]].nseq += 1 cluster_map[i] = cluster_map[closest[0]] else: cluster_map[i].sequences.append(sequences_map[i]) cluster_map[i].representative_index = i cluster_map[i].nseq = 1 result:List[Cluster] = [] visited = set() for cluster in cluster_map.values(): if cluster.representative_index not in visited: visited.add(cluster.representative_index) result.append(cluster) for res in result: if res.representative_index is None: res.representative = None continue res.representative = sequences_map[res.representative_index] return result
def sequence_identity(seq1: str, seq2: str) ‑> float
-
Computes sequence identity for two sequences.
Lower and upper case characters are assumed to be different. Gapped positions in both sequences are not considered for the calculation.
Args
seq1
:str
- A sequence.
seq2
:str
- A sequence.
Raises
ValueError
- If sequences have different lengths.
Returns
float
- A value between 0.0 (dissimilar sequences) and 1.0 (identical sequences).
Expand source code
def sequence_identity(seq1:str, seq2:str) -> float: """ Computes sequence identity for two sequences. Lower and upper case characters are assumed to be different. Gapped positions in both sequences are not considered for the calculation. Args: seq1 (str): A sequence. seq2 (str): A sequence. Raises: ValueError: If sequences have different lengths. Returns: float: A value between 0.0 (dissimilar sequences) and 1.0 (identical sequences). """ if not len(seq1) == len(seq2): raise ValueError("Sequence length is not equal") equals = 0 total = 0 for i, char_a in enumerate(seq1): char_b = seq2[i] if not (char_a in ("-", ".") and (char_b in ("-", "."))): total += 1 if char_a == char_b: equals += 1 return float(equals) / max(1, total)
Classes
class Cluster
-
Simple class to represent sequence clusters.
Expand source code
class Cluster(): # pylint: disable=too-few-public-methods """ Simple class to represent sequence clusters. """ def __init__(self): self.representative:Optional[str] = None self.representative_index:Optional[int] = None self.sequences = [] self.indexes = [] self.nseq = 0 def __repr__(self): return ( "Cluster:" f"[{self.nseq}]" f"[{self.representative}]" f" {', '.join(self.sequences)}" )