# Source code for ccHBGF.core
import logging
from typing import Literal
import numpy as np
from numpy.typing import NDArray
from ._adjacency import _construct_adj_matrix
from ._spectral import _spectral_partitioning
# Named 'ccHBGF' logger used for all progress messages in this module;
# ccHBGF(verbose=True) raises its level to INFO.
logger = logging.getLogger('ccHBGF')
# [docs]
def ccHBGF(clustering_matrix: NDArray,
n_clusters: int | None = None,
tol: float = 0.1,
init: Literal['orthogonal', 'kmeans++'] = 'orthogonal',
random_state: int | np.random.RandomState | np.random.Generator | None = None,
verbose: bool = False
) -> NDArray:
"""
Perform consensus clustering using Hybrid Bipartite Graph Formulation (HBGF).
This function performs consensus clustering on a `clustering_matrix`, which is a 2D array where each column
represents a clustering solution and each row represents an element being clustered. It constructs a bipartite
graph with vertices representing the clusters and elements, and then partitions the graph using spectral
partitioning to generate final cluster labels.
Parameters
----------
clustering_matrix : ndarray
A 2D array where each column represents a clustering solution, and each row represents an element being clustered.
n_clusters : int, optional
The number of clusters. If not provided, the function automatically detects the number of clusters.
tol : float, optional
The tolerance for scipy.sparse.linalg.svds(), where `0` is machine precision.
init : {'orthogonal', 'kmeans++'}, optional
Method for initializing KMeans centers. Default is 'orthogonal'.
random_state : {int, numpy.random.Generator, numpy.random.RandomState}, optional
Controls the randomness of the algorithm for reproducibility. Default is None.
verbose : bool, optional
Whether to print verbose output during processing. Default is False.
Returns
-------
ndarray
A 1D array of consensus clustering labels for the elements.
"""
# Check Input Parameters
assert init in ['orthogonal', 'kmeans++'], f"No center initialization method: {init}.\nAvailable methods:\n\t- 'orthogonal'\n\t- 'kmeans++'"
# Set verbosity level
if verbose:
logger.setLevel(logging.INFO)
# Define expected number of clusters, if not given
if not n_clusters:
n_clusters = int(np.max(np.apply_along_axis(lambda x: np.unique(x).size, 0, clustering_matrix)))
if n_clusters == 1:
logger.info('Only 1 cluster detected.')
return np.zeros(shape=clustering_matrix.shape[0])
logger.info(f'Detected {n_clusters} clusters.')
if n_clusters > 500:
logger.warning(f'Large numbers of clusters detected ({n_clusters}!). This may take a while.')
# Construct graph adjacency matrix (A)
A = _construct_adj_matrix(clustering_matrix)
logger.info(f'Graph adjacency matrix (A) constructed with shape {A.shape}')
# Derive cluster labels using spectral partitioning of graph
cluster_labels = _spectral_partitioning(A, n_clusters, tol, init, random_state)
logger.info('Consensus Labels Found')
return cluster_labels