Clustering of scikit-learn toy data sets

[1]:
import sys

import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
from sklearn import datasets
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import KDTree
from sklearn.preprocessing import StandardScaler

from cnnclustering import cluster
from cnnclustering import _types

The notebook was created using Python 3.8.

[2]:
# Version information
print(sys.version)
3.8.8 (default, Mar 11 2021, 08:58:19)
[GCC 8.3.0]

Notebook configuration

[3]:
# Matplotlib configuration
mpl.rc_file(
    "../../matplotlibrc",
    use_default_template=False
)
[4]:
# Axis property defaults for the plots
ax_props = {
    "xlabel": None,
    "ylabel": None,
    "xlim": (-2.5, 2.5),
    "ylim": (-2.5, 2.5),
    "xticks": (),
    "yticks": (),
    "aspect": "equal"
}

# Line plot property defaults
line_props = {
    "linewidth": 0,
    "marker": '.',
}

Data set generation

To see common-nearest-neighbours clustering in action, let's have a look at a handful of basic 2D data sets from scikit-learn (similar to the cluster comparison in the scikit-learn documentation).

[5]:
# Data set generation parameters
np.random.seed(0)
n_samples = 2000
[6]:
# Data set generation
# circles
noisy_circles, _ = datasets.make_circles(
    n_samples=n_samples,
    factor=.5,
    noise=.05
    )

# moons
noisy_moons, _ = datasets.make_moons(
    n_samples=n_samples,
    noise=.05
    )

# blobs
blobs, _ = datasets.make_blobs(
    n_samples=n_samples,
    random_state=8
    )

# None (no structure: uniformly random points)
no_structure = np.random.rand(
    n_samples, 2
    )

# aniso (anisotropically distributed blobs)
random_state = 170
X, y = datasets.make_blobs(
    n_samples=n_samples,
    random_state=random_state
    )

transformation = [[0.6, -0.6], [-0.4, 0.8]]
aniso = np.dot(X, transformation)

# varied
varied, _ = datasets.make_blobs(
    n_samples=n_samples,
    cluster_std=[1.0, 2.5, 0.5],
    random_state=random_state
    )
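
A short aside on the "aniso" set just generated: it is simply the blob coordinates passed through a fixed 2 x 2 linear map, which stretches and shears the clusters so they are no longer isotropic. Applied to the two unit vectors, the map just returns the rows of the transformation matrix:

import numpy as np

T = np.array([[0.6, -0.6], [-0.4, 0.8]])
unit_vectors = np.array([[1.0, 0.0], [0.0, 1.0]])

# Each row (point) is mapped to row @ T, as in np.dot(X, transformation) above
print(unit_vectors @ T)  # [[ 0.6 -0.6]
                         #  [-0.4  0.8]]
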
[7]:
# Define cluster parameters
dsets = [  # ("name", data set, fit parameters)
    ('circles', noisy_circles, {
        'radius_cutoff': 0.5,
        'cnn_cutoff': 20,
        'member_cutoff': 100,
        'max_clusters': None
        }),
    ('moons', noisy_moons, {
        'radius_cutoff': 0.5,
        'cnn_cutoff': 20,
        'member_cutoff': 2,
        'max_clusters': None
        }),
    ('varied', varied, {
        'radius_cutoff': 0.28,
        'cnn_cutoff': 20,
        'member_cutoff': 20,
        'max_clusters': None
        }),
    ('aniso', aniso, {
        'radius_cutoff': 0.29,
        'cnn_cutoff': 30,
        'member_cutoff': 5,
        'max_clusters': None
         }),
    ('blobs', blobs, {
        'radius_cutoff': 0.4,
        'cnn_cutoff': 20,
        'member_cutoff': 2,
        'max_clusters': None
        }),
    ('None', no_structure, {
        'radius_cutoff': 0.5,
        'cnn_cutoff': 20,
        'member_cutoff': 1,
        'max_clusters': None
        }),
    ]
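
The keys in these parameter dictionaries line up with the columns of the cluster summary tables printed further below: radius_cutoff appears as r, cnn_cutoff as c, member_cutoff as min, and max_clusters as max. As a rough orientation (my reading of common-nearest-neighbours clustering, not a definitive description of the cnnclustering API): the radius defines each point's neighbourhood, the cnn cutoff is the number of common nearest neighbours two points must share to end up in the same cluster, the member cutoff is the minimum size a cluster needs to be kept, and max_clusters optionally caps the number of reported clusters. An annotated sketch of a single parameter set:

# Illustrative parameter set, annotated with the hedged reading given above
example_params = {
    "radius_cutoff": 0.5,   # neighbourhood radius (summary column "r")
    "cnn_cutoff": 20,       # required common nearest neighbours (column "c")
    "member_cutoff": 2,     # minimum cluster size to keep (column "min")
    "max_clusters": None,   # optional cap on reported clusters (column "max")
}
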
[8]:
# Plot the original data sets
fig, ax = plt.subplots(2, 3)
Ax = ax.flatten()

for count, (name, data, *_) in enumerate(dsets):
    # Fit all datasets to the same value range
    data = StandardScaler().fit_transform(data)
    # Plot
    Ax[count].plot(data[:, 0], data[:, 1], **line_props)
    Ax[count].set(**ax_props)
    Ax[count].set_title(f'{name}', fontsize=10, pad=4)

fig.subplots_adjust(
    left=0, right=1, bottom=0, top=1, wspace=0.1, hspace=0.3
    )

../_images/tutorial_scikit_learn_datasets_12_0.png
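
Every data set is passed through StandardScaler before plotting and clustering, so that all sets live on a comparable value range and the cutoff radii chosen above mean roughly the same thing across sets. A minimal, self-contained sketch of what the scaler does (plain scikit-learn, independent of the clustering library):

import numpy as np
from sklearn.preprocessing import StandardScaler

raw = np.array([[0.0, 10.0], [2.0, 12.0], [4.0, 14.0]])
scaled = StandardScaler().fit_transform(raw)

print(scaled.mean(axis=0))  # per-column mean ~0
print(scaled.std(axis=0))   # per-column standard deviation ~1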

Common-nearest-neighbours clustering with data points as input

[9]:
fig, ax = plt.subplots(2, 3)
Ax = ax.flatten()

for count, (name, data, params) in enumerate(dsets):

    # Fit all datasets to the same value range
    data = StandardScaler().fit_transform(data)

    clustering = cluster.prepare_clustering(data)

    # Cluster; neighbours are computed brute force on the fly
    clustering.fit(**params)
    print()  # blank line between the printed summary tables

    clustering.evaluate(ax=Ax[count], annotate_pos="random")
    Ax[count].set(**ax_props)
    Ax[count].set_title(f'{name}', fontsize=10, pad=4)

fig.subplots_adjust(
    left=0, right=1, bottom=0, top=1, wspace=0.1, hspace=0.3
    )
-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        100       None      2         0.500     0.000     00:00:0.078
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        2         None      2         0.500     0.000     00:00:0.073
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.280     20        20        None      3         0.338     0.114     00:00:0.097
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.290     30        5         None      3         0.319     0.050     00:00:0.080
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.400     20        2         None      3         0.334     0.001     00:00:0.104
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        1         None      1         1.000     0.000     00:00:0.067
-----------------------------------------------------------------------------------------------


../_images/tutorial_scikit_learn_datasets_14_1.png

Common-nearest-neighbours clustering with pre-computed distances
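
Instead of raw coordinates, the clustering can also start from pre-computed pairwise distances. Keep in mind that the full distance matrix grows quadratically with the number of points (2000 x 2000 entries here). For orientation, a minimal sketch of what sklearn.metrics.pairwise_distances returns, independent of the clustering library:

import numpy as np
from sklearn.metrics import pairwise_distances

points = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 2.0]])
distances = pairwise_distances(points)      # Euclidean by default

print(distances.shape)                      # (3, 3)
print(np.allclose(distances, distances.T))  # True, the matrix is symmetric
print(np.diag(distances))                   # zeros on the diagonal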

[10]:
fig, ax = plt.subplots(2, 3)
Ax = ax.flatten()

for count, (name, data, params) in enumerate(dsets):

    # Fit all datasets to the same value range
    data = StandardScaler().fit_transform(data)

    clustering = cluster.prepare_clustering(data)

    # Pre-compute distances
    distances = pairwise_distances(data)
    clustering_dist = cluster.prepare_clustering(distances)
    clustering_dist._metric = _types.MetricExtPrecomputed()

    # Use pre-computed distances
    clustering_dist.fit(**params)
    clustering._labels = clustering_dist._labels
    print()

    clustering.evaluate(ax=Ax[count], annotate_pos="random")
    Ax[count].set(**ax_props)
    Ax[count].set_title(f'{name}', fontsize=10, pad=4)

fig.subplots_adjust(
    left=0, right=1, bottom=0, top=1, wspace=0.1, hspace=0.3
    )
-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        100       None      2         0.500     0.000     00:00:0.028
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        2         None      2         0.500     0.000     00:00:0.030
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.280     20        20        None      3         0.338     0.114     00:00:0.040
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.290     30        5         None      3         0.319     0.050     00:00:0.033
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.400     20        2         None      3         0.334     0.001     00:00:0.043
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        1         None      1         1.000     0.000     00:00:0.028
-----------------------------------------------------------------------------------------------


../_images/tutorial_scikit_learn_datasets_16_1.png

Common-nearest-neighbours clustering with pre-computed neighbourhoods
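
Going one step further, the fixed-radius neighbourhoods themselves can be pre-computed, here with a k-d tree. KDTree.query_radius returns one integer index array per query point, listing the points within the radius (each point also contains itself). A small self-contained example of that return value, using plain scikit-learn:

import numpy as np
from sklearn.neighbors import KDTree

points = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0]])
tree = KDTree(points)
neighbourhoods = tree.query_radius(points, r=0.5, return_distance=False)

# One index array per point; order within each array is not guaranteed
print([sorted(n.tolist()) for n in neighbourhoods])  # [[0, 1], [0, 1], [2]]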

[11]:
fig, ax = plt.subplots(2, 3)
Ax = ax.flatten()

for count, (name, data, params) in enumerate(dsets):

    # Fit all datasets to the same value range
    data = StandardScaler().fit_transform(data)

    clustering = cluster.prepare_clustering(data)

    # Pre-compute neighbourhoods
    tree = KDTree(data)
    neighbourhoods = tree.query_radius(
        data, r=params["radius_cutoff"], return_distance=False
        )
    clustering_neighbourhoods = cluster.prepare_clustering(
        neighbourhoods,
        preparation_hook=cluster.prepare_neighbourhoods,
        **cluster.registered_recipies["from_neighbourhoods_lookup"]
    )

    # Use pre-computed neighbourhoods
    clustering_neighbourhoods.fit(**params)
    clustering._labels = clustering_neighbourhoods._labels
    print()

    clustering.evaluate(ax=Ax[count], annotate_pos="random")
    Ax[count].set(**ax_props)
    Ax[count].set_title(f'{name}', fontsize=10, pad=4)

fig.subplots_adjust(
    left=0, right=1, bottom=0, top=1, wspace=0.1, hspace=0.3
    )
-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        100       None      2         0.500     0.000     00:00:0.005
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        2         None      2         0.500     0.000     00:00:0.007
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.280     20        20        None      3         0.338     0.114     00:00:0.011
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.290     30        5         None      3         0.319     0.050     00:00:0.008
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.400     20        2         None      3         0.334     0.001     00:00:0.014
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        1         None      1         1.000     0.000     00:00:0.007
-----------------------------------------------------------------------------------------------


../_images/tutorial_scikit_learn_datasets_18_1.png

Common-nearest-neighbours clustering with pre-computed sorted neighbourhoods
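
Sorting each neighbour list once pays off during the fit: deciding whether two points share at least c common neighbours then reduces to counting the intersection of two sorted index arrays, which a single merge-style pass can do. The sketch below only illustrates that idea and is not the library's internal similarity check (that is what SimilarityCheckerExtScreensorted is selected for in the cell below):

import numpy as np

def count_common(a, b):
    """Count shared elements of two sorted, unique index arrays (merge pass)."""
    i = j = common = 0
    while i < len(a) and j < len(b):
        if a[i] == b[j]:
            common += 1
            i += 1
            j += 1
        elif a[i] < b[j]:
            i += 1
        else:
            j += 1
    return common

a = np.array([1, 3, 5, 7, 9])
b = np.array([3, 4, 5, 9, 10])
print(count_common(a, b))                             # 3
print(np.intersect1d(a, b, assume_unique=True).size)  # 3, same count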

[12]:
fig, ax = plt.subplots(2, 3)
Ax = ax.flatten()

for count, (name, data, params) in enumerate(dsets):

    # Fit all datasets to the same value range
    data = StandardScaler().fit_transform(data)

    clustering = cluster.prepare_clustering(data)

    # Pre-compute neighbourhoods
    tree = KDTree(data)
    neighbourhoods = tree.query_radius(
        data, r=params["radius_cutoff"], return_distance=False
        )

    for n in neighbourhoods:
        n.sort()

    clustering_neighbourhoods = cluster.prepare_clustering(
        neighbourhoods,
        preparation_hook=cluster.prepare_neighbourhoods,
        **cluster.registered_recipies["from_neighbourhoods_lookup"]
    )

    clustering_neighbourhoods._similarity_checker = _types.SimilarityCheckerExtScreensorted()

    # Use pre-computed neighbourhoods
    clustering_neighbourhoods.fit(**params)
    clustering._labels = clustering_neighbourhoods._labels
    print()

    clustering.evaluate(ax=Ax[count], annotate_pos="random")
    Ax[count].set(**ax_props)
    Ax[count].set_title(f'{name}', fontsize=10, pad=4)

fig.subplots_adjust(
    left=0, right=1, bottom=0, top=1, wspace=0.1, hspace=0.3
    )
-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        100       None      2         0.500     0.000     00:00:0.002
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        2         None      2         0.500     0.000     00:00:0.003
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.280     20        20        None      3         0.338     0.114     00:00:0.004
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.290     30        5         None      3         0.319     0.050     00:00:0.003
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.400     20        2         None      3         0.334     0.001     00:00:0.007
-----------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------------
#points   r         c         min       max       #clusters %largest  %noise    time
2000      0.500     20        1         None      1         1.000     0.000     00:00:0.002
-----------------------------------------------------------------------------------------------


../_images/tutorial_scikit_learn_datasets_20_1.png