# Source code for CytOpT.CytOpt_MinMax_Swapping

# Copyright (C) 2022, Kalidou BA, Paul Freulon <paul.freulon@math.u-bordeaux.fr>
#
# License: MIT (see COPYING file)

import numpy as np
import pandas as pd
from scipy.special import logsumexp
from CytOpT.Label_Prop_sto import cost
# __all__ = ['cytopt_minmax']


def func_f(X_s, X_t, lbd, eps, j, u, G):
    """
    Evaluate the function f inside the expectation at the point (y_j, u).

    The value is the sum of two smoothed terms, each written as a scaled
    log-sum-exp:

    * ``-eps * logsumexp((u - cost(X_s, X_t[j])) / eps)``
    * ``-lbd * logsumexp(-G.T.dot(u) / lbd)``

    :param X_s: source data set, forwarded to ``cost``.
    :param X_t: target data set; only the observation ``X_t[j]`` is used.
    :param lbd: scaling of the second log-sum-exp term.
    :param eps: scaling of the first log-sum-exp term.
    :param j: index of the target observation.
    :param u: vector at which f is evaluated.
    :param G: matrix whose transpose is applied to ``u``.

    :return: the value of f at (y_j, u).
    """
    # NOTE(review): ``cost`` comes from CytOpT.Label_Prop_sto; presumably it
    # returns one cost per source observation against X_t[j] -- confirm.
    transport_term = -eps * logsumexp((u - cost(X_s, X_t[j])) / eps)

    proportion_term = -lbd * logsumexp(-G.T.dot(u) / lbd)

    return transport_term + proportion_term


def grad_f(X_s, X_t, lbd, eps, j, u, G):
    """
    Gradient with respect to u of the function f inside the expectation.

    Both terms are normalised exponentials. The largest exponent is
    subtracted before calling ``np.exp`` so that the biggest entry is
    exp(0) = 1; the subtraction cancels in the ratio.

    :param X_s: source data set, forwarded to ``cost``.
    :param X_t: target data set; only the observation ``X_t[j]`` is used.
    :param lbd: scaling applied to ``-G.T.dot(u)``.
    :param eps: scaling applied to ``u - cost(X_s, X_t[j])``.
    :param j: index of the target observation.
    :param u: point at which the gradient is evaluated.
    :param G: matrix applied (transposed, then directly) in the second term.

    :return: the gradient, an array with the same length as ``u``.
    """
    # First term: minus the normalised exponentials of (u - cost) / eps.
    scaled_gap = (u - cost(X_s, X_t[j])) / eps
    gap_weights = np.exp(scaled_gap - np.max(scaled_gap))

    # Second term: G applied to the normalised exponentials of -G^T u / lbd.
    scaled_class = -G.T.dot(u) / lbd
    class_weights = np.exp(scaled_class - np.max(scaled_class))

    return -gap_weights / np.sum(gap_weights) + G.dot(class_weights) / np.sum(class_weights)


def Gam_mat(Lab_source):
    """
    Build the Gamma matrix that maps class proportions to a weight vector.

    Column k is the indicator of the k-th class divided by the number of
    observations in that class. Labels are treated as starting at 0 when the
    smallest label is 0, and at 1 otherwise; the number of classes is derived
    from the largest label.

    :param Lab_source: array of class labels, one per source observation.

    :return: np.array of shape (number of observations, number of classes).
    """
    first_label = 0 if Lab_source.min() == 0 else 1
    n_classes = int(Lab_source.max()) + 1 - first_label
    n_obs = Lab_source.shape[0]

    Gamma = np.zeros((n_obs, n_classes))
    for col, label in enumerate(range(first_label, first_label + n_classes)):
        in_class = Lab_source == label
        Gamma[:, col] = 1 / np.sum(in_class) * np.asarray(in_class, dtype=float)

    return Gamma


def stomax(X_s, X_t, G, lbd, eps, n_iter):
    """
    Robbins-Monro algorithm to compute an approximate of the vector u^* solution of the maximization problem

    Starting from ``U = 0``, each iteration draws one target index uniformly
    at random and moves ``U`` along ``grad_f`` evaluated at that index, with
    step size ``gamma / (n + 1) ** c`` where ``gamma = I * eps / 1.9`` and
    ``c = 0.51``.

    :param X_s: source data set; its first dimension I is the length of U.
    :param X_t: target data set; the stochastic indices range over its first dimension J.
    :param G: matrix forwarded to ``grad_f``.
    :param lbd: parameter forwarded to ``grad_f``.
    :param eps: parameter forwarded to ``grad_f``; also scales the step size.
    :param n_iter: number of stochastic gradient-ascent iterations.

    :return: np.array of shape (I,), the final iterate.
    """

    I, J = X_s.shape[0], X_t.shape[0]
    U = np.zeros(I)

    # Step size policy
    gamma = I * eps / 1.9
    c = 0.51

    # grad_f reads X_t[idx], so the indices must be drawn from the target
    # sample (size J). Drawing from range(I) raises IndexError when I > J and
    # never visits part of the target when I < J.
    sample = np.random.choice(J, n_iter)

    for n, idx in enumerate(sample):
        grd = grad_f(X_s, X_t, lbd, eps, idx, U, G)

        U = U + gamma / (n + 1) ** c * grd

    return U


# cytopt
[docs]def cytopt_minmax(X_s, X_t, Lab_source, eps=0.0001, lbd=0.0001, n_iter=4000, step=5, power=0.99, theta_true=None, monitoring=False): """ CytOpT algorithm. This methods is designed to estimate the proportions of cells in an unclassified Cytometry data set denoted X_t. CytOpT is a supervised method that leverage the classification denoted Lab_source associated to the flow cytometry data set X_s. The estimation relies on the resolution of an optimization problem. The optimization problem of this function involves an additional regularization term lambda. This regularization allows the application of a simple stochastic gradient-ascent to solve the optimization problem. We advocate the use of this method as it is faster than 'cytopt_desasc'. :param X_s: np.array of shape (n_samples_source, n_biomarkers). The source cytometry data set. :param X_t: np.array of shape (n_samples_target, n_biomarkers). The target cytometry data set. :param Lab_source: np.array of shape (n_samples_source,). The classification of the source data set. :param eps: float, ``default=0.0001``. Regularization parameter of the Wasserstein distance. This parameter must be positive. :param lbd: float, ``default=0.0001``. Additionnal regularization parameter of the Minmax swapping optimization method. This parameter lbd should be greater or equal to eps. :param n_iter: int, ``default=10000``. Number of iterations of the stochastic gradient ascent. :param step: float, ``default=5``. Multiplication factor of the stochastic gradient ascent step-size policy for the minmax optimization method. :param power: float, ``default=0.99``. Decreasing rate for the step-size policy of the stochastic gradient ascent for the Minmax swapping optimization method. The step-size decreases at a rate of 1/n^power. :param theta_true: np.array of shape (K,), ``default=None``. This array stores the true proportions of the K type of cells estimated in the target data set. 
This parameter is required if the user enables the monitoring option. :param monitoring: bool, ``default=False``. When set to true, the evolution of the Kullback-Leibler between the estimated proportions and the benchmark proportions is tracked and stored. :return: - hat_theta - np.array of shape (K,), where K is the number of different type of cell populations in the source data set. - KL_storage - np.array of shape (n_iter,) This array stores the evolution of the Kullback-Leibler divergence between the estimate and benchmark proportions, if monitoring==True and the theta_true variable is completed. Reference: Paul Freulon, Jérémie Bigot,and Boris P. Hejblum CytOpT: Optimal Transport with Domain Adaptation for Interpreting Flow Cytometry data, arXiv:2006.09003 [stat.AP]. """ I, J = X_s.shape[0], X_t.shape[0] U = np.zeros(I) D = Gam_mat(Lab_source) theta_hat = None # Step size policy if step == 0: gamma = I * eps / 1.9 else: gamma = step if power == 0: c = 0.51 else: c = power sample = np.random.choice(J, n_iter) # Storage of the KL divergence between theta_hat and theta_true KL_storage = np.zeros(n_iter) if monitoring: for it in range(1, n_iter): idx = sample[it] grd = grad_f(X_s, X_t, lbd, eps, idx, U, D) U = U + gamma / (it + 1) ** c * grd # Computation of the estimate h_hat arg = -D.T.dot(U) / lbd M = np.max(arg) theta_hat = np.exp(arg - M) theta_hat = theta_hat / theta_hat.sum() KL_current = np.sum(theta_hat * np.log(theta_hat / theta_true)) KL_storage[it] = KL_current return [theta_hat, KL_storage] else: G = Gam_mat(Lab_source) u_hat = stomax(X_s, X_t, G, lbd, eps, n_iter) # computation of the estimate of the class proportions theta_hat = np.exp(-(G.T).dot(u_hat) / lbd) theta_hat = theta_hat / theta_hat.sum() return [theta_hat]
if __name__ == '__main__':
    # Demo run: reads one source and one target data set from ./tests/data and
    # calls cytopt_minmax with monitoring enabled.
    value_cols = np.arange(1, 8)  # columns 1..7 of the *_values.csv files

    # Source Data
    Stanford1A_values = pd.read_csv('./tests/data/W2_1_values.csv', usecols=value_cols)
    Stanford1A_clust = pd.read_csv('./tests/data/W2_1_clust.csv', usecols=[1])

    # Target Data
    Stanford3A_values = pd.read_csv('./tests/data/W2_7_values.csv', usecols=value_cols)
    Stanford3A_clust = pd.read_csv('./tests/data/W2_7_clust.csv', usecols=[1])

    X_source = np.asarray(Stanford1A_values)
    X_target = np.asarray(Stanford3A_values)

    Lab_source = np.asarray(Stanford1A_clust['x'])
    Lab_target = np.asarray(Stanford3A_clust['x'])

    # Frequency of each of the labels 1..10 in the target classification,
    # passed to the solver as theta_true.
    h_target = np.array([np.sum(Lab_target == k + 1) / len(Lab_target) for k in range(10)])

    res = cytopt_minmax(X_source, X_target, Lab_source,
                        theta_true=h_target, monitoring=True)