# Source code for CytOpT.CytOpt_Descent_Ascent

# Copyright (C) 2022, Kalidou BA, Paul Freulon <paul.freulon@math.u-bordeaux.fr>
#
# License: MIT (see COPYING file)


import numpy as np
import pandas as pd
from CytOpT.Label_Prop_sto import Robbins_Wass


# __all__ = ['cytopt_desasc', 'Label_Prop_sto']


def diff_simplex(h):
    """
    Compute the Jacobian matrix of the softmax function at ``h``.

    With p = softmax(h), the Jacobian has the closed form
    J = diag(p) - p p^T, i.e. J[i, i] = p_i (1 - p_i) and
    J[i, j] = -p_i p_j for i != j. This replaces the original
    O(K^2) double loop with two vectorized operations.

    :param h: array-like of shape (K,). Point at which the Jacobian
        of the softmax is evaluated.
    :return: np.array of shape (K, K). Jacobian of the softmax at ``h``.
    """
    # NOTE: the original used np.array(h, np.float); the np.float alias
    # was removed in NumPy 1.24, so we convert with the builtin float.
    h = np.asarray(h, dtype=float)
    if h.size == 0:
        # Degenerate input: keep the original behavior of returning
        # an empty (0, 0) Jacobian.
        return np.zeros((0, 0), dtype=float)
    # Shift by max(h) before exponentiating for numerical stability;
    # the softmax (and hence its Jacobian) is invariant under this shift.
    z = np.exp(h - h.max())
    p = z / z.sum()
    return np.diag(p) - np.outer(p, p)


def gammatrix(X_s, Lab_source):
    """Build the operator D that maps class proportions to source weights.

    Column k of D is the indicator of class k, normalized by the class
    size, so that D.dot(theta) turns a proportion vector theta into a
    weight vector over the source observations. Labels may start at 0
    or at 1; the number of classes is adjusted accordingly.

    :param X_s: np.array of shape (n_samples_source, n_biomarkers).
    :param Lab_source: np.array of shape (n_samples_source,). Class labels.
    :return: tuple (D, h) where D has shape (n_samples_source, K) and h
        is an all-ones vector of length K (initial simplex parameter).
    """
    n_obs = X_s.shape[0]
    # Labels starting at 0 contribute one extra class compared to
    # labels starting at 1.
    first_label = 0 if min(Lab_source) == 0 else 1
    n_classes = int(max(Lab_source)) + 1 - first_label

    D = np.zeros((n_obs, n_classes))
    for col in range(n_classes):
        indicator = np.asarray(Lab_source == first_label + col, dtype=float)
        D[:, col] = indicator / np.sum(indicator)

    return D, np.ones(n_classes)


# cytopt_desasc
def cytopt_desasc(X_s, X_t, Lab_source, eps=0.0001, n_it_grad=4000, n_it_sto=10,
                  step_grad=50, cont=True, theta_true=None, monitoring=True):
    """CytOpT algorithm: estimate cell-population proportions in an
    unclassified cytometry data set ``X_t``.

    CytOpT is a supervised method that leverages the classification
    ``Lab_source`` associated with the source flow-cytometry data set
    ``X_s``. The estimation relies on an optimization problem solved
    with a descent-ascent procedure: an outer gradient-descent loop on
    the class proportions and an inner stochastic-ascent loop
    (``Robbins_Wass``) on the dual Wasserstein potential.

    :param X_s: np.array of shape (n_samples_source, n_biomarkers).
        The source cytometry data set.
    :param X_t: np.array of shape (n_samples_target, n_biomarkers).
        The target cytometry data set.
    :param Lab_source: np.array of shape (n_samples_source,).
        The classification of the source data set.
    :param eps: float, ``default=0.0001``. Regularization parameter of
        the Wasserstein distance. Must be positive.
    :param n_it_grad: int, ``default=4000``. Number of iterations of the
        outer (descent) loop.
    :param n_it_sto: int, ``default=10``. Number of iterations of the
        inner (stochastic ascent) loop.
    :param step_grad: float, ``default=50``. Constant step size for the
        gradient descent.
    :param cont: bool, ``default=True``. When True, progress is printed
        every 100 iterations.
    :param theta_true: np.array of shape (K,), ``default=None``. True
        proportions of the K cell types in the target data set.
        Required when ``monitoring`` is True.
    :param monitoring: bool, ``default=True``. When True, the
        Kullback-Leibler divergence between the estimated and benchmark
        proportions is tracked and stored.
    :return: list ``[hat_theta, KL_storage]`` when ``monitoring`` is
        True, else ``[hat_theta]``. ``hat_theta`` has shape (K,) and
        ``KL_storage`` has shape (n_it_grad,).
    :raises ValueError: if ``monitoring`` is True but ``theta_true`` is
        not provided.
    """
    if monitoring and theta_true is None:
        # Fail early with a clear message instead of a TypeError deep
        # inside the KL computation.
        raise ValueError("theta_true is required when monitoring=True.")

    print('\n Epsilon: ', eps)
    J = X_t.shape[0]
    prop_classes_new = 0

    # Operator D maps the class proportions to the source weights.
    D, h = gammatrix(X_s, Lab_source)

    # Uniform weights on the target distribution.
    beta = 1 / J * np.ones(J)

    # Storage of the KL between theta_hat and theta_true.
    KL_Storage = np.zeros(n_it_grad)

    # Descent-ascent procedure (single loop; monitoring only adds the
    # KL bookkeeping — the original duplicated the whole loop body).
    for i in range(n_it_grad):
        prop_classes = np.exp(h)
        prop_classes = prop_classes / np.sum(prop_classes)
        Dif = diff_simplex(h)
        alpha_mod = D.dot(prop_classes)
        # Inner stochastic ascent: estimate the dual potential.
        f_star_hat = Robbins_Wass(X_s, X_t, alpha_mod, beta,
                                  eps=eps, n_iter=n_it_sto)[0]
        # Gradient step on the simplex parameter h.
        h = h - step_grad * (D.dot(Dif)).T.dot(f_star_hat)
        prop_classes_new = np.exp(h)
        prop_classes_new = prop_classes_new / np.sum(prop_classes_new)

        if cont and i % 100 == 0:
            print('Iteration ', i)
            print('Current h_hat')
            print(prop_classes_new)

        if monitoring:
            KL_current = np.sum(prop_classes_new
                                * np.log(prop_classes_new / theta_true))
            KL_Storage[i] = KL_current

    if monitoring:
        return [prop_classes_new, KL_Storage]
    return [prop_classes_new]
if __name__ == '__main__':
    # Smoke-test run on the bundled HIPC Stanford data.
    # Source data
    Stanford1A_values = pd.read_csv('./tests/data/W2_1_values.csv',
                                    usecols=np.arange(1, 8))
    Stanford1A_clust = pd.read_csv('./tests/data/W2_1_clust.csv', usecols=[1])

    # Target data
    Stanford3A_values = pd.read_csv('./tests/data/W2_7_values.csv',
                                    usecols=np.arange(1, 8))
    Stanford3A_clust = pd.read_csv('./tests/data/W2_7_clust.csv', usecols=[1])

    X_source = np.asarray(Stanford1A_values)
    X_target = np.asarray(Stanford3A_values)
    Lab_source = np.asarray(Stanford1A_clust['x'])
    Lab_target = np.asarray(Stanford3A_clust['x'])

    # Benchmark proportions of the 10 cell types in the target data set
    # (labels are 1-based in the clustering files).
    h_target = np.zeros(10)
    for k in range(10):
        h_target[k] = np.sum(Lab_target == k + 1) / len(Lab_target)

    res = cytopt_desasc(X_source, X_target, Lab_source,
                        eps=0.0001, n_it_grad=1000, n_it_sto=10,
                        step_grad=50, theta_true=h_target, monitoring=False)