# Source code for group_lasso._group_lasso

from abc import ABC, abstractmethod
from math import sqrt
from numbers import Number
import warnings

import numpy.linalg as la
import numpy as np
from sklearn.utils import (
    check_random_state,
    check_array,
    check_consistent_length,
)
from sklearn.base import (
    BaseEstimator,
    TransformerMixin,
    RegressorMixin,
    ClassifierMixin,
)
from sklearn.preprocessing import LabelBinarizer

from group_lasso._singular_values import find_largest_singular_value
from group_lasso._subsampling import subsample
from group_lasso._fista import fista


_DEBUG = False


def _l1_l2_prox(w, l1_reg, group_reg, groups):
    return _group_l2_prox(_l1_prox(w, l1_reg), group_reg, groups)


def _l1_prox(w, reg):
    return np.sign(w) * np.maximum(0, np.abs(w) - reg)
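
# Worked example (illustrative values only): ``_l1_prox`` is elementwise
# soft-thresholding. With w = np.array([3.0, -0.5, 1.0]) and reg = 1.0, only
# the first entry survives and is shrunk to 2.0; the other two are zeroed
# because their magnitudes do not exceed ``reg``.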


def _l2_prox(w, reg):
    """The proximal operator for reg*||w||_2 (not squared).
    """
    return max(0, 1 - reg / la.norm(w)) * w
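
# Worked example (illustrative values only): with w = np.array([3.0, 4.0]) and
# reg = 1.0 we have ||w||_2 = 5, so the scaling factor is 1 - 1/5 = 0.8 and the
# result is [2.4, 3.2]. If reg >= ||w||_2, the whole vector is set to zero.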


def _group_l2_prox(w, reg_coeffs, groups):
    """The proximal map for the specified groups of coefficients.
    """
    w = w.copy()

    for group, reg in zip(groups, reg_coeffs):
        reg = reg * sqrt(group.sum())
        w[group] = _l2_prox(w[group], reg)

    return w
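
# Sketch of the expected inputs (shapes assumed here for illustration):
# ``groups`` is a list of boolean masks over the rows of ``w`` and
# ``reg_coeffs`` holds one regularisation coefficient per mask, e.g.
#
#     w = np.arange(6.0).reshape(3, 2)          # (n_features, n_targets)
#     masks = [np.array([True, True, False]),   # rows 0-1 form one group
#              np.array([False, False, True])]  # row 2 forms another group
#     w_new = _group_l2_prox(w, [0.1, 0.1], masks)
#
# Each coefficient is scaled by sqrt(group size) before the corresponding row
# block is shrunk towards zero with ``_l2_prox``.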


def _split_intercept(w):
    return w[0], w[1:]


def _join_intercept(b, w):
    m, n = w.shape
    return np.concatenate([np.array(b).reshape(1, n), w], axis=0)


def _add_intercept_col(X):
    ones = np.ones([X.shape[0], 1])
    return np.concatenate([ones, X], axis=1)


class BaseGroupLasso(ABC, BaseEstimator, TransformerMixin):
    """Base class for sparse group lasso regularised optimisation.

    This class implements the Sparse Group Lasso [1] regularisation for
    optimisation problems with Lipschitz continuous gradients, which is
    approximately equivalent to having a bounded second derivative.

    The loss is optimised using the FISTA algorithm proposed in [2] with the
    generalised gradient-based restarting scheme proposed in [3].

    References
    ----------
    [1] Simon, N., Friedman, J., Hastie, T., & Tibshirani, R. (2013).
    A sparse-group lasso. Journal of Computational and Graphical
    Statistics, 22(2), 231-245.

    [2] Beck A, Teboulle M. (2009). A fast iterative shrinkage-thresholding
    algorithm for linear inverse problems. SIAM journal on imaging
    sciences. 2009 Mar 4;2(1):183-202.

    [3] O’Donoghue B, Candes E. (2015) Adaptive restart for accelerated
    gradient schemes. Foundations of computational mathematics.
    Jun 1;15(3):715-32.
    """

    # TODO: Document code

    LOG_LOSSES = False

    def __init__(
        self,
        groups,
        group_reg=0.05,
        l1_reg=0.05,
        n_iter=100,
        tol=1e-5,
        subsampling_scheme=None,
        fit_intercept=True,
        random_state=None,
    ):
        """

        Arguments
        ---------
        groups : Iterable
            Iterable that specifies which group each column corresponds to.
            For columns that should not be regularised, the corresponding
            group index should either be None or negative. For example, the
            list ``[1, 1, 1, 2, 2, -1]`` specifies that the first three
            columns of the data matrix belong to the first group, the next
            two columns belong to the second group and the last column should
            not be regularised.
        group_reg : float or iterable [default=0.05]
            The regularisation coefficient(s) for the group sparsity penalty.
            If ``group_reg`` is an iterable, then its length should be equal to
            the number of groups.
        l1_reg : float or iterable [default=0.05]
            The regularisation coefficient for the coefficient sparsity
            penalty.
        n_iter : int [default=100]
            The maximum number of iterations to perform
        tol : float [default=1e-5]
            The convergence tolerance. The optimisation algorithm
            will stop once ||x_{n+1} - x_n|| < ``tol``.
        subsampling_scheme : None, float, int or str [default=None]
            The subsampling rate used for the gradient and singular value
            computations. If it is a float, then it specifies the fraction
            of rows to use in the computations. If it is an int, it
            specifies the number of rows to use in the computation and if
            it is a string, then it must be 'sqrt' and the number of rows used
            in the computations is the square root of the number of rows
            in X.
        fit_intercept : bool [default=True]
            Whether to fit an intercept or not.
        random_state : np.random.RandomState [default=None]
            The random state used for initialisation of parameters.
        """
        self.groups = groups
        self.group_reg = group_reg
        self.l1_reg = l1_reg
        self.n_iter = n_iter
        self.tol = tol
        self.subsampling_scheme = subsampling_scheme
        self.fit_intercept = fit_intercept
        self.random_state = random_state

    def _regulariser(self, w):
        """The regularisation penalty for a given coefficient vector, ``w``.
        """
        regulariser = 0
        b, w = _split_intercept(w)
        for group, reg in zip(self.groups_, self.group_reg_vector):
            regulariser += reg * la.norm(w[group, :])
        regulariser += self.l1_reg * la.norm(w.ravel(), 1)
        return regulariser

    def _get_reg_vector(self, reg):
        """Get the group-wise regularisation coefficients from ``reg``.
        """
        if isinstance(reg, Number):
            reg = [reg * sqrt(group.sum()) for group in self.groups_]
        else:
            reg = list(reg)
        return reg

    @abstractmethod
    def _unregularised_loss(self, X, y, w):
        """The unregularised reconstruction loss.
        """
        pass

    def _loss(self, X, y, w):
        """The group-lasso regularised loss.

        Arguments
        ---------
        X : np.ndarray
            Data matrix, ``X.shape == (num_datapoints, num_features)``
        y : np.ndarray
            Target vector/matrix, ``y.shape == (num_datapoints, num_targets)``,
            or ``y.shape == (num_datapoints,)``
        w : np.ndarray
            Coefficient vector, ``w.shape == (num_features, num_targets)``,
            or ``w.shape == (num_features,)``
        """
        return self._unregularised_loss(X, y, w) + self._regulariser(w)

    def loss(self, X, y):
        """The group-lasso regularised loss with the current coefficients

        Arguments
        ---------
        X : np.ndarray
            Data matrix, ``X.shape == (num_datapoints, num_features)``
        y : np.ndarray
            Target vector/matrix, ``y.shape == (num_datapoints, num_targets)``,
            or ``y.shape == (num_datapoints,)``
        """
        return self._loss(X, y, self.coef_)

    @abstractmethod
    def _compute_lipschitz(self, X, y):
        """Compute Lipschitz bound for the gradient of the unregularised loss.

        The Lipschitz bound is with respect to the coefficient vector or
        matrix.
        """
        pass

    @abstractmethod
    def _grad(self, X, y, w):
        """Compute the gradient of the unregularised loss wrt the coefficients.
        """
        pass

    def _minimise_loss(self, X, y, lipschitz=None):
        """Use the FISTA algorithm to solve the group lasso regularised loss.
        """
        if self.fit_intercept:
            X = _add_intercept_col(X)

        if lipschitz is None:
            lipschitz = self._compute_lipschitz(X, y)

        if not self.fit_intercept:
            X = _add_intercept_col(X)

        def grad(w):
            g = self._grad(X, y, w)
            if not self.fit_intercept:
                g[0] = 0
            return g

        def prox(w):
            b, w_ = _split_intercept(w)
            w_ = _l1_l2_prox(
                w_, self.l1_reg, self.group_reg_vector, self.groups_
            )
            return _join_intercept(b, w_)

        def loss(w):
            X_, y_ = self.subsample(X, y)
            return self._loss(X_, y_, w)

        def callback(x, it_num, previous_x=None):
            X_, y_ = self.subsample(X, y)
            w = x
            previous_w = previous_x

            if self.LOG_LOSSES:
                self.losses_.append(self._loss(X_, y_, w))

            if previous_w is None and _DEBUG:
                print("Starting FISTA: ")
                print("\tInitial loss: {loss}".format(loss=self._loss(X_, y_, w)))

            elif _DEBUG:
                print("Completed iteration {it_num}:".format(it_num=it_num))
                print("\tLoss: {loss}".format(loss=self._loss(X_, y_, w)))
                print("\tWeight difference: {wdiff}".format(wdiff=la.norm(w-previous_w)))
                print("\tWeight norm: {wnorm}".format(wnorm=la.norm(w)))
                print("\tGrad: {gnorm}".format(gnorm=la.norm(grad(w))))

        weights = np.concatenate([self.intercept_, self.coef_])
        weights = fista(
            weights,
            grad=grad,
            prox=prox,
            loss=loss,
            lipschitz=lipschitz,
            n_iter=self.n_iter,
            tol=self.tol,
            callback=callback,
        )
        self.intercept_, self.coef_ = _split_intercept(weights)

    def _check_valid_parameters(self):
        """Check that the input parameters are valid.
        """
        assert all(reg >= 0 for reg in self.group_reg_vector)
        assert len(self.group_reg_vector) == len(self.groups_)
        assert self.n_iter > 0
        assert self.tol >= 0

    def _prepare_dataset(self, X, y):
        """Ensure that the inputs are valid and prepare them for fit.
        """
        check_consistent_length(X, y)
        check_array(X)
        check_array(y)
        if len(y.shape) == 1:
            y = y.reshape(-1, 1)
        return X, y

    def _init_fit(self, X, y):
        """Initialise model and check inputs.
        """
        X, y = self._prepare_dataset(X, y)
        groups = np.array([-1 if i is None else i for i in self.groups])

        self.random_state_ = check_random_state(self.random_state)
        self.groups_ = [groups == u for u in np.unique(groups) if u >= 0]
        self.group_reg_vector = self._get_reg_vector(self.group_reg)
        self.losses_ = []

        self.coef_ = self.random_state_.standard_normal(
            (X.shape[1], y.shape[1])
        )
        self.coef_ /= la.norm(self.coef_)
        self.intercept_ = np.zeros((1, self.coef_.shape[1]))

        self._check_valid_parameters()
        return X, y

    def fit(self, X, y, lipschitz=None):
        """Fit a group-lasso regularised linear model.
        """
        X, y = self._init_fit(X, y)
        self._minimise_loss(X, y, lipschitz=lipschitz)

    @abstractmethod
    def predict(self, X):
        """Predict using the linear model.
        """
        pass

    def fit_predict(self, X, y):
        self.fit(X, y)
        return self.predict(X)

    @property
    def sparsity_mask(self):
        """A boolean mask indicating whether features are used in prediction.
        """
        coef_ = self.coef_.mean(1)
        mean_abs_coef = np.abs(coef_).mean()

        return np.abs(coef_) > 1e-10 * mean_abs_coef

    def transform(self, X):
        """Remove columns corresponding to zero-valued coefficients.
        """
        return X[:, self.sparsity_mask]

    def fit_transform(self, X, y, lipschitz=None):
        """Fit a group lasso model to X and y and remove unused columns from X
        """
        self.fit(X, y, lipschitz)
        return self.transform(X)

    def subsample(self, *args):
        """Subsample the input using this class's subsampling scheme.
        """
        return subsample(
            self.subsampling_scheme, *args, random_state=self.random_state_
        )


def _l2_grad(A, b, x):
    """The gradient of the problem ||Ax - b||^2 wrt x.
    """
    return A.T @ (A @ x - b)


class GroupLasso(BaseGroupLasso, RegressorMixin):
    """Sparse group lasso regularised least squares linear regression.

    This class implements the Sparse Group Lasso [1] regularisation for
    linear regression with a mean squared error loss. This class is
    implemented as both a regressor and a transformation. If the
    ``transform`` method is called, then the columns of the input that
    correspond to zero-valued regression coefficients are dropped.

    The loss is optimised using the FISTA algorithm proposed in [2] with the
    generalised gradient-based restarting scheme proposed in [3]. This
    algorithm is not as accurate as a few other optimisation algorithms, but
    it is extremely efficient and does recover the sparsity patterns. We
    therefore recommend that this class is used as a transformer to select
    the viable features and that the output is fed into another regression
    algorithm, such as ridge regression in scikit-learn.

    References
    ----------
    [1] Simon, N., Friedman, J., Hastie, T., & Tibshirani, R. (2013).
    A sparse-group lasso. Journal of Computational and Graphical
    Statistics, 22(2), 231-245.

    [2] Beck A, Teboulle M. (2009). A fast iterative shrinkage-thresholding
    algorithm for linear inverse problems. SIAM journal on imaging
    sciences. 2009 Mar 4;2(1):183-202.

    [3] O’Donoghue B, Candes E. (2015) Adaptive restart for accelerated
    gradient schemes. Foundations of computational mathematics.
    Jun 1;15(3):715-32.
    """

    def __init__(
        self,
        groups=None,
        group_reg=0.05,
        l1_reg=0.05,
        n_iter=100,
        tol=1e-5,
        subsampling_scheme=None,
        fit_intercept=True,
        frobenius_lipschitz=False,
        random_state=None,
    ):
        """

        Arguments
        ---------
        groups : Iterable
            Iterable that specifies which group each column corresponds to.
            For columns that should not be regularised, the corresponding
            group index should either be None or negative. For example, the
            list ``[1, 1, 1, 2, 2, -1]`` specifies that the first three
            columns of the data matrix belong to the first group, the next
            two columns belong to the second group and the last column should
            not be regularised.
        group_reg : float or iterable [default=0.05]
            The regularisation coefficient(s) for the group sparsity penalty.
            If ``group_reg`` is an iterable, then its length should be equal
            to the number of groups.
        l1_reg : float or iterable [default=0.05]
            The regularisation coefficient for the coefficient sparsity
            penalty.
        n_iter : int [default=100]
            The maximum number of iterations to perform.
        tol : float [default=1e-5]
            The convergence tolerance. The optimisation algorithm
            will stop once ||x_{n+1} - x_n|| < ``tol``.
        subsampling_scheme : None, float, int or str [default=None]
            The subsampling rate used for the gradient and singular value
            computations. If it is a float, then it specifies the fraction
            of rows to use in the computations. If it is an int, it
            specifies the number of rows to use in the computation and if
            it is a string, then it must be 'sqrt' and the number of rows
            used in the computations is the square root of the number of
            rows in X.
        frobenius_lipschitz : bool [default=False]
            Use the Frobenius norm to estimate the Lipschitz coefficient of
            the MSE loss. This works well for systems whose power iterations
            converge slowly. If False, then subsampled power iterations are
            used. Using the Frobenius approximation for the Lipschitz
            coefficient might fail, and end up with all-zero weights.
        fit_intercept : bool [default=True]
            Whether to fit an intercept or not.
        random_state : np.random.RandomState [default=None]
            The random state used for initialisation of parameters.
        """
        super().__init__(
            groups=groups,
            l1_reg=l1_reg,
            group_reg=group_reg,
            n_iter=n_iter,
            tol=tol,
            subsampling_scheme=subsampling_scheme,
            fit_intercept=fit_intercept,
            random_state=random_state,
        )
        self.frobenius_lipschitz = frobenius_lipschitz

    def fit(self, X, y, lipschitz=None):
        """Fit a group lasso regularised linear regression model.

        Arguments
        ---------
        X : np.ndarray
            Data matrix
        y : np.ndarray
            Target vector or matrix
        lipschitz : float or None [default=None]
            A Lipschitz bound for the mean squared loss with the given data
            and target matrices. If None, this is estimated.
        """
        super().fit(X, y, lipschitz=lipschitz)

    def predict(self, X):
        """Predict using the linear model.
        """
        return self.intercept_ + X @ self.coef_

    def _unregularised_loss(self, X, y, w):
        X_, y_ = self.subsample(X, y)
        MSE = 0.5 * np.sum((X_ @ w - y_) ** 2) / len(X_)
        return MSE

    def _grad(self, X, y, w):
        X_, y_ = self.subsample(X, y)
        SSE_grad = _l2_grad(X_, y_, w)
        return SSE_grad / len(X_)

    def _compute_lipschitz(self, X, y):
        num_rows, num_cols = X.shape
        if self.frobenius_lipschitz:
            return la.norm(X, "fro") ** 2 / (num_rows * num_cols)

        s_max = find_largest_singular_value(
            X,
            subsampling_scheme=self.subsampling_scheme,
            random_state=self.random_state_,
        )
        SSE_lipschitz = 1.5 * s_max ** 2
        return SSE_lipschitz / num_rows
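

# A minimal usage sketch for ``GroupLasso`` (illustrative only; ``X`` and ``y``
# are assumed to be a (n_samples, 5) data matrix and a matching target array):
#
#     groups = [1, 1, 2, 2, -1]      # last column is left unregularised
#     gl = GroupLasso(groups=groups, group_reg=0.05, l1_reg=0.05, n_iter=100)
#     gl.fit(X, y)
#     y_hat = gl.predict(X)
#     X_selected = gl.transform(X)   # drops columns with zero coefficients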


def _sigmoid(x):
    return 1 / (1 + np.exp(-x))


def _logit(X, w):
    return X @ w


def _logistic_proba(X, w):
    return _sigmoid(_logit(X, w))


def _logistic_cross_entropy(X, y, w):
    p = _logistic_proba(X, w)
    return -(y * np.log(p) + (1 - y) * np.log(1 - p))


class LogisticGroupLasso(BaseGroupLasso, ClassifierMixin):
    """Sparse group lasso regularised single-class logistic regression.

    This class implements the Sparse Group Lasso [1] regularisation for
    logistic regression with a cross entropy penalty. This class is
    implemented as both a classifier and a transformation. If the
    ``transform`` method is called, then the columns of the input that
    correspond to zero-valued regression coefficients are dropped.

    The loss is optimised using the FISTA algorithm proposed in [2] with the
    generalised gradient-based restarting scheme proposed in [3]. This
    algorithm is not as accurate as a few other optimisation algorithms, but
    it is extremely efficient and does recover the sparsity patterns. We
    therefore recommend that this class is used as a transformer to select
    the viable features and that the output is fed into another
    classification algorithm, such as LogisticRegression in scikit-learn.

    References
    ----------
    [1] Simon, N., Friedman, J., Hastie, T., & Tibshirani, R. (2013).
    A sparse-group lasso. Journal of Computational and Graphical
    Statistics, 22(2), 231-245.

    [2] Beck A, Teboulle M. (2009). A fast iterative shrinkage-thresholding
    algorithm for linear inverse problems. SIAM journal on imaging
    sciences. 2009 Mar 4;2(1):183-202.

    [3] O’Donoghue B, Candes E. (2015) Adaptive restart for accelerated
    gradient schemes. Foundations of computational mathematics.
    Jun 1;15(3):715-32.
    """

    def _compute_proba(self, X, w):
        return _sigmoid(X @ w)

    def _unregularised_loss(self, X, y, w):
        X_, y_ = self.subsample(X, y)
        return _logistic_cross_entropy(X_, y_, w).sum() / len(X)

    def _grad(self, X, y, w):
        X_, y_ = self.subsample(X, y)
        p = _logistic_proba(X_, w)
        return X_.T @ (p - y_) / len(X_)

    def _compute_lipschitz(self, X, y):
        return np.sqrt(12) * np.linalg.norm(X, "fro") / len(X)

    def predict_proba(self, X):
        return _logistic_proba(X, self.coef_)

    def predict(self, X):
        """Predict using the linear model.
        """
        return self.predict_proba(X) >= 0.5

    def fit(self, X, y, lipschitz=None):
        if y.ndim == 2 and y.shape[1] > 1:
            n = y.shape[1]
            warnings.warn(
                (
                    "You have passed {n} targets to a single class classifier."
                    " This will simply train {n} different models meaning that"
                    " multiple classes can be predicted as true at once."
                ).format(n=n)
            )
        super().fit(X, y, lipschitz=lipschitz)


def _softmax(logit):
    # Subtract the row-wise maximum for numerical stability before
    # exponentiating.
    logit = logit - logit.max(1, keepdims=True)
    expl = np.exp(logit)
    return expl / expl.sum(axis=(logit.ndim - 1), keepdims=True)


def _softmax_proba(X, W):
    return _softmax(_logit(X, W))


def _softmax_cross_entropy(X, Y, W):
    P = _softmax_proba(X, W)
    return -np.sum(Y * np.log(P))


def _one_hot_encode(y):
    if y.ndim == 1:
        y = LabelBinarizer().fit_transform(y[:, np.newaxis])
    return y


class MultinomialGroupLasso(BaseGroupLasso, ClassifierMixin):
    """Sparse group lasso regularised multi-class logistic regression.

    This class implements the Sparse Group Lasso [1] regularisation for
    multinomial regression (also known as multi-class logistic regression)
    with a cross entropy penalty. This class is implemented as both a
    classifier and a transformation. If the ``transform`` method is called,
    then the columns of the input that correspond to zero-valued regression
    coefficients are dropped.

    The loss is optimised using the FISTA algorithm proposed in [2] with the
    generalised gradient-based restarting scheme proposed in [3]. This
    algorithm is not as accurate as a few other optimisation algorithms, but
    it is extremely efficient and does recover the sparsity patterns. We
    therefore recommend that this class is used as a transformer to select
    the viable features and that the output is fed into another
    classification algorithm, such as LogisticRegression in scikit-learn.

    References
    ----------
    [1] Simon, N., Friedman, J., Hastie, T., & Tibshirani, R. (2013).
    A sparse-group lasso. Journal of Computational and Graphical
    Statistics, 22(2), 231-245.

    [2] Beck A, Teboulle M. (2009). A fast iterative shrinkage-thresholding
    algorithm for linear inverse problems. SIAM journal on imaging
    sciences. 2009 Mar 4;2(1):183-202.

    [3] O’Donoghue B, Candes E. (2015) Adaptive restart for accelerated
    gradient schemes. Foundations of computational mathematics.
    Jun 1;15(3):715-32.
    """

    def __init__(
        self,
        groups,
        group_reg=0.05,
        l1_reg=0.05,
        n_iter=100,
        tol=1e-5,
        subsampling_scheme=None,
        fit_intercept=True,
        random_state=None,
    ):
        """

        Arguments
        ---------
        groups : Iterable
            Iterable that specifies which group each column corresponds to.
            For columns that should not be regularised, the corresponding
            group index should either be None or negative. For example, the
            list ``[1, 1, 1, 2, 2, -1]`` specifies that the first three
            columns of the data matrix belong to the first group, the next
            two columns belong to the second group and the last column should
            not be regularised.
        group_reg : float or iterable [default=0.05]
            The regularisation coefficient(s) for the group sparsity penalty.
            If ``group_reg`` is an iterable, then its length should be equal
            to the number of groups.
        l1_reg : float or iterable [default=0.05]
            The regularisation coefficient for the coefficient sparsity
            penalty.
        n_iter : int [default=100]
            The maximum number of iterations to perform.
        tol : float [default=1e-5]
            The convergence tolerance. The optimisation algorithm
            will stop once ||x_{n+1} - x_n|| < ``tol``.
        subsampling_scheme : None, float, int or str [default=None]
            The subsampling rate used for the gradient and singular value
            computations. If it is a float, then it specifies the fraction
            of rows to use in the computations. If it is an int, it
            specifies the number of rows to use in the computation and if
            it is a string, then it must be 'sqrt' and the number of rows
            used in the computations is the square root of the number of
            rows in X.
        fit_intercept : bool [default=True]
            Whether to fit an intercept or not.
        random_state : np.random.RandomState [default=None]
            The random state used for initialisation of parameters.
        """
        if subsampling_scheme is not None:
            warnings.warn(
                "Subsampling is not stable for multinomial group lasso."
            )
        super().__init__(
            groups=groups,
            group_reg=group_reg,
            l1_reg=l1_reg,
            n_iter=n_iter,
            tol=tol,
            subsampling_scheme=subsampling_scheme,
            fit_intercept=fit_intercept,
            random_state=random_state,
        )

    def _compute_proba(self, X, w):
        return _softmax_proba(X, w)

    def _unregularised_loss(self, X, y, w):
        y = _one_hot_encode(y)
        X_, y_ = self.subsample(X, y)
        return _softmax_cross_entropy(X_, y_, w).sum() / len(X)

    def _grad(self, X, y, w):
        y = _one_hot_encode(y)
        X_, y_ = self.subsample(X, y)
        p = _softmax_proba(X_, w)
        return X_.T @ (p - y_) / len(X_)

    def _compute_lipschitz(self, X, y):
        C = y.shape[-1]
        return 2 * C ** (1 / 4) * np.linalg.norm(X, "fro") / len(X)

    def predict_proba(self, X):
        return _softmax_proba(X, self.coef_)

    def predict(self, X):
        """Predict using the linear model.
        """
        return np.argmax(self.predict_proba(X), axis=1)

    def _prepare_dataset(self, X, y):
        """Ensure that the inputs are valid and prepare them for fit.
        """
        y = _one_hot_encode(y)
        check_consistent_length(X, y)
        check_array(X)
        check_array(y, ensure_2d=False)
        if set(np.unique(y)) != {0, 1}:
            raise ValueError(
                "The target array must either be a 2D dummy encoded (binary)"
                " array or a 1D array with class labels as array elements."
            )
        return X, y
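

# A minimal usage sketch for ``MultinomialGroupLasso`` (illustrative only;
# ``X`` is assumed to be a (n_samples, 4) data matrix and ``y`` a 1D array of
# integer class labels, which ``_prepare_dataset`` one-hot encodes):
#
#     groups = [1, 1, 2, 2]
#     mgl = MultinomialGroupLasso(groups=groups, group_reg=0.05, l1_reg=0.05)
#     mgl.fit(X, y)
#     class_predictions = mgl.predict(X)   # argmax of the class probabilities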