Module statkit.feature_selection

Select features using statistical hypothesis testing.
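
The approach, in brief: run a two-sample test per feature column and correct the resulting p-values for multiple testing. Below is a minimal sketch of that idea using scipy and statsmodels directly; the data and column names are made up for illustration.

import numpy as np
from pandas import DataFrame
from scipy.stats import ks_2samp
from statsmodels.stats.multitest import fdrcorrection

# Illustrative data: one column whose distribution shifts with the label,
# one pure-noise column.
rng = np.random.default_rng(0)
X = DataFrame({
    "shifted": np.r_[rng.normal(0, 1, 50), rng.normal(1, 1, 50)],
    "noise": rng.normal(size=100),
})
y = np.r_[np.zeros(50), np.ones(50)]

# One two-sample Kolmogorov-Smirnov test per column.
p_values = [
    ks_2samp(X.loc[y == 1, column], X.loc[y == 0, column]).pvalue
    for column in X.columns
]

# Benjamini-Hochberg correction; keep the columns whose null hypothesis is rejected.
reject, p_corrected = fdrcorrection(p_values, alpha=0.05)
print(dict(zip(X.columns, reject)))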

Expand source code
"""Select features using statistical hypothesis testing."""
from typing import Literal

from numpy import linalg, nan
from pandas import DataFrame
from scipy.stats import (
    epps_singleton_2samp as epps_singleton,
    ks_2samp as kolmogorov_smirnov,
    mannwhitneyu as mann_whitney_u,
)
from sklearn.base import BaseEstimator
from sklearn.feature_selection import SelectorMixin
from sklearn.utils.multiclass import unique_labels
from sklearn.utils import check_X_y
from statsmodels.stats.multitest import fdrcorrection, multipletests


class StatisticalTestFilter(BaseEstimator, SelectorMixin):
    """Select columns with significant difference between labels.

    Test, for each feature, whether the distribution of the positive class is
    statistically different from that of the negative class, using multiple-testing
    correction. Keep only the features that pass the statistical test.
    """

    def _apply_test(
        self,
        X_pos: DataFrame,
        X_neg: DataFrame,
        multiple_testing: Literal[
            "benjamini-hochberg", "bonferroni"
        ] = "benjamini-hochberg",
    ) -> DataFrame:
        """Column-wise test between positive and negative group."""
        result = DataFrame(
            columns=["statistic", "pvalue"], index=self.feature_names_in_
        )

        # Perform test for each feature.
        for column in self.feature_names_in_:
            try:
                statistic, p_value = self.test_(
                    X_pos[column], X_neg[column], **self.test_kwargs_
                )
            except (linalg.LinAlgError, ValueError):
                statistic, p_value = nan, nan
            result.loc[column] = [statistic, p_value]

        # Apply multiple-testing correction.
        if multiple_testing == "benjamini-hochberg":
            reject, pvalue_corrected = fdrcorrection(result.pvalue, alpha=self.p_value)
        elif multiple_testing == "bonferroni":
            reject, pvalue_corrected, _, _ = multipletests(
                result.pvalue, alpha=self.p_value, method="bonferroni"
            )

        result["pvalue-corrected"] = pvalue_corrected
        result["reject"] = reject

        return result

    def __init__(
        self,
        statistical_test: Literal[
            "kolmogorov-smirnov", "mann-whitney-u", "epps-singleton"
        ] = "kolmogorov-smirnov",
        p_value: float = 0.05,
        multiple_testing: Literal[
            "benjamini-hochberg", "bonferroni"
        ] = "benjamini-hochberg",
        **kwargs,
    ):
        """
        Args:
            statistical_test: Test for difference in feature distributions
                between labels.
            p_value: Significance threshold for rejecting the null hypothesis
                (applied after `multiple_testing` correction).
            multiple_testing: Which correction strategy to apply to account for
                multiple testing.
        """
        super().__init__(**kwargs)
        self.statistical_test = statistical_test
        self.p_value = p_value
        self.multiple_testing = multiple_testing

    def _get_support_mask(self):
        """Compute support mask of features."""
        return self.scores_["reject"]

    def fit(self, X, y):
        """Perform column-wise statistical test."""
        check_X_y(X, y)
        self._check_feature_names(X, reset=True)

        self.test_kwargs_ = {}
        statistical_functions = {
            "mann-whitney-u": mann_whitney_u,
            "kolmogorov-smirnov": kolmogorov_smirnov,
            "epps-singleton": epps_singleton,
        }

        if self.statistical_test not in statistical_functions.keys():
            raise KeyError(f"Unknown statistical method {self.statistical_test}.")

        self.test_ = statistical_functions[self.statistical_test]

        # Only allow two classes right now.
        self.classes_ = unique_labels(y)
        if len(self.classes_) != 2:
            raise ValueError("StatisticalTestFilter supports exactly two classes.")
        X_neg = X[y == self.classes_[0]]
        X_pos = X[y == self.classes_[1]]
        self.scores_ = self._apply_test(
            X_pos, X_neg, multiple_testing=self.multiple_testing
        )

        return self

Classes

class StatisticalTestFilter (statistical_test: Literal['kolmogorov-smirnov', 'mann-whitney-u', 'epps-singleton'] = 'kolmogorov-smirnov', p_value: float = 0.05, multiple_testing: Literal['benjamini-hochberg', 'bonferroni'] = 'benjamini-hochberg', **kwargs)

Select columns with significant difference between labels.

Test, for each feature, whether the distribution of the positive class is statistically different from that of the negative class, using multiple-testing correction. Keep only the features that pass the statistical test.

Args

statistical_test
Test for difference in feature distributions between labels.
p_value
Significance threshold for rejecting the null hypothesis (applied after multiple_testing correction).
multiple_testing
Which correction strategy to apply to account for multiple testing.
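
A minimal usage sketch; the dataset and parameter values below are illustrative, and fit_transform comes from the scikit-learn mixins:

from pandas import DataFrame
from sklearn.datasets import make_classification
from statkit.feature_selection import StatisticalTestFilter

# Toy binary classification problem wrapped in a DataFrame with named columns.
X_array, y = make_classification(
    n_samples=200, n_features=5, n_informative=2, random_state=0
)
X = DataFrame(X_array, columns=[f"feature_{i}" for i in range(5)])

selector = StatisticalTestFilter(
    statistical_test="mann-whitney-u",
    p_value=0.05,
    multiple_testing="benjamini-hochberg",
)
# Keeps only the columns for which the corrected test rejects the null hypothesis.
X_selected = selector.fit_transform(X, y)
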
Expand source code
class StatisticalTestFilter(BaseEstimator, SelectorMixin):
    """Select columns with significant difference between labels.

    Test, for each feature, whether the distribution of the positive class is
    statistically different from that of the negative class, using multiple-testing
    correction. Keep only the features that pass the statistical test.
    """

    def _apply_test(
        self,
        X_pos: DataFrame,
        X_neg: DataFrame,
        multiple_testing: Literal[
            "benjamini-hochberg", "bonferroni"
        ] = "benjamini-hochberg",
    ) -> DataFrame:
        """Column-wise test between positive and negative group."""
        result = DataFrame(
            columns=["statistic", "pvalue"], index=self.feature_names_in_
        )

        # Perform test for each feature.
        for column in self.feature_names_in_:
            try:
                statistic, p_value = self.test_(
                    X_pos[column], X_neg[column], **self.test_kwargs_
                )
            except (linalg.LinAlgError, ValueError):
                statistic, p_value = nan, nan
            result.loc[column] = [statistic, p_value]

        # Apply multiple-testing correction.
        if multiple_testing == "benjamini-hochberg":
            reject, pvalue_corrected = fdrcorrection(result.pvalue, alpha=self.p_value)
        elif multiple_testing == "bonferroni":
            reject, pvalue_corrected, _, _ = multipletests(
                result.pvalue, alpha=self.p_value, method="bonferroni"
            )

        result["pvalue-corrected"] = pvalue_corrected
        result["reject"] = reject

        return result

    def __init__(
        self,
        statistical_test: Literal[
            "kolmogorov-smirnov", "mann-whitney-u", "epps-singleton"
        ] = "kolmogorov-smirnov",
        p_value: float = 0.05,
        multiple_testing: Literal[
            "benjamini-hochberg", "bonferroni"
        ] = "benjamini-hochberg",
        **kwargs,
    ):
        """
        Args:
            statistical_test: Test for difference in feature distributions
                between labels.
            p_value: Significance threshold for rejecting the null hypothesis
                (applied after `multiple_testing` correction).
            multiple_testing: Which correction strategy to apply to account for
                multiple testing.
        """
        super().__init__(**kwargs)
        self.statistical_test = statistical_test
        self.p_value = p_value
        self.multiple_testing = multiple_testing

    def _get_support_mask(self):
        """Compute support mask of features."""
        return self.scores_["reject"]

    def fit(self, X, y):
        """Perform column-wise statistical test."""
        check_X_y(X, y)
        self._check_feature_names(X, reset=True)

        self.test_kwargs_ = {}
        statistical_functions = {
            "mann-whitney-u": mann_whitney_u,
            "kolmogorov-smirnov": kolmogorov_smirnov,
            "epps-singleton": epps_singleton,
        }

        if self.statistical_test not in statistical_functions.keys():
            raise KeyError(f"Unknown statistical method {self.statistical_test}.")

        self.test_ = statistical_functions[self.statistical_test]

        # Only allow two classes right now.
        self.classes_ = unique_labels(y)
        if len(self.classes_) != 2:
            raise ValueError("StatisticalTestFilter supports exactly two classes.")
        X_neg = X[y == self.classes_[0]]
        X_pos = X[y == self.classes_[1]]
        self.scores_ = self._apply_test(
            X_pos, X_neg, multiple_testing=self.multiple_testing
        )

        return self

Ancestors

  • sklearn.base.BaseEstimator
  • sklearn.feature_selection._base.SelectorMixin
  • sklearn.base.TransformerMixin

Methods

def fit(self, X, y)

Perform column-wise statistical test.

Expand source code
def fit(self, X, y):
    """Perform column-wise statistical test."""
    check_X_y(X, y)
    self._check_feature_names(X, reset=True)

    self.test_kwargs_ = {}
    statistical_functions = {
        "mann-whitney-u": mann_whitney_u,
        "kolmogorov-smirnov": kolmogorov_smirnov,
        "epps-singleton": epps_singleton,
    }

    if self.statistical_test not in statistical_functions.keys():
        raise KeyError(f"Unknown statistical method {self.statistical_test}.")

    self.test_ = statistical_functions[self.statistical_test]

    # Only allow two classes right now.
    self.classes_ = unique_labels(y)
    if len(self.classes_) != 2:
        raise ValueError("StatisticalTestFilter supports exactly two classes.")
    X_neg = X[y == self.classes_[0]]
    X_pos = X[y == self.classes_[1]]
    self.scores_ = self._apply_test(
        X_pos, X_neg, multiple_testing=self.multiple_testing
    )

    return self
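
After fit, the per-feature test results are stored in the scores_ attribute, and the support mask used by transform follows from its reject column. A short sketch with made-up data:

import numpy as np
from pandas import DataFrame
from statkit.feature_selection import StatisticalTestFilter

# Illustrative data: one shifted column, one pure-noise column.
rng = np.random.default_rng(0)
X = DataFrame({
    "shifted": np.r_[rng.normal(0, 1, 50), rng.normal(1, 1, 50)],
    "noise": rng.normal(size=100),
})
y = np.r_[np.zeros(50), np.ones(50)]

selector = StatisticalTestFilter(statistical_test="kolmogorov-smirnov").fit(X, y)

# Per-feature results: statistic, pvalue, pvalue-corrected, and reject.
print(selector.scores_)

# Boolean support mask derived from the "reject" column; transform keeps these columns.
print(selector.get_support())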