Source code for MED3pa.models.concrete_classifiers

"""
This module provides concrete implementations of specific classification models, such as XGBoost.
It adapts the abstract interfaces defined in ``abstract_models.py`` into fully functional models ready for training and prediction.
"""

from typing import Any, Dict, List, Optional, Union

import numpy as np
import xgboost as xgb

from .abstract_models import ClassificationModel
from .classification_metrics import *
from .data_strategies import ToDmatrixStrategy
from .xgboost_params import valid_xgboost_custom_params, valid_xgboost_params, xgboost_metrics


class XGBoostModel(ClassificationModel):
    """
    A concrete implementation of the ClassificationModel class for XGBoost models.
    This class provides functionalities to train, predict, and evaluate models built with the XGBoost library.
    """

    def __init__(self, params: Optional[Dict[str, Any]] = None,
                 model: Optional[Union[xgb.Booster, xgb.XGBClassifier]] = None) -> None:
        """
        Initializes the XGBoostModel either with parameters for a new model or with a loaded pickled model.

        Args:
            params (Optional[Dict[str, Any]]): A dictionary of parameters for the booster model.
            model (Optional[Union[xgb.Booster, xgb.XGBClassifier]]): A loaded pickled model.
        """
        super().__init__()
        self.set_params(params)
        if model is not None:
            # the model was loaded from a pickled file
            self.set_model(model)
        else:
            # a new model: use xgb.Booster by default
            self.model_class = xgb.Booster
        self.pickled_model = model is not None
        self.set_data_strategy(ToDmatrixStrategy())

    def _ensure_dmatrix(self, features: Any, labels: Optional[np.ndarray] = None,
                        weights: Optional[np.ndarray] = None) -> xgb.DMatrix:
        """
        Ensures that the input data is converted to DMatrix format, using the defined data preparation strategy.

        Args:
            features (Any): Features array.
            labels (Optional[np.ndarray]): Labels array.
            weights (Optional[np.ndarray]): Weights array.

        Returns:
            xgb.DMatrix: A DMatrix object.
        """
        if not isinstance(features, xgb.DMatrix):
            return self.data_preparation_strategy.execute(features, labels, weights)
        return features

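    # A minimal construction sketch (illustrative, not part of the module). The
    # parameter names are standard XGBoost learning parameters and are assumed to
    # be accepted by ``valid_xgboost_params``; the values are arbitrary examples.
    #
    #     model = XGBoostModel(params={'objective': 'binary:logistic',
    #                                  'eta': 0.1,
    #                                  'max_depth': 6,
    #                                  'eval_metric': ['Auc']})
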
    def train(self, x_train: np.ndarray, y_train: np.ndarray,
              x_validation: np.ndarray, y_validation: np.ndarray,
              training_parameters: Optional[Dict[str, Any]],
              balance_train_classes: bool) -> None:
        """
        Trains the model on the provided dataset.

        Args:
            x_train (np.ndarray): Features for training.
            y_train (np.ndarray): Labels for training.
            x_validation (np.ndarray): Features for validation.
            y_validation (np.ndarray): Labels for validation.
            training_parameters (Optional[Dict[str, Any]]): Additional training parameters.
            balance_train_classes (bool): Whether to balance the training classes.

        Raises:
            ValueError: If parameters for xgb.Booster are not initialized before training.
            NotImplementedError: If the model_class is not supported for training.
        """
        if training_parameters:
            # ensure the provided training parameters are valid
            valid_param_sets = [valid_xgboost_params, valid_xgboost_custom_params]
            valid_training_params = self.validate_params(training_parameters, valid_param_sets)
            # update the model's params with the validated training params
            if self.params is not None:
                self.params.update(valid_training_params)
            else:
                self.params = valid_training_params
            weights = (self.balance_train_weights(y_train) if balance_train_classes
                       else training_parameters.get('training_weights', np.ones_like(y_train)))
            evaluation_metrics = training_parameters.get(
                'custom_eval_metrics',
                self.params.get('eval_metric', ["Accuracy"]) if self.params else ["Accuracy"])
            num_boost_rounds = training_parameters.get(
                'num_boost_rounds',
                self.params.get('num_boost_rounds', 10) if self.params else 10)
        else:
            # no additional training parameters were provided: fall back to the model's params
            weights = self.balance_train_weights(y_train) if balance_train_classes else np.ones_like(y_train)
            evaluation_metrics = self.params.get('eval_metric', ["Accuracy"]) if self.params else ["Accuracy"]
            num_boost_rounds = self.params.get('num_boost_rounds', 10) if self.params else 10

        if not self.params:
            raise ValueError("Parameters must be initialized before training.")

        # strip the custom (non-XGBoost) training parameters
        filtered_params = {k: v for k, v in self.params.items() if k not in valid_xgboost_custom_params}

        if self.model_class is xgb.Booster:
            # train an xgb.Booster
            dtrain = self._ensure_dmatrix(x_train, y_train, weights)
            dval = self._ensure_dmatrix(x_validation, y_validation)
            self.model = xgb.train(filtered_params, dtrain, num_boost_round=num_boost_rounds,
                                   evals=[(dval, 'eval')], verbose_eval=False)
            self.evaluate(x_validation, y_validation, eval_metrics=evaluation_metrics, print_results=True)
        elif self.model_class is xgb.XGBClassifier:
            # train an xgb.XGBClassifier
            self.model = self.model_class(**filtered_params)
            self.model.fit(x_train, y_train, sample_weight=weights,
                           eval_set=[(x_validation, y_validation)])
            self.evaluate(x_validation, y_validation, eval_metrics=evaluation_metrics, print_results=True)
        else:
            # the class is neither xgb.Booster nor xgb.XGBClassifier
            raise NotImplementedError(f"Training not implemented for model class {self.model_class}")

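    # An illustrative training call (a sketch; the arrays are hypothetical).
    # ``num_boost_rounds`` and ``custom_eval_metrics`` are the optional keys this
    # method reads from ``training_parameters``:
    #
    #     model.train(x_train, y_train, x_val, y_val,
    #                 training_parameters={'num_boost_rounds': 50,
    #                                      'custom_eval_metrics': ['Auc', 'Accuracy']},
    #                 balance_train_classes=True)
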
    def predict(self, X: np.ndarray, return_proba: bool = False, threshold: float = 0.5) -> np.ndarray:
        """
        Makes predictions using the model for the given input.

        Args:
            X (np.ndarray): Features for prediction.
            return_proba (bool, optional): Whether to return probabilities. Defaults to False.
            threshold (float, optional): Threshold for converting probabilities to class labels. Defaults to 0.5.

        Returns:
            np.ndarray: Predictions made by the model.

        Raises:
            ValueError: If the model has not been initialized.
            NotImplementedError: If prediction is not implemented for the model class.
        """
        if self.model is None:
            raise ValueError(f"The {self.model_class.__name__} model has not been initialized.")

        if self.model_class is xgb.Booster:
            dtest = self._ensure_dmatrix(X)
            preds = self.model.predict(dtest)
        elif self.model_class is xgb.XGBClassifier:
            preds = self.model.predict_proba(X) if return_proba else self.model.predict(X)
        else:
            raise NotImplementedError(f"Prediction not implemented for model class {self.model_class}")

        return preds if return_proba else (preds > threshold).astype(int)

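    # Illustrative prediction calls (``x_new`` is a hypothetical feature array):
    #
    #     labels = model.predict(x_new)                     # 0/1 labels at the default 0.5 threshold
    #     probas = model.predict(x_new, return_proba=True)  # raw probabilities
    #     strict = model.predict(x_new, threshold=0.7)      # stricter cut-off for the positive class
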
    def train_to_disagree(self, x_train: np.ndarray, y_train: np.ndarray,
                          x_validation: np.ndarray, y_validation: np.ndarray,
                          x_test: np.ndarray, y_test: np.ndarray,
                          training_parameters: Optional[Dict[str, Any]],
                          balance_train_classes: bool, N: int) -> None:
        """
        Trains the model to disagree with another model on a specified dataset.

        This method is intended for scenarios where the model is trained to produce outputs that
        intentionally diverge from those of another model, as used by the ``detectron`` method.

        Args:
            x_train (np.ndarray): Features for training.
            y_train (np.ndarray): Labels for training.
            x_validation (np.ndarray): Features for validation.
            y_validation (np.ndarray): Labels for validation.
            x_test (np.ndarray): Features for testing or disagreement evaluation.
            y_test (np.ndarray): Labels for testing or disagreement evaluation.
            training_parameters (Optional[Dict[str, Any]]): Additional parameters for training the model.
            balance_train_classes (bool): Whether to balance the class distribution in the training data.
            N (int): The number of examples in the testing set used for calculating disagreement.

        Raises:
            ValueError: If the necessary parameters for training are not properly initialized.
            NotImplementedError: If the model class does not support this type of training.
        """
        if training_parameters:
            valid_param_sets = [valid_xgboost_params, valid_xgboost_custom_params]
            valid_training_params = self.validate_params(training_parameters, valid_param_sets)
            if self.params is not None:
                self.params.update(valid_training_params)
            else:
                self.params = valid_training_params
            training_weights = (self.balance_train_weights(y_train) if balance_train_classes
                                else training_parameters.get('training_weights', np.ones_like(y_train)))
            evaluation_metrics = training_parameters.get(
                'custom_eval_metrics',
                self.params.get('eval_metric', ["Accuracy"]) if self.params else ["Accuracy"])
        else:
            training_weights = self.balance_train_weights(y_train) if balance_train_classes else np.ones_like(y_train)
            evaluation_metrics = self.params.get('eval_metric', ["Accuracy"]) if self.params else ["Accuracy"]

        if not self.params:
            raise ValueError("Parameters must be initialized before training.")

        filtered_params = {k: v for k, v in self.params.items() if k not in valid_xgboost_custom_params}

        # prepare the disagreement dataset: append the test examples with flipped
        # labels, down-weighted by 1 / (N + 1)
        data = np.concatenate([x_train, x_test])
        label = np.concatenate([y_train, 1 - y_test])
        weight = np.concatenate([training_weights, 1 / (N + 1) * np.ones(N)])

        if self.model_class is xgb.Booster:
            dtrain = self._ensure_dmatrix(data, label, weight)
            dval = self._ensure_dmatrix(x_validation, y_validation)
            self.model = xgb.train(filtered_params, dtrain, num_boost_round=10,
                                   evals=[(dval, 'eval')], verbose_eval=False)
            self.evaluate(x_validation, y_validation, eval_metrics=evaluation_metrics)
        elif self.model_class is xgb.XGBClassifier:
            self.model = self.model_class(**filtered_params)
            self.model.fit(data, label, sample_weight=weight,
                           eval_set=[(x_validation, y_validation)])
            self.evaluate(x_validation, y_validation, eval_metrics=evaluation_metrics)
        else:
            raise NotImplementedError(f"Training not implemented for model class {self.model_class}")

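    # A worked sketch of the disagreement dataset for N = 3 test examples
    # (hypothetical labels): the test labels are flipped, so the model is pushed
    # towards predicting the opposite of the original labels, and each flipped
    # example carries weight 1 / (N + 1) = 0.25, limiting its influence relative
    # to the training examples:
    #
    #     y_test          = [0, 1, 1]
    #     1 - y_test      = [1, 0, 0]
    #     example weights = [0.25, 0.25, 0.25]
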
    def evaluate(self, X: np.ndarray, y: np.ndarray, eval_metrics: Union[str, List[str]],
                 print_results: bool = False) -> Dict[str, float]:
        """
        Evaluates the model using the specified metrics.

        Args:
            X (np.ndarray): Features for evaluation.
            y (np.ndarray): True labels for evaluation.
            eval_metrics (Union[str, List[str]]): Metric name or list of metric names to use for evaluation.
            print_results (bool, optional): Whether to print the evaluation results.

        Returns:
            Dict[str, float]: A dictionary mapping metric names to their evaluated scores.

        Raises:
            ValueError: If the model has not been trained before evaluation.
        """
        if self.model is None:
            raise ValueError("Model must be trained before evaluation.")

        # ensure eval_metrics is a list
        if isinstance(eval_metrics, str):
            eval_metrics = [eval_metrics]

        probs = self.predict(X, return_proba=True)
        if probs.ndim == 1:
            preds = (probs > 0.5).astype(int)
        else:
            # multi-class probability output is not handled
            preds = None
        if preds is None:
            raise ValueError("Only binary classification is supported in this version.")

        evaluation_results = {}
        for metric_name in eval_metrics:
            translated_metric_name = xgboost_metrics.get(metric_name, metric_name)
            metric_function = ClassificationEvaluationMetrics.get_metric(translated_metric_name)
            if metric_function:
                if metric_name in {'Auc', 'Auprc', 'Logloss'}:
                    # these metrics are computed from probabilities rather than hard labels
                    evaluation_results[metric_name] = metric_function(y, probs)
                else:
                    evaluation_results[metric_name] = metric_function(y, preds)
            else:
                print(f"Error: The metric '{metric_name}' is not supported.")

        if print_results:
            self.print_evaluation_results(results=evaluation_results)
        return evaluation_results

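    # An illustrative evaluation call (a sketch; metric names assume entries that
    # ``xgboost_metrics`` / ``ClassificationEvaluationMetrics`` can resolve):
    #
    #     results = model.evaluate(x_val, y_val,
    #                              eval_metrics=['Auc', 'Accuracy'],
    #                              print_results=True)
    #     # results -> {'Auc': 0.87, 'Accuracy': 0.81}  (values are made up)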