Source code for skchange.utils.validation.data
"""Validation functions for input data series."""
import numpy as np
import pandas as pd
from numpy.typing import ArrayLike
[docs]
def check_data(
X: pd.DataFrame | pd.Series | ArrayLike,
min_length: int,
min_length_name: str = "min_length",
allow_missing_values: bool = False,
) -> pd.DataFrame:
"""Check if input data is valid.
Parameters
----------
X : pd.DataFrame, pd.Series, np.ndarray
Input data to check.
min_length : int
Minimum number of samples in X.
min_length_name : str, optional (default="min_length")
Name of min_length parameter to be shown in the error message.
allow_missing_values : bool, optional (default=False)
Whether to allow missing values in X.
Returns
-------
X : pd.DataFrame
Input data in pd.DataFrame format.
"""
X = pd.DataFrame(X)
if not allow_missing_values and X.isna().any(axis=None):
raise ValueError(
f"X cannot contain missing values: X.isna().sum()={X.isna().sum()}."
)
n = X.shape[0]
if n < min_length:
raise ValueError(
f"X must have at least {min_length_name}={min_length} samples"
+ f" (X.shape[0]={n})"
)
return X
def as_2d_array(X: ArrayLike, vector_as_column=True, dtype=None) -> np.ndarray:
    """Convert an array-like object to a 2D numpy array.

    Parameters
    ----------
    X : `ArrayLike`
        Array-like object with at most two dimensions.
    vector_as_column : bool, optional (default=True)
        How to promote inputs with fewer than two dimensions. If True, the
        result has shape ``(n, 1)``; otherwise it has shape ``(1, n)``.
    dtype : data-type, optional (default=None)
        Passed on to ``np.asarray``.

    Returns
    -------
    X : `np.ndarray`
        2D numpy array.

    Raises
    ------
    ValueError
        If X has more than two dimensions.
    """
    X = np.asarray(X, dtype=dtype)
    if X.ndim > 2:
        raise ValueError("X must be at most 2-dimensional.")
    # ``ndim < 2`` also covers 0-d (scalar) input, which would otherwise be
    # returned unchanged and break the 2D guarantee; it becomes shape (1, 1).
    if X.ndim < 2:
        X = X.reshape(-1, 1) if vector_as_column else X.reshape(1, -1)
    return X