Source code for ordinalcorr.polytomous

import warnings

import numpy as np
from scipy.stats import norm, multivariate_normal
from scipy.optimize import minimize_scalar
from ordinalcorr.validation import check_if_zero_variance
from ordinalcorr.types import ArrayLike


def univariate_cdf(lower, upper):
    """Compute the probability that a standard normal variable falls in the interval (lower, upper]."""
    return norm.cdf(upper) - norm.cdf(lower)
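

# Illustrative sanity check (hypothetical helper, not part of ordinalcorr's API):
# quick checks of the interval probability for a standard normal variable.
def _example_univariate_cdf():
    assert np.isclose(univariate_cdf(-np.inf, 0.0), 0.5)  # half the mass lies below 0
    return univariate_cdf(-1.96, 1.96)  # ~0.95, the familiar 95% interval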


def bivariate_cdf(lower, upper, rho: float) -> float:
    """Compute the bivariate cumulative distribution function (CDF) for a standard normal distribution."""
    var = 1
    cov = np.array([[var, rho], [rho, var]])

    # Compute probability as difference of CDFs
    # P_ij = Φ₂(τ_{i}, τ_{j}) - Φ₂(τ_{i-1}, τ_{j}) - Φ₂(τ_{i}, τ_{j-1}) + Φ₂(τ_{i-1}, τ_{j-1})
    Phi2 = multivariate_normal(mean=[0, 0], cov=cov).cdf
    return (
        Phi2(upper)
        - Phi2([upper[0], lower[1]])
        - Phi2([lower[0], upper[1]])
        + Phi2(lower)
    )
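

# Illustrative sanity check (hypothetical helper, not part of ordinalcorr's API):
# with rho = 0 the two components are independent, so the rectangle probability
# factorizes into the product of the univariate interval probabilities.
def _example_bivariate_cdf():
    p2 = bivariate_cdf([-1.0, -1.0], [1.0, 1.0], rho=0.0)
    p1 = univariate_cdf(-1.0, 1.0)
    assert np.isclose(p2, p1**2)  # ~0.466
    return p2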


def estimate_thresholds(values):
    """Estimate thresholds from empirical marginal proportions"""
    inf = 100  # to make log-likelihood smooth, use large value instead of np.inf
    thresholds = []
    levels = np.sort(np.unique(values))
    for level in levels[:-1]:  # exclude top category
        p = np.mean(values <= level)
        thresholds.append(norm.ppf(p))  # τ_i = Φ⁻¹(P(X ≤ i))
    return np.concatenate(([-inf], thresholds, [inf]))
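

# Illustrative sketch (hypothetical helper, not part of ordinalcorr's API):
# interior thresholds are standard normal quantiles of the cumulative
# proportions; the endpoints are the +/-100 sentinels.
def _example_estimate_thresholds():
    y = np.array([0, 0, 1, 1, 1, 2])
    taus = estimate_thresholds(y)  # [-100, Φ⁻¹(2/6), Φ⁻¹(5/6), 100]
    assert np.isclose(taus[1], norm.ppf(2 / 6))  # ~ -0.43
    assert np.isclose(taus[2], norm.ppf(5 / 6))  # ~ 0.97
    return taus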


def polychoric_corr(x: ArrayLike, y: ArrayLike) -> float:
    """
    Estimate the polychoric correlation coefficient between two ordinal variables.

    Parameters
    ----------
    x : array_like
        Ordinal variable X (integer-coded).
    y : array_like
        Ordinal variable Y (integer-coded).

    Returns
    -------
    float
        Estimated polychoric correlation coefficient (rho).
    """
    # Step 1: Coerce inputs to numpy arrays and validate them
    x = np.asarray(x)
    y = np.asarray(y)
    x = check_if_zero_variance(x)
    y = check_if_zero_variance(y)

    # Step 2: Identify unique ordinal levels
    x_levels = np.sort(np.unique(x))
    y_levels = np.sort(np.unique(y))
    if x_levels.size <= 1 or y_levels.size <= 1:
        warnings.warn("Both x and y must have at least two unique ordinal levels.")
        return np.nan

    # Step 3: Estimate thresholds from empirical marginal proportions
    tau_x = estimate_thresholds(x)  # thresholds for X: τ_X
    tau_y = estimate_thresholds(y)  # thresholds for Y: τ_Y

    # Step 4: Construct the contingency table n_ij
    contingency = np.zeros((len(tau_x) - 1, len(tau_y) - 1), dtype=int)
    for i, xi in enumerate(x_levels):
        for j, yj in enumerate(y_levels):
            contingency[i, j] = np.sum((x == xi) & (y == yj))  # n_ij

    # Step 5: Define the negative log-likelihood based on P_ij = Φ₂(τ_i, τ_j; ρ)
    def neg_log_likelihood(rho):
        log_likelihood = 0.0
        for i in range(len(tau_x) - 1):
            for j in range(len(tau_y) - 1):
                if contingency[i, j] == 0:
                    continue
                lower = [tau_x[i], tau_y[j]]
                upper = [tau_x[i + 1], tau_y[j + 1]]
                p_ij = bivariate_cdf(lower, upper, rho)
                p_ij = max(p_ij, 1e-6)  # soft clipping to avoid log(0)
                if np.isnan(p_ij):
                    continue
                log_likelihood += contingency[i, j] * np.log(p_ij)
        return -log_likelihood

    # Step 6: Optimize to find the MLE for rho
    eps = 1e-10
    result = minimize_scalar(
        neg_log_likelihood, bounds=(-1 + eps, 1 - eps), method="bounded"
    )
    return result.x
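

# Illustrative usage sketch (hypothetical helper, not part of ordinalcorr's API):
# discretize a latent bivariate normal sample with known correlation and check
# that the estimator roughly recovers the generating rho.
def _example_polychoric_corr(rho=0.5, n=2000, seed=0):
    rng = np.random.default_rng(seed)
    latent = rng.multivariate_normal([0.0, 0.0], [[1.0, rho], [rho, 1.0]], size=n)
    cuts = [-0.5, 0.5]  # arbitrary thresholds -> 3 ordinal levels per variable
    x = np.digitize(latent[:, 0], cuts)
    y = np.digitize(latent[:, 1], cuts)
    return polychoric_corr(x, y)  # should land near rho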


def polyserial_corr(x: ArrayLike, y: ArrayLike) -> float:
    """
    Estimate the polyserial correlation coefficient between a continuous
    variable x and an ordinal variable y using maximum likelihood estimation.

    Parameters
    ----------
    x : array_like
        Continuous variable (standardization recommended).
    y : array_like
        Ordinal variable (integer-coded, ordered categories).

    Returns
    -------
    float
        Estimated polyserial correlation coefficient (rho).
    """
    x = np.asarray(x)
    y = np.asarray(y)
    x = check_if_zero_variance(x)
    y = check_if_zero_variance(y)

    z = (x - np.mean(x)) / np.std(x)  # standardize the continuous variable
    tau = estimate_thresholds(y)
    # Map ordinal values to 0-based codes so they index correctly into tau
    y_codes = np.searchsorted(np.sort(np.unique(y)), y)

    def neg_log_likelihood(rho):
        log_likelihood = 0.0
        for zi, yi in zip(z, y_codes):
            # Conditional thresholds: (τ_k - ρ z_i) / sqrt(1 - ρ²)
            tau_lower = (tau[yi] - rho * zi) / np.sqrt(1 - rho**2)
            tau_upper = (tau[yi + 1] - rho * zi) / np.sqrt(1 - rho**2)
            p_i = univariate_cdf(tau_lower, tau_upper)
            p_i = max(p_i, 1e-6)  # soft clipping to avoid log(0)
            if np.isnan(p_i):
                continue
            log_likelihood += np.log(p_i)
        return -log_likelihood

    eps = 1e-10
    result = minimize_scalar(
        neg_log_likelihood, bounds=(-1 + eps, 1 - eps), method="bounded"
    )
    return result.x
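

# Illustrative usage sketch (hypothetical helper, not part of ordinalcorr's API):
# keep one latent variable continuous and discretize the other, then check the
# polyserial estimate against the generating correlation.
def _example_polyserial_corr(rho=0.5, n=2000, seed=0):
    rng = np.random.default_rng(seed)
    latent = rng.multivariate_normal([0.0, 0.0], [[1.0, rho], [rho, 1.0]], size=n)
    x = latent[:, 0]                            # observed continuous variable
    y = np.digitize(latent[:, 1], [-0.5, 0.5])  # 0-based ordinal codes
    return polyserial_corr(x, y)  # should land near rho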