Source code for spacr.sp_stats

import itertools

import numpy as np
import pandas as pd
import scikit_posthocs as sp
from scipy.stats import (
    shapiro, normaltest, levene, ttest_ind, mannwhitneyu,
    kruskal, f_oneway, chi2_contingency, fisher_exact,
)
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multitest import multipletests


def choose_p_adjust_method(num_groups, num_data_points):
    """
    Selects the most appropriate p-value adjustment method based on data characteristics.

    Parameters:
    - num_groups: Number of unique groups being compared
    - num_data_points: Number of data points per group (assuming balanced groups)

    Returns:
    - A string representing the recommended p-adjustment method
    """
    num_comparisons = (num_groups * (num_groups - 1)) // 2  # Number of pairwise comparisons

    # Decision logic for choosing the adjustment method
    if num_comparisons <= 10 and num_data_points > 5:
        return 'holm'  # Balanced between power and Type I error control
    elif num_comparisons > 10 and num_data_points <= 5:
        return 'fdr_bh'  # FDR control for a large number of comparisons and small sample sizes
    elif num_comparisons <= 10:
        return 'sidak'  # Less conservative than Bonferroni, good for independent comparisons
    else:
        return 'bonferroni'  # Very conservative, use for strict control of Type I errors
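# Hedged usage sketch (illustrative values, not part of the module): with 4 groups
# there are 4 * 3 // 2 = 6 pairwise comparisons, and with more than 5 data points
# per group the first branch applies; 8 small groups instead fall through to FDR.
#
#   choose_p_adjust_method(num_groups=4, num_data_points=12)  # -> 'holm'
#   choose_p_adjust_method(num_groups=8, num_data_points=4)   # -> 'fdr_bh' (28 comparisons, small n)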
def perform_normality_tests(df, grouping_column, data_columns):
    """Perform normality tests for each group and data column."""
    unique_groups = df[grouping_column].unique()
    normality_results = []

    for column in data_columns:
        for group in unique_groups:
            data = df.loc[df[grouping_column] == group, column].dropna()
            n_samples = len(data)

            if n_samples < 3:
                # Skip the test if there aren't enough data points
                print(f"Skipping normality test for group '{group}' on column '{column}' - Not enough data.")
                normality_results.append({
                    'Comparison': f'Normality test for {group} on {column}',
                    'Test Statistic': None,
                    'p-value': None,
                    'Test Name': 'Skipped',
                    'Column': column,
                    'n': n_samples
                })
                continue

            # Choose the appropriate normality test based on the sample size
            if n_samples >= 8:
                stat, p_value = normaltest(data)
                test_name = "D'Agostino-Pearson test"
            else:
                stat, p_value = shapiro(data)
                test_name = "Shapiro-Wilk test"

            normality_results.append({
                'Comparison': f'Normality test for {group} on {column}',
                'Test Statistic': stat,
                'p-value': p_value,
                'Test Name': test_name,
                'Column': column,
                'n': n_samples
            })

    # Check whether all groups are normally distributed (p > 0.05)
    normal_p_values = [
        result['p-value'] for result in normality_results
        if result['Column'] == column and result['p-value'] is not None
    ]
    is_normal = all(p > 0.05 for p in normal_p_values)

    return is_normal, normality_results
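# Hedged usage sketch (made-up DataFrame, not part of the module): the function
# returns a boolean (all tested groups normal at p > 0.05) plus a list of
# per-group result dicts. Groups with fewer than 3 observations are skipped,
# groups with n >= 8 use D'Agostino-Pearson, and smaller groups use Shapiro-Wilk.
#
#   example_df = pd.DataFrame({'group': ['a'] * 10 + ['b'] * 10,
#                              'value': np.random.default_rng(0).normal(size=20)})
#   is_normal, results = perform_normality_tests(example_df, 'group', ['value'])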
def perform_levene_test(df, grouping_column, data_column):
    """Perform Levene's test for equal variance."""
    unique_groups = df[grouping_column].unique()
    grouped_data = [
        df.loc[df[grouping_column] == group, data_column].dropna()
        for group in unique_groups
    ]
    stat, p_value = levene(*grouped_data)
    return stat, p_value
def perform_statistical_tests(df, grouping_column, data_columns, paired=False):
    """Perform statistical tests for each data column."""
    unique_groups = df[grouping_column].unique()
    test_results = []

    for column in data_columns:
        grouped_data = [df.loc[df[grouping_column] == group, column].dropna() for group in unique_groups]

        if len(unique_groups) == 2:  # For two groups
            if paired:
                print("Performing paired tests (not implemented in this template).")
                continue  # Extend as needed
            else:
                # Check normality for two groups
                is_normal, _ = perform_normality_tests(df, grouping_column, [column])
                if is_normal:
                    stat, p = ttest_ind(grouped_data[0], grouped_data[1])
                    test_name = 'T-test'
                else:
                    stat, p = mannwhitneyu(grouped_data[0], grouped_data[1])
                    test_name = 'Mann-Whitney U test'
        else:
            # Check normality for multiple groups
            is_normal, _ = perform_normality_tests(df, grouping_column, [column])
            if is_normal:
                stat, p = f_oneway(*grouped_data)
                test_name = 'One-way ANOVA'
            else:
                stat, p = kruskal(*grouped_data)
                test_name = 'Kruskal-Wallis test'

        test_results.append({
            'Column': column,
            'Test Name': test_name,
            'Test Statistic': stat,
            'p-value': p,
            'Groups': len(unique_groups)
        })

    return test_results
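# Hedged usage sketch (using the illustrative example_df from the sketch above):
# with two groups the function runs a t-test or Mann-Whitney U depending on the
# normality check; with three or more groups it runs one-way ANOVA or
# Kruskal-Wallis instead.
#
#   results = perform_statistical_tests(example_df, 'group', ['value'])
#   # -> [{'Column': 'value', 'Test Name': 'T-test' or 'Mann-Whitney U test', ...}]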
def perform_posthoc_tests(df, grouping_column, data_column, is_normal):
    """Perform post-hoc tests for multiple groups with both original and adjusted p-values."""
    unique_groups = df[grouping_column].unique()
    posthoc_results = []

    if len(unique_groups) > 2:
        num_groups = len(unique_groups)
        num_data_points = len(df[data_column].dropna()) // num_groups  # Assuming roughly equal data points per group
        p_adjust_method = choose_p_adjust_method(num_groups, num_data_points)

        if is_normal:
            # Tukey's HSD automatically adjusts p-values
            tukey_result = pairwise_tukeyhsd(df[data_column], df[grouping_column], alpha=0.05)
            for comparison, p_value in zip(tukey_result._results_table.data[1:], tukey_result.pvalues):
                posthoc_results.append({
                    'Comparison': f"{comparison[0]} vs {comparison[1]}",
                    'Original p-value': None,  # Tukey HSD does not provide raw p-values
                    'Adjusted p-value': p_value,
                    'Adjusted Method': 'Tukey HSD',
                    'Test Name': 'Tukey HSD'
                })
        else:
            # Dunn's test with p-value adjustment
            raw_dunn_result = sp.posthoc_dunn(df, val_col=data_column, group_col=grouping_column, p_adjust=None)
            adjusted_dunn_result = sp.posthoc_dunn(df, val_col=data_column, group_col=grouping_column, p_adjust=p_adjust_method)
            for i, group_a in enumerate(adjusted_dunn_result.index):
                for j, group_b in enumerate(adjusted_dunn_result.columns):
                    if i < j:  # Only consider unique pairs
                        posthoc_results.append({
                            'Comparison': f"{group_a} vs {group_b}",
                            'Original p-value': raw_dunn_result.iloc[i, j],
                            'Adjusted p-value': adjusted_dunn_result.iloc[i, j],
                            'Adjusted Method': p_adjust_method,
                            'Test Name': "Dunn's Post-hoc"
                        })

    return posthoc_results
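# Hedged usage sketch: typically called after a significant omnibus test on three
# or more groups. Pass the normality flag returned by perform_normality_tests;
# True triggers Tukey's HSD, False triggers Dunn's test with the correction
# picked by choose_p_adjust_method. df_3groups below is a made-up DataFrame with
# at least three groups in its 'group' column.
#
#   is_normal, _ = perform_normality_tests(df_3groups, 'group', ['value'])
#   posthoc = perform_posthoc_tests(df_3groups, 'group', 'value', is_normal)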
def chi_pairwise(raw_counts, verbose=False):
    """
    Perform pairwise chi-square or Fisher's exact tests between all unique group pairs
    and apply p-value correction.

    Parameters:
    - raw_counts (DataFrame): Contingency table with group-wise counts.
    - verbose (bool): Whether to print results for each pair.

    Returns:
    - pairwise_df (DataFrame): DataFrame with pairwise test results, including corrected p-values.
    """
    pairwise_results = []
    groups = raw_counts.index.unique()  # Use the index from raw_counts for group pairs
    raw_p_values = []  # Store raw p-values for correction later

    # Calculate the number of groups and the average number of data points per group
    num_groups = len(groups)
    num_data_points = raw_counts.sum(axis=1).mean()  # Average total data points per group
    p_adjust_method = choose_p_adjust_method(num_groups, num_data_points)

    for group1, group2 in itertools.combinations(groups, 2):
        contingency_table = raw_counts.loc[[group1, group2]].values
        if contingency_table.shape[1] == 2:
            # Fisher's Exact Test for 2x2 tables
            oddsratio, p_value = fisher_exact(contingency_table)
            test_name = "Fisher's Exact Test"
        else:
            # Chi-Square Test for larger tables
            chi2_stat, p_value, _, _ = chi2_contingency(contingency_table)
            test_name = 'Pairwise Chi-Square Test'

        pairwise_results.append({
            'Group 1': group1,
            'Group 2': group2,
            'Test Name': test_name,
            'p-value': p_value
        })
        raw_p_values.append(p_value)

    # Apply p-value correction
    corrected_p_values = multipletests(raw_p_values, method=p_adjust_method)[1]

    # Add corrected p-values to the results
    for i, result in enumerate(pairwise_results):
        result['p-value_adj'] = corrected_p_values[i]

    pairwise_df = pd.DataFrame(pairwise_results)
    pairwise_df['adj'] = p_adjust_method

    if verbose:
        # Print pairwise results
        print("\nPairwise Frequency Analysis Results:")
        print(pairwise_df.to_string(index=False))

    return pairwise_df
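# Hedged usage sketch (made-up counts): raw_counts is expected to be a contingency
# table indexed by group with one column per category. Two-column tables are
# compared pair-by-pair with Fisher's exact test, wider tables with chi-square,
# and the chosen correction method is recorded in the 'adj' column.
#
#   counts = pd.DataFrame({'hit': [12, 30, 7], 'miss': [88, 70, 93]},
#                         index=['ctrl', 'treat_a', 'treat_b'])
#   pairwise_df = chi_pairwise(counts, verbose=True)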