Source code for genal.association

import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype
import scipy.stats as st
import os, subprocess

from .extract_prs import check_bfiles
from .tools import get_plink19_path


def association_test_func(data, covar_list, standardize, name, data_pheno, pheno_type):
    """
    Run single-SNP association tests between the extracted SNPs and a phenotype.

    The work is delegated to four helpers, called in this order:

    1. ``_prepare_fam_file`` writes the phenotype into the FAM file.
    2. ``_handle_covariates`` writes the covariate file (if covariates are given).
    3. ``_run_plink_assoc_test`` launches the PLINK regression.
    4. ``_process_results`` merges the PLINK output back into ``data``.

    Args:
        data (pd.DataFrame): Genetic data with the standard Geno columns.
        covar_list (list): Column names of ``data_pheno`` to use as covariates.
        standardize (bool): Whether a quantitative phenotype is standardized.
        name (str): Prefix of the files read and written under ``tmp_GENAL``.
        data_pheno (pd.DataFrame): Phenotype data with at least the IID and
            PHENO columns.
        pheno_type (str): Type of phenotype ('binary' or 'quant').

    Returns:
        pd.DataFrame: Processed results of the association tests.

    Raises:
        FileNotFoundError: If ``check_bfiles`` rejects the
            ``tmp_GENAL/<name>_allchr`` fileset.
        ValueError: If ``data`` has no rows.

    This function corresponds to the following Geno method:
    :meth:`Geno.association_test`.
    """
    bfile_prefix = os.path.join("tmp_GENAL", f"{name}_allchr")

    # Guard clauses: the genetic fileset must exist and there must be SNPs to test.
    if not check_bfiles(bfile_prefix):
        raise FileNotFoundError(
            "Run the extract_snps() method before performing association tests."
        )
    if data.shape[0] == 0:
        raise ValueError(
            "No SNPs for the association tests. Check the .data or .data_clumped dataframes."
        )

    # The FAM file is rewritten on disk; the returned dataframe is not needed here.
    _prepare_fam_file(bfile_prefix, data_pheno, pheno_type, standardize)

    has_covariates, covariate_path = _handle_covariates(covar_list, data_pheno, name)

    plink_output, regression = _run_plink_assoc_test(
        bfile_prefix, name, pheno_type, has_covariates, covariate_path, covar_list
    )

    return _process_results(plink_output, regression, data, pheno_type)
def _prepare_fam_file(genetic_path, data_pheno, pheno_type, standardize): """Helper function to prepare the FAM file with phenotype data.""" # Read the FAM file fam = pd.read_csv(genetic_path + ".fam", header=None, delimiter=" ") # Extract relevant phenotype data data_pheno_trait = data_pheno[["IID", "PHENO"]].rename(columns={"IID": 0}).copy() # Merge phenotype data with the FAM dataframe fam = fam.merge(data_pheno_trait, how="left", on=0, indicator=True) fam[5] = fam.PHENO # Verify that the merge was successful if (fam["_merge"] == "both").sum() == 0: raise ValueError( "The IDs in the phenotype dataframe are inconsistent with those in the genetic dataset. Call set_phenotype() method again, specifying the correct column name for the genetic IDs." ) fam.drop(axis=1, columns=["PHENO", "_merge"], inplace=True) # Count the number of individuals with a valid phenotype trait n_non_na = fam.shape[0] - fam[5].isna().sum() print( f"{n_non_na} individuals are present in the genetic data and have a valid phenotype trait." ) # Update phenotype values based on its type if pheno_type == "binary": fam[5] = fam[5] + 1 fam[5] = fam[5].astype("Int64") if (pheno_type == "quant") & (standardize == True): # Standardizing for quantitative phenotypes print( "Standardizing the phenotype to approximate a normal distribution. Use standardize = False if you do not want to standardize." ) fam[5] = (fam[5] - fam[5].mean()) / fam[5].std() fam[5] = fam[5].fillna(-9) fam.to_csv(genetic_path + ".fam", header=None, index=False, sep=" ") return fam def _handle_covariates(covar_list, data_pheno, name): """Helper function to prepare the covariate file.""" if len(covar_list) > 0: # Ensure all covariates are present in phenotype data for col in covar_list: if col not in data_pheno.columns: raise TypeError( f"The {col} column is not found in the .phenotype dataframe." 
) # Select required columns and rename columns data_cov = data_pheno[["IID", "IID"] + covar_list].copy() data_cov.columns = ["FID"] + list(data_cov.columns[1:]) # Remove rows with NA values and print their number nrows = data_cov.shape[0] data_cov.dropna(inplace=True) removed_rows = nrows - data_cov.shape[0] if removed_rows > 0: print( f"{removed_rows}({removed_rows/nrows*100:.3f}%) individuals have NA values in the covariates columns and will be excluded from the association tests." ) # Ensure the covariates are numeric and not trivial (lead to association fail) for col in covar_list: if data_pheno[col].nunique() == 1: print( f"The {col} covariate contains only one value and is removed from the tests." ) data_cov.drop(axis=1, columns=[col], inplace=True) if not pd.api.types.is_numeric_dtype(data_pheno[col]): print( f"The {col} covariate is not numeric and is removed from the tests." ) data_cov.drop(axis=1, columns=[col], inplace=True, errors="ignore") # Define the covariate filename covar_filename = os.path.join("tmp_GENAL", f"{name}_covar.cov") # Ensure FID and IID are in integer format and write the covariate file data_cov["IID"] = data_cov["IID"].astype("Int64") data_cov["FID"] = data_cov["FID"].astype("Int64") data_cov.to_csv(covar_filename, sep=" ", header=True, index=False) covar = True else: covar = False covar_filename = None return covar, covar_filename def _run_plink_assoc_test( genetic_path, name, pheno_type, covar, covar_filename, covar_list ): """Helper function to execute the PLINK association test.""" method = "logistic" if pheno_type == "binary" else "linear" print( f"Running single-SNP {method} regression tests on {genetic_path} data {f'with adjustment for: {covar_list}' if covar else 'without covariate adjustments'}." 
) # Formulate the covariate argument for the PLINK command covar_argument = f"--covar {covar_filename} --hide-covar" if covar == True else "" output = os.path.join("tmp_GENAL", name) # Construct and run the PLINK command command = f"{get_plink19_path()} --bfile {genetic_path} --{method} {covar_argument} --out {output}" subprocess.run(command, shell=True, capture_output=True, text=True, check=True) return output, method def _process_results(output, method, data, pheno_type): """Helper function to process results after the PLINK association test.""" # Path to PLINK results results_path = output + f".assoc." + method assoc = pd.read_csv(results_path, delimiter="\s+") # If logistic regression, log-transform the odds ratio assoc["BETA"] = np.log(assoc.OR) if pheno_type == "binary" else assoc.BETA n_na = assoc["BETA"].isna().sum() # Merge results with the clumped data data = data.drop(axis=1, columns=["BETA", "CHR", "P"], errors="ignore").merge( assoc, how="inner", on="SNP" ) # Adjust beta values based on allele match data["BETA"] = np.where( data.EA == data.A1, data.BETA, np.where(data.NEA == data.A1, -data.BETA, np.nan) ) # Calculate and set standard error values data["SE"] = np.abs(data.BETA / st.norm.ppf(data.P / 2)) # Drop unnecessary columns data = data.drop( axis=1, columns=["A1", "TEST", "NMISS", "OR", "STAT", "BP"], errors="ignore" ) # Remove rows with mismatches in allele columns and notify the user nrow_previous = data.shape[0] data = data.dropna(subset="BETA") delta_nrow = nrow_previous - data.shape[0] - n_na if (delta_nrow > 0) or (n_na > 0): print( f"{f'{n_na}({n_na/nrow_previous*100:.3f}%) SNP-trait tests returned NA value and ' if n_na>0 else ''}{delta_nrow}({delta_nrow/nrow_previous*100:.3f}%) SNPs removed due to allele discrepancies between the main data and the genetic data." ) return data
def set_phenotype_func(data_original, PHENO, PHENO_type, IID, alternate_control):
    """
    Build the phenotype dataframe used for single-SNP association testing.

    A copy of ``data_original`` is validated, its ID and phenotype columns are
    renamed to 'IID' and 'PHENO', the phenotype type is inferred or checked,
    and the phenotype is processed according to that type.

    Args:
        data_original (pd.DataFrame): Contains at least an individual IDs
            column and one phenotype column. It is copied, not modified.
        PHENO (str): Name of the phenotype column in ``data_original``.
        PHENO_type (str or None): Type of the phenotype column, either "quant"
            for quantitative (continuous) or "binary". The type is inferred
            when None is given.
        IID (str): Name of the individual IDs column in ``data_original``.
        alternate_control (bool): For a binary trait, the controls are assumed
            to be coded with the most frequent value. Use True to reverse the
            assumption.

    Returns:
        pd.DataFrame: The processed copy of the data.
        str: The inferred or provided PHENO_type.

    Raises:
        ValueError: For inconsistencies in the provided data or arguments.

    This function corresponds to the following GENO method:
    :meth:`GENO.set_phenotype`.
    """
    phenotype_df = data_original.copy()

    _validate_columns_existence(phenotype_df, PHENO, IID)
    phenotype_df = _standardize_column_names(phenotype_df, PHENO, IID)

    resolved_type = _determine_phenotype_type(phenotype_df, PHENO_type)
    phenotype_df = _validate_and_process_phenotype(
        phenotype_df, PHENO, resolved_type, alternate_control
    )

    _report_na_values(phenotype_df)
    print("The phenotype data is stored in the .phenotype attribute.")

    return phenotype_df, resolved_type
def _validate_columns_existence(data, PHENO, IID): """Checks if columns exist and raises errors if not.""" for column in [PHENO, IID]: # Raise an error if the column name is not provided if column is None: raise ValueError(f"Please provide a name for the {column} variable.") # Raise an error if the column does not exist in the data if column not in data.columns: raise ValueError( f"The column '{column}' is not present in the dataset. This column is required!" ) if data.shape[0] == 0: raise ValueError("The phenotype dataframe is empty.") def _standardize_column_names(data, PHENO, IID): """Standardizes the column names to 'IID' and 'PHENO'.""" # Drop redundant columns if they exist and rename the target columns to standard names if PHENO != "PHENO": data.drop(axis=1, columns=["PHENO"], errors="ignore", inplace=True) if IID != "IID": data.drop(axis=1, columns=["IID"], errors="ignore", inplace=True) data.rename(columns={IID: "IID", PHENO: "PHENO"}, inplace=True) return data def _determine_phenotype_type(data, PHENO_type): """Guesses or validates the phenotype type.""" # If phenotype type is not given, deduce it based on the unique values in the column if PHENO_type is None: if len(np.unique(data.PHENO.dropna())) == 2: print( "Detected a binary phenotype in the 'PHENO' column. Specify 'PHENO_type=\"quant\"' if this is incorrect." ) return "binary" else: print( "Detected a quantitative phenotype in the 'PHENO' column. Specify 'PHENO_type=\"binary\"' if this is incorrect." 
) return "quant" else: if not PHENO_type in ["binary", "quant"]: raise ValueError( f"The only possible values for the PHENO_type argument are 'binary' or 'quant'" ) return PHENO_type def _validate_and_process_phenotype(data, PHENO, PHENO_type, alternate_control): """Validates the phenotype and processes it accordingly.""" # Process the phenotype based on its type if PHENO_type == "binary": _process_binary_phenotype(data, PHENO, alternate_control) elif PHENO_type == "quant": _validate_quantitative_phenotype(data, PHENO) else: raise ValueError("Accepted values for 'PHENO_type' are 'binary' or 'quant'.") return data def _process_binary_phenotype(data, PHENO, alternate_control): """Processes a binary phenotype.""" # Ensure that the phenotype is binary if len(np.unique(data.PHENO.dropna())) != 2: raise ValueError( f"The '{PHENO}' column is not binary as it contains more than two distinct values." ) if alternate_control: code_control = data.PHENO.value_counts().index[1] code_case = data.PHENO.value_counts().index[0] else: code_control = data.PHENO.value_counts().index[0] code_case = data.PHENO.value_counts().index[1] print( f"Identified {code_control} as the control code in 'PHENO'. {'Set alternate_control=True to inverse this interpretation.' if not alternate_control else ''}" ) # Update the control and case codings data.replace({"PHENO": {code_control: 0, code_case: 1}}, inplace=True) def _validate_quantitative_phenotype(data, PHENO): """Validates a quantitative phenotype.""" # Ensure that the phenotype is numeric if not is_numeric_dtype(data.PHENO): raise ValueError( f"The '{PHENO}' column must contain numeric values for a quantitative phenotype." 
) def _report_na_values(data): """Reports the number of NA values in 'IID' and 'PHENO' columns.""" nrows = data.shape[0] n_nan_id = data.IID.isna().sum() n_nan_pheno = data.PHENO.isna().sum() # Report NA values in ID and PHENO columns, if they exist if n_nan_id > 0: print( f"Detected {n_nan_id} NA values in the 'ID' column, accounting for {n_nan_id/nrows*100:.3f}% of entries. These will be omitted during analyses." ) if n_nan_pheno > 0: print( f"Detected {n_nan_pheno} NA values in the 'PHENO' column, accounting for {n_nan_pheno/nrows*100:.3f}% of entries. These will be omitted during analyses." )