import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from scipy.stats import norm, chi2
from concurrent.futures import ProcessPoolExecutor, as_completed
from sklearn.linear_model import LinearRegression
from tqdm import tqdm
from numpy.random import default_rng
from functools import partial
# TODO: implement the multivariable option; for the moment we assume only one BETA_e column.
# Also: check if we can replace the LinearRegression of sklearn with one from statsmodels to avoid using sklearn just for that
# MR-PRESSO main function
# [docs]  (Sphinx viewcode anchor leaked from HTML export — not valid Python, commented out)
def mr_presso(
    data: pd.DataFrame,
    # NOTE(review): mutable default argument — harmless here because it is never
    # mutated, but a None sentinel would be the safer idiom.
    BETA_e_columns: list = ["BETA_e"],
    n_iterations: int = 1000,
    outlier_test: bool = True,
    distortion_test: bool = True,
    significance_p: float = 0.05,
    cpus: int = 5,
):
    """
    Perform the MR-PRESSO algorithm for detection of horizontal pleiotropy.

    Args:
        data (pd.DataFrame): DataFrame with at least 4 columns: BETA_o (outcome),
            SE_o, BETA_e (exposure), SE_e.
        BETA_e_columns (list): List of exposure beta columns. Only the
            single-exposure case is fully supported (see module-level TODO);
            several downstream steps hardcode the "BETA_e" column name.
        n_iterations (int): Number of steps performed (random data generation).
        outlier_test (bool): If True, identifies outlier SNPs responsible for
            horizontal pleiotropy.
        distortion_test (bool): If True, tests significant distortion in the
            causal estimates.
        significance_p (float): Statistical significance threshold for the
            detection of horizontal pleiotropy.
        cpus (int): Number of CPUs to use for parallel processing.

    Returns:
        mod_table (pd.DataFrame): DataFrame with the original and
            outlier-corrected inverse variance-weighted MR results.
        GlobalTest (dict): Dictionary with p-value of the global MR-PRESSO test.
        OutlierTest (pd.DataFrame): DataFrame with p-value for each SNP for the
            outlier test (empty when the test is skipped or not significant).
        BiasTest (dict): Dictionary with results of the distortion test
            (empty dict when distortion/outlier testing did not run).

    Raises:
        Exception: If there are too few instrumental variables, or if
            n_iterations is not larger than the number of SNPs.
        ValueError: If the outlier test is requested with several exposures.
    """
    # Transforming the data: keep only the required columns, drop incomplete rows.
    data = data[["BETA_o", *BETA_e_columns, "SE_o", "SE_e"]].dropna()
    # Orient all effects so the first exposure beta is positive; the outcome
    # beta is flipped together with the exposure betas, preserving direction.
    data[["BETA_o", *BETA_e_columns]] = data[["BETA_o", *BETA_e_columns]].multiply(
        np.sign(data[BETA_e_columns[0]]), axis=0
    )
    # Inverse-variance weights used by every weighted regression below.
    data["Weights"] = 1 / (data["SE_o"] ** 2)
    # Need more SNPs than regression parameters to fit the leave-one-out models.
    if len(data) <= len(BETA_e_columns) + 2:
        raise Exception("Not enough instrumental variables")
    # Empirical p-values have resolution 1/n_iterations, so the number of
    # simulations must exceed the number of SNPs.
    if len(data) >= n_iterations:
        raise Exception(
            "Not enough elements to compute empirical P-values, increase n_iterations"
        )
    print(f"Running the MR-PRESSO algorithm with N = {n_iterations} iterations.")
    # 1- Computing the observed residual sum of squares (RSS)
    # getRSS_LOO is defined elsewhere in this module; given the indexing below,
    # with outlier_test=True it presumably returns (RSS, fitted slope) and a
    # scalar RSS otherwise — TODO confirm against its definition.
    print(f"Computing the observed residual sum of squares...")
    RSSobs = getRSS_LOO(data, BETA_e_columns, outlier_test)
    # 2- Computing the distribution of expected residual sum of squares (RSS)
    print("Computing the global MRPRESSO p-value...")
    partial_parallel_RSS_LOO = partial(
        parallel_RSS_LOO, data=data, BETA_e_columns=BETA_e_columns
    )  # Wrapper function freezing the parallel_RSS_LOO call
    with ProcessPoolExecutor(max_workers=cpus) as executor:
        results = list(
            tqdm(
                executor.map(partial_parallel_RSS_LOO, range(n_iterations)),
                total=n_iterations,
                desc="Generating random data",
                ncols=100,
            )
        )
    # Each worker presumably returns (expected RSS, random exposure betas,
    # random outcome betas) — parallel_RSS_LOO is defined past this chunk.
    RSSexp = [res[0] for res in results]
    Random_data_e = np.vstack([r[1] for r in results])
    Random_data_o = np.vstack([r[2] for r in results])
    # Empirical global p-value: share of simulated RSS exceeding the observed.
    global_p = np.sum([r > RSSobs[0] for r in RSSexp]) / n_iterations
    # Below the resolution of the empirical distribution, report an upper bound.
    global_p_str = (
        global_p if global_p > 1 / n_iterations else f"< {1/n_iterations:.1e}"
    )
    if outlier_test:
        GlobalTest = {"RSSobs": RSSobs[0], "Global_test_p": global_p_str}
    else:
        GlobalTest = {"RSSobs": RSSobs, "Global_test_p": global_p_str}
    # 3- Computing the single IV outlier test (only when the global test is
    # significant and the caller asked for it).
    if global_p < significance_p and outlier_test:
        print("Running the Outlier test.")
        if len(BETA_e_columns) == 1:
            # Observed vs simulated residuals around the fitted slope RSSobs[1].
            # Hardcodes "BETA_e": single-exposure assumption.
            Dif = data["BETA_o"].values - data["BETA_e"].values * RSSobs[1]
            Exp = Random_data_o - (Random_data_e * RSSobs[1])
        else:
            raise ValueError("Outlier test not done for multi MR.")
        # Per-SNP empirical p-value: fraction of simulated residuals larger in
        # absolute value than the observed residual for that SNP.
        abs_diffs = np.abs(Exp.T) > np.abs(Dif)[:, np.newaxis]
        pvals = np.sum(abs_diffs, axis=1) / Exp.shape[0]
        OutlierTest = pd.DataFrame({"RSSobs": Dif**2, "Pvalue": pvals})
        OutlierTest.index = data.index
        OutlierTest["Pvalue"] = np.minimum(
            OutlierTest["Pvalue"] * len(data), 1
        )  # Bonferroni correction
        # NOTE(review): "in unstable" in the message below should read "is unstable".
        if data.shape[0] / n_iterations > significance_p:
            print(
                f"Warning: the Outlier test in unstable. The {significance_p} significance threshold cannot be obtained with {n_iterations} Distributions. Increase n_iterations."
            )
    else:
        # Flag carried forward so step 4 knows the outlier test did not run.
        outlier_test = False
        OutlierTest = pd.DataFrame()
    # 4- Computing the test of the distortion of the causal estimate
    # NOTE(review): this message prints even when the distortion test is skipped.
    print("Running the Distortion test.")
    # Inverse-variance-weighted MR fit on all SNPs (no intercept).
    formula = f"BETA_o ~ -1 + {' + '.join(BETA_e_columns)}"
    mod_all = smf.wls(formula, data=data, weights=data["Weights"]).fit()
    BiasTest = {}
    if distortion_test and outlier_test:
        ## Is there an error in the MRPRESSO code? The outlier indices are supposed
        ## to be excluded from the expected bias computation (as per the paper).
        def get_random_bias(BETA_e_columns, data, ref_outlier):
            """Fit one bootstrap IVW model: outliers first, padded with a random
            resample of the non-outliers, then the last len(ref_outlier) rows
            are trimmed before fitting (mirrors the original R implementation;
            see the question in the comment above)."""
            # NOTE(review): ref_outlier holds index labels while np.random.choice
            # draws positions; .iloc below treats both as positions — matches the
            # R code only when the index is the default RangeIndex. Verify.
            indices = np.concatenate(
                [
                    ref_outlier,
                    np.random.choice(
                        list(set(range(len(data))) - set(ref_outlier)),
                        len(data) - len(ref_outlier),
                    ),
                ]
            )
            subset_data = data.iloc[indices[: -len(ref_outlier)]]
            mod_random = smf.wls(
                f"BETA_o ~ -1 + {' + '.join(BETA_e_columns)}",
                data=subset_data,
                weights=subset_data["Weights"],
            ).fit()
            return mod_random.params[BETA_e_columns]

        # SNPs whose Bonferroni-corrected outlier p-value is significant.
        ref_outlier = OutlierTest.loc[OutlierTest["Pvalue"] <= significance_p].index
        if len(ref_outlier) > 0:
            if len(ref_outlier) < len(data):
                # Bootstrap distribution of the causal estimate.
                BiasExp = [
                    get_random_bias(BETA_e_columns, data, ref_outlier)
                    for _ in range(n_iterations)
                ]
                BiasExp = pd.concat(BiasExp, axis=1).transpose()
                # Outlier-corrected IVW fit.
                subset_data = data.drop(ref_outlier)
                mod_no_outliers = smf.wls(
                    f"BETA_o ~ -1 + {' + '.join(BETA_e_columns)}",
                    data=subset_data,
                    weights=subset_data["Weights"],
                ).fit()
                # Relative distortion of the raw estimate vs the corrected one,
                # observed and under the bootstrap distribution.
                BiasObs = (
                    mod_all.params[BETA_e_columns]
                    - mod_no_outliers.params[BETA_e_columns]
                ) / abs(mod_no_outliers.params[BETA_e_columns])
                BiasExp = (mod_all.params[BETA_e_columns] - BiasExp) / abs(BiasExp)
                # Empirical two-sided p-value (Series: one entry per exposure).
                p_value = np.sum(np.abs(BiasExp) > np.abs(BiasObs)) / n_iterations
                BiasTest = {
                    "Outliers Indices": list(ref_outlier),
                    # Reported as a percentage; [0] again assumes one exposure.
                    "Distortion test coefficient": 100 * BiasObs.values[0],
                    "Distortion test p-value": p_value.iloc[0],
                }
            else:
                BiasTest = {
                    "Outliers Indices": "All SNPs considered as outliers",
                    "Distortion test coefficient": np.nan,
                    "Distortion test p-value": np.nan,
                }
        else:
            BiasTest = {
                "Outliers Indices": "No significant outliers",
                "Distortion test coefficient": np.nan,
                "Distortion test p-value": np.nan,
            }
    # 5- Format the result table: raw estimate plus (if available) the
    # outlier-corrected estimate. Hardcodes "BETA_e" (single-exposure).
    row_original = {
        "exposure": BETA_e_columns[0],
        "method": "Raw",
        "nSNP": len(data),
        "b": mod_all.params["BETA_e"],
        "se": mod_all.bse["BETA_e"],
        "pval": mod_all.pvalues["BETA_e"],
    }
    # mod_no_outliers only exists when the distortion test ran with 0 < n_outliers < n_SNPs.
    if "mod_no_outliers" in locals():
        row_corrected = {
            "exposure": BETA_e_columns[0],
            "method": "Outlier-corrected",
            "nSNP": len(data) - len(ref_outlier),
            "b": mod_no_outliers.params["BETA_e"],
            "se": mod_no_outliers.bse["BETA_e"],
            "pval": mod_no_outliers.pvalues["BETA_e"],
        }
    else:
        row_corrected = {
            "exposure": BETA_e_columns[0],
            "method": "Outlier-corrected",
            "nSNP": np.nan,
            "b": np.nan,
            "se": np.nan,
            "pval": np.nan,
        }
    mod_table = pd.DataFrame([row_original, row_corrected])
    return mod_table, GlobalTest, OutlierTest, BiasTest
## MR-PRESSO helper functions
# Define the matrix power operator
# [docs]  (Sphinx viewcode anchor leaked from HTML export — not valid Python, commented out)
def power_eigen(x, n):
    """
    Raise a diagonalizable square matrix to an arbitrary (possibly fractional
    or negative) power via its eigendecomposition: x^n = V diag(w^n) V^-1.

    Args:
        x (np.ndarray): Square, diagonalizable matrix.
        n (float): Exponent applied to the eigenvalues.

    Returns:
        np.ndarray: The matrix power x^n.
    """
    values, vectors = np.linalg.eig(x)
    # Use the true inverse of the eigenvector matrix. The previous vectors.T
    # is only equivalent when the eigenvectors are orthonormal (symmetric x);
    # inv(vectors) is correct for any diagonalizable matrix and agrees with
    # the old result for symmetric input up to floating-point error.
    return vectors.dot(np.diag(values**n)).dot(np.linalg.inv(vectors))
# Function to compute the residual sum of squares in a LOO framework
# Generate random data based on normal distributions
# [docs]  (Sphinx viewcode anchor leaked from HTML export — not valid Python, commented out)
def getRandomData(data, BETA_e_columns=["BETA_e"]):
    """
    Simulate one random dataset under the null hypothesis of no pleiotropy.

    For every SNP i, a no-intercept weighted least-squares regression of
    BETA_o on the exposure beta(s) is fitted with SNP i left out; the fitted
    coefficients give the expected outcome beta for SNP i. Exposure and
    outcome betas are then redrawn from normal distributions centred on the
    observed (exposure) / predicted (outcome) values with the reported SEs.

    Fixes vs the previous version:
    - Leave-one-out now uses a positional mask instead of label-based
      ``data.drop(i)``, so a non-default index (e.g. after ``dropna()``)
      no longer raises KeyError.
    - The weighted fit is done in closed form with numpy
      (beta = (X'WX)^-1 X'Wy), removing this helper's dependency on
      sklearn's LinearRegression (see module TODO).

    Args:
        data (pd.DataFrame): Must contain BETA_o, SE_o, SE_e, the exposure
            column(s) and Weights.
        BETA_e_columns (list): Exposure beta column names. As before, only
            the first column is paired with "SE_e" (single-exposure TODO).

    Returns:
        pd.DataFrame: Random dataset with the exposure column(s), BETA_o and
        Weights; default RangeIndex, same length as ``data``.
    """
    rng = default_rng()
    X = data[BETA_e_columns].to_numpy(dtype=float)
    y = data["BETA_o"].to_numpy(dtype=float)
    w = data["Weights"].to_numpy(dtype=float)
    se_o = data["SE_o"].to_numpy(dtype=float)
    n = len(data)
    # Leave-one-out WLS coefficients (no intercept) for each SNP.
    loo_betas = []
    for i in range(n):
        mask = np.ones(n, dtype=bool)
        mask[i] = False  # positional exclusion: safe for any index
        Xi, yi, wi = X[mask], y[mask], w[mask]
        XtW = Xi.T * wi
        loo_betas.append(np.linalg.solve(XtW @ Xi, XtW @ yi))
    random_data_dict = {}
    # zip truncates to the single "SE_e" column: only the first exposure
    # is perturbed (intentional for now — see module TODO).
    for col, sd_col in zip(BETA_e_columns, ["SE_e"]):
        random_data_dict[col] = rng.normal(data[col], data[sd_col])
    # Outcome betas drawn around the leave-one-out prediction for each SNP,
    # using the *observed* exposure betas (as in the original implementation).
    random_data_dict["BETA_o"] = [
        float(rng.normal(X[i] @ loo_betas[i], se_o[i])) for i in range(n)
    ]
    random_data_dict["Weights"] = data["Weights"].values
    random_data_df = pd.DataFrame(random_data_dict)
    return random_data_df
# Function for the parallel executor in step 2: generate random data and compute the expected residual sum of squares