Source code for genal.snp_query

import aiohttp
import asyncio
import numpy as np
import nest_asyncio
from tqdm.asyncio import tqdm_asyncio

# Using nest_asyncio to allow execution in notebooks: applied once at import time
# so that async_query_gwas_catalog() can call loop.run_until_complete() there.
# NOTE(review): presumably needed because notebooks already run an event loop — confirm.
nest_asyncio.apply()

# Main function to start the event loop and run the asynchronous query
def async_query_gwas_catalog(snps, p_threshold=5e-8, return_p=False, return_study=False, max_associations=None, timeout=100):
    """
    Synchronous entry point: run the asynchronous GWAS Catalog query to completion.

    All arguments are forwarded unchanged to query_gwas_catalog_coroutine.

    Parameters:
        snps (list): List of SNPs to query.
        p_threshold (float): P-value threshold for filtering associations.
        return_p (bool): Whether to return the P-value of the association.
        return_study (bool): Whether to return the study ID of the association.
        max_associations (int): Per-SNP limit on associations, as interpreted by the coroutine.
        timeout (int): Timeout for each query in seconds.

    Returns:
        tuple: (results_global, errors, timeouts) exactly as returned by
        query_gwas_catalog_coroutine.
    """
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # Newer Python versions no longer create an event loop implicitly when
        # none is set for the current thread; create and register one ourselves.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    results_global, errors, timeouts = loop.run_until_complete(
        query_gwas_catalog_coroutine(
            snps, p_threshold, return_p, return_study, max_associations, timeout
        )
    )
    return results_global, errors, timeouts
# Function to query GWAS Catalog API for SNP associations
async def query_gwas_catalog_coroutine(snps, p_threshold=5e-8, return_p=False, return_study=False, max_associations=None, timeout=100):
    """
    Query the GWAS Catalog API for SNP associations.

    One request is issued per SNP (all SNPs are queried concurrently), plus one
    extra request per reported association when return_study is True.

    Parameters:
        snps (list): List of SNPs to query.
        p_threshold (float): P-value threshold; only associations whose P-value is
            strictly below it are reported.
        return_p (bool): Whether to return the P-value of the association.
        return_study (bool): Whether to return the study ID of the association.
        max_associations (int): Maximum number of associations examined for each
            SNP. The limit is applied before P-value filtering, so fewer
            associations may be reported. None or 0 means no limit.
        timeout (int): Timeout for each query in seconds.

    Returns:
        results_global (dict): Maps each successfully queried SNP to a list of results.
            Each result is a trait string, or a tuple (trait, p), (trait, study_id)
            or (trait, p, study_id) depending on the return flags; p is formatted
            with "{:.4g}".
        errors (list): List storing SNP for which the GWAS Catalog could not be queried
        timeouts (list): List storing SNP for which the timeout was reached
    """
    results_global = {}  # SNP -> list of results (strings or tuples)
    errors = []  # SNPs for which the GWAS Catalog could not be queried
    timeouts = []  # SNPs for which the timeout was reached

    async def fetch(session, url, timeout=timeout):
        """Return the decoded JSON body, None on failure, or "TIMEOUT" on timeout."""
        try:
            async with asyncio.timeout(timeout):
                async with session.get(url) as response:
                    if response.status == 200:
                        return await response.json()
                    return None
        except asyncio.TimeoutError:
            return "TIMEOUT"
        except (aiohttp.ClientError, ValueError):
            # A connection problem or an undecodable body for one URL must not
            # abort the whole batch; the caller records it as a failed query.
            return None

    async def process_snp(session, snp):
        """Query one SNP and record its outcome in results_global / errors / timeouts."""
        base_url = f"https://www.ebi.ac.uk/gwas/rest/api/singleNucleotidePolymorphisms/{snp}/associations?projection=associationBySnp"
        base_data = await fetch(session, base_url, timeout=timeout)

        if base_data == "TIMEOUT":
            timeouts.append(snp)
            return
        if not base_data:
            errors.append(snp)
            return

        # Raw hits as (trait, pvalue, study_id); the P-value stays numeric until
        # the final formatting step so that comparisons are done on numbers.
        hits = []
        associations = base_data.get('_embedded', {}).get('associations', [])
        for index, assoc in enumerate(associations):
            # If max_associations have already been examined, stop the loop
            if max_associations and index >= max_associations:
                break

            pvalue = assoc.get("pvalue", np.nan)
            # Skip associations without a usable P-value (missing, null or NaN)
            # and those that do not pass the threshold.
            if pvalue is None or not pvalue < p_threshold:
                continue

            # An association may carry no EFO trait at all; report it as "".
            efo_traits = assoc.get("efoTraits") or [{}]
            trait = efo_traits[0].get("trait", "")

            study_id = None
            if return_study:
                # Query the page containing the GWAS Catalog study ID
                study_url = assoc.get("_links", {}).get("study", {}).get("href")
                if not study_url:
                    study_id = "Not found"
                else:
                    study_data = await fetch(session, study_url, timeout=timeout)
                    if study_data == "TIMEOUT":
                        study_id = "TIMEOUT"
                    elif study_data:
                        study_id = study_data.get("accessionId", "")
                    else:
                        study_id = "Not found"

            hits.append((trait, pvalue, study_id))

        # Shape the results depending on the return flags
        if return_p and return_study:
            results_snp = [
                (trait, "{:.4g}".format(pvalue), study_id)
                for trait, pvalue, study_id in hits
            ]
        elif return_study:
            results_snp = [(trait, study_id) for trait, _, study_id in hits]
        elif return_p:
            # Return each trait once, with its lowest P-value
            lowest = {}
            for trait, pvalue, _ in hits:
                if trait not in lowest or pvalue < lowest[trait]:
                    lowest[trait] = pvalue
            results_snp = [
                (trait, "{:.4g}".format(pvalue)) for trait, pvalue in lowest.items()
            ]
        else:
            # Display each trait only once, in order of first appearance
            results_snp = list(dict.fromkeys(trait for trait, _, _ in hits))

        results_global[snp] = results_snp

    async with aiohttp.ClientSession() as session:
        tasks = [process_snp(session, snp) for snp in snps]
        await tqdm_asyncio.gather(*tasks)

    return results_global, errors, timeouts