--- title: Title keywords: fastai sidebar: home_sidebar nb_path: "nbs/Untitled.ipynb" ---
{% raw %}
{% endraw %}

Summary statistics merger

{% raw %}
import os
import yaml
import glob
import pandas as pd
from Bio.Seq import Seq
{% endraw %} {% raw %}
# Load the tab-separated list of YAML configs and display its rows as nested lists.
pd.read_csv('../xqtl-pipeline/pipeline/misc/data/yml_list.txt',sep = "\t").values.tolist()
[[0, './data/template.yml']]
{% endraw %} {% raw %}
# Run the whole merge for the template config; keep_ambiguous=False is forwarded to snps_match.
merge_sumstats('../xqtl-pipeline/pipeline/misc/data/template.yml',keep_ambiguous=False)
Total number of sumstats:  4
{'../xqtl-pipeline/pipeline/misc/data/testflip/snps500_flip_rea0a1.regenie.snp_stats.gz': {'ID': 'CHR,POS', 'CHR': 'CHR', 'POS': 'POS', 'A0': 'REF', 'A1': 'ALT', 'SNP': 'SNP', 'STAT': 'BETA', 'SE': 'SE', 'P': 'P'}, '../xqtl-pipeline/pipeline/misc/data/testflip/snps500_rea0a1.regenie.snp_stats.gz': {'ID': 'CHR,POS', 'CHR': 'CHR', 'POS': 'POS', 'A0': 'REF', 'A1': 'ALT', 'SNP': 'SNP', 'STAT': 'BETA', 'SE': 'SE', 'P': 'P'}, '../xqtl-pipeline/pipeline/misc/data/testflip/snps500.regenie.snp_stats.gz': {'ID': 'CHR,POS', 'CHR': 'CHR', 'POS': 'POS', 'A0': 'REF', 'A1': 'ALT', 'SNP': 'SNP', 'STAT': 'BETA', 'SE': 'SE', 'P': 'P'}, '../xqtl-pipeline/pipeline/misc/data/testflip/snps1000.regenie.snp_stats.gz': {'ID': 'CHR,POS', 'CHR': 'CHR', 'POS': 'POS', 'A0': 'REF', 'A1': 'ALT', 'SNP': 'SNP', 'STAT': 'BETA', 'SE': 'SE', 'P': 'P'}}
../xqtl-pipeline/pipeline/misc/data/testflip/snps500_flip_rea0a1.regenie.snp_stats.gz {'ID': 'CHR,POS', 'CHR': 'CHR', 'POS': 'POS', 'A0': 'REF', 'A1': 'ALT', 'SNP': 'SNP', 'STAT': 'BETA', 'SE': 'SE', 'P': 'P'}
../xqtl-pipeline/pipeline/misc/data/testflip/snps500_rea0a1.regenie.snp_stats.gz {'ID': 'CHR,POS', 'CHR': 'CHR', 'POS': 'POS', 'A0': 'REF', 'A1': 'ALT', 'SNP': 'SNP', 'STAT': 'BETA', 'SE': 'SE', 'P': 'P'}
../xqtl-pipeline/pipeline/misc/data/testflip/snps500.regenie.snp_stats.gz {'ID': 'CHR,POS', 'CHR': 'CHR', 'POS': 'POS', 'A0': 'REF', 'A1': 'ALT', 'SNP': 'SNP', 'STAT': 'BETA', 'SE': 'SE', 'P': 'P'}
../xqtl-pipeline/pipeline/misc/data/testflip/snps1000.regenie.snp_stats.gz {'ID': 'CHR,POS', 'CHR': 'CHR', 'POS': 'POS', 'A0': 'REF', 'A1': 'ALT', 'SNP': 'SNP', 'STAT': 'BETA', 'SE': 'SE', 'P': 'P'}
Total rows of query:  500 Total rows of subject:  500
Overlap chr:pos 500
              sign_flip  strand_flip  exact_match
1:18744859         True         True        False
1:19112524         True         True        False
1:19112744         True         True        False
1:19220870         True         True        False
1:19324561         True         True        False
...                 ...          ...          ...
10:91959301        True         True        False
10:93074445        True         True        False
10:94171346        True         True        False
10:100293231       True         True        False
10:101803994       True         True        False

[418 rows x 3 columns]
Overlap SNPs 418
Total rows of query:  500 Total rows of subject:  500
Overlap chr:pos 500
              sign_flip  strand_flip  exact_match
1:18744859         True        False        False
1:19112524         True        False        False
1:19112744         True        False        False
1:19220870         True        False        False
1:19324561         True        False        False
...                 ...          ...          ...
10:91959301        True        False        False
10:93074445        True        False        False
10:94171346        True        False        False
10:100293231       True        False        False
10:101803994       True        False        False

[418 rows x 3 columns]
Overlap SNPs 418
Total rows of query:  500 Total rows of subject:  500
Overlap chr:pos 500
              sign_flip  strand_flip  exact_match
1:18744859        False        False         True
1:19112524        False        False         True
1:19112744        False        False         True
1:19220870        False        False         True
1:19324561        False        False         True
...                 ...          ...          ...
10:91959301       False        False         True
10:93074445       False        False         True
10:94171346       False        False         True
10:100293231      False        False         True
10:101803994      False        False         True

[418 rows x 3 columns]
Overlap SNPs 418
Total rows of query:  1000 Total rows of subject:  500
Overlap chr:pos 500
              sign_flip  strand_flip  exact_match
1:18744859        False        False         True
1:19112524        False        False         True
1:19112744        False        False         True
1:19220870        False        False         True
1:19324561        False        False         True
...                 ...          ...          ...
10:91959301       False        False         True
10:93074445       False        False         True
10:94171346       False        False         True
10:100293231      False        False         True
10:101803994      False        False         True

[418 rows x 3 columns]
Overlap SNPs 418
Total number of common SNPs:  418
/tmp/1988675.1.plot.q/ipykernel_5955/2259156742.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_query.STAT[pm.sign_flip] = -new_query.STAT[pm.sign_flip]
/tmp/1988675.1.plot.q/ipykernel_5955/2259156742.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_query.STAT[pm.sign_flip] = -new_query.STAT[pm.sign_flip]
/tmp/1988675.1.plot.q/ipykernel_5955/2259156742.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_query.STAT[pm.sign_flip] = -new_query.STAT[pm.sign_flip]
/tmp/1988675.1.plot.q/ipykernel_5955/2259156742.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_query.STAT[pm.sign_flip] = -new_query.STAT[pm.sign_flip]
All are done!!!
{% endraw %} {% raw %}
# Parse the template config so its sections can be inspected step by step.
yml = load_yaml('../xqtl-pipeline/pipeline/misc/data/template.yml')
{% endraw %} {% raw %}
# Expand the INPUT and TARGET file patterns into {file path: column config} mappings.
input_dict = parse_input(yml['INPUT'])
target_dict = parse_input(yml['TARGET'])
# Directory setting read from the OUTPUT key of the config.
output_path = yml['OUTPUT']
{% endraw %} {% raw %}
# Inspect the raw TARGET section of the parsed config.
yml['TARGET']
[{'../xqtl-pipeline/pipeline/misc/data/testflip/snps500.regenie.snp_stats.gz': {'ID': 'CHR,POS',
   'CHR': 'CHR',
   'POS': 'POS',
   'A0': 'REF',
   'A1': 'ALT',
   'SNP': 'SNP',
   'STAT': 'BETA',
   'SE': 'SE',
   'P': 'P'}}]
{% endraw %} {% raw %}
# Inspect the INPUT mapping produced by parse_input.
input_dict
{'../xqtl-pipeline/pipeline/misc/data/testflip/snps500_flip_rea0a1.regenie.snp_stats.gz': {'ID': 'CHR,POS',
  'CHR': 'CHR',
  'POS': 'POS',
  'A0': 'REF',
  'A1': 'ALT',
  'SNP': 'SNP',
  'STAT': 'BETA',
  'SE': 'SE',
  'P': 'P'},
 '../xqtl-pipeline/pipeline/misc/data/testflip/snps500_rea0a1.regenie.snp_stats.gz': {'ID': 'CHR,POS',
  'CHR': 'CHR',
  'POS': 'POS',
  'A0': 'REF',
  'A1': 'ALT',
  'SNP': 'SNP',
  'STAT': 'BETA',
  'SE': 'SE',
  'P': 'P'},
 '../xqtl-pipeline/pipeline/misc/data/testflip/snps500.regenie.snp_stats.gz': {'ID': 'CHR,POS',
  'CHR': 'CHR',
  'POS': 'POS',
  'A0': 'REF',
  'A1': 'ALT',
  'SNP': 'SNP',
  'STAT': 'BETA',
  'SE': 'SE',
  'P': 'P'},
 '../xqtl-pipeline/pipeline/misc/data/testflip/snps1000.regenie.snp_stats.gz': {'ID': 'CHR,POS',
  'CHR': 'CHR',
  'POS': 'POS',
  'A0': 'REF',
  'A1': 'ALT',
  'SNP': 'SNP',
  'STAT': 'BETA',
  'SE': 'SE',
  'P': 'P'},
 '../xqtl-pipeline/pipeline/misc/data/testflip/flip/snps500_flip.regenie.snp_stats.gz': None}
{% endraw %} {% raw %}
# Inspect the TARGET mapping produced by parse_input.
target_dict
{}
{% endraw %} {% raw %}
# Column configs of the TARGET mapping, as a list.
list(target_dict.values())
[]
{% endraw %} {% raw %}
def merge_sumstats(yml,keep_ambiguous):
    """Harmonise every summary-statistics file of a YAML config against its target file.

    The config is read with ``load_yaml``; its ``INPUT`` and ``TARGET``
    sections are expanded with ``parse_input`` and ``OUTPUT`` is used as the
    directory the rewritten files go to. The first resolved TARGET file is
    added to the inputs, every file is matched against it with ``snps_match``,
    and only the SNPs present in all matched tables are written out.

    Args:
        yml: Path of the YAML config file.
        keep_ambiguous: Forwarded unchanged to ``snps_match``.

    Raises:
        ValueError: If the TARGET section resolves to no file.
        Exception: If two files share a base name (their outputs would collide,
            because each output is named after the input's base name).
    """
    #parse yaml
    yml = load_yaml(yml)
    input_dict = parse_input(yml['INPUT'])
    target_dict = parse_input(yml['TARGET'])
    output_path = yml['OUTPUT']

    if not target_dict:
        # parse_input drops patterns that match no file, so an empty mapping
        # means there is nothing to harmonise against.
        raise ValueError("TARGET in the yaml file does not match any existing file")
    # Only the first resolved target file is used as the reference.
    target_file, target_config = next(iter(target_dict.items()))
    input_dict[target_file] = target_config
    target_name = os.path.basename(target_file)

    lst_sumstats_file = [os.path.basename(i) for i in input_dict]
    print('Total number of sumstats: ',len(lst_sumstats_file))
    if len(set(lst_sumstats_file))<len(lst_sumstats_file):
        raise Exception(f"There are duplicated names in {lst_sumstats_file}")
    #read all sumstats
    print(input_dict)
    lst_sumstats = {os.path.basename(i):read_sumstat(i,j) for i,j in input_dict.items()}
    target = lst_sumstats[target_name]
    # Dicts keep insertion order, so nqs lines up with lst_sumstats_file below.
    nqs = [snps_match(query,target,keep_ambiguous)[0] for query in lst_sumstats.values()]
    #get common snps
    common_snps = set.intersection(*[set(nq.SNP) for nq in nqs])
    print('Total number of common SNPs: ',len(common_snps))
    #write out new sumstats
    for output_sumstats,nq in zip(lst_sumstats_file,nqs):
        sumstats = nq[nq.SNP.isin(common_snps)]
        sumstats.to_csv(os.path.join(output_path, output_sumstats), sep = "\t", header = True, index = False,compression='gzip')
    print('All are done!!!')
{% endraw %} {% raw %}
def load_yaml(yaml_file):
    """Parse a YAML file with ``yaml.safe_load`` and return the resulting object.

    Args:
        yaml_file: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.

    Raises:
        yaml.YAMLError: If the file is not valid YAML. The error is printed
            and then re-raised, so the caller sees the real parse failure.
    """
    with open(yaml_file, "r") as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
            raise
{% endraw %} {% raw %}
def parse_input(yml_input):
    """Expand ``{glob_pattern: column_config}`` entries into ``{file_path: column_config}``.

    Only the first key/value pair of each entry is used. The key is expanded
    with ``glob.glob``; every matching path gets its own shallow copy of the
    config so later edits to one file's config do not leak into another's.
    Patterns that match no file contribute nothing.

    Args:
        yml_input: Iterable of single-item mappings, or ``None``/empty for a
            section without entries. A config value may be ``None``.

    Returns:
        dict mapping each matched file path to its config (or ``None``).
    """
    input_dict = {}
    for entry in yml_input or []:
        pattern = list(entry.keys())[0]
        config = list(entry.values())[0]
        for name in glob.glob(pattern):
            # A null config is allowed; there is nothing to copy in that case.
            input_dict[name] = None if config is None else config.copy()
    return input_dict
{% endraw %} {% raw %}
# Scratch dict for trying out dict.pop.
a_dict = {"a": 1, "B": 2, "C": 3}
{% endraw %} {% raw %}
# dict.pop removes the key and returns the value that was stored under it.
a_dict.pop('a')
1
{% endraw %} {% raw %}
# Show the scratch dict again.
a_dict
{'B': 2, 'C': 3}
{% endraw %} {% raw %}
# str.split(',') turns a comma-separated string into a list of its parts.
'a,b'.split(',')
['a', 'b']
{% endraw %} {% raw %}
def read_sumstat(file, config=None):
    """Read a tab-separated summary-statistics table and normalise its columns.

    The file is first read as gzip; if that fails it is read again as plain
    text. With a ``config``, the comma-separated column names under its
    ``'ID'`` key are joined with ``':'`` to form the row index, the table is
    reduced to the columns named by the remaining config values, and those
    columns are renamed to the corresponding config keys. The ``SNP`` column
    is then rebuilt as ``chr<CHR>:<POS>:<A0>:<A1>`` and ``CHR``/``POS`` are
    cast to ``int`` (so non-numeric chromosome labels raise).

    Args:
        file: Path of the table to read.
        config: Optional mapping ``{standard name: column name in file}`` plus
            an ``'ID'`` entry. It is not modified. Without a config the file
            must already provide ``CHR``, ``POS``, ``A0`` and ``A1`` columns.

    Returns:
        pandas.DataFrame with the normalised columns.

    Raises:
        ValueError: If the config cannot be applied to the file's columns.
    """
    print(file,config)
    try:
        sumstats = pd.read_csv(file, compression='gzip', header=0, sep='\t', quotechar='"')
    except Exception:
        # Not readable as gzip: fall back to a plain-text read, which raises
        # on its own if the file is genuinely unreadable.
        sumstats = pd.read_csv(file, header=0, sep='\t', quotechar='"')
    if config is not None:
        # Work on a copy so the caller's mapping keeps its 'ID' entry.
        config = dict(config)
        try:
            id_columns = config.pop('ID').split(',')
            sumstats.index = sumstats.loc[:,id_columns].astype(str).agg(':'.join, axis=1)
            sumstats = sumstats.loc[:,list(config.values())]
        except Exception as err:
            raise ValueError(f'According to config_file, input summary statistics should have the following columns: {list(config.values())}') from err
        sumstats.columns = list(config.keys())
    # Item assignment creates the column when it is missing; attribute
    # assignment would only set a Python attribute on the frame.
    sumstats['SNP'] = 'chr'+sumstats.CHR.astype(str) + ':' + sumstats.POS.astype(str) + ':' + sumstats.A0.astype(str) + ':' + sumstats.A1.astype(str)
    sumstats['CHR'] = sumstats.CHR.astype(int)
    sumstats['POS'] = sumstats.POS.astype(int)
    return sumstats
{% endraw %} {% raw %}
# list.remove mutates the list in place and returns None.
['a','b'].remove('a')
{% endraw %} {% raw %}
def snps_match(query,subject,keep_ambiguous=True):
    """Align ``query`` to ``subject`` by position and allele pair.

    Both frames are re-indexed by their first two columns joined with ``':'``
    (this overwrites the index of the frames passed in). Rows are first
    restricted to positions present in both, then to rows for which
    ``pair_match`` reports an exact match, a sign flip or a strand flip.
    The returned query takes its first five columns from the subject and the
    remaining columns from the query, with ``STAT`` negated on sign-flipped rows.

    Args:
        query: DataFrame to harmonise; must expose ``A0``, ``A1`` and ``STAT``.
        subject: Reference DataFrame; must expose ``A0`` and ``A1``.
        keep_ambiguous: If False, rows flagged ``ambiguous`` by ``pair_match``
            are dropped before matching; if True they are kept and counted.

    Returns:
        Tuple ``(new_query, new_subject)`` restricted to the matched rows.

    Raises:
        Exception: If a chr:pos key occurs more than once after the overlap.
    """
    query.index = query.iloc[:,:2].astype(str).agg(':'.join, axis=1)
    subject.index = subject.iloc[:,:2].astype(str).agg(':'.join, axis=1)
    #overlap snps by chr+pos
    print("Total rows of query: ",query.shape[0],"Total rows of subject: ",subject.shape[0])
    subject = subject[subject.index.isin(query.index)]
    query = query.loc[subject.index]
    print("Overlap chr:pos",query.shape[0])
    if query.index.duplicated().any():
        raise Exception("There are duplicated chr:pos")
    pm = pair_match(query.A1,query.A0,subject.A1,subject.A0)
    if keep_ambiguous:
        # Count the rows flagged ambiguous (not the complement).
        print('Warning: there are',pm.ambiguous.sum(),'ambiguous SNPs')
        # NOTE(review): for an ambiguous pair that also matches exactly,
        # pair_match's strand-aware sign_flip test is True as well, so STAT is
        # negated below -- confirm this is intended when ambiguous SNPs are kept.
        pm = pm.iloc[:,1:]
    else:
        pm = pm[~pm.ambiguous].iloc[:,1:]
        print(pm)
    keep_idx = pm.any(axis=1)
    keep_idx = keep_idx.index[keep_idx]
    print("Overlap SNPs",len(keep_idx))
    #overlap snps by chr+pos+alleles.
    new_subject = subject.loc[keep_idx]
    #update beta and snp info
    new_query = pd.concat([new_subject.iloc[:,:5],query.loc[keep_idx].iloc[:,5:]],axis=1)
    # A single .loc assignment writes into new_query itself; the previous
    # chained form (new_query.STAT[mask] = ...) may write to a temporary copy.
    flipped = pm.loc[keep_idx,'sign_flip'].values
    new_query.loc[flipped,'STAT'] = -new_query.loc[flipped,'STAT']
    return new_query,new_subject
{% endraw %} {% raw %}
def pair_match(a1,a2,ref1,ref2):
    """Classify how the allele pair (a1, a2) relates to the pair (ref1, ref2).

    All four inputs are string Series; they are upper-cased before comparison.
    ``strand_flip`` is applied to each reference allele to obtain its
    other-strand representation.

    Returns:
        DataFrame with four boolean columns, in this order:
        ``ambiguous``   - (a1, a2) is one of A/T, T/A, C/G, G/C;
        ``sign_flip``   - the pair equals the reference pair swapped, either
                          directly or after strand-flipping the reference;
        ``strand_flip`` - the pair equals the strand-flipped reference pair,
                          in either order;
        ``exact_match`` - the pair equals the reference pair as given.
        Rows where none of the last three is True (e.g. tri-allelic sites)
        match in no supported way.
    """
    # Normalise case so comparisons are on A/T/C/G regardless of input style.
    q1, q2, s1, s2 = (alleles.str.upper() for alleles in (a1, a2, ref1, ref2))
    # Reference alleles as they would read on the opposite strand.
    s1_flipped = s1.apply(strand_flip)
    s2_flipped = s2.apply(strand_flip)

    ambiguous = None
    for first, second in (("A", "T"), ("T", "A"), ("C", "G"), ("G", "C")):
        hit = (q1 == first) & (q2 == second)
        ambiguous = hit if ambiguous is None else ambiguous | hit

    swapped = (q1 == s2) & (q2 == s1)
    swapped_other_strand = (q1 == s2_flipped) & (q2 == s1_flipped)
    same_other_strand = (q1 == s1_flipped) & (q2 == s2_flipped)
    identical = (q1 == s1) & (q2 == s2)

    return pd.DataFrame({
        "ambiguous": ambiguous,
        "sign_flip": swapped | swapped_other_strand,
        "strand_flip": same_other_strand | swapped_other_strand,
        "exact_match": identical,
    })
{% endraw %} {% raw %}
def strand_flip(s):
    """Return the reverse complement of the sequence string ``s`` as a plain ``str``."""
    flipped = Seq(s).reverse_complement()
    # Joining the Seq's characters turns it back into an ordinary string.
    return ''.join(flipped)
{% endraw %}