--- title: 1.read fam keywords: fastai sidebar: home_sidebar nb_path: "nbs/06_Create_MWE.ipynb" ---
{% raw %}
{% endraw %} {% raw %}
import pandas as pd
import gzip

def get_vcf_names(vcf_path):
    with gzip.open(vcf_path, "rt") as ifile:
        for line in ifile:
            if line.startswith("#CHROM"):
                vcf_names = [x.strip() for x in line.split('\t')]
                break
    ifile.close()
    return vcf_names

def read_vcf_chunk(fn,chunksize=10):
    names = get_vcf_names(fn)
    try:
        vcf = pd.read_csv(fn, compression='gzip', comment='#', chunksize=chunksize, delim_whitespace=True, header=None, names=names)
    except:
        vcf = pd.read_csv(fn, comment='#', chunksize=chunksize, delim_whitespace=True, header=None, names=names)
    df= pd.DataFrame(vcf.get_chunk(chunksize))
    return df

def read_anno_chunk(fn,chunksize=10):
    try:
        vcf = pd.read_csv(fn, compression='gzip', chunksize=chunksize)
    except:
        vcf = pd.read_csv(fn, chunksize=chunksize)
    df= pd.DataFrame(vcf.get_chunk(chunksize))
    return df
{% endraw %} {% raw %}
fam = pd.read_csv('data/nodp_fam.csv',header=None,names=['fid','iid','fathid','mothid','sex','ad','vcf'])
{% endraw %} {% raw %}
fam.iloc[:,:-1].sort_values('fid').to_csv('data/nodp_fam.fam',header=False,index=False,sep='\t')
{% endraw %}

1.1 Normal family

{% raw %}
fam[fam.fid.isin(['22_1','1036','28_9'])].iloc[:,:-1].sort_values('fid').to_csv('data/mwe_normal_fam.csv',header=False,index=False,sep='\t')
{% endraw %} {% raw %}
tmp = pd.DataFrame([pd.Series(list(set(fam.fid))),pd.Series([sum(fam[fam.fid == i].vcf) for i in set(fam.fid)]),pd.Series([sum(fam.fid == i) for i in set(fam.fid)])])
{% endraw %} {% raw %}
tmp = tmp.T
{% endraw %} {% raw %}
tmp.columns = ['fid','vcf','tol']
{% endraw %} {% raw %}
tmp['per'] = tmp.vcf/tmp.tol
{% endraw %} {% raw %}
tmp.sort_values('per')
fid vcf tol per
37 4_649 2 373 0.005362
36 535 1 62 0.016129
761 8_64024 1 43 0.023256
734 10R_R81 1 43 0.023256
115 4_715 1 41 0.02439
... ... ... ... ...
264 1999 6 9 0.666667
225 22_1 5 7 0.714286
735 1036 5 7 0.714286
3 28_9 8 11 0.727273
434 10R_R107 8 10 0.8

818 rows × 4 columns

{% endraw %}

1.2 Multi-group family

{% raw %}
fam[fam.fid.isin(['27_104'])].iloc[:,:-1].sort_values('fid').to_csv('data/mwe_multi_group_fam.csv',header=False,index=False,sep='\t')
{% endraw %}

1.3 In-law family

{% raw %}
fam[fam.fid.isin(['4_461'])].iloc[:,:-1].sort_values('fid').to_csv('data/mwe_in_law_fam.csv',header=False,index=False,sep='\t')
{% endraw %}

1.4

{% raw %}
fam[fam.fid.isin(['1005'])].iloc[:,:-1].sort_values('fid').to_csv('data/mwe_the_1005_fam.csv',header=False,index=False,sep='\t')
{% endraw %}

1.5

{% raw %}
ok_plot = pd.read_csv('data/ok_plot.csv',index_col=0)
{% endraw %} {% raw %}
fam[fam.fid.isin(ok_plot.x[:10])].iloc[:,:-1].sort_values('fid').to_csv('data/mwe_ok_plot_fam.csv',header=False,index=False,sep='\t')
{% endraw %}

1.6 4_364 family

{% raw %}
f4_364 = fam[fam.fid.isin(['4_364'])].iloc[:,:-1]
{% endraw %} {% raw %}
f4_364.to_csv('data/mwe_the_4_364_all_fam.csv',header=False,index=False,sep='\t')
{% endraw %} {% raw %}
f4_364[~f4_364.iid.isin(['4_364_'+str(i) for i in range(14,20)])].to_csv('data/mwe_the_4_364_rm_4-5layers_fam.csv',header=False,index=False,sep='\t')
{% endraw %} {% raw %}
f4_364[~f4_364.iid.isin(['4_364_'+str(i) for i in range(17,20)])].to_csv('data/mwe_the_4_364_rm_5layers_fam.csv',header=False,index=False,sep='\t')
{% endraw %}

1.read ped

{% raw %}
ped = pd.read_csv('data/nodp_ped_with_depth_trim.csv',header=0)
{% endraw %} {% raw %}
ped[ped.fid.isin(trim_smry.index[trim_smry.before<4])].iloc[:,:-3].sort_values('fid').to_csv('data/mwe_3layers.csv',header=False,index=False,sep='\t')
{% endraw %} {% raw %}
trim_ped = ped[ped.trim==False]
{% endraw %} {% raw %}
trim_smry = pd.DataFrame([[ped.depth[ped.fid == i].max() for i in trim_ped.fid.unique()],[trim_ped.depth[trim_ped.fid == i].max() for i in trim_ped.fid.unique()]])
trim_smry = trim_smry.T
trim_smry.index = trim_ped.fid.unique()
trim_smry.columns = ['before','after']
trim_smry.hist()
array([[<AxesSubplot:title={'center':'before'}>,
        <AxesSubplot:title={'center':'after'}>]], dtype=object)
{% endraw %}

Output families with pedigree less than 4

{% raw %}
trim_ped[trim_ped.fid.isin(trim_smry.index[trim_smry.after<4])].iloc[:,:-3].sort_values('fid').to_csv('data/mwe_trimed_3layers.csv',header=False,index=False,sep='\t')
{% endraw %} {% raw %}
trim_lay3 = trim_ped[trim_ped.fid.isin(trim_smry.index[trim_smry.after<4])]
{% endraw %} {% raw %}
trim_faml20_fid = trim_lay3.fid.value_counts()<20
{% endraw %} {% raw %}
trim_ped[trim_ped.fid.isin(trim_faml20_fid[trim_faml20_fid].index)].iloc[:,:-3].sort_values('fid').to_csv('data/mwe_trimed_3layers_faml20.csv',header=False,index=False,sep='\t')
{% endraw %}

2.read vcf and annotation

{% raw %}
vcf = read_vcf_chunk('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/full_sample/vcf/full_sample.vcf.gz', chunksize=1000)
{% endraw %} {% raw %}
vcf
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 4_364_99 ... 8_64039_20 8_64039_25 8_64039_3 25_6_107C 25_22_491C 26_EGH_EGH64401 26_SW_SW27020 10R_R111_16 10R_R114_16 10R_R114_20
0 1 10140 chr1:10140:ACCCTAAC:A ACCCTAAC A . . PR GT 0/0 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
1 1 10146 chr1:10146:AC:A AC A . . PR GT 0/0 ... 0/0 0/0 0/1 0/0 0/0 0/0 0/1 0/0 1/1 0/1
2 1 10146 chr1:10146:ACC:* ACC * . . PR GT 0/0 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
3 1 10150 chr1:10150:CT:C CT C . . PR GT 0/0 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 ./. 0/0
4 1 10172 chr1:10172:CCCTAA:C CCCTAA C . . PR GT ./. ... 0/0 0/0 0/0 0/0 0/0 0/0 ./. 0/0 ./. ./.
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 1 66479 chr1:66479:TATTTATAG:* TATTTATAG * . . PR GT 0/0 ... ./. 0/0 ./. ./. 0/0 0/0 0/0 0/0 0/0 0/0
996 1 66480 chr1:66480:AT:A AT A . . PR GT 0/0 ... ./. 0/0 ./. ./. 0/0 0/0 0/0 0/0 0/0 0/0
997 1 66480 chr1:66480:ATTT:* ATTT * . . PR GT 0/0 ... ./. 0/0 ./. ./. 0/0 0/0 0/0 0/0 0/0 0/0
998 1 66481 chr1:66481:TTTATAGA:* TTTATAGA * . . PR GT 0/0 ... ./. 0/0 ./. ./. 0/0 0/0 ./. 0/0 0/0 ./.
999 1 66481 chr1:66481:TTTATAGA:T TTTATAGA T . . PR GT 0/0 ... ./. 0/0 ./. ./. 0/0 0/0 ./. 0/0 0/0 ./.

1000 rows × 3488 columns

{% endraw %}

2.1merge vcf and annotation

{% raw %}
import numpy as np
{% endraw %} {% raw %}
anno = read_anno_chunk('/home/yl4604/project/alzheimers-family/SMMAT/20210802/annotation/EFIGA_NIALOAD_chr1.hg38.hg38_multianno.csv',chunksize=1000)
{% endraw %} {% raw %}
af = anno.AF
{% endraw %} {% raw %}
af = af.replace('.',0.00001).astype(np.float64)
{% endraw %} {% raw %}
af.hist()
<AxesSubplot:>
{% endraw %} {% raw %}
vcf.INFO = ['AF='+str(i) for i in af]
{% endraw %} {% raw %}
vcf.to_csv('data/first1000snp_full_samples.vcf',sep='\t',header=True,index=False)
{% endraw %} bgzip first1000snp_full_samples.vcf tabix first1000snp_full_samples.vcf.gz