--- title: Title keywords: fastai sidebar: home_sidebar nb_path: "nbs/debug_info.ipynb" ---
{% raw %}
{% endraw %}
{% raw %}
import dask as da
import dask.dataframe as dd
{% endraw %} {% raw %}
import pandas as pd
import gzip

def get_vcf_names(vcf_path):
    """Return the column names from the ``#CHROM`` header line of a gzipped VCF.

    The file is opened with ``gzip`` in text mode and scanned line by line;
    scanning stops at the first line that starts with ``#CHROM``.

    Args:
        vcf_path: Path to a gzip-compressed VCF file.

    Returns:
        list of str: The tab-separated fields of the ``#CHROM`` line, each
        stripped of surrounding whitespace. The first entry keeps its
        leading ``#`` (i.e. it is ``"#CHROM"``).

    Raises:
        ValueError: If the file contains no line starting with ``#CHROM``.
    """
    # The ``with`` block closes the file on exit, so no explicit close() is
    # needed, and returning from inside it is safe.
    with gzip.open(vcf_path, "rt") as ifile:
        for line in ifile:
            if line.startswith("#CHROM"):
                return [x.strip() for x in line.split('\t')]
    # Fail with a clear message instead of an UnboundLocalError.
    raise ValueError(f"no '#CHROM' header line found in {vcf_path!r}")

def read_vcf_chunk(fn,chunksize=10):
    """Read the first ``chunksize`` data rows of a VCF file into a DataFrame.

    Column names are taken from the file's ``#CHROM`` header line via
    ``get_vcf_names``; every line starting with ``#`` is skipped as a comment
    and fields are split on runs of whitespace.

    Args:
        fn: Path to the VCF file. It is first read as gzip; if that fails
            with an ``OSError`` (which includes ``gzip.BadGzipFile``), the
            read is retried with pandas' default compression handling.
        chunksize: Maximum number of data rows to read (default 10).

    Returns:
        pandas.DataFrame: At most ``chunksize`` rows, one column per header
        name.
    """
    names = get_vcf_names(fn)
    # ``nrows`` reads only the leading rows and closes the file afterwards,
    # unlike a ``chunksize`` iterator that would be left open.
    # ``sep=r'\s+'`` is the non-deprecated equivalent of delim_whitespace=True.
    read_opts = dict(comment='#', nrows=chunksize, sep=r'\s+', header=None, names=names)
    try:
        return pd.read_csv(fn, compression='gzip', **read_opts)
    except OSError:
        # Not a valid gzip stream: fall back to pandas' own compression
        # inference instead of swallowing every exception.
        return pd.read_csv(fn, **read_opts)
{% endraw %} {% raw %}
vcf = read_vcf_chunk('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/full_sample/vcf/full_sample.vcf.gz', chunksize=10000)
{% endraw %} {% raw %}
fam = pd.read_csv('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/full_sample/bfiles/full_sample.fam',delim_whitespace=True, header=None,names = ['fid','iid','father','mother','gender','trait'])
{% endraw %} {% raw %}
# Count .fam individual IDs that match the VCF sample columns position by position.
# `names` exists only inside read_vcf_chunk; `vcf.columns` carries the same header
# names, and the sample IDs are the columns from index 9 onward.
sum(fam.iid == pd.Series(vcf.columns[9:]))
3479
{% endraw %} {% raw %}
fam
fid iid father mother gender trait
0 4_364 4_364_99 4_364_1 4_364_2 2 2
1 4_44 4_44_3 4_44_1 4_44_2 2 2
2 27_104 27_104_62571 27_104_84753 27_104_84752 1 2
3 27_90 27_90_84583 27_90_84575 27_90_84574 2 2
4 27_90 27_90_84784 27_90_84575 27_90_84574 2 2
... ... ... ... ... ... ...
3474 26_EGH 26_EGH_EGH64401 0 0 2 -9
3475 26_SW 26_SW_SW27020 26_SW_SW27023 26_SW_SW27022 1 -9
3476 10R_R111 10R_R111_16 10R_R111_5 10R_R111_4 1 -9
3477 10R_R114 10R_R114_16 10R_R114_2 10R_R114_3 1 -9
3478 10R_R114 10R_R114_20 10R_R114_2 10R_R114_3 2 -9

3479 rows × 6 columns

{% endraw %}

Questions for Chong: Why do the IDs in the last rows use alphabetic characters? What is the meaning of the last column?

{% raw %}
fam.shape
(3479, 6)
{% endraw %} {% raw %}
anno = pd.read_csv('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/full_sample/rare_positions/full_sample_coding.hg38_multianno.txt',delim_whitespace=True)
/home/yh3455/miniconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (0,1) have mixed types.Specify dtype option on import or set low_memory=False.
  exec(code_obj, self.user_global_ns, self.user_ns)
{% endraw %} {% raw %}
anno
Chr Start End Ref Alt AF AF_raw AF_male AF_female AF_afr AF_ami AF_amr AF_asj AF_eas AF_fin AF_nfe AF_oth AF_sas
0 ##INFO=<ID=PR,Number=0,Type=Flag,Description="... reference allele, may not . . . . . . . . . . . . .
1 #CHROM POS ID REF ALT . . . . . . . . . . . . .
2 1 69496 chr1:69496:G:A G A . . . . . . . . . . . . .
3 1 69590 chr1:69590:T:A T A . . . . . . . . . . . . .
4 1 69655 chr1:69655:G:C G C . . . . . . . . . . . . .
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
427088 22 50769522 chr22:50769522:T:G T G . . . . . . . . . . . . .
427089 22 50775824 chr22:50775824:C:T C T . . . . . . . . . . . . .
427090 22 50776669 chr22:50776669:C:A C A . . . . . . . . . . . . .
427091 22 50777958 chr22:50777958:T:G T G . . . . . . . . . . . . .
427092 22 50782243 chr22:50782243:C:T C T . . . . . . . . . . . . .

427093 rows × 18 columns

{% endraw %} {% raw %}
anno.AF.value_counts()
.    427093
Name: AF, dtype: int64
{% endraw %}

Question for Chong: All of the AF values are `.`, which suggests that something is wrong with the annotation.

{% raw %}
[anno[[i]].value_counts() for i in anno.columns[5:]]
[AF
 .     427093
 dtype: int64,
 AF_raw
 .         427093
 dtype: int64,
 AF_male
 .          427093
 dtype: int64,
 AF_female
 .            427093
 dtype: int64,
 AF_afr
 .         427093
 dtype: int64,
 AF_ami
 .         427093
 dtype: int64,
 AF_amr
 .         427093
 dtype: int64,
 AF_asj
 .         427093
 dtype: int64,
 AF_eas
 .         427093
 dtype: int64,
 AF_fin
 .         427093
 dtype: int64,
 AF_nfe
 .         427093
 dtype: int64,
 AF_oth
 .         427093
 dtype: int64,
 AF_sas
 .         427093
 dtype: int64]
{% endraw %} {% raw %}
pheno_full_sample_path = '/mnt/mfs/statgen/alzheimers-family/linkage_files/pheno/full_sample/'
{% endraw %} {% raw %}
efiga_pedigree.txt  full_sample_efi_nia.fam  full_sample_fam_id.txt  full_sample_fam_pop.txt  full_sample_id_list.txt  full_sample_pheno.txt  niaload_pedigree.txt
{% endraw %} {% raw %}
efiga = pd.read_csv(pheno_full_sample_path+'efiga_pedigree.txt',delim_whitespace=True)
{% endraw %} {% raw %}
efiga
ID SEX AD AGE APOE FATHID MOTHID
0 127_99 1.0 1 69.0 1.0 1 2
1 127_15 0.0 0 NaN NaN 0 0
2 127_14 1.0 0 NaN NaN 0 0
3 127_13 0.0 0 NaN NaN 0 0
4 127_12 1.0 0 NaN NaN 15 99
... ... ... ... ... ... ... ...
12765 359_153 0.0 0 NaN NaN 0 0
12766 359_154 1.0 0 NaN NaN 153 121
12767 359_155 0.0 0 NaN NaN 153 121
12768 359_156 0.0 0 NaN NaN 153 121
12769 359_157 0.0 0 NaN NaN 153 121

12770 rows × 7 columns

{% endraw %} {% raw %}
efi_nia_fam = pd.read_csv(pheno_full_sample_path+'full_sample_efi_nia.fam',delim_whitespace=True,header=None,names = ['fid','iid','father','mother','gender','trait'])
{% endraw %} {% raw %}
efi_nia_fam
fid iid father mother gender trait
0 127 127_8 1 2 2 2
1 127 127_7 1 2 2 1
2 127 127_6 1 2 2 2
3 127 127_5 1 2 2 2
4 127 127_4 1 2 1 2
... ... ... ... ... ... ...
3224 4_3747 4_3747_7 6 4 2 1
3225 4_3747 4_3747_8 6 4 1 1
3226 4_3747 4_3747_9 6 4 1 1
3227 4_3832 4_3832_3 1 2 2 1
3228 4_3832 4_3832_4 1 2 2 2

3229 rows × 6 columns

{% endraw %} {% raw %}
efi_nia_txt = pd.read_csv(pheno_full_sample_path+'full_sample_fam_id.txt',delim_whitespace=True,header=None,names = ['fid','iid','father','mother','gender','trait','id'])
{% endraw %} {% raw %}
efi_nia_txt
fid iid father mother gender trait id
0 4_364 4_364_99 4_364_1 4_364_2 2 2 02AD4427
1 4_44 4_44_3 4_44_1 4_44_2 2 2 02AD4429
2 27_104 27_104_62571 27_104_84753 27_104_84752 1 2 03AD4435
3 27_90 27_90_84583 27_90_84575 27_90_84574 2 2 03AD4437
4 27_90 27_90_84784 27_90_84575 27_90_84574 2 2 03AD4438
... ... ... ... ... ... ... ...
3474 26_EGH 26_EGH_EGH64401 0 0 2 -9 05AD8765
3475 26_SW 26_SW_SW27020 26_SW_SW27023 26_SW_SW27022 1 -9 07AD1732
3476 10R_R111 10R_R111_16 10R_R111_5 10R_R111_4 1 -9 10AD23886
3477 10R_R114 10R_R114_16 10R_R114_2 10R_R114_3 1 -9 10AD32608
3478 10R_R114 10R_R114_20 10R_R114_2 10R_R114_3 2 -9 10AD32610

3479 rows × 7 columns

{% endraw %} {% raw %}
fam_pop = pd.read_csv(pheno_full_sample_path+'full_sample_fam_pop.txt',delim_whitespace=True,header=None,names = ['fid','pop'])
{% endraw %} {% raw %}
fam_pop
fid pop
0 4_364 AF_nfe
1 4_44 AF_afr
2 27_104 AF_nfe
3 27_90 AF_nfe
4 4_92 AF_nfe
... ... ...
1063 4_499 AF_afr
1064 5_26170 AF_nfe
1065 6_1103 AF_nfe
1066 25_6 AF_nfe
1067 26_EGH AF_nfe

1068 rows × 2 columns

{% endraw %} {% raw %}
sample_id_list = pd.read_csv(pheno_full_sample_path+'full_sample_id_list.txt',delim_whitespace=True,header=None,names = ['id'])
{% endraw %} {% raw %}
sample_id_list
id
0 02AD4427
1 02AD4429
2 03AD4435
3 03AD4437
4 03AD4438
... ...
3474 05AD8765
3475 07AD1732
3476 10AD23886
3477 10AD32608
3478 10AD32610

3479 rows × 1 columns

{% endraw %} {% raw %}
sample_pheno = pd.read_csv(pheno_full_sample_path+'full_sample_pheno.txt',sep='\t')
{% endraw %} {% raw %}
sample_pheno
project/alzheimers-family/linkage_files/pheno/full_sample/niaload_pedigree.txtIID ID ProjectID Columbia_Description SEX AD AGE APOE RACE STUDY FID pop super_pop
0 02AD4427 4_364_99 CCDG_12711 NYGC2 F 2.0 71.0 1.0 1.0 NIALOAD 4_364 European European
1 02AD4429 4_44_3 CCDG_12711 NYGC2 F 2.0 90.0 1.0 2.0 NIALOAD 4_44 African African
2 03AD4435 27_104_62571 CCDG_11949 NYGC1 M 2.0 74.0 1.0 1.0 NIALOAD 27_104 European European
3 03AD4437 27_90_84583 CCDG_11949 NYGC1 F 2.0 71.0 1.0 1.0 NIALOAD 27_90 European European
4 03AD4438 27_90_84784 CCDG_11949 NYGC1 F 2.0 85.0 0.0 1.0 NIALOAD 27_90 European European
... ... ... ... ... ... ... ... ... ... ... ... ... ...
3474 05AD8765 26_EGH_EGH64401 CCDG_11949 NYGC1 F NaN NaN 1.0 NaN NIALOAD 26_EGH European European
3475 07AD1732 26_SW_SW27020 CCDG_11949 NYGC1 M NaN NaN 0.0 2.0 NIALOAD 26_SW European European
3476 10AD23886 10R_R111_16 CCDG_12711 NYGC2 M NaN 69.0 0.0 2.0 NIALOAD 10R_R111 European European
3477 10AD32608 10R_R114_16 CCDG_12711 NYGC2 M NaN 63.0 1.0 2.0 NIALOAD 10R_R114 European European
3478 10AD32610 10R_R114_20 CCDG_12711 NYGC2 F NaN 61.0 1.0 2.0 NIALOAD 10R_R114 European European

3479 rows × 13 columns

{% endraw %} {% raw %}
sample_pheno.describe(include='all')
project/alzheimers-family/linkage_files/pheno/full_sample/niaload_pedigree.txtIID ID ProjectID Columbia_Description SEX AD AGE APOE RACE STUDY FID pop super_pop
count 3479 3479 3479 3479 3479 3472.000000 3372.000000 3365.000000 3478.000000 3479 3479 3479 3479
unique 3479 3479 8 8 2 NaN NaN NaN NaN 3 1068 4 4
top 02AD4427 4_364_99 CCDG_12711 NYGC2 F NaN NaN NaN NaN EFIGA 3761 Hispanic Hispanic
freq 1 1 1172 1172 2180 NaN NaN NaN NaN 2059 47 2133 2133
mean NaN NaN NaN NaN NaN 1.339286 91.765563 0.660921 2.480161 NaN NaN NaN NaN
std NaN NaN NaN NaN NaN 1.612784 366.352308 0.690971 3.706756 NaN NaN NaN NaN
min NaN NaN NaN NaN NaN -9.000000 35.000000 0.000000 1.000000 NaN NaN NaN NaN
25% NaN NaN NaN NaN NaN 1.000000 66.000000 0.000000 1.000000 NaN NaN NaN NaN
50% NaN NaN NaN NaN NaN 2.000000 73.000000 1.000000 3.000000 NaN NaN NaN NaN
75% NaN NaN NaN NaN NaN 2.000000 80.000000 1.000000 3.000000 NaN NaN NaN NaN
max NaN NaN NaN NaN NaN 2.000000 8070.000000 2.000000 99.000000 NaN NaN NaN NaN
{% endraw %} {% raw %}
nia_ped = pd.read_csv(pheno_full_sample_path+'niaload_pedigree.txt',delim_whitespace=True)
{% endraw %} {% raw %}
nia_ped
Sample_ID FID Gender MOTHID FATHID prob.AD APOE APOE4NUM AAO_AgeLastSeen
0 10J_103_1 10J_103 M 3 10 2 33 0.0 69.0
1 10J_103_10 10J_103 M 0 0 -9 NANA NaN NaN
2 10J_103_2 10J_103 F 3 4 2 34 1.0 71.0
3 10J_103_3 10J_103 F 0 0 -9 NANA NaN NaN
4 10J_103_4 10J_103 M 0 0 -9 NANA NaN NaN
... ... ... ... ... ... ... ... ... ...
6303 26_CJE_CJE43421 26_CJE M CJE43406 -9 NANA NaN NaN NaN
6304 26_CBL_CBL12416 26_CBL F CBL12403 -9 NANA NaN NaN NaN
6305 26_ARH_ARH05007 26_ARH F ARH05004 -9 NANA NaN NaN NaN
6306 26_RBR_RBR22809 26_RBR F RBR22801 -9 NANA NaN NaN NaN
6307 26_GRU_GRU03008 26_GRU F GRU03004 -9 NANA NaN NaN NaN

6308 rows × 9 columns

{% endraw %}

sample_i

{% raw %}
s1_vcf = read_vcf_chunk('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/sample_i/rare_positions/sample_i_coding.hg38_multianno.vcf.gz')
{% endraw %} {% raw %}
s1_vcf
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 4_553_5 ... 756_43 3745_31 3745_36 3745_35 3745_33 4_558_15 8_64039_3 280_12 770_13 770_14
0 1 69496 chr1:69496:G:A G A . . .;ANNOVAR_DATE=2019-10-24;AF=0.0024;AF_raw=0.0... GT 0/0 ... ./. 0/0 ./. ./. ./. 0/0 ./. ./. 0/0 ./.
1 1 69590 chr1:69590:T:A T A . . .;ANNOVAR_DATE=2019-10-24;AF=0.0005;AF_raw=0.0... GT 0/0 ... ./. 0/0 ./. ./. ./. 0/0 0/0 ./. ./. 0/0
2 1 69655 chr1:69655:G:C G C . . .;ANNOVAR_DATE=2019-10-24;AF=4.253e-05;AF_raw=... GT 0/0 ... ./. 0/0 ./. ./. ./. 0/0 ./. 0/0 ./. 0/0
3 1 139849 chr1:139849:T:C T C . . .;ANNOVAR_DATE=2019-10-24;AF=9.367e-06;AF_raw=... GT 0/0 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
4 1 182735 chr1:182735:C:A C A . . .;ANNOVAR_DATE=2019-10-24;AF=8.027e-05;AF_raw=... GT 0/0 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
5 1 183188 chr1:183188:A:C A C . . .;ANNOVAR_DATE=2019-10-24;AF=0.0017;AF_raw=0.0... GT 0/0 ... 0/0 0/0 ./. ./. 0/0 0/0 0/0 0/0 0/0 0/0
6 1 183189 chr1:183189:G:C G C . . .;ANNOVAR_DATE=2019-10-24;AF=0.0229;AF_raw=0.0... GT 0/0 ... 0/0 0/0 ./. ./. 0/0 0/0 0/0 0/0 0/0 0/0
7 1 183199 chr1:183199:G:A G A . . .;ANNOVAR_DATE=2019-10-24;AF=0.0002;AF_raw=0.0... GT 0/0 ... ./. 0/0 ./. ./. 0/0 0/0 0/0 0/0 0/0 0/0
8 1 183205 chr1:183205:G:C G C . . .;ANNOVAR_DATE=2019-10-24;AF=.;AF_raw=.;AF_mal... GT 0/0 ... ./. 0/0 ./. ./. 0/0 0/0 0/0 0/0 0/0 0/0
9 1 183220 chr1:183220:A:G A G . . .;ANNOVAR_DATE=2019-10-24;AF=7.051e-06;AF_raw=... GT 0/0 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0

10 rows × 175 columns

{% endraw %} {% raw %}
s1_fam = pd.read_csv('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/sample_i/rare_positions/sample_i_coding.hg38_multianno.fam',delim_whitespace=True, header=None,names = ['fid','iid','father','mother','gender','trait'])
{% endraw %} {% raw %}
s1_fam
fid iid father mother gender trait
0 4_553 4_553_5 0 0 1 1
1 4_558 4_558_34 0 0 1 1
2 4_558 4_558_33 0 0 2 0
3 4_558 4_558_11 0 0 1 1
4 4_558 4_558_99 0 0 1 1
... ... ... ... ... ... ...
161 4_558 4_558_15 4_558_99 4_558_33 2 0
162 8_64039 8_64039_3 8_64039_2 8_64039_1 1 0
163 280 280_12 280_13 280_9 1 1
164 770 770_13 770_99 770_10 1 1
165 770 770_14 770_99 770_10 1 1

166 rows × 6 columns

{% endraw %} {% raw %}
s1_anno = pd.read_csv('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/sample_i/rare_positions/sample_i_coding.hg38_multianno.txt',delim_whitespace=True)
/home/yh3455/miniconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (18) have mixed types.Specify dtype option on import or set low_memory=False.
  exec(code_obj, self.user_global_ns, self.user_ns)
{% endraw %} {% raw %}
s1_anno
Chr Start End Ref Alt AF AF_raw AF_male AF_female AF_afr ... Otherinfo169 Otherinfo170 Otherinfo171 Otherinfo172 Otherinfo173 Otherinfo174 Otherinfo175 Otherinfo176 Otherinfo177 Otherinfo178
0 1 69496 69496 G A 0.0024 0.0022 0.0026 0.0022 0.0062 ... ./. 0/0 ./. ./. ./. 0/0 ./. ./. 0/0 ./.
1 1 69590 69590 T A 0.0005 0.0005 0.0006 0.0004 0.0001 ... ./. 0/0 ./. ./. ./. 0/0 0/0 ./. ./. 0/0
2 1 69655 69655 G C 4.253e-05 8.251e-05 3.033e-05 5.324e-05 0.0001 ... ./. 0/0 ./. ./. ./. 0/0 ./. 0/0 ./. 0/0
3 1 139849 139849 T C 9.367e-06 1.425e-05 0 1.779e-05 0 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
4 1 182735 182735 C A 8.027e-05 7.724e-05 7.531e-05 8.493e-05 2.675e-05 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
427086 22 50769522 50769522 T G 0.0043 0.0043 0.0043 0.0043 0.0143 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
427087 22 50775824 50775824 C T 2.791e-05 2.789e-05 2.881e-05 2.708e-05 0 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
427088 22 50776669 50776669 C A 3.488e-05 3.487e-05 1.44e-05 5.415e-05 2.378e-05 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
427089 22 50777958 50777958 T G 0.0055 0.0055 0.0056 0.0055 0.0184 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
427090 22 50782243 50782243 C T 0.0004 0.0004 0.0003 0.0004 0.0001 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0

427091 rows × 196 columns

{% endraw %}

sample_ii

{% raw %}
s2_vcf = read_vcf_chunk('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/sample_ii/rare_positions/sample_ii_coding.hg38_multianno.vcf.gz')
{% endraw %} {% raw %}
s2_vcf
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 4_515_3 ... 4_515_69 4_515_78 4_515_8 4_515_9 4_515_99 4_558_15 4_558_22 4_558_27 4_558_31 4_558_42
0 1 69496 chr1:69496:G:A G A . . .;ANNOVAR_DATE=2019-10-24;AF=0.0024;AF_raw=0.0... GT 0/0 ... ./. 0/0 ./. ./. ./. ./. 0/0 0/0 0/0 ./.
1 1 69590 chr1:69590:T:A T A . . .;ANNOVAR_DATE=2019-10-24;AF=0.0005;AF_raw=0.0... GT 0/0 ... ./. 0/0 ./. ./. ./. ./. 0/0 0/0 0/0 ./.
2 1 69655 chr1:69655:G:C G C . . .;ANNOVAR_DATE=2019-10-24;AF=4.253e-05;AF_raw=... GT 0/0 ... ./. 0/0 ./. ./. ./. 0/0 0/0 0/0 0/0 ./.
3 1 139849 chr1:139849:T:C T C . . .;ANNOVAR_DATE=2019-10-24;AF=9.367e-06;AF_raw=... GT 0/0 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
4 1 182735 chr1:182735:C:A C A . . .;ANNOVAR_DATE=2019-10-24;AF=8.027e-05;AF_raw=... GT 0/0 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
5 1 183188 chr1:183188:A:C A C . . .;ANNOVAR_DATE=2019-10-24;AF=0.0017;AF_raw=0.0... GT 0/0 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
6 1 183189 chr1:183189:G:C G C . . .;ANNOVAR_DATE=2019-10-24;AF=0.0229;AF_raw=0.0... GT 0/0 ... 0/0 0/0 0/0 0/1 0/0 0/0 0/0 0/0 0/0 0/0
7 1 183199 chr1:183199:G:A G A . . .;ANNOVAR_DATE=2019-10-24;AF=0.0002;AF_raw=0.0... GT 0/0 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
8 1 183205 chr1:183205:G:C G C . . .;ANNOVAR_DATE=2019-10-24;AF=.;AF_raw=.;AF_mal... GT 0/0 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
9 1 183220 chr1:183220:A:G A G . . .;ANNOVAR_DATE=2019-10-24;AF=7.051e-06;AF_raw=... GT 0/0 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0

10 rows × 231 columns

{% endraw %} {% raw %}
s2_fam = pd.read_csv('/mnt/mfs/statgen/alzheimers-family/linkage_files/pheno/sample_ii/small_sample_ii.fam',delim_whitespace=True, header=None,names = ['fid','iid','father','mother','gender','trait'])
{% endraw %} {% raw %}
s2_fam
fid iid father mother gender trait
0 4_515 4_515_3 4_515_25 4_515_7 2.0 1
1 4_515 4_515_29 4_515_30 4_515_99 2.0 1
2 4_515 4_515_11 4_515_1 4_515_2 2.0 2
3 4_515 4_515_48 4_515_45 4_515_49 2.0 1
4 4_515 4_515_19 4_515_5 4_515_20 2.0 1
... ... ... ... ... ... ...
1205 4_558 4_558_AAAA 0 0 1.0 1
1206 4_558 4_558_CCCC 0 0 2.0 1
1207 4_558 4_558_BBBB 0 0 1.0 1
1208 10R_R78 10R_R78_55 10R_R78_12 10R_R78_47 1.0 -9
1209 10R_R78 10R_R78_56 10R_R78_12 10R_R78_47 1.0 -9

1210 rows × 6 columns

{% endraw %} {% raw %}
s2_anno = pd.read_csv('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/sample_ii/rare_positions/sample_ii_coding.hg38_multianno.txt',delim_whitespace=True)
/home/yh3455/miniconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (18) have mixed types.Specify dtype option on import or set low_memory=False.
  exec(code_obj, self.user_global_ns, self.user_ns)
{% endraw %} {% raw %}
s2_anno
Chr Start End Ref Alt AF AF_raw AF_male AF_female AF_afr ... Otherinfo225 Otherinfo226 Otherinfo227 Otherinfo228 Otherinfo229 Otherinfo230 Otherinfo231 Otherinfo232 Otherinfo233 Otherinfo234
0 1 69496 69496 G A 0.0024 0.0022 0.0026 0.0022 0.0062 ... ./. 0/0 ./. ./. ./. ./. 0/0 0/0 0/0 ./.
1 1 69590 69590 T A 0.0005 0.0005 0.0006 0.0004 0.0001 ... ./. 0/0 ./. ./. ./. ./. 0/0 0/0 0/0 ./.
2 1 69655 69655 G C 4.253e-05 8.251e-05 3.033e-05 5.324e-05 0.0001 ... ./. 0/0 ./. ./. ./. 0/0 0/0 0/0 0/0 ./.
3 1 139849 139849 T C 9.367e-06 1.425e-05 0 1.779e-05 0 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
4 1 182735 182735 C A 8.027e-05 7.724e-05 7.531e-05 8.493e-05 2.675e-05 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
427086 22 50769522 50769522 T G 0.0043 0.0043 0.0043 0.0043 0.0143 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
427087 22 50775824 50775824 C T 2.791e-05 2.789e-05 2.881e-05 2.708e-05 0 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
427088 22 50776669 50776669 C A 3.488e-05 3.487e-05 1.44e-05 5.415e-05 2.378e-05 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
427089 22 50777958 50777958 T G 0.0055 0.0055 0.0056 0.0055 0.0184 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
427090 22 50782243 50782243 C T 0.0004 0.0004 0.0003 0.0004 0.0001 ... 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0

427091 rows × 252 columns

{% endraw %} {% raw %}
s2_fam[s2_fam.duplicated()]
fid iid father mother gender trait
222 167 167_99 167_1 167_2 1.0 2
231 167 167_11 167_4 167_12 2.0 1
234 167 167_8 167_1 167_2 1.0 2
236 167 167_6 167_1 167_2 1.0 2
238 167 167_4 167_1 167_2 1.0 2
... ... ... ... ... ... ...
1164 4_558 4_558_72 4_558_68 4_558_69 2.0 2
1165 4_558 4_558_73 4_558_75 4_558_72 2.0 1
1166 4_558 4_558_74 4_558_68 4_558_69 2.0 1
1172 4_558 4_558_8 4_558_1 4_558_2 1.0 1
1192 4_558 4_558_99 4_558_1 4_558_2 1.0 2

216 rows × 6 columns

{% endraw %} {% raw %}
s2_vcf.columns[9:]
Index(['4_515_3', '4_515_29', '4_515_11', '4_515_48', '4_515_19', '4_515_50',
       '4_558_34', '4_558_33', '4_558_8', '4_558_11',
       ...
       '4_515_69', '4_515_78', '4_515_8', '4_515_9', '4_515_99', '4_558_15',
       '4_558_22', '4_558_27', '4_558_31', '4_558_42'],
      dtype='object', length=222)
{% endraw %} {% raw %}
len(s2_vcf.columns[9:])
222
{% endraw %} {% raw %}
s2_fam[210:230]
fid iid father mother gender trait
210 4_515 4_515_36 4_515_37 4_515_10 2.0 1
211 4_515 4_515_5 4_515_1 4_515_2 1.0 2
212 4_515 4_515_69 4_515_64 4_515_65 1.0 2
213 4_515 4_515_78 4_515_70 4_515_66 2.0 1
214 4_515 4_515_8 4_515_1 4_515_2 2.0 2
215 4_515 4_515_9 4_515_1 4_515_2 2.0 2
216 4_515 4_515_99 4_515_1 4_515_2 2.0 2
217 4_558 4_558_15 4_558_99 4_558_33 2.0 1
218 4_558 4_558_22 4_558_4 4_558_35 1.0 1
219 4_558 4_558_27 4_558_8 4_558_38 2.0 1
220 4_558 4_558_31 4_558_39 4_558_10 1.0 1
221 4_558 4_558_42 4_558_59 4_558_9 2.0 1
222 167 167_99 167_1 167_2 1.0 2
223 167 167_22 167_99 167_19 2.0 0
224 167 167_21 167_99 167_19 1.0 0
225 167 167_20 167_99 167_19 2.0 0
226 167 167_19 0 0 2.0 0
227 167 167_18 167_5 167_17 1.0 0
228 167 167_14 0 0 2.0 0
229 167 167_13 0 0 1.0 0
{% endraw %} {% raw %}
# Show the pedigree rows with exact duplicate rows removed.
s2_fam.drop_duplicates()
fid iid father mother gender trait
0 4_515 4_515_3 4_515_25 4_515_7 2.0 1
1 4_515 4_515_29 4_515_30 4_515_99 2.0 1
2 4_515 4_515_11 4_515_1 4_515_2 2.0 2
3 4_515 4_515_48 4_515_45 4_515_49 2.0 1
4 4_515 4_515_19 4_515_5 4_515_20 2.0 1
... ... ... ... ... ... ...
1205 4_558 4_558_AAAA 0 0 1.0 1
1206 4_558 4_558_CCCC 0 0 2.0 1
1207 4_558 4_558_BBBB 0 0 1.0 1
1208 10R_R78 10R_R78_55 10R_R78_12 10R_R78_47 1.0 -9
1209 10R_R78 10R_R78_56 10R_R78_12 10R_R78_47 1.0 -9

994 rows × 6 columns

{% endraw %} {% raw %}
# Write the de-duplicated pedigree: the target is the "uniq" .fam file, so exact
# duplicate rows are dropped before saving (tab-separated, no header, no index).
# NOTE(review): `gender` displays as float (e.g. 2.0) and is written that way — confirm
# downstream tools accept it.
s2_fam.drop_duplicates().to_csv('/mnt/mfs/statgen/alzheimers-family/yhseqlink/data/MWE/sample2_uniq.fam',header=False,index=False,sep='\t')
{% endraw %}

coding_region_rare_variant_positions.txt

  1. Why were these SNPs chosen?

  2. Is the VCF on genome build hg19 or hg38?