---
title: Genodata module
keywords: fastai
sidebar: home_sidebar
summary: "read and extract genodata"
description: "read and extract genodata"
nb_path: "nbs/00_Genodata.ipynb"
---
# Round-trip smoke test: pass `geno` and the target name 'test.bed' to
# write_plink, then build a Genodata object from that same path and display it.
# NOTE(review): `geno`, `write_plink` and `Genodata` are not defined in this
# excerpt; presumably they come from earlier notebook cells — confirm.
write_plink(geno,'test.bed')
# read_plink1_bin is imported here but not called anywhere in the visible lines.
from pandas_plink import read_plink1_bin
geno1 = Genodata('test.bed')
# Bare expression: only has an effect as a notebook cell output.
geno1
# Input/output paths for the (currently commented-out) `main(...)` call below.
# NOTE(review): `geno_path` is assigned twice; the second assignment (the
# MWE_region_extraction file) is the one that remains in effect.
geno_path ='/home/dmc2245/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c1.merged.filtered.bed'
# The third element evaluates to 313528 (1213528 - 900000). Later code passes
# region[0], region[1], region[2] as chromosome, start and end.
region = [5,272741,1213528-900000]
geno_path = 'MWE_region_extraction/ukb23156_c5.merged.filtered.5_272741_1213528.bed'
sumstats_path = 'MWE_region_extraction/090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl_PC1_2_f3393.regenie.snp_stats'
# No phenotype file is supplied.
pheno_path = None
unr_path = 'MWE_region_extraction/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.white_europeans.filtered.092821_ldprun_unrelated.filtered.prune.txt'
imp_geno_path = 'MWE_region_extraction/ukb_imp_chr5_v3_05_272856_1213643.bgen'
imp_sumstats_path = 'MWE_region_extraction/100521_UKBB_Hearing_aid_f3393_expandedwhite_15601cases_237318ctrl_500k_PC1_PC2_f3393.regenie.snp_stats'
# presumably the genome build of the imputed data — TODO confirm against `main`.
imp_ref = 'hg19'
output_sumstats = 'test.snp_stats'
output_LD = 'test_corr.csv'
# The call is disabled; `main` is not defined in this excerpt.
#main(region,geno_path,sumstats_path,pheno_path,unr_path,imp_geno_path,imp_sumstats_path,imp_ref,output_sumstats,output_LD)
# Scratch cell: construct a pandas_plink Chunk and subset `exome_geno`.
from pandas_plink import Chunk
# The Chunk instance is created but not bound to a name, so it is only visible
# as a notebook cell output.
Chunk(512,512)
# NOTE(review): `exome_geno` is not defined in this excerpt; it is presumably a
# Genodata instance created in an earlier cell — confirm.
# Pass the first 50 entries of the `snp` column of `bim` to extractbyvariants.
exome_geno.extractbyvariants(exome_geno.bim.snp[:50])
# Pass the first 60 entries of the `iid` column of `fam` to extractbysamples.
exome_geno.extractbysamples(exome_geno.fam.iid[:60])
# Load the imputed-data summary statistics and restrict them to `region`.
# The star import is what brings `Sumstat` into scope here.
from cugg.sumstat import *
# Rebinds `region` to the full interval (the earlier value had a shortened end).
region = [5, 272741, 1213528]
# NOTE(review): absolute, user-specific path; this cell only runs on the
# machine that hosts this file.
imput_sumstats = Sumstat('/home/dmc2245/UKBiobank/results/REGENIE_results/results_imputed_data/2021_10_07_f3393_500K/100521_UKBB_Hearing_aid_f3393_expandedwhite_15601cases_237318ctrl_500k_PC1_PC2_f3393.regenie.snp_stats.gz')
# The return value is discarded; presumably the object is filtered in place —
# TODO confirm against cugg.sumstat.
imput_sumstats.extractbyregion(region)
# Bare expression: notebook cell output only.
imput_sumstats
# Template fragment: open a BGEN file and load its companion sample table.
# NOTE(review): the original indentation of the if/if/raise/else lines has been
# lost, and `${bgen_sample_path:r}` is a template placeholder (it looks like
# workflow-script interpolation), not Python. This fragment cannot run until it
# is rendered and re-indented. `geno_file`, `os`, `pd` and `PyBGEN` are not
# bound in the lines preceding this fragment.
bgen = PyBGEN(geno_file)
# Default sample file: same path with '.bgen' replaced by '.sample'.
sample_file = geno_file.replace('.bgen', '.sample')
# Fallback chain: if the derived file is missing, try the templated
# bgen-sample-path; raise ValueError when that is missing as well.
if not os.path.isfile(sample_file):
if not os.path.isfile(${bgen_sample_path:r}):
raise ValueError(f"Cannot find the matching sample file ``{sample_file}`` for ``{geno_file}``.\nYou can specify path to sample file for all BGEN files using ``--bgen-sample-path``.")
else:
sample_file = ${bgen_sample_path:r}
# skiprows=1 drops the first line of the file; header=0 then consumes the next
# remaining line as the header, which is immediately replaced by the column
# names assigned below.
# NOTE(review): `delim_whitespace` is deprecated in recent pandas releases in
# favour of sep=r'\s+'.
bgen_fam = pd.read_csv(sample_file, header=0, delim_whitespace=True, quotechar='"',skiprows=1)
# Assumes the sample file has exactly four columns — assignment fails otherwise.
bgen_fam.columns = ['fid','iid','missing','sex']
# Result: a two-element list holding the PyBGEN reader and the sample table.
geno = [bgen,bgen_fam]
# Build a Genodata object for the imputed BGEN file, passing an explicit sample
# file as the second argument, then subset it by region, variants and samples.
# NOTE(review): absolute, machine-specific sample path.
bgen_sample_path = '/home/dmc2245/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb32285_imputedindiv.sample'
imput_geno = Genodata(imp_geno_path,bgen_sample_path)
# Return values of the extract* calls are discarded; presumably they filter
# `imput_geno` in place — TODO confirm against the Genodata class.
imput_geno.extractbyregion(region)
# Entries 10..19 of bim.snp as read after the region extraction above.
imput_geno.extractbyvariants(list(imput_geno.bim.snp[10:20]))
# Entries 50..99 of fam.iid.
imput_geno.extractbysamples(list(imput_geno.fam.iid[50:100]))
# Bare expressions: notebook cell outputs only.
imput_geno
region
# Iterate the variants of the imputed BGEN file that fall inside `region`.
from pybgen import PyBGEN
bgen = PyBGEN(imp_geno_path,probs_only=True)
# NOTE(review): `pybgen_region` is not defined in this excerpt.
pybgen_region(bgen,region)
# The chromosome is passed as a string, zero-padded to two characters when it
# is below 10 (5 -> '05'); region[1] and region[2] are the interval bounds.
# NOTE(review): the loop body's indentation was lost; `print(t)` is presumably
# the only statement inside the loop.
for t,g in bgen.iter_variants_in_region('0'+str(region[0]) if region[0]<10 else str(region[0]),region[1],region[2]):
print(t)
# Scratch loop over every variant in `bgen`, taking the argmax along axis 1 of
# each yielded item and casting it to int8.
# NOTE(review): indentation was lost, so it is not recoverable from here which
# of the statements after the `if` belong to its body.
import pandas as pd
tmp = bgen.iter_variants()
genos = []
# zip with range(bgen.nb_variants) supplies a running index `i`.
for i,v in zip(range(bgen.nb_variants),bgen):
# `geno` is rebound to a fresh list on every iteration, and nothing in the
# visible lines appends to `genos`.
geno = []
if i % 100000 ==0:
# NOTE(review): `np` is used here, but `import numpy as np` only appears
# further down in this excerpt.
geno.append(v.argmax(axis=1).astype(np.int8))
# NOTE(review): `j` is never assigned in the visible lines, so this print would
# raise NameError unless an earlier cell defined it.
print(i,j)
# Read genotypes in slices of `step` indices and collect one array per slice.
# NOTE(review): `index`, `step` and `da` are not defined in this excerpt (`da`
# is presumably dask.array — confirm), and the loop body's indentation was lost.
genos = []
n = len(index)
for i in range(0,n,step):
# min(n, i+step) caps the final slice at the end of `index`.
onecode_geno = bgen.read(index[i:min(n,i+step)]) #samples x variants
# argmax over axis 2 collapses the last axis of the 3-D result to a single
# integer per entry, stored as int8.
geno = onecode_geno.argmax(axis=2).astype(np.int8)
genos.append(da.from_array(geno))
# Scratch inspection of a single item pulled from `tmp`.
# Bare modulo expression; evaluates to 1002 and has no effect outside a notebook.
1002 %10000
tmp
# Uses the `.next()` method rather than the built-in next(); this relies on the
# object bound to `tmp` exposing such a method.
a = tmp.next()
import numpy as np
# Second element of the yielded item; presumably the per-sample probability
# array — TODO confirm.
a[1]
# Index of the largest value along axis 1, stored as int8.
aa = a[1].argmax(axis=1).astype(np.int8)
# Frequency table of the resulting integer codes.
pd.Series(aa).value_counts()
# Build a table with one row per variant from bgen.iter_variant_info().
# NOTE(review): the loop body's indentation was lost; the `append` line is
# presumably the only statement inside the loop.
tmp = []
for i,t in enumerate(bgen.iter_variant_info()):
# Row layout: chrom (cast to int), name, constant 0.0, pos, a1, a2, row index.
# NOTE(review): int(t.chrom) raises ValueError if chrom is a non-numeric string.
tmp.append([int(t.chrom),t.name,0.0,t.pos,t.a1,t.a2,i])
# Note the naming shift: the record's `a1`/`a2` fields land in columns 'a0'/'a1'.
tmp = pd.DataFrame(tmp,columns=['chrom','snp','cm','pos','a0','a1','i'])
# Overwrite `snp` with an identifier of the form 'chr<chrom>:<pos>:<a0>:<a1>'.
tmp.snp = 'chr'+tmp[['chrom','pos','a0','a1']].astype(str).agg(':'.join, axis=1)
tmp
# Display the first variant-info record; this materialises the whole iterator
# into a list first.
list(bgen.iter_variant_info())[0]
# Normalise `imput_geno.idx`: when it is a boolean mask, convert it to the list
# of positions whose value is True.
# NOTE(review): the two lines after the `if` are presumably its body
# (indentation was lost).
idx = imput_geno.idx
# The check inspects only the first element and matches the built-in `bool`
# type exactly; a numpy.bool_ element would not satisfy `is bool`. An empty
# `idx` raises IndexError here.
if type(list(idx)[0]) is bool:
# pd.Series(idx) gets a default 0..n-1 index, so selecting with the mask itself
# and taking `.index` yields the positions of the True entries.
pd_idx = pd.Series(idx)
idx = list(pd_idx[pd_idx].index)
# Bare expressions: notebook cell outputs only.
len(idx)
idx[1:10]
# Scratch comparison: open the small regional BGEN extract and the full
# chromosome-5 file with `open_bgen` and call read(1) on each.
# NOTE(review): `open_bgen` and `read_bgen` are not imported in this excerpt —
# confirm which packages provide them.
imp_geno_path = 'MWE_region_extraction/ukb_imp_chr5_v3_05_272856_1213643.bgen'
bgen = open_bgen(imp_geno_path)
bgen.read(1)
# Rebinds `imp_geno_path` and `bgen` to the full, machine-specific file.
imp_geno_path = '/mnt/mfs/statgen/archive/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr5_v3.bgen'
bgen = open_bgen(imp_geno_path)
bgen.read(1)
# Bare expressions: notebook cell outputs only.
imput_geno.bed
bgen
# Passes the `ss` attribute of the summary-statistics object to geno_in_stat;
# the return value is only displayed, not stored.
imput_geno.geno_in_stat(imput_sumstats.ss)
read_bgen('/mnt/mfs/statgen/archive/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr5_v3.bgen')