---
title: Liftover module
keywords: fastai
sidebar: home_sidebar
summary: "Liftover genodata and sumstat"
description: "Liftover genodata and sumstat"
nb_path: "nbs/02_Liftover.ipynb"
---
# Scratch notebook expression: first dimension (`shape[0]`) of `geno.bim`.
# NOTE(review): `geno` is not assigned anywhere before this line in the visible source.
geno.bim.shape[0]
from pathlib import Path
from cugg.genodata import *
from cugg.sumstat import Sumstat
from cugg.liftover import Liftover
def main(input_path,output_path,fr='hg19',to='hg38',remove_missing=True,rename=True):
    """Lift a PLINK, VCF/gVCF or summary-statistics file between genome builds.

    The input type is picked from the file name:

    * last suffix ``.bim``/``.bed``/``.fam``: PLINK fileset, loaded through
      its ``.bed`` file;
    * any suffix equal to ``.vcf`` or ``.gvcf`` (so ``x.gvcf.gz`` matches):
      delegated to ``Liftover.vcf_liftover``;
    * anything else: read as a summary-statistics file and written back as a
      gzip-compressed, tab-separated table.

    Args:
        input_path: File to lift over (str or path-like).
        output_path: Destination. For PLINK input its suffix is replaced by
            ``.bed`` (``remove_missing=True``) or ``.bim`` (otherwise, in
            which case only the lifted ``.bim`` is written).
        fr: Source genome build passed to ``Liftover``.
        to: Target genome build passed to ``Liftover``.
        remove_missing: When true, variants whose lifted chromosome is 0
            (reported below as "failed to liftover") are dropped from the
            output; when false they are kept.
        rename: Forwarded to ``Sumstat`` and ``Liftover.sumstat_liftover``.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
    """
    input_path = Path(input_path)
    output_path = Path(output_path)
    # Fail fast, before the Liftover object is built, instead of printing a
    # warning and letting a downstream loader crash on the missing file.
    if not input_path.exists():
        raise FileNotFoundError(f"The file does not exist: {input_path}")

    lf = Liftover(fr, to)
    print(f"liftover from {fr} to {to}")
    print("Removing SNPs failed to liftover is", remove_missing)

    # File type detection: plink, vcf/gvcf, otherwise sumstats (bgen not handled).
    # All suffixes are inspected so compressed files such as ".gvcf.gz" match.
    input_suffixes = set(input_path.suffixes)
    if input_path.suffix in ('.bim', '.bed', '.fam'):
        geno = Genodata(str(input_path.with_suffix('.bed')))
        new_bim = lf.bim_liftover(geno.bim)
        idx = new_bim.chrom == 0  # True where the variant could not be lifted
        n_failed = idx.sum()
        if remove_missing:
            geno.bim = new_bim
            geno.extractbyidx(~idx)
            geno.export_plink(output_path.with_suffix('.bed'))
            print("Total number SNPs ",new_bim.shape[0],". Removing SNPs failed to liftover ", n_failed)
        else:
            # Genotypes are untouched here; only the lifted .bim is written.
            write_bim(output_path.with_suffix('.bim'),new_bim)
            print("Total number SNPs ",new_bim.shape[0],". The number of SNPs failed to liftover ", n_failed,". Their chr and pos is replaced with 0, 0")
    elif input_suffixes & {'.gvcf', '.vcf'}:
        lf.vcf_liftover(input_path,output_path,remove_missing)
    else:
        print("This file is considered as sumstat format file")
        sums = Sumstat(input_path,rename=rename)
        new_sums = lf.sumstat_liftover(sums.ss,rename)
        idx = new_sums.CHR == 0  # True where the variant could not be lifted
        n_failed = idx.sum()
        kept = new_sums[~idx] if remove_missing else new_sums
        kept.to_csv(output_path, compression='gzip', sep = "\t", header = True, index = False)
        if remove_missing:
            print("Total number SNPs ",new_sums.shape[0],". Removing SNPs failed to liftover ", n_failed)
        else:
            print("Total number SNPs ",new_sums.shape[0],". The number of SNPs failed to liftover ", n_failed,". Their chr and pos is replaced with 0, 0")
from glob import glob
from pathlib import Path
# Scratch cells: pathlib suffix behaviour and the Liftover API tried on a sample gVCF.
tmp = Path('data/GH.AR.SAD.P1.001.0_X3547_S42_1180478_GVCF.hard-filtered.gvcf.gz')
# `suffix` is only the last extension ('.gz' here), so this test is False for a .gvcf.gz file.
tmp.suffix[1:] in ['bim','bed','fam']
# `with_suffix` replaces only that last extension: '...gvcf.gz' -> '...gvcf.bed'.
tmp.with_suffix('.bed')
lf = Liftover('hg19','hg38')
# Index the chain map by 22, then by 50549067 — presumably chromosome and position; verify against Liftover.
lf.chainmap[22][50549067]
vcf ='data/GH.AR.SAD.P1.001.0_X3547_S42_1180478_GVCF.hard-filtered.gvcf.gz'
# '.gvcf' is among the file's suffixes, so the main() defined above takes its VCF branch.
main(vcf,'data/new_hg19_hg38_test.gvcf.gz',remove_missing=True)
# Three-element list whose last item evaluates to 313528; presumably [chrom, start, end] — TODO confirm.
lf.region_liftover([5,272741,1213528-900000])
# Called with the input path only, whereas main() above also passes an output path and remove_missing.
lf.vcf_liftover(vcf)
# Bare arithmetic and a bare literal left from interactive use; they have no effect when run as a script.
59995/93816694
1
from cugg.genodata import Genodata
# Scratch cells: load PLINK data from machine-specific absolute paths and try main() on them.
geno = Genodata('/mnt/mfs/statgen/alzheimers-family/linkage_files/geno/full_sample/bfiles/full_sample.bed')
# Rebinds `geno`; the object loaded on the previous line is discarded.
geno = Genodata('/mnt/mfs/statgen/guangyou/imputation/genome/othergenes/UKB_exome_othergenes.bed')
# '.bed' input: the main() defined above takes its PLINK branch (default builds hg19 -> hg38).
main('/mnt/mfs/statgen/guangyou/imputation/genome/othergenes/UKB_exome_othergenes.bed','test2.bed')
# No PLINK or VCF suffix in this name, so the main() defined above treats it as a sumstat file.
main('MWE_region_extraction/100521_UKBB_Hearing_aid_f3393_expandedwhite_15601cases_237318ctrl_500k_PC1_PC2_f3393.regenie.snp_stats.gz','test_sumstats.sumstats.gz')
# Bare name: displays the object in a notebook, no effect in a script.
geno
# Mask is True where chrom is not 1; presumably extractbyidx keeps the True rows — verify against Genodata.
geno.extractbyidx(~(geno.bim.chrom == 1))
geno.export_plink('test1.bed')
from cugg.sumstat import Sumstat
from cugg.liftover import Liftover
def gwas_liftover(input_path,output_path,output_unmapped,output_mapped,fr='hg19',to='hg38',remove_missing=False):
    """Lift a GWAS summary-statistics file between builds and write gzip TSV output.

    Args:
        input_path: Summary-statistics file handed to ``Sumstat``.
        output_path: Destination for the full lifted table; only written
            when ``remove_missing`` is false.
        output_unmapped: Destination for rows with ``CHR == 0``; only
            written when ``remove_missing`` is true.
        output_mapped: Destination for rows with ``CHR != 0``; only written
            when ``remove_missing`` is true.
        fr: Source genome build passed to ``Liftover``.
        to: Target genome build passed to ``Liftover``.
        remove_missing: Selects between the split (mapped/unmapped) output
            and the single combined output.
    """
    lf = Liftover(fr, to)
    print("reading GWAS sumstat")
    sums = Sumstat(input_path)
    # The original concatenation printed e.g. "liftover fromhg19tohg38".
    print(f"liftover from {fr} to {to}")
    lifted = lf.sumstat_liftover(sums.ss)

    # Every output shares the same on-disk format.
    csv_opts = dict(compression='gzip', sep="\t", header=True, index=False)
    if remove_missing:
        lifted[lifted.CHR == 0].to_csv(output_unmapped, **csv_opts)
        lifted[lifted.CHR != 0].to_csv(output_mapped, **csv_opts)
    else:
        lifted.to_csv(output_path, **csv_opts)
def gwas_liftover(input_file,output_path=None,fr='hg19',to='hg38',remove_missing=False):
    """Lift a GWAS summary-statistics file and write the result next to it.

    Output file names are derived from the input name with up to two
    trailing extensions removed (``x.snp_stats.gz`` -> ``x``):

    * ``<name>.<to>.sumstats.gz`` when ``remove_missing`` is false;
    * ``<name>.<to>.sumstats_unmapped.gz`` (rows with ``CHR == 0``) and
      ``<name>.<to>.sumstats_mapped.gz`` (rows with ``CHR != 0``) when
      ``remove_missing`` is true.

    All outputs are gzip-compressed, tab-separated tables.

    Args:
        input_file: Summary-statistics file handed to ``Sumstat``.
        output_path: Directory to write into; defaults to the directory of
            ``input_file``. The directory is not created here.
        fr: Source genome build passed to ``Liftover``.
        to: Target genome build passed to ``Liftover``.
        remove_missing: Selects between split and combined output (above).
    """
    source = Path(input_file)
    # `parent` is '.' for a bare file name, unlike dirname() + '/', which
    # would have pointed at the filesystem root.
    out_dir = source.parent if output_path is None else Path(output_path)
    stem = source.with_suffix('').with_suffix('').name
    prefix = f"{stem}.{to}"

    # Honour the requested builds instead of the hard-coded hg19 -> hg38.
    lf = Liftover(fr, to)
    print("reading GWAS sumstat")
    sums = Sumstat(input_file)
    print(f"liftover from {fr} to {to}")
    lifted = lf.sumstat_liftover(sums.ss)

    csv_opts = dict(compression='gzip', sep="\t", header=True, index=False)
    if remove_missing:
        lifted[lifted.CHR == 0].to_csv(out_dir / f"{prefix}.sumstats_unmapped.gz", **csv_opts)
        lifted[lifted.CHR != 0].to_csv(out_dir / f"{prefix}.sumstats_mapped.gz", **csv_opts)
    else:
        lifted.to_csv(out_dir / f"{prefix}.sumstats.gz", **csv_opts)
# Three output-name templates for the lifted, unmapped and mapped sumstats.
# NOTE(review): `cwd` and `_input` are not defined in the visible source, and `bnn` is used as a
# format spec — this looks like workflow-template text rather than runnable Python; confirm.
# NOTE(review): the trailing commas on the first two lines make those values 1-tuples, not strings.
sumstats_lifted = f'{cwd}/{_input:bnn}.hg38.sumstats.gz',
sumstats_unmapped = f'{cwd}/{_input:bnn}.hg38.sumstats_unmapped.gz',
sumstats_mapped = f'{cwd}/{_input:bnn}.hg38.sumstats_mapped.gz'
import os
# Scratch cells: os.path helpers and a 10-row liftover smoke test.
# NOTE(review): the module-level `input_path` is only assigned further down in the visible source.
tmp = os.path.basename(input_path)
# splitext splits off only the final extension of the base name.
os.path.splitext(tmp)
# Directory part with a trailing slash; this is just '/' when input_path has no directory component.
os.path.dirname(input_path)+'/'
sums = Sumstat(input_path)
lf = Liftover('hg19','hg38')
# Lift only the first 10 entries of sums.ss as a quick check.
lf.sumstat_liftover(sums.ss[:10])
def main(input_path,output_path,remove_missing):
    """Lift a regenie summary-statistics file and write it as a tab-separated table.

    Args:
        input_path: File handed to ``read_regenie``.
        output_path: Destination passed to ``to_csv``.
        remove_missing: When true, rows with ``CHR == 0`` are dropped before
            writing; when false every row is written.
    """
    sums = read_regenie(input_path)
    lifted = sumstat_liftover(sums)
    if remove_missing:
        # The original kept `CHR == 0` here, i.e. it wrote exactly the rows
        # that were supposed to be removed.
        lifted = lifted[lifted.CHR != 0]
    lifted.to_csv(output_path, sep="\t", header=True, index=False)
# Scratch cells: run the three-argument main() defined just above, then sketch a region-based liftover.
input_path = '/home/dmc2245/UKBiobank/results/REGENIE_results/results_imputed_data/2021_10_07_combined_500K/100521_UKBB_Combined_f2247_f2257_expandedwhite_93258cases_237318ctrl_500k_PC1_PC2_f2247_f2257.regenie.snp_stats.gz'
# NOTE(review): the output path is an empty string here.
output_path = ''
remove_missing = True
main(input_path,output_path,remove_missing)
# str.join needs string items; the int 1 makes this raise TypeError.
':'.join([1,'1'])
# NOTE(review): `gen` is not defined in the visible source.
gen
# Presumably [chrom, start, end] — TODO confirm against extractbyregion / region_liftover.
region = [22,50519304,50549676]
exome_sumstats = Sumstat('/home/dmc2245/UKBiobank/results/REGENIE_results/results_imputed_data/010522_f2247_hearing_diff_200K_imputed/090321_UKBB_Hearing_difficulty_f2247_expandedwhite_45502cases_96601ctrl_PC1_2_f2247.regenie.snp_stats.gz')
exome_sumstats.extractbyregion(region)
print('1.2. LiftOver the region')
hg38toimpref = Liftover('hg38','hg19')
imp_region = hg38toimpref.region_liftover(region)
# NOTE(review): `imput_sumstats` is not defined in the visible source.
imput_sumstats.extractbyregion(imp_region)
print('1.3. Regional SNPs Liftover')
# NOTE(review): `imp_ref` is not defined in the visible source.
impreftohg38 = Liftover(imp_ref,'hg38') # opposite direction of hg38toimpref
# Incomplete expression left at the end of the cell (attribute name missing after the dot).
impreftohg38.