import anndata as ad
import scanpy.api as sc
import episcanpy.api as epi
Download the raw peak matrices here: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE111586
The data consist of 2 replicates, i.e. 2 mouse bone marrow samples (see the file names used below).
## Replicate 1 (sample 62216): load the raw count matrix
# `path` is assigned here but is not referenced by any call in this step.
path = ''
file_name = 'GSM3034623_BoneMarrow_62216.peakmatrix.txt'

# Despite the "bed" in its name, `read_mtx_bed` is not a reader for the BED
# format (the original author's note on what it does read was cut off).
adata = epi.pp.read_mtx_bed(file_name)
# Bare expression: shows the object summary when run as a notebook cell.
adata

# load sample metadata (tab-separated file) into .obs space
metadata_file = 'BoneMarrow62216_metadata.txt'
epi.pp.load_metadata(adata, metadata_file, separator='\t')

# Per-feature and per-cell summaries, computed before binarization.
epi.pp.commoness_features(adata)
epi.pp.coverage_cells(adata)

# If the count matrix is not binary, consider making it binary.
epi.pp.binarize(adata)

# Save the annotated matrix and reload it from disk.
rep1_raw_h5ad = 'matrices/bone_marrow_62216_raw.h5ad'
adata.write(rep1_raw_h5ad)
adata = ad.read(rep1_raw_h5ad)
## Replicate 2 (sample 62016): load the raw count matrix
# `path` is assigned here but is not referenced by any call in this step.
path = ''
file_name2 = 'GSM3034622_BoneMarrow_62016.peakmatrix.txt'

# The reader lives in `epi.pp`; calling the bare name `read_mtx_bed` raises
# NameError because it is never imported into this namespace.
adata2 = epi.pp.read_mtx_bed(file_name2)

# Save the freshly read matrix and reload it from disk.
adata2.write('matrices/bone_marrow_62016_raw.h5ad')
adata2 = ad.read('matrices/bone_marrow_62016_raw.h5ad')

# load sample metadata (tab-separated file) into .obs space
metadata_file = 'BoneMarrow62016_metadata.txt'
epi.pp.load_metadata(adata2, metadata_file, separator='\t')

# Binarize this replicate as well, as is done for the first replicate, so that
# both matrices hold the same kind of values before they are merged.
epi.pp.binarize(adata2)
## Merge the two replicates into one AnnData object
adata3 = ad.AnnData.concatenate(adata, adata2)

# Re-assign the cell names from the two input objects, in concatenation order.
# The second half must come from `adata2`: using `adata3` here (as before)
# yields a list longer than the number of cells and the assignment fails.
# NOTE(review): presumably this undoes the batch suffix that `concatenate`
# appends to obs names by default -- if the two replicates share barcodes the
# restored names will not be unique; confirm this is intended.
adata3.obs_names = list(adata.obs_names) + list(adata2.obs_names)

# Recompute the per-cell and per-feature summaries on the merged matrix.
epi.pp.coverage_cells(adata3)
epi.pp.commoness_features(adata3)

# Save the merged matrix and reload it from disk.
adata3.write('matrices/bone_marrow_merged_62016_and_62216_raw.h5ad')
adata3 = ad.read('matrices/bone_marrow_merged_62016_and_62216_raw.h5ad')
## Filter features, regress out coverage, filter cells
# peaks shall be shared in at least 150 cells
# NOTE(review): assumes `commoness_features` stored its result under
# .var["commonness"] -- confirm the key name.
# `.copy()` turns the subset into an independent object, because the steps
# below modify it in place and a slice of an AnnData is only a view.
adata4 = adata3[:, adata3.var["commonness"] >= 150].copy()
# Bare expression: shows the object summary when run as a notebook cell.
adata4

# Recompute per-cell coverage on the reduced peak set and store it under
# "sum_red_peaks". The keyword is `key_added`; the previous spelling
# `key_addded` is not a valid argument name.
epi.pp.coverage_cells(adata4, key_added="sum_red_peaks")

# this is an important step. The first PC corresponds to the peak coverage.
sc.pp.regress_out(adata4, "sum_red_peaks")

# Despite the regression on the number of peaks covered, some cells are too
# lowly covered to be considered:
# remove cells that have less than 500 peaks covered
adata5 = adata4[adata4.obs["sum_red_peaks"] >= 500, :].copy()
adata5
## Embed, cluster, plot and save
# plot the resulting matrix and adjust filtering in case it looks odd
# NOTE(review): `epi.pp.lazy` presumably computes the PCA, neighbour graph and
# UMAP that the calls below rely on -- confirm against the episcanpy docs.
epi.pp.lazy(adata5, n_comps=100, n_neighbors=50, nb_pcs=20)

# Louvain clustering; the function is `sc.tl.louvain` (`loucain` was a typo
# and raises AttributeError). The plot below colours cells by "louvain".
sc.tl.louvain(adata5)
sc.pl.umap(adata5, color="louvain")

# if it looks good, save the final filtered matrix:
# NOTE(review): this path starts with '../matrices/' while every other
# read/write in this script uses 'matrices/' -- confirm the intended directory.
adata5.write('../matrices/bone_marrow_62216_62016_com150_peaks_regressed_red_cov500_final_filtered.h5ad')
Now that you have a processed matrix, you can attempt to identify cell types. For this, check out the corresponding tutorial.