import mira #import mira, I've adjusted the namespaces so that they are more convenient to use
import scanpy as sc
import numpy as np
import anndata
import pandas as pd
import logging
# Raise the root logger's threshold so only ERROR and above are emitted.
root_logger = logging.getLogger()
root_logger.setLevel(logging.ERROR)

# wide_view is now reached through the mira.utils namespace.
mira.utils.wide_view()
Data loading and preprocessing are the same as before.
# Load the 10x feature matrix for the mouse prostate dataset.
rna_data = sc.read_10x_h5('/Users/alynch/Dropbox (Partners HealthCare)/Data/mouse_prostate/2021-06-01_mouse_prostate_raw_10x_features.h5')

# Upper-case the gene symbols, then de-duplicate the resulting names.
rna_data.var_names = rna_data.var_names.str.upper()
rna_data.var_names_make_unique()

# Drop every gene whose symbol starts with 'GM'. Slicing an AnnData returns a
# view, so take an explicit copy here; otherwise the first in-place filter
# below has to copy the view implicitly and warns
# "Trying to set attribute `.var` of view, copying."
rna_data = rna_data[:, ~rna_data.var.index.str.startswith('GM')].copy()

# Gene filter, two cell filters, then a second (stricter) gene filter applied
# to the cells that remain.
sc.pp.filter_genes(rna_data, min_cells=15)
sc.pp.filter_cells(rna_data, min_genes=400)
sc.pp.filter_cells(rna_data, min_counts=800)
sc.pp.filter_genes(rna_data, min_cells=25)

# Keep the filtered, un-normalized matrix in .raw; it is restored into .X below.
rna_data.raw = rna_data

# Normalize and log-transform; target_sum is passed by keyword so the call
# does not depend on positional-argument order.
sc.pp.normalize_total(rna_data, target_sum=1e4)
sc.pp.log1p(rna_data)

# Permissive selection first (min_disp=-0.5) -> 'exog_feature'; then a
# stricter cut on normalized dispersion (> 0.5) within that set ->
# 'highly_variable'. These are the column names later given to the topic
# model as exogenous_key / endogenous_key.
sc.pp.highly_variable_genes(rna_data, min_disp=-0.5, n_bins=20)
rna_data.var['exog_feature'] = rna_data.var['highly_variable']
rna_data.var['highly_variable'] = (
    (rna_data.var['dispersions_norm'] > 0.5) & rna_data.var['exog_feature']
)

# Store the normalized matrix as a layer and put the .raw matrix back in .X.
rna_data.layers['norm'] = rna_data.X
rna_data.X = rna_data.raw.X

# Notebook display: (number of highly_variable genes, number of exog_feature genes).
rna_data.var['highly_variable'].sum(), rna_data.var['exog_feature'].sum()
Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Trying to set attribute `.var` of view, copying.
(2524, 7632)
# ExpressionTopicModel is now reached through the mira.topic_model namespace.
# The two keys name the feature columns prepared during preprocessing
# (presumably looked up in rna_data.var -- confirm against the mira docs).
feature_keys = {
    'exogenous_key': 'exog_feature',
    'endogenous_key': 'highly_variable',
}
rna_model = mira.topic_model.ExpressionTopicModel(**feature_keys)

# The learning-rate range test is now a method of the model rather than of a trainer.
rna_model.get_learning_rate_bounds(rna_data)
Learning rate range test: 100%|████████████████████████████████████████████████████████████████████████████████| 27/27 [00:14<00:00, 1.85it/s]
(0.0013553956963741314, 0.15950445631422783)
# Adjust the bounds found by the range test, then plot them.
rna_model.trim_learning_rate_bounds(1.5, 0.5)

bounds_figsize = (5, 3)
rna_model.plot_learning_rate_bounds(figsize=bounds_figsize)
<AxesSubplot:xlabel='Learning Rate', ylabel='Loss'>
# New namespace: the tuner wraps a model you have already specified
# (similar to a scikit-learn cross-validation construct).
tuner = mira.topic_model.TopicModelTuner(rna_model)

# New function: labels cells in the AnnData with a random test set.
train_fraction = 0.8
tuner.train_test_split(rna_data, train_size=train_fraction)

# In this run the trials were stopped early, purely for demonstration.
tuner.tune(rna_data)
Trials finished 2 Modules | Trials (number is #folds tested) 7 | 0 48 | 0 Trial Information: Trial #0 | pruned at step: 0 | params: {'num_topics': 48, 'batch_size': 128, 'encoder_dropout': 0.07558272160242445, 'num_epochs': 21, 'beta': 0.9523042471222197, 'seed': 2797517746} Trial #1 | pruned at step: 0 | params: {'num_topics': 7, 'batch_size': 128, 'encoder_dropout': 0.012828034025943614, 'num_epochs': 22, 'beta': 0.9363667448175782, 'seed': 3820114806} Next trial progress: |##########|# | | | |
<optuna.study.study.Study at 0x7fc3fe60ad10>
# Rebind rna_model to the model returned by the tuner's select_best_model
# for this dataset (replaces the untrained model handed to the tuner).
rna_model = tuner.select_best_model(rna_data)
Epoch 3 done. Recent losses: 1.333e+00 --> 1.137e+00 --> 9.058e-01: 14%|█████▎ | 3/22 [00:07<00:47, 2.49s/it] Epoch 2 done. Recent losses: 1.336e+00 --> 1.168e+00: 9%|████▌ | 2/23 [00:04<00:47, 2.26s/it] Epoch 2 done. Recent losses: 1.235e+00 --> 9.743e-01: 5%|██▋ | 2/39 [00:05<01:36, 2.61s/it] Epoch 1 done. Recent losses: 1.342e+00: 5%|███ | 1/22 [00:04<01:24, 4.03s/it]
# Train a real model for demonstration purposes, with every hyperparameter
# spelled out explicitly instead of taken from the tuner.
model_kwargs = dict(
    exogenous_key='exog_feature',
    endogenous_key='highly_variable',
    num_topics=12,
    encoder_dropout=0.05,
    num_epochs=30,
    batch_size=32,
    beta=0.95,
    max_learning_rate=0.15950445631422783,
    min_learning_rate=0.006074462078423195,
)
rna_model = mira.topic_model.ExpressionTopicModel(**model_kwargs).fit(rna_data)
Epoch 31 done. Recent losses: 2.942e-01 --> 2.939e-01 --> 2.938e-01 --> 2.937e-01 --> 2.937e-01: 97%|████████▋| 30/31 [01:20<00:02, 2.67s/it]
# Round-trip the trained model through disk: save it, then load it back.
checkpoint_path = 'test.pth'
rna_model.save(checkpoint_path)
rna_model = mira.topic_model.ExpressionTopicModel.load(checkpoint_path)

# Run the reloaded model over the dataset: predictions, UMAP features, imputation.
rna_model.predict(rna_data)
rna_model.get_umap_features(rna_data)
rna_model.impute(rna_data)
Predicting latent vars: 100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 8.30it/s] Predicting latent vars: 100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 8.44it/s] Imputing features: 100%|████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 107.36it/s]
# Neighbor graph on the model's UMAP features, using Manhattan distance.
sc.pp.neighbors(
    rna_data,
    use_rep='X_umap_features',
    metric='manhattan',
)

# 2-D embedding from that graph.
sc.tl.umap(rna_data, min_dist=0.1)

# One panel per entry of rna_model.topic_cols, six panels per row.
topic_columns = rna_model.topic_cols
sc.pl.umap(
    rna_data,
    color=topic_columns,
    frameon=False,
    ncols=6,
)
# All three calls below are made for topic 6.
topic_num = 6

rna_model.post_topic(topic_num, top_n=250)
rna_model.fetch_topic_enrichments(topic_num)

# Bars are outlined by adjusted p-value. A way to adjust the upper limit of
# the palette will be added later; maybe it should be something like 5 or 3.
rna_model.plot_enrichments(
    topic_num,
    gene_fontsize=8,
    barcolor='white',
    color_by_adj=True,
    palette='Reds',
)
# Load the ATAC-seq AnnData for the mouse prostate dataset.
atac_h5ad_path = '/Users/alynch/Dropbox (Partners HealthCare)/Data/mouse_prostate/atac_data.h5ad'
atac_data = anndata.read_h5ad(atac_h5ad_path)

# There is no highly-variable selection for this model, so it can be
# initialized without any parameters.
atac_model = mira.topic_model.AccessibilityTopicModel()
# Split each peak name into its three fields and store them as the
# 'chr', 'start' and 'end' columns of atac_data.var.
#
# Two fixes over the previous version:
#   * pandas has no `resetindex`; the method is `reset_index`.
#   * the separator was an empty string, which cannot split a name into fields.
# NOTE(review): '_' is assumed to be the field separator (names of the form
# 'chr1_3000_3500') -- confirm against the values in atac_data.var['name'].
peak_fields = atac_data.var.reset_index()['name'].str.split('_')

# zip(*peak_fields) transposes the per-peak lists into one tuple per field.
for col, vals in zip(['chr', 'start', 'end'], zip(*peak_fields)):
    atac_data.var[col] = vals
# Load the tab-separated table of canonical TSS annotations for mm10.
# (The four statements were previously fused onto one line, which is not
# valid Python; each now stands on its own line.)
tss_data = pd.read_csv('/Users/alynch/genomes/mm10/canonical_tss.tsv', sep='\t')

# Keep only the part of each column name after its last '.'.
tss_data.columns = tss_data.columns.str.split('.').str.get(-1)

# Upper-case gene symbols, matching the upper-cased gene names used for the RNA data.
tss_data['geneSymbol'] = tss_data.geneSymbol.str.upper()

# Discard rows with any missing value.
tss_data = tss_data.dropna()
# Annotate the ATAC peaks with distances to TSSs, telling mira which columns
# of tss_data hold the chromosome, start, end and strand of each gene.
genome_file_path = '/Users/alynch/genomes/mm10/mm10.genome'
mira.tools.get_distance_to_TSS(
    atac_data,
    tss_data=tss_data,
    gene_chrom='chrom',
    gene_end='chromEnd',
    gene_start='chromStart',
    gene_strand='strand',
    genome_file=genome_file_path,
)