Tutorial

[1]:
import os
import sonia
from sonia.sonia_leftpos_rightpos import SoniaLeftposRightpos
from sonia.plotting import Plotter
from sonia.evaluate_model import EvaluateModel
from sonia.sequence_generation import SequenceGeneration
import numpy as np
import pandas as pd

load lists of sequences with gene specification

[2]:
# Read the sequence table (CDR3 amino-acid sequence, V gene, J gene) using
# pandas' default comma separator and keep every row as an array of strings.
# The builtin `str` is used for the cast: `np.str` was only a deprecated alias
# of it and no longer exists in NumPy >= 1.24.
data_seqs = list(pd.read_csv('data_seqs.csv.gz').values.astype(str))
data_seqs[:3]
[2]:
[array(['CASSKQGASEAFF', 'TRBV7-8', 'TRBJ1-1'], dtype='<U26'),
 array(['CASSPPPNYGYTF', 'TRBV6-1', 'TRBJ1-2'], dtype='<U26'),
 array(['CASSTDTTEAFF', 'TRBV6-5', 'TRBJ1-1'], dtype='<U26')]

define and infer model

[3]:
# Build the model for the human TRB chain from the loaded data sequences;
# construction encodes the data (see the "Encode data." progress bar below).
qm = SoniaLeftposRightpos(
    data_seqs=data_seqs,
    chain_type='humanTRB',
)
  1%|          | 1449/200000 [00:00<00:13, 14488.42it/s]
Encode data.
100%|██████████| 200000/200000 [00:14<00:00, 13918.58it/s]
[4]:
# Add 200,000 generated sequences to the model. They can also be supplied
# from a file; a larger sample is better.
qm.add_generated_seqs(200000)
  1%|          | 1824/200000 [00:00<00:21, 9184.30it/s]
Generate sequences.
100%|██████████| 200000/200000 [00:22<00:00, 8876.69it/s]
  1%|▏         | 2944/200000 [00:00<00:13, 14349.60it/s]
Encode gen.
100%|██████████| 200000/200000 [00:13<00:00, 14425.46it/s]
[5]:
# Define and train the model: run the selection inference for 30 epochs.
qm.infer_selection(
    epochs=30,
)

do some plotting

[6]:
# Wrap the trained model in a Plotter, then show how the learning went.
plot_sonia = Plotter(qm)
plot_sonia.plot_model_learning()
_images/sonia_tutorial_9_0.png
[7]:
# Renders two figures (see the images below); presumably the V-gene, J-gene
# and length marginals -- confirm against the sonia Plotter documentation.
plot_sonia.plot_vjl()
_images/sonia_tutorial_10_0.png
_images/sonia_tutorial_10_1.png
[8]:
# Renders one figure; presumably the distribution of log selection factors
# (log Q) -- confirm against the sonia Plotter documentation.
plot_sonia.plot_logQ()
_images/sonia_tutorial_11_0.png

generate sequences

[9]:
# Sequence generator backed by the trained model.
gn = SequenceGeneration(qm)
[10]:
# Draw 10,000 sequences with generate_sequences_pre and preview the first
# three. Each entry is [CDR3 amino acids, V gene, J gene, CDR3 nucleotides].
pre_seqs = gn.generate_sequences_pre(10000)
pre_seqs[:3]
[10]:
[['CASTGLDTEAFF', 'TRBV9', 'TRBJ1-1', 'TGTGCCAGCACAGGGCTTGACACTGAAGCTTTCTTT'],
 ['CASSALPGQNLNTEAFF',
  'TRBV5-4',
  'TRBJ1-1',
  'TGTGCCAGCAGCGCCCTTCCCGGGCAAAATTTGAACACTGAAGCTTTCTTT'],
 ['CASSFLKEGALYGYTF',
  'TRBV11-3',
  'TRBJ1-2',
  'TGTGCCAGCAGCTTTCTCAAAGAGGGGGCCCTCTATGGCTACACCTTC']]
[11]:
# Draw 10,000 sequences with generate_sequences_post and preview the first
# three. Rows have the same four fields as the pre-selection sample.
post_seqs = gn.generate_sequences_post(10000)
post_seqs[:3]
[11]:
array([['CAIVMVARIDTQYF', 'TRBV9', 'TRBJ2-3',
        'TGTGCCATTGTGATGGTAGCGAGGATAGATACGCAGTATTTT'],
       ['CASSPSSSLTNYGYTF', 'TRBV4-2', 'TRBJ1-2',
        'TGTGCCAGCAGCCCCAGTTCGAGCCTCACTAACTATGGCTACACCTTC'],
       ['CASSELLVWDRVGNEQFF', 'TRBV2', 'TRBJ2-1',
        'TGTGCCAGCAGTGAACTACTCGTCTGGGACAGGGTTGGCAATGAGCAGTTCTTC']],
      dtype='<U96')

evaluate sequences

[12]:
# Evaluator backed by the trained model.
ev = EvaluateModel(qm)
[13]:
# Evaluate three samples with the model: the first 10,000 data sequences, the
# sample from generate_sequences_pre and the sample from
# generate_sequences_post. Each call returns a (Q, pgen, ppost) triple.
Q_data, pgen_data, ppost_data = ev.evaluate_seqs(qm.data_seqs[:int(1e4)])
Q_gen, pgen_gen, ppost_gen = ev.evaluate_seqs(pre_seqs)
Q_model, pgen_model, ppost_model = ev.evaluate_seqs(post_seqs)

# Print the first three values of each quantity for the post-selection sample.
# Each print is a plain statement: a trailing comma here would wrap the call's
# None result in a discarded one-element tuple.
print(Q_model[:3])
print(pgen_model[:3])
print(ppost_model[:3])
[0.77306277 3.2957966  0.80261236]
[3.34611302e-16 2.71243768e-12 1.90287780e-14]
[2.58675539e-16 8.93964296e-12 1.52727325e-14]
[14]:
# Compare the pgen values of the data, generated and model samples.
plot_sonia.plot_prob(
    data=pgen_data,
    gen=pgen_gen,
    model=pgen_model,
    ptype='P_{pre}',
)
_images/sonia_tutorial_19_0.png
[15]:
# Same comparison for the ppost values of the three samples.
plot_sonia.plot_prob(
    ppost_data,
    ppost_gen,
    ppost_model,
    ptype='P_{post}',
)
_images/sonia_tutorial_20_0.png
[16]:
# Same comparison for the Q values, with explicit bin limits of -4 and 2.
plot_sonia.plot_prob(
    Q_data,
    Q_gen,
    Q_model,
    ptype='Q',
    bin_min=-4,
    bin_max=2,
)
_images/sonia_tutorial_21_0.png

some utils from OLGA

[17]:
# The evaluation and selection models import OLGA classes, which can be used for additional analyses.
[18]:
# Call the generator's underlying sequence-generation model directly. The
# output below is a 4-tuple: nucleotide CDR3, amino-acid CDR3 and two integers
# (presumably V and J gene indices -- confirm against the OLGA documentation).
gn.seq_gen_model.gen_rnd_prod_CDR3()
[18]:
('TGTGCCAGCAGCTTATTGAGTACGGGACAGGGGCAAAGAGCTTTCTTT', 'CASSLLSTGQGQRAFF', 69, 0)
[19]:
# Inspect entry 1 of the genomic J-gene table. As the output shows, an entry
# holds the allele name followed by two nucleotide strings (presumably the
# CDR3 portion and the full germline segment -- confirm against OLGA docs).
gn.genomic_data.genJ[1]
[19]:
['TRBJ1-2*01',
 'CTAACTATGGCTACACCTTC',
 'CTAACTATGGCTACACCTTCGGTTCGGGGACCAGGTTAACCGTTGTAG']
[20]:
# Inspect the PinsDJ array of the evaluator's pgen model (presumably the
# probability of each DJ insertion length -- confirm against OLGA docs).
ev.pgen_model.PinsDJ
[20]:
array([6.17437e-02, 3.61889e-02, 9.09608e-02, 1.05828e-01, 1.37586e-01,
       1.14643e-01, 9.60481e-02, 8.14864e-02, 6.38634e-02, 4.92164e-02,
       3.93751e-02, 2.90524e-02, 2.30059e-02, 1.64381e-02, 1.45157e-02,
       1.13759e-02, 5.79127e-03, 5.97164e-03, 3.92779e-03, 2.96191e-03,
       2.04381e-03, 2.48417e-03, 9.09996e-04, 1.35102e-03, 2.44798e-04,
       4.52171e-04, 9.17052e-04, 6.28282e-04, 0.00000e+00, 1.41295e-05,
       9.74155e-04, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00])

load default sonia models

[21]:
# Load the default human TRA model shipped inside the installed sonia package.
# Note: this rebinds `qm`; plot_sonia, gn and ev were created from the model
# inferred earlier and are not changed by the rebinding.
model_dir = os.path.join(
    os.path.dirname(sonia.sonia_leftpos_rightpos.__file__),
    'default_models',
    'human_T_alpha',
)
qm = SoniaLeftposRightpos(load_dir=model_dir, chain_type='human_T_alpha')
Cannot find data_seqs.tsv  --  no data seqs loaded.
Cannot find gen_seqs.tsv  --  no generated seqs loaded.
[22]:
# Sequence generator backed by the default model currently bound to `qm`.
sq = SequenceGeneration(sonia_model=qm)
# Generate 10 sequences with generate_sequences_pre. They are kept under their
# own name so the next call does not overwrite them.
seqs_pre = sq.generate_sequences_pre(10)
# Generate 10 sequences with generate_sequences_post and print them.
seqs = sq.generate_sequences_post(10)
print(seqs)
[['CALSRVTGGGNKLTF' 'TRAV19' 'TRAJ10'
  'TGTGCTCTGTCCAGGGTCACGGGAGGAGGAAACAAACTCACCTTT']
 ['CALSDLRSDGQKLLF' 'TRAV9-2' 'TRAJ16'
  'TGTGCTCTGAGTGATCTACGGTCAGATGGCCAGAAGCTGCTCTTT']
 ['CAAKTGTASKLTF' 'TRAV13-1' 'TRAJ44'
  'TGTGCAGCAAAAACCGGCACTGCCAGTAAACTCACCTTT']
 ['CAMREVDTVSGGYNKLIF' 'TRAV14/DV4' 'TRAJ4'
  'TGTGCAATGAGAGAGGTAGATACGGTTTCTGGTGGCTACAATAAGCTGATTTTT']
 ['CAAIPNNNARLMF' 'TRAV23/DV6' 'TRAJ31'
  'TGTGCAGCAATCCCGAATAACAATGCCAGACTCATGTTT']
 ['CALEESSASKIIF' 'TRAV6' 'TRAJ3'
  'TGTGCTCTAGAAGAGAGCAGTGCTTCCAAGATAATCTTT']
 ['CAVRDPNQGGKLIF' 'TRAV1-1' 'TRAJ23'
  'TGCGCTGTGAGAGATCCGAACCAGGGAGGAAAGCTTATCTTC']
 ['CAVRVNNNARLMF' 'TRAV1-1' 'TRAJ31'
  'TGCGCTGTGAGAGTGAATAACAATGCCAGACTCATGTTT']
 ['CAVFGNAGGTSYGKLTF' 'TRAV36/DV7' 'TRAJ52'
  'TGTGCTGTGTTTGGTAATGCTGGTGGTACTAGCTATGGAAAGCTGACATTT']
 ['CAACSQGGSEKLVF' 'TRAV13-1' 'TRAJ57'
  'TGTGCAGCATGCTCTCAGGGCGGATCTGAAAAGCTGGTCTTT']]