Source code for dscript.models.embedding

from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import PackedSequence

class IdentityEmbed(nn.Module):
    """
    Does not reduce the dimension of the language model embeddings, just passes
    them through to the contact model.
    """

    def forward(self, x):
        """
        :param x: Input language model embedding :math:`(b \\times N \\times d_0)`
        :type x: torch.Tensor
        :return: Same embedding
        :rtype: torch.Tensor
        """
        return x
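# Usage sketch (illustrative, not part of the original module): IdentityEmbed is a
# drop-in replacement for FullyConnectedEmbed when the full language model
# embedding should reach the contact model unchanged.
#
#   >>> embed = IdentityEmbed()
#   >>> x = torch.randn(1, 100, 6165)   # any (b x N x d_0) embedding
#   >>> torch.equal(embed(x), x)
#   True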
class FullyConnectedEmbed(nn.Module):
    """
    Protein Projection Module. Takes an embedding from the language model and
    outputs a low-dimensional, interaction-aware projection.

    :param nin: Size of language model output
    :type nin: int
    :param nout: Dimension of projection
    :type nout: int
    :param dropout: Proportion of weights to drop out [default: 0.5]
    :type dropout: float
    :param activation: Activation for linear projection model
    :type activation: torch.nn.Module
    """

    def __init__(self, nin, nout, dropout=0.5, activation=nn.ReLU()):
        super(FullyConnectedEmbed, self).__init__()
        self.nin = nin
        self.nout = nout
        self.dropout_p = dropout

        self.transform = nn.Linear(nin, nout)
        self.drop = nn.Dropout(p=self.dropout_p)
        self.activation = activation

    def forward(self, x):
        """
        :param x: Input language model embedding :math:`(b \\times N \\times d_0)`
        :type x: torch.Tensor
        :return: Low dimensional projection of embedding
        :rtype: torch.Tensor
        """
        t = self.transform(x)
        t = self.activation(t)
        t = self.drop(t)
        return t
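# Usage sketch (illustrative, not part of the original module). The input size of
# 6165 assumes the bidirectional SkipLSTM defined below with 3 layers of hidden
# dimension 1024 (21 + 2 * 3 * 1024); any (b x N x d_0) tensor works.
#
#   >>> embed = FullyConnectedEmbed(nin=6165, nout=100, dropout=0.5)
#   >>> x = torch.randn(2, 150, 6165)    # (b x N x d_0) language model embedding
#   >>> embed(x).shape
#   torch.Size([2, 150, 100])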
class LSTMEmbed(nn.Module):
    """
    Projection module built on the pre-trained Bepler & Berger SkipLSTM language
    model. All language model weights are frozen except the final projection
    layer, which is re-initialized and fine-tuned.

    :param nout: Output dimension of the projection
    :type nout: int
    :param activation: Key into the activation dictionary -- 'None', 'ReLU', or 'Sigmoid' [default: 'ReLU']
    :type activation: str
    :param sparse: Whether to apply dropout to the output [default: False]
    :type sparse: bool
    :param p: Dropout probability [default: 0.5]
    :type p: float
    """

    def __init__(self, nout, activation='ReLU', sparse=False, p=0.5):
        super(LSTMEmbed, self).__init__()
        self.activation = activation
        self.sparse = sparse
        self.p = p

        # Pre-trained language model; EMBEDDING_STATE_DICT (the path to its saved
        # weights) is not defined in this module and must be provided elsewhere
        # in the package.
        self.embedding = SkipLSTM(21, nout, 1024, 3)
        self.embedding.load_state_dict(torch.load(EMBEDDING_STATE_DICT))

        # Freeze all language model weights ...
        for param in self.embedding.parameters():
            param.requires_grad = False

        # ... then re-initialize the projection layer and make it trainable.
        torch.nn.init.normal_(self.embedding.proj.weight)
        torch.nn.init.uniform_(self.embedding.proj.bias, 0, 0)
        self.embedding.proj.weight.requires_grad = True
        self.embedding.proj.bias.requires_grad = True

        self.activationDict = nn.ModuleDict(
            {'None': IdentityEmbed(), 'ReLU': nn.ReLU(), 'Sigmoid': nn.Sigmoid()}
        )

        self.dropout = nn.Dropout(p=self.p)

    def forward(self, x):
        t = self.embedding(x)
        if self.activation:
            t = self.activationDict[self.activation](t)
        if self.sparse:
            t = self.dropout(t)
        return t

    def long_embed(self, x):
        # Full concatenated hidden representation from SkipLSTM.transform,
        # without the final projection.
        return self.embedding.transform(x)
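# Usage sketch (illustrative, not part of the original module). Constructing
# LSTMEmbed requires EMBEDDING_STATE_DICT to point at a valid SkipLSTM state
# dict; the input is a batch of numerically encoded sequences (values in
# [0, 20]) and the output has nout features per residue.
#
#   >>> lm = LSTMEmbed(nout=100, activation='ReLU')   # loads EMBEDDING_STATE_DICT
#   >>> seqs = torch.randint(0, 21, (1, 200))         # (b x N) amino acid codes
#   >>> lm(seqs).shape
#   torch.Size([1, 200, 100])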
class SkipLSTM(nn.Module):
    """
    Language model from `Bepler & Berger <https://github.com/tbepler/protein-sequence-embedding-iclr2019>`_.
    Loaded with pre-trained weights in embedding function.

    :param nin: Input dimension of amino acid one-hot [default: 21]
    :type nin: int
    :param nout: Output dimension of final layer [default: 100]
    :type nout: int
    :param hidden_dim: Size of hidden dimension [default: 1024]
    :type hidden_dim: int
    :param num_layers: Number of stacked LSTM models [default: 3]
    :type num_layers: int
    :param dropout: Proportion of weights to drop out [default: 0]
    :type dropout: float
    :param bidirectional: Whether to use biLSTM vs. LSTM
    :type bidirectional: bool
    """

    def __init__(self, nin, nout, hidden_dim, num_layers, dropout=0, bidirectional=True):
        super(SkipLSTM, self).__init__()
        self.nin = nin
        self.nout = nout
        self.dropout = nn.Dropout(p=dropout)

        self.layers = nn.ModuleList()
        dim = nin
        for i in range(num_layers):
            f = nn.LSTM(dim, hidden_dim, 1, batch_first=True, bidirectional=bidirectional)
            self.layers.append(f)
            if bidirectional:
                dim = 2 * hidden_dim
            else:
                dim = hidden_dim

        n = hidden_dim * num_layers + nin
        if bidirectional:
            n = 2 * hidden_dim * num_layers + nin
        self.proj = nn.Linear(n, nout)
    def to_one_hot(self, x):
        """
        Transform numeric encoded amino acid vector to one-hot encoded vector

        :param x: Input numeric amino acid encoding :math:`(N)`
        :type x: torch.Tensor
        :return: One-hot encoding vector :math:`(N \\times n_{in})`
        :rtype: torch.Tensor
        """
        packed = type(x) is PackedSequence
        if packed:
            one_hot = x.data.new(x.data.size(0), self.nin).float().zero_()
            one_hot.scatter_(1, x.data.unsqueeze(1), 1)
            one_hot = PackedSequence(one_hot, x.batch_sizes)
        else:
            one_hot = x.new(x.size(0), x.size(1), self.nin).float().zero_()
            one_hot.scatter_(2, x.unsqueeze(2), 1)
        return one_hot
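    # Example (illustrative, not part of the original module): numeric codes in
    # [0, nin) become one-hot rows of width nin.
    #
    #   >>> model = SkipLSTM(nin=21, nout=10, hidden_dim=8, num_layers=1)
    #   >>> x = torch.tensor([[0, 3, 20]])    # (b x N) numeric encoding
    #   >>> oh = model.to_one_hot(x)
    #   >>> oh.shape
    #   torch.Size([1, 3, 21])
    #   >>> oh[0, 1].nonzero().item()
    #   3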
    def transform(self, x):
        """
        :param x: Input numeric amino acid encoding :math:`(N)`
        :type x: torch.Tensor
        :return: Concatenation of all hidden layers :math:`(N \\times (n_{in} + 2 \\times \\text{num_layers} \\times \\text{hidden_dim}))`
        :rtype: torch.Tensor
        """
        one_hot = self.to_one_hot(x)
        hs = [one_hot]
        h_ = one_hot
        for f in self.layers:
            h, _ = f(h_)
            # h = self.dropout(h)
            hs.append(h)
            h_ = h
        if type(x) is PackedSequence:
            h = torch.cat([z.data for z in hs], 1)
            h = PackedSequence(h, x.batch_sizes)
        else:
            h = torch.cat([z for z in hs], 2)
        return h
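    # Example (illustrative, not part of the original module): with a small model,
    # transform() returns the one-hot input concatenated with every layer's hidden
    # states, i.e. nin + 2 * num_layers * hidden_dim features per position.
    #
    #   >>> model = SkipLSTM(nin=21, nout=10, hidden_dim=8, num_layers=2)
    #   >>> x = torch.randint(0, 21, (1, 5))
    #   >>> model.transform(x).shape          # 21 + 2 * 2 * 8 = 53
    #   torch.Size([1, 5, 53])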
    def forward(self, x):
        """
        :meta private:
        """
        one_hot = self.to_one_hot(x)
        hs = [one_hot]
        h_ = one_hot
        for f in self.layers:
            h, _ = f(h_)
            # h = self.dropout(h)
            hs.append(h)
            h_ = h
        if type(x) is PackedSequence:
            h = torch.cat([z.data for z in hs], 1)
            z = self.proj(h)
            z = PackedSequence(z, x.batch_sizes)
        else:
            h = torch.cat([z for z in hs], 2)
            z = self.proj(h.view(-1, h.size(2)))
            z = z.view(x.size(0), x.size(1), -1)
        return z
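# Usage sketch (illustrative, not part of the original module). A small SkipLSTM
# keeps the shapes easy to follow; the pre-trained configuration used in
# LSTMEmbed above is SkipLSTM(21, nout, 1024, 3). forward() projects the
# concatenated hidden states down to nout features per residue.
#
#   >>> model = SkipLSTM(nin=21, nout=10, hidden_dim=8, num_layers=2)
#   >>> x = torch.randint(0, 21, (1, 5))      # (b x N) numeric amino acid codes
#   >>> model(x).shape
#   torch.Size([1, 5, 10])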