Source code for dscript.alphabets

from __future__ import print_function, division

import numpy as np

class Alphabet:
    """
    Bidirectional mapping between byte characters and small integer codes.

    From `Bepler & Berger <https://github.com/tbepler/protein-sequence-embedding-iclr2019>`_.

    :param chars: List of characters in alphabet
    :type chars: byte str
    :param encoding: Mapping of characters to numbers, one entry per character
        of ``chars`` [default: the position of each character in ``chars``]
    :type encoding: np.ndarray (any integer sequence is accepted)
    :param mask: Set encoding mask; when true the reported size is reduced by
        one [default: False]
    :type mask: bool
    :param missing: Number to use for a value outside the alphabet [default: 255]
    :type missing: int
    """

    def __init__(self, chars, encoding=None, mask=False, missing=255):
        self.chars = np.frombuffer(chars, dtype=np.uint8)
        # Lookup table indexed by raw byte value; every byte that is not part
        # of the alphabet keeps the `missing` code.
        self.encoding = np.zeros(256, dtype=np.uint8) + missing
        if encoding is None:
            self.encoding[self.chars] = np.arange(len(self.chars))
            self.size = len(self.chars)
        else:
            # Accept plain sequences as well as arrays, and keep `size` a
            # built-in int so it matches the default-encoding branch.
            encoding = np.asarray(encoding)
            self.encoding[self.chars] = encoding
            self.size = int(encoding.max()) + 1
        self.mask = mask
        if mask:
            self.size -= 1

    def __len__(self):
        """Return the alphabet size (number of distinct codes, minus one if masked)."""
        return self.size

    def __getitem__(self, i):
        """Return the ``i``-th character of the alphabet as a one-character string."""
        return chr(self.chars[i])

    def encode(self, x):
        """
        Encode a byte string into alphabet indices

        :param x: Amino acid string
        :type x: byte str
        :return: Numeric encoding
        :rtype: np.ndarray
        """
        x = np.frombuffer(x, dtype=np.uint8)
        return self.encoding[x]

    def decode(self, x):
        """
        Decode numeric encoding to byte string of this alphabet

        :param x: Numeric encoding
        :type x: np.ndarray
        :return: Amino acid string
        :rtype: byte str
        """
        string = self.chars[x]
        return string.tobytes()

    def unpack(self, h, k):
        """
        Unpack integer h into array of this alphabet with length k

        :param h: Integer whose base-``len(self)`` digits are the codes
        :type h: int
        :param k: Number of digits to extract
        :type k: int
        :return: The ``k`` lowest digits of ``h``, most significant first
        :rtype: np.ndarray
        """
        n = self.size
        kmer = np.zeros(k, dtype=np.uint8)
        # Peel off digits from least to most significant, filling the array
        # from the right so the result reads most-significant-first.
        for i in reversed(range(k)):
            h, c = divmod(h, n)
            kmer[i] = c
        return kmer

    def get_kmer(self, h, k):
        """
        Retrieve byte string of length k decoded from integer h

        :param h: Integer to unpack (see :meth:`unpack`)
        :type h: int
        :param k: Length of the k-mer
        :type k: int
        :return: Decoded k-mer
        :rtype: byte str
        """
        kmer = self.unpack(h, k)
        return self.decode(kmer)
# Nucleotide alphabet with the default positional encoding: A=0, C=1, G=2, T=3.
DNA = Alphabet(b'ACGT')
class Uniprot21(Alphabet):
    """
    Uniprot 21 Amino Acid Encoding.

    The 20 standard residues plus ``X`` receive codes 0-20 in the order
    ``ARNDCQEGHILKMFPSTWYVX``. The extra letters are encoded as synonyms:
    ``O`` shares the code of ``K``, ``U`` the code of ``C``, and ``B`` and
    ``Z`` the code of ``X``. Bytes outside the alphabet also map to the code
    of ``X`` (20).

    From `Bepler & Berger <https://github.com/tbepler/protein-sequence-embedding-iclr2019>`_.

    :param mask: Set encoding mask [default: False]
    :type mask: bool
    """

    def __init__(self, mask=False):
        chars = b'ARNDCQEGHILKMFPSTWYVXOUBZ'
        encoding = np.arange(len(chars))
        encoding[21:] = [11, 4, 20, 20]  # encode 'OUBZ' as synonyms
        super(Uniprot21, self).__init__(chars, encoding=encoding, mask=mask, missing=20)
class SDM12(Alphabet):
    """
    SDM12 reduced amino acid alphabet.

    Residues are merged into the following groups::

        A D KER N TSQ YF LIVM C W H G P

    In this implementation ``O`` is placed in the ``KER`` group, ``U`` in the
    ``C`` group, and ``X``, ``B`` and ``Z`` form an additional final group.

    See https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2732308/#B33
    "Reduced amino acid alphabets exhibit an improved sensitivity and
    selectivity in fold assignment" Peterson et al. 2009. Bioinformatics.

    :param mask: Set encoding mask [default: False]
    :type mask: bool
    """

    def __init__(self, mask=False):
        chars = b'ADKNTYLCWHGPXERSQFIVMOUBZ'
        groups = [b'A', b'D', b'KERO', b'N', b'TSQ', b'YF', b'LIVM', b'CU',
                  b'W', b'H', b'G', b'P', b'XBZ']
        # Map every member of a group to that group's index. Both the keys
        # here and the lookups below come from iterating byte strings, so
        # they are always of the same element type.
        group_of = {c: i for i, members in enumerate(groups) for c in members}
        encoding = np.array([group_of[c] for c in chars])
        super(SDM12, self).__init__(chars, encoding=encoding, mask=mask)
# Eight-symbol alphabet (H, B, E, G, I, T, S and the space character) with the
# default positional encoding 0-7.
# NOTE(review): presumably the DSSP 8-state secondary-structure codes, with
# space standing for "no assignment" -- confirm against callers.
SecStr8 = Alphabet(b'HBEGITS ')