## Source code for biskit.match2seq

## numpy-oldnumeric calls replaced by custom script; 09/06/2016
## Automatically adapted for numpy-oldnumeric Mar 26, 2007 by alter_code1.py

##
## Biskit, a toolkit for the manipulation of macromolecular structures
## Copyright (C) 2004-2016 Raik Gruenberg & Johan Leckner
##
## This program is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 3 of the
## License, or any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You find a copy of the GNU General Public License in the file
## license.txt along with this program; if not, write to the Free
## Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
##
##
"""
Match 2 sequences against each other, deleting all positions that differ.
compareModels() compares the sequences of 2 structures and returns
a residue mask for each of them.
"""

## see: https://www.python.org/dev/peps/pep-0366/
## Allow the relative imports below to work when this module is executed
## directly as a script (e.g. for testing): in that case __package__ is not
## set, so the parent package is imported and declared explicitly.
if __name__ == "__main__" and __package__ is None:
    import biskit
    __package__ = "biskit"

from .core import oldnumeric as N0
from . import tools as T
from .core.difflib_old import SequenceMatcher

import numpy as N


def getOpCodes( seq_1, seq_2 ):
    """
    Compare two sequences and report the edit operations needed to turn
    the first sequence into the second one.

    Both inputs are joined into plain strings before they are handed to
    SequenceMatcher.

    :param seq_1: list of single letters
    :type  seq_1: [ str ]
    :param seq_2: list of single letters
    :type  seq_2: [ str ]

    :return: opcodes as returned by SequenceMatcher.get_opcodes()::
             [('delete', 0, 1, 0, 0), ('equal', 1, 4, 0, 3),
              ('insert', 4, 4, 3, 4), ('equal', 4, 180, 4, 180)]
    :rtype: [tuples]
    """
    first = ''.join( seq_1 )
    second = ''.join( seq_2 )

    matcher = SequenceMatcher( None, first, second )
    return matcher.get_opcodes()
def getSkipLists( seqDiff ):
    """
    Extract the regions that have to be removed from sequence 1
    ('delete' opcodes) and from sequence 2 ('insert' opcodes).
    Opcodes with any other tag (e.g. 'equal' or 'replace') are ignored.
    Regions are reported in the format (start_pos, length).

    :param seqDiff: opcodes in the format (tag, i1, i2, j1, j2)
    :type  seqDiff: [tuples]

    :return: Lists of tuples containing regions of the sequences that
             should be deleted. Example::
               strucDel_1 = [(0, 1), (180, 4)]
               strucDel_2 = [(3, 1), (207, 4)]
    :rtype: [tuple], [tuple]
    """
    strucDel_1 = []
    strucDel_2 = []

    for opcode in seqDiff:
        tag = str( opcode[0] )

        if tag == 'delete':
            ## residues to be deleted in sequence 1: span i1..i2
            strucDel_1.append( (opcode[1], opcode[2] - opcode[1]) )
        elif tag == 'insert':
            ## residues to be deleted in sequence 2: span j1..j2
            strucDel_2.append( (opcode[3], opcode[4] - opcode[3]) )

    return strucDel_1, strucDel_2
def getEqualLists( seqDiff ):
    """
    Extract the regions in which both sequences are equal ('equal'
    opcodes). Regions are reported in the format (start_pos, length).

    :param seqDiff: opcodes in the format (tag, i1, i2, j1, j2)
    :type  seqDiff: [tuples]

    :return: Lists of tuples containing regions of the sequences that
             are equal. Example::
               strucEqual_1 = [(0, 216)]
               strucEqual_2 = [(0, 216)]
    :rtype: [tuple], [tuple]
    """
    strucEqual_1 = []
    strucEqual_2 = []

    for opcode in seqDiff:
        if str( opcode[0] ) == 'equal':
            ## the same match, expressed in coordinates of sequence 1 and 2
            strucEqual_1.append( (opcode[1], opcode[2] - opcode[1]) )
            strucEqual_2.append( (opcode[3], opcode[4] - opcode[3]) )

    return strucEqual_1, strucEqual_2
def expandRepeatsLeft( s, start, end, length=1 ):
    """
    Identify sequence repeats on the left edge of s[start:end].

    The fragment is extended to the left as long as the items directly
    preceding it are equal to the same number of items at its beginning.
    The search is iterative, so long repeat runs cannot exhaust the
    interpreter's recursion limit.

    NOTE(review): only the very first extension step uses `length`; all
    following steps compare single items. This is the long-standing
    behaviour of this function (expandRepeatsRight keeps using `length`)
    and is preserved here -- confirm whether the asymmetry is intended.

    :param s: input sequence (string or list of letters)
    :type  s: str or [str]
    :param start: start position of the fragment
    :type  start: int
    :param end: end position (exclusive) of the fragment
    :type  end: int
    :param length: repeat length tested in the first extension step
    :type  length: int

    :return: new start position of the fragment (<= start for
             non-negative length)
    :rtype: int
    """
    step = length

    while start - step >= 0 and s[ start-step : start ] == s[start:end][0 : step]:
        start -= step
        step = 1   ## after the first hit, continue in single-item steps

    return start
def expandRepeatsRight( s, start, end, length=1 ):
    """
    Identify sequence repeats on the right edge of s[start:end].

    The fragment is extended to the right, in steps of `length` items, as
    long as the `length` items directly following it are equal to its last
    `length` items. The search is iterative, so long repeat runs cannot
    exhaust the interpreter's recursion limit.

    :param s: input sequence (string or list of letters)
    :type  s: str or [str]
    :param start: start position of the fragment
    :type  start: int
    :param end: end position (exclusive) of the fragment
    :type  end: int
    :param length: length of the repeat unit to look for
    :type  length: int

    :return: new end position (exclusive) of the fragment (>= end)
    :rtype: int
    """
    ## a repeat unit of zero (or negative) length can never extend the
    ## fragment; without this guard a matching empty comparison would
    ## never terminate
    if length <= 0:
        return end

    while end + length <= len(s) and s[ end : end+length ] == s[start:end][-length:end]:
        end += length

    return end
def expandRepeats( s, start, size ):
    """
    Grow a fragment of a larger sequence so that it also covers any
    sequence repetitions bordering its left or right edge.

    Example: ABC[BC]CCCDE -> A[BCBCCCC]DE

    The idea is to avoid alignment mismatches caused by duplications.
    A short and a long sequence such as the following can be aligned in
    several ways, for example::

       A--BC---DE      AB----C-DE
       ABCBCCCCDE  or  ABCBCCCCDE

    It is not known for sure which positions of the longer sequence
    should be kept and which ones should be deleted. The most conservative
    approach is therefore to remove the whole ambiguous fragment.

    All repeat lengths from 0 up to the fragment size are tried on both
    edges; the left-most start and the right-most end found are returned.

    :param s: input string
    :type  s: str
    :param start: start position of text fragment
    :type  start: int
    :param size: size of text fragment
    :type  size: int

    :return: start and size of expanded fragment
    :rtype: (int, int)
    """
    end = start + size
    repeat_lengths = range( size + 1 )

    leftmost = min( [ expandRepeatsLeft( s, start, end, l )
                      for l in repeat_lengths ] )
    rightmost = max( [ expandRepeatsRight( s, start, end, l )
                       for l in repeat_lengths ] )

    return leftmost, rightmost - leftmost
def getEqual( seqAA, seqNr, equalList ):
    """
    Get only the positions of the sequences that are equal according
    to the OpCodes. This should not be necessary but might be useful
    to skip 'replace' OpCodes.

    :param seqAA: list with the amino acid sequence in one letter code
    :type  seqAA: [str]
    :param seqNr: list with the amino acid positions
    :type  seqNr: [int]
    :param equalList: list of (start_pos, length) tuples describing the
                      regions of the sequence that are equal
    :type  equalList: [tuple]

    :return: lists of amino acids and positions where equal
    :rtype: [str], [int]
    """
    equalSeqAA = []
    equalSeqNr = []

    ## collect the residues and positions of every equal region, in order
    for equal in equalList:
        first = equal[0]
        last = equal[0] + equal[1]

        equalSeqAA.extend( seqAA[first:last] )
        equalSeqNr.extend( seqNr[first:last] )

    return equalSeqAA, equalSeqNr
def del2mask( seq, *delpos ):
    """
    Convert (start, size) delete positions into a mask of 0 or 1.

    The mask starts out as all ones (one entry per item of seq); every
    position covered by one of the delete regions is set to 0.

    :param seq: sequence the mask is created for (only its length is used)
    :type  seq: str or [str]
    :param delpos: any number of (start, size) tuples
    :type  delpos: (int, int)

    :return: mask with 0 at all positions that should be deleted
    :rtype: array
    """
    mask = N0.ones( len( seq ) )

    for offset, count in delpos:
        doomed = range( offset, offset + count )
        mask.put( doomed, 0 )

    return mask
def compareSequences( seqAA_1, seqAA_2 ):
    """
    Compare two sequences and create, for each of them, a mask that is 1
    at every position that can be matched to the other sequence.

    The comparison runs in two passes: (1) regions reported as 'delete' or
    'insert' are expanded to cover neighbouring repeats (see
    expandRepeats) and removed from both sequences; (2) the remaining
    sequences are compared again and only positions within 'equal' regions
    are kept.

    :param seqAA_1: first sequence in one letter code
    :type  seqAA_1: str or [str]
    :param seqAA_2: second sequence in one letter code
    :type  seqAA_2: str or [str]

    :return: one mask per input sequence (same length as the sequence),
             1 marks positions that are retained
    :rtype: array, array
    """
    residues_1 = list( seqAA_1 )
    residues_2 = list( seqAA_2 )
    n_1 = len( residues_1 )
    n_2 = len( residues_2 )

    ## original position of every residue; filtered along with the residues
    positions_1 = list( range( n_1 ) )
    positions_2 = list( range( n_2 ) )

    ## pass 1: remove insertions / deletions including bordering repeats
    opcodes = getOpCodes( residues_1, residues_2 )
    skip_1, skip_2 = getSkipLists( opcodes )

    skip_1 = [ expandRepeats( residues_1, *pos ) for pos in skip_1 ]
    skip_2 = [ expandRepeats( residues_2, *pos ) for pos in skip_2 ]

    keep_1 = del2mask( residues_1, *skip_1 )
    keep_2 = del2mask( residues_2, *skip_2 )

    residues_1 = N0.compress( keep_1, residues_1 ).tolist()
    positions_1 = N0.compress( keep_1, positions_1 ).tolist()
    residues_2 = N0.compress( keep_2, residues_2 ).tolist()
    positions_2 = N0.compress( keep_2, positions_2 ).tolist()

    ## pass 2: of what is left, keep only the regions that are equal
    opcodes = getOpCodes( residues_1, residues_2 )
    equal_1, equal_2 = getEqualLists( opcodes )

    residues_1, positions_1 = getEqual( residues_1, positions_1, equal_1 )
    residues_2, positions_2 = getEqual( residues_2, positions_2, equal_2 )

    ## translate the surviving original positions into masks
    mask_1 = N0.zeros( n_1 )
    mask_2 = N0.zeros( n_2 )
    N0.put( mask_1, positions_1, 1 )
    N0.put( mask_2, positions_2, 1 )

    return mask_1, mask_2
def compareModels( model_1, model_2 ):
    """
    Compare the sequences of two structure objects. The sequence of each
    model is fetched with its sequence() method and both are handed to
    compareSequences().

    :param model_1: model
    :type  model_1: PDBModel
    :param model_2: model
    :type  model_2: PDBModel

    :return: tuple of masks for model_1 and model_2 (one entry per
             sequence position), as returned by compareSequences()::
               e.g. ( [0001011101111111], [1110000111110111] )
    :rtype: ([1|0...],[1|0...])
    """
    ## get the sequence of each model
    sequences = ( model_1.sequence(), model_2.sequence() )

    return compareSequences( *sequences )
############# ## TESTING ############# import biskit.test as BT class Test(BT.BiskitTest): """Test case""" def test_match2seq(self): """match2seq test""" ## Reading pdb files lig_traj = T.load( T.testRoot() + '/lig_pcr_00/traj.dat' )[:2] m = [ m.compress( m.maskProtein() ) for m in lig_traj ] ## make the models different m[1].removeRes(['ALA']) mask1, mask2 = compareModels( m[0], m[1] ) if self.local: print('Reading and comparing two models') print('\nResidue masks to make the two maodels equal') print('mask1\n', mask1) print('mask2\n', mask2) globals().update( locals() ) self.assertTrue( N.all(mask1 == self.EXPECT[0] ) ) self.assertTrue( N.all(mask2 == self.EXPECT[1] ) ) def test_sequenceRepeats(self): """match2seq sequence repeat test""" seq1 = 'ABCDEFG~~~~~~~~~~~~~~~' seq2 = '~~~~~' mask1, mask2 = compareSequences( seq1, seq2 ) self.assertTrue( N.all( mask1 == N0.zeros( len(seq1 ) )) ) self.assertTrue( N.all( mask2 == N0.zeros( len(seq2 ) )) ) EXPECT = N0.array([1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1],N0.Int),\ N0.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],N0.Int) if __name__ == '__main__': BT.localTest()