Coverage for lingpy/align/multiple.py : 94%

# -*- coding: utf-8 -*-
"""
Module provides classes and functions for multiple alignment analyses.
"""
from lingpy.sequence.sound_classes import (
    ipa2tokens, tokens2class, prosodic_string, prosodic_weights, pid,
)

"""
Basic class for multiple sequence alignment analyses.

Parameters
----------
seqs : list
    List of sequences that shall be aligned.

Notes
-----
Depending on the structure of the sequences, further keywords can be
specified that manage how the items get tokenized.
"""
# store input sequences, check whether tokens or strings are passed
else:
# define a tokenizer function for convenience
"diacritics": rcParams['diacritics'],
"vowels": rcParams['vowels'],
"tones": rcParams['tones'],
"combiners": rcParams['combiners'],
"breaks": rcParams['breaks'],
"stress": rcParams["stress"],
"merge_vowels": rcParams["merge_vowels"],
"unique_seqs": rcParams["unique_sequences"]
}
else:
# create a numerical representation of all sequences which reflects the
# order of both their position and the position of their tokens. Before
# this can be done, a tokenized version of all sequences has to be
# created

# check for pre-tokenized strings
# create a dictionary of all unique sequences. This is important, since
# identical sequences should only be counted once in an alignment, as
# they may otherwise disturb the analysis or slow it down
else:
    # no uniqueness filtering
    self.uniseqs = range(0, len(self.seqs))
# the length of an alignment is defined as the number of unique
# sequences present in the alignment
return self._length
# if alignments are present, print the alignments;
# otherwise, return all sequences
return False
""" Return specified values. """ return [x[idx[1]] for x in self.alm_matrix[idx[0]]] except: if idx[1] == 'w': return self.seqs[idx[0]] if idx[1] == 'c': return self.classes[idx[0]] if idx[1] == 't': return self.tokens[idx[0]] return self.alm_matrix[idx[0]]
""" Method returns specific values of the class, depending on the index which is used. """ # XXX this should be evaluated, maybe it is not needed in the future.
if value == '_numbers':
    return self._numbers[idxA][idxB]
if value == '_prosodics':
if value == 'tokens':
if value == 'sonars':
if value == 'numbers':
if value == 'classes':
if value == '_numbers':
self, model=None, classes=True, sonar=True, sonars=False,
scoredict={}):
"""
Method defines a specific class model for the calculation.
Parameters
----------
model : { None, ~lingpy.data.model.Model } (default=None)
    A sound class model.
"""
# check whether model is a string
# check for keyword classes
else:
# create the sound-classes or the fake classes
# once a class model is defined, there may be identical sequences
# which in IPA terms are different. In order to avoid computing
# alignments for these identical sequences, a dictionary is created
# which stores references to all identical sequences, thus allowing to
# compute only one alignment for each set of identical sequences
# create additional matrices for the internal representation of the
# class sequences
else:
# add the classes
range(len(self._classes[i]))] for i in range(self.height)]
# create an index which allows to quickly interchange between classes
# and given sequences (trivial without sequence uniqueness)
else:
    self.int2ext = {i: [i] for i in range(len(keys))}
# store sonars if they are passed as a list

# create sonars if the argument is true
map(lambda x: [int(t) for t in tokens2class(
    x, rcParams['art'], stress=rcParams['stress'])],
    [self.tokens[key] for key in keys]))
"Sequence {0} contains unrecognized characters!".format(
    self.seqs[self.int2ext[_i][0]]))

# do nothing if no arguments are passed
else:
# create a scoredict for the calculation of alignment analyses

# append the scorer if it is given with the model
enumerate(self._numbers), 2):
self._get(numA, '_classes'), self._get(numB, '_classes'))
""" Functions sets the scorer to the simple class model or to the library model. """
self, mode='global', gop=-2, scale=0.5, factor=0.3,
restricted_chars='T_', **keywords):
"""
Function calculates all pairwise alignments from the data.
"""
# create array for alignments
# create the distance matrix
# check for the mode: if sonority profiles are not chosen, take the
# simple alignment function
# get the weights
self._numbers, self._weights, self._prostrings, gop, scale, factor,
self.scorer, restricted_chars, mode)
else:
self._numbers, gop, scale, self.scorer, mode)
""" Method creates an extended library for alignments using the Tcoffee approach. """
# create library for non-sound-class approaches
for k, l in product(numA, numB):
    self.library[k, l] = 0.0
else:
# note that we somehow HAVE to include a sensitivity for V-C
# distinctions in the library mode, otherwise it may get complicated
# sometimes; therefore, the library is initialized by setting only the
# scores for c-c and v-v matches to 0, while the other scores keep
# their original penalty defined by the old scorer
enumerate(self._numbers), 2):
# see the comment above for the add-on in this line
else:
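A minimal standalone sketch of the initialization logic described in the
comment above (all names here are hypothetical, not lingpy internals):
same-type residue pairs (C-C, V-V) start neutral at 0.0, while mixed pairs
keep the penalty of the base scorer.

from itertools import product

def init_library(residues, scorer, is_vowel):
    # C-C and V-V matches are set to 0; all other pairs inherit the
    # (typically negative) score of the original scorer
    return {
        (a, b): 0.0 if is_vowel(a) == is_vowel(b) else scorer[a, b]
        for a, b in product(residues, repeat=2)
    }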
""" Extend the library by new alignments. """ # add the residue-pairs of all aligned sequences first for j in range(self.height) if i <= j]: # add the values to the library # the similarity score is determined by adding taking the # average of matrix score and the similarity score of the # alignment of both sequences len(self._alignments[i][j][0]))
# add the residue-pairs resulting from an alignment via a third sequence
# create the indices for the loop
(i, j, k) for i in range(self.height) for j in range(self.height)
for k in range(self.height) if i <= j and k != i and k != j)
# determine which of the values occur in both alignments with the
# third sequence
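For orientation, a hedged sketch of the triplet extension in the spirit of
T-Coffee (all names hypothetical): residue pairs supported by alignments
via a third sequence receive an additional score, conventionally the
minimum of the two alignment weights.

def extend_library(library, alm_ik, alm_kj, weight_ik, weight_kj):
    # alm_ik: aligned residue pairs (a, c) of sequences i and k;
    # alm_kj: aligned residue pairs (c, b) of sequences k and j
    for a, c1 in alm_ik:
        for c2, b in alm_kj:
            if c1 == c2:  # the value occurs in both alignments
                library[a, b] = library.get((a, b), 0.0) + min(
                    weight_ik, weight_kj)
    return library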
""" Create the guide tree using either the UPGMA or the Neighbor-Joining algorithm. """
# create the tree matrix
# carry out the clustering
else:
    'Method <' + tree_calc + '> for tree calculation not available.')
# create a newick representation of the guide tree
self.tree_matrix, [''.join(c) for c in self._classes], False))
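For illustration, a guide tree can also be computed directly from a
distance matrix (a sketch assuming the upgma function from
lingpy.algorithm.clustering; the matrix values are made up):

>>> from lingpy.algorithm.clustering import upgma
>>> matrix = [[0.0, 0.5, 0.8], [0.5, 0.0, 0.7], [0.8, 0.7, 0.0]]
>>> newick = upgma(matrix, ['seqA', 'seqB', 'seqC'])  # a Newick string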
self, almsA, almsB, mode='global', gop=-3, scale=0.5, factor=0,
gap_weight=0.5, return_similarity=False, iterate=False,
restricted_chars="T_"):
# calculate profile length and profile depth for both profiles
# create the weights by which the gap opening penalties will be modified
for char in line] for line in profileA]
for char in line] for line in profileB]
# get the consensus string for the sonority profiles
int(sum([k for k in col if k != 0]) /
    len([k for k in col if k != 0]) + 0.5) for col in sonarA]
int(sum([k for k in col if k != 0]) /
    len([k for k in col if k != 0]) + 0.5) for col in sonarB]
int(sum([k for k in col if k >= 0]) /
    len([k for k in col if k >= 0]) + 0.5) for col in sonarA]
int(sum([k for k in col if k >= 0]) /
    len([k for k in col if k >= 0]) + 0.5) for col in sonarB]
'', extra=dict(lines=[' '.join([str(x) for x in cons])
                      for cons in [consA, consB]]))
"Failed to compute the consensus string.", extra=dict(lines=[
    sonarA, sonarB,
    almsA[0], [self._get(n_, 'tokens') for n_ in almsA[0]],
    almsB[0], [self._get(n_, 'tokens') for n_ in almsB[0]]]))
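The column consensus computed above reduces to a small standalone helper
(a sketch; the first variant averages over non-zero values, the second
over non-negative ones):

def sonar_consensus(sonar_profile):
    # average the non-zero sonority values of each column and round to
    # the nearest integer; a column containing only zeros would raise a
    # ZeroDivisionError, which is what the error logging above reports
    consensus = []
    for col in sonar_profile:
        values = [k for k in col if k != 0]
        consensus.append(int(sum(values) / len(values) + 0.5))
    return consensus

>>> sonar_consensus([[7, 7, 0], [1, 2, 0]])
[7, 2]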
# carry out the alignment
profileA, profileB, weightsA, weightsB, prosA, prosB, gop, scale,
factor, self.scorer, restricted_chars, mode, gap_weight)
# trace the gaps inserted in both aligned profiles and insert them
# in the original profiles
# invert the profiles and the weight matrices by turning columns
# into rows and rows into columns
# return the aligned profiles and weight matrices
self, almsA, almsB, mode='global', gop=-3, scale=0.5, gap_weight=0.5,
return_similarity=False, iterate=False):
"""
Align profiles for tokens, not sound classes.
"""
# calculate profile length and profile depth for both profiles
# carry out the alignment
profileA, profileB, gop, scale, self.scorer, mode, gap_weight)
# trace the gaps inserted in both aligned profiles and insert them
# in the original profiles
# invert the profiles and the weight matrices by turning columns
# into rows and rows into columns
# return the aligned profiles and weight matrices
return profileA, profileB
self, mode='global', gop=-3, scale=0.5, factor=0, gap_weight=0.5,
restricted_chars='T_'):
# create the lists which will store the current stages of the
# alignment process
# start the iteration through the tree array: the first two lines
# in the matrix contain the ids of the sequences in the array,
# which are aligned along the tree
else:
alm_lst[m], alm_lst[n], mode=mode, gop=gop, scale=scale,
gap_weight=gap_weight, **kw)
# get the last stage of each alignment process
# restore the original order of the strings in the alignment
# create the matrix which stores all alignments
# calculate the sonority profile
for char in line] for line in tmp]
int(sum([k for k in col if k != 0]) /
    len([k for k in col if k != 0]) + 0.5) for col in sonars]
int(sum([k for k in col if k >= 0]) /
    len([k for k in col if k >= 0]) + 0.5) for col in sonars]
consensus = []
self.log.error("Failed to compute the consensus string.")
""" Carry out a progressive alignment analysis of the input sequences.
Parameters
----------
model : { "dolgo", "sca", "asjp" } (defaul="sca") A string indicating the name of the :py:class:`Model \ <lingpy.data.model>` object that shall be used for the analysis. Currently, three models are supported:
* "dolgo" -- a sound-class model based on :evobib:`Dolgopolsky1986`,
* "sca" -- an extension of the "dolgo" sound-class model based on :evobib:`List2012b`, and
* "asjp" -- an independent sound-class model which is based on the sound-class model of :evobib:`Brown2008` and the empirical data of :evobib:`Brown2011` (see the description in :evobib:`List2012`.
mode : { "global", "dialign" } (default="global") A string indicating which kind of alignment analysis should be carried out during the progressive phase. Select between:
* "global" -- traditional global alignment analysis based on the Needleman-Wunsch algorithm :evobib:`Needleman1970`,
* "dialign" -- global alignment analysis which seeks to maximize local similarities :evobib:`Morgenstern1996`.
gop : int (default=-2) The gap opening penalty (GOP) used in the analysis.
scale : float (default=0.5) The factor by which the penalty for the extension of gaps (gap extension penalty, GEP) shall be decreased. This approach is essentially inspired by the exension of the basic alignment algorithm for affine gap penalties :evobib:`Gotoh1982`.
factor : float (default=0.3) The factor by which the initial and the descending position shall be modified.
tree_calc : { "neighbor", "upgma" } (default="upgma") The cluster algorithm which shall be used for the calculation of the guide tree. Select between ``neighbor``, the Neighbor-Joining algorithm (:evobib:`Saitou1987`), and ``upgma``, the UPGMA algorithm (:evobib:`Sokal1958`).
guide_tree : tree_matrix Use a custom guide tree instead of performing a cluster algorithm for constructing one based on the input similarities. The use of this option makes the tree_calc option irrelevant.
gap_weight : float (default=0.5) The factor by which gaps in aligned columns contribute to the calculation of the column score. When set to 0, gaps will be ignored in the calculation. When set to 0.5, gaps will count half as much as other characters.
restricted_chars : string (default="T") Define which characters of the prosodic string of a sequence reflect its secondary structure (cf. :evobib:`List2012b`) and should therefore be aligned specifically. This defaults to "T", since this is the character that represents tones in the prosodic strings of sequences.
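
Examples
--------
A minimal usage sketch; the expected output shown here is the one
reported in the swap_check example below:

>>> msa = Multiple(["woldemort", "waldemar", "vladimir"])
>>> msa.prog_align()
>>> print(msa)
w o l - d e m o r t
w a l - d e m a r -
v - l a d i m i r -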
""" # set up the defaults parameters stored in the kw dictionary model=rcParams['model'], mode=rcParams['align_mode'], scale=rcParams['align_scale'], factor=rcParams['align_factor'], tree_calc=rcParams['align_tree_calc'], restricted_chars=rcParams['restricted_chars'], classes=rcParams['align_classes'], sonar=rcParams['align_sonar'], sonars=False, scoredict=rcParams['align_scorer'], gop=rcParams['align_gop'], gap_weight=rcParams['align_gap_weight'] )
# fixing a bug to avoid that defining models as strings will yield an error
# define the model for convenience
# create a string with the current parameters
'prog', model.name, str(kw['gop']), '{0:.1f}'.format(kw['scale']),
'{0:.1f}'.format(kw['factor']), kw['tree_calc'],
'{0:.1f}'.format(kw['gap_weight']), kw['restricted_chars']])
gop=kw['gop'], scale=kw['scale'], factor=kw['factor'], restricted_chars=kw['restricted_chars'])
else:
mode=kw['mode'], gop=kw['gop'], scale=kw['scale'], factor=kw['factor'], restricted_chars=kw['restricted_chars'], gap_weight=kw['gap_weight'])
""" Carry out a library-based progressive alignment analysis of the sequences.
Notes
-----
In contrast to traditional progressive multiple sequence alignment
approaches such as :evobib:`Feng1981` and :evobib:`Thompson1994`,
library-based progressive alignment :evobib:`Notredame2000` is based on
a pre-processing of the data where the information given in global and
local pairwise alignments of the input sequences is used to derive a
refined scoring function (*library*) which is later used in the
progressive phase.
Parameters
----------
model : { "dolgo", "sca", "asjp" } (default="sca") A string indicating the name of the :py:class:`Model \ <lingpy.data.model>` object that shall be used for the analysis. Currently, three models are supported:
* "dolgo" -- a sound-class model based on :evobib:`Dolgopolsky1986`,
* "sca" -- an extension of the "dolgo" sound-class model based on :evobib:`List2012b`, and
* "asjp" -- an independent sound-class model which is based on the sound-class model of :evobib:`Brown2008` and the empirical data of :evobib:`Brown2011` (see the description in :evobib:`List2012`.
mode : { "global", "dialign" } (default="global") A string indicating which kind of alignment analysis should be carried out during the progressive phase. Select between:
* "global" -- traditional global alignment analysis based on the Needleman-Wunsch algorithm :evobib:`Needleman1970`,
* "dialign" -- global alignment analysis which seeks to maximize local similarities :evobib:`Morgenstern1996`.
modes : list (default=[("global",-10,0.6),("local",-1,0.6)]) Indicate the mode, the gap opening penalties (GOP), and the gap extension scale (GEP scale), of the pairwise alignment analyses which are used to create the library.
gop : int (default=-5) The gap opening penalty (GOP) used in the analysis.
gep_scale : float (default=0.6) The factor by which the penalty for the extension of gaps (gap extension penalty, GEP) shall be decreased. This approach is essentially inspired by the exension of the basic alignment algorithm for affine gap penalties :evobib:`Gotoh1982`.
factor : float (default=1) The factor by which the initial and the descending position shall be modified.
tree_calc : { "neighbor", "upgma" } (default="upgma") The cluster algorithm which shall be used for the calculation of the guide tree. Select between ``neighbor``, the Neighbor-Joining algorithm (:evobib:`Saitou1987`), and ``upgma``, the UPGMA algorithm (:evobib:`Sokal1958`).
guide_tree : tree_matrix Use a custom guide tree instead of performing a cluster algorithm for constructing one based on the input similarities. The use of this option makes the tree_calc option irrelevant.
gap_weight : float (default=0) The factor by which gaps in aligned columns contribute to the calculation of the column score. When set to 0, gaps will be ignored in the calculation. When set to 0.5, gaps will count half as much as other characters.
restricted_chars : string (default="T") Define which characters of the prosodic string of a sequence reflect its secondary structure (cf. :evobib:`List2012b`) and should therefore be aligned specifically. This defaults to "T", since this is the character that represents tones in the prosodic strings of sequences.
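
Examples
--------
A minimal usage sketch, analogous to the progressive analysis:

>>> msa = Multiple(["woldemort", "waldemar", "vladimir"])
>>> msa.lib_align()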
""" # set up the defaults parameters stored in the kw dictionary model=rcParams['model'], mode=rcParams['align_mode'], modes=rcParams['align_modes'], scale=rcParams['align_scale'], factor=rcParams['align_factor'], tree_calc=rcParams['align_tree_calc'], restricted_chars=rcParams['restricted_chars'], classes=rcParams['align_classes'], sonar=rcParams['align_sonar'], scoredict=rcParams['align_scorer'], gop=rcParams['align_gop'], gap_weight=rcParams['align_gap_weight'], sonars=False)
# fixing a bug to avoid that defining models as strings will yield an error
# create a string with the current parameters
'lib', kw['model'].name, kw['mode'], '{0:.1f}'.format(kw['factor']),
kw['tree_calc'], '{0:.1f}'.format(kw['gap_weight']),
kw['restricted_chars']]
# append parameters to the params-string
kw['model'], kw['classes'], kw['sonar'], kw['sonars'], kw['scoredict'])
# start to create the library; note that scales and factors are set to
# zero here, since they are only useful in profile alignments. They
# eventually disturb pairwise alignments, which is why it is important
# to keep their influence low when creating the library from pairwise
# alignments
run[0], run[1], run[2], kw['factor'], kw['restricted_chars'])
kw['mode'], 0, 0.0, kw['factor'], kw['restricted_chars'])
else:
# merge the alignments; note that the scale doesn't really influence
# any of the results here: since gap scores are set to 0, gapping
# should be the same in all positions. The factor, however, eventually
# influences the score, since it changes character mappings as well
kw['mode'], 0, 0.0, 0, kw['gap_weight'], kw['restricted_chars'])
else:
    return self
""" Method reduces all columns from an MSA when there are only gaps. This method is important for the iterative procedures. """ # XXX new_msa = np.array(msa[:]) # XXX if list(new_msa[:,i]).count(gap) != len(new_msa):
# XXX new_msa = new_msa[:,no_gap_index].tolist()
""" Split an MSA into two parts and retain their indices. """ # XXX # create the inverted index
# get idxA
# XXX partA = self._reduce_gap_sites(self._alm_matrix[idxA])
# XXX partB = self._reduce_gap_sites(self._alm_matrix[idxB])
""" Join two aligned MSA by their index. """
# XXX out_alm = np.array(out_alm)
self, idx_list, mode='global', gop=-3, scale=0.5, factor=0.0,
gap_weight=0.5, check='final', restricted_chars='T_'):
"""
Split an MSA into two parts and realign them.
"""
# XXX .copy()
almA, almB, mode=mode, iterate=True, gop=gop, scale=scale, factor=factor, gap_weight=gap_weight)
new_sop = self.sum_of_pairs()
if new_sop < sop:
    self._alm_matrix = alm_matrix
else:
self._alm_matrix = alm_matrix
""" Calculate the sum-of-pairs score for a given alignment analysis.
Parameters
----------
alm_matrix : { "self", "other" } (default="self")
    Indicate for which MSA the sum-of-pairs score shall be calculated.

mat : { None, list }
    If "other" is chosen as an option for **alm_matrix**, define for
    which matrix the sum-of-pairs score shall be calculated.

gap_weight : float (default=0)
    The factor by which gaps in aligned columns contribute to the
    calculation of the column score. When set to 0, gaps will be ignored
    in the calculation. When set to 0.5, gaps will count half as much as
    other characters.
Returns
-------
The sum-of-pairs score of the alignment.
"""
else:
    alm_matrix = mat
else:
[line[i] for line in alm_matrix], [line[i] for line in alm_matrix], self.scorer, *args, **kw)
[line[i] for line in alm_matrix], [line[i] for line in alm_matrix], self.scorer, gap_weight=gap_weight, swap_penalty=swap_penalty)
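A hedged usage sketch of the sum-of-pairs score (run after an alignment
has been computed; higher scores indicate better alignments):

>>> msa = Multiple(["woldemort", "waldemar", "vladimir"])
>>> msa.prog_align()
>>> sop = msa.sum_of_pairs()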
self, check='final', mode='global', gop=-3, scale=0.5, factor=0,
gap_weight=1.0, restricted_chars='T_'):
"""
Iterate over the most divergent sequences in the sample.
Parameters
----------
check : string (default="final")
    Specify when to check for improved sum-of-pairs scores: after each
    iteration ("immediate") or after all iterations have been carried
    out ("final").
mode : { "global", "overlap", "dialign" } (default="global") A string indicating which kind of alignment analysis should be carried out during the progressive phase. Select between:
* "global" -- traditional global alignment analysis based on the Needleman-Wunsch algorithm :evobib:`Needleman1970`,
* "dialign" -- global alignment analysis which seeks to maximize local similarities :evobib:`Morgenstern1996`.
* "overlap" -- semi-global alignment, where gaps introduced in the beginning and the end of a sequence do not score.
gop : int (default=-5) The gap opening penalty (GOP) used in the analysis.
gep_scale : float (default=0.6) The factor by which the penalty for the extension of gaps (gap extension penalty, GEP) shall be decreased. This approach is essentially inspired by the exension of the basic alignment algorithm for affine gap penalties :evobib:`Gotoh1981`.
factor : float (default=0.3) The factor by which the initial and the descending position shall be modified.
gap_weight : float (default=0) The factor by which gaps in aligned columns contribute to the calculation of the column score. When set to 0, gaps will be ignored in the calculation. When set to 0.5, gaps will count half as much as other characters.
Notes
-----
The most divergent sequences are those whose average distance to all
other sequences is above the average distance of all sequence pairs.
See also
--------
Multiple.iterate_clusters
Multiple.iterate_similar_gap_sites
Multiple.iterate_all_sequences
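
Examples
--------
A hedged sketch; the method refines an existing alignment, so it is run
after a progressive analysis:

>>> msa = Multiple(["woldemort", "waldemar", "vladimir"])
>>> msa.prog_align()
>>> msa.iterate_orphans()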
"""
orphans, check=check, mode=mode, scale=scale, gop=gop, factor=factor, gap_weight=gap_weight, restricted_chars=restricted_chars)
self, threshold, check='final', mode='global', gop=-3, scale=0.5,
factor=0, gap_weight=1, restricted_chars='T_'):
"""
Iterative refinement based on a flat cluster analysis of the data.
Notes
-----
This method uses the :py:func:`lingpy.algorithm.clustering.flat_upgma`
function in order to retrieve a flat cluster of the data.
Parameters
----------
threshold : float
    The threshold for the flat cluster analysis.
check : string (default="final")
    Specify when to check for improved sum-of-pairs scores: after each
    iteration ("immediate") or after all iterations have been carried
    out ("final").
mode : { "global", "overlap", "dialign" } (default="global") A string indicating which kind of alignment analysis should be carried out during the progressive phase. Select between:
* 'global' -- traditional global alignment analysis based on the Needleman-Wunsch algorithm :evobib:`Needleman1970`,
* 'dialign' -- global alignment analysis which seeks to maximize local similarities :evobib:`Morgenstern1996`.
* 'overlap' -- semi-global alignment, where gaps introduced in the beginning and the end of a sequence do not score.
gop : int (default=-5) The gap opening penalty (GOP) used in the analysis.
gep_scale : float (default=0.6) The factor by which the penalty for the extension of gaps (gap extension penalty, GEP) shall be decreased. This approach is essentially inspired by the exension of the basic alignment algorithm for affine gap penalties :evobib:`Gotoh1981`.
factor : float (default=0.3) The factor by which the initial and the descending position shall be modified.
gap_weight : float (default=0) The factor by which gaps in aligned columns contribute to the calculation of the column score. When set to 0, gaps will be ignored in the calculation. When set to 0.5, gaps will count half as much as other characters.
See also
--------
Multiple.iterate_similar_gap_sites
Multiple.iterate_all_sequences
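
Examples
--------
A hedged sketch (the threshold value is illustrative):

>>> msa = Multiple(["woldemort", "waldemar", "vladimir"])
>>> msa.prog_align()
>>> msa.iterate_clusters(0.5)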
""" # don't calculate this if there are less than 5 sequences
# create the clusters
clusters.values(), check=check, mode=mode, scale=scale, gop=gop, factor=factor, gap_weight=gap_weight, restricted_chars=restricted_chars)
self, check='final', mode='global', gop=-3, scale=0.5, factor=0,
gap_weight=1, restricted_chars='T_'):
"""
Iterative refinement based on the *Similar Gap Sites* heuristic.
Notes
-----
This heuristic is fairly simple. The idea is to try to split a given
MSA into partitions with identical gap sites.
Parameters
----------
check : { "final", "immediate" } (default="final") Specify when to check for improved sum-of-pairs scores: After each iteration ("immediate") or after all iterations have been carried out ("final").
mode : { "global", "overlap", "dialign" } (default="global") A string indicating which kind of alignment analysis should be carried out during the progressive phase. Select between:
* 'global' -- traditional global alignment analysis based on the Needleman-Wunsch algorithm :evobib:`Needleman1970`,
* 'dialign' -- global alignment analysis which seeks to maximize local similarities :evobib:`Morgenstern1996`.
* 'overlap' -- semi-global alignment, where gaps introduced in the beginning and the end of a sequence do not score.
gop : int (default=-5) The gap opening penalty (GOP) used in the analysis.
gep_scale : float (default=0.5) The factor by which the penalty for the extension of gaps (gap extension penalty, GEP) shall be decreased. This approach is essentially inspired by the exension of the basic alignment algorithm for affine gap penalties :evobib:`Gotoh1982`.
factor : float (default=0.3) The factor by which the initial and the descending position shall be modified.
gap_weight : float (default=1) The factor by which gaps in aligned columns contribute to the calculation of the column score. When, e.g., set to 0, gaps will be ignored in the calculation. When set to 0.5, gaps will count half as much as other characters.
See also
--------
Multiple.iterate_clusters
Multiple.iterate_all_sequences
Multiple.iterate_orphans
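
Examples
--------
A hedged sketch, run after a progressive analysis:

>>> msa = Multiple(["woldemort", "waldemar", "vladimir"])
>>> msa.prog_align()
>>> msa.iterate_similar_gap_sites()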
"""
list(self.gap_dict.values()), check=check, mode=mode, scale=scale, gop=gop, factor=factor, gap_weight=gap_weight)
self, check="final", mode="global", gop=-3, scale=0.5, factor=0, gap_weight=1, restricted_chars="T_"): """ Iterative refinement based on a complete realignment of all sequences.
Notes
-----
This method essentially follows the iterative method of
:evobib:`Barton1987` with the exception that an MSA has already been
calculated.
Parameters
----------
check : { "final", "immediate" } (default="final") Specify when to check for improved sum-of-pairs scores: After each iteration ("immediate") or after all iterations have been carried out ("final").
mode : { "global", "overlap", "dialign" } (default="global") A string indicating which kind of alignment analysis should be carried out during the progressive phase. Select between:
* "global" -- traditional global alignment analysis based on the Needleman-Wunsch algorithm :evobib:`Needleman1970`,
* "dialign" -- global alignment analysis which seeks to maximize local similarities :evobib:`Morgenstern1996`.
* "overlap" -- semi-global alignment, where gaps introduced in the beginning and the end of a sequence do not score.
gop : int (default=-5) The gap opening penalty (GOP) used in the analysis.
gep_scale : float (default=0.5) The factor by which the penalty for the extension of gaps (gap extension penalty, GEP) shall be decreased. This approach is essentially inspired by the exension of the basic alignment algorithm for affine gap penalties :evobib:`Gotoh1981`.
factor : float (default=0.3) The factor by which the initial and the descending position shall be modified.
gap_weight : float (default=0) The factor by which gaps in aligned columns contribute to the calculation of the column score. When set to 0, gaps will be ignored in the calculation. When set to 0.5, gaps will count half as much as other characters.
See also
--------
Multiple.iterate_clusters
Multiple.iterate_similar_gap_sites
Multiple.iterate_orphans
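
Examples
--------
A hedged sketch of a full realignment-based refinement:

>>> msa = Multiple(["woldemort", "waldemar", "vladimir"])
>>> msa.prog_align()
>>> msa.iterate_all_sequences()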
""" [[i] for i in range(self.height)], check=check, mode=mode, scale=scale, gop=gop, factor=factor, gap_weight=gap_weight, restricted_chars=restricted_chars)
""" Calculate the profile score for each column of the alignment.
Parameters
----------
gap_weight : float (default=0)
    The factor by which gaps in aligned columns contribute to the
    calculation of the column score. When set to 0, gaps will be ignored
    in the calculation. When set to 0.5, gaps will count half as much as
    other characters.
Returns
-------
peaks : list
    A list containing the profile scores for each column of the given
    alignment.
Examples
--------
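A hedged sketch, assuming the method is exposed as ``get_peaks`` (the
name is inferred from the Returns section above):

>>> msa = Multiple(["woldemort", "waldemar", "vladimir"])
>>> msa.prog_align()
>>> peaks = msa.get_peaks()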
""" calign.score_profile( [k[i] for k in self._alm_matrix], [k[i] for k in self._alm_matrix], self.scorer, gap_weight=gap_weight) for i in range(len(self._alm_matrix[0])) ]
""" Return all peaks in a given alignment.
Parameters
----------
threshold : { int, float } (default=2)
    The threshold to determine whether a given column is a peak or not.
gap_weight : float (default=0.0)
    The weight for gaps.
"""
""" Function creates a dictionary of all pairwise alignments scores.
Parameters
----------
new_calc : bool (default=True)
    Specify whether the analysis should be repeated from the beginning,
    or whether the results of previously conducted analyses should be
    reused.
model : string (default="sca")
    A string indicating the name of the :py:class:`Model
    <lingpy.data.model>` object that shall be used for the analysis.
    Currently, three models are supported:
* "dolgo" -- a sound-class model based on :evobib:`Dolgopolsky1986`,
* "sca" -- an extension of the "dolgo" sound-class model based on :evobib:`List2012b`, and
* "asjp" -- an independent sound-class model which is based on the sound-class model of :evobib:`Brown2008` and the empirical data of :evobib:`Brown2011` (see the description in :evobib:`List2012`.
mode : string (default="global")
    A string indicating which kind of alignment analysis should be
    carried out during the progressive phase. Select between:

    * "global" -- traditional global alignment analysis based on the
      Needleman-Wunsch algorithm :evobib:`Needleman1970`,
    * "dialign" -- global alignment analysis which seeks to maximize
      local similarities :evobib:`Morgenstern1996`.
gop : int (default=-3)
    The gap opening penalty (GOP) used in the analysis.

scale : float (default=0.5)
    The factor by which the penalty for the extension of gaps (gap
    extension penalty, GEP) shall be decreased. This approach is
    essentially inspired by the extension of the basic alignment
    algorithm for affine gap penalties :evobib:`Gotoh1982`.

factor : float (default=1)
    The factor by which the initial and the descending position shall be
    modified.

gap_weight : float (default=0)
    The factor by which gaps in aligned columns contribute to the
    calculation of the column score. When set to 0, gaps will be ignored
    in the calculation. When set to 0.5, gaps will count half as much as
    other characters.

restricted_chars : string (default="T")
    Define which characters of the prosodic string of a sequence reflect
    its secondary structure (cf. :evobib:`List2012b`) and should
    therefore be aligned specifically. This defaults to "T", since this
    is the character that represents tones in the prosodic strings of
    sequences.
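
Examples
--------
A hedged sketch, assuming the method is exposed as
``get_pairwise_alignments`` (the name is inferred from the description
above):

>>> msa = Multiple(["woldemort", "waldemar", "vladimir"])
>>> msa.get_pairwise_alignments(new_calc=True)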
""" keywords, new_calc=True, model=rcParams['sca'], mode='global', gop=-3, scale=0.5, factor=1, restricted_chars='T_', classes=True, sonar=True, scorer={})
# define the class model
keywords['model'], keywords['classes'], keywords['sonar'],
keywords['scorer'])
# reset the scorer to "classes"
# retrieve the alignments keywords['mode'], keywords['gop'], keywords['scale'], keywords['factor'], keywords['restricted_chars'])
# get the score of the alignment
# retrieve the numeric tokens
# append values to dictionary
dotjoin(idx + 1, m.split('.')[1]), 'tokens'))
else:
# if new_calc is not chosen, the PID of an alignment will be returned;
# beware to calculate the PID only for unique sequences in order to
# save time and memory

# get the score of the alignment
""" Return the Percentage Identity (PID) score of the calculated MSA.
Parameters
----------
mode : { 1, 2, 3, 4, 5 } (default=1)
    Indicate which of the four possible PID scores described in
    :evobib:`Raghava2006` should be calculated; the fifth possibility is
    added for linguistic purposes:
    1. identical positions / (aligned positions + internal gap
       positions),
    2. identical positions / aligned positions,
    3. identical positions / shortest sequence,
    4. identical positions / shortest sequence (including internal gap
       positions), or
    5. identical positions / (aligned positions + 2 * number of gaps).
Returns
-------
score : float
    The PID score of the given alignment as a floating point number
    between 0 and 1.
See also
--------
lingpy.sequence.sound_classes.pid
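
Examples
--------
A hedged sketch (run after an alignment; mode=1 is the default):

>>> msa = Multiple(["woldemort", "waldemar", "vladimir"])
>>> msa.prog_align()
>>> score = msa.get_pid(mode=1)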
"""
# create a dictionary of unique sequences
""" Create a dictionary which lists all strings with a similar gap structure and their index. The gap structure is marked in the key of the dictionary, where '1' refers to gapped sites and '0' refers to ungapped sites. The values of the dictionary are a list of integers referring to the position of the sequences having this structure in the MSA. """
""" A dictionary storing the different gap profiles of an MSA as keys and the indices of the corresponding sequences as values. """
""" Return an array of the gap-profiles of an alignment. @return: An array, representing the gap profiles as integers (0 indicates characters and 1 indicates gaps). @rtype: C{scipy.array} """
""" The condition for swaps to possibly occur in the alignment. These are the complementary sites in the alignment, which are extracted from the gap array. """ [line[i] for line in gap_array], [line[i + 2] for line in gap_array] )]:
""" Carry out a check for swapped regions. """ # [i] We define two versions of the possibly swapped region, a first # ... one, where the original alignment is unchanged, and a second one, # ... where the alignment is shifted, i.e. the gaps are switched.
# [i] shift the gap of the first and third matrix
# determine in which direction to turn by counting the number of chars
# in all cols
# [i] unswap the possibly swapped columns by shifting values
# ... unequal to a gap and leaving a special symbol (+) which will
# ... account for the penalty for a swap.
if matA[i][ind] != 'X':
    pass
elif matA[i][ind + 2] != 'X':
else:
else:
    pass
# [i] apply the same procedure to the unshifted matrix
else:
    if matB[i][ind] != 'X':
        matB[i][ind] = '+'
    else:
# [i] calculate normal and new sum-of-pairs scores, convert to integers
# ... in order to guarantee the accuracy of the comparison of
# ... sop-scores
matA, gap_weight=gap_weight, swap_penalty=swap_penalty)
matB, gap_weight=gap_weight, swap_penalty=swap_penalty)
lines=[[self._get(x, '_classes') for x in line] for line in matA] + \
      [msaA] + \
      [[self._get(x, '_classes') for x in line] for line in matA] + \
      [msaB] + [msaAB, msa]))
# return True if the newly calculated sop-score is greater than the previous one
""" Check for possibly swapped sites in the alignment.
Parameters
----------
swap_penalty : { int, float } (default=-3)
    Specify the penalty for swaps in the alignment.
score_mode : { "classes", "library" } (default="classes") Define the score-mode of the calculation which is either based on sound classes proper, or on the specific scores derived from the library approach.
Returns
-------
result : bool
    Returns ``True`` if a swap was identified, and ``False`` otherwise.
    The information regarding the position of the swap is stored in the
    attribute ``swap_index``.
Notes
-----
The method for swap detection is described in detail in
:evobib:`List2012b`.
Examples
--------
Define a set of strings whose alignment contains a swap.
>>> from lingpy import *
>>> mult = Multiple(["woldemort", "waldemar", "vladimir"])
Align the data, using the progressive approach.
>>> mult.prog_align()
Check for swaps.
>>> mult.swap_check()
True
Print the alignment.

>>> print(mult)
w o l - d e m o r t
w a l - d e m a r -
v - l a d i m i r -
"""
else:
# check for incompatible swaps. This is a temporary solution, since it
# might be better to check the BEST swaps and not only to discard them
# in a linear order
pprint=False):
"""
A short-cut method for multiple alignment analyses.
Parameters
----------
seqs : list
    The input sequences.
gop : int (default=-1)
    The gap opening penalty.
scale : float (default=0.5)
    The scaling factor by which penalties for gap extensions are
    decreased.
tree_calc : { "upgma", "neighbor" } (default="upgma")
    The algorithm which is used for the calculation of the guide tree.
pprint : bool (default=False)
    Indicate whether results shall be printed onto screen.
Returns
-------
alignments : list
    A two-dimensional list in which alignments are represented as a
    list of tokens.
Examples
--------
>>> m = mult_align(["woldemort", "waldemar", "vladimir"], pprint=True)
w o l - d e m o r t
w a l - d e m a r -
- v l a d i m i r -
""" classes=False, sonar=False, gop=gop, tree_calc=tree_calc, scale=scale, scoredict=scoredict or {})