Coverage for lingpy/compare/phylogeny.py: 94%

# -*- coding: utf-8 -*-
"""
Phylogeny-based detection of borrowings in lexicostatistical wordlists.
"""
# mpl is only used for specific plots, we can therefore make a safe import
try:  # pragma: no cover
    import matplotlib as mpl
    import matplotlib.pyplot as plt
except ImportError:
    mpl, plt = None, None
# import the geoplot module
try:  # pragma: no cover
    import mpl_toolkits.basemap as bmp
except ImportError:
    bmp = None
    log.missing_module('basemap')

try:  # pragma: no cover
    import scipy as sp
except ImportError:
    sp = None
    log.missing_module('scipy')
def get_gls(
        paps, taxa, tree, gpl=1, weights=(1, 1), push_gains=True,
        missing_data=0):
    """
    Calculate a gain-loss scenario.
    Parameters
    ----------
    paps : list
        A list containing the presence-absence patterns for all leaves of
        the reference tree. Presence is indicated by 1, and absence by 0.
        Missing characters are indicated by -1.
    taxa : list
        The list of taxa (the leaves of the tree).
    tree : str
        A tree in Newick format. Taxon names should (of course) be
        identical with the names in the list of taxa.
    gpl : int
        Gains per lineage. Specify the maximal amount of gains per lineage.
        One lineage is hereby defined as one path in the tree. If set to 0,
        only one gain per lineage is allowed; if set to 1, one additional
        gain is allowed, and so on. Use with care, since larger values lead
        to higher computation costs (more possibilities have to be taken
        care of) and can also be quite unrealistic.
    weights : tuple (default=(1, 1))
        Specify the weights for gains and losses. Setting this parameter to
        (2, 1) will penalize gain events with 2 and loss events with 1.
    push_gains : bool (default=True)
        Determine whether, among a set of equally parsimonious patterns,
        those should be retained that show gains closer to the leaves of
        the tree.
    missing_data : int (default=0)
        Determine how missing data should be represented. If set to 0
        (default), missing data will be treated in the same way as absent
        character states. If you want missing data to be accounted for in
        the algorithm, set this parameter to -1.
    Notes
    -----
    This is an enhanced version of the older approach to parsimony-based
    gain-loss mapping. The algorithm is much faster than the previous one,
    and the code is written much more clearly. In most tests run so far, it
    also outperformed other approaches by finding more parsimonious
    solutions.
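
    Examples
    --------
    A minimal sketch with made-up toy data; the taxon names, the tree, and
    the pattern below are purely illustrative::

        >>> paps = [1, 0, 1, 1]
        >>> taxa = ['German', 'English', 'Dutch', 'Frisian']
        >>> tree = "((German,Dutch),(English,Frisian));"
        >>> scenario = get_gls(paps, taxa, tree, gpl=1, weights=(1, 1))

    The returned scenario is a list of (node name, event) pairs in which 1
    marks an inferred gain and 0 an inferred loss at the respective node.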
""" # this line is just to make sure we actually copy the paps and don't change # them unwillingly
# get dictionary for taxa with their states
# get subtree for taxa with positive paps
# assign the scenarios: each scenario consists of the state of the node
# in the tree and a dictionary with the previous events, where the
# node-name is the key and the event (1, 0, -1) is the value
# return simple scenario if the group is single origin
# start iteration over outmost layer
# define new nodes list (to be appended to the new node)
*[scenarios[n.Name] for n in tree_node.Children]):
# get stories
# evaluate the states
# combine states if they evaluate to 1
# combine states if they evaluate to 0
# append the new combined scenarios to the dictionary
# if both evaluate to -1, also combine them
else:
# append both scenarios if there's both 1 and 0
# assuming origin, each node that has a 0 needs an extra origin
# evaluate the scenarios for consistency reasons, avoid appending
# scenarios with more than the allowed number of gains per lineage
# check scenarios having a loss in order to retrieve the scenario with
# the minimal weight, since once a loss is determined, the gains can be
# freely chosen
# do the same for scenarios having a gain, if multiple loss-models are
# encountered
# append lowest weights in gains to the list
# select the best of all scenarios by comparing all weights
# count the weights
list(s[1].items()))
# select the scenario with the highest number of gains, if the push-gains
# option is set to true
[x for x in tree.getNodeMatchingName(root).getTipNames() if x not in states1],))
winners[min(winners)], key=lambda x: [y[1] for y in x].count(1 if push_gains else 0))[0]
""" Basic class for calculations using the TreBor method.
    Parameters
    ----------
    dataset : string
        Name of the dataset that shall be analyzed.
    tree : {None, string}
        Name of the tree file.
    paps : string (default="pap")
        Name of the column that stores the specific cognate IDs, consisting
        of an arbitrary integer key and a key for the concept.
    ref : string (default="cogid")
        Name of the column that stores the general cognate ids (the
        "reference" of the analysis).
    tree_calc : {'neighbor', 'upgma'} (default='neighbor')
        Select the algorithm to be used for the tree calculation if no tree
        is passed with the file.
    missing : int (default=-1)
        Specify how missing data should be handled. If set to -1, missing
        data can account for both presence and absence of a cognate set in
        the given language. If set to 0, missing data is treated as
        absence.
    degree : int (default=100)
        The degree which is chosen for the projection of the tree layout.
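
    Examples
    --------
    A minimal usage sketch; the dataset file name below is hypothetical::

        >>> from lingpy.compare.phylogeny import PhyBo
        >>> phy = PhyBo('wordlist.qlc', ref='cogid')
        >>> phy.analyze(runs='default')
    """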
# XXX generally: find a way to check whether a dataset was already loaded, # XXX otherwise it takes too long a time to recalculate everything
def __init__(self, dataset, tree=None, paps='pap', ref='cogid',
             tree_calc='neighbor', output_dir=None, **keywords):
    # TODO check for keywords, allow to load trees, etc.
    util.setdefaults(
        keywords, degree=100, singletons=True, missing=-1,
        change=lambda x: x ** 1.5, start=0)
# check for cognates
if 'cognates' in keywords:
    log.deprecated('cognates', 'ref')
    ref = keywords['cognates']
# store the basename of the dataset without suffix and the identifier for paps if dataset_name[-4:] in ['.qlc', '.csv'] else dataset_name
# open csv-file of the data and store it as a word list attribute log.deprecated('csv', 'qlc') else: raise compat.FileNotFoundError("The input file could not be found.")
# check for glossid else: self._id2gl = {int(self[k, 'glid']): self[k, 'concept'] for k in self} self._gl2id = {self[k, 'concept']: int(self[k, 'glid']) for k in self}
# check for paps as attribute in the wordlist paps, ref + ',glid', lambda x, y: "{0}:{1}".format(x[y[0]], x[y[1]]))
# get the paps and the etymological dictionary
# get a list of concepts corresponding to the cogs and get the # singletons to be excluded from the calculation
# a dictionary with pap-key as key and concept as value
# list stores the singletons
# only calculate singletons if the option is chosen
# get the names of the concepts
# check for singletons
# create a list of keys for faster access when iterating
# summarize the cognate sets under their common concept
# Load the tree; if it is not defined, assume that the treefile has the
# same name as the dataset:
# try to load the tree first
# create it otherwise  # XXX TODO
# if it is explicitly defined, try to load that file
else:
# not hasattr(self, 'tree'):
# if no good topology is given, create it automatically, using # the radial layout function self.tree, filename='', degree=keywords['degree'], change=keywords['change'], start=keywords['start'] )
# create a couple of further attributes
"""A path within the output directory for the dataset.
Note: All intermediate directories will be created unless a keyword argument mkdir=False is passed.
:param comps: Path components relative to the output directory.
:return: The path.
"""
os.path.join(
    self._output_dir, os.path.basename(self.dataset) + '_phybo', *comps),
mkdir=kw.get('mkdir', True))
"""Write a file to the dataset-specific output directory.
:param name: Name of the file to be written relative to the output directory. :param content: Content to be written. :param log: Flag signalling whether there should be log output or not. """
return jsonlib.load(self.dataset + '.json')
# get the list of nodes that are not missing
""" Infer gain-loss scenario using the method by Dagan & Martin (2007).
""" # check for mode
# get list of taxa where pap is 1
# get the subtree containing all taxa that have positive paps [self.taxa[i] for i in range(len(self.taxa)) if pap[i] >= 1])
# assign the basic (starting) values to the dictionary
return [(tree.Name, 1)]
# store the scenario
# make the queue # get tree and counter from queue
# break if counter exceeds the mode [p for p in presents if p in tmp_tree.getTipNames()]) else: # store common names and children nodes
# store results for separate children
[p for p in presents if p in child.getTipNames()])
# check for tip names in subtrees
# evaluate the results
# check for identity and stop iteration if tips are identical
# otherwise check for intersection and a small amount of differences
# elif tmp_tree == tree.lowestCommonAncestor(commons):
# elif cSet.issubset(tSet) and len(tSet) - len(cSet) < len(tSet) / 2:
#     scenario += [(tmp_tree.lowestCommonAncestor(presents).Name, 1)]
# otherwise append the other results to the queue
if add2scenario[0].Parent == add2scenario[1].Parent:
    scenario += [(add2scenario[0].Parent.Name, 1)]
else:
# TODO fill the scenario with gaps
# order the internal nodes according to the number of their leaves subtree.nontips() + [subtree], key=lambda x: len(x.tips()))
# start bottom-up
# check for identity of states else:
def _get_GLS(self, pap, mode='w', r=(1, 1), gpl=1, push_gains=True,
             missing_data=0):
    """
    Calculate a gain-loss scenario (GLS) for a given PAP.
    Parameters
    ----------
    pap : list
        The presence/absence pattern of a given cognate-set.
    mode : str (default='w')
        The mode of the analysis; select between "w" (weights) and "r"
        (restriction).
    r : {tuple, int} (default=(1, 1))
        The weights (as a binary tuple) or the restriction (an integer);
        negative restrictions mark the maximal amount of losses.
    gpl : int (default=1)
        The maximal number of gains per lineage.
    push_gains : bool (default=True)
        Indicate whether gains should be pushed to the leaves or not.
    missing_data : int (default=0)
        Indicate how missing values should be represented in the paps. If
        set to 0, missing values will be treated as non-cognate words. If
        set to -1, missing values will be treated as potential cognates.
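
    Examples
    --------
    A minimal sketch, assuming ``phy`` is an initialized PhyBo instance
    and ``pap`` one of its presence-absence patterns (both hypothetical
    here)::

        >>> pap = phy.paps[sorted(phy.paps)[0]]
        >>> gls = phy._get_GLS(pap, mode='w', r=(2, 1))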
""" # make a dictionary that stores the scenario
# get the subtree containing all taxa that have positive paps [self.taxa[i] for i in range(len(self.taxa)) if pap[i] >= 1])
# assign the basic (starting) values to the dictionary
# calculate the initial restriction value (maximal weight). this is,
# roughly spoken, simply the minimal value of either all events being
# counted as origins (case 1), or assuming origin of the character at
# the root and counting all leaves that lost the character as single
# loss events (case 2). in case 2, the first gain of the character has
# to be added additionally

# get the maximal number of gains and losses; note that we have to
# include missing data in a two-fold fashion here. this is probably not
# the most feasible solution computationally, but it is the only way I
# can think of at the moment
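# A hypothetical worked example of this restriction value (all numbers
# made up for illustration): with weights r = (1, 1), five leaves showing
# the character and three leaves lacking it, case 1 costs 5 gain events,
# while case 2 costs 3 loss events plus the initial gain, i.e. 4 events;
# the initial restriction value would thus be min(5, 4) = 4.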
# get the first state of all nodes and store the state in the
# dictionary. note that we start from two distinct scenarios: one
# assuming single origin at the root, where all present states in the
# leaves are treated as retentions, and one assuming multiple origins,
# where all present states in the leaves are treated as origins
else:
# we append the maximally remaining possible number of gains and losses
# to the queue dictionary and decrease it steadily once two branches
# are merged as either loss or gain
# return simple scenario, if the group is single-origin
# order the internal nodes according to the number of their leaves
# join the nodes successively
# when dealing with multifurcating trees, we have to store all possible
# scenarios, i.e. we need to store the cross-product of all scenarios
# get the nodes with their states from the dictionary
# get the cross-product of the stuff
# combine the histories of the items if all have the same value; to do
# so, we first get the states in a simple list
# get the restriction values
# get the minimal gain and loss values
# calculate the restriction value
# NOTE: the following line contains a serious bug (a "Denkfehler", i.e.
# an error in reasoning); it works at the moment, but we should be very
# careful with this!
rst = min(maxGain * r[0] + r[1], maxLoss * r[1] + r[0])
else:
"... MaxG / MaxL / rst: {0} / {1} / {2}.".format(maxGain, maxLoss, rst)) # log.debug("... Stories: %s" % stories)
# combine the histories
# if states are identical and point to gain / presence of chars, we add
# them directly. here we also include the number of missing states: if
# missing states turn up, we simply treat them as presence values
# add the histories to the queue only if their weight is less than or
# equal to the maxWeight
else:
# when combining two gains, make sure that the allowed amount of gains
# per lineage will not be overwritten by the combination of new gains
# make sure to append a smaller restriction value, since we could spare
# one event due to the regular calculation
else:
# if gl.count(0) <= maxLoss:
# if states are identical and point to absence of chars, we assign them
# directly to the higher node. here, missing chars are also included
else:
# if states are both missing
# if the states are not identical, we check for both scenarios
else:
# the first scenario (tmpA) assumes origin, that is, for each node that
# has a 1, we add an origin to new_stories; the same holds for the loss
# scenario (tmpB)
# get the vectors to make it easier to retrieve the number of losses
# and gains
# check the gain-loss scores
weightA = glA.count(1) * r[0] + glA.count(0) * r[1]
weightB = glB.count(1) * r[0] + glB.count(0) * r[1]
# create the new nodes. note that we can only reduce the number of
# losses here by one, but not the possible number of gains, since the
# last gain will also score in our calculation, but we cannot predict
# whether a given gain is indeed the last one (or can we?)
# check for additional gains in the gain-scenario; according to the
# current model, we don't allow for one character to be gained twice
# along a branch, i.e. be gained by an ancestor, then get lost, and
# then be gained anew
d[node.Name])))
# try to find the best scenario by counting the ratio of gains and
# losses. the key idea here is to reduce the number of possible
# scenarios according to a given criterion. We choose the criterion of
# minimal changes as a first criterion to reduce the possibilities,
# i.e. we weight both gains and losses by 1 and select only those
# scenarios where gains and losses sum up to a minimal number of gains
# and losses. This pre-selection of scenarios can be further reduced by
# weighting gains and losses differently. So in a second stage we
# choose only those scenarios where there is a minimal amount of gains.
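# A sketch of this two-stage selection, with hypothetical variable names
# (``scenarios`` assumed to be a list of (node, event) lists):
#
#     changes = [len(sc) for sc in scenarios]
#     minimal = [sc for sc in scenarios if len(sc) == min(changes)]
#     best = min(minimal, key=lambda sc: sum(1 for _, ev in sc if ev == 1))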
# convert the specific format of the d[tree.Name] to simple format else:
# the tracer stores all scores
# calculate gains and losses
# calculate the score else:
# return the minimal indices, sort them according to the number of # gains inferred, thereby pushing gains to the root, similar to # Mirkin's (2003) suggestion best_gls, key=lambda x: sum([i[1] for i in x]), reverse=push_gains)[0]
# push gains down to the root as suggested by Mirkin 2003
# make sure to check the model with minimal amount of gains if sum([1 for x in line if x[1] == 1]) == minGains]
# calculate number of tips for the gains of a given scenario self.tree.getNodeMatchingName(taxon).getTipNames())
# store the graph gls, self.tgraph, self.tree, filename=self._output_path( 'gml', '{0}-{1}'.format(self.dataset, glm), cog))
d['graphics']['x'], d['graphics']['y'], d['graphics']['fill'], d['origin'], d['label']))
g.node[a]['graphics']['x'], g.node[b]['graphics']['x'], g.node[a]['graphics']['y'], g.node[b]['graphics']['y']))
# mpl.rc('text',usetex=keywords['usetex'])
if l.startswith('edge') or l.startswith('root'): else: x, y, l, horizontalalignment='center', verticalalignment='center', size=8, fontweight='bold', color='#ffffff' if f == '#000000' else '#000000', backgroundcolor=f)
# plt.subplots_adjust(left=0.02,right=0.98,top=0.98,bottom=0.02) 'gml', '{0}-{1}-figures'.format(self.dataset, glm), cog + '.' + fileformat))
# if tar is chosen, put it into a tarfile 'cd {0}_phybo/gml/ ; tar -pczf {0}-{1}.tar.gz {0}-{1}; cd ..; cd ..'.format( self.dataset, glm))
def get_GLS(self, mode='weighted', ratio=(1, 1), restriction=3,
            output_gml=False, output_plot=False, tar=False, **keywords):
    """
    Create gain-loss-scenarios for all non-singleton paps in the data.
    Parameters
    ----------
    mode : string (default="weighted")
        Select between "weighted", "restriction", and "topdown". The three
        modes refer to the following frameworks:
* "weighted" refers to the weighted parsimony framework described in :evobib:`List2014b` and :evobib:`List2014a`. Weights are specified with help of a ratio for the scoring of gain and loss events. The ratio can be defined with help of the *ratio* keyword. * "restrictino" refers to a simple method in which only a specific amount of gain events is allowed. The maximally allowed number of gain events can be defined with help of the *restriction* keyword. * "topdown" refers to the top-down method outlined in :evobib:`Dagan2007` and first applied to linguistic data in :evobib:`Nelson-Sathi2011`. This method also defines a maximal number of gain events, but in contrast to the "restriction" approach, it starts from the top of the tree and stops if the maximal number of restrictions has been reached. The maximally allowed number of gain events can, again, be specified with help of the *restriction* keyword. ratio : tuple (default=(1,1)) If "weighted" mode is selected, define the ratio between the weights for gains and losses. restriction : int (default=3) If "restriction" is selected as mode, define the maximal number of gains. output_gml : bool (default=False) If set to c{True}, the decisions for each GLS are stored in a separate file in GML-format. tar : bool (default=False) If set to c{True}, the GML-files will be added to a compressed tar-file. gpl : int (default=1) Specifies the maximal number of gains per lineage. This parameter specifies how cases should be handled in which a character is first gained, then lost, and then gained again. By setting this parameter to 1 (the default setting), such cases are prohibited, since only one gain per lineage is allowed. missing_data : int (default=0) Currently, we offer two ways to handle missing data. The first case just treats missing data in the same way in which the absence of a character is handled and can be evoked by setting this parameter to 0. The second case will treat missing data as either absent or present characters, based on how well each option coincides with the overall evolutionary scenario. This behaviour can be evoked by setting this parameter to -1. push_gains: bool (default=True) In bottom-up calculations, there will often be multiple scenarios upon which only one is selected by the method. In order to define consistent criteria for scenario selection, we follow :evobib:`Mirkin2003` in allowing to force the algorithm to prefer those scenarios in which gains are pushed to the leaves. This behaviour is handle by this parameter. Setting it to *True* will force the algorithm to push gain events to the leaves of the tree. Setting it to *False* will force it to prefer those scenarios where the gains are closer to the root.
"""
# define alias for mode else:
# create a named string for the mode
keywords, force=False, gpl=1, push_gains=True, missing_data=0)
# check for previous analyses log.info( "Gain-loss scenario {0} has already been calculated. For recalculation, " "set 'force' to 'True'.".format(glm)) return
# create statistics for this run
# store the statistics
# attribute stores all gls for each cog
# make a temporary hash in order to decrease the number of calls to the algorithm
# check whether cog has already been calculated skip += 1 log.debug( "Skipping already calculated pattern for COG {0}...".format(cog)) self.gls[glm][cog] = cogDict[cogTuple] else:
# check for singletons gls = [(self.taxa[self.paps[cog].index(1)], 1)] else: self.paps[cog], self.taxa, self.tree, gpl=keywords['gpl'], weights=ratio, push_gains=keywords['push_gains'], missing_data=keywords['missing_data'] )
self.paps[cog], r=restriction, mode='r', gpl=keywords['gpl'], push_gains=keywords['push_gains'], missing_data=keywords['missing_data'] )
self.paps[cog], mode=restriction, missing_data=keywords['missing_data'] )
# append new results to cogDict
# append scenario to gls
# write the results to file # if output of gls is chosen, load the gml-graph
# store some statistics as attributes [v[1] for v in self.gls[glm].values()] ) / len(self.gls[glm])
# store statistics and gain-loss-scenarios in textfiles
"{0}".format(cog), ','.join(["{0}:{1}".format(a, b) for a, b in gls]), text_type(noo)]) os.path.join('gls', '{0}-{1}.gls'.format(self.dataset, glm)), [util.tabjoin(line) for line in lines])
'Number of PAPs (total): {0}'.format(len(self.paps)), 'Number of PAPs (non-singletons): {0}'.format(len(self.gls[glm])), 'Number of Singletons: {0}'.format(len(self.singletons)), 'Average Number of Origins: {0:.2f}'.format(self.stats[glm]['ano']), 'Maximum Number of Origins: {0}'.format(self.stats[glm]['mno']), 'Mode: {0}'.format(mode), ] os.path.join('stats', '{0}-{1}'.format(self.dataset, glm)), lines)
""" Calculate the Contemporary Vocabulary Size Distribution (CVSD).
""" # -># define taxa and concept as attribute for convenience # ->taxa = self.taxa # ->concepts = self.concept #XXX do we need this? XXX
# -># calculate vocabulary size # ->forms = [] # ->meanings = [] # ->for taxon in taxa: # -> f = [x for x in set( # -> self.get_list(col=taxon,entry=self._pap_string,flat=True) # -> ) if x in self.cogs # -> ] # -> m = set([x.split(':')[1] for x in f]) # -> forms += [len(f)] # -> meanings += [len(m)] # -> # -># store the stuff as an attribute # ->self.dists['contemporary'] = [x for x,y in zip(forms,meanings)] # XXX taxa=t, entry=self._pap_string, flat=True) if p not in self.singletons))
""" Function retrieves all pap s for ancestor languages in a given tree. """
# check for already calculated glm # check for previous analyses "Gain-loss scenario {0} has already been calculated. For recalculation, " "set 'force' to 'True'.".format(glm))
p2p = dict(zip(paps, protos)) else:
# -> # define concepts for convenience
# -> concepts = self.concepts  # XXX do we need this? XXX
# ->
# -> # get all internal nodes, i.e. the nontips and also the root
# -> nodes = ['root'] + sorted(
# ->     [node.Name for node in self.tree.nontips()],
# ->     key=lambda x: len(self.tree.getNodeMatchingName(x).tips()),
# ->     reverse=True
# -> )
# ->
# -> # retrieve scenarios
# -> tmp = sorted([(a, b, c) for a, (b, c) in self.gls[glm].items()])
# -> cog_list = [t[0] for t in tmp]
# -> gls_list = [t[1] for t in tmp]
# -> noo_list = [t[2] for t in tmp]
# ->
# -> # create a list that stores the paps
# -> paps = [[0 for i in range(len(nodes))] for j in range(len(cog_list))]
# ->
# -> # iterate and assign values
# -> for i, cog in enumerate(cog_list):
# ->
# ->     # sort the respective gls
# ->     gls = sorted(
# ->         gls_list[i],
# ->         key=lambda x: len(self.tree.getNodeMatchingName(x[0]).tips()),
# ->         reverse=True
# ->     )
# ->
# ->     # retrieve the state of the root
# ->     if gls[0][1] == 1 and gls[0][0] == 'root':
# ->         state = 1
# ->     else:
# ->         state = 0
# ->
# ->     # assign the state of the root to all nodes
# ->     paps[i] = [state for node in nodes]
# ->
# ->     # iterate over the gls and assign the respective values to all
# ->     # children
# ->     # XXX note that here we assume that missing data is coded as
# ->     # 0, so this should probably be adapted XXX
# ->     for name, event in gls:
# ->         if event == 1:
# ->             this_state = 1
# ->         else:
# ->             this_state = 0
# ->
# ->         # get the subtree nodes
# ->         sub_tree_nodes = [node.Name for node in
# ->             self.tree.getNodeMatchingName(name).nontips()]
# ->
# ->         # assign this state to all subtree nodes
# ->         for node in sub_tree_nodes:
# ->             paps[i][nodes.index(node)] = this_state
# ->
# -> # get number of forms and number of meanings
# -> # extract cogs instead of numbers, XXX this can actually be done in
# -> # the step before, it's just for testing at the moment
# -> for i, cog in enumerate(cog_list):
# ->     for j, t in enumerate(paps[i]):
# ->         if t == 1:
# ->             paps[i][j] = cog
# ->         else:
# ->             pass
# ->
# -> # get forms and meanings
# -> forms = []
# -> meanings = []
# -> for i in range(len(paps[0])):
# ->     f = set([x[i] for x in paps if x[i] != 0])
# ->     m = set([x[i].split(':')[1] for x in paps if x[i] != 0])
# ->     forms += [len(f)]
# ->     meanings += [len(m)]
# ->
# -> # store the number of forms as an attribute
# -> self.dists[glm] = [x for x, y in zip(forms, meanings)]  # XXX
# ->
# -> # store results of the analyses, that is, all paps for each
# -> # ancestral node
# -> cogs = [k[self.header['pap']] for k in self._data.values()]
# ->
# -> # search for proto as keyword
# -> if keywords['proto']:
# ->     protos = [k[self.header[keywords['proto']]] for k in
# ->         self._data.values()]
# ->     cogs2proto = dict(zip(cogs, protos))
# -> else:
# ->     cogs2proto = dict(zip(cogs, cogs))
# ->
# -> # store data in acs attribute (ancestral cognate states)
# -> self.acs[glm] = {}
# -> for i, n in enumerate(nodes):
# ->     for j, p in enumerate(paps):
# ->         c = paps[j][i]
# ->         if c != 0:
# ->             m = self.pap2con[c]
# ->             p = cogs2proto[c]
# ->
# ->             if n != 'root':
# ->                 node = self.tree.getNodeMatchingName(n)
# ->                 node = n
# ->             else:
# ->                 node = n
# ->
# ->             try:
# ->                 self.acs[glm][node] += [(c, m, p)]
# ->             except:
# ->                 self.acs[glm][node] = [(c, m, p)]
""" Plot a tree in which the node size correlates with the size of the ancestral node. """ keywords, scaler=0.1, degree=180, change=lambda x: 2.5 * x, figsize=(10, 5), colormap=mpl.cm.jet, colors=True)
# check for the model
# create a dictionary for all nodes
# iterate over contemporary taxa first # get all cognates that are not singletongs col=taxon, flat=True, entry='pap') if x in self.singletons]))
# count the number of paps
# iterate over internal nodes now x not in self.taxa]:
keywords['colormap'](cfunc[int(tmp[node] * 244 / max(vsizes))]))
# check for filename in keywords
def get_IVSD(self, output_gml=False, output_plot=False, tar=True,
             leading_model=False, mixed_threshold=0.0, evaluation='mwu',
             **keywords):
    """
    Calculate the VSD on the basis of each item.
"""
# assign concept dict
# define concepts and taxa for convenience
# get all internal nodes, i.e. the nontips and also the root [node.Name for node in self.tree.nontips()], key=lambda x: len(self.tree.getNodeMatchingName(x).tips()), reverse=True)
# make dictionary that stores the best models for each cognate set
# make array for all nodes and a dict for the scenarios
# iterate over concepts # get paps
# add to list if value is missing tmp[taxon] = []
# calculate distribution for contemporary taxa
# calculate ancestral dists, get all paps first self.get_list(row=concept, entry=self._pap_string, flat=True) ) if i not in self.singletons]
# get the models models = [leading_model] + sorted( [k for k in self.gls.keys() if k != leading_model]) else:
m != 'mixed' and self._pvalues[m] >= mixed_threshold]
# get the scenarios # get the parent
# get paps of parent
# count number of paps
form in tmp_list]]
# calculate the best distribution; we can use averages for this purpose,
# since it seems that the kruskalwallis or mannwhitneyu test does not
# really apply to this kind of data with lots of small numbers XXX
zp_vsd.append((0, 0.0))
elif evaluation in ['average']:
    # check for best median and best average
    ave_cvsd = sum(cvsd) / len(cvsd)
    ave_avsd = sum(avsd) / len(avsd)
    score = abs(ave_cvsd - ave_avsd)
    zp_vsd.append((1, score))
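# A minimal sketch of the 'mwu' evaluation with made-up sample values
# (requires scipy; the numbers below are purely illustrative):
#
#     >>> from scipy.stats import mannwhitneyu
#     >>> cvsd, avsd = [12, 15, 9, 14, 11], [10, 13, 8, 12, 9]
#     >>> statistic, p_value = mannwhitneyu(cvsd, avsd)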
# extract p-values elif evaluation in ['average']: maxP = min(p_vsd)
# check for threshold
# if leading_model:
#     if True:  # maxP >= mixed_threshold:
#         maxIdx = p_vsd.index(maxP)
#         best_model = models[maxIdx]
#     else:
#         maxIdx = 0
#         best_model = leading_model
#         maxP = p_vsd[0]
# else:
# add sum to general model XXX start here XXX
# add to concepts
# print(sum([n for m,n,o in best_models.values()]) / len(best_models))
# append to distributions
# self.dists['mixed'] = all_avsd
# append to available models
# write the results to file # if output of gls is chosen, load the gml-graph self._plot('mixed', output_plot, tar, fileformat=kw['fileformat'])
# store some statistics as attributes [v[1] for v in self.gls['mixed'].values()] ) / len(self.gls['mixed'])
# store statistics and gain-loss-scenarios in textfiles
# write gls-data to folder self._output_path('gls', '{0}-{1}.gls'.format(self.dataset, "mixed")), log=False ) as f: "{0}\t".format(cog) + ','.join( ["{0}:{1}".format(a, b) for a, b in gls] ) + '\t' + str(noo) + '\n' )
""" Compute the ancestral character states (ACS) for all internal nodes.
""" keywords, proto=False, force=False, filename=self._output_path('acs-' + glm), fileformat='csv')
""" Compute an Minimal Lateral Network for a given model.
Parameters
----------
glm : str
    The dictionary key for the gain-loss-model.
threshold : int (default=1)
    The threshold used to exclude edges.
method : str (default='mr')
    Select the method for the MLN calculation. Choose between:

    * "mr": majority rule; multiple links are resolved by selecting
      those which occur most frequently
    * "td": tree distance; multiple links are resolved by selecting
      those which are closest on the tree
    * "bc": betweenness centrality; multiple links are resolved by
      selecting those which have the highest betweenness centrality
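
Examples
--------
A minimal sketch, assuming ``phy`` is a PhyBo instance for which
gain-loss scenarios have already been computed (hypothetical here)::

    >>> phy.get_MLN(phy.best_model, threshold=1, method='mr')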
"""
# make alias for tree and taxa for convenience
# get the topological graph
# make alias for the current gls for convenience
# create dictionary for inferred lateral events
# create out graph
# load data for nodes into new graph else:
# load edge data into new graph
# start to assign the edge weights # connect the origins with edges else:
scenarios.items(), desc='MLN-RECONSTRUCTION', total=len(scenarios)):
# calculate majority-rule edges # iterate over nodes nodeA, nodeB, weight=gPrm[nodeA][nodeB]['weight']) else: len(self.tree.getConnectingEdges('root', nodeB)) gPrm, normalized=True, weight='weight') # get the weighted degrees for the primary graph
# add all nodes as simple nodeA, nodeB, weight=0 if max_deg in [nodeA, nodeB] else 10)
# if the graph is not empty # check for identical weights and change them according to tree-distance
# check for identical weights and calculate the tree distance
# check whether there are more identical weights; if so, order everything
# according to branch length. we need a try-except statement for branch
# distances here, since cogent does not calculate distances to the root
# and back
try:
    branch_distance = len(self.tree.getConnectingEdges(a, b))
except:
    branch_distance = len(self.tree.getConnectingEdges(b, a))
branches += [(a, b, branch_distance)]
else:
# now change the weights according to the order branches, key=lambda x: (x[2], x[1], x[0]), reverse=True)
# change maximum weights to distance weights key=lambda x: x[2]['weight']):
# calculate the MST
# assign the MST-weights to gMST else:
# load data for nodes into new graph else:
# load edge data into new graph
# assign new edge weights
# write the inferred borrowing events (ILS, inferred lateral event) # between all taxa to file
text_type(cog) + '\t' + ','.join( ['{0}:{1}'.format(a, b) for a, b in events] ) + '\n')
# create file name for node labels (cytoscape output)
# add gOut to graphattributes
# write stats to file # get the degree
n, text_type(tree.getNodeMatchingName(n)), d, w)))
# write edge distributions
nA, nB, d['weight'], d['cogs'], tree.getNodeMatchingName(nA), tree.getNodeMatchingName(nB))))
self._output_path(os.path.join('taxa-' + glm, taxon + '.csv')), log=False ) as f:
# get the index of the current entry in dictionary representation
# get its real index
# include entries specified in keywords XXX modify later # for customization
""" Calculate Patchily Distributed Cognates. """
# get the origins [x[0] for x in gls if x[1] == 1], key=lambda x: len(self.tree.getNodeMatchingName(x).getTipNames()))
# get the tip-taxa for each origin
i + 1, [t for t in self.tree.getNodeMatchingName(ori).getTipNames() if t not in losses and t not in tipsofar]]
# now that all sets of origins with their tips are there, we store them
# in the patchy dictionary, where each taxon is assigned the numerical
# value of the given patchy distribution
else:
# get the index for the paps in the wordlist
# create a dictionary as updater for the wordlist
# update the wordlist
# write data to file # self.output('csv',filename=self.dataset+'_phybo/wl-'+glm) # XXX change later
# write ranking of concepts to file else: f.write('COGID\tGLID\tCONCEPT\tORIGINS\tREFLEXES\tORIG/REFL\n')
# check for number of occurrences
# append three vals: number of origins, number of words, and the # number of origins per number of words
# check for proto a1, a2, a3, b, len(l), b / float(len(l)), proto)) else: a1, a2, a3, b, len(l), float(b) / len(l)))
# write stats on concepts
# get origins per concept
# get patchy cognate sets per number of words
# get the number of words per concept
# concepts[key] = sum(concepts[key])/len(concepts[key])
# write average to file sum([cstats[c][0] for c in cstats]) / len(cstats), sum([cstats[c][1] for c in cstats]) / len(cstats), sum([cstats[c][2] for c in cstats]) / len(cstats) ))
# write alternative stats on concepts including information of # singletons (excluding them may bias the results)
sum([v[0] for v in value]), sum([v[1] for v in value]), sum([v[2] for v in value]) )
concepts.items(), key=lambda x: x[1][2], reverse=True): '{0}\t{1}\t{2}\t{3}\t{4:.2f}\n'.format(k, c, p, r, (p - c + 1) / r)) # write mean
"MEAN", mc, mp, mr, (mp - mc + 1) / mr))
# store params in attribute stats cognates=sum([x[0] for x in concepts.values()]) / self.height, patchies=sum([x[1] for x in concepts.values()]) / self.height, reflexes=sum([x[2] for x in concepts.values()]) / self.height, origins=sum([x[0] for x in cstats.values()]) / self.height, # reflexes = sum([x[1] for x in cstats.values()]) / self.height, patchy_per_reflex=sum([x[2] for x in cstats.values()]) / self.height)
# write results to alm-file # get all patchy cognates
# XXX change this later for more flexibility XXX else self[key, 'counterpart']
# write stuff to alm-file self._output_path(self.dataset + '-' + glm + '.alm.patchy'), log=False ) as f:
# get words and languages tmp[concept][pap][patchy]]
range(len(tmp[concept][pap][patchy]))]
# XXX add for different alignment algorithm later XXX
# get formatter for languages
formatter) + '}\t{1}\t|\t{2}\t|\t[{3}]\n' langs[i], patchies[i], '\t'.join(alms[i]), word))
""" Return the edge data for a given gain-loss model. """ # define a warning message # check for entryB
# get the graph locally for convenience
cogs = edge['cogs'].split(',') else: else: cogs = edge['cogs'].split(',')
# check whether nodes are in list or not if nodeA in self.taxa: nodesA = [nodeA] else: nodesA = self.tree.getNodeMatchingName(nodeA).getTipNames() if nodeB in self.taxa: nodesB = [nodeB] else: nodesB = self.tree.getNodeMatchingName(nodeB).getTipNames()
def assemble_output(nodes): out = defaultdict(list) for node in nodes: self.get_list(col=node, flat=True, entry='pap'), self.get_list(col=node, flat=True))) for cog in cogs:
for cog in cogs:
    try:
        output += [(self.pap2con[cog], outA[cog], outB[cog])]
    except:
        log.warn(
            "Error encountered in cognate {0}.".format(self.pap2con[cog]))
def analyze(self, runs="default", mixed=False, output_gml=False, tar=False,
            full_analysis=True, plot_dists=False, output_plot=False,
            plot_mln=False, plot_msn=False, **keywords):
    """
    Carry out a full analysis using various parameters.
    Parameters
    ----------
    runs : {str, list} (default="default")
        Define a couple of different models to be analyzed. Select between:
        * 'default': weighted analysis, using parsimony and weights for
          gains and losses
        * 'topdown': use the traditional approach by
          :evobib:`Nelson-Sathi2011`
        * 'restriction': use the restriction approach
You can also define your own mix of models.
    usetex : bool (default=True)
        Specify whether you want to use LaTeX to render plots.
    mixed : bool (default=False)
        If set to c{True}, calculate a mixed model by selecting the best
        model for each item separately.
    output_gml : bool (default=False)
        Set to c{True} in order to output every gain-loss-scenario in
        GML-format.
    full_analysis : bool (default=True)
        Specifies whether a full analysis is carried out or not.
    plot_mln : bool (default=True)
        Select or unselect output plot for the MLN.
    plot_msn : bool (default=False)
        Select or unselect output plot for the MSN.
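
    Examples
    --------
    A minimal sketch; the dataset file name below is hypothetical::

        >>> from lingpy.compare.phylogeny import PhyBo
        >>> phy = PhyBo('wordlist.qlc')
        >>> phy.analyze(runs='default', mixed=True, plot_dists=True)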
""" keywords, colorbar=None, # mpl.cm.jet, threshold=1, fileformat=rcParams['phybo_fileformat'], usetex=False, only=[], colormap=None, # mpl.cm.jet proto=False, xticksize=6, method='mr', # majority rule gpl=1, push_gains=True, missing_data=0, aligned_output=False, homoplasy=0.05, evaluation='mwu')
# define a default set of runs
if runs == 'default':
    runs = [('weighted', (5, 2)), ('weighted', (2, 1)),
            ('weighted', (3, 2)), ('weighted', (1, 1))]
elif runs == 'topdown':
    runs = [('topdown', 3), ('topdown', 4), ('topdown', 5), ('topdown', 6),
            ('topdown', 7), ('topdown', 8), ('topdown', 9), ('topdown', 10)]
elif runs == 'restriction':
    runs = [('restriction', 3), ('restriction', 4), ('restriction', 5),
            ('restriction', 6)]
"Analysing dataset with mode {0} and params {1}...".format(mode, params)) mode=mode, output_gml=output_gml, tar=tar, output_plot=output_plot, missing_data=keywords["missing_data"]) gpl=keywords['gpl'], push_gains=keywords['push_gains'], ratio=params) gpl=keywords['gpl'], push_gains=keywords['push_gains'], restriction=params) elif mode == 'topdown':
# calculate the different distributions # start by calculating the contemporary distributions
# now calculate the rest of the distributions
# compare the distributions using mannwhitneyu
self.dists['contemporary'], self.dists[m])
# determine the best model
# make a hash for all ps for all models
# set the best model
# calculate mixed model output_plot=output_plot, output_gml=output_gml, tar=tar, leading_model=glm, **keywords )
# set the mixed model as the best one
self.dists['contemporary'], self.dists['mixed'])
# write results to file '{0}\t{1:.2f}\t{2}\t{3}\n'.format( modes[i], self.stats[modes[i]]['ano'], self.stats[modes[i]]['mno'], '{0[0]}\t{0[1]:.4f}'.format(zp_vsd[i]) ) )
# plot the stats if this is defined in the settings # specify latex
# store distributions in lists
# store contemporary dists
# get the average number of origins
# create a sorter for the distributions s[0] for s in sorted(zip(range(len(modes)), ano), key=lambda x: x[1])]
# sort the stuff mode_strings = [m for m in modes]
# sort the zp-values
# format the zp-values
# adjust the modes mode_strings[i] = r'\textbf{' + modes[i] + '}' else: p_vsd.append('p$=${0:.2f}'.format(p)) else: else:
range(1, len(modes) + 2), [''] + ['{0}\n{1}'.format(m, p) for m, p in zip(mode_strings, p_vsd)], size=keywords['xticksize'], rotation=45, ha='center')
True, linestyle='-', which='major', color='lightgrey', alpha=0.5, zorder=1)
self.best_model, threshold=keywords['threshold'], method=keywords['method'])
self.best_model, filename=self._output_path('mln-' + glm), threshold=keywords['threshold'], fileformat=keywords['fileformat'], usetex=keywords['usetex'], colormap=keywords['colormap'] ) self.best_model, filename=self._output_path('msn-' + glm), fileformat=keywords['fileformat'], threshold=keywords['threshold'], only=keywords['only'], usetex=keywords['usetex'], colormap=keywords['colormap'] )
def plot_MLN(self, glm='', fileformat='pdf', threshold=1, usetex=False,
             taxon_labels='taxon_short_labels', alphat=False, alpha=0.75,
             **keywords):
    """
    Plot the MLN with help of Matplotlib.
    Parameters
    ----------
    glm : str (default='')
        Identifier for the gain-loss model that is plotted. Defaults to the
        model that had the best scores in terms of probability.
    filename : str (default='')
        If no filename is selected, the filename is identical with the
        dataset.
    fileformat : {'svg', 'png', 'jpg', 'pdf'} (default='pdf')
        Select the format of the output plot.
    threshold : int (default=1)
        Select the threshold for drawing lateral edges.
    usetex : bool (default=True)
        Specify whether you want to use LaTeX to render plots.
    colormap : {None, matplotlib.cm}
        A :py:class:`matplotlib.colormap` instance. If set to c{None}, this
        defaults to :py:class:`~matplotlib.cm.jet`.
    taxon_labels : str (default='taxon_short_labels')
        Specify the taxon labels that should be included in the plot.
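
    Examples
    --------
    A minimal sketch, assuming ``phy`` is an analyzed PhyBo instance
    (hypothetical here)::

        >>> phy.plot_MLN(phy.best_model, fileformat='png', threshold=1)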
""" # check for correct glm
# switch backend, depending on whether tex is used or not
keywords, figsize="optimal", # rcParams['phybo_figsize'], figure_width=10, figure_scale=1, colormap=mpl.cm.jet, filename=self.dataset, linescale=rcParams['phybo_linescale'], maxweight=rcParams['phybo_maxweight'], xlim=rcParams['phybo_xlim'], ylim=rcParams['phybo_ylim'], xlimr=rcParams['phybo_xlimr'], xliml=rcParams['phybo_xliml'], ylimt=rcParams['phybo_ylimt'], ylimb=rcParams['phybo_ylimb'], left=rcParams['phybo_left'], right=rcParams['phybo_right'], top=rcParams['phybo_top'], bottom=rcParams['phybo_bottom'], cbar_shrink=rcParams['phybo_cbar_shrink'], cbar_fraction=rcParams['phybo_cbar_fraction'], cbar_pad=rcParams['phybo_cbar_pad'], cbar_orientation=rcParams['phybo_cbar_orientation'], cbar_label=rcParams['phybo_cbar_label'], vedgestyle=rcParams['phybo_vedgestyle'], vedgecolor=rcParams['phybo_vedgecolor'], vedgelinewidth=rcParams['phybo_vedgelinewidth'], vedgeinnerline=rcParams['phybo_vedgeinnerline'], hedgescale=rcParams['phybo_hedgescale'], nodestyle=rcParams['phybo_nodestyle'], nodesize=rcParams['phybo_nodesize'], nodecolor=rcParams['phybo_nodecolor'], labels=rcParams['phybo_labels'], _prefix=rcParams['phybo_prefix'], _suffix=rcParams['phybo_suffix'], textsize=rcParams['phybo_textsize'], vsd_scale=rcParams['phybo_vsd_scale'], latex_preamble=rcParams['phybo_latex_preamble'], alpha_threshold=0.2)
# get max and min values for coordinates
keywords['figure_width'] + keywords['figure_scale'], h / (w / (keywords['figure_width'])))
self.get_list(col=taxon, flat=True, entry='pap') if x in self.cogs] # count the number of paps
x not in self.taxa]: else:
# store in internal and external nodes
# get colormap for edgeweights graph.edges(data=True) if d['label'] == 'horizontal']
# add max weight to edge_weights edge_weights += range(keywords['maxweight'])
# determine a colorfunction
# get the scale for the weights (needed for the line-width) scale = 1.0
# write colors and scale to graph colormap(cfunc[weights.index(w)]))
# get the nodes
else:
# store vertical and lateral edges
else: else: w = 0.0
# draw the horizontal edges [xA, xB], [yA, yB], '-', color=f, linewidth=float(w) / keywords['hedgescale'], alpha=a)
# draw the vertical edges [xA, xB], [yA, yB], '-', color=keywords['vedgecolor'], linewidth=keywords['vedgelinewidth']) [xA, xB], [yA, yB], '-', color='1.0', linewidth=keywords['vedgeinnerline']) # store x,y values for ylim,xlim drawing
x, y, 'o', markersize=s, # keywords['nodesize'], color=keywords['nodecolor'], zorder=200)
# this is a workaround to get the text away from the node else:
x, y, text, size=keywords['textsize'], verticalalignment='center', horizontalalignment=ha, fontweight='bold', color='black', rotation=r, rotation_mode='anchor', zorder=1)
figsp.imshow([[1, 2], [1, 2]], cmap=colormap, visible=False), ticks=[1, 1.25, 1.5, 1.75, 2], orientation=keywords['cbar_orientation'], shrink=keywords['cbar_shrink'], fraction=keywords['cbar_fraction'], pad=keywords['cbar_pad'])
# check for maxweights-keyword
text_type(min(weights)), '', text_type(int(max(weights) / 2)), '', text_type(max(weights))])
else:
else:
left=keywords['left'], right=keywords['right'], top=keywords['top'], bottom=keywords['bottom'] ) # fig.axes.get_xaxis().set_visible(False) # fig.axes.get_yaxis().set_visible(False)
# save the figure
def plot_MLN_3d(self, glm='', filename='', fileformat='pdf', threshold=1,
                usetex=True, colormap=None,  # mpl.cm.jet,
                taxon_labels='taxon_short_labels', alphat=False,
                alpha=0.75, **keywords):
    """
    Plot the MLN in 3d with help of Matplotlib.
    Parameters
    ----------
    glm : str (default='')
        Identifier for the gain-loss model that is plotted. Defaults to the
        model that had the best scores in terms of probability.
    filename : str (default='')
        If no filename is selected, the filename is identical with the
        dataset.
    fileformat : {'svg', 'png', 'jpg', 'pdf'} (default='pdf')
        Select the format of the output plot.
    threshold : int (default=1)
        Select the threshold for drawing lateral edges.
    usetex : bool (default=True)
        Specify whether you want to use LaTeX to render plots.
    colormap : {None, matplotlib.cm}
        A :py:class:`matplotlib.colormap` instance. If set to c{None}, this
        defaults to :py:class:`~matplotlib.cm.jet`.
    taxon_labels : str (default='taxon_short_labels')
        Specify the taxon labels that should be included in the plot.
"""
# check for correct glm
glm = self.best_model
raise ValueError("[i] You should select an appropriate model first.")
# switch backend, depending on whether tex is used or not
elif not usetex and backend != 'TkAgg':
    plt.switch_backend('TkAgg')
# set default, XXX change later
# try to load the configuration file
# check for 'taxon.labels' in conf else:
# get the graph
# store in internal and external nodes
# get colormap for edgeweights graph.edges(data=True) if d['label'] == 'horizontal']
# determine a colorfunction
# sort the weights
# get the scale for the weights (needed for the line-width)
# write colors and scale to graph colormap(cfunc[weights.index(w)]))
# get the nodes else: else: g['x'], g['y'], -g['z'], label, d['graphics'].get('angle', 0), g['s'], g['zorder'] ))
# store vertical and lateral edges
# get the edges
else: if alphat: a = 0.2 else: w = 0.0
# draw the horizontal edges ledges, key=lambda x: x[-2]): [xA, xB], [yA, yB], [zA, zB], color=f, linewidth=float(w) / 4, alpha=a, zorder=zorder # 100 # * abs(xA-xB) + 100 * abs(yA-yB) )
# draw the vertical edges [xA, xB], [yA, yB], [zA, zB], color='0.0', linewidth=3, alpha=0.75, zorder=zorder # 100 * abs(xA-xB) + 100 * abs(yA-yB) ) # store x,y values for ylim,xlim drawing
# draw the nodes
# draw the leaves, store x and y-maxima for ylim, xlim drawing
x, y, z, t, size='5', verticalalignment='center', horizontalalignment='center', bbox=dict( facecolor='white', boxstyle='square,pad=0.2', ec="none", # alpha = 0.25 ), fontweight='bold', color='black', zorder=zorder + 200 # 120 # * x + 100 * y )
""" Plot the Minimal Spatial Network.
Parameters
----------
glm : str (default='')
    A string that encodes which model should be plotted.
filename : str
    The name of the file to which the plot shall be written.
fileformat : str
    The output format of the plot.
threshold : int (default=1)
    The threshold for the minimal amount of shared links that shall be
    plotted.
usetex : bool (default=True)
    Specify whether LaTeX shall be used for the plot.
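
Examples
--------
A minimal sketch, assuming ``phy`` is an analyzed PhyBo instance with
coordinate data available (hypothetical; ``get_MSN`` computes the
network, ``plot_MSN`` draws it)::

    >>> phy.get_MSN(phy.best_model, threshold=1)
    >>> phy.plot_MSN(phy.best_model, fileformat='png')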
""" # check for correct glm glm = self.best_model raise ValueError("[i] You should select an appropriate model first.")
# redefine taxa and tree for convenience
# XXX check for coordinates of the taxa, otherwise load them from file and # add them to the wordlist XXX add later, we first load it from file else: coords = csv2dict(self.dataset, 'coords', dtype=[str, float, float])
# calculate all resulting edges, using convex hull as approximation
# get the labels
# first check, whether edge is horizontal # if both labels occur in taxa, it is simple # if only one in taxa, we need the convex hull for that node # check which node is in taxa
# check whether the nodes have the respective cognate # and take only those that have it col=other_node, entry='pap', flat=True)
# get the convex points of others (round(coords[t][0], 5), round(coords[t][1], 5)) for t in new_other_nodes]
# get the hull with the minimal euclidean distance np.array(hull) - np.array(coords[this_label]))) (round(this_hull[0], 5), round(this_hull[1], 5)))]
# append the edge to the graph += ',' + cog cogs=cog)
# get the convex points (round(coords[t][0], 5), round(coords[t][1], 5)) for t in newtaxA] (round(coords[t][0], 5), round(coords[t][1], 5)) for t in newtaxB]
np.array(hullA) - np.array(hullB)))
(round(minHulls[0][0], 5), round(minHulls[0][1], 5)))] (round(minHulls[1][0], 5), round(minHulls[1][1], 5)))]
# append the edge to the graph
# write stats to file # get the degree
g for g in geoGraph[taxon] if 'weight' in geoGraph[taxon][g]]
n, text_type(tree.getNodeMatchingName(n)), d, w))
# write edge distributions
nA, nB, d['weight'], d['cogs'], tree.getNodeMatchingName(nA), tree.getNodeMatchingName(nB)))
def plot_MSN(self, glm='', fileformat='pdf', threshold=1, usetex=False,
             alphat=False, alpha=0.75, only=[], **keywords):
    """
    Plot a minimal spatial network.
    """
    util.setdefaults(
        keywords, latex_preamble=[], figsize=(10, 10), colormap=mpl.cm.jet,
        filename=self.dataset, linescale=1.0, maxweight=False, xlim=5,
        ylim=5, xlimr=False, xliml=False, ylimt=False, ylimb=False,
        left=0.02, right=0.98, top=1.00, bottom=0.00, cbar_shrink=0.55,
        cbar_fraction=0.1, cbar_pad=0.1, cbar_orientation='vertical',
        cbar_label='Inferred Links', cbar_fontsize=10, resolution='l',
        table_text_color='black', water_color='0.2', lw=2, cmap_max=250,
        continent_color='0.9', projection='merc', legend_size=18,
        linewidth=4, min_lon=False, max_lon=False, min_lat=False,
        max_lat=False, table_column_width=[0.025, 0.1325],
        coastline_color="0.5", table_location=3,
        legend_location=[0.85, 0.02], table_cell_height=0.024,
        table_text_size=10, alpha=0.75, cmap_min=30, markersize=20)
# load the rc-file XXX add internal loading later
# switch backend, depending on whether tex is used or not
# check for preamble settings
# check for groups, add functionality for groups in qlc-file later XXX else:
# set the graph variable
# get the weights for the lines
# XXX check for coordinates of the taxa, otherwise load them from file and # add them to the wordlist XXX add later, we first load it from file else:
# determine the maxima of the coordinates
# instantiate the basemap llcrnrlon=min_lon + conf['min_lon'], llcrnrlat=min_lat + conf['min_lat'], urcrnrlon=max_lon + conf['max_lon'], urcrnrlat=max_lat + conf['max_lat'], resolution=conf['resolution'], projection=conf['projection'] )
# draw first values
# plot the lines
else: else:
# retrieve the coords
# get the points on the map
# plot the points [xA, xB], [yA, yB], '-', color=color, alpha=alpha, linewidth=linewidth, zorder=w + 50)
# plot the points for the languages
# check for taxon.labels in conf else: else:
sorted(coords.items(), key=lambda x: x[0])):
# retrieve x and y from the map
# get colors from conf taxon_marker = 'o'
x, y, taxon_marker, markersize=conf['markersize'], color=taxon_color, zorder=max_weight + 52) else: x, y, taxon_marker, markersize=conf['markersize'], color=taxon_color, zorder=max_weight + 52, label=gfunc(groups[taxon]))
# add number to celltext else:
# plot the text # check for darkness of color '0.1', '0.0']: else:
x, y, str(i + 1), size=str(int(conf['markersize'] / 2)), color=text_color, label=taxon, horizontalalignment='center', fontweight="bold", verticalalignment='center', zorder=max_weight + 55)
# add a colorbar cax, ticks=[1, 1.25, 1.5, 1.75, 2], orientation=keywords['cbar_orientation'], shrink=keywords['cbar_shrink'], fraction=keywords['cbar_fraction'], pad=keywords['cbar_pad'], ) [ str(min(weights)), '', str(int(max(weights) / 2)), '', str(max(weights)) ], fontsize=keywords['cbar_fontsize'] )
# add the legend cellText=cell_text, colWidths=conf['table_column_width'], loc=conf['table_location'], )
# adjust the table # this_table._cells[line]._text._fontproperties.set_size(conf['table.text.size'])
loc=conf['legend_location'], numpoints=1, prop={'size': conf['legend_size'], 'weight': 'bold'})
left=keywords['left'], right=keywords['right'], top=keywords['top'], bottom=keywords['bottom'])
def plot_two_concepts(self, concept, cogA, cogB,
                      labels={1: '1', 2: '2', 3: '3', 4: '4'},
                      tcolor={1: 'white', 2: 'black', 3: '0.5', 4: '0.1'},
                      filename='pdf', fileformat='pdf', usetex=True):
    """
    Plot the evolution of two concepts in space.
    Notes
    -----
    This function may be useful to contrast patterns of different words in
    geographic space.
"""
# XXX check for coordinates of the taxa, otherwise load them from file and # add them to the wordlist XXX add later, we first load it from file else:
# get the dictionary and the entry
# check for identical cogs and assign them to the 4 categories else:
# determine the maxima of the coordinates
# start to initialize the basemap
# instantiate the basemap llcrnrlon=min_lon + conf['min_lon'], llcrnrlat=min_lat + conf['min_lat'], urcrnrlon=max_lon + conf['max_lon'], urcrnrlat=max_lat + conf['max_lat'], resolution=conf['resolution'], projection=conf['projection'] )
# draw first values
# plot the points for the languages sorted(coords.items(), key=lambda x: x[0])):
# retrieve x and y from the map
# get the color of the given taxon # taxon_color = colors[groups[taxon]]
x, y, marker, markersize=conf['markersize'], color=tcolor[these_taxa[taxon]]) else: x, y, marker, markersize=conf['markersize'], color=tcolor[these_taxa[taxon]], label=labels[these_taxa[taxon]])
# add number to celltext else: cell_text.append([str(i + 1), taxon])
x, y, str(i + 1), size=str(int(conf['markersize'] / 2)), label=taxon, color='white' if tcolor[these_taxa[taxon]] == 'black' else 'black', horizontalalignment='center', verticalalignment='center')
cellText=cell_text, colWidths=conf['table.column.width'], loc=conf['table.location'])
# adjust the table conf['table.text.size']) this_table._cells[line].set_linewidth(0.0) this_table._cells[line].set_color(conf['table.cell.color'])
loc=conf['legend.location'], numpoints=1, prop={'size': conf['legend.size'], 'weight': 'bold'})
""" Plot the inferred scenarios for a given model. """
# store the graph gls, self.tgraph, self.tree, filename=self._output_path( 'gml', '{0}-{1}'.format(self.dataset, glm), cog))
# if plot of gml is chosen
o = d['origin']
xA = g.node[a]['graphics']['x'] yA = g.node[a]['graphics']['y'] yB = g.node[b]['graphics']['y']
# mpl.rc('text',usetex=keywords['usetex'])
f = '#a3a3a3' else: c = '#000000'
size = 20 else: size = 10 else: x, y, l, horizontalalignment='center', verticalalignment='center', size=8, fontweight='bold', color=c, backgroundcolor=f ) else: x, y, l, ha=s, va='baseline', size=8, fontweight='bold', color=c, rotation=r, rotation_mode='anchor', bbox=dict( facecolor='white', boxstyle='square,pad=0.25', ec="none", alpha=0.25 ), )
self._output_path( 'gml', '{0}-{1}-figures'.format(self.dataset, glm), '{0}-{1}.'.format(self.pap2con[cog], cog) + kw['fileformat'] ) )
""" Calculate basic statistics for a given gain-loss model. """ else: gains = [] # get the respective subset-item first
# check whether subset is as specified if item in subset[1] or item == subset[1]: gains += [self.gls[glm][cog][1]]
filename, 'Number of origins: {0:.2f}\nPercentage of patchy cogs {1:.2f}\n'.format(noo, ppc))
""" Plot the evolution of specific concepts along the reference tree. """ keywords, figsize=(15, 15), left=0.05, top=0.95, bottom=0.05, right=0.95, colormap=mpl.cm.jet, edgewidth=5, radius=2.5, outer_radius=0.5, inner_radius=0.25, cognates='', ref='', usetex=False, latex_preamble=False, textsize=8, subset=[])
# check for the correct item else:
# XXX customize later XXX
# start with the analysis
# switch backend, depending on whether tex is used or not
plt.switch_backend('pgf')
# check for preamble settings
# get all paps that are no singletons row=concept, flat=True, entry='pap') if p not in self.singletons]))
"No entries for concept {0} could be found, skipping the plot.".format( concept)) else: # get the number of paps in order to get the right colors range(len(paps))])
# get the wedges for the paps
# add stuff for the legend (0, 0), 1, wedges[pap][0], wedges[pap][1], facecolor=colors[pap], zorder=1, linewidth=2, edgecolor='black' ) idx = [x[0] for x in self.etd[pap] if x != 0][0] legendTextA += [self[idx, keywords['cognates']]] else:
# second legend explains evolution (0, 0), 1, 0, 360, facecolor='0.5', linewidth=2, edgecolor='black')
# overwrite stuff
# iterate over the paps and append states to the graph # get the graph with the model
# add a pap-dictionary if it's not already there
if 'pap' not in graph.node[n]:
    graph.node[n]['pap'] = {}
graph.node[n]['pap'][pap] = d['state']
# create the figure
# iterate over edges first
[xA, xB], [yA, yB], '-', color='black', linewidth=keywords['edgewidth'])
# now iterate over the nodes
# get z-value which serves as zorder attribute
# plot the default marker # check for origins in cpaps (x, y), keywords['radius'] + keywords['outer_radius'], 0, 360) if 'O' in cpaps.values(): facecolor='white', zorder=57 + z, linewidth=2.5, linestyle='dashed') facecolor='white', zorder=56 + z, linewidth=2.5, linestyle='solid') if 'L' in cpaps.values() and 'O' in cpaps.values(): facecolor='0.5', zorder=58 + z, linewidth=2.5, edgecolor='black', linestyle='dashed') facecolor='0.5', zorder=59 + z, linewidth=2.5, edgecolor='black')
wedge_args = ((x, y), keywords['radius'], theta1, theta2) facecolor=color, zorder=61 + z, linewidth=2, edgecolor='black' )
# check for characteristics of this pap wedge_kw.update(linestyle='dashed')
# add number for node x, y, n, size=keywords['textsize'], verticalalignment='baseline', backgroundcolor='white', horizontalalignment='center', fontweight='bold', color='black', bbox=dict( facecolor='white', boxstyle='square,pad=0.25', ec="none", alpha=1 ), zorder=300 )
numpoints=1)
left=keywords['left'], right=keywords['right'], top=keywords['top'], bottom=keywords['bottom']) 'items', '{0}-{1}'.format(self.dataset, glm), concept.replace('/', '_') + '.' + fileformat))