Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

""" 

Module offers methods to handle colexification patterns in wordlist objects. 

""" 

from collections import defaultdict
from itertools import combinations

import networkx as nx

from lingpy import log
from lingpy.util import combinations2, dotjoin, join

 

def _get_colexifications(wordlist, entry='ipa', concept='concept', family='family'):
    """
    Collect all colexification records from a wordlist.

    Two concepts are colexified in a language when the same form (the value
    of ``entry``) expresses both of them.

    Returns
    -------
    colexifications : list
        Tuples of ``(concept1, concept2, taxon, family, form)``.
    """
    # fall back to the doculect column when no family annotation is present
    if family not in wordlist.header:
        family = 'doculect'

    records = []
    for taxon in wordlist.cols:
        log.info('Analyzing taxon {0}...'.format(taxon))

        idxs = wordlist.get_list(taxon=taxon, flat=True)
        taxon_family = wordlist[idxs[0], family]
        concepts = wordlist.get_list(taxon=taxon, flat=True, entry=concept)
        forms = wordlist.get_list(taxon=taxon, flat=True, entry=entry)

        # identical forms expressing two distinct concepts are colexifications
        for (i, concept_a), (j, concept_b) in combinations2(enumerate(concepts)):
            if forms[i] == forms[j] and concept_a != concept_b:
                records.append((concept_a, concept_b, taxon, taxon_family, forms[i]))

    return records

 

 

def _get_statistics(wordlist, entry='ipa', concept='concept', family='family'):
    """
    Assemble data on occurrence frequency of each concept in the data.

    Returns
    -------
    statistics : dict
        A dictionary with concepts as primary keys and dictionaries as values.
        The dictionary-values themselves yield various statistics, including
        the number of language families in which each concept occurs
        ('familyOcc'), the number of languages ('doculectOcc'), and the
        respective words themselves.
    """
    # fall back to the doculect column when no family annotation is present
    if family not in wordlist.header:
        family = 'doculect'

    statistics = defaultdict(lambda: defaultdict(list))

    # first pass: gather raw witnesses per concept
    for idx in wordlist:
        concept_label = wordlist[idx, concept]
        statistics[concept_label]['families'].append(wordlist[idx, family])
        statistics[concept_label]['doculects'].append(wordlist[idx, 'taxon'])
        statistics[concept_label]['words'].append(wordlist[idx, entry])

    # second pass: derive counts and deduplicated, sorted listings
    for concept_label in statistics:
        entry_stats = statistics[concept_label]
        entry_stats['familyOcc'] = len(set(entry_stats['families']))
        entry_stats['doculectOcc'] = len(set(entry_stats['doculects']))
        entry_stats['wordOcc'] = len(entry_stats['words'])
        entry_stats['families'] = sorted(set(entry_stats['families']))
        entry_stats['doculects'] = sorted(set(entry_stats['doculects']))

    return statistics

 

 

def _get_colexifications_by_taxa(colexifications): 

colex = defaultdict(set) 

for c1, c2, t, fam, entry in colexifications: 

colex[t].add((c1, c2)) 

colex[t].add((c2, c1)) 

return colex 

 

 

def _make_matrix(taxa, colex): 

""" 

Take colexification data and use it to create a distance matrix. 

 

Notes 

----- 

"colex" is a dictionary with taxon names as keys and colexification data in 

form of tuples of concepts, not necessarily ordered, in both directions, as 

values. 

""" 

# calculate the matrix 

matrix = [[0 for i in range(len(colex))] for j in range(len(colex))] 

for (i, t1), (j, t2) in combinations2(enumerate(taxa)): 

intersection = colex[t1].intersection(colex[t2]) 

union = colex[t1].union(colex[t2]) 

matrix[i][j] = matrix[j][i] = 1 - len(intersection) / len(union) 

return matrix 

 

 

def _make_graph(colexifications, bipartite=False):
    """
    Return a graph-object from colexification data.

    Parameters
    ----------
    colexifications : list
        Tuples of ``(concept1, concept2, taxon, family, word)`` as produced
        by :func:`_get_colexifications`.
    bipartite : bool (default=False)
        If False, build a concept graph: nodes are concepts and an edge links
        two concepts colexified somewhere, with edge attributes aggregating
        the witnessing families, doculects and words.  If True, build a
        bipartite graph in which each word token is a node linked to the
        concept nodes it expresses.

    Returns
    -------
    G : networkx.Graph
    """
    G = nx.Graph()

    if not bipartite:
        for c1, c2, t, f, entry in colexifications:
            try:
                # edge already present: extend the witness lists
                G[c1][c2]['families'] += [f]
                G[c1][c2]['doculects'] += [t]
                G[c1][c2]['words'] += [entry]
            except KeyError:
                # was a bare ``except:`` which also masked unrelated errors;
                # only a missing edge should trigger edge creation
                G.add_node(c1, ntype='concept')
                G.add_node(c2, ntype='concept')
                G.add_edge(c1, c2, families=[f], doculects=[t], words=[entry])
        # derive weights and deduplicated listings per edge
        for a, b, d in G.edges(data=True):
            d['familyWeight'] = len(set(d['families']))
            d['wordWeight'] = len(d['words'])
            d['doculectWeight'] = len(set(d['doculects']))
            # NOTE(review): deduplicated families go under the singular key
            # 'family' while the raw list stays in 'families'; downstream
            # consumers (e.g. GML output) may rely on this asymmetry —
            # confirm before unifying the keys.
            d['family'] = sorted(set(d['families']))
            d['doculects'] = sorted(set(d['doculects']))
    elif bipartite:
        for idx, (c1, c2, t, f, entry) in enumerate(colexifications):
            # unique word-node id of the form <taxon>.<running number>
            nindex = dotjoin(t, idx + 1)
            try:
                G[nindex][c1]['weight'] += 1
                G[nindex][c2]['weight'] += 1
            except KeyError:
                G.add_node(nindex, ntype='word', entry=entry, doculect=t, family=f)
                G.add_node(c1, ntype='concept')
                G.add_node(c2, ntype='concept')
                G.add_edge(nindex, c1, weight=1)
                G.add_edge(nindex, c2, weight=1)
    return G

 

 

def colexification_network(
        wordlist,
        entry='ipa',
        concept='concept',
        output='',
        filename='network',
        bipartite=False,
        **keywords):
    """
    Calculate a colexification network from a given wordlist object.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        The wordlist object containing the data.
    entry : str (default="ipa")
        The reference point for the language entry. We use "ipa" as a default.
    concept : str (default="concept")
        The reference point for the name of the row containing the concepts.
        We use "concept" as a default.
    output : str (default='')
        If output is set to "gml", the resulting network will be written to a
        textfile in GML format.
    filename : str (default="network")
        Basename of the output file when ``output`` is used.
    bipartite : bool (default=False)
        Passed through to the graph construction; see the helper that builds
        the graph.

    Returns
    -------
    G : networkx.Graph
        A networkx.Graph object when ``output`` is empty; otherwise ``None``.
    """
    # gather the raw colexification tuples and per-concept statistics
    colexifications = _get_colexifications(wordlist, entry, concept)
    stats = _get_statistics(wordlist, entry, concept)

    graph = _make_graph(colexifications, bipartite=bipartite)

    # attach the per-concept statistics to the concept nodes
    for node, attrs in graph.nodes(data=True):
        if attrs['ntype'] == 'concept':
            attrs.update(stats[node])

    if not output:
        return graph

    def stringify_data(data):
        # GML cannot store Python lists, so join them into '//'-separated strings
        for key in data:
            if isinstance(data[key], list):
                data[key] = join('//', *data[key])

    if output == 'gml':
        for _node, attrs in graph.nodes(data=True):
            stringify_data(attrs)
        for _na, _nb, attrs in graph.edges(data=True):
            stringify_data(attrs)
        nx.write_gml(graph, filename + '.gml')
        log.file_written(filename + '.gml')

 

 

def compare_colexifications(wordlist, entry='ipa', concept='concept'):
    """
    Compare colexification patterns for a given wordlist.

    Returns
    -------
    matrix : list of list of float
        A square distance matrix over ``wordlist.cols``, where each cell
        holds one minus the Jaccard similarity of the two languages'
        colexification sets.
    """
    colexifications = _get_colexifications(wordlist, entry, concept)
    by_taxa = _get_colexifications_by_taxa(colexifications)
    return _make_matrix(wordlist.cols, by_taxa)

 

def evaluate_colexifications(G, weight='wordWeight', outfile=None):
    """
    Function calculates most frequent colexifications in a wordlist.

    Parameters
    ----------
    G : networkx.Graph
        A colexification graph whose edges carry the ``weight`` attribute.
    weight : str (default="wordWeight")
        Name of the edge attribute used for ranking.
    outfile : str or None (default=None)
        NOTE(review): only used as a guard — when truthy, nothing is written
        and the function returns ``None``.  Looks like unfinished output
        support; confirm intent before relying on it.

    Returns
    -------
    (nodes, edges) : tuple
        Nodes with their weighted degrees and edges with their weights, each
        sorted ascending by weight; ``None`` when ``outfile`` is set.
    """
    # edges sorted ascending by the chosen weight attribute
    ranked = sorted(G.edges(data=True), key=lambda e: e[2][weight])
    edges = [(src, tgt, attrs[weight]) for src, tgt, attrs in ranked]

    # weighted node degrees under the same attribute, ascending
    nodes = sorted(dict(G.degree(weight=weight)).items(), key=lambda item: item[1])

    if not outfile:
        return nodes, edges