Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

""" 

Adapt specific clustering algorithms from scikit-learn and igraph to LingPy.

""" 

from collections import defaultdict 

 

try: 

from sklearn import cluster 

except ImportError: 

cluster = False 

try: 

import igraph 

except ImportError: 

igraph = False 

 

import numpy as np 

 

 

def dbscan(
        threshold,
        matrix,
        taxa=False,
        revert=False,
        min_samples=1):
    """
    Compute DBSCAN cluster analysis.

    Parameters
    ----------
    threshold : float
        The threshold for clustering you want to use. It is handed to
        scikit-learn as the ``eps`` argument.
    matrix : list
        The two-dimensional matrix passed as list or array. It is handed to
        scikit-learn with ``metric='precomputed'``.
    taxa : list (default=False)
        The list of taxon names. If set to "False" a fake list of taxon names
        will be created, giving a positive numerical ID in increasing order for
        each column in the matrix.
    revert : bool (default=False)
        If set to "False", return a dictionary with cluster labels as keys
        and lists of taxon names as values. If set to "True", don't return
        taxon names but a dictionary mapping the zero-based position of each
        taxon to its cluster label.
    min_samples : int (default=1)
        The minimal samples parameter of the DBSCAN method from the SKLEARN
        package.

    Returns
    -------
    clusters : dict
        Either a dictionary of labels and taxon names (``revert=False``), or
        a dictionary of taxon positions and labels (``revert=True``). Labels
        are plain Python integers.

    Raises
    ------
    ValueError
        If the sklearn package could not be imported.

    Notes
    -----
    This method does not work as expected, probably since it normally requires
    distances between points as input. We list it only for completeness here,
    but urge to be careful when using the code and checking properly our
    implementation in the source code.

    Requires the scikitlearn package, downloadable from http://scikit-learn.org/.
    """
    if not cluster:
        raise ValueError("The package sklearn is needed to run this analysis.")

    if not taxa:
        taxa = list(range(1, len(matrix) + 1))

    # only the labels are needed, the core sample indices are discarded
    _, raw_labels = cluster.dbscan(
        matrix, eps=threshold, min_samples=min_samples, metric='precomputed')

    # use plain Python integers so that the labels look the same as the ones
    # produced by the other clustering functions in this module
    labels = [int(label) for label in raw_labels]

    # change to our internal cluster style: every sample labelled -1 gets a
    # fresh label of its own; fresh labels start after the highest label in
    # use, and at 1 if no cluster was found at all
    next_label = max(max(labels) + 1, 1)
    for i, label in enumerate(labels):
        if label == -1:
            labels[i] = next_label
            next_label += 1

    # check for revert
    if revert:
        return dict(zip(range(len(taxa)), labels))

    clr = defaultdict(list)
    for i, taxon in enumerate(taxa):
        clr[labels[i]].append(taxon)
    return clr

 

 

def affinity_propagation(threshold, matrix, taxa=False, revert=False):
    """
    Compute affinity propagation from the matrix.

    Parameters
    ----------
    threshold : float
        The threshold for clustering you want to use. Scores below the
        threshold and scores at or above it are rescaled with different
        formulas before clustering.
    matrix : list
        The two-dimensional matrix passed as list or array. Only the cells
        above the diagonal are read; the result is mirrored to the cells below
        it. The matrix passed in is copied and not modified.
    taxa : list (default=False)
        The list of taxon names. If set to "False" a fake list of taxon names
        will be created, giving a positive numerical ID in increasing order for
        each column in the matrix.
    revert : bool (default=False)
        If set to "False", return a dictionary with cluster labels as keys
        and lists of taxon names as values. If set to "True", don't return
        taxon names but a dictionary mapping the zero-based position of each
        taxon to its cluster label.

    Returns
    -------
    clusters : dict
        Either a dictionary of labels and taxon names (``revert=False``), or
        a dictionary of taxon positions and labels (``revert=True``). Labels
        are plain Python integers.

    Raises
    ------
    ValueError
        If the sklearn package could not be imported.

    Notes
    -----
    Affinity propagation is a clustering method originally proposed by
    :evobib:`Frey2007`.

    Requires the scikitlearn package, downloadable from http://scikit-learn.org/.
    """
    if not cluster:
        raise ValueError("The package sklearn is needed to run this analysis.")

    if not taxa:
        taxa = list(range(1, len(matrix) + 1))

    # turn distances to similarities; work on a float copy, since an integer
    # matrix would otherwise truncate the rescaled values written back below
    similarities = np.array(matrix, dtype=float)
    size = len(similarities)

    for i in range(size):
        similarities[i, i] = 10
        for j in range(i + 1, size):
            score = similarities[i, j]
            # NOTE(review): for scores in [0, 1) the first formula is
            # non-negative and grows with the score -- confirm that this is
            # the intended scaling for a similarity
            if score < threshold:
                value = -np.log2(1 - score ** 2)
            else:
                value = -score ** 5
            similarities[i, j] = value
            similarities[j, i] = value

    ap = cluster.AffinityPropagation(affinity='precomputed')

    # use plain Python integers so that the labels look the same as the ones
    # produced by the other clustering functions in this module
    labels = [int(label) for label in ap.fit_predict(similarities)]

    # change to our internal cluster style: every sample labelled -1 gets a
    # fresh label of its own; fresh labels start after the highest label in
    # use, and at 1 if no cluster was found at all
    next_label = max(max(labels) + 1, 1)
    for i, label in enumerate(labels):
        if label == -1:
            labels[i] = next_label
            next_label += 1

    # check for revert
    if revert:
        return dict(zip(range(len(taxa)), labels))

    clr = defaultdict(list)
    for i, taxon in enumerate(taxa):
        clr[labels[i]].append(taxon)
    return clr

 

 

def infomap_clustering(threshold, matrix, taxa=False, revert=False):
    """
    Compute the Infomap clustering analysis of the data.

    Parameters
    ----------
    threshold : float
        The threshold for clustering you want to use. Two taxa are connected
        by an edge if their score in the matrix does not exceed the threshold.
    matrix : list
        The two-dimensional matrix passed as list or array. Only the cells
        above the diagonal are read.
    taxa : list (default=False)
        The list of taxon names. If set to "False" a fake list of taxon names
        will be created, giving a positive numerical ID in increasing order for
        each column in the matrix.
    revert : bool (default=False)
        If set to "False", return a dictionary with cluster labels as keys
        and lists of taxon names as values. If set to "True", don't return
        taxon names but a dictionary mapping the zero-based position of each
        taxon to its cluster label.

    Returns
    -------
    clusters : dict
        Either a dictionary of labels and taxon names (``revert=False``), or
        a dictionary of taxon positions and labels (``revert=True``). Labels
        are positive integers starting at 1.

    Raises
    ------
    ValueError
        If the igraph package could not be imported.

    Notes
    -----
    Infomap clustering is a community detection method originally proposed by
    :evobib:`Rosvall2008`.

    Requires the igraph package, downloadable from http://igraph.org/.
    """
    if not igraph:
        raise ValueError("The package igraph is needed to run this analysis.")
    if not taxa:
        taxa = list(range(1, len(matrix) + 1))

    # the graph is unweighted: the threshold alone decides which pairs of
    # taxa are connected
    edges = []
    for i, row in enumerate(matrix):
        for j, cell in enumerate(row):
            if i < j and cell <= threshold:
                edges.append((i, j))

    # add vertices and edges in bulk rather than one call per element
    G = igraph.Graph()
    G.add_vertices(len(matrix))
    G.add_edges(edges)

    comps = G.community_infomap(edge_weights=None, vertex_weights=None)

    # iterating over the clustering yields the vertex indices of each
    # community; labels are shifted so that they start at 1
    D = {}
    for label, members in enumerate(comps, start=1):
        for vertex in members:
            D[vertex] = label

    if revert:
        return D

    clr = defaultdict(list)
    for i, taxon in enumerate(taxa):
        clr[D[i]].append(taxon)
    return clr