Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2Distance computations (:mod:`scipy.spatial.distance`) 

3===================================================== 

4 

5.. sectionauthor:: Damian Eads 

6 

7Function reference 

8------------------ 

9 

10Distance matrix computation from a collection of raw observation vectors 

11stored in a rectangular array. 

12 

13.. autosummary:: 

14 :toctree: generated/ 

15 

16 pdist -- pairwise distances between observation vectors. 

17 cdist -- distances between two collections of observation vectors 

18 squareform -- convert distance matrix to a condensed one and vice versa 

19 directed_hausdorff -- directed Hausdorff distance between arrays 

20 

21Predicates for checking the validity of distance matrices, both 

22condensed and redundant. Also contained in this module are functions 

23for computing the number of observations in a distance matrix. 

24 

25.. autosummary:: 

26 :toctree: generated/ 

27 

28 is_valid_dm -- checks for a valid distance matrix 

29 is_valid_y -- checks for a valid condensed distance matrix 

30 num_obs_dm -- # of observations in a distance matrix 

31 num_obs_y -- # of observations in a condensed distance matrix 

32 

33Distance functions between two numeric vectors ``u`` and ``v``. Computing 

34distances over a large collection of vectors is inefficient for these 

35functions. Use ``pdist`` for this purpose. 

36 

37.. autosummary:: 

38 :toctree: generated/ 

39 

40 braycurtis -- the Bray-Curtis distance. 

41 canberra -- the Canberra distance. 

42 chebyshev -- the Chebyshev distance. 

43 cityblock -- the Manhattan distance. 

44 correlation -- the Correlation distance. 

45 cosine -- the Cosine distance. 

46 euclidean -- the Euclidean distance. 

47 jensenshannon -- the Jensen-Shannon distance. 

48 mahalanobis -- the Mahalanobis distance. 

49 minkowski -- the Minkowski distance. 

50 seuclidean -- the normalized Euclidean distance. 

51 sqeuclidean -- the squared Euclidean distance. 

52 wminkowski -- (deprecated) alias of `minkowski`. 

53 

54Distance functions between two boolean vectors (representing sets) ``u`` and 

55``v``. As in the case of numerical vectors, ``pdist`` is more efficient for 

56computing the distances between all pairs. 

57 

58.. autosummary:: 

59 :toctree: generated/ 

60 

61 dice -- the Dice dissimilarity. 

62 hamming -- the Hamming distance. 

63 jaccard -- the Jaccard distance. 

64 kulsinski -- the Kulsinski distance. 

65 rogerstanimoto -- the Rogers-Tanimoto dissimilarity. 

66 russellrao -- the Russell-Rao dissimilarity. 

67 sokalmichener -- the Sokal-Michener dissimilarity. 

68 sokalsneath -- the Sokal-Sneath dissimilarity. 

69 yule -- the Yule dissimilarity. 

70 

71:func:`hamming` also operates over discrete numerical vectors. 

72""" 

73 

74# Copyright (C) Damian Eads, 2007-2008. New BSD License. 

75 

# Names exported by ``from <this module> import *``.
# NOTE(review): 'matching' is exported here but does not appear in the
# autosummary tables of the module docstring -- confirm this is intended.
__all__ = [
    'braycurtis',
    'canberra',
    'cdist',
    'chebyshev',
    'cityblock',
    'correlation',
    'cosine',
    'dice',
    'directed_hausdorff',
    'euclidean',
    'hamming',
    'is_valid_dm',
    'is_valid_y',
    'jaccard',
    'jensenshannon',
    'kulsinski',
    'mahalanobis',
    'matching',
    'minkowski',
    'num_obs_dm',
    'num_obs_y',
    'pdist',
    'rogerstanimoto',
    'russellrao',
    'seuclidean',
    'sokalmichener',
    'sokalsneath',
    'sqeuclidean',
    'squareform',
    'wminkowski',
    'yule'
]

109 

110 

111import warnings 

112import numpy as np 

113 

114from functools import partial 

115from collections import namedtuple 

116from scipy._lib._util import _asarray_validated 

117 

118from . import _distance_wrap 

119from . import _hausdorff 

120from ..linalg import norm 

121from ..special import rel_entr 

122 

123 

def _args_to_kwargs_xdist(args, kwargs, metric, func_name):
    """
    Convert legacy positional arguments to keyword arguments for pdist/cdist.

    Parameters
    ----------
    args : sequence
        Extra positional metric parameters received by the caller.
    kwargs : dict
        Keyword arguments received by the caller.  It is updated in place.
    metric : str or callable
        The requested metric.  A callable that is not one of the distance
        functions of this module cannot receive positional parameters.
    func_name : str
        ``'pdist'`` selects the legacy order ``p, w, V, VI``; any other
        value selects ``p, V, VI, w``.

    Returns
    -------
    kwargs : dict
        The same dict object, with every positional value stored under its
        legacy parameter name.

    Raises
    ------
    TypeError
        If positional parameters are combined with a custom callable metric,
        or if a parameter is given both positionally and by keyword.
    ValueError
        If more than four positional parameters are given.

    Warns
    -----
    DeprecationWarning
        Whenever at least one positional parameter is converted.
    """
    if not args:
        return kwargs

    if (callable(metric) and metric not in [
            braycurtis, canberra, chebyshev, cityblock, correlation, cosine,
            dice, euclidean, hamming, jaccard, jensenshannon, kulsinski,
            mahalanobis, matching, minkowski, rogerstanimoto, russellrao,
            seuclidean, sokalmichener, sokalsneath, sqeuclidean, yule,
            wminkowski]):
        raise TypeError('When using a custom metric arguments must be passed '
                        'as keyword (i.e., ARGNAME=ARGVALUE)')

    if func_name == 'pdist':
        old_arg_names = ['p', 'w', 'V', 'VI']
    else:
        old_arg_names = ['p', 'V', 'VI', 'w']

    num_args = len(args)
    # The message pieces are adjacent string literals: each one must end with
    # a space, otherwise the sentences run together.
    warnings.warn('%d metric parameters have been passed as positional. '
                  'This will raise an error in a future version. '
                  'Please pass arguments as keywords (i.e., ARGNAME=ARGVALUE)'
                  % num_args, DeprecationWarning)

    if num_args > 4:
        raise ValueError('Deprecated %s signature accepts only 4 '
                         'positional arguments (%s), %d given.'
                         % (func_name, ', '.join(old_arg_names), num_args))

    for old_arg, arg in zip(old_arg_names, args):
        if old_arg in kwargs:
            raise TypeError('%s() got multiple values for argument %s'
                            % (func_name, old_arg))
        kwargs[old_arg] = arg
    return kwargs

162 

163 

def _copy_array_if_base_present(a):
    """Return `a` itself when it has no base array, otherwise a copy of it.

    An array whose ``base`` attribute is not None gets duplicated with
    ``a.copy()``; any other array is handed back unchanged.
    """
    return a if a.base is None else a.copy()

169 

170 

def _correlation_cdist_wrap(XA, XB, dm, **kwargs):
    """Run the compiled cosine ``cdist`` routine on row-centered inputs.

    Each row of `XA` and `XB` has its own mean subtracted (the inputs are
    not modified; new arrays are built), and the centered arrays are passed
    with `dm` and `kwargs` to
    ``_distance_wrap.cdist_cosine_double_wrap``.  Nothing is returned;
    presumably the compiled routine writes the distances into `dm`.
    """
    centered_a = XA - XA.mean(axis=1, keepdims=True)
    centered_b = XB - XB.mean(axis=1, keepdims=True)
    _distance_wrap.cdist_cosine_double_wrap(centered_a, centered_b, dm,
                                            **kwargs)

175 

176 

def _correlation_pdist_wrap(X, dm, **kwargs):
    """Run the compiled cosine ``pdist`` routine on a row-centered input.

    Every row of `X` has its own mean subtracted (into a new array), and the
    result is passed with `dm` and `kwargs` to
    ``_distance_wrap.pdist_cosine_double_wrap``.  Nothing is returned;
    presumably the compiled routine writes the distances into `dm`.
    """
    row_means = X.mean(axis=1, keepdims=True)
    centered = X - row_means
    _distance_wrap.pdist_cosine_double_wrap(centered, dm, **kwargs)

180 

181 

def _convert_to_type(X, out_type):
    """Return `X` as a C-contiguous array whose dtype is `out_type`."""
    contiguous = np.ascontiguousarray(X, dtype=out_type)
    return contiguous

184 

185 

def _filter_deprecated_kwargs(kwargs, args_blacklist):
    """Remove blacklisted keys from `kwargs` in place, warning for each one.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments to clean up.  Modified in place.
    args_blacklist : iterable
        Keys that are no longer accepted.

    Warns
    -----
    DeprecationWarning
        Once for every blacklisted key that was present in `kwargs`.
    """
    for name in args_blacklist:
        if name not in kwargs:
            continue
        # Drop the entry first, then tell the caller about it.
        kwargs.pop(name)
        warnings.warn('Got unexpected kwarg %s. This will raise an error'
                      ' in a future version.' % name, DeprecationWarning)

193 

194 

def _nbool_correspond_all(u, v, w=None):
    """Count the four kinds of element-wise correspondence between `u`, `v`.

    Parameters
    ----------
    u, v : ndarray
        Input vectors.
    w : ndarray, optional
        Weights multiplied into every count.  Default is None.

    Returns
    -------
    (nff, nft, ntf, ntt) : tuple
        Sums over the positions where (`u`, `v`) is (False, False),
        (False, True), (True, False) and (True, True) respectively.

    Notes
    -----
    Two boolean arrays without weights are counted with bitwise operations.
    Every other input is treated arithmetically, using ``1 - x`` as the
    negation of ``x``, so non-0/1 values contribute fractional or negative
    amounts.
    """
    if u.dtype == v.dtype == bool and w is None:
        not_u = ~u
        not_v = ~v
        nff = (not_u & not_v).sum()
        nft = (not_u & v).sum()
        ntf = (u & not_v).sum()
        ntt = (u & v).sum()
    else:
        # ``np.find_common_type`` no longer exists in NumPy >= 2.0;
        # ``np.result_type`` gives a dtype that is at least ``int`` and can
        # hold both inputs.
        dtype = np.result_type(int, u.dtype, v.dtype)
        u = u.astype(dtype)
        v = v.astype(dtype)
        not_u = 1.0 - u
        not_v = 1.0 - v
        if w is not None:
            # Weighting only the `u` side is enough: each product below
            # contains exactly one `u`-derived factor.
            not_u = w * not_u
            u = w * u
        nff = (not_u * not_v).sum()
        nft = (not_u * v).sum()
        ntf = (u * not_v).sum()
        ntt = (u * v).sum()
    return (nff, nft, ntf, ntt)

217 

218 

def _nbool_correspond_ft_tf(u, v, w=None):
    """Count the disagreeing positions between `u` and `v`.

    Parameters
    ----------
    u, v : ndarray
        Input vectors.
    w : ndarray, optional
        Weights multiplied into both counts.  Default is None.

    Returns
    -------
    (nft, ntf) : tuple
        Sums over the positions where (`u`, `v`) is (False, True) and
        (True, False) respectively.

    Notes
    -----
    Two boolean arrays without weights are counted with bitwise operations.
    Every other input is treated arithmetically, using ``1 - x`` as the
    negation of ``x``.
    """
    if u.dtype == v.dtype == bool and w is None:
        not_u = ~u
        not_v = ~v
        nft = (not_u & v).sum()
        ntf = (u & not_v).sum()
    else:
        # ``np.find_common_type`` no longer exists in NumPy >= 2.0;
        # ``np.result_type`` gives a dtype that is at least ``int`` and can
        # hold both inputs.
        dtype = np.result_type(int, u.dtype, v.dtype)
        u = u.astype(dtype)
        v = v.astype(dtype)
        not_u = 1.0 - u
        not_v = 1.0 - v
        if w is not None:
            # Each product below contains exactly one `u`-derived factor, so
            # weighting the `u` side applies every weight once.
            not_u = w * not_u
            u = w * u
        nft = (not_u * v).sum()
        ntf = (u * not_v).sum()
    return (nft, ntf)

237 

238 

def _validate_cdist_input(XA, XB, mA, mB, n, metric_name, **kwargs):
    """Convert the ``cdist`` inputs and validate the metric parameters.

    Parameters
    ----------
    XA, XB : ndarray
        The two collections of observations.
    mA, mB : int
        Their sum is passed to the metric's validator as the observation
        count.
    n : int
        Passed to the metric's validator as the dimension.
    metric_name : str or None
        Key into ``_METRICS``.  With None nothing is converted or validated.
    **kwargs
        Metric parameters.

    Returns
    -------
    XA, XB : ndarray
        The inputs, converted to the selected type when a metric is named.
    typ
        The selected entry of the metric's ``types``, or None when
        `metric_name` is None.
    kwargs : dict
        The metric parameters, replaced by the validator's return value when
        the metric has a validator.
    """
    if metric_name is None:
        return XA, XB, None, kwargs

    info = _METRICS[metric_name]
    supported = info.types
    # Keep the dtype of XA when the metric supports it; otherwise fall back
    # to the first supported type.
    if XA.dtype in supported:
        typ = supported[supported.index(XA.dtype)]
    else:
        typ = supported[0]
    XA = _convert_to_type(XA, out_type=typ)
    XB = _convert_to_type(XB, out_type=typ)

    validator = _METRICS[metric_name].validator
    if validator:
        # The validator sees both collections stacked as one data set.
        stacked = np.vstack([XA, XB])
        kwargs = validator(stacked, mA + mB, n, **kwargs)
    return XA, XB, typ, kwargs

256 

257 

def _validate_hamming_kwargs(X, m, n, **kwargs):
    """Validate (or create) the weight vector used by the Hamming metric.

    Parameters
    ----------
    X, m
        Not used by this validator.
    n : int
        Required length of the weight vector.
    **kwargs
        Metric parameters; ``w`` must be an array when given.

    Returns
    -------
    kwargs : dict
        The parameters with ``w`` replaced by its validated form.  A vector
        of `n` ones is used when ``w`` was not supplied.

    Raises
    ------
    ValueError
        If ``w`` is not one-dimensional with `n` entries.
    """
    if 'w' in kwargs:
        weights = kwargs['w']
    else:
        weights = np.ones((n,), dtype='double')

    shape_ok = weights.ndim == 1 and weights.shape[0] == n
    if not shape_ok:
        raise ValueError("Weights must have same size as input vector. %d vs. %d" % (weights.shape[0], n))

    kwargs['w'] = _validate_weights(weights)
    return kwargs

266 

267 

def _validate_mahalanobis_kwargs(X, m, n, **kwargs):
    """Provide the inverse covariance matrix ``VI`` for the Mahalanobis metric.

    Parameters
    ----------
    X : ndarray
        Data used to estimate the covariance when ``VI`` is not supplied.
    m : int
        Number of observations.
    n : int
        Dimension of the observations.
    **kwargs
        Metric parameters; may contain ``VI``.

    Returns
    -------
    kwargs : dict
        The parameters with ``VI`` converted by ``_convert_to_double``.

    Raises
    ------
    ValueError
        If ``VI`` has to be estimated and ``m <= n``.
    """
    inv_cov = kwargs.pop('VI', None)
    if inv_cov is None:
        if m <= n:
            # Too few observations for the dimension of the data.
            message = ("The number of observations (%d) is too "
                       "small; the covariance matrix is "
                       "singular. For observations with %d "
                       "dimensions, at least %d observations "
                       "are required.")
            raise ValueError(message % (m, n, n + 1))
        data = X.astype(np.double)
        cov = np.atleast_2d(np.cov(data.T))
        inv_cov = np.linalg.inv(cov).T.copy()
    kwargs["VI"] = _convert_to_double(inv_cov)
    return kwargs

283 

284 

def _validate_minkowski_kwargs(X, m, n, **kwargs):
    """Return the metric parameters with ``p`` defaulting to 2.

    `X`, `m` and `n` are accepted for signature compatibility with the other
    validators and are not used.
    """
    kwargs.setdefault('p', 2.)
    return kwargs

289 

290 

def _validate_pdist_input(X, m, n, metric_name, **kwargs):
    """Convert the ``pdist`` input and validate the metric parameters.

    Parameters
    ----------
    X : ndarray
        The collection of observations.
    m, n : int
        Passed to the metric's validator as the observation count and the
        dimension.
    metric_name : str or None
        Key into ``_METRICS``.  With None nothing is converted or validated.
    **kwargs
        Metric parameters.

    Returns
    -------
    X : ndarray
        The input, converted to the selected type when a metric is named.
    typ
        The selected entry of the metric's ``types``, or None when
        `metric_name` is None.
    kwargs : dict
        The metric parameters, replaced by the validator's return value when
        the metric has a validator.
    """
    if metric_name is None:
        return X, None, kwargs

    supported = _METRICS[metric_name].types
    # Keep the dtype of X when the metric supports it; otherwise fall back
    # to the first supported type.
    if X.dtype in supported:
        typ = supported[supported.index(X.dtype)]
    else:
        typ = supported[0]
    X = _convert_to_type(X, out_type=typ)

    validator = _METRICS[metric_name].validator
    if validator:
        kwargs = validator(X, m, n, **kwargs)
    return X, typ, kwargs

307 

308 

def _validate_seuclidean_kwargs(X, m, n, **kwargs):
    """Provide the variance vector ``V`` for the standardized Euclidean metric.

    Parameters
    ----------
    X : ndarray
        Data used to estimate the variances when ``V`` is not supplied.
    m
        Not used by this validator.
    n : int
        Required length of a user-supplied ``V``.
    **kwargs
        Metric parameters; may contain ``V``.

    Returns
    -------
    kwargs : dict
        The parameters with ``V`` converted by ``_convert_to_double``.  When
        ``V`` was not supplied it is the variance of `X` along axis 0 with
        ``ddof=1``.

    Raises
    ------
    ValueError
        If a supplied ``V`` is not one-dimensional or does not have `n`
        entries.
    """
    variances = kwargs.pop('V', None)
    if variances is not None:
        variances = np.asarray(variances, order='c')
        if variances.ndim != 1:
            raise ValueError('Variance vector V must '
                             'be one-dimensional.')
        if variances.shape[0] != n:
            raise ValueError('Variance vector V must be of the same '
                             'dimension as the vectors on which the distances '
                             'are computed.')
    else:
        variances = np.var(X.astype(np.double), axis=0, ddof=1)
    kwargs['V'] = _convert_to_double(variances)
    return kwargs

324 

325 

def _validate_vector(u, dtype=None):
    """Return `u` as a 1-D array, dropping all length-one axes.

    Parameters
    ----------
    u : array_like
        Input data.
    dtype : data-type, optional
        Passed to ``np.asarray``.

    Returns
    -------
    ndarray
        One-dimensional array.  Scalars and single-element inputs become
        arrays with one element.

    Raises
    ------
    ValueError
        If more than one axis remains after squeezing.
    """
    # XXX Is order='c' really necessary?
    arr = np.asarray(u, dtype=dtype, order='c')
    # atleast_1d keeps inputs such as u=1 and u=[1] one-dimensional after
    # the squeeze has removed every length-one axis.
    arr = np.atleast_1d(arr.squeeze())
    if arr.ndim <= 1:
        return arr
    raise ValueError("Input vector should be 1-D.")

334 

335 

def _validate_weights(w, dtype=np.double):
    """Return `w` as a validated 1-D array of non-negative weights.

    Parameters
    ----------
    w : array_like
        Candidate weights.
    dtype : data-type, optional
        Passed on to ``_validate_vector``.  Default is ``np.double``.

    Raises
    ------
    ValueError
        If any weight is negative.
    """
    weights = _validate_vector(w, dtype=dtype)
    has_negative = np.any(weights < 0)
    if has_negative:
        raise ValueError("Input weights should be all non-negative")
    return weights

341 

342 

def _validate_wminkowski_kwargs(X, m, n, **kwargs):
    """Validate the parameters of the weighted Minkowski metric.

    `X`, `m` and `n` are not used by this validator.

    Returns
    -------
    kwargs : dict
        The parameters with ``w`` validated by ``_validate_weights`` and
        ``p`` defaulting to 2.

    Raises
    ------
    ValueError
        If ``w`` is missing or None.
    """
    weights = kwargs.pop('w', None)
    if weights is None:
        raise ValueError('weighted minkowski requires a weight '
                         'vector `w` to be given.')
    kwargs['w'] = _validate_weights(weights)
    kwargs.setdefault('p', 2.)
    return kwargs

352 

353 

def directed_hausdorff(u, v, seed=0):
    """
    Compute the directed Hausdorff distance between two N-D arrays.

    Pairwise distances are measured with the Euclidean metric.

    Parameters
    ----------
    u : (M,N) ndarray
        Input array.
    v : (O,N) ndarray
        Input array.
    seed : int or None
        Local `numpy.random.RandomState` seed. Default is 0, a random
        shuffling of u and v that guarantees reproducibility.

    Returns
    -------
    d : double
        The directed Hausdorff distance between arrays `u` and `v`,

    index_1 : int
        index of point contributing to Hausdorff pair in `u`

    index_2 : int
        index of point contributing to Hausdorff pair in `v`

    Raises
    ------
    ValueError
        If `u` and `v` do not have the same number of columns.

    Notes
    -----
    The implementation follows the early-break and random-sampling
    strategy of [1]_. The worst case is ``O(m * o)``, like the brute force
    algorithm, but reaching it requires data that forces every single point
    interaction to be explored even after the input points were shuffled.
    The best case is O(m): an inner-loop distance smaller than cmax
    triggers an early break as often as possible. The authors have
    formally shown that the average runtime is closer to O(m).

    .. versionadded:: 0.19.0

    References
    ----------
    .. [1] A. A. Taha and A. Hanbury, "An efficient algorithm for
           calculating the exact Hausdorff distance." IEEE Transactions On
           Pattern Analysis And Machine Intelligence, vol. 37 pp. 2153-63,
           2015.

    See Also
    --------
    scipy.spatial.procrustes : Another similarity test for two data sets

    Examples
    --------
    Find the directed Hausdorff distance between two 2-D arrays of
    coordinates:

    >>> from scipy.spatial.distance import directed_hausdorff
    >>> u = np.array([(1.0, 0.0),
    ...               (0.0, 1.0),
    ...               (-1.0, 0.0),
    ...               (0.0, -1.0)])
    >>> v = np.array([(2.0, 0.0),
    ...               (0.0, 2.0),
    ...               (-2.0, 0.0),
    ...               (0.0, -4.0)])

    >>> directed_hausdorff(u, v)[0]
    2.23606797749979
    >>> directed_hausdorff(v, u)[0]
    3.0

    Find the general (symmetric) Hausdorff distance between two 2-D
    arrays of coordinates:

    >>> max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0])
    3.0

    Find the indices of the points that generate the Hausdorff distance
    (the Hausdorff pair):

    >>> directed_hausdorff(v, u)[1:]
    (3, 3)

    """
    points_u, points_v = (np.asarray(pts, dtype=np.float64, order='c')
                          for pts in (u, v))
    same_width = points_u.shape[1] == points_v.shape[1]
    if not same_width:
        raise ValueError('u and v need to have the same '
                         'number of columns')
    return _hausdorff.directed_hausdorff(points_u, points_v, seed)

452 

453 

def minkowski(u, v, p=2, w=None):
    """
    Compute the Minkowski distance between two 1-D arrays.

    The Minkowski distance of order `p` between 1-D arrays `u` and `v`
    is defined as

    .. math::

       {||u-v||}_p = \\left(\\sum{|u_i - v_i|^p}\\right)^{1/p}.

    When a weight vector `w` is given, the weighted form is computed:

    .. math::

       \\left(\\sum{w_i(|(u_i - v_i)|^p)}\\right)^{1/p}.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    p : scalar
        The order of the norm of the difference :math:`{||u-v||}_p`.
        Must be at least 1.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    minkowski : double
        The Minkowski distance between vectors `u` and `v`.

    Raises
    ------
    ValueError
        If `p` is smaller than 1.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.minkowski([1, 0, 0], [0, 1, 0], 1)
    2.0
    >>> distance.minkowski([1, 0, 0], [0, 1, 0], 2)
    1.4142135623730951
    >>> distance.minkowski([1, 0, 0], [0, 1, 0], 3)
    1.2599210498948732
    >>> distance.minkowski([1, 1, 0], [0, 1, 0], 1)
    1.0
    >>> distance.minkowski([1, 1, 0], [0, 1, 0], 2)
    1.0
    >>> distance.minkowski([1, 1, 0], [0, 1, 0], 3)
    1.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if p < 1:
        raise ValueError("p must be at least 1")
    u_v = u - v
    if w is not None:
        w = _validate_weights(w)
        # Scaling the difference by w**(1/p) and then taking the plain
        # p-norm yields (sum_i w_i * |u_i - v_i|**p) ** (1/p).
        if p == 1:
            root_w = w
        elif p == 2:
            # better precision and speed
            root_w = np.sqrt(w)
        else:
            root_w = np.power(w, 1/p)
        u_v = root_w * u_v
    dist = norm(u_v, ord=p)
    return dist

519 

520 

# `minkowski` gained weights in scipy 1.0. Once we're at, say, version 1.3,
# deprecate `wminkowski`. This is not done at once because it would be
# annoying for downstream libraries that use `wminkowski` and support
# multiple scipy versions.

def wminkowski(u, v, p, w):
    """
    Compute the weighted Minkowski distance between two 1-D arrays.

    For vectors `u` and `v` the weighted Minkowski distance is

    .. math::

       \\left(\\sum{(|w_i (u_i - v_i)|^p)}\\right)^{1/p}.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    p : int
        The order of the norm of the difference :math:`{||u-v||}_p`.
    w : (N,) array_like
        The weight vector.

    Returns
    -------
    wminkowski : double
        The weighted Minkowski distance between vectors `u` and `v`.

    Notes
    -----
    `wminkowski` is DEPRECATED. In its definition the weights are raised to
    the power `p`. Prefer the weighted version of `minkowski`. This function
    will be removed in a future version of scipy.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.wminkowski([1, 0, 0], [0, 1, 0], 1, np.ones(3))
    2.0
    >>> distance.wminkowski([1, 0, 0], [0, 1, 0], 2, np.ones(3))
    1.4142135623730951
    >>> distance.wminkowski([1, 0, 0], [0, 1, 0], 3, np.ones(3))
    1.2599210498948732
    >>> distance.wminkowski([1, 1, 0], [0, 1, 0], 1, np.ones(3))
    1.0
    >>> distance.wminkowski([1, 1, 0], [0, 1, 0], 2, np.ones(3))
    1.0
    >>> distance.wminkowski([1, 1, 0], [0, 1, 0], 3, np.ones(3))
    1.0

    """
    weights = _validate_weights(w)
    # `minkowski` applies its weights outside the absolute value, so the
    # weights are raised to the power p here to get the legacy definition.
    powered_weights = weights**p
    return minkowski(u, v, p=p, w=powered_weights)

576 

577 

def euclidean(u, v, w=None):
    """
    Computes the Euclidean distance between two 1-D arrays.

    For 1-D arrays `u` and `v` the Euclidean distance is

    .. math::

       {||u-v||}_2

    and, when weights `w` are given,

    .. math::

       \\left(\\sum{(w_i |(u_i - v_i)|^2)}\\right)^{1/2}

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    euclidean : double
        The Euclidean distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.euclidean([1, 0, 0], [0, 1, 0])
    1.4142135623730951
    >>> distance.euclidean([1, 1, 0], [0, 1, 0])
    1.0

    """
    # The Euclidean distance is the Minkowski distance of order 2.
    order = 2
    return minkowski(u, v, p=order, w=w)

615 

616 

def sqeuclidean(u, v, w=None):
    """
    Compute the squared Euclidean distance between two 1-D arrays.

    For vectors `u` and `v` the squared Euclidean distance is

    .. math::

       {||u-v||}_2^2

    and, when weights `w` are given,

    .. math::

       \\left(\\sum{(w_i |(u_i - v_i)|^2)}\\right)

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    sqeuclidean : double
        The squared Euclidean distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.sqeuclidean([1, 0, 0], [0, 1, 0])
    2.0
    >>> distance.sqeuclidean([1, 1, 0], [0, 1, 0])
    1.0

    """
    def _has_inexact_dtype(x):
        # True only for array-likes that already carry an inexact dtype.
        return hasattr(x, "dtype") and np.issubdtype(x.dtype, np.inexact)

    # Inexact dtypes are kept as they are; everything else is converted to
    # np.float64 for stability.
    utype = None if _has_inexact_dtype(u) else np.float64
    vtype = None if _has_inexact_dtype(v) else np.float64

    u = _validate_vector(u, dtype=utype)
    v = _validate_vector(v, dtype=vtype)
    diff = u - v
    if w is None:
        return np.dot(diff, diff)
    # The weights enter only one factor of the dot product, so each weight
    # is applied exactly once.
    w = _validate_weights(w)
    return np.dot(diff, w * diff)

669 

670 

def correlation(u, v, w=None, centered=True):
    """
    Compute the correlation distance between two 1-D arrays.

    For vectors `u` and `v` the correlation distance is

    .. math::

        1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}
                  {{||(u - \\bar{u})||}_2 {||(v - \\bar{v})||}_2}

    with :math:`\\bar{u}` the mean of the elements of `u` and
    :math:`x \\cdot y` the dot product of :math:`x` and :math:`y`.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0
    centered : bool, optional
        If True (the default), the (weighted) mean of each vector is
        subtracted before the distance is computed. With False the vectors
        are used as given.

    Returns
    -------
    correlation : double
        The correlation distance between 1-D array `u` and `v`.

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
    if centered:
        u = u - np.average(u, weights=w)
        v = v - np.average(v, weights=w)
    # (Weighted) second moments of the two, possibly centered, vectors.
    cross = np.average(u * v, weights=w)
    u_sq = np.average(np.square(u), weights=w)
    v_sq = np.average(np.square(v), weights=w)
    return 1.0 - cross / np.sqrt(u_sq * v_sq)

716 

717 

def cosine(u, v, w=None):
    """
    Compute the Cosine distance between 1-D arrays.

    For vectors `u` and `v` the Cosine distance is

    .. math::

        1 - \\frac{u \\cdot v}
                  {||u||_2 ||v||_2}.

    with :math:`u \\cdot v` the dot product of :math:`u` and
    :math:`v`.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    cosine : double
        The Cosine distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.cosine([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.cosine([100, 0, 0], [0, 1, 0])
    1.0
    >>> distance.cosine([1, 1, 0], [0, 1, 0])
    0.29289321881345254

    """
    # The cosine distance is the correlation distance without centering
    # (also called 'uncentered correlation' or 'reflective correlation').
    uncentered = correlation(u, v, w=w, centered=False)
    return uncentered

761 

762 

def hamming(u, v, w=None):
    """
    Compute the Hamming distance between two 1-D arrays.

    The Hamming distance between 1-D arrays `u` and `v` is the proportion
    of components in which the two disagree. For boolean vectors it equals

    .. math::

       \\frac{c_{01} + c_{10}}{n}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    hamming : double
        The Hamming distance between vectors `u` and `v`.

    Raises
    ------
    ValueError
        If `u` and `v` differ in length.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.hamming([1, 0, 0], [0, 1, 0])
    0.66666666666666663
    >>> distance.hamming([1, 0, 0], [1, 1, 0])
    0.33333333333333331
    >>> distance.hamming([1, 0, 0], [2, 0, 0])
    0.33333333333333331
    >>> distance.hamming([1, 0, 0], [3, 0, 0])
    0.33333333333333331

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if u.shape != v.shape:
        raise ValueError('The 1d arrays must have equal lengths.')
    weights = None if w is None else _validate_weights(w)
    # The (weighted) mean of the mismatch indicator is the distance.
    mismatch = u != v
    return np.average(mismatch, weights=weights)

815 

816 

def jaccard(u, v, w=None):
    """
    Compute the Jaccard-Needham dissimilarity between two boolean 1-D arrays.

    For 1-D boolean arrays `u` and `v` the Jaccard-Needham dissimilarity is

    .. math::

       \\frac{c_{TF} + c_{FT}}
            {c_{TT} + c_{FT} + c_{TF}}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    jaccard : double
        The Jaccard distance between vectors `u` and `v`.

    Notes
    -----
    If no position holds a nonzero value in either vector, the definition
    becomes a `0/0` division and 0 is returned. See the Wikipedia page on
    the Jaccard index [1]_, and this paper [2]_.

    .. versionchanged:: 1.2.0
        Previously, when `u` and `v` lead to a `0/0` division, the function
        would return NaN. This was changed to return 0 instead.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Jaccard_index
    .. [2] S. Kosub, "A note on the triangle inequality for the Jaccard
       distance", 2016, Available online: https://arxiv.org/pdf/1612.02696.pdf

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.jaccard([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.jaccard([1, 0, 0], [1, 1, 0])
    0.5
    >>> distance.jaccard([1, 0, 0], [1, 2, 0])
    0.5
    >>> distance.jaccard([1, 0, 0], [1, 1, 1])
    0.66666666666666663

    """
    u = _validate_vector(u)
    v = _validate_vector(v)

    # Positions where at least one vector is nonzero, and the subset of
    # those where the two vectors differ.
    either_set = (u != 0) | (v != 0)
    differing = (u != v) & either_set
    if w is not None:
        w = _validate_weights(w)
        either_set = w * either_set
        differing = w * differing
    numerator = np.double(differing.sum())
    denominator = np.double(either_set.sum())
    if denominator == 0:
        return 0
    return numerator / denominator

889 

890 

def kulsinski(u, v, w=None):
    """
    Compute the Kulsinski dissimilarity between two boolean 1-D arrays.

    For boolean 1-D arrays `u` and `v` the Kulsinski dissimilarity is

    .. math::

         \\frac{c_{TF} + c_{FT} - c_{TT} + n}
              {c_{FT} + c_{TF} + n}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    kulsinski : double
        The Kulsinski distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.kulsinski([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.kulsinski([1, 0, 0], [1, 1, 0])
    0.75
    >>> distance.kulsinski([1, 0, 0], [2, 1, 0])
    0.33333333333333331
    >>> distance.kulsinski([1, 0, 0], [3, 1, 0])
    -0.5

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
        # With weights, `n` is the total weight instead of the length.
        n = w.sum()
    else:
        n = float(len(u))
    _, nft, ntf, ntt = _nbool_correspond_all(u, v, w=w)

    disagreements = ntf + nft
    return (disagreements - ntt + n) / (disagreements + n)

945 

946 

def seuclidean(u, v, V):
    """
    Return the standardized Euclidean distance between two 1-D arrays.

    It is computed as the Euclidean distance between `u` and `v` with the
    weights ``1/V``.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    V : (N,) array_like
        `V` is an 1-D array of component variances. It is usually computed
        among a larger collection vectors.

    Returns
    -------
    seuclidean : double
        The standardized Euclidean distance between vectors `u` and `v`.

    Raises
    ------
    TypeError
        If `u`, `v` and `V` do not all have the same length.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [0.1, 0.1, 0.1])
    4.4721359549995796
    >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [1, 0.1, 0.1])
    3.3166247903553998
    >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [10, 0.1, 0.1])
    3.1780497164141406

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    V = _validate_vector(V, dtype=np.float64)
    lengths_agree = V.shape[0] == u.shape[0] == v.shape[0]
    if not lengths_agree:
        raise TypeError('V must be a 1-D array of the same dimension '
                        'as u and v.')
    inverse_variances = 1/V
    return euclidean(u, v, w=inverse_variances)

986 

987 

def cityblock(u, v, w=None):
    """
    Compute the City Block (Manhattan) distance.

    The Manhattan distance between two 1-D arrays `u` and `v` is

    .. math::

       \\sum_i {\\left| u_i - v_i \\right|}.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    cityblock : double
        The City Block (Manhattan) distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.cityblock([1, 0, 0], [0, 1, 0])
    2
    >>> distance.cityblock([1, 0, 0], [0, 2, 0])
    3
    >>> distance.cityblock([1, 0, 0], [1, 1, 0])
    1

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    abs_diff = abs(u - v)
    if w is None:
        return abs_diff.sum()
    # Each absolute difference is scaled by its weight before summing.
    weights = _validate_weights(w)
    return (weights * abs_diff).sum()

1032 

1033 

def mahalanobis(u, v, VI):
    """
    Compute the Mahalanobis distance between two 1-D arrays.

    For 1-D arrays `u` and `v` the Mahalanobis distance is

    .. math::

       \\sqrt{ (u-v) V^{-1} (u-v)^T }

    with ``V`` the covariance matrix. Note that the argument `VI`
    is the inverse of ``V``.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    VI : ndarray
        The inverse of the covariance matrix.

    Returns
    -------
    mahalanobis : double
        The Mahalanobis distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> iv = [[1, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 1]]
    >>> distance.mahalanobis([1, 0, 0], [0, 1, 0], iv)
    1.0
    >>> distance.mahalanobis([0, 2, 0], [0, 1, 0], iv)
    1.0
    >>> distance.mahalanobis([2, 0, 0], [0, 1, 0], iv)
    1.7320508075688772

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    VI = np.atleast_2d(VI)
    delta = u - v
    # Quadratic form delta . VI . delta, evaluated left to right.
    left_product = np.dot(delta, VI)
    squared_distance = np.dot(left_product, delta)
    return np.sqrt(squared_distance)

1079 

1080 

def chebyshev(u, v, w=None):
    """
    Compute the Chebyshev distance.

    Computes the Chebyshev distance between two 1-D arrays `u` and `v`,
    which is defined as

    .. math::

       \\max_i {|u_i-v_i|}.

    Parameters
    ----------
    u : (N,) array_like
        Input vector.
    v : (N,) array_like
        Input vector.
    w : (N,) array_like, optional
        Weights.  The magnitude of a weight does not influence the
        maximum, so `w` only acts as a mask: coordinates whose weight is
        not strictly positive are left out of the computation.  Default
        is None, which keeps every coordinate.

    Returns
    -------
    chebyshev : double
        The Chebyshev distance between vectors `u` and `v`.

    Raises
    ------
    ValueError
        If no coordinate is left to take the maximum over (empty input or
        no strictly positive weight).

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.chebyshev([1, 0, 0], [0, 1, 0])
    1
    >>> distance.chebyshev([1, 1, 0], [0, 1, 0])
    1

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
        has_weight = w > 0
        # Only pay for the fancy-indexing copy when something is masked out.
        if has_weight.sum() < w.size:
            u = u[has_weight]
            v = v[has_weight]
    # Use the array reduction rather than the builtin ``max``: it runs in
    # native code and propagates NaN regardless of where it sits in the
    # array (the builtin's result depends on element order when NaN is
    # present).
    return abs(u - v).max()

1124 

1125 

def braycurtis(u, v, w=None):
    """
    Compute the Bray-Curtis distance between two 1-D arrays.

    The Bray-Curtis distance is the ratio

    .. math::

       \\sum{|u_i-v_i|} / \\sum{|u_i+v_i|}

    It lies in the range [0, 1] when all coordinates are positive and is
    undefined for inputs of length zero.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    braycurtis : double
        The Bray-Curtis distance between 1-D arrays `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.braycurtis([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.braycurtis([1, 1, 0], [0, 1, 0])
    0.33333333333333331

    """
    u = _validate_vector(u)
    v = _validate_vector(v, dtype=np.float64)
    numerator_terms = abs(u - v)
    denominator_terms = abs(u + v)
    if w is not None:
        weights = _validate_weights(w)
        # The same weights scale both the numerator and denominator terms.
        numerator_terms = weights * numerator_terms
        denominator_terms = weights * denominator_terms
    return numerator_terms.sum() / denominator_terms.sum()

1172 

1173 

def canberra(u, v, w=None):
    """
    Compute the Canberra distance between two 1-D arrays.

    The Canberra distance is the sum of coordinate-wise ratios

    .. math::

         d(u,v) = \\sum_i \\frac{|u_i-v_i|}
                              {|u_i|+|v_i|}.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    canberra : double
        The Canberra distance between vectors `u` and `v`.

    Notes
    -----
    When `u[i]` and `v[i]` are 0 for given i, then the fraction 0/0 = 0 is
    used in the calculation.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.canberra([1, 0, 0], [0, 1, 0])
    2.0
    >>> distance.canberra([1, 1, 0], [0, 1, 0])
    1.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v, dtype=np.float64)
    weights = None if w is None else _validate_weights(w)
    with np.errstate(invalid='ignore'):
        # Coordinates where both inputs are zero give 0/0 = nan; the
        # floating-point warning is silenced here and nansum skips them.
        ratios = abs(u - v) / (abs(u) + abs(v))
        if weights is not None:
            ratios = weights * ratios
        total = np.nansum(ratios)
    return total

1227 

1228 

def jensenshannon(p, q, base=None):
    """
    Compute the Jensen-Shannon distance (metric) between
    two 1-D probability arrays. This is the square root
    of the Jensen-Shannon divergence.

    For probability vectors `p` and `q` the Jensen-Shannon distance is

    .. math::

       \\sqrt{\\frac{D(p \\parallel m) + D(q \\parallel m)}{2}}

    where :math:`m` is the pointwise mean of :math:`p` and :math:`q`
    and :math:`D` is the Kullback-Leibler divergence.

    Both `p` and `q` are divided by their sums before use, so inputs that
    do not sum to 1.0 are normalized.

    Parameters
    ----------
    p : (N,) array_like
        left probability vector
    q : (N,) array_like
        right probability vector
    base : double, optional
        the base of the logarithm used to compute the output
        if not given, then the routine uses the default base of
        scipy.stats.entropy.

    Returns
    -------
    js : double
        The Jensen-Shannon distance between `p` and `q`

    .. versionadded:: 1.2.0

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.jensenshannon([1.0, 0.0, 0.0], [0.0, 1.0, 0.0], 2.0)
    1.0
    >>> distance.jensenshannon([1.0, 0.0], [0.5, 0.5])
    0.46450140402245893
    >>> distance.jensenshannon([1.0, 0.0, 0.0], [1.0, 0.0, 0.0])
    0.0

    """
    p_arr = np.asarray(p)
    q_arr = np.asarray(q)
    # Normalize each input by its own sum.
    p_norm = p_arr / np.sum(p_arr, axis=0)
    q_norm = q_arr / np.sum(q_arr, axis=0)
    mixture = (p_norm + q_norm) / 2.0
    kl_left = np.sum(rel_entr(p_norm, mixture), axis=0)
    kl_right = np.sum(rel_entr(q_norm, mixture), axis=0)
    divergence_sum = kl_left + kl_right
    if base is not None:
        # Change of logarithm base.
        divergence_sum /= np.log(base)
    return np.sqrt(divergence_sum / 2.0)

1287 

1288 

def yule(u, v, w=None):
    """
    Compute the Yule dissimilarity between two boolean 1-D arrays.

    The Yule dissimilarity is defined as

    .. math::

         \\frac{R}{c_{TT} * c_{FF} + \\frac{R}{2}}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n` and :math:`R = 2.0 * c_{TF} * c_{FT}`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    yule : double
        The Yule dissimilarity between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.yule([1, 0, 0], [0, 1, 0])
    2.0
    >>> distance.yule([1, 1, 0], [0, 1, 0])
    0.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
    nff, nft, ntf, ntt = _nbool_correspond_all(u, v, w=w)
    numerator = 2.0 * ntf * nft
    # Wrapping the denominator in a 0-d array makes the division follow
    # NumPy rules before the result is converted to a Python float.
    denominator = np.array(ntt * nff + ntf * nft)
    return float(numerator / denominator)

1333 

1334 

@np.deprecate(message="spatial.distance.matching is deprecated in scipy 1.0.0; "
                      "use spatial.distance.hamming instead.")
def matching(u, v, w=None):
    """
    Compute the Hamming distance between two boolean 1-D arrays.

    This is a deprecated synonym for :func:`hamming`; all arguments are
    forwarded to it unchanged.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`, passed on to
        :func:`hamming`. Default is None.

    Returns
    -------
    matching : double
        Whatever :func:`hamming` returns for the same arguments.
    """
    return hamming(u, v, w=w)

1344 

1345 

def dice(u, v, w=None):
    """
    Compute the Dice dissimilarity between two boolean 1-D arrays.

    The Dice dissimilarity between `u` and `v` is

    .. math::

         \\frac{c_{TF} + c_{FT}}
              {2c_{TT} + c_{FT} + c_{TF}}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) ndarray, bool
        Input 1-D array.
    v : (N,) ndarray, bool
        Input 1-D array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    dice : double
        The Dice dissimilarity between 1-D arrays `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.dice([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.dice([1, 0, 0], [1, 1, 0])
    0.3333333333333333
    >>> distance.dice([1, 0, 0], [2, 0, 0])
    -0.3333333333333333

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)

    both_bool = u.dtype == v.dtype == bool
    if both_bool and w is None:
        # Pure boolean, unweighted: a logical AND counts the TT pairs.
        ntt = (u & v).sum()
    else:
        # Otherwise promote both inputs to a common type that is at
        # least int, and count TT pairs through products.
        common = np.find_common_type([int], [u.dtype, v.dtype])
        u = u.astype(common)
        v = v.astype(common)
        products = u * v
        ntt = products.sum() if w is None else (products * w).sum()

    nft, ntf = _nbool_correspond_ft_tf(u, v, w=w)
    mismatches = ntf + nft
    return float(mismatches / np.array(2.0 * ntt + ntf + nft))

1403 

1404 

def rogerstanimoto(u, v, w=None):
    """
    Compute the Rogers-Tanimoto dissimilarity between two boolean 1-D arrays.

    For boolean 1-D arrays `u` and `v` the Rogers-Tanimoto dissimilarity
    is defined as

    .. math::
       \\frac{R}
            {c_{TT} + c_{FF} + R}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n` and :math:`R = 2(c_{TF} + c_{FT})`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    rogerstanimoto : double
        The Rogers-Tanimoto dissimilarity between vectors
        `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.rogerstanimoto([1, 0, 0], [0, 1, 0])
    0.8
    >>> distance.rogerstanimoto([1, 0, 0], [1, 1, 0])
    0.5
    >>> distance.rogerstanimoto([1, 0, 0], [2, 0, 0])
    -1.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
    nff, nft, ntf, ntt = _nbool_correspond_all(u, v, w=w)
    # R in the formula above: twice the number of disagreeing positions.
    doubled_mismatches = 2.0 * (ntf + nft)
    return float(doubled_mismatches) / float(ntt + nff + doubled_mismatches)

1453 

1454 

def russellrao(u, v, w=None):
    """
    Compute the Russell-Rao dissimilarity between two boolean 1-D arrays.

    For boolean 1-D arrays `u` and `v` the Russell-Rao dissimilarity is
    defined as

    .. math::

      \\frac{n - c_{TT}}
           {n}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    russellrao : double
        The Russell-Rao dissimilarity between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.russellrao([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.russellrao([1, 0, 0], [1, 1, 0])
    0.6666666666666666
    >>> distance.russellrao([1, 0, 0], [2, 0, 0])
    0.3333333333333333

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        # Weighted: the total weight plays the role of n.
        w = _validate_weights(w)
        ntt = (u * v * w).sum()
        n = w.sum()
    else:
        if u.dtype == v.dtype == bool:
            ntt = (u & v).sum()
        else:
            ntt = (u * v).sum()
        n = float(len(u))
    return float(n - ntt) / n

1510 

1511 

def sokalmichener(u, v, w=None):
    """
    Compute the Sokal-Michener dissimilarity between two boolean 1-D arrays.

    The Sokal-Michener dissimilarity between boolean 1-D arrays `u` and `v`,
    is defined as

    .. math::

       \\frac{R}
            {S + R}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`, :math:`R = 2 * (c_{TF} + c_{FT})` and
    :math:`S = c_{FF} + c_{TT}`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0.  When given, the weights
        are applied to all four counts :math:`c_{TT}`, :math:`c_{FF}`,
        :math:`c_{TF}` and :math:`c_{FT}`.

    Returns
    -------
    sokalmichener : double
        The Sokal-Michener dissimilarity between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.sokalmichener([1, 0, 0], [0, 1, 0])
    0.8
    >>> distance.sokalmichener([1, 0, 0], [1, 1, 0])
    0.5
    >>> distance.sokalmichener([1, 0, 0], [2, 0, 0])
    -1.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if u.dtype == v.dtype == bool and w is None:
        ntt = (u & v).sum()
        nff = (~u & ~v).sum()
    elif w is None:
        ntt = (u * v).sum()
        nff = ((1.0 - u) * (1.0 - v)).sum()
    else:
        w = _validate_weights(w)
        ntt = (u * v * w).sum()
        nff = ((1.0 - u) * (1.0 - v) * w).sum()
    # The mismatch counts must use the same weights as ntt/nff; leaving
    # `w` out here would mix weighted and unweighted terms in the ratio.
    (nft, ntf) = _nbool_correspond_ft_tf(u, v, w=w)
    return float(2.0 * (ntf + nft)) / float(ntt + nff + 2.0 * (ntf + nft))

1569 

1570 

def sokalsneath(u, v, w=None):
    """
    Compute the Sokal-Sneath dissimilarity between two boolean 1-D arrays.

    The Sokal-Sneath dissimilarity between `u` and `v` is

    .. math::

       \\frac{R}
            {c_{TT} + R}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n` and :math:`R = 2(c_{TF} + c_{FT})`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    sokalsneath : double
        The Sokal-Sneath dissimilarity between vectors `u` and `v`.

    Raises
    ------
    ValueError
        If the denominator :math:`c_{TT} + R` is zero.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.sokalsneath([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.sokalsneath([1, 0, 0], [1, 1, 0])
    0.66666666666666663
    >>> distance.sokalsneath([1, 0, 0], [2, 1, 0])
    0.0
    >>> distance.sokalsneath([1, 0, 0], [3, 1, 0])
    -2.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
        ntt = (u * v * w).sum()
    elif u.dtype == v.dtype == bool:
        ntt = (u & v).sum()
    else:
        ntt = (u * v).sum()

    nft, ntf = _nbool_correspond_ft_tf(u, v, w=w)
    # R in the formula above.
    doubled_mismatches = 2.0 * (ntf + nft)
    denom = np.array(ntt + doubled_mismatches)
    if not denom.any():
        raise ValueError('Sokal-Sneath dissimilarity is not defined for '
                         'vectors that are entirely false.')
    return float(doubled_mismatches) / denom

1629 

1630 

# `_convert_to_type` with the target type fixed to double / bool.
_convert_to_double = partial(_convert_to_type, out_type=np.double)
_convert_to_bool = partial(_convert_to_type, out_type=bool)

# Attach the Python-only correlation wrappers to the _distance_wrap module
# so they can be looked up by name like the other wrappers.
_distance_wrap.pdist_correlation_double_wrap = _correlation_pdist_wrap
_distance_wrap.cdist_correlation_double_wrap = _correlation_cdist_wrap

# Registry of implemented metrics.
#
# `_METRICS` maps each metric name to a MetricInfo record:
#
#     metric_name -> MetricInfo(aka, types=['double'], validator=None)
#
# `metric_name` must equal the name of the Python function implementing
# the metric (it is looked up in globals() to build `_TEST_METRICS`).
#
# MetricInfo fields:
#   'aka'       : list of accepted aliases for the metric.
#   'types'     : list of supported types.  X (pdist) and XA (cdist) are
#                 used to choose the type; if there is no match the first
#                 type is used.  Defaults to ['double'].
#   'validator' : f(X, m, n, **kwargs), a function that checks kwargs and
#                 computes default values.  Defaults to None.
MetricInfo = namedtuple("MetricInfo", 'aka types validator ')
MetricInfo.__new__.__defaults__ = (['double'], None)

_METRICS = {
    'braycurtis': MetricInfo(aka=['braycurtis']),
    'canberra': MetricInfo(aka=['canberra']),
    'chebyshev': MetricInfo(
        aka=['chebychev', 'chebyshev', 'cheby', 'cheb', 'ch']),
    'cityblock': MetricInfo(aka=['cityblock', 'cblock', 'cb', 'c']),
    'correlation': MetricInfo(aka=['correlation', 'co']),
    'cosine': MetricInfo(aka=['cosine', 'cos']),
    'dice': MetricInfo(aka=['dice'], types=['bool']),
    'euclidean': MetricInfo(aka=['euclidean', 'euclid', 'eu', 'e']),
    'hamming': MetricInfo(
        aka=['matching', 'hamming', 'hamm', 'ha', 'h'],
        types=['double', 'bool'],
        validator=_validate_hamming_kwargs),
    'jaccard': MetricInfo(
        aka=['jaccard', 'jacc', 'ja', 'j'],
        types=['double', 'bool']),
    'jensenshannon': MetricInfo(
        aka=['jensenshannon', 'js'],
        types=['double']),
    'kulsinski': MetricInfo(aka=['kulsinski'], types=['bool']),
    'mahalanobis': MetricInfo(
        aka=['mahalanobis', 'mahal', 'mah'],
        validator=_validate_mahalanobis_kwargs),
    'minkowski': MetricInfo(
        aka=['minkowski', 'mi', 'm', 'pnorm'],
        validator=_validate_minkowski_kwargs),
    'rogerstanimoto': MetricInfo(aka=['rogerstanimoto'], types=['bool']),
    'russellrao': MetricInfo(aka=['russellrao'], types=['bool']),
    'seuclidean': MetricInfo(
        aka=['seuclidean', 'se', 's'],
        validator=_validate_seuclidean_kwargs),
    'sokalmichener': MetricInfo(aka=['sokalmichener'], types=['bool']),
    'sokalsneath': MetricInfo(aka=['sokalsneath'], types=['bool']),
    'sqeuclidean': MetricInfo(aka=['sqeuclidean', 'sqe', 'sqeuclid']),
    'wminkowski': MetricInfo(
        aka=['wminkowski', 'wmi', 'wm', 'wpnorm'],
        validator=_validate_wminkowski_kwargs),
    'yule': MetricInfo(aka=['yule'], types=['bool']),
}


# Reverse lookup: every alias -> canonical metric name.
_METRIC_ALIAS = {alias: name
                 for name, info in _METRICS.items()
                 for alias in info.aka}

_METRICS_NAMES = list(_METRICS)

# 'test_<name>' -> the Python function called <name> in this module.
_TEST_METRICS = {'test_' + name: globals()[name] for name in _METRICS}

1701 

1702 

def _select_weighted_metric(mstr, kwargs, out):
    """
    Pick the metric name to dispatch on when a weight vector may be present.

    Parameters
    ----------
    mstr : str
        Metric name or alias as given by the caller.
    kwargs : dict
        Keyword arguments for the metric.  The dict is copied; the
        caller's object is never modified.
    out : ndarray or None
        Output array; stored under ``'out'`` in the returned kwargs only
        when the metric name is rewritten to its ``test_`` form.

    Returns
    -------
    mstr : str
        Either the input name, or ``"test_" + mstr`` when weights were
        given for a metric that is not handled with weights directly.
    kwargs : dict
        Copy of the keyword arguments, with ``w=None`` removed.

    Raises
    ------
    ValueError
        If weights are given for 'seuclidean' or 'mahalanobis'.
    """
    options = dict(kwargs)

    # w=None is the same as omitting it
    if "w" in options and options["w"] is None:
        del options["w"]

    # "test_" metrics and the wminkowski/hamming families support weights.
    if mstr.startswith("test_"):
        return mstr, options
    if mstr in _METRICS['wminkowski'].aka + _METRICS['hamming'].aka:
        return mstr, options

    # Without weights there is nothing to redirect.
    if "w" not in options:
        return mstr, options

    if (mstr in _METRICS['seuclidean'].aka or
            mstr in _METRICS['mahalanobis'].aka):
        raise ValueError("metric %s incompatible with weights" % mstr)

    # XXX: C-versions do not support weights
    # need to use python version for weighting
    options['out'] = out
    return "test_%s" % mstr, options

1724 

1725 

def pdist(X, metric='euclidean', *args, **kwargs):
    """
    Pairwise distances between observations in n-dimensional space.

    See Notes for common calling conventions.

    Parameters
    ----------
    X : ndarray
        An m by n array of m original observations in an
        n-dimensional space.
    metric : str or function, optional
        The distance metric to use. The distance function can
        be 'braycurtis', 'canberra', 'chebyshev', 'cityblock',
        'correlation', 'cosine', 'dice', 'euclidean', 'hamming',
        'jaccard', 'jensenshannon', 'kulsinski', 'mahalanobis', 'matching',
        'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
        'sokalmichener', 'sokalsneath', 'sqeuclidean', 'wminkowski',
        'yule'.  String names are matched case-insensitively.
    *args : tuple. Deprecated.
        Additional arguments should be passed as keyword arguments
    **kwargs : dict, optional
        Extra arguments to `metric`: refer to each metric documentation for a
        list of all possible arguments.

        Some possible arguments:

        p : scalar
        The p-norm to apply for Minkowski, weighted and unweighted.
        Default: 2.

        w : ndarray
        The weight vector for metrics that support weights (e.g., Minkowski).

        V : ndarray
        The variance vector for standardized Euclidean.
        Default: var(X, axis=0, ddof=1)

        VI : ndarray
        The inverse of the covariance matrix for Mahalanobis.
        Default: inv(cov(X.T)).T

        out : ndarray.
        The output array
        If not None, condensed distance matrix Y is stored in this array.
        It must be a C-contiguous array of doubles with shape
        ``(m * (m - 1) // 2,)``.
        Note: metric independent, it will become a regular keyword arg in a
        future scipy version

    Returns
    -------
    Y : ndarray
        Returns a condensed distance matrix Y.  For
        each :math:`i` and :math:`j` (where :math:`i<j<m`), where m is the
        number of original observations, the metric
        ``dist(u=X[i], v=X[j])`` is computed and stored in entry ``ij``.

    Raises
    ------
    ValueError
        If `X` is not 2-dimensional, if `out` has the wrong shape, is not
        C-contiguous or is not of double type, or if `metric` is a string
        that names no known metric.
    TypeError
        If `metric` is neither a string nor a callable.

    See Also
    --------
    squareform : converts between condensed distance matrices and
                 square distance matrices.

    Notes
    -----
    See ``squareform`` for information on how to calculate the index of
    this entry or to convert the condensed distance matrix to a
    redundant square matrix.

    The following are common calling conventions.

    1. ``Y = pdist(X, 'euclidean')``

       Computes the distance between m points using Euclidean distance
       (2-norm) as the distance metric between the points. The points
       are arranged as m n-dimensional row vectors in the matrix X.

    2. ``Y = pdist(X, 'minkowski', p=2.)``

       Computes the distances using the Minkowski distance
       :math:`||u-v||_p` (p-norm) where :math:`p \\geq 1`.

    3. ``Y = pdist(X, 'cityblock')``

       Computes the city block or Manhattan distance between the
       points.

    4. ``Y = pdist(X, 'seuclidean', V=None)``

       Computes the standardized Euclidean distance. The standardized
       Euclidean distance between two n-vectors ``u`` and ``v`` is

       .. math::

          \\sqrt{\\sum {(u_i-v_i)^2 / V[x_i]}}


       V is the variance vector; V[i] is the variance computed over all
       the i'th components of the points.  If not passed, it is
       automatically computed.

    5. ``Y = pdist(X, 'sqeuclidean')``

       Computes the squared Euclidean distance :math:`||u-v||_2^2` between
       the vectors.

    6. ``Y = pdist(X, 'cosine')``

       Computes the cosine distance between vectors u and v,

       .. math::

          1 - \\frac{u \\cdot v}
                   {{||u||}_2 {||v||}_2}

       where :math:`||*||_2` is the 2-norm of its argument ``*``, and
       :math:`u \\cdot v` is the dot product of ``u`` and ``v``.

    7. ``Y = pdist(X, 'correlation')``

       Computes the correlation distance between vectors u and v. This is

       .. math::

          1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}
                   {{||(u - \\bar{u})||}_2 {||(v - \\bar{v})||}_2}

       where :math:`\\bar{v}` is the mean of the elements of vector v,
       and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`.

    8. ``Y = pdist(X, 'hamming')``

       Computes the normalized Hamming distance, or the proportion of
       those vector elements between two n-vectors ``u`` and ``v``
       which disagree. To save memory, the matrix ``X`` can be of type
       boolean.

    9. ``Y = pdist(X, 'jaccard')``

       Computes the Jaccard distance between the points. Given two
       vectors, ``u`` and ``v``, the Jaccard distance is the
       proportion of those elements ``u[i]`` and ``v[i]`` that
       disagree.

    10. ``Y = pdist(X, 'chebyshev')``

       Computes the Chebyshev distance between the points. The
       Chebyshev distance between two n-vectors ``u`` and ``v`` is the
       maximum norm-1 distance between their respective elements. More
       precisely, the distance is given by

       .. math::

          d(u,v) = \\max_i {|u_i-v_i|}

    11. ``Y = pdist(X, 'canberra')``

       Computes the Canberra distance between the points. The
       Canberra distance between two points ``u`` and ``v`` is

       .. math::

         d(u,v) = \\sum_i \\frac{|u_i-v_i|}
                              {|u_i|+|v_i|}


    12. ``Y = pdist(X, 'braycurtis')``

       Computes the Bray-Curtis distance between the points. The
       Bray-Curtis distance between two points ``u`` and ``v`` is


       .. math::

            d(u,v) = \\frac{\\sum_i {|u_i-v_i|}}
                           {\\sum_i {|u_i+v_i|}}

    13. ``Y = pdist(X, 'mahalanobis', VI=None)``

       Computes the Mahalanobis distance between the points. The
       Mahalanobis distance between two points ``u`` and ``v`` is
       :math:`\\sqrt{(u-v)(1/V)(u-v)^T}` where :math:`(1/V)` (the ``VI``
       variable) is the inverse covariance. If ``VI`` is not None,
       ``VI`` will be used as the inverse covariance matrix.

    14. ``Y = pdist(X, 'yule')``

       Computes the Yule distance between each pair of boolean
       vectors. (see yule function documentation)

    15. ``Y = pdist(X, 'matching')``

       Synonym for 'hamming'.

    16. ``Y = pdist(X, 'dice')``

       Computes the Dice distance between each pair of boolean
       vectors. (see dice function documentation)

    17. ``Y = pdist(X, 'kulsinski')``

       Computes the Kulsinski distance between each pair of
       boolean vectors. (see kulsinski function documentation)

    18. ``Y = pdist(X, 'rogerstanimoto')``

       Computes the Rogers-Tanimoto distance between each pair of
       boolean vectors. (see rogerstanimoto function documentation)

    19. ``Y = pdist(X, 'russellrao')``

       Computes the Russell-Rao distance between each pair of
       boolean vectors. (see russellrao function documentation)

    20. ``Y = pdist(X, 'sokalmichener')``

       Computes the Sokal-Michener distance between each pair of
       boolean vectors. (see sokalmichener function documentation)

    21. ``Y = pdist(X, 'sokalsneath')``

       Computes the Sokal-Sneath distance between each pair of
       boolean vectors. (see sokalsneath function documentation)

    22. ``Y = pdist(X, 'wminkowski', p=2, w=w)``

       Computes the weighted Minkowski distance between each pair of
       vectors. (see wminkowski function documentation)

    23. ``Y = pdist(X, f)``

       Computes the distance between all pairs of vectors in X
       using the user supplied 2-arity function f. For example,
       Euclidean distance between the vectors could be computed
       as follows::

         dm = pdist(X, lambda u, v: np.sqrt(((u-v)**2).sum()))

       Note that you should avoid passing a reference to one of
       the distance functions defined in this library. For example,::

         dm = pdist(X, sokalsneath)

       would calculate the pair-wise distances between the vectors in
       X using the Python function sokalsneath. This would result in
       sokalsneath being called :math:`{n \\choose 2}` times, which
       is inefficient. Instead, the optimized C version is more
       efficient, and we call it using the following syntax.::

         dm = pdist(X, 'sokalsneath')

    """
    # You can also call this as:
    #     Y = pdist(X, 'test_abc')
    # where 'abc' is the metric being tested.  This computes the distance
    # between all pairs of vectors in X using the distance metric 'abc' but
    # with a more succinct, verifiable, but less efficient implementation.

    X = _asarray_validated(X, sparse_ok=False, objects_ok=True, mask_ok=True,
                           check_finite=False)
    # Fold the deprecated positional *args into kwargs.
    kwargs = _args_to_kwargs_xdist(args, kwargs, metric, "pdist")

    X = np.asarray(X, order='c')

    s = X.shape
    if len(s) != 2:
        raise ValueError('A 2-dimensional array must be passed.')

    m, n = s
    # `out` is metric independent, so it is removed from kwargs before they
    # are forwarded to the metric.
    out = kwargs.pop("out", None)
    if out is None:
        # One entry per unordered pair (i, j) with i < j.
        dm = np.empty((m * (m - 1)) // 2, dtype=np.double)
    else:
        if out.shape != (m * (m - 1) // 2,):
            raise ValueError("output array has incorrect shape.")
        if not out.flags.c_contiguous:
            raise ValueError("Output array must be C-contiguous.")
        if out.dtype != np.double:
            raise ValueError("Output array must be double type.")
        dm = out

    # compute blacklist for deprecated kwargs
    # Each branch lists the keyword names that do not apply to the selected
    # metric; the list is handed to _filter_deprecated_kwargs below.
    if(metric in _METRICS['jensenshannon'].aka
       or metric == 'test_jensenshannon' or metric == jensenshannon):
        kwargs_blacklist = ["p", "w", "V", "VI"]

    elif(metric in _METRICS['minkowski'].aka
         or metric in _METRICS['wminkowski'].aka
         or metric in ['test_minkowski', 'test_wminkowski']
         or metric in [minkowski, wminkowski]):
        kwargs_blacklist = ["V", "VI"]

    elif(metric in _METRICS['seuclidean'].aka or
         metric == 'test_seuclidean' or metric == seuclidean):
        kwargs_blacklist = ["p", "w", "VI"]

    elif(metric in _METRICS['mahalanobis'].aka
         or metric == 'test_mahalanobis' or metric == mahalanobis):
        kwargs_blacklist = ["p", "w", "V"]

    else:
        kwargs_blacklist = ["p", "V", "VI"]

    _filter_deprecated_kwargs(kwargs, kwargs_blacklist)

    if callable(metric):
        # A callable whose __name__ matches a registered metric or alias
        # still gets its input and kwargs validated like the string form.
        mstr = getattr(metric, '__name__', 'UnknownCustomMetric')
        metric_name = _METRIC_ALIAS.get(mstr, None)

        if metric_name is not None:
            X, typ, kwargs = _validate_pdist_input(X, m, n,
                                                   metric_name, **kwargs)

        # Fill the condensed matrix row by row: (0,1), (0,2), ..., (1,2), ...
        k = 0
        for i in range(0, m - 1):
            for j in range(i + 1, m):
                dm[k] = metric(X[i], X[j], **kwargs)
                k = k + 1

    elif isinstance(metric, str):
        mstr = metric.lower()

        # May rewrite the name to its "test_" form when weights are given.
        mstr, kwargs = _select_weighted_metric(mstr, kwargs, out)

        metric_name = _METRIC_ALIAS.get(mstr, None)

        if metric_name is not None:
            X, typ, kwargs = _validate_pdist_input(X, m, n,
                                                   metric_name, **kwargs)

            # get pdist wrapper
            pdist_fn = getattr(_distance_wrap,
                               "pdist_%s_%s_wrap" % (metric_name, typ))
            pdist_fn(X, dm, **kwargs)
            return dm

        elif mstr in ['old_cosine', 'old_cos']:
            warnings.warn('"old_cosine" is deprecated and will be removed in '
                          'a future version. Use "cosine" instead.',
                          DeprecationWarning)
            X = _convert_to_double(X)
            # Row-wise 2-norms of X.
            norms = np.einsum('ij,ij->i', X, X, dtype=np.double)
            np.sqrt(norms, out=norms)
            nV = norms.reshape(m, 1)
            # The numerator u * v
            nm = np.dot(X, X.T)
            # The denom. ||u||*||v||
            de = np.dot(nV, nV.T)
            # NOTE(review): `dm` is rebound to a new array here, so a
            # caller-supplied `out` is not written to on this path.
            dm = 1.0 - (nm / de)
            dm[range(0, m), range(0, m)] = 0.0
            dm = squareform(dm)
        elif mstr.startswith("test_"):
            if mstr in _TEST_METRICS:
                # Recurse with the Python callable registered for this name.
                # NOTE(review): `dm` is rebound here as well; `out` is only
                # forwarded if _select_weighted_metric stored it in kwargs.
                dm = pdist(X, _TEST_METRICS[mstr], **kwargs)
            else:
                raise ValueError('Unknown "Test" Distance Metric: %s' % mstr[5:])
        else:
            raise ValueError('Unknown Distance Metric: %s' % mstr)
    else:
        raise TypeError('2nd argument metric must be a string identifier '
                        'or a function.')
    return dm

2085 

2086 

def squareform(X, force="no", checks=True):
    """
    Convert between the condensed (vector) and redundant (square) forms
    of a distance matrix.

    Parameters
    ----------
    X : ndarray
        Either a condensed or redundant distance matrix.
    force : str, optional
        As with MATLAB(TM), if force is equal to ``'tovector'`` or
        ``'tomatrix'``, the input will be treated as a distance matrix or
        distance vector respectively, and an error is raised when its
        dimensionality does not match.
    checks : bool, optional
        If set to False, no checks will be made for matrix
        symmetry nor zero diagonals. This is useful if it is known that
        ``X - X.T`` is small and ``diag(X)`` is close to zero.
        These values are ignored any way so they do not disrupt the
        squareform transformation.

    Returns
    -------
    Y : ndarray
        If a condensed distance matrix is passed, a redundant one is
        returned, or if a redundant one is passed, a condensed distance
        matrix is returned.

    Raises
    ------
    ValueError
        If `force` contradicts the dimensionality of `X`, if a vector's
        length is not a binomial coefficient ``n choose 2``, if a
        two-dimensional input is not square, or if `X` has more than two
        dimensions.

    Notes
    -----
    1. ``v = squareform(X)``

       Given a square n-by-n symmetric distance matrix ``X``,
       ``v = squareform(X)`` returns a ``n * (n-1) / 2``
       (i.e. binomial coefficient n choose 2) sized vector `v`
       where :math:`v[{n \\choose 2} - {n-i \\choose 2} + (j-i-1)]`
       is the distance between distinct points ``i`` and ``j``.
       If ``X`` is non-square or asymmetric, an error is raised.

    2. ``X = squareform(v)``

       Given a ``n * (n-1) / 2`` sized vector ``v``
       for some integer ``n >= 1`` encoding distances as described,
       ``X = squareform(v)`` returns a n-by-n distance matrix ``X``.
       The ``X[i, j]`` and ``X[j, i]`` values are set to
       :math:`v[{n \\choose 2} - {n-i \\choose 2} + (j-i-1)]`
       and all diagonal elements are zero.

    In SciPy 0.19.0, ``squareform`` stopped casting all input types to
    float64, and started returning arrays of the same dtype as the input.

    """
    X = np.ascontiguousarray(X)
    shape = X.shape
    ndim = len(shape)

    # Honour an explicitly requested conversion direction first.
    mode = force.lower()
    if mode == 'tomatrix' and ndim != 1:
        raise ValueError("Forcing 'tomatrix' but input X is not a "
                         "distance vector.")
    if mode == 'tovector' and ndim != 2:
        raise ValueError("Forcing 'tovector' but input X is not a "
                         "distance matrix.")

    if ndim not in (1, 2):
        raise ValueError(('The first argument must be one or two dimensional '
                          'array. A %d-dimensional array is not '
                          'permitted') % ndim)

    if ndim == 1:
        # Condensed vector -> square matrix.
        n_entries = shape[0]
        if n_entries == 0:
            return np.zeros((1, 1), dtype=X.dtype)

        # Candidate side length: the integer nearest above sqrt(2 * len),
        # valid only if len is exactly side-choose-2.
        side = int(np.ceil(np.sqrt(n_entries * 2)))
        if side * (side - 1) != n_entries * 2:
            raise ValueError('Incompatible vector size. It must be a binomial '
                             'coefficient n choose 2 for some integer n >= 2.')

        square = np.zeros((side, side), dtype=X.dtype)

        # The C routine works from dimensions rather than strides, so it
        # is handed a copy whenever X is backed by another array.
        X = _copy_array_if_base_present(X)
        _distance_wrap.to_squareform_from_vector_wrap(square, X)
        return square

    # Square matrix -> condensed vector.
    n_rows, n_cols = shape
    if n_rows != n_cols:
        raise ValueError('The matrix argument must be square.')
    if checks:
        is_valid_dm(X, throw=True, name='X')

    if n_rows <= 1:
        return np.array([], dtype=X.dtype)

    condensed = np.zeros((n_rows * (n_rows - 1)) // 2, dtype=X.dtype)

    # Same stride limitation as above.
    X = _copy_array_if_base_present(X)
    _distance_wrap.to_vector_from_squareform_wrap(X, condensed)
    return condensed

2205 

2206 

def is_valid_dm(D, tol=0.0, throw=False, name="D", warning=False):
    """
    Return True if input array is a valid distance matrix.

    Distance matrices must be 2-dimensional, square numpy arrays.
    They must have a zero-diagonal, and they must be symmetric.

    Parameters
    ----------
    D : ndarray
        The candidate object to test for validity.
    tol : float, optional
        The distance matrix should be symmetric. `tol` is the maximum
        difference between entries ``ij`` and ``ji`` for the distance
        metric to be considered symmetric. It is also the maximum
        absolute value allowed on the diagonal.
    throw : bool, optional
        An exception is thrown if the distance matrix passed is not valid.
    name : str, optional
        The name of the variable to checked. This is useful if
        throw is set to True so the offending variable can be identified
        in the exception message when an exception is thrown.
    warning : bool, optional
        Instead of throwing an exception, a warning message is
        raised.

    Returns
    -------
    valid : bool
        True if the variable `D` passed is a valid distance matrix.

    Raises
    ------
    ValueError
        Only when `throw` is True and `D` is not two-dimensional, not
        square, not symmetric, or has a non-zero diagonal (each within
        `tol` where applicable).

    Notes
    -----
    Small numerical differences in `D` and `D.T` and non-zeroness of
    the diagonal are ignored if they are within the tolerance specified
    by `tol`.

    """
    D = np.asarray(D, order='c')
    valid = True
    try:
        # Common message prefix; the quoted name appears only when a
        # non-empty `name` was given.
        if name:
            label = 'Distance matrix \'%s\'' % name
        else:
            label = 'Distance matrix'

        s = D.shape
        if len(s) != 2:
            raise ValueError(label + ' must have shape=2 (i.e. be '
                                     'two-dimensional).')
        # Without this check a 1-by-n input broadcasts against its
        # transpose and can be reported as a valid distance matrix.
        if s[0] != s[1]:
            raise ValueError(label + ' must be square.')

        diag = np.diagonal(D)
        if tol == 0.0:
            if not (D == D.T).all():
                raise ValueError(label + ' must be symmetric.')
            if not (diag == 0).all():
                raise ValueError(label + ' diagonal must be zero.')
        else:
            # D - D.T is antisymmetric, so bounding every entry from above
            # bounds the absolute difference as well.
            if not (D - D.T <= tol).all():
                raise ValueError(label + (' must be symmetric within '
                                          'tolerance %5.5f.' % tol))
            # The diagonal has no mirrored entry, so the absolute value is
            # needed to reject large negative values too.
            if not (np.abs(diag) <= tol).all():
                raise ValueError(label + (' diagonal must be close to zero '
                                          'within tolerance %5.5f.' % tol))
    except Exception as e:
        if throw:
            raise
        if warning:
            warnings.warn(str(e))
        valid = False
    return valid

2293 

2294 

def is_valid_y(y, warning=False, throw=False, name=None):
    """
    Return True if the input array is a valid condensed distance matrix.

    Condensed distance matrices must be 1-dimensional numpy arrays.
    Their length must be a binomial coefficient :math:`{n \\choose 2}`
    for some positive integer n.

    Parameters
    ----------
    y : ndarray
        The condensed distance matrix.
    warning : bool, optional
        Invokes a warning if the variable passed is not a valid
        condensed distance matrix. The warning message explains why
        the distance matrix is not valid. `name` is used when
        referencing the offending variable.
    throw : bool, optional
        Throws an exception if the variable passed is not a valid
        condensed distance matrix.
    name : str, optional
        Used when referencing the offending variable in the
        warning or exception message.

    Returns
    -------
    valid : bool
        True if the variable `y` passed is a valid condensed distance
        matrix.

    Raises
    ------
    ValueError
        Only when `throw` is True and `y` is not one-dimensional or its
        length is not a binomial coefficient ``k choose 2``.

    """
    y = np.asarray(y, order='c')
    valid = True
    try:
        # Quoted variable name inserted into the messages when provided.
        suffix = (' \'%s\'' % name) if name else ''

        if len(y.shape) != 1:
            raise ValueError('Condensed distance matrix' + suffix +
                             ' must have shape=1 (i.e. be one-dimensional).')

        n = y.shape[0]
        # Candidate number of observations; the length is valid only if it
        # equals d-choose-2 exactly (compared in integers to avoid any
        # floating-point division).
        d = int(np.ceil(np.sqrt(n * 2)))
        if d * (d - 1) != 2 * n:
            raise ValueError('Length n of condensed distance matrix' + suffix +
                             ' must be a binomial coefficient, i.e. there '
                             'must be a k such that (k \\choose 2)=n)!')
    except Exception as e:
        if throw:
            raise
        if warning:
            warnings.warn(str(e))
        valid = False
    return valid

2350 

2351 

def num_obs_dm(d):
    """
    Return the number of original observations that correspond to a
    square, redundant distance matrix.

    Parameters
    ----------
    d : ndarray
        The target distance matrix.

    Returns
    -------
    num_obs_dm : int
        The number of observations in the redundant distance matrix,
        i.e. the length of its first axis.

    """
    dist_matrix = np.asarray(d, order='c')
    # An infinite tolerance relaxes the symmetry and diagonal comparisons;
    # throw=True turns any remaining validation failure into an exception.
    is_valid_dm(dist_matrix, tol=np.inf, throw=True, name='d')
    n_obs = dist_matrix.shape[0]
    return n_obs

2371 

2372 

def num_obs_y(Y):
    """
    Return the number of original observations that correspond to a
    condensed distance matrix.

    Parameters
    ----------
    Y : ndarray
        Condensed distance matrix.

    Returns
    -------
    n : int
        The number of observations in the condensed distance matrix `Y`.

    Raises
    ------
    ValueError
        If `Y` is empty, or if its length is not a binomial coefficient
        ``n choose 2``.

    """
    condensed = np.asarray(Y, order='c')
    is_valid_y(condensed, throw=True, name='Y')

    length = condensed.shape[0]
    if length == 0:
        raise ValueError("The number of observations cannot be determined on "
                         "an empty distance matrix.")

    # Invert length = n * (n - 1) / 2 for n and confirm the result is exact.
    n_obs = int(np.ceil(np.sqrt(length * 2)))
    if n_obs * (n_obs - 1) != 2 * length:
        raise ValueError("Invalid condensed distance matrix passed. Must be "
                         "some k where k=(n choose 2) for some n >= 2.")
    return n_obs

2400 

2401 

def cdist(XA, XB, metric='euclidean', *args, **kwargs):
    """
    Compute distance between each pair of the two collections of inputs.

    See Notes for common calling conventions.

    Parameters
    ----------
    XA : ndarray
        An :math:`m_A` by :math:`n` array of :math:`m_A`
        original observations in an :math:`n`-dimensional space.
        Inputs are converted to float type.
    XB : ndarray
        An :math:`m_B` by :math:`n` array of :math:`m_B`
        original observations in an :math:`n`-dimensional space.
        Inputs are converted to float type.
    metric : str or callable, optional
        The distance metric to use. If a string, the distance function can be
        'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation',
        'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'jensenshannon',
        'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
        'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
        'sqeuclidean', 'wminkowski', 'yule'.
    *args : tuple. Deprecated.
        Additional arguments should be passed as keyword arguments
    **kwargs : dict, optional
        Extra arguments to `metric`: refer to each metric documentation for a
        list of all possible arguments.

        Some possible arguments:

        p : scalar
        The p-norm to apply for Minkowski, weighted and unweighted.
        Default: 2.

        w : ndarray
        The weight vector for metrics that support weights (e.g., Minkowski).

        V : ndarray
        The variance vector for standardized Euclidean.
        Default: var(vstack([XA, XB]), axis=0, ddof=1)

        VI : ndarray
        The inverse of the covariance matrix for Mahalanobis.
        Default: inv(cov(vstack([XA, XB]).T)).T

        out : ndarray
        The output array
        If not None, the distance matrix Y is stored in this array.
        It must have shape :math:`(m_A, m_B)`, be C-contiguous and of
        double type.
        Note: metric independent, it will become a regular keyword arg in a
        future scipy version

    Returns
    -------
    Y : ndarray
        A :math:`m_A` by :math:`m_B` distance matrix is returned.
        For each :math:`i` and :math:`j`, the metric
        ``dist(u=XA[i], v=XB[j])`` is computed and stored in the
        :math:`ij` th entry.

    Raises
    ------
    ValueError
        An exception is thrown if `XA` and `XB` do not have
        the same number of columns, if either is not 2-dimensional, if
        `out` has the wrong shape, layout or dtype, or if `metric` is an
        unknown string.
    TypeError
        If `metric` is neither a string nor a callable.

    Notes
    -----
    The following are common calling conventions:

    1. ``Y = cdist(XA, XB, 'euclidean')``

       Computes the distance between each pair of points taken from
       ``XA`` and ``XB`` using Euclidean distance (2-norm) as the
       distance metric between the points. The points are arranged as
       :math:`n`-dimensional row vectors in the matrices ``XA`` and
       ``XB``.

    2. ``Y = cdist(XA, XB, 'minkowski', p=2.)``

       Computes the distances using the Minkowski distance
       :math:`||u-v||_p` (:math:`p`-norm) where :math:`p \\geq 1`.

    3. ``Y = cdist(XA, XB, 'cityblock')``

       Computes the city block or Manhattan distance between the
       points.

    4. ``Y = cdist(XA, XB, 'seuclidean', V=None)``

       Computes the standardized Euclidean distance. The standardized
       Euclidean distance between two n-vectors ``u`` and ``v`` is

       .. math::

          \\sqrt{\\sum {(u_i-v_i)^2 / V[x_i]}}.

       V is the variance vector; V[i] is the variance computed over all
       the i'th components of the points. If not passed, it is
       automatically computed.

    5. ``Y = cdist(XA, XB, 'sqeuclidean')``

       Computes the squared Euclidean distance :math:`||u-v||_2^2` between
       the vectors.

    6. ``Y = cdist(XA, XB, 'cosine')``

       Computes the cosine distance between vectors u and v,

       .. math::

          1 - \\frac{u \\cdot v}
                   {{||u||}_2 {||v||}_2}

       where :math:`||*||_2` is the 2-norm of its argument ``*``, and
       :math:`u \\cdot v` is the dot product of :math:`u` and :math:`v`.

    7. ``Y = cdist(XA, XB, 'correlation')``

       Computes the correlation distance between vectors u and v. This is

       .. math::

          1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}
                   {{||(u - \\bar{u})||}_2 {||(v - \\bar{v})||}_2}

       where :math:`\\bar{v}` is the mean of the elements of vector v,
       and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`.

    8. ``Y = cdist(XA, XB, 'hamming')``

       Computes the normalized Hamming distance, or the proportion of
       those vector elements between two n-vectors ``u`` and ``v``
       which disagree. To save memory, the matrices ``XA`` and ``XB``
       can be of type boolean.

    9. ``Y = cdist(XA, XB, 'jaccard')``

       Computes the Jaccard distance between the points. Given two
       vectors, ``u`` and ``v``, the Jaccard distance is the
       proportion of those elements ``u[i]`` and ``v[i]`` that
       disagree where at least one of them is non-zero.

    10. ``Y = cdist(XA, XB, 'chebyshev')``

        Computes the Chebyshev distance between the points. The
        Chebyshev distance between two n-vectors ``u`` and ``v`` is the
        maximum norm-1 distance between their respective elements. More
        precisely, the distance is given by

        .. math::

           d(u,v) = \\max_i {|u_i-v_i|}.

    11. ``Y = cdist(XA, XB, 'canberra')``

        Computes the Canberra distance between the points. The
        Canberra distance between two points ``u`` and ``v`` is

        .. math::

           d(u,v) = \\sum_i \\frac{|u_i-v_i|}
                                {|u_i|+|v_i|}.

    12. ``Y = cdist(XA, XB, 'braycurtis')``

        Computes the Bray-Curtis distance between the points. The
        Bray-Curtis distance between two points ``u`` and ``v`` is

        .. math::

           d(u,v) = \\frac{\\sum_i (|u_i-v_i|)}
                         {\\sum_i (|u_i+v_i|)}

    13. ``Y = cdist(XA, XB, 'mahalanobis', VI=None)``

        Computes the Mahalanobis distance between the points. The
        Mahalanobis distance between two points ``u`` and ``v`` is
        :math:`\\sqrt{(u-v)(1/V)(u-v)^T}` where :math:`(1/V)` (the ``VI``
        variable) is the inverse covariance. If ``VI`` is not None,
        ``VI`` will be used as the inverse covariance matrix.

    14. ``Y = cdist(XA, XB, 'yule')``

        Computes the Yule distance between the boolean
        vectors. (see `yule` function documentation)

    15. ``Y = cdist(XA, XB, 'matching')``

        Synonym for 'hamming'.

    16. ``Y = cdist(XA, XB, 'dice')``

        Computes the Dice distance between the boolean vectors. (see
        `dice` function documentation)

    17. ``Y = cdist(XA, XB, 'kulsinski')``

        Computes the Kulsinski distance between the boolean
        vectors. (see `kulsinski` function documentation)

    18. ``Y = cdist(XA, XB, 'rogerstanimoto')``

        Computes the Rogers-Tanimoto distance between the boolean
        vectors. (see `rogerstanimoto` function documentation)

    19. ``Y = cdist(XA, XB, 'russellrao')``

        Computes the Russell-Rao distance between the boolean
        vectors. (see `russellrao` function documentation)

    20. ``Y = cdist(XA, XB, 'sokalmichener')``

        Computes the Sokal-Michener distance between the boolean
        vectors. (see `sokalmichener` function documentation)

    21. ``Y = cdist(XA, XB, 'sokalsneath')``

        Computes the Sokal-Sneath distance between the vectors. (see
        `sokalsneath` function documentation)

    22. ``Y = cdist(XA, XB, 'wminkowski', p=2., w=w)``

        Computes the weighted Minkowski distance between the
        vectors. (see `wminkowski` function documentation)

    23. ``Y = cdist(XA, XB, f)``

        Computes the distance between all pairs of vectors taken from
        ``XA`` and ``XB`` using the user supplied 2-arity function f.
        For example, Euclidean distance between the vectors could be
        computed as follows::

          dm = cdist(XA, XB, lambda u, v: np.sqrt(((u-v)**2).sum()))

        Note that you should avoid passing a reference to one of
        the distance functions defined in this library. For example,::

          dm = cdist(XA, XB, sokalsneath)

        would calculate the pair-wise distances between the vectors in
        ``XA`` and ``XB`` using the Python function `sokalsneath`. This
        would result in sokalsneath being called :math:`m_A m_B` times,
        which is inefficient. Instead, the optimized C version is more
        efficient, and we call it using the following syntax::

          dm = cdist(XA, XB, 'sokalsneath')

    Examples
    --------
    Find the Euclidean distances between four 2-D coordinates:

    >>> from scipy.spatial import distance
    >>> coords = [(35.0456, -85.2672),
    ...           (35.1174, -89.9711),
    ...           (35.9728, -83.9422),
    ...           (36.1667, -86.7833)]
    >>> distance.cdist(coords, coords, 'euclidean')
    array([[ 0.    ,  4.7044,  1.6172,  1.8856],
           [ 4.7044,  0.    ,  6.0893,  3.3561],
           [ 1.6172,  6.0893,  0.    ,  2.8477],
           [ 1.8856,  3.3561,  2.8477,  0.    ]])


    Find the Manhattan distance from a 3-D point to the corners of the unit
    cube:

    >>> a = np.array([[0, 0, 0],
    ...               [0, 0, 1],
    ...               [0, 1, 0],
    ...               [0, 1, 1],
    ...               [1, 0, 0],
    ...               [1, 0, 1],
    ...               [1, 1, 0],
    ...               [1, 1, 1]])
    >>> b = np.array([[ 0.1,  0.2,  0.4]])
    >>> distance.cdist(a, b, 'cityblock')
    array([[ 0.7],
           [ 0.9],
           [ 1.3],
           [ 1.5],
           [ 1.5],
           [ 1.7],
           [ 2.1],
           [ 2.3]])

    """
    # You can also call this as:
    #     Y = cdist(XA, XB, 'test_abc')
    # where 'abc' is the metric being tested.  This computes the distance
    # between all pairs of vectors in XA and XB using the distance metric 'abc'
    # but with a more succinct, verifiable, but less efficient implementation.

    kwargs = _args_to_kwargs_xdist(args, kwargs, metric, "cdist")

    XA = np.asarray(XA, order='c')
    XB = np.asarray(XB, order='c')

    s = XA.shape
    sB = XB.shape

    if len(s) != 2:
        raise ValueError('XA must be a 2-dimensional array.')
    if len(sB) != 2:
        raise ValueError('XB must be a 2-dimensional array.')
    if s[1] != sB[1]:
        raise ValueError('XA and XB must have the same number of columns '
                         '(i.e. feature dimension.)')

    mA = s[0]
    mB = sB[0]
    n = s[1]

    # `out` is metric independent, so it is taken out of kwargs before they
    # are forwarded to the metric.
    out = kwargs.pop("out", None)
    if out is None:
        dm = np.empty((mA, mB), dtype=np.double)
    else:
        if out.shape != (mA, mB):
            raise ValueError("Output array has incorrect shape.")
        if not out.flags.c_contiguous:
            raise ValueError("Output array must be C-contiguous.")
        if out.dtype != np.double:
            raise ValueError("Output array must be double type.")
        dm = out

    # compute blacklist for deprecated kwargs
    if (metric in _METRICS['minkowski'].aka or
            metric in _METRICS['wminkowski'].aka or
            metric in ['test_minkowski', 'test_wminkowski'] or
            metric in [minkowski, wminkowski]):
        kwargs_blacklist = ["V", "VI"]
    elif (metric in _METRICS['seuclidean'].aka or
            metric == 'test_seuclidean' or metric == seuclidean):
        kwargs_blacklist = ["p", "w", "VI"]
    elif (metric in _METRICS['mahalanobis'].aka or
            metric == 'test_mahalanobis' or metric == mahalanobis):
        kwargs_blacklist = ["p", "w", "V"]
    else:
        kwargs_blacklist = ["p", "V", "VI"]

    _filter_deprecated_kwargs(kwargs, kwargs_blacklist)

    if callable(metric):

        mstr = getattr(metric, '__name__', 'Unknown')
        metric_name = _METRIC_ALIAS.get(mstr, None)

        XA, XB, typ, kwargs = _validate_cdist_input(XA, XB, mA, mB, n,
                                                    metric_name, **kwargs)

        # Python-level fallback: one call of `metric` per (row of XA,
        # row of XB) pair.
        for i in range(0, mA):
            for j in range(0, mB):
                dm[i, j] = metric(XA[i], XB[j], **kwargs)

    elif isinstance(metric, str):
        mstr = metric.lower()

        mstr, kwargs = _select_weighted_metric(mstr, kwargs, out)

        metric_name = _METRIC_ALIAS.get(mstr, None)
        if metric_name is not None:
            XA, XB, typ, kwargs = _validate_cdist_input(XA, XB, mA, mB, n,
                                                        metric_name, **kwargs)
            # get cdist wrapper
            cdist_fn = getattr(_distance_wrap,
                               "cdist_%s_%s_wrap" % (metric_name, typ))
            cdist_fn(XA, XB, dm, **kwargs)
            return dm

        elif mstr.startswith("test_"):
            if mstr in _TEST_METRICS:
                # `out` was popped from kwargs above; hand it back so the
                # recursive call fills the caller's buffer instead of
                # allocating a new array.  setdefault avoids a duplicate
                # keyword should kwargs already carry an 'out' entry here.
                kwargs.setdefault("out", out)
                dm = cdist(XA, XB, _TEST_METRICS[mstr], **kwargs)
            else:
                raise ValueError('Unknown "Test" Distance Metric: %s' % mstr[5:])
        else:
            raise ValueError('Unknown Distance Metric: %s' % mstr)
    else:
        raise TypeError('2nd argument metric must be a string identifier '
                        'or a function.')
    return dm