Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/scipy/spatial/distance.py : 11%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Distance computations (:mod:`scipy.spatial.distance`)
3=====================================================
5.. sectionauthor:: Damian Eads
7Function reference
8------------------
10Distance matrix computation from a collection of raw observation vectors
11stored in a rectangular array.
13.. autosummary::
14 :toctree: generated/
16 pdist -- pairwise distances between observation vectors.
17 cdist -- distances between two collections of observation vectors
18 squareform -- convert distance matrix to a condensed one and vice versa
19 directed_hausdorff -- directed Hausdorff distance between arrays
21Predicates for checking the validity of distance matrices, both
22condensed and redundant. Also contained in this module are functions
23for computing the number of observations in a distance matrix.
25.. autosummary::
26 :toctree: generated/
28 is_valid_dm -- checks for a valid distance matrix
29 is_valid_y -- checks for a valid condensed distance matrix
30 num_obs_dm -- # of observations in a distance matrix
31 num_obs_y -- # of observations in a condensed distance matrix
33Distance functions between two numeric vectors ``u`` and ``v``. Computing
34distances over a large collection of vectors is inefficient for these
35functions. Use ``pdist`` for this purpose.
37.. autosummary::
38 :toctree: generated/
40 braycurtis -- the Bray-Curtis distance.
41 canberra -- the Canberra distance.
42 chebyshev -- the Chebyshev distance.
43 cityblock -- the Manhattan distance.
44 correlation -- the Correlation distance.
45 cosine -- the Cosine distance.
46 euclidean -- the Euclidean distance.
47 jensenshannon -- the Jensen-Shannon distance.
48 mahalanobis -- the Mahalanobis distance.
49 minkowski -- the Minkowski distance.
50 seuclidean -- the normalized Euclidean distance.
51 sqeuclidean -- the squared Euclidean distance.
52 wminkowski -- (deprecated) alias of `minkowski`.
54Distance functions between two boolean vectors (representing sets) ``u`` and
55``v``. As in the case of numerical vectors, ``pdist`` is more efficient for
56computing the distances between all pairs.
58.. autosummary::
59 :toctree: generated/
61 dice -- the Dice dissimilarity.
62 hamming -- the Hamming distance.
63 jaccard -- the Jaccard distance.
64 kulsinski -- the Kulsinski distance.
65 rogerstanimoto -- the Rogers-Tanimoto dissimilarity.
66 russellrao -- the Russell-Rao dissimilarity.
67 sokalmichener -- the Sokal-Michener dissimilarity.
68 sokalsneath -- the Sokal-Sneath dissimilarity.
69 yule -- the Yule dissimilarity.
71:func:`hamming` also operates over discrete numerical vectors.
72"""
74# Copyright (C) Damian Eads, 2007-2008. New BSD License.
# Public API of this module, in alphabetical order.
__all__ = [
    'braycurtis', 'canberra', 'cdist', 'chebyshev', 'cityblock',
    'correlation', 'cosine', 'dice', 'directed_hausdorff', 'euclidean',
    'hamming', 'is_valid_dm', 'is_valid_y', 'jaccard', 'jensenshannon',
    'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'num_obs_dm',
    'num_obs_y', 'pdist', 'rogerstanimoto', 'russellrao', 'seuclidean',
    'sokalmichener', 'sokalsneath', 'sqeuclidean', 'squareform',
    'wminkowski', 'yule',
]
111import warnings
112import numpy as np
114from functools import partial
115from collections import namedtuple
116from scipy._lib._util import _asarray_validated
118from . import _distance_wrap
119from . import _hausdorff
120from ..linalg import norm
121from ..special import rel_entr
def _args_to_kwargs_xdist(args, kwargs, metric, func_name):
    """
    Convert legacy positional arguments to keyword arguments for pdist/cdist.

    Parameters
    ----------
    args : sequence
        Extra positional metric parameters given by the caller.
    kwargs : dict
        Keyword metric parameters given by the caller. It is updated in
        place with the converted positional values and also returned.
    metric : str or callable
        The requested metric. Positional parameters are only accepted for
        string metrics and for the distance functions of this module.
    func_name : str
        ``'pdist'`` selects the legacy order ``p, w, V, VI``; any other
        value selects ``p, V, VI, w``.

    Returns
    -------
    kwargs : dict
        The same dict object, with one entry added per positional argument.

    Raises
    ------
    TypeError
        If positional parameters are combined with a custom callable
        metric, or a parameter is given both positionally and by keyword.
    ValueError
        If more than four positional parameters are given.
    """
    if not args:
        return kwargs

    if (callable(metric) and metric not in [
            braycurtis, canberra, chebyshev, cityblock, correlation, cosine,
            dice, euclidean, hamming, jaccard, jensenshannon, kulsinski,
            mahalanobis, matching, minkowski, rogerstanimoto, russellrao,
            seuclidean, sokalmichener, sokalsneath, sqeuclidean, yule,
            wminkowski]):
        raise TypeError('When using a custom metric arguments must be passed '
                        'as keyword (i.e., ARGNAME=ARGVALUE)')

    if func_name == 'pdist':
        old_arg_names = ['p', 'w', 'V', 'VI']
    else:
        old_arg_names = ['p', 'V', 'VI', 'w']

    num_args = len(args)
    # The pieces below are adjacent string literals: each one needs its own
    # separating space or the sentences run together in the emitted text.
    warnings.warn('%d metric parameters have been passed as positional. '
                  'This will raise an error in a future version. '
                  'Please pass arguments as keywords '
                  '(i.e., ARGNAME=ARGVALUE)'
                  % num_args, DeprecationWarning)

    if num_args > 4:
        raise ValueError('Deprecated %s signature accepts only 4 '
                         'positional arguments (%s), %d given.'
                         % (func_name, ', '.join(old_arg_names), num_args))

    for old_arg, arg in zip(old_arg_names, args):
        if old_arg in kwargs:
            raise TypeError('%s() got multiple values for argument %s'
                            % (func_name, old_arg))
        kwargs[old_arg] = arg
    return kwargs
def _copy_array_if_base_present(a):
    """Return ``a.copy()`` when ``a.base`` is set, otherwise `a` itself."""
    owns_its_data = a.base is None
    return a if owns_its_data else a.copy()
def _correlation_cdist_wrap(XA, XB, dm, **kwargs):
    """
    Run the compiled cosine cdist routine on row-centered inputs.

    The mean of every row is subtracted from `XA` and from `XB` (the inputs
    themselves are not modified), after which the centered arrays, `dm` and
    any keyword arguments are handed to
    ``_distance_wrap.cdist_cosine_double_wrap``.
    """
    centered_a = XA - XA.mean(axis=1, keepdims=True)
    centered_b = XB - XB.mean(axis=1, keepdims=True)
    _distance_wrap.cdist_cosine_double_wrap(centered_a, centered_b, dm,
                                            **kwargs)
def _correlation_pdist_wrap(X, dm, **kwargs):
    """
    Run the compiled cosine pdist routine on a row-centered copy of `X`.

    The mean of every row is subtracted from `X` (without modifying `X`),
    then the centered array, `dm` and any keyword arguments are handed to
    ``_distance_wrap.pdist_cosine_double_wrap``.
    """
    row_means = X.mean(axis=1, keepdims=True)
    centered = X - row_means
    _distance_wrap.pdist_cosine_double_wrap(centered, dm, **kwargs)
def _convert_to_type(X, out_type):
    """Return `X` as a C-contiguous ndarray with dtype `out_type`."""
    converted = np.ascontiguousarray(X, dtype=out_type)
    return converted
def _filter_deprecated_kwargs(kwargs, args_blacklist):
    """
    Drop blacklisted keys from `kwargs` in place, warning once per key.

    Every name in `args_blacklist` that is present in `kwargs` is removed
    and reported through a ``DeprecationWarning``. Nothing is returned.
    """
    for name in args_blacklist:
        if name not in kwargs:
            continue
        kwargs.pop(name)
        warnings.warn('Got unexpected kwarg %s. This will raise an error'
                      ' in a future version.' % name, DeprecationWarning)
def _nbool_correspond_all(u, v, w=None):
    """
    Count the four truth-value combinations of two vectors.

    Parameters
    ----------
    u, v : ndarray
        Input vectors. Non-boolean values are used arithmetically as they
        are (``1 - x`` stands in for "not x"), so they are expected to be
        0/1 valued.
    w : ndarray, optional
        Per-position weights. When given, each position contributes its
        weight instead of 1.

    Returns
    -------
    (nff, nft, ntf, ntt) : tuple
        The (weighted) number of positions where `u`/`v` are respectively
        False/False, False/True, True/False and True/True.
    """
    if u.dtype == v.dtype == bool and w is None:
        # Fast path: pure boolean inputs can be combined bitwise.
        not_u = ~u
        not_v = ~v
        nff = (not_u & not_v).sum()
        nft = (not_u & v).sum()
        ntf = (u & not_v).sum()
        ntt = (u & v).sum()
    else:
        # Promote to at least the default integer type. `np.result_type`
        # replaces `np.find_common_type`, which is deprecated and no longer
        # available in NumPy 2.
        dtype = np.result_type(int, u.dtype, v.dtype)
        u = u.astype(dtype)
        v = v.astype(dtype)
        not_u = 1.0 - u
        not_v = 1.0 - v
        if w is not None:
            # Fold the weights into the `u` side only so that every product
            # below is weighted exactly once.
            not_u = w * not_u
            u = w * u
        nff = (not_u * not_v).sum()
        nft = (not_u * v).sum()
        ntf = (u * not_v).sum()
        ntt = (u * v).sum()
    return (nff, nft, ntf, ntt)
def _nbool_correspond_ft_tf(u, v, w=None):
    """
    Count the positions where exactly one of two vectors is true.

    Parameters
    ----------
    u, v : ndarray
        Input vectors. Non-boolean values are used arithmetically as they
        are (``1 - x`` stands in for "not x"), so they are expected to be
        0/1 valued.
    w : ndarray, optional
        Per-position weights. When given, each position contributes its
        weight instead of 1.

    Returns
    -------
    (nft, ntf) : tuple
        The (weighted) number of positions where `u` is False and `v` is
        True, and where `u` is True and `v` is False.
    """
    if u.dtype == v.dtype == bool and w is None:
        # Fast path: pure boolean inputs can be combined bitwise.
        not_u = ~u
        not_v = ~v
        nft = (not_u & v).sum()
        ntf = (u & not_v).sum()
    else:
        # Promote to at least the default integer type. `np.result_type`
        # replaces `np.find_common_type`, which is deprecated and no longer
        # available in NumPy 2.
        dtype = np.result_type(int, u.dtype, v.dtype)
        u = u.astype(dtype)
        v = v.astype(dtype)
        not_u = 1.0 - u
        not_v = 1.0 - v
        if w is not None:
            # Weight the `u` side only so each product is weighted once.
            not_u = w * not_u
            u = w * u
        nft = (not_u * v).sum()
        ntf = (u * not_v).sum()
    return (nft, ntf)
def _validate_cdist_input(XA, XB, mA, mB, n, metric_name, **kwargs):
    """
    Prepare the two input arrays and the metric keywords for cdist.

    When `metric_name` is None nothing is converted and the returned type
    is None. Otherwise both arrays are converted to a dtype supported by
    the metric's ``_METRICS`` entry (the dtype of `XA` when it is listed,
    else the first listed type) and, if the entry has a validator, the
    keywords are passed through it together with the stacked inputs.

    Returns
    -------
    XA, XB, typ, kwargs
        The (possibly converted) arrays, the chosen type and the
        (possibly validated) keyword dict.
    """
    if metric_name is None:
        return XA, XB, None, kwargs

    metric_info = _METRICS[metric_name]
    supported = metric_info.types
    # Keep the input dtype when the metric supports it; otherwise fall back
    # to the first supported type.
    if XA.dtype in supported:
        typ = supported[supported.index(XA.dtype)]
    else:
        typ = supported[0]

    XA = _convert_to_type(XA, out_type=typ)
    XB = _convert_to_type(XB, out_type=typ)

    validator = metric_info.validator
    if validator:
        stacked = np.vstack([XA, XB])
        kwargs = validator(stacked, mA + mB, n, **kwargs)
    return XA, XB, typ, kwargs
def _validate_hamming_kwargs(X, m, n, **kwargs):
    """
    Validate the ``w`` keyword of the Hamming metric.

    Parameters
    ----------
    X : ndarray
        Observations (unused here; part of the common validator signature).
    m : int
        Number of observations (unused here).
    n : int
        Required length of the weight vector.
    **kwargs
        Metric keywords. ``w`` may be any array_like; when it is missing or
        None, unit weights of length `n` are used.

    Returns
    -------
    kwargs : dict
        The keywords with ``w`` replaced by the validated weight vector.

    Raises
    ------
    ValueError
        If ``w`` is not one-dimensional with exactly `n` entries.
    """
    w = kwargs.get('w')
    if w is None:
        w = np.ones((n,), dtype='double')
    # Coerce first so that lists/tuples can be shape-checked too.
    w = np.asarray(w)

    if w.ndim != 1 or w.shape[0] != n:
        # `w.size` (rather than `w.shape[0]`) keeps the message well defined
        # for 0-d input; for 1-D input both are the same number.
        raise ValueError("Weights must have same size as input vector. "
                         "%d vs. %d" % (w.size, n))

    kwargs['w'] = _validate_weights(w)
    return kwargs
def _validate_mahalanobis_kwargs(X, m, n, **kwargs):
    """
    Make sure the Mahalanobis keywords contain an inverse covariance ``VI``.

    When ``VI`` is absent (or None) it is estimated from `X` as the
    transposed inverse of ``np.cov(X.T)``; this requires more observations
    (`m`) than dimensions (`n`). The result is stored back as a double
    array under ``"VI"``.

    Raises
    ------
    ValueError
        If ``VI`` has to be estimated but ``m <= n``.
    """
    VI = kwargs.pop('VI', None)
    must_estimate = VI is None
    if must_estimate and m <= n:
        # There are fewer observations than the dimension of
        # the observations.
        raise ValueError("The number of observations (%d) is too "
                         "small; the covariance matrix is "
                         "singular. For observations with %d "
                         "dimensions, at least %d observations "
                         "are required." % (m, n, n + 1))
    if must_estimate:
        covariance = np.atleast_2d(np.cov(X.astype(np.double).T))
        VI = np.linalg.inv(covariance).T.copy()
    kwargs["VI"] = _convert_to_double(VI)
    return kwargs
def _validate_minkowski_kwargs(X, m, n, **kwargs):
    """Return the Minkowski keywords, with ``p`` defaulting to 2."""
    kwargs.setdefault('p', 2.)
    return kwargs
def _validate_pdist_input(X, m, n, metric_name, **kwargs):
    """
    Prepare the input array and the metric keywords for pdist.

    When `metric_name` is None nothing is converted and the returned type
    is None. Otherwise `X` is converted to a dtype supported by the
    metric's ``_METRICS`` entry (its own dtype when that is listed, else
    the first listed type) and, if the entry has a validator, the keywords
    are passed through it.

    Returns
    -------
    X, typ, kwargs
        The (possibly converted) array, the chosen type and the (possibly
        validated) keyword dict.
    """
    if metric_name is None:
        return X, None, kwargs

    metric_info = _METRICS[metric_name]
    supported = metric_info.types
    # Keep the input dtype when the metric supports it; otherwise fall back
    # to the first supported type.
    if X.dtype in supported:
        typ = supported[supported.index(X.dtype)]
    else:
        typ = supported[0]

    X = _convert_to_type(X, out_type=typ)

    validator = metric_info.validator
    if validator:
        kwargs = validator(X, m, n, **kwargs)
    return X, typ, kwargs
def _validate_seuclidean_kwargs(X, m, n, **kwargs):
    """
    Make sure the standardized-Euclidean keywords contain a variance ``V``.

    When ``V`` is absent (or None) it is computed from `X` as the
    per-column sample variance (``ddof=1``). A user-supplied ``V`` must be
    one-dimensional with `n` entries. The result is stored back as a
    double array under ``'V'``.

    Raises
    ------
    ValueError
        If a supplied ``V`` is not 1-D or does not have length `n`.
    """
    V = kwargs.pop('V', None)
    if V is not None:
        V = np.asarray(V, order='c')
        if V.ndim != 1:
            raise ValueError('Variance vector V must '
                             'be one-dimensional.')
        if V.shape[0] != n:
            raise ValueError('Variance vector V must be of the same '
                             'dimension as the vectors on which the distances '
                             'are computed.')
    else:
        V = np.var(X.astype(np.double), axis=0, ddof=1)
    kwargs['V'] = _convert_to_double(V)
    return kwargs
def _validate_vector(u, dtype=None):
    """
    Coerce `u` to a 1-D ndarray.

    Length-one axes are squeezed away and scalars become one-element
    arrays. A ``ValueError`` is raised when more than one dimension
    remains after squeezing.
    """
    # XXX Is order='c' really necessary?
    squeezed = np.squeeze(np.asarray(u, dtype=dtype, order='c'))
    # Ensure values such as u=1 and u=[1] still return 1-D arrays.
    vector = np.atleast_1d(squeezed)
    if vector.ndim <= 1:
        return vector
    raise ValueError("Input vector should be 1-D.")
def _validate_weights(w, dtype=np.double):
    """
    Coerce `w` to a 1-D array of `dtype` and reject negative entries.

    Raises ``ValueError`` if any weight is below zero.
    """
    weights = _validate_vector(w, dtype=dtype)
    if (weights < 0).any():
        raise ValueError("Input weights should be all non-negative")
    return weights
def _validate_wminkowski_kwargs(X, m, n, **kwargs):
    """
    Validate the keywords of the weighted Minkowski metric.

    ``w`` is mandatory and is replaced by its validated form; ``p``
    defaults to 2 when not supplied.

    Raises
    ------
    ValueError
        If ``w`` is missing or None.
    """
    weights = kwargs.pop('w', None)
    if weights is None:
        raise ValueError('weighted minkowski requires a weight '
                         'vector `w` to be given.')
    kwargs['w'] = _validate_weights(weights)
    kwargs.setdefault('p', 2.)
    return kwargs
def directed_hausdorff(u, v, seed=0):
    """
    Compute the directed Hausdorff distance between two N-D arrays.

    Pairwise point distances are measured with the Euclidean metric.

    Parameters
    ----------
    u : (M,N) ndarray
        Input array.
    v : (O,N) ndarray
        Input array.
    seed : int or None
        Local `numpy.random.RandomState` seed. Default is 0, a random
        shuffling of u and v that guarantees reproducibility.

    Returns
    -------
    d : double
        The directed Hausdorff distance between arrays `u` and `v`.

    index_1 : int
        Index of the point contributing to the Hausdorff pair in `u`.

    index_2 : int
        Index of the point contributing to the Hausdorff pair in `v`.

    Raises
    ------
    ValueError
        If `u` and `v` do not have the same number of columns.

    Notes
    -----
    Uses the early break technique and the random sampling approach
    described by [1]_. Although worst-case performance is ``O(m * o)``
    (as with the brute force algorithm), this is unlikely in practice
    as the input data would have to require the algorithm to explore
    every single point interaction, and after the algorithm shuffles
    the input points at that. The best case performance is O(m), which
    is satisfied by selecting an inner loop distance that is less than
    cmax and leads to an early break as often as possible. The authors
    have formally shown that the average runtime is closer to O(m).

    .. versionadded:: 0.19.0

    References
    ----------
    .. [1] A. A. Taha and A. Hanbury, "An efficient algorithm for
           calculating the exact Hausdorff distance." IEEE Transactions On
           Pattern Analysis And Machine Intelligence, vol. 37 pp. 2153-63,
           2015.

    See Also
    --------
    scipy.spatial.procrustes : Another similarity test for two data sets

    Examples
    --------
    Find the directed Hausdorff distance between two 2-D arrays of
    coordinates:

    >>> from scipy.spatial.distance import directed_hausdorff
    >>> u = np.array([(1.0, 0.0),
    ...               (0.0, 1.0),
    ...               (-1.0, 0.0),
    ...               (0.0, -1.0)])
    >>> v = np.array([(2.0, 0.0),
    ...               (0.0, 2.0),
    ...               (-2.0, 0.0),
    ...               (0.0, -4.0)])

    >>> directed_hausdorff(u, v)[0]
    2.23606797749979
    >>> directed_hausdorff(v, u)[0]
    3.0

    Find the general (symmetric) Hausdorff distance between two 2-D
    arrays of coordinates:

    >>> max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0])
    3.0

    Find the indices of the points that generate the Hausdorff distance
    (the Hausdorff pair):

    >>> directed_hausdorff(v, u)[1:]
    (3, 3)

    """
    points_u = np.asarray(u, dtype=np.float64, order='c')
    points_v = np.asarray(v, dtype=np.float64, order='c')
    same_width = points_u.shape[1] == points_v.shape[1]
    if not same_width:
        raise ValueError('u and v need to have the same '
                         'number of columns')
    return _hausdorff.directed_hausdorff(points_u, points_v, seed)
def minkowski(u, v, p=2, w=None):
    """
    Compute the Minkowski distance between two 1-D arrays.

    The Minkowski distance between 1-D arrays `u` and `v`,
    is defined as

    .. math::

       {||u-v||}_p = (\\sum{|u_i - v_i|^p})^{1/p}.

    When weights `w` are given, the weighted form is used instead:

    .. math::

       \\left(\\sum{w_i(|(u_i - v_i)|^p)}\\right)^{1/p}.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    p : scalar
        The order of the norm of the difference :math:`{||u-v||}_p`.
        Must be at least 1.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    minkowski : double
        The Minkowski distance between vectors `u` and `v`.

    Raises
    ------
    ValueError
        If `p` is smaller than 1.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.minkowski([1, 0, 0], [0, 1, 0], 1)
    2.0
    >>> distance.minkowski([1, 0, 0], [0, 1, 0], 2)
    1.4142135623730951
    >>> distance.minkowski([1, 0, 0], [0, 1, 0], 3)
    1.2599210498948732
    >>> distance.minkowski([1, 1, 0], [0, 1, 0], 1)
    1.0
    >>> distance.minkowski([1, 1, 0], [0, 1, 0], 2)
    1.0
    >>> distance.minkowski([1, 1, 0], [0, 1, 0], 3)
    1.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if p < 1:
        raise ValueError("p must be at least 1")
    u_v = u - v
    if w is not None:
        w = _validate_weights(w)
        # Scaling the difference by w**(1/p) before taking the p-norm is
        # the same as weighting each |u_i - v_i|**p term by w_i.
        if p == 1:
            root_w = w
        elif p == 2:
            # better precision and speed
            root_w = np.sqrt(w)
        else:
            root_w = np.power(w, 1.0 / p)
        u_v = root_w * u_v
    dist = norm(u_v, ord=p)
    return dist
# `minkowski` gained weights in scipy 1.0. Once we're at, say, version 1.3,
# deprecate `wminkowski`. Not done at once because it would be annoying for
# downstream libraries that use `wminkowski` and support multiple scipy
# versions.
def wminkowski(u, v, p, w):
    """
    Compute the weighted Minkowski distance between two 1-D arrays.

    The weighted Minkowski distance between `u` and `v`, defined as

    .. math::

       \\left(\\sum{(|w_i (u_i - v_i)|^p)}\\right)^{1/p}.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    p : int
        The order of the norm of the difference :math:`{||u-v||}_p`.
    w : (N,) array_like
        The weight vector.

    Returns
    -------
    wminkowski : double
        The weighted Minkowski distance between vectors `u` and `v`.

    Notes
    -----
    `wminkowski` is DEPRECATED. It implements a definition where weights
    are powered. It is recommended to use the weighted version of `minkowski`
    instead. This function will be removed in a future version of scipy.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.wminkowski([1, 0, 0], [0, 1, 0], 1, np.ones(3))
    2.0
    >>> distance.wminkowski([1, 0, 0], [0, 1, 0], 2, np.ones(3))
    1.4142135623730951
    >>> distance.wminkowski([1, 0, 0], [0, 1, 0], 3, np.ones(3))
    1.2599210498948732
    >>> distance.wminkowski([1, 1, 0], [0, 1, 0], 1, np.ones(3))
    1.0
    >>> distance.wminkowski([1, 1, 0], [0, 1, 0], 2, np.ones(3))
    1.0
    >>> distance.wminkowski([1, 1, 0], [0, 1, 0], 3, np.ones(3))
    1.0

    """
    weights = _validate_weights(w)
    # `minkowski` applies its weights unpowered, so raise them to `p` here
    # to obtain the "powered weights" definition given above.
    powered_weights = weights ** p
    return minkowski(u, v, p=p, w=powered_weights)
def euclidean(u, v, w=None):
    """
    Computes the Euclidean distance between two 1-D arrays.

    The Euclidean distance between 1-D arrays `u` and `v`, is defined as

    .. math::

       {||u-v||}_2

    or, when weights `w` are given,

    .. math::

       \\left(\\sum{(w_i |(u_i - v_i)|^2)}\\right)^{1/2}

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    euclidean : double
        The Euclidean distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.euclidean([1, 0, 0], [0, 1, 0])
    1.4142135623730951
    >>> distance.euclidean([1, 1, 0], [0, 1, 0])
    1.0

    """
    # The Euclidean distance is the Minkowski distance of order 2.
    distance = minkowski(u, v, p=2, w=w)
    return distance
def sqeuclidean(u, v, w=None):
    """
    Compute the squared Euclidean distance between two 1-D arrays.

    The squared Euclidean distance between `u` and `v` is defined as

    .. math::

       {||u-v||}_2^2

    or, when weights `w` are given,

    .. math::

       \\left(\\sum{(w_i |(u_i - v_i)|^2)}\\right)

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    sqeuclidean : double
        The squared Euclidean distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.sqeuclidean([1, 0, 0], [0, 1, 0])
    2.0
    >>> distance.sqeuclidean([1, 1, 0], [0, 1, 0])
    1.0

    """
    # Preserve float dtypes, but convert everything else to np.float64
    # for stability.
    u_is_inexact = hasattr(u, "dtype") and np.issubdtype(u.dtype, np.inexact)
    v_is_inexact = hasattr(v, "dtype") and np.issubdtype(v.dtype, np.inexact)
    utype = None if u_is_inexact else np.float64
    vtype = None if v_is_inexact else np.float64

    u = _validate_vector(u, dtype=utype)
    v = _validate_vector(v, dtype=vtype)
    diff = u - v
    # Weights go on one factor of the dot product only, so that each
    # squared term is weighted exactly once.
    weighted_diff = diff
    if w is not None:
        weighted_diff = _validate_weights(w) * diff
    return np.dot(diff, weighted_diff)
def correlation(u, v, w=None, centered=True):
    """
    Compute the correlation distance between two 1-D arrays.

    The correlation distance between `u` and `v`, is
    defined as

    .. math::

        1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}
                  {{||(u - \\bar{u})||}_2 {||(v - \\bar{v})||}_2}

    where :math:`\\bar{u}` is the mean of the elements of `u`
    and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0
    centered : bool, optional
        If True (the default), the (weighted) means of `u` and `v` are
        subtracted before the distance is computed. If False, the inputs
        are used as they are.

    Returns
    -------
    correlation : double
        The correlation distance between 1-D array `u` and `v`.

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)

    def weighted_mean(values):
        # Plain mean when `w` is None, weighted mean otherwise.
        return np.average(values, weights=w)

    if centered:
        u_mean = weighted_mean(u)
        v_mean = weighted_mean(v)
        u = u - u_mean
        v = v - v_mean
    cross = weighted_mean(u * v)
    u_power = weighted_mean(np.square(u))
    v_power = weighted_mean(np.square(v))
    return 1.0 - cross / np.sqrt(u_power * v_power)
def cosine(u, v, w=None):
    """
    Compute the Cosine distance between 1-D arrays.

    The Cosine distance between `u` and `v`, is defined as

    .. math::

        1 - \\frac{u \\cdot v}
                  {||u||_2 ||v||_2}.

    where :math:`u \\cdot v` is the dot product of :math:`u` and
    :math:`v`.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    cosine : double
        The Cosine distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.cosine([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.cosine([100, 0, 0], [0, 1, 0])
    1.0
    >>> distance.cosine([1, 1, 0], [0, 1, 0])
    0.29289321881345254

    """
    # cosine distance is also referred to as 'uncentered correlation',
    #   or 'reflective correlation'
    uncentered = correlation(u, v, w=w, centered=False)
    return uncentered
def hamming(u, v, w=None):
    """
    Compute the Hamming distance between two 1-D arrays.

    The Hamming distance between 1-D arrays `u` and `v`, is simply the
    proportion of disagreeing components in `u` and `v`. If `u` and `v` are
    boolean vectors, the Hamming distance is

    .. math::

       \\frac{c_{01} + c_{10}}{n}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    hamming : double
        The Hamming distance between vectors `u` and `v`.

    Raises
    ------
    ValueError
        If `u` and `v` do not have the same shape.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.hamming([1, 0, 0], [0, 1, 0])
    0.66666666666666663
    >>> distance.hamming([1, 0, 0], [1, 1, 0])
    0.33333333333333331
    >>> distance.hamming([1, 0, 0], [2, 0, 0])
    0.33333333333333331
    >>> distance.hamming([1, 0, 0], [3, 0, 0])
    0.33333333333333331

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if u.shape != v.shape:
        raise ValueError('The 1d arrays must have equal lengths.')
    mismatches = np.not_equal(u, v)
    weights = None if w is None else _validate_weights(w)
    # The (weighted) mean of the mismatch mask is the disagreeing fraction.
    return np.average(mismatches, weights=weights)
def jaccard(u, v, w=None):
    """
    Compute the Jaccard-Needham dissimilarity between two boolean 1-D arrays.

    The Jaccard-Needham dissimilarity between 1-D boolean arrays `u` and `v`,
    is defined as

    .. math::

       \\frac{c_{TF} + c_{FT}}
            {c_{TT} + c_{FT} + c_{TF}}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    jaccard : double
        The Jaccard distance between vectors `u` and `v`.

    Notes
    -----
    When both `u` and `v` lead to a `0/0` division i.e. there is no overlap
    between the items in the vectors the returned distance is 0. See the
    Wikipedia page on the Jaccard index [1]_, and this paper [2]_.

    .. versionchanged:: 1.2.0
        Previously, when `u` and `v` lead to a `0/0` division, the function
        would return NaN. This was changed to return 0 instead.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Jaccard_index
    .. [2] S. Kosub, "A note on the triangle inequality for the Jaccard
       distance", 2016, Available online: https://arxiv.org/pdf/1612.02696.pdf

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.jaccard([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.jaccard([1, 0, 0], [1, 1, 0])
    0.5
    >>> distance.jaccard([1, 0, 0], [1, 2, 0])
    0.5
    >>> distance.jaccard([1, 0, 0], [1, 1, 1])
    0.66666666666666663

    """
    u = _validate_vector(u)
    v = _validate_vector(v)

    # Positions where at least one input is non-zero form the denominator;
    # those among them where the inputs differ form the numerator.
    either_set = (u != 0) | (v != 0)
    differing = (u != v) & either_set
    if w is not None:
        w = _validate_weights(w)
        either_set = w * either_set
        differing = w * differing
    numerator = np.double(differing.sum())
    denominator = np.double(either_set.sum())
    if denominator != 0:
        return numerator / denominator
    return 0
def kulsinski(u, v, w=None):
    """
    Compute the Kulsinski dissimilarity between two boolean 1-D arrays.

    The Kulsinski dissimilarity between two boolean 1-D arrays `u` and `v`,
    is defined as

    .. math::

         \\frac{c_{TF} + c_{FT} - c_{TT} + n}
              {c_{FT} + c_{TF} + n}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    kulsinski : double
        The Kulsinski distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.kulsinski([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.kulsinski([1, 0, 0], [1, 1, 0])
    0.75
    >>> distance.kulsinski([1, 0, 0], [2, 1, 0])
    0.33333333333333331
    >>> distance.kulsinski([1, 0, 0], [3, 1, 0])
    -0.5

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
        # With weights, `n` is the total weight rather than the length.
        n = w.sum()
    else:
        n = float(len(u))
    _, nft, ntf, ntt = _nbool_correspond_all(u, v, w=w)

    disagreements = ntf + nft
    return (disagreements - ntt + n) / (disagreements + n)
def seuclidean(u, v, V):
    """
    Return the standardized Euclidean distance between two 1-D arrays.

    The standardized Euclidean distance between `u` and `v` is the
    Euclidean distance with each squared difference weighted by ``1/V``.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    V : (N,) array_like
        `V` is an 1-D array of component variances. It is usually computed
        among a larger collection vectors.

    Returns
    -------
    seuclidean : double
        The standardized Euclidean distance between vectors `u` and `v`.

    Raises
    ------
    TypeError
        If `u`, `v` and `V` do not all have the same length.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [0.1, 0.1, 0.1])
    4.4721359549995796
    >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [1, 0.1, 0.1])
    3.3166247903553998
    >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [10, 0.1, 0.1])
    3.1780497164141406

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    V = _validate_vector(V, dtype=np.float64)
    lengths_agree = V.shape[0] == u.shape[0] == v.shape[0]
    if not lengths_agree:
        raise TypeError('V must be a 1-D array of the same dimension '
                        'as u and v.')
    inverse_variance = 1 / V
    return euclidean(u, v, w=inverse_variance)
def cityblock(u, v, w=None):
    """
    Compute the City Block (Manhattan) distance.

    Computes the Manhattan distance between two 1-D arrays `u` and `v`,
    which is defined as

    .. math::

       \\sum_i {\\left| u_i - v_i \\right|}.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    cityblock : double
        The City Block (Manhattan) distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.cityblock([1, 0, 0], [0, 1, 0])
    2
    >>> distance.cityblock([1, 0, 0], [0, 2, 0])
    3
    >>> distance.cityblock([1, 0, 0], [1, 1, 0])
    1

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    abs_diff = np.abs(u - v)
    if w is None:
        return abs_diff.sum()
    weights = _validate_weights(w)
    return (weights * abs_diff).sum()
def mahalanobis(u, v, VI):
    """
    Compute the Mahalanobis distance between two 1-D arrays.

    The Mahalanobis distance between 1-D arrays `u` and `v`, is defined as

    .. math::

       \\sqrt{ (u-v) V^{-1} (u-v)^T }

    where ``V`` is the covariance matrix.  Note that the argument `VI`
    is the inverse of ``V``.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    VI : ndarray
        The inverse of the covariance matrix.

    Returns
    -------
    mahalanobis : double
        The Mahalanobis distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> iv = [[1, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 1]]
    >>> distance.mahalanobis([1, 0, 0], [0, 1, 0], iv)
    1.0
    >>> distance.mahalanobis([0, 2, 0], [0, 1, 0], iv)
    1.0
    >>> distance.mahalanobis([2, 0, 0], [0, 1, 0], iv)
    1.7320508075688772

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    VI = np.atleast_2d(VI)
    delta = u - v
    # Quadratic form delta . VI . delta, evaluated left to right.
    left_product = np.dot(delta, VI)
    squared_distance = np.dot(left_product, delta)
    return np.sqrt(squared_distance)
def chebyshev(u, v, w=None):
    """
    Compute the Chebyshev distance.

    Computes the Chebyshev distance between two 1-D arrays `u` and `v`,
    which is defined as

    .. math::

       \\max_i {|u_i-v_i|}.

    Parameters
    ----------
    u : (N,) array_like
        Input vector.
    v : (N,) array_like
        Input vector.
    w : (N,) array_like, optional
        Weights. Their magnitude does not scale the result ('max' is a
        weightless operation); components whose weight is zero are left
        out of the maximum. Here for API consistency.

    Returns
    -------
    chebyshev : double
        The Chebyshev distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.chebyshev([1, 0, 0], [0, 1, 0])
    1
    >>> distance.chebyshev([1, 1, 0], [0, 1, 0])
    1

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
        keep = w > 0
        # Only slice when at least one component actually has zero weight.
        if not keep.all():
            u = u[keep]
            v = v[keep]
    return max(abs(u - v))
def braycurtis(u, v, w=None):
    """
    Compute the Bray-Curtis distance between two 1-D arrays.

    Bray-Curtis distance is defined as

    .. math::

       \\sum{|u_i-v_i|} / \\sum{|u_i+v_i|}

    The Bray-Curtis distance is in the range [0, 1] if all coordinates are
    positive, and is undefined if the inputs are of length zero.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    braycurtis : double
        The Bray-Curtis distance between 1-D arrays `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.braycurtis([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.braycurtis([1, 1, 0], [0, 1, 0])
    0.33333333333333331

    """
    u = _validate_vector(u)
    v = _validate_vector(v, dtype=np.float64)
    numerator_terms = np.abs(u - v)
    denominator_terms = np.abs(u + v)
    if w is not None:
        w = _validate_weights(w)
        numerator_terms = w * numerator_terms
        denominator_terms = w * denominator_terms
    numerator = numerator_terms.sum()
    denominator = denominator_terms.sum()
    return numerator / denominator
def canberra(u, v, w=None):
    """
    Compute the Canberra distance between two 1-D arrays.

    The Canberra distance is defined as

    .. math::

         d(u,v) = \\sum_i \\frac{|u_i-v_i|}
                              {|u_i|+|v_i|}.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    canberra : double
        The Canberra distance between vectors `u` and `v`.

    Notes
    -----
    When `u[i]` and `v[i]` are 0 for given i, then the fraction 0/0 = 0 is
    used in the calculation.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.canberra([1, 0, 0], [0, 1, 0])
    2.0
    >>> distance.canberra([1, 1, 0], [0, 1, 0])
    1.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v, dtype=np.float64)
    weights = None if w is None else _validate_weights(w)
    # 0/0 terms evaluate to NaN; silence the resulting floating-point
    # warning here and let `nansum` below count those terms as zero.
    with np.errstate(invalid='ignore'):
        numerators = np.abs(u - v)
        denominators = np.abs(u) + np.abs(v)
        terms = numerators / denominators
        if weights is not None:
            terms = weights * terms
    return np.nansum(terms)
def jensenshannon(p, q, base=None):
    """
    Compute the Jensen-Shannon distance (metric) between
    two 1-D probability arrays. This is the square root
    of the Jensen-Shannon divergence.

    The Jensen-Shannon distance between two probability
    vectors `p` and `q` is defined as,

    .. math::

       \\sqrt{\\frac{D(p \\parallel m) + D(q \\parallel m)}{2}}

    where :math:`m` is the pointwise mean of :math:`p` and :math:`q`
    and :math:`D` is the Kullback-Leibler divergence.

    This routine will normalize `p` and `q` if they don't sum to 1.0.

    Parameters
    ----------
    p : (N,) array_like
        left probability vector
    q : (N,) array_like
        right probability vector
    base : double, optional
        the base of the logarithm used to compute the output
        if not given, then the routine uses the default base of
        scipy.stats.entropy.

    Returns
    -------
    js : double
        The Jensen-Shannon distance between `p` and `q`

    .. versionadded:: 1.2.0

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.jensenshannon([1.0, 0.0, 0.0], [0.0, 1.0, 0.0], 2.0)
    1.0
    >>> distance.jensenshannon([1.0, 0.0], [0.5, 0.5])
    0.46450140402245893
    >>> distance.jensenshannon([1.0, 0.0, 0.0], [1.0, 0.0, 0.0])
    0.0

    """
    p = np.asarray(p)
    q = np.asarray(q)
    # Normalise both inputs so that each sums to one along axis 0.
    p = p / np.sum(p, axis=0)
    q = q / np.sum(q, axis=0)
    mixture = (p + q) / 2.0
    # Sum of the two divergences D(p || m) and D(q || m).
    divergence = (np.sum(rel_entr(p, mixture), axis=0)
                  + np.sum(rel_entr(q, mixture), axis=0))
    if base is not None:
        # Convert from natural logarithm to the requested base.
        divergence /= np.log(base)
    return np.sqrt(divergence / 2.0)
def yule(u, v, w=None):
    """
    Compute the Yule dissimilarity between two boolean 1-D arrays.

    The Yule dissimilarity is defined as

    .. math::

         \\frac{R}{c_{TT} * c_{FF} + \\frac{R}{2}}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n` and :math:`R = 2.0 * c_{TF} * c_{FT}`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    yule : double
        The Yule dissimilarity between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.yule([1, 0, 0], [0, 1, 0])
    2.0
    >>> distance.yule([1, 1, 0], [0, 1, 0])
    0.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
    nff, nft, ntf, ntt = _nbool_correspond_all(u, v, w=w)
    # R = 2 * c_TF * c_FT; the denominator is c_TT * c_FF + R / 2.
    numerator = 2.0 * ntf * nft
    denominator = np.array(ntt * nff + ntf * nft)
    return float(numerator / denominator)
@np.deprecate(message="spatial.distance.matching is deprecated in scipy 1.0.0; "
                      "use spatial.distance.hamming instead.")
def matching(u, v, w=None):
    """
    Compute the Hamming distance between two boolean 1-D arrays.

    This is a deprecated synonym for :func:`hamming`; all arguments are
    forwarded to it unchanged.
    """
    distance = hamming(u, v, w=w)
    return distance
def dice(u, v, w=None):
    """
    Compute the Dice dissimilarity between two boolean 1-D arrays.

    The Dice dissimilarity between `u` and `v`, is

    .. math::

         \\frac{c_{TF} + c_{FT}}
              {2c_{TT} + c_{FT} + c_{TF}}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) ndarray, bool
        Input 1-D array.
    v : (N,) ndarray, bool
        Input 1-D array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    dice : double
        The Dice dissimilarity between 1-D arrays `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.dice([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.dice([1, 0, 0], [1, 1, 0])
    0.3333333333333333
    >>> distance.dice([1, 0, 0], [2, 0, 0])
    -0.3333333333333333

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
    if u.dtype == v.dtype == bool and w is None:
        # Pure boolean, unweighted input: count positions where both are True.
        ntt = (u & v).sum()
    else:
        # Promote to at least a Python-int-sized dtype before multiplying.
        # np.result_type replaces np.find_common_type, which is deprecated
        # and no longer available in NumPy 2.0.
        dtype = np.result_type(int, u.dtype, v.dtype)
        u = u.astype(dtype)
        v = v.astype(dtype)
        if w is None:
            ntt = (u * v).sum()
        else:
            ntt = (u * v * w).sum()
    (nft, ntf) = _nbool_correspond_ft_tf(u, v, w=w)
    return float((ntf + nft) / np.array(2.0 * ntt + ntf + nft))
def rogerstanimoto(u, v, w=None):
    """
    Compute the Rogers-Tanimoto dissimilarity between two boolean 1-D arrays.

    The Rogers-Tanimoto dissimilarity between two boolean 1-D arrays
    `u` and `v`, is defined as

    .. math::
       \\frac{R}
            {c_{TT} + c_{FF} + R}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n` and :math:`R = 2(c_{TF} + c_{FT})`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    rogerstanimoto : double
        The Rogers-Tanimoto dissimilarity between vectors
        `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.rogerstanimoto([1, 0, 0], [0, 1, 0])
    0.8
    >>> distance.rogerstanimoto([1, 0, 0], [1, 1, 0])
    0.5
    >>> distance.rogerstanimoto([1, 0, 0], [2, 0, 0])
    -1.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
    nff, nft, ntf, ntt = _nbool_correspond_all(u, v, w=w)
    # R in the formula above: twice the number of disagreeing positions.
    mismatch = 2.0 * (ntf + nft)
    return float(mismatch) / float(ntt + nff + mismatch)
def russellrao(u, v, w=None):
    """
    Compute the Russell-Rao dissimilarity between two boolean 1-D arrays.

    The Russell-Rao dissimilarity between two boolean 1-D arrays, `u` and
    `v`, is defined as

    .. math::

      \\frac{n - c_{TT}}
           {n}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    russellrao : double
        The Russell-Rao dissimilarity between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.russellrao([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.russellrao([1, 0, 0], [1, 1, 0])
    0.6666666666666666
    >>> distance.russellrao([1, 0, 0], [2, 0, 0])
    0.3333333333333333

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        # Weighted case: the weights also define the effective length n.
        w = _validate_weights(w)
        ntt = (u * v * w).sum()
        n = w.sum()
    else:
        if u.dtype == v.dtype == bool:
            ntt = (u & v).sum()
        else:
            ntt = (u * v).sum()
        n = float(len(u))
    return float(n - ntt) / n
def sokalmichener(u, v, w=None):
    """
    Compute the Sokal-Michener dissimilarity between two boolean 1-D arrays.

    The Sokal-Michener dissimilarity between boolean 1-D arrays `u` and `v`,
    is defined as

    .. math::

       \\frac{R}
            {S + R}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`, :math:`R = 2 * (c_{TF} + c_{FT})` and
    :math:`S = c_{FF} + c_{TT}`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    sokalmichener : double
        The Sokal-Michener dissimilarity between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.sokalmichener([1, 0, 0], [0, 1, 0])
    0.8
    >>> distance.sokalmichener([1, 0, 0], [1, 1, 0])
    0.5
    >>> distance.sokalmichener([1, 0, 0], [2, 0, 0])
    -1.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if u.dtype == v.dtype == bool and w is None:
        ntt = (u & v).sum()
        nff = (~u & ~v).sum()
    elif w is None:
        ntt = (u * v).sum()
        nff = ((1.0 - u) * (1.0 - v)).sum()
    else:
        w = _validate_weights(w)
        ntt = (u * v * w).sum()
        nff = ((1.0 - u) * (1.0 - v) * w).sum()
    # Forward the weights so the mismatch counts are weighted the same way
    # as ntt and nff; without w=w the weights only affected the agreeing
    # positions.
    (nft, ntf) = _nbool_correspond_ft_tf(u, v, w=w)
    return float(2.0 * (ntf + nft)) / float(ntt + nff + 2.0 * (ntf + nft))
def sokalsneath(u, v, w=None):
    """
    Compute the Sokal-Sneath dissimilarity between two boolean 1-D arrays.

    The Sokal-Sneath dissimilarity between `u` and `v`,

    .. math::

       \\frac{R}
            {c_{TT} + R}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n` and :math:`R = 2(c_{TF} + c_{FT})`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    sokalsneath : double
        The Sokal-Sneath dissimilarity between vectors `u` and `v`.

    Raises
    ------
    ValueError
        If the denominator :math:`c_{TT} + R` is zero.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.sokalsneath([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.sokalsneath([1, 0, 0], [1, 1, 0])
    0.66666666666666663
    >>> distance.sokalsneath([1, 0, 0], [2, 1, 0])
    0.0
    >>> distance.sokalsneath([1, 0, 0], [3, 1, 0])
    -2.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
        ntt = (u * v * w).sum()
    elif u.dtype == v.dtype == bool:
        ntt = (u & v).sum()
    else:
        ntt = (u * v).sum()
    nft, ntf = _nbool_correspond_ft_tf(u, v, w=w)
    # R in the formula above: twice the number of disagreeing positions.
    mismatch = 2.0 * (ntf + nft)
    denom = np.array(ntt + mismatch)
    if not denom.any():
        raise ValueError('Sokal-Sneath dissimilarity is not defined for '
                         'vectors that are entirely false.')
    return float(mismatch) / denom
# Array converters bound to a fixed output dtype.
_convert_to_bool = partial(_convert_to_type, out_type=bool)
_convert_to_double = partial(_convert_to_type, out_type=np.double)

# Register the Python-only correlation wrappers on the _distance_wrap module
# so they can be looked up by name like the other wrappers.
_distance_wrap.cdist_correlation_double_wrap = _correlation_cdist_wrap
_distance_wrap.pdist_correlation_double_wrap = _correlation_pdist_wrap

# Registry of implemented metrics.
#
# The registry is a dictionary with the following structure:
#
#     {
#         metric_name: MetricInfo(aka, types=['double'], validator=None)
#     }
#
# where `metric_name` must be equal to the name of the Python metric
# function.
#
# MetricInfo is a named tuple with fields:
#
#   'aka'       : list of aliases accepted for the metric.
#
#   'types'     : list of supported types. X (pdist) and XA (cdist) are used
#                 to choose the type; if there is no match the first type is
#                 used. Defaults to ['double'].
#
#   'validator' : f(X, m, n, **kwargs), a function that checks kwargs and
#                 computes default values. Defaults to None.
MetricInfo = namedtuple("MetricInfo", ['aka', 'types', 'validator'])
# Defaults apply to the trailing fields, i.e. `types` and `validator`.
MetricInfo.__new__.__defaults__ = (['double'], None)
# Canonical metric name -> MetricInfo (aliases, supported types, validator).
_METRICS = {
    'braycurtis': MetricInfo(aka=['braycurtis']),
    'canberra': MetricInfo(aka=['canberra']),
    'chebyshev': MetricInfo(
        aka=['chebychev', 'chebyshev', 'cheby', 'cheb', 'ch']),
    'cityblock': MetricInfo(aka=['cityblock', 'cblock', 'cb', 'c']),
    'correlation': MetricInfo(aka=['correlation', 'co']),
    'cosine': MetricInfo(aka=['cosine', 'cos']),
    'dice': MetricInfo(aka=['dice'], types=['bool']),
    'euclidean': MetricInfo(aka=['euclidean', 'euclid', 'eu', 'e']),
    'hamming': MetricInfo(
        aka=['matching', 'hamming', 'hamm', 'ha', 'h'],
        types=['double', 'bool'],
        validator=_validate_hamming_kwargs),
    'jaccard': MetricInfo(
        aka=['jaccard', 'jacc', 'ja', 'j'],
        types=['double', 'bool']),
    'jensenshannon': MetricInfo(
        aka=['jensenshannon', 'js'],
        types=['double']),
    'kulsinski': MetricInfo(aka=['kulsinski'], types=['bool']),
    'mahalanobis': MetricInfo(
        aka=['mahalanobis', 'mahal', 'mah'],
        validator=_validate_mahalanobis_kwargs),
    'minkowski': MetricInfo(
        aka=['minkowski', 'mi', 'm', 'pnorm'],
        validator=_validate_minkowski_kwargs),
    'rogerstanimoto': MetricInfo(aka=['rogerstanimoto'], types=['bool']),
    'russellrao': MetricInfo(aka=['russellrao'], types=['bool']),
    'seuclidean': MetricInfo(
        aka=['seuclidean', 'se', 's'],
        validator=_validate_seuclidean_kwargs),
    'sokalmichener': MetricInfo(aka=['sokalmichener'], types=['bool']),
    'sokalsneath': MetricInfo(aka=['sokalsneath'], types=['bool']),
    'sqeuclidean': MetricInfo(aka=['sqeuclidean', 'sqe', 'sqeuclid']),
    'wminkowski': MetricInfo(
        aka=['wminkowski', 'wmi', 'wm', 'wpnorm'],
        validator=_validate_wminkowski_kwargs),
    'yule': MetricInfo(aka=['yule'], types=['bool']),
}


# Every accepted alias -> canonical metric name.
_METRIC_ALIAS = {alias: name
                 for name, info in _METRICS.items()
                 for alias in info.aka}

# Canonical metric names, in registry order.
_METRICS_NAMES = list(_METRICS)

# 'test_<name>' -> the module-level Python function called <name>.
_TEST_METRICS = {'test_' + name: globals()[name] for name in _METRICS}
def _select_weighted_metric(mstr, kwargs, out):
    """
    Pick the metric string and keyword arguments to use when weights are given.

    Parameters
    ----------
    mstr : str
        Metric name or alias as supplied by the caller.
    kwargs : dict
        Keyword arguments for the metric. The mapping is copied, never
        modified in place.
    out : ndarray or None
        Output array, added to the returned kwargs under ``'out'`` when the
        metric is redirected to its ``test_`` variant.

    Returns
    -------
    mstr : str
        Either the input name or ``"test_" + mstr``.
    kwargs : dict
        Copy of the input kwargs, without ``w`` when it was None.

    Raises
    ------
    ValueError
        If weights are given for a 'seuclidean' or 'mahalanobis' alias.
    """
    options = dict(kwargs)

    # w=None is the same as omitting it
    if options.get("w") is None:
        options.pop("w", None)

    # These support weights
    supports_weights = (
        mstr.startswith("test_")
        or mstr in _METRICS['wminkowski'].aka + _METRICS['hamming'].aka)
    if supports_weights or "w" not in options:
        return mstr, options

    if (mstr in _METRICS['seuclidean'].aka or
            mstr in _METRICS['mahalanobis'].aka):
        raise ValueError("metric %s incompatible with weights" % mstr)

    # XXX: C-versions do not support weights
    # need to use python version for weighting
    options['out'] = out
    return "test_%s" % mstr, options
def pdist(X, metric='euclidean', *args, **kwargs):
    """
    Pairwise distances between observations in n-dimensional space.

    See Notes for common calling conventions.

    Parameters
    ----------
    X : ndarray
        An m by n array of m original observations in an
        n-dimensional space.
    metric : str or function, optional
        The distance metric to use. The distance function can
        be 'braycurtis', 'canberra', 'chebyshev', 'cityblock',
        'correlation', 'cosine', 'dice', 'euclidean', 'hamming',
        'jaccard', 'jensenshannon', 'kulsinski', 'mahalanobis', 'matching',
        'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
        'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'.
    *args : tuple. Deprecated.
        Additional arguments should be passed as keyword arguments
    **kwargs : dict, optional
        Extra arguments to `metric`: refer to each metric documentation for a
        list of all possible arguments.

        Some possible arguments:

        p : scalar
        The p-norm to apply for Minkowski, weighted and unweighted.
        Default: 2.

        w : ndarray
        The weight vector for metrics that support weights (e.g., Minkowski).

        V : ndarray
        The variance vector for standardized Euclidean.
        Default: var(X, axis=0, ddof=1)

        VI : ndarray
        The inverse of the covariance matrix for Mahalanobis.
        Default: inv(cov(X.T)).T

        out : ndarray.
        The output array
        If not None, condensed distance matrix Y is stored in this array.
        Note: metric independent, it will become a regular keyword arg in a
        future scipy version

    Returns
    -------
    Y : ndarray
        Returns a condensed distance matrix Y.  For
        each :math:`i` and :math:`j` (where :math:`i<j<m`), where m is the
        number of original observations. The metric ``dist(u=X[i], v=X[j])``
        is computed and stored in entry ``ij``.

    See Also
    --------
    squareform : converts between condensed distance matrices and
                 square distance matrices.

    Notes
    -----
    See ``squareform`` for information on how to calculate the index of
    this entry or to convert the condensed distance matrix to a
    redundant square matrix.

    The following are common calling conventions.

    1. ``Y = pdist(X, 'euclidean')``

       Computes the distance between m points using Euclidean distance
       (2-norm) as the distance metric between the points. The points
       are arranged as m n-dimensional row vectors in the matrix X.

    2. ``Y = pdist(X, 'minkowski', p=2.)``

       Computes the distances using the Minkowski distance
       :math:`||u-v||_p` (p-norm) where :math:`p \\geq 1`.

    3. ``Y = pdist(X, 'cityblock')``

       Computes the city block or Manhattan distance between the
       points.

    4. ``Y = pdist(X, 'seuclidean', V=None)``

       Computes the standardized Euclidean distance. The standardized
       Euclidean distance between two n-vectors ``u`` and ``v`` is

       .. math::

          \\sqrt{\\sum {(u_i-v_i)^2 / V[x_i]}}


       V is the variance vector; V[i] is the variance computed over all
       the i'th components of the points.  If not passed, it is
       automatically computed.

    5. ``Y = pdist(X, 'sqeuclidean')``

       Computes the squared Euclidean distance :math:`||u-v||_2^2` between
       the vectors.

    6. ``Y = pdist(X, 'cosine')``

       Computes the cosine distance between vectors u and v,

       .. math::

          1 - \\frac{u \\cdot v}
                   {{||u||}_2 {||v||}_2}

       where :math:`||*||_2` is the 2-norm of its argument ``*``, and
       :math:`u \\cdot v` is the dot product of ``u`` and ``v``.

    7. ``Y = pdist(X, 'correlation')``

       Computes the correlation distance between vectors u and v. This is

       .. math::

          1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}
                   {{||(u - \\bar{u})||}_2 {||(v - \\bar{v})||}_2}

       where :math:`\\bar{v}` is the mean of the elements of vector v,
       and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`.

    8. ``Y = pdist(X, 'hamming')``

       Computes the normalized Hamming distance, or the proportion of
       those vector elements between two n-vectors ``u`` and ``v``
       which disagree. To save memory, the matrix ``X`` can be of type
       boolean.

    9. ``Y = pdist(X, 'jaccard')``

       Computes the Jaccard distance between the points. Given two
       vectors, ``u`` and ``v``, the Jaccard distance is the
       proportion of those elements ``u[i]`` and ``v[i]`` that
       disagree.

    10. ``Y = pdist(X, 'chebyshev')``

       Computes the Chebyshev distance between the points. The
       Chebyshev distance between two n-vectors ``u`` and ``v`` is the
       maximum norm-1 distance between their respective elements. More
       precisely, the distance is given by

       .. math::

          d(u,v) = \\max_i {|u_i-v_i|}

    11. ``Y = pdist(X, 'canberra')``

       Computes the Canberra distance between the points. The
       Canberra distance between two points ``u`` and ``v`` is

       .. math::

         d(u,v) = \\sum_i \\frac{|u_i-v_i|}
                              {|u_i|+|v_i|}


    12. ``Y = pdist(X, 'braycurtis')``

       Computes the Bray-Curtis distance between the points. The
       Bray-Curtis distance between two points ``u`` and ``v`` is


       .. math::

            d(u,v) = \\frac{\\sum_i {|u_i-v_i|}}
                           {\\sum_i {|u_i+v_i|}}

    13. ``Y = pdist(X, 'mahalanobis', VI=None)``

       Computes the Mahalanobis distance between the points. The
       Mahalanobis distance between two points ``u`` and ``v`` is
       :math:`\\sqrt{(u-v)(1/V)(u-v)^T}` where :math:`(1/V)` (the ``VI``
       variable) is the inverse covariance. If ``VI`` is not None,
       ``VI`` will be used as the inverse covariance matrix.

    14. ``Y = pdist(X, 'yule')``

       Computes the Yule distance between each pair of boolean
       vectors. (see yule function documentation)

    15. ``Y = pdist(X, 'matching')``

       Synonym for 'hamming'.

    16. ``Y = pdist(X, 'dice')``

       Computes the Dice distance between each pair of boolean
       vectors. (see dice function documentation)

    17. ``Y = pdist(X, 'kulsinski')``

       Computes the Kulsinski distance between each pair of
       boolean vectors. (see kulsinski function documentation)

    18. ``Y = pdist(X, 'rogerstanimoto')``

       Computes the Rogers-Tanimoto distance between each pair of
       boolean vectors. (see rogerstanimoto function documentation)

    19. ``Y = pdist(X, 'russellrao')``

       Computes the Russell-Rao distance between each pair of
       boolean vectors. (see russellrao function documentation)

    20. ``Y = pdist(X, 'sokalmichener')``

       Computes the Sokal-Michener distance between each pair of
       boolean vectors. (see sokalmichener function documentation)

    21. ``Y = pdist(X, 'sokalsneath')``

       Computes the Sokal-Sneath distance between each pair of
       boolean vectors. (see sokalsneath function documentation)

    22. ``Y = pdist(X, 'wminkowski', p=2, w=w)``

       Computes the weighted Minkowski distance between each pair of
       vectors. (see wminkowski function documentation)

    23. ``Y = pdist(X, f)``

       Computes the distance between all pairs of vectors in X
       using the user supplied 2-arity function f. For example,
       Euclidean distance between the vectors could be computed
       as follows::

         dm = pdist(X, lambda u, v: np.sqrt(((u-v)**2).sum()))

       Note that you should avoid passing a reference to one of
       the distance functions defined in this library. For example,::

         dm = pdist(X, sokalsneath)

       would calculate the pair-wise distances between the vectors in
       X using the Python function sokalsneath. This would result in
       sokalsneath being called :math:`{n \\choose 2}` times, which
       is inefficient. Instead, the optimized C version is more
       efficient, and we call it using the following syntax.::

         dm = pdist(X, 'sokalsneath')

    """
    # You can also call this as:
    #     Y = pdist(X, 'test_abc')
    # where 'abc' is the metric being tested.  This computes the distance
    # between all pairs of vectors in X using the distance metric 'abc' but
    # with a more succinct, verifiable, but less efficient implementation.

    X = _asarray_validated(X, sparse_ok=False, objects_ok=True, mask_ok=True,
                           check_finite=False)
    kwargs = _args_to_kwargs_xdist(args, kwargs, metric, "pdist")

    X = np.asarray(X, order='c')

    if len(X.shape) != 2:
        raise ValueError('A 2-dimensional array must be passed.')
    m, n = X.shape
    n_pairs = (m * (m - 1)) // 2

    # Use the caller's output buffer when one is supplied and suitable.
    out = kwargs.pop("out", None)
    if out is None:
        dm = np.empty(n_pairs, dtype=np.double)
    else:
        if out.shape != (n_pairs,):
            raise ValueError("output array has incorrect shape.")
        if not out.flags.c_contiguous:
            raise ValueError("Output array must be C-contiguous.")
        if out.dtype != np.double:
            raise ValueError("Output array must be double type.")
        dm = out

    # compute blacklist for deprecated kwargs
    # Each rule: (aliases, test names, functions, kwargs to blacklist).
    # The first matching rule wins.
    blacklist_rules = (
        (_METRICS['jensenshannon'].aka,
         ['test_jensenshannon'],
         [jensenshannon],
         ["p", "w", "V", "VI"]),
        (_METRICS['minkowski'].aka + _METRICS['wminkowski'].aka,
         ['test_minkowski', 'test_wminkowski'],
         [minkowski, wminkowski],
         ["V", "VI"]),
        (_METRICS['seuclidean'].aka,
         ['test_seuclidean'],
         [seuclidean],
         ["p", "w", "VI"]),
        (_METRICS['mahalanobis'].aka,
         ['test_mahalanobis'],
         [mahalanobis],
         ["p", "w", "V"]),
    )
    kwargs_blacklist = ["p", "V", "VI"]
    for aliases, test_names, functions, blocked in blacklist_rules:
        if metric in aliases or metric in test_names or metric in functions:
            kwargs_blacklist = blocked
            break

    _filter_deprecated_kwargs(kwargs, kwargs_blacklist)

    if callable(metric):
        mstr = getattr(metric, '__name__', 'UnknownCustomMetric')
        metric_name = _METRIC_ALIAS.get(mstr, None)

        if metric_name is not None:
            X, typ, kwargs = _validate_pdist_input(X, m, n,
                                                   metric_name, **kwargs)

        # Fill the condensed matrix pair by pair, in row-major (i < j) order.
        pair = 0
        for i in range(m - 1):
            for j in range(i + 1, m):
                dm[pair] = metric(X[i], X[j], **kwargs)
                pair += 1
        return dm

    if not isinstance(metric, str):
        raise TypeError('2nd argument metric must be a string identifier '
                        'or a function.')

    mstr = metric.lower()

    mstr, kwargs = _select_weighted_metric(mstr, kwargs, out)

    metric_name = _METRIC_ALIAS.get(mstr, None)

    if metric_name is not None:
        X, typ, kwargs = _validate_pdist_input(X, m, n,
                                               metric_name, **kwargs)

        # get pdist wrapper
        wrapper_name = "pdist_%s_%s_wrap" % (metric_name, typ)
        pdist_fn = getattr(_distance_wrap, wrapper_name)
        pdist_fn(X, dm, **kwargs)
        return dm

    if mstr in ['old_cosine', 'old_cos']:
        warnings.warn('"old_cosine" is deprecated and will be removed in '
                      'a future version. Use "cosine" instead.',
                      DeprecationWarning)
        X = _convert_to_double(X)
        norms = np.einsum('ij,ij->i', X, X, dtype=np.double)
        np.sqrt(norms, out=norms)
        norm_column = norms.reshape(m, 1)
        # The numerator u * v
        dot_products = np.dot(X, X.T)
        # The denom. ||u||*||v||
        norm_products = np.dot(norm_column, norm_column.T)
        square = 1.0 - (dot_products / norm_products)
        square[range(0, m), range(0, m)] = 0.0
        return squareform(square)

    if mstr.startswith("test_"):
        if mstr not in _TEST_METRICS:
            raise ValueError('Unknown "Test" Distance Metric: %s' % mstr[5:])
        return pdist(X, _TEST_METRICS[mstr], **kwargs)

    raise ValueError('Unknown Distance Metric: %s' % mstr)
def squareform(X, force="no", checks=True):
    """
    Convert a vector-form distance vector to a square-form distance
    matrix, and vice-versa.

    Parameters
    ----------
    X : ndarray
        Either a condensed or redundant distance matrix.
    force : str, optional
        As with MATLAB(TM), if force is equal to ``'tovector'`` or
        ``'tomatrix'``, the input will be treated as a distance matrix or
        distance vector respectively.
    checks : bool, optional
        If set to False, no checks will be made for matrix
        symmetry nor zero diagonals. This is useful if it is known that
        ``X - X.T`` is small and ``diag(X)`` is close to zero.
        These values are ignored any way so they do not disrupt the
        squareform transformation.

    Returns
    -------
    Y : ndarray
        If a condensed distance matrix is passed, a redundant one is
        returned, or if a redundant one is passed, a condensed distance
        matrix is returned.

    Notes
    -----
    1. ``v = squareform(X)``

       Given a square n-by-n symmetric distance matrix ``X``,
       ``v = squareform(X)`` returns a ``n * (n-1) / 2``
       (i.e. binomial coefficient n choose 2) sized vector `v`
       where :math:`v[{n \\choose 2} - {n-i \\choose 2} + (j-i-1)]`
       is the distance between distinct points ``i`` and ``j``.
       If ``X`` is non-square or asymmetric, an error is raised.

    2. ``X = squareform(v)``

       Given a ``n * (n-1) / 2`` sized vector ``v``
       for some integer ``n >= 1`` encoding distances as described,
       ``X = squareform(v)`` returns a n-by-n distance matrix ``X``.
       The ``X[i, j]`` and ``X[j, i]`` values are set to
       :math:`v[{n \\choose 2} - {n-i \\choose 2} + (j-i-1)]`
       and all diagonal elements are zero.

    In SciPy 0.19.0, ``squareform`` stopped casting all input types to
    float64, and started returning arrays of the same dtype as the input.

    """

    X = np.ascontiguousarray(X)

    shape = X.shape
    ndim = len(shape)
    mode = force.lower()

    # A forced direction must agree with the dimensionality of the input.
    if mode == 'tomatrix' and ndim != 1:
        raise ValueError("Forcing 'tomatrix' but input X is not a "
                         "distance vector.")
    if mode == 'tovector' and ndim != 2:
        raise ValueError("Forcing 'tovector' but input X is not a "
                         "distance matrix.")

    # X = squareform(v)
    if ndim == 1:
        length = shape[0]
        if length == 0:
            return np.zeros((1, 1), dtype=X.dtype)

        # Grab the closest value to the square root of the number
        # of elements times 2 to see if the number of elements
        # is indeed a binomial coefficient.
        d = int(np.ceil(np.sqrt(length * 2)))

        # Check that v is of valid dimensions.
        if d * (d - 1) != length * 2:
            raise ValueError('Incompatible vector size. It must be a binomial '
                             'coefficient n choose 2 for some integer n >= 2.')

        # Allocate memory for the distance matrix.
        matrix = np.zeros((d, d), dtype=X.dtype)

        # Since the C code does not support striding using strides.
        # The dimensions are used instead.
        X = _copy_array_if_base_present(X)

        # Fill in the values of the distance matrix.
        _distance_wrap.to_squareform_from_vector_wrap(matrix, X)
        return matrix

    # v = squareform(X)
    if ndim == 2:
        if shape[0] != shape[1]:
            raise ValueError('The matrix argument must be square.')
        if checks:
            is_valid_dm(X, throw=True, name='X')

        # One-side of the dimensions is set here.
        d = shape[0]

        if d <= 1:
            return np.array([], dtype=X.dtype)

        # Create a vector.
        vector = np.zeros((d * (d - 1)) // 2, dtype=X.dtype)

        # Since the C code does not support striding using strides.
        # The dimensions are used instead.
        X = _copy_array_if_base_present(X)

        # Convert the square matrix to condensed vector form.
        _distance_wrap.to_vector_from_squareform_wrap(X, vector)
        return vector

    raise ValueError(('The first argument must be one or two dimensional '
                      'array. A %d-dimensional array is not '
                      'permitted') % ndim)
def is_valid_dm(D, tol=0.0, throw=False, name="D", warning=False):
    """
    Return True if input array is a valid distance matrix.

    Distance matrices must be 2-dimensional numpy arrays.
    They must have a zero-diagonal, and they must be symmetric.

    Parameters
    ----------
    D : ndarray
        The candidate object to test for validity.
    tol : float, optional
        The distance matrix should be symmetric. `tol` is the maximum
        difference between entries ``ij`` and ``ji`` for the distance
        metric to be considered symmetric.
    throw : bool, optional
        An exception is thrown if the distance matrix passed is not valid.
    name : str, optional
        The name of the variable to checked. This is useful if
        throw is set to True so the offending variable can be identified
        in the exception message when an exception is thrown.
    warning : bool, optional
        Instead of throwing an exception, a warning message is
        raised.

    Returns
    -------
    valid : bool
        True if the variable `D` passed is a valid distance matrix.

    Raises
    ------
    ValueError
        If `throw` is True and `D` fails one of the checks.

    Notes
    -----
    Small numerical differences in `D` and `D.T` and non-zeroness of
    the diagonal are ignored if they are within the tolerance specified
    by `tol`.

    """
    D = np.asarray(D, order='c')
    valid = True
    try:
        s = D.shape
        if len(D.shape) != 2:
            if name:
                raise ValueError(('Distance matrix \'%s\' must have shape=2 '
                                  '(i.e. be two-dimensional).') % name)
            else:
                raise ValueError('Distance matrix must have shape=2 (i.e. '
                                 'be two-dimensional).')
        if tol == 0.0:
            # Exact comparison: no tolerance on symmetry or on the diagonal.
            if not (D == D.T).all():
                if name:
                    raise ValueError(('Distance matrix \'%s\' must be '
                                      'symmetric.') % name)
                else:
                    raise ValueError('Distance matrix must be symmetric.')
            if not (D[range(0, s[0]), range(0, s[0])] == 0).all():
                if name:
                    raise ValueError(('Distance matrix \'%s\' diagonal must '
                                      'be zero.') % name)
                else:
                    raise ValueError('Distance matrix diagonal must be zero.')
        else:
            # D - D.T is antisymmetric, so bounding every entry from above
            # by tol also bounds every entry from below by -tol.
            if not (D - D.T <= tol).all():
                if name:
                    raise ValueError(('Distance matrix \'%s\' must be '
                                      'symmetric within tolerance %5.5f.')
                                     % (name, tol))
                else:
                    raise ValueError('Distance matrix must be symmetric within'
                                     ' tolerance %5.5f.' % tol)
            if not (D[range(0, s[0]), range(0, s[0])] <= tol).all():
                if name:
                    raise ValueError(('Distance matrix \'%s\' diagonal must be'
                                      ' close to zero within tolerance %5.5f.')
                                     % (name, tol))
                else:
                    # No name to interpolate here: the message takes only
                    # the tolerance, so formatting it with `tol` alone works.
                    raise ValueError('Distance matrix diagonal must be'
                                     ' close to zero within tolerance %5.5f.'
                                     % tol)
    except Exception as e:
        # Any failure marks the matrix invalid; re-raise or warn on request.
        if throw:
            raise
        if warning:
            warnings.warn(str(e))
        valid = False
    return valid
def is_valid_y(y, warning=False, throw=False, name=None):
    """
    Return True if the input array is a valid condensed distance matrix.

    Condensed distance matrices must be 1-dimensional numpy arrays.
    Their length must be a binomial coefficient :math:`{n \\choose 2}`
    for some positive integer n.

    Parameters
    ----------
    y : ndarray
        The condensed distance matrix.
    warning : bool, optional
        Invokes a warning if the variable passed is not a valid
        condensed distance matrix. The warning message explains why
        the distance matrix is not valid. `name` is used when
        referencing the offending variable.
    throw : bool, optional
        Throws an exception if the variable passed is not a valid
        condensed distance matrix.
    name : str, optional
        Used when referencing the offending variable in the
        warning or exception message.

    Returns
    -------
    valid : bool
        True if `y` is one-dimensional and its length is a binomial
        coefficient, False otherwise.

    Raises
    ------
    ValueError
        If `throw` is True and `y` is not a valid condensed distance
        matrix.

    """
    y = np.asarray(y, order='c')
    valid = True
    try:
        # Quoted variable name spliced into the messages; empty when no
        # name was supplied so both variants read identically otherwise.
        label = " '%s'" % (name,) if name else ""
        if len(y.shape) != 1:
            raise ValueError('Condensed distance matrix%s must have shape=1 '
                             '(i.e. be one-dimensional).' % label)
        n = y.shape[0]
        # Candidate number of observations: the smallest d with d**2 >= 2n.
        d = int(np.ceil(np.sqrt(n * 2)))
        # d * (d - 1) is always even, so floor division is exact here.
        if d * (d - 1) // 2 != n:
            raise ValueError('Length n of condensed distance matrix%s must '
                             'be a binomial coefficient, i.e. there must '
                             'be a k such that (k \\choose 2)=n)!' % label)
    except Exception as e:
        if throw:
            raise
        if warning:
            warnings.warn(str(e))
        valid = False
    return valid
def num_obs_dm(d):
    """
    Return the number of original observations that correspond to a
    square, redundant distance matrix.

    Parameters
    ----------
    d : ndarray
        The target distance matrix.

    Returns
    -------
    num_obs_dm : int
        The number of observations in the redundant distance matrix.

    Raises
    ------
    ValueError
        Propagated from `is_valid_dm` when `d` fails its validation.

    """
    dist_matrix = np.asarray(d, order='c')
    # Validation runs with an infinite tolerance, which relaxes the
    # symmetry and diagonal comparisons performed by is_valid_dm.
    is_valid_dm(dist_matrix, tol=np.inf, throw=True, name='d')
    # One row per observation.
    return len(dist_matrix)
def num_obs_y(Y):
    """
    Return the number of original observations that correspond to a
    condensed distance matrix.

    Parameters
    ----------
    Y : ndarray
        Condensed distance matrix.

    Returns
    -------
    n : int
        The number of observations in the condensed distance matrix `Y`.

    Raises
    ------
    ValueError
        If `Y` is rejected by `is_valid_y`, is empty, or its length is
        not a binomial coefficient.

    """
    condensed = np.asarray(Y, order='c')
    is_valid_y(condensed, throw=True, name='Y')

    length = condensed.shape[0]
    if length == 0:
        raise ValueError("The number of observations cannot be determined on "
                         "an empty distance matrix.")

    # Invert length = n_obs * (n_obs - 1) / 2 and confirm the round trip.
    n_obs = int(np.ceil(np.sqrt(length * 2)))
    if (n_obs * (n_obs - 1) / 2) == length:
        return n_obs
    raise ValueError("Invalid condensed distance matrix passed. Must be "
                     "some k where k=(n choose 2) for some n >= 2.")
def cdist(XA, XB, metric='euclidean', *args, **kwargs):
    """
    Compute distance between each pair of the two collections of inputs.

    See Notes for common calling conventions.

    Parameters
    ----------
    XA : ndarray
        An :math:`m_A` by :math:`n` array of :math:`m_A`
        original observations in an :math:`n`-dimensional space.
        Inputs are converted to float type.
    XB : ndarray
        An :math:`m_B` by :math:`n` array of :math:`m_B`
        original observations in an :math:`n`-dimensional space.
        Inputs are converted to float type.
    metric : str or callable, optional
        The distance metric to use.  If a string, the distance function can be
        'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation',
        'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'jensenshannon',
        'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
        'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
        'sqeuclidean', 'wminkowski', 'yule'.
    *args : tuple. Deprecated.
        Additional arguments should be passed as keyword arguments
    **kwargs : dict, optional
        Extra arguments to `metric`: refer to each metric documentation for a
        list of all possible arguments.

        Some possible arguments:

        p : scalar
        The p-norm to apply for Minkowski, weighted and unweighted.
        Default: 2.

        w : ndarray
        The weight vector for metrics that support weights (e.g., Minkowski).

        V : ndarray
        The variance vector for standardized Euclidean.
        Default: var(vstack([XA, XB]), axis=0, ddof=1)

        VI : ndarray
        The inverse of the covariance matrix for Mahalanobis.
        Default: inv(cov(vstack([XA, XB]).T)).T

        out : ndarray
        The output array
        If not None, the distance matrix Y is stored in this array.
        It must be a C-contiguous array of doubles with shape
        :math:`(m_A, m_B)`.
        Note: metric independent, it will become a regular keyword arg in a
        future scipy version

    Returns
    -------
    Y : ndarray
        A :math:`m_A` by :math:`m_B` distance matrix is returned.
        For each :math:`i` and :math:`j`, the metric
        ``dist(u=XA[i], v=XB[j])`` is computed and stored in the
        :math:`ij` th entry.

    Raises
    ------
    ValueError
        An exception is thrown if `XA` or `XB` is not 2-dimensional, if
        `XA` and `XB` do not have the same number of columns, if `out`
        has the wrong shape, layout or dtype, or if `metric` is an
        unknown string identifier.
    TypeError
        If `metric` is neither a string nor a callable.

    Notes
    -----
    The following are common calling conventions:

    1. ``Y = cdist(XA, XB, 'euclidean')``

       Computes the distance between the :math:`m_A` points of ``XA``
       and the :math:`m_B` points of ``XB`` using Euclidean distance
       (2-norm) as the distance metric between the points. The points
       are arranged as :math:`n`-dimensional row vectors in the
       matrices ``XA`` and ``XB``.

    2. ``Y = cdist(XA, XB, 'minkowski', p=2.)``

       Computes the distances using the Minkowski distance
       :math:`||u-v||_p` (:math:`p`-norm) where :math:`p \\geq 1`.

    3. ``Y = cdist(XA, XB, 'cityblock')``

       Computes the city block or Manhattan distance between the
       points.

    4. ``Y = cdist(XA, XB, 'seuclidean', V=None)``

       Computes the standardized Euclidean distance. The standardized
       Euclidean distance between two n-vectors ``u`` and ``v`` is

       .. math::

          \\sqrt{\\sum {(u_i-v_i)^2 / V[x_i]}}.

       V is the variance vector; V[i] is the variance computed over all
       the i'th components of the points. If not passed, it is
       automatically computed.

    5. ``Y = cdist(XA, XB, 'sqeuclidean')``

       Computes the squared Euclidean distance :math:`||u-v||_2^2` between
       the vectors.

    6. ``Y = cdist(XA, XB, 'cosine')``

       Computes the cosine distance between vectors u and v,

       .. math::

          1 - \\frac{u \\cdot v}
                   {{||u||}_2 {||v||}_2}

       where :math:`||*||_2` is the 2-norm of its argument ``*``, and
       :math:`u \\cdot v` is the dot product of :math:`u` and :math:`v`.

    7. ``Y = cdist(XA, XB, 'correlation')``

       Computes the correlation distance between vectors u and v. This is

       .. math::

          1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}
                   {{||(u - \\bar{u})||}_2 {||(v - \\bar{v})||}_2}

       where :math:`\\bar{v}` is the mean of the elements of vector v,
       and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`.


    8. ``Y = cdist(XA, XB, 'hamming')``

       Computes the normalized Hamming distance, or the proportion of
       those vector elements between two n-vectors ``u`` and ``v``
       which disagree. To save memory, the matrices ``XA`` and ``XB``
       can be of type boolean.

    9. ``Y = cdist(XA, XB, 'jaccard')``

       Computes the Jaccard distance between the points. Given two
       vectors, ``u`` and ``v``, the Jaccard distance is the
       proportion of those elements ``u[i]`` and ``v[i]`` that
       disagree where at least one of them is non-zero.

    10. ``Y = cdist(XA, XB, 'chebyshev')``

       Computes the Chebyshev distance between the points. The
       Chebyshev distance between two n-vectors ``u`` and ``v`` is the
       maximum norm-1 distance between their respective elements. More
       precisely, the distance is given by

       .. math::

          d(u,v) = \\max_i {|u_i-v_i|}.

    11. ``Y = cdist(XA, XB, 'canberra')``

       Computes the Canberra distance between the points. The
       Canberra distance between two points ``u`` and ``v`` is

       .. math::

         d(u,v) = \\sum_i \\frac{|u_i-v_i|}
                              {|u_i|+|v_i|}.

    12. ``Y = cdist(XA, XB, 'braycurtis')``

       Computes the Bray-Curtis distance between the points. The
       Bray-Curtis distance between two points ``u`` and ``v`` is


       .. math::

            d(u,v) = \\frac{\\sum_i (|u_i-v_i|)}
                          {\\sum_i (|u_i+v_i|)}

    13. ``Y = cdist(XA, XB, 'mahalanobis', VI=None)``

       Computes the Mahalanobis distance between the points. The
       Mahalanobis distance between two points ``u`` and ``v`` is
       :math:`\\sqrt{(u-v)(1/V)(u-v)^T}` where :math:`(1/V)` (the ``VI``
       variable) is the inverse covariance. If ``VI`` is not None,
       ``VI`` will be used as the inverse covariance matrix.

    14. ``Y = cdist(XA, XB, 'yule')``

       Computes the Yule distance between the boolean
       vectors. (see `yule` function documentation)

    15. ``Y = cdist(XA, XB, 'matching')``

       Synonym for 'hamming'.

    16. ``Y = cdist(XA, XB, 'dice')``

       Computes the Dice distance between the boolean vectors. (see
       `dice` function documentation)

    17. ``Y = cdist(XA, XB, 'kulsinski')``

       Computes the Kulsinski distance between the boolean
       vectors. (see `kulsinski` function documentation)

    18. ``Y = cdist(XA, XB, 'rogerstanimoto')``

       Computes the Rogers-Tanimoto distance between the boolean
       vectors. (see `rogerstanimoto` function documentation)

    19. ``Y = cdist(XA, XB, 'russellrao')``

       Computes the Russell-Rao distance between the boolean
       vectors. (see `russellrao` function documentation)

    20. ``Y = cdist(XA, XB, 'sokalmichener')``

       Computes the Sokal-Michener distance between the boolean
       vectors. (see `sokalmichener` function documentation)

    21. ``Y = cdist(XA, XB, 'sokalsneath')``

       Computes the Sokal-Sneath distance between the vectors. (see
       `sokalsneath` function documentation)


    22. ``Y = cdist(XA, XB, 'wminkowski', p=2., w=w)``

       Computes the weighted Minkowski distance between the
       vectors. (see `wminkowski` function documentation)

    23. ``Y = cdist(XA, XB, f)``

       Computes the distance between all pairs of vectors in ``XA``
       and ``XB`` using the user supplied 2-arity function f. For
       example, Euclidean distance between the vectors could be
       computed as follows::

         dm = cdist(XA, XB, lambda u, v: np.sqrt(((u-v)**2).sum()))

       Note that you should avoid passing a reference to one of
       the distance functions defined in this library. For example,::

         dm = cdist(XA, XB, sokalsneath)

       would calculate the pair-wise distances between the vectors in
       ``XA`` and ``XB`` using the Python function `sokalsneath`. This
       would result in sokalsneath being called :math:`m_A m_B` times,
       which is inefficient. Instead, the optimized C version is more
       efficient, and we call it using the following syntax::

         dm = cdist(XA, XB, 'sokalsneath')

    Examples
    --------
    Find the Euclidean distances between four 2-D coordinates:

    >>> from scipy.spatial import distance
    >>> coords = [(35.0456, -85.2672),
    ...           (35.1174, -89.9711),
    ...           (35.9728, -83.9422),
    ...           (36.1667, -86.7833)]
    >>> distance.cdist(coords, coords, 'euclidean')
    array([[ 0.    ,  4.7044,  1.6172,  1.8856],
           [ 4.7044,  0.    ,  6.0893,  3.3561],
           [ 1.6172,  6.0893,  0.    ,  2.8477],
           [ 1.8856,  3.3561,  2.8477,  0.    ]])


    Find the Manhattan distance from a 3-D point to the corners of the unit
    cube:

    >>> a = np.array([[0, 0, 0],
    ...               [0, 0, 1],
    ...               [0, 1, 0],
    ...               [0, 1, 1],
    ...               [1, 0, 0],
    ...               [1, 0, 1],
    ...               [1, 1, 0],
    ...               [1, 1, 1]])
    >>> b = np.array([[ 0.1,  0.2,  0.4]])
    >>> distance.cdist(a, b, 'cityblock')
    array([[ 0.7],
           [ 0.9],
           [ 1.3],
           [ 1.5],
           [ 1.5],
           [ 1.7],
           [ 2.1],
           [ 2.3]])

    """
    # You can also call this as:
    #     Y = cdist(XA, XB, 'test_abc')
    # where 'abc' is the metric being tested.  This computes the distance
    # between all pairs of vectors in XA and XB using the distance metric 'abc'
    # but with a more succinct, verifiable, but less efficient implementation.

    # Fold deprecated positional extras into keyword arguments.
    kwargs = _args_to_kwargs_xdist(args, kwargs, metric, "cdist")

    XA = np.asarray(XA, order='c')
    XB = np.asarray(XB, order='c')

    s = XA.shape
    sB = XB.shape

    if len(s) != 2:
        raise ValueError('XA must be a 2-dimensional array.')
    if len(sB) != 2:
        raise ValueError('XB must be a 2-dimensional array.')
    if s[1] != sB[1]:
        raise ValueError('XA and XB must have the same number of columns '
                         '(i.e. feature dimension.)')

    mA = s[0]
    mB = sB[0]
    n = s[1]

    # `out` is metric independent, so it is removed from kwargs before they
    # are forwarded to validators and metric functions.
    out = kwargs.pop("out", None)
    if out is None:
        dm = np.empty((mA, mB), dtype=np.double)
    else:
        if out.shape != (mA, mB):
            raise ValueError("Output array has incorrect shape.")
        if not out.flags.c_contiguous:
            raise ValueError("Output array must be C-contiguous.")
        if out.dtype != np.double:
            raise ValueError("Output array must be double type.")
        dm = out

    # compute blacklist for deprecated kwargs
    if(metric in _METRICS['minkowski'].aka or
       metric in _METRICS['wminkowski'].aka or
       metric in ['test_minkowski', 'test_wminkowski'] or
       metric in [minkowski, wminkowski]):
        kwargs_blacklist = ["V", "VI"]
    elif(metric in _METRICS['seuclidean'].aka or
         metric == 'test_seuclidean' or metric == seuclidean):
        kwargs_blacklist = ["p", "w", "VI"]
    elif(metric in _METRICS['mahalanobis'].aka or
         metric == 'test_mahalanobis' or metric == mahalanobis):
        kwargs_blacklist = ["p", "w", "V"]
    else:
        kwargs_blacklist = ["p", "V", "VI"]

    _filter_deprecated_kwargs(kwargs, kwargs_blacklist)

    if callable(metric):

        # Map the callable's name to a known metric (if any) so that the
        # matching input validation is applied.
        mstr = getattr(metric, '__name__', 'Unknown')
        metric_name = _METRIC_ALIAS.get(mstr, None)

        XA, XB, typ, kwargs = _validate_cdist_input(XA, XB, mA, mB, n,
                                                    metric_name, **kwargs)

        # One Python-level call per (row of XA, row of XB) pair.
        for i in range(0, mA):
            for j in range(0, mB):
                dm[i, j] = metric(XA[i], XB[j], **kwargs)

    elif isinstance(metric, str):
        mstr = metric.lower()

        mstr, kwargs = _select_weighted_metric(mstr, kwargs, out)

        metric_name = _METRIC_ALIAS.get(mstr, None)
        if metric_name is not None:
            XA, XB, typ, kwargs = _validate_cdist_input(XA, XB, mA, mB, n,
                                                        metric_name, **kwargs)
            # get cdist wrapper
            cdist_fn = getattr(_distance_wrap,
                               "cdist_%s_%s_wrap" % (metric_name, typ))
            cdist_fn(XA, XB, dm, **kwargs)
            return dm

        elif mstr.startswith("test_"):
            if mstr in _TEST_METRICS:
                # Hand the already validated buffer to the recursive call so
                # a caller-supplied `out` is actually filled.  Assigning by
                # key (rather than passing `out=` next to **kwargs) stays
                # safe should kwargs already carry an 'out' entry.
                kwargs['out'] = dm
                dm = cdist(XA, XB, _TEST_METRICS[mstr], **kwargs)
            else:
                raise ValueError('Unknown "Test" Distance Metric: %s' % mstr[5:])
        else:
            raise ValueError('Unknown Distance Metric: %s' % mstr)
    else:
        raise TypeError('2nd argument metric must be a string identifier '
                        'or a function.')
    return dm