1"""
2Generic data algorithms. This module is experimental at the moment and not
3intended for public consumption
4"""
5import operator
6from textwrap import dedent
7from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
8from warnings import catch_warnings, simplefilter, warn
10import numpy as np
12from pandas._libs import Timestamp, algos, hashtable as htable, lib
13from pandas._libs.tslib import iNaT
14from pandas.util._decorators import Appender, Substitution
16from pandas.core.dtypes.cast import (
17 construct_1d_object_array_from_listlike,
18 infer_dtype_from_array,
19 maybe_promote,
20)
21from pandas.core.dtypes.common import (
22 ensure_float64,
23 ensure_int64,
24 ensure_object,
25 ensure_platform_int,
26 ensure_uint64,
27 is_array_like,
28 is_bool_dtype,
29 is_categorical_dtype,
30 is_complex_dtype,
31 is_datetime64_any_dtype,
32 is_datetime64_dtype,
33 is_datetime64_ns_dtype,
34 is_extension_array_dtype,
35 is_float_dtype,
36 is_integer,
37 is_integer_dtype,
38 is_list_like,
39 is_numeric_dtype,
40 is_object_dtype,
41 is_period_dtype,
42 is_scalar,
43 is_signed_integer_dtype,
44 is_timedelta64_dtype,
45 is_unsigned_integer_dtype,
46 needs_i8_conversion,
47)
48from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries
49from pandas.core.dtypes.missing import isna, na_value_for_dtype
51import pandas.core.common as com
52from pandas.core.construction import array, extract_array
53from pandas.core.indexers import validate_indices
55if TYPE_CHECKING:
56 from pandas import Series
58_shared_docs: Dict[str, str] = {}


# --------------- #
# dtype access    #
# --------------- #
def _ensure_data(values, dtype=None):
    """
    routine to ensure that our data is of the correct
    input dtype for lower-level routines

    This will coerce:
    - ints -> int64
    - uint -> uint64
    - bool -> uint64 (TODO this should be uint8)
    - datetimelike -> i8
    - datetime64tz -> i8 (in local tz)
    - categorical -> codes

    Parameters
    ----------
    values : array-like
    dtype : pandas_dtype, optional
        coerce to this dtype

    Returns
    -------
    values : ndarray
    pandas_dtype : str or dtype
    """

    # we check some simple dtypes first
    if is_object_dtype(dtype):
        return ensure_object(np.asarray(values)), "object"
    elif is_object_dtype(values) and dtype is None:
        return ensure_object(np.asarray(values)), "object"

    try:
        if is_bool_dtype(values) or is_bool_dtype(dtype):
            # we are actually coercing to uint64
            # until our algos support uint8 directly (see TODO)
            return np.asarray(values).astype("uint64"), "bool"
        elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype):
            return ensure_int64(values), "int64"
        elif is_unsigned_integer_dtype(values) or is_unsigned_integer_dtype(dtype):
            return ensure_uint64(values), "uint64"
        elif is_float_dtype(values) or is_float_dtype(dtype):
            return ensure_float64(values), "float64"
        elif is_complex_dtype(values) or is_complex_dtype(dtype):

            # ignore the fact that we are casting to float
            # which discards complex parts
            with catch_warnings():
                simplefilter("ignore", np.ComplexWarning)
                values = ensure_float64(values)
            return values, "float64"

    except (TypeError, ValueError, OverflowError):
        # if we are trying to coerce to a dtype
        # and it is incompatible, this will fall through to here
        return ensure_object(values), "object"

    # datetimelike
    if (
        needs_i8_conversion(values)
        or is_period_dtype(dtype)
        or is_datetime64_any_dtype(dtype)
        or is_timedelta64_dtype(dtype)
    ):
        if is_period_dtype(values) or is_period_dtype(dtype):
            from pandas import PeriodIndex

            values = PeriodIndex(values)
            dtype = values.dtype
        elif is_timedelta64_dtype(values) or is_timedelta64_dtype(dtype):
            from pandas import TimedeltaIndex

            values = TimedeltaIndex(values)
            dtype = values.dtype
        else:
            # Datetime
            if values.ndim > 1 and is_datetime64_ns_dtype(values):
                # Avoid calling the DatetimeIndex constructor as it is 1D only
                # Note: this is reached by DataFrame.rank calls GH#27027
                asi8 = values.view("i8")
                dtype = values.dtype
                return asi8, dtype

            from pandas import DatetimeIndex

            values = DatetimeIndex(values)
            dtype = values.dtype

        return values.asi8, dtype

    elif is_categorical_dtype(values) and (
        is_categorical_dtype(dtype) or dtype is None
    ):
        values = getattr(values, "values", values)
        values = values.codes
        dtype = "category"

        # we are actually coercing to int64
        # until our algos support int* directly (not all do)
        values = ensure_int64(values)

        return values, dtype

    # we have failed, return object
    values = np.asarray(values, dtype=object)
    return ensure_object(values), "object"
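
# For example (an illustrative sketch of this private helper, not a doctest
# from pandas itself): a 32-bit integer array is promoted to int64 so the
# hashtable routines can consume it:
#
#   >>> arr, ndtype = _ensure_data(np.array([1, 2, 3], dtype="int32"))
#   >>> arr.dtype, ndtype
#   (dtype('int64'), 'int64')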


def _reconstruct_data(values, dtype, original):
    """
    reverse of _ensure_data

    Parameters
    ----------
    values : ndarray
    dtype : pandas_dtype
    original : ndarray-like

    Returns
    -------
    Index for extension types, otherwise ndarray casted to dtype
    """

    if is_extension_array_dtype(dtype):
        values = dtype.construct_array_type()._from_sequence(values)
    elif is_bool_dtype(dtype):
        values = values.astype(dtype, copy=False)

        # a bool Index is only supported as object dtype
        if isinstance(original, ABCIndexClass):
            values = values.astype(object, copy=False)
    elif dtype is not None:
        if is_datetime64_dtype(dtype):
            dtype = "datetime64[ns]"
        elif is_timedelta64_dtype(dtype):
            dtype = "timedelta64[ns]"

        values = values.astype(dtype, copy=False)

    return values


def _ensure_arraylike(values):
    """
    ensure that we are arraylike if not already
    """
    if not is_array_like(values):
        inferred = lib.infer_dtype(values, skipna=False)
        if inferred in ["mixed", "string", "unicode"]:
            if isinstance(values, tuple):
                values = list(values)
            values = construct_1d_object_array_from_listlike(values)
        else:
            values = np.asarray(values)
    return values


_hashtables = {
    "float64": htable.Float64HashTable,
    "uint64": htable.UInt64HashTable,
    "int64": htable.Int64HashTable,
    "string": htable.StringHashTable,
    "object": htable.PyObjectHashTable,
}


def _get_hashtable_algo(values):
    """
    Parameters
    ----------
    values : arraylike

    Returns
    -------
    htable : HashTable subclass
    values : ndarray
    """
    values, _ = _ensure_data(values)

    ndtype = _check_object_for_strings(values)
    htable = _hashtables[ndtype]
    return htable, values


def _get_values_for_rank(values):
    if is_categorical_dtype(values):
        values = values._values_for_rank()

    values, _ = _ensure_data(values)
    return values


def _get_data_algo(values):
    values = _get_values_for_rank(values)

    ndtype = _check_object_for_strings(values)
    htable = _hashtables.get(ndtype, _hashtables["object"])

    return htable, values


def _check_object_for_strings(values) -> str:
    """
    Check if we can use a string hashtable instead of an object hashtable.

    Parameters
    ----------
    values : ndarray

    Returns
    -------
    str
    """
    ndtype = values.dtype.name
    if ndtype == "object":

        # it's cheaper to use a String Hash Table than Object; we infer
        # including nulls because that is the only difference between
        # StringHashTable and ObjectHashtable
        if lib.infer_dtype(values, skipna=False) in ["string"]:
            ndtype = "string"
    return ndtype


# --------------- #
# top-level algos #
# --------------- #


def unique(values):
    """
    Hash table-based unique. Uniques are returned in order
    of appearance. This does NOT sort.

    Significantly faster than numpy.unique. Includes NA values.

    Parameters
    ----------
    values : 1d array-like

    Returns
    -------
    numpy.ndarray or ExtensionArray

        The return can be:

        * Index : when the input is an Index
        * Categorical : when the input is a Categorical dtype
        * ndarray : when the input is a Series/ndarray

        Return numpy.ndarray or ExtensionArray.

    See Also
    --------
    Index.unique
    Series.unique

    Examples
    --------
    >>> pd.unique(pd.Series([2, 1, 3, 3]))
    array([2, 1, 3])

    >>> pd.unique(pd.Series([2] + [1] * 5))
    array([2, 1])

    >>> pd.unique(pd.Series([pd.Timestamp('20160101'),
    ...                      pd.Timestamp('20160101')]))
    array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')

    >>> pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'),
    ...                      pd.Timestamp('20160101', tz='US/Eastern')]))
    array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')],
          dtype=object)

    >>> pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
    ...                     pd.Timestamp('20160101', tz='US/Eastern')]))
    DatetimeIndex(['2016-01-01 00:00:00-05:00'],
    ...           dtype='datetime64[ns, US/Eastern]', freq=None)

    >>> pd.unique(list('baabc'))
    array(['b', 'a', 'c'], dtype=object)

    An unordered Categorical will return categories in the
    order of appearance.

    >>> pd.unique(pd.Series(pd.Categorical(list('baabc'))))
    [b, a, c]
    Categories (3, object): [b, a, c]

    >>> pd.unique(pd.Series(pd.Categorical(list('baabc'),
    ...                                    categories=list('abc'))))
    [b, a, c]
    Categories (3, object): [b, a, c]

    An ordered Categorical preserves the category ordering.

    >>> pd.unique(pd.Series(pd.Categorical(list('baabc'),
    ...                                    categories=list('abc'),
    ...                                    ordered=True)))
    [b, a, c]
    Categories (3, object): [a < b < c]

    An array of tuples

    >>> pd.unique([('a', 'b'), ('b', 'a'), ('a', 'c'), ('b', 'a')])
    array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
    """
    values = _ensure_arraylike(values)

    if is_extension_array_dtype(values):
        # Dispatch to extension dtype's unique.
        return values.unique()

    original = values
    htable, values = _get_hashtable_algo(values)

    table = htable(len(values))
    uniques = table.unique(values)
    uniques = _reconstruct_data(uniques, original.dtype, original)
    return uniques


unique1d = unique


def isin(comps, values) -> np.ndarray:
    """
    Compute the isin boolean array.

    Parameters
    ----------
    comps : array-like
    values : array-like

    Returns
    -------
    ndarray[bool]
        Same length as `comps`.
    """
    if not is_list_like(comps):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{type(comps).__name__}]"
        )
    if not is_list_like(values):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{type(values).__name__}]"
        )

    if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)):
        values = construct_1d_object_array_from_listlike(list(values))

    if is_categorical_dtype(comps):
        # TODO(extension)
        # handle categoricals
        return comps._values.isin(values)

    comps = com.values_from_object(comps)

    comps, dtype = _ensure_data(comps)
    values, _ = _ensure_data(values, dtype=dtype)

    # faster for larger cases to use np.in1d
    f = htable.ismember_object

    # GH16012
    # Ensure np.in1d doesn't get object types or it *may* throw an exception
    if len(comps) > 1_000_000 and not is_object_dtype(comps):
        f = np.in1d
    elif is_integer_dtype(comps):
        try:
            values = values.astype("int64", copy=False)
            comps = comps.astype("int64", copy=False)
            f = htable.ismember_int64
        except (TypeError, ValueError, OverflowError):
            values = values.astype(object)
            comps = comps.astype(object)

    elif is_float_dtype(comps):
        try:
            values = values.astype("float64", copy=False)
            comps = comps.astype("float64", copy=False)
            f = htable.ismember_float64
        except (TypeError, ValueError):
            values = values.astype(object)
            comps = comps.astype(object)

    return f(comps, values)
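
# Usage sketch (illustrative, not a pandas doctest): membership test of each
# element of `comps` against `values`; small integer inputs take the
# htable.ismember_int64 path here:
#
#   >>> isin(np.array([1, 2, 3]), [2, 4])
#   array([False,  True, False])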


def _factorize_array(
    values, na_sentinel: int = -1, size_hint=None, na_value=None
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Factorize an array-like to codes and uniques.

    This doesn't do any coercion of types or unboxing before factorization.

    Parameters
    ----------
    values : ndarray
    na_sentinel : int, default -1
    size_hint : int, optional
        Passed through to the hashtable's 'get_labels' method
    na_value : object, optional
        A value in `values` to consider missing. Note: only use this
        parameter when you know that you don't have any values pandas would
        consider missing in the array (NaN for float data, iNaT for
        datetimes, etc.).

    Returns
    -------
    codes : ndarray
    uniques : ndarray
    """
    hash_klass, values = _get_data_algo(values)

    table = hash_klass(size_hint or len(values))
    uniques, codes = table.factorize(values, na_sentinel=na_sentinel, na_value=na_value)

    codes = ensure_platform_int(codes)
    return codes, uniques


_shared_docs[
    "factorize"
] = """
    Encode the object as an enumerated type or categorical variable.

    This method is useful for obtaining a numeric representation of an
    array when all that matters is identifying distinct values. `factorize`
    is available as both a top-level function :func:`pandas.factorize`,
    and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.

    Parameters
    ----------
    %(values)s%(sort)s
    na_sentinel : int, default -1
        Value to mark "not found".
    %(size_hint)s\

    Returns
    -------
    codes : ndarray
        An integer ndarray that's an indexer into `uniques`.
        ``uniques.take(codes)`` will have the same values as `values`.
    uniques : ndarray, Index, or Categorical
        The unique valid values. When `values` is Categorical, `uniques`
        is a Categorical. When `values` is some other pandas object, an
        `Index` is returned. Otherwise, a 1-D ndarray is returned.

        .. note::

            Even if there's a missing value in `values`, `uniques` will
            *not* contain an entry for it.

    See Also
    --------
    cut : Discretize continuous-valued array.
    unique : Find the unique values in an array.

    Examples
    --------
    These examples all show factorize as a top-level method like
    ``pd.factorize(values)``. The results are identical for methods like
    :meth:`Series.factorize`.

    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
    >>> codes
    array([0, 0, 1, 2, 0])
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    With ``sort=True``, the `uniques` will be sorted, and `codes` will be
    shuffled so that the relationship is maintained.

    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
    >>> codes
    array([1, 1, 0, 2, 1])
    >>> uniques
    array(['a', 'b', 'c'], dtype=object)

    Missing values are indicated in `codes` with `na_sentinel`
    (``-1`` by default). Note that missing values are never
    included in `uniques`.

    >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
    >>> codes
    array([ 0, -1,  1,  2,  0])
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    Thus far, we've only factorized lists (which are internally coerced to
    NumPy arrays). When factorizing pandas objects, the type of `uniques`
    will differ. For Categoricals, a `Categorical` is returned.

    >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1])
    >>> uniques
    [a, c]
    Categories (3, object): [a, b, c]

    Notice that ``'b'`` is in ``uniques.categories``, despite not being
    present in ``cat.values``.

    For all other pandas objects, an Index of the appropriate type is
    returned.

    >>> cat = pd.Series(['a', 'a', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1])
    >>> uniques
    Index(['a', 'c'], dtype='object')
    """


@Substitution(
    values=dedent(
        """\
    values : sequence
        A 1-D sequence. Sequences that aren't pandas objects are
        coerced to ndarrays before factorization.
    """
    ),
    sort=dedent(
        """\
    sort : bool, default False
        Sort `uniques` and shuffle `codes` to maintain the
        relationship.
    """
    ),
    size_hint=dedent(
        """\
    size_hint : int, optional
        Hint to the hashtable sizer.
    """
    ),
)
@Appender(_shared_docs["factorize"])
def factorize(
    values, sort: bool = False, na_sentinel: int = -1, size_hint: Optional[int] = None
) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]:
    # Implementation notes: This method is responsible for 3 things
    # 1.) coercing data to array-like (ndarray, Index, extension array)
    # 2.) factorizing codes and uniques
    # 3.) Maybe boxing the uniques in an Index
    #
    # Step 2 is dispatched to extension types (like Categorical). They are
    # responsible only for factorization. All data coercion, sorting and boxing
    # should happen here.

    values = _ensure_arraylike(values)
    original = values

    if is_extension_array_dtype(values):
        values = extract_array(values)
        codes, uniques = values.factorize(na_sentinel=na_sentinel)
        dtype = original.dtype
    else:
        values, dtype = _ensure_data(values)

        if original.dtype.kind in ["m", "M"]:
            na_value = na_value_for_dtype(original.dtype)
        else:
            na_value = None

        codes, uniques = _factorize_array(
            values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value
        )

    if sort and len(uniques) > 0:
        uniques, codes = safe_sort(
            uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
        )

    uniques = _reconstruct_data(uniques, dtype, original)

    # return original tenor
    if isinstance(original, ABCIndexClass):
        uniques = original._shallow_copy(uniques, name=None)
    elif isinstance(original, ABCSeries):
        from pandas import Index

        uniques = Index(uniques)

    return codes, uniques


def value_counts(
    values,
    sort: bool = True,
    ascending: bool = False,
    normalize: bool = False,
    bins=None,
    dropna: bool = True,
) -> "Series":
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : bool, default True
        Sort by values
    ascending : bool, default False
        Sort in ascending order
    normalize : bool, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        a convenience for pd.cut; only works with numeric data
    dropna : bool, default True
        Don't include counts of NaN

    Returns
    -------
    Series
    """
    from pandas.core.series import Series

    name = getattr(values, "name", None)

    if bins is not None:
        from pandas.core.reshape.tile import cut

        values = Series(values)
        try:
            ii = cut(values, bins, include_lowest=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")

        # count, remove nulls (from the index), and use the bins
        result = ii.value_counts(dropna=dropna)
        result = result[result.index.notna()]
        result.index = result.index.astype("interval")
        result = result.sort_index()

        # if we are dropna and we have NO values
        if dropna and (result.values == 0).all():
            result = result.iloc[0:0]

        # normalizing is by len of all (regardless of dropna)
        counts = np.array([len(ii)])

    else:

        if is_extension_array_dtype(values):

            # handle Categorical and sparse,
            result = Series(values)._values.value_counts(dropna=dropna)
            result.name = name
            counts = result.values

        else:
            keys, counts = _value_counts_arraylike(values, dropna)

            result = Series(counts, index=keys, name=name)

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / float(counts.sum())

    return result
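
# Usage sketch (illustrative, not a pandas doctest): counts come back as a
# Series, sorted by frequency by default:
#
#   >>> value_counts(np.array([1, 1, 2, 1]))
#   1    3
#   2    1
#   dtype: int64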


def _value_counts_arraylike(values, dropna: bool):
    """
    Parameters
    ----------
    values : arraylike
    dropna : bool

    Returns
    -------
    uniques : np.ndarray or ExtensionArray
    counts : np.ndarray
    """
    values = _ensure_arraylike(values)
    original = values
    values, _ = _ensure_data(values)
    ndtype = values.dtype.name

    if needs_i8_conversion(original.dtype):
        # datetime, timedelta, or period

        keys, counts = htable.value_count_int64(values, dropna)

        if dropna:
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]

    else:
        # ndarray like

        # TODO: handle uint8
        f = getattr(htable, f"value_count_{ndtype}")
        keys, counts = f(values, dropna)

        mask = isna(values)
        if not dropna and mask.any():
            if not isna(keys).any():
                keys = np.insert(keys, 0, np.NaN)
                counts = np.insert(counts, 0, mask.sum())

    keys = _reconstruct_data(keys, original.dtype, original)

    return keys, counts


def duplicated(values, keep="first") -> np.ndarray:
    """
    Return boolean ndarray denoting duplicate values.

    Parameters
    ----------
    values : ndarray-like
        Array over which to check for duplicate values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    duplicated : ndarray
    """

    values, _ = _ensure_data(values)
    ndtype = values.dtype.name
    f = getattr(htable, f"duplicated_{ndtype}")
    return f(values, keep=keep)
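
# Usage sketch (illustrative, not a pandas doctest): with the default
# keep='first', only repeats after the first occurrence are flagged:
#
#   >>> duplicated(np.array([1, 2, 1, 3, 1]))
#   array([False, False,  True, False,  True])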


def mode(values, dropna: bool = True) -> "Series":
    """
    Returns the mode(s) of an array.

    Parameters
    ----------
    values : array-like
        Array over which to check for duplicate values.
    dropna : boolean, default True
        Don't consider counts of NaN/NaT.

        .. versionadded:: 0.24.0

    Returns
    -------
    mode : Series
    """
    from pandas import Series

    values = _ensure_arraylike(values)
    original = values

    # categorical is a fast-path
    if is_categorical_dtype(values):
        if isinstance(values, Series):
            return Series(values.values.mode(dropna=dropna), name=values.name)
        return values.mode(dropna=dropna)

    if dropna and needs_i8_conversion(values.dtype):
        mask = values.isnull()
        values = values[~mask]

    values, _ = _ensure_data(values)
    ndtype = values.dtype.name

    f = getattr(htable, f"mode_{ndtype}")
    result = f(values, dropna=dropna)
    try:
        result = np.sort(result)
    except TypeError as err:
        warn(f"Unable to sort modes: {err}")

    result = _reconstruct_data(result, original.dtype, original)
    return Series(result)
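
# Usage sketch (illustrative, not a pandas doctest): the most frequent value(s)
# are returned as a Series:
#
#   >>> mode(np.array([1, 2, 2, 3]))
#   0    2
#   dtype: int64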


def rank(
    values,
    axis: int = 0,
    method: str = "average",
    na_option: str = "keep",
    ascending: bool = True,
    pct: bool = False,
):
    """
    Rank the values along a given axis.

    Parameters
    ----------
    values : array-like
        Array whose values will be ranked. The number of dimensions in this
        array must not exceed 2.
    axis : int, default 0
        Axis over which to perform rankings.
    method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
        The method by which tiebreaks are broken during the ranking.
    na_option : {'keep', 'top'}, default 'keep'
        The method by which NaNs are placed in the ranking.
        - ``keep``: rank each NaN value with a NaN ranking
        - ``top``: replace each NaN with either +/- inf so that they
          are ranked at the top
    ascending : boolean, default True
        Whether or not the elements should be ranked in ascending order.
    pct : boolean, default False
        Whether or not to display the returned rankings in integer form
        (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
    """
    if values.ndim == 1:
        values = _get_values_for_rank(values)
        ranks = algos.rank_1d(
            values,
            ties_method=method,
            ascending=ascending,
            na_option=na_option,
            pct=pct,
        )
    elif values.ndim == 2:
        values = _get_values_for_rank(values)
        ranks = algos.rank_2d(
            values,
            axis=axis,
            ties_method=method,
            ascending=ascending,
            na_option=na_option,
            pct=pct,
        )
    else:
        raise TypeError("Arrays with ndim > 2 are not supported.")

    return ranks
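
# Usage sketch (illustrative, not a pandas doctest): average ranking of a 1-D
# float array:
#
#   >>> rank(np.array([3.0, 1.0, 2.0]))
#   array([3., 1., 2.])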


def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):
    """
    Perform array addition that checks for underflow and overflow.

    Performs the addition of an int64 array and an int64 integer (or array)
    but checks that they do not result in overflow first. For elements that
    are indicated to be NaN, whether or not there is overflow for that element
    is automatically ignored.

    Parameters
    ----------
    arr : array addend.
    b : array or scalar addend.
    arr_mask : boolean array or None
        array indicating which elements to exclude from checking
    b_mask : boolean array or boolean or None
        array or scalar indicating which element(s) to exclude from checking

    Returns
    -------
    sum : An array for elements x + b for each element x in arr if b is
          a scalar or an array for elements x + y for each element pair
          (x, y) in (arr, b).

    Raises
    ------
    OverflowError if any x + y exceeds the maximum or minimum int64 value.
    """
    # For performance reasons, we broadcast 'b' to the new array 'b2'
    # so that it has the same size as 'arr'.
    b2 = np.broadcast_to(b, arr.shape)
    if b_mask is not None:
        # We do the same broadcasting for b_mask as well.
        b2_mask = np.broadcast_to(b_mask, arr.shape)
    else:
        b2_mask = None

    # For elements that are NaN, regardless of their value, we should
    # ignore whether they overflow or not when doing the checked add.
    if arr_mask is not None and b2_mask is not None:
        not_nan = np.logical_not(arr_mask | b2_mask)
    elif arr_mask is not None:
        not_nan = np.logical_not(arr_mask)
    elif b_mask is not None:
        not_nan = np.logical_not(b2_mask)
    else:
        not_nan = np.empty(arr.shape, dtype=bool)
        not_nan.fill(True)

    # gh-14324: For each element in 'arr' and its corresponding element
    # in 'b2', we check the sign of the element in 'b2'. If it is positive,
    # we then check whether its sum with the element in 'arr' exceeds
    # np.iinfo(np.int64).max. If so, we have an overflow error. If it
    # is negative, we then check whether its sum with the element in
    # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
    # error as well.
    mask1 = b2 > 0
    mask2 = b2 < 0

    if not mask1.any():
        to_raise = ((np.iinfo(np.int64).min - b2 > arr) & not_nan).any()
    elif not mask2.any():
        to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any()
    else:
        to_raise = (
            ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any()
            or (
                (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2]
            ).any()
        )

    if to_raise:
        raise OverflowError("Overflow in int64 addition")
    return arr + b
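
# Usage sketch (illustrative, not a pandas doctest): adding past the int64
# maximum raises instead of silently wrapping around:
#
#   >>> checked_add_with_arr(np.array([np.iinfo(np.int64).max]), 1)
#   Traceback (most recent call last):
#     ...
#   OverflowError: Overflow in int64 addition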


def quantile(x, q, interpolation_method="fraction"):
    """
    Compute sample quantile or quantiles of the input array. For example, q=0.5
    computes the median.

    The `interpolation_method` parameter supports three values, namely
    `fraction` (default), `lower` and `higher`. Interpolation is done only
    if the desired quantile lies between two data points `i` and `j`. For
    `fraction`, the result is an interpolated value between `i` and `j`;
    for `lower`, the result is `i`, for `higher` the result is `j`.

    Parameters
    ----------
    x : ndarray
        Values from which to extract score.
    q : scalar or array
        Percentile at which to extract score.
    interpolation_method : {'fraction', 'lower', 'higher'}, optional
        This optional parameter specifies the interpolation method to use,
        when the desired quantile lies between two data points `i` and `j`:

        - fraction: `i + (j - i)*fraction`, where `fraction` is the
          fractional part of the index surrounded by `i` and `j`.
        - lower: `i`.
        - higher: `j`.

    Returns
    -------
    score : float
        Score at percentile.

    Examples
    --------
    >>> from scipy import stats
    >>> a = np.arange(100)
    >>> stats.scoreatpercentile(a, 50)
    49.5

    """
    x = np.asarray(x)
    mask = isna(x)

    x = x[~mask]

    values = np.sort(x)

    def _interpolate(a, b, fraction):
        """
        Returns the point at the given fraction between a and b, where
        'fraction' must be between 0 and 1.
        """
        return a + (b - a) * fraction

    def _get_score(at):
        if len(values) == 0:
            return np.nan

        idx = at * (len(values) - 1)
        if idx % 1 == 0:
            score = values[int(idx)]
        else:
            if interpolation_method == "fraction":
                score = _interpolate(values[int(idx)], values[int(idx) + 1], idx % 1)
            elif interpolation_method == "lower":
                score = values[int(np.floor(idx))]
            elif interpolation_method == "higher":
                score = values[int(np.ceil(idx))]
            else:
                raise ValueError(
                    "interpolation_method can only be 'fraction' "
                    ", 'lower' or 'higher'"
                )

        return score

    if is_scalar(q):
        return _get_score(q)
    else:
        q = np.asarray(q, np.float64)
        result = [_get_score(x) for x in q]
        result = np.array(result, dtype=np.float64)
        return result


# --------------- #
# select n        #
# --------------- #


class SelectN:
    def __init__(self, obj, n: int, keep: str):
        self.obj = obj
        self.n = n
        self.keep = keep

        if self.keep not in ("first", "last", "all"):
            raise ValueError('keep must be either "first", "last" or "all"')

    def nlargest(self):
        return self.compute("nlargest")

    def nsmallest(self):
        return self.compute("nsmallest")

    @staticmethod
    def is_valid_dtype_n_method(dtype) -> bool:
        """
        Helper function to determine if dtype is valid for
        nsmallest/nlargest methods
        """
        return (
            is_numeric_dtype(dtype) and not is_complex_dtype(dtype)
        ) or needs_i8_conversion(dtype)


class SelectNSeries(SelectN):
    """
    Implement n largest/smallest for Series

    Parameters
    ----------
    obj : Series
    n : int
    keep : {'first', 'last'}, default 'first'

    Returns
    -------
    nordered : Series
    """

    def compute(self, method):

        n = self.n
        dtype = self.obj.dtype
        if not self.is_valid_dtype_n_method(dtype):
            raise TypeError(f"Cannot use method '{method}' with dtype {dtype}")

        if n <= 0:
            return self.obj[[]]

        dropped = self.obj.dropna()

        # slow method
        if n >= len(self.obj):
            reverse_it = self.keep == "last" or method == "nlargest"
            ascending = method == "nsmallest"
            slc = np.s_[::-1] if reverse_it else np.s_[:]
            return dropped[slc].sort_values(ascending=ascending).head(n)

        # fast method
        arr, pandas_dtype = _ensure_data(dropped.values)
        if method == "nlargest":
            arr = -arr
            if is_integer_dtype(pandas_dtype):
                # GH 21426: ensure reverse ordering at boundaries
                arr -= 1

            elif is_bool_dtype(pandas_dtype):
                # GH 26154: ensure False is smaller than True
                arr = 1 - (-arr)

        if self.keep == "last":
            arr = arr[::-1]

        narr = len(arr)
        n = min(n, narr)

        kth_val = algos.kth_smallest(arr.copy(), n - 1)
        (ns,) = np.nonzero(arr <= kth_val)
        inds = ns[arr[ns].argsort(kind="mergesort")]

        if self.keep != "all":
            inds = inds[:n]

        if self.keep == "last":
            # reverse indices
            inds = narr - 1 - inds

        return dropped.iloc[inds]


class SelectNFrame(SelectN):
    """
    Implement n largest/smallest for DataFrame

    Parameters
    ----------
    obj : DataFrame
    n : int
    keep : {'first', 'last'}, default 'first'
    columns : list or str

    Returns
    -------
    nordered : DataFrame
    """

    def __init__(self, obj, n: int, keep: str, columns):
        super().__init__(obj, n, keep)
        if not is_list_like(columns) or isinstance(columns, tuple):
            columns = [columns]
        columns = list(columns)
        self.columns = columns

    def compute(self, method):

        from pandas import Int64Index

        n = self.n
        frame = self.obj
        columns = self.columns

        for column in columns:
            dtype = frame[column].dtype
            if not self.is_valid_dtype_n_method(dtype):
                raise TypeError(
                    f"Column {repr(column)} has dtype {dtype}, "
                    f"cannot use method {repr(method)} with this dtype"
                )

        def get_indexer(current_indexer, other_indexer):
            """
            Helper function to concat `current_indexer` and `other_indexer`
            depending on `method`
            """
            if method == "nsmallest":
                return current_indexer.append(other_indexer)
            else:
                return other_indexer.append(current_indexer)

        # Below we save and reset the index in case index contains duplicates
        original_index = frame.index
        cur_frame = frame = frame.reset_index(drop=True)
        cur_n = n
        indexer = Int64Index([])

        for i, column in enumerate(columns):
            # For each column we apply method to cur_frame[column].
            # If it's the last column or if we have the number of
            # results desired we are done.
            # Otherwise there are duplicates of the largest/smallest
            # value and we need to look at the rest of the columns
            # to determine which of the rows with the largest/smallest
            # value in the column to keep.
            series = cur_frame[column]
            is_last_column = len(columns) - 1 == i
            values = getattr(series, method)(
                cur_n, keep=self.keep if is_last_column else "all"
            )

            if is_last_column or len(values) <= cur_n:
                indexer = get_indexer(indexer, values.index)
                break

            # Now find all values which are equal to
            # the (nsmallest: largest)/(nlargest: smallest)
            # from our series.
            border_value = values == values[values.index[-1]]

            # Some of these values are among the top-n
            # some aren't.
            unsafe_values = values[border_value]

            # These values are definitely among the top-n
            safe_values = values[~border_value]
            indexer = get_indexer(indexer, safe_values.index)

            # Go on and separate the unsafe_values on the remaining
            # columns.
            cur_frame = cur_frame.loc[unsafe_values.index]
            cur_n = n - len(indexer)

        frame = frame.take(indexer)

        # Restore the index on frame
        frame.index = original_index.take(indexer)

        # If there is only one column, the frame is already sorted.
        if len(columns) == 1:
            return frame

        ascending = method == "nsmallest"

        return frame.sort_values(columns, ascending=ascending, kind="mergesort")


# ---- #
# take #
# ---- #


def _view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None):
    def wrapper(arr, indexer, out, fill_value=np.nan):
        if arr_dtype is not None:
            arr = arr.view(arr_dtype)
        if out_dtype is not None:
            out = out.view(out_dtype)
        if fill_wrap is not None:
            fill_value = fill_wrap(fill_value)
        f(arr, indexer, out, fill_value=fill_value)

    return wrapper


def _convert_wrapper(f, conv_dtype):
    def wrapper(arr, indexer, out, fill_value=np.nan):
        arr = arr.astype(conv_dtype)
        f(arr, indexer, out, fill_value=fill_value)

    return wrapper


def _take_2d_multi_object(arr, indexer, out, fill_value, mask_info):
    # this is not ideal, performance-wise, but it's better than raising
    # an exception (best to optimize in Cython to avoid getting here)
    row_idx, col_idx = indexer
    if mask_info is not None:
        (row_mask, col_mask), (row_needs, col_needs) = mask_info
    else:
        row_mask = row_idx == -1
        col_mask = col_idx == -1
        row_needs = row_mask.any()
        col_needs = col_mask.any()
    if fill_value is not None:
        if row_needs:
            out[row_mask, :] = fill_value
        if col_needs:
            out[:, col_mask] = fill_value
    for i in range(len(row_idx)):
        u_ = row_idx[i]
        for j in range(len(col_idx)):
            v = col_idx[j]
            out[i, j] = arr[u_, v]


def _take_nd_object(arr, indexer, out, axis: int, fill_value, mask_info):
    if mask_info is not None:
        mask, needs_masking = mask_info
    else:
        mask = indexer == -1
        needs_masking = mask.any()
    if arr.dtype != out.dtype:
        arr = arr.astype(out.dtype)
    if arr.shape[axis] > 0:
        arr.take(ensure_platform_int(indexer), axis=axis, out=out)
    if needs_masking:
        outindexer = [slice(None)] * arr.ndim
        outindexer[axis] = mask
        out[tuple(outindexer)] = fill_value


_take_1d_dict = {
    ("int8", "int8"): algos.take_1d_int8_int8,
    ("int8", "int32"): algos.take_1d_int8_int32,
    ("int8", "int64"): algos.take_1d_int8_int64,
    ("int8", "float64"): algos.take_1d_int8_float64,
    ("int16", "int16"): algos.take_1d_int16_int16,
    ("int16", "int32"): algos.take_1d_int16_int32,
    ("int16", "int64"): algos.take_1d_int16_int64,
    ("int16", "float64"): algos.take_1d_int16_float64,
    ("int32", "int32"): algos.take_1d_int32_int32,
    ("int32", "int64"): algos.take_1d_int32_int64,
    ("int32", "float64"): algos.take_1d_int32_float64,
    ("int64", "int64"): algos.take_1d_int64_int64,
    ("int64", "float64"): algos.take_1d_int64_float64,
    ("float32", "float32"): algos.take_1d_float32_float32,
    ("float32", "float64"): algos.take_1d_float32_float64,
    ("float64", "float64"): algos.take_1d_float64_float64,
    ("object", "object"): algos.take_1d_object_object,
    ("bool", "bool"): _view_wrapper(algos.take_1d_bool_bool, np.uint8, np.uint8),
    ("bool", "object"): _view_wrapper(algos.take_1d_bool_object, np.uint8, None),
    ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
        algos.take_1d_int64_int64, np.int64, np.int64, np.int64
    ),
}

_take_2d_axis0_dict = {
    ("int8", "int8"): algos.take_2d_axis0_int8_int8,
    ("int8", "int32"): algos.take_2d_axis0_int8_int32,
    ("int8", "int64"): algos.take_2d_axis0_int8_int64,
    ("int8", "float64"): algos.take_2d_axis0_int8_float64,
    ("int16", "int16"): algos.take_2d_axis0_int16_int16,
    ("int16", "int32"): algos.take_2d_axis0_int16_int32,
    ("int16", "int64"): algos.take_2d_axis0_int16_int64,
    ("int16", "float64"): algos.take_2d_axis0_int16_float64,
    ("int32", "int32"): algos.take_2d_axis0_int32_int32,
    ("int32", "int64"): algos.take_2d_axis0_int32_int64,
    ("int32", "float64"): algos.take_2d_axis0_int32_float64,
    ("int64", "int64"): algos.take_2d_axis0_int64_int64,
    ("int64", "float64"): algos.take_2d_axis0_int64_float64,
    ("float32", "float32"): algos.take_2d_axis0_float32_float32,
    ("float32", "float64"): algos.take_2d_axis0_float32_float64,
    ("float64", "float64"): algos.take_2d_axis0_float64_float64,
    ("object", "object"): algos.take_2d_axis0_object_object,
    ("bool", "bool"): _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8, np.uint8),
    ("bool", "object"): _view_wrapper(algos.take_2d_axis0_bool_object, np.uint8, None),
    ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
        algos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64
    ),
}

_take_2d_axis1_dict = {
    ("int8", "int8"): algos.take_2d_axis1_int8_int8,
    ("int8", "int32"): algos.take_2d_axis1_int8_int32,
    ("int8", "int64"): algos.take_2d_axis1_int8_int64,
    ("int8", "float64"): algos.take_2d_axis1_int8_float64,
    ("int16", "int16"): algos.take_2d_axis1_int16_int16,
    ("int16", "int32"): algos.take_2d_axis1_int16_int32,
    ("int16", "int64"): algos.take_2d_axis1_int16_int64,
    ("int16", "float64"): algos.take_2d_axis1_int16_float64,
    ("int32", "int32"): algos.take_2d_axis1_int32_int32,
    ("int32", "int64"): algos.take_2d_axis1_int32_int64,
    ("int32", "float64"): algos.take_2d_axis1_int32_float64,
    ("int64", "int64"): algos.take_2d_axis1_int64_int64,
    ("int64", "float64"): algos.take_2d_axis1_int64_float64,
    ("float32", "float32"): algos.take_2d_axis1_float32_float32,
    ("float32", "float64"): algos.take_2d_axis1_float32_float64,
    ("float64", "float64"): algos.take_2d_axis1_float64_float64,
    ("object", "object"): algos.take_2d_axis1_object_object,
    ("bool", "bool"): _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8, np.uint8),
    ("bool", "object"): _view_wrapper(algos.take_2d_axis1_bool_object, np.uint8, None),
    ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
        algos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64
    ),
}

_take_2d_multi_dict = {
    ("int8", "int8"): algos.take_2d_multi_int8_int8,
    ("int8", "int32"): algos.take_2d_multi_int8_int32,
    ("int8", "int64"): algos.take_2d_multi_int8_int64,
    ("int8", "float64"): algos.take_2d_multi_int8_float64,
    ("int16", "int16"): algos.take_2d_multi_int16_int16,
    ("int16", "int32"): algos.take_2d_multi_int16_int32,
    ("int16", "int64"): algos.take_2d_multi_int16_int64,
    ("int16", "float64"): algos.take_2d_multi_int16_float64,
    ("int32", "int32"): algos.take_2d_multi_int32_int32,
    ("int32", "int64"): algos.take_2d_multi_int32_int64,
    ("int32", "float64"): algos.take_2d_multi_int32_float64,
    ("int64", "int64"): algos.take_2d_multi_int64_int64,
    ("int64", "float64"): algos.take_2d_multi_int64_float64,
    ("float32", "float32"): algos.take_2d_multi_float32_float32,
    ("float32", "float64"): algos.take_2d_multi_float32_float64,
    ("float64", "float64"): algos.take_2d_multi_float64_float64,
    ("object", "object"): algos.take_2d_multi_object_object,
    ("bool", "bool"): _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8, np.uint8),
    ("bool", "object"): _view_wrapper(algos.take_2d_multi_bool_object, np.uint8, None),
    ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
        algos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64
    ),
}


def _get_take_nd_function(
    ndim: int, arr_dtype, out_dtype, axis: int = 0, mask_info=None
):
    if ndim <= 2:
        tup = (arr_dtype.name, out_dtype.name)
        if ndim == 1:
            func = _take_1d_dict.get(tup, None)
        elif ndim == 2:
            if axis == 0:
                func = _take_2d_axis0_dict.get(tup, None)
            else:
                func = _take_2d_axis1_dict.get(tup, None)
        if func is not None:
            return func

        tup = (out_dtype.name, out_dtype.name)
        if ndim == 1:
            func = _take_1d_dict.get(tup, None)
        elif ndim == 2:
            if axis == 0:
                func = _take_2d_axis0_dict.get(tup, None)
            else:
                func = _take_2d_axis1_dict.get(tup, None)
        if func is not None:
            func = _convert_wrapper(func, out_dtype)
            return func

    def func2(arr, indexer, out, fill_value=np.nan):
        indexer = ensure_int64(indexer)
        _take_nd_object(
            arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info
        )

    return func2


def take(arr, indices, axis: int = 0, allow_fill: bool = False, fill_value=None):
    """
    Take elements from an array.

    .. versionadded:: 0.23.0

    Parameters
    ----------
    arr : sequence
        Non array-likes (sequences without a dtype) are coerced
        to an ndarray.
    indices : sequence of integers
        Indices to be taken.
    axis : int, default 0
        The axis over which to select values.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to :func:`numpy.take`.

        * True: negative values in `indices` indicate
          missing values. These values are set to `fill_value`. Any
          other negative values raise a ``ValueError``.

    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the type (``self.dtype.na_value``) is used.

        For multi-dimensional `arr`, each *element* is filled with
        `fill_value`.

    Returns
    -------
    ndarray or ExtensionArray
        Same type as the input.

    Raises
    ------
    IndexError
        When `indices` is out of bounds for the array.
    ValueError
        When the indexer contains negative values other than ``-1``
        and `allow_fill` is True.

    Notes
    -----
    When `allow_fill` is False, `indices` may be whatever dimensionality
    is accepted by NumPy for `arr`.

    When `allow_fill` is True, `indices` should be 1-D.

    See Also
    --------
    numpy.take

    Examples
    --------
    >>> from pandas.api.extensions import take

    With the default ``allow_fill=False``, negative numbers indicate
    positional indices from the right.

    >>> take(np.array([10, 20, 30]), [0, 0, -1])
    array([10, 10, 30])

    Setting ``allow_fill=True`` will place `fill_value` in those positions.

    >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
    array([10., 10., nan])

    >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
    ...      fill_value=-10)
    array([ 10,  10, -10])
    """
    if not is_array_like(arr):
        arr = np.asarray(arr)

    indices = np.asarray(indices, dtype=np.intp)

    if allow_fill:
        # Pandas style, -1 means NA
        validate_indices(indices, arr.shape[axis])
        result = take_1d(
            arr, indices, axis=axis, allow_fill=True, fill_value=fill_value
        )
    else:
        # NumPy style
        result = arr.take(indices, axis=axis)
    return result


def take_nd(
    arr, indexer, axis: int = 0, out=None, fill_value=np.nan, allow_fill: bool = True
):
    """
    Specialized Cython take which sets NaN values in one pass

    This dispatches to ``take`` defined on ExtensionArrays. It does not
    currently dispatch to ``SparseArray.take`` for sparse ``arr``.

    Parameters
    ----------
    arr : array-like
        Input array.
    indexer : ndarray
        1-D array of indices to take, subarrays corresponding to -1 value
        indices are filled with fill_value
    axis : int, default 0
        Axis to take from
    out : ndarray or None, default None
        Optional output array, must be appropriate type to hold input and
        fill_value together, if indexer has any -1 value entries; call
        maybe_promote to determine this type for any fill_value
    fill_value : any, default np.nan
        Fill value to replace -1 values with
    allow_fill : boolean, default True
        If False, indexer is assumed to contain no -1 values so no filling
        will be done. This short-circuits computation of a mask. Result is
        undefined if allow_fill == False and -1 is present in indexer.

    Returns
    -------
    subarray : array-like
        May be the same type as the input, or cast to an ndarray.
    """
    mask_info = None

    if is_extension_array_dtype(arr):
        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)

    arr = extract_array(arr)
    arr = np.asarray(arr)

    if indexer is None:
        indexer = np.arange(arr.shape[axis], dtype=np.int64)
        dtype, fill_value = arr.dtype, arr.dtype.type()
    else:
        indexer = ensure_int64(indexer, copy=False)
        if not allow_fill:
            dtype, fill_value = arr.dtype, arr.dtype.type()
            mask_info = None, False
        else:
            # check for promotion based on types only (do this first because
            # it's faster than computing a mask)
            dtype, fill_value = maybe_promote(arr.dtype, fill_value)
            if dtype != arr.dtype and (out is None or out.dtype != dtype):
                # check if promotion is actually required based on indexer
                mask = indexer == -1
                needs_masking = mask.any()
                mask_info = mask, needs_masking
                if needs_masking:
                    if out is not None and out.dtype != dtype:
                        raise TypeError("Incompatible type for fill_value")
                else:
                    # if not, then depromote, set fill_value to dummy
                    # (it won't be used but we don't want the cython code
                    # to crash when trying to cast it to dtype)
                    dtype, fill_value = arr.dtype, arr.dtype.type()

    flip_order = False
    if arr.ndim == 2:
        if arr.flags.f_contiguous:
            flip_order = True

    if flip_order:
        arr = arr.T
        axis = arr.ndim - axis - 1
        if out is not None:
            out = out.T

    # at this point, it's guaranteed that dtype can hold both the arr values
    # and the fill_value
    if out is None:
        out_shape_ = list(arr.shape)
        out_shape_[axis] = len(indexer)
        out_shape = tuple(out_shape_)
        if arr.flags.f_contiguous and axis == arr.ndim - 1:
            # minor tweak that can make an order-of-magnitude difference
            # for dataframes initialized directly from 2-d ndarrays
            # (s.t. df.values is c-contiguous and df._data.blocks[0] is its
            # f-contiguous transpose)
            out = np.empty(out_shape, dtype=dtype, order="F")
        else:
            out = np.empty(out_shape, dtype=dtype)

    func = _get_take_nd_function(
        arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info
    )
    func(arr, indexer, out, fill_value)

    if flip_order:
        out = out.T
    return out
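
# Usage sketch (illustrative, not a pandas doctest): a -1 in the indexer
# requests the fill value, so an int64 input is promoted to float64 to hold
# the default NaN:
#
#   >>> take_nd(np.array([10, 20, 30]), np.array([0, -1]))
#   array([10., nan])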


take_1d = take_nd


def take_2d_multi(arr, indexer, fill_value=np.nan):
    """
    Specialized Cython take which sets NaN values in one pass.
    """
    # This is only called from one place in DataFrame._reindex_multi,
    # so we know indexer is well-behaved.
    assert indexer is not None
    assert indexer[0] is not None
    assert indexer[1] is not None

    row_idx, col_idx = indexer

    row_idx = ensure_int64(row_idx)
    col_idx = ensure_int64(col_idx)
    indexer = row_idx, col_idx
    mask_info = None

    # check for promotion based on types only (do this first because
    # it's faster than computing a mask)
    dtype, fill_value = maybe_promote(arr.dtype, fill_value)
    if dtype != arr.dtype:
        # check if promotion is actually required based on indexer
        row_mask = row_idx == -1
        col_mask = col_idx == -1
        row_needs = row_mask.any()
        col_needs = col_mask.any()
        mask_info = (row_mask, col_mask), (row_needs, col_needs)

        if not (row_needs or col_needs):
            # if not, then depromote, set fill_value to dummy
            # (it won't be used but we don't want the cython code
            # to crash when trying to cast it to dtype)
            dtype, fill_value = arr.dtype, arr.dtype.type()

    # at this point, it's guaranteed that dtype can hold both the arr values
    # and the fill_value
    out_shape = len(row_idx), len(col_idx)
    out = np.empty(out_shape, dtype=dtype)

    func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None)
    if func is None and arr.dtype != out.dtype:
        func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None)
        if func is not None:
            func = _convert_wrapper(func, out.dtype)
    if func is None:

        def func(arr, indexer, out, fill_value=np.nan):
            _take_2d_multi_object(
                arr, indexer, out, fill_value=fill_value, mask_info=mask_info
            )

    func(arr, indexer, out=out, fill_value=fill_value)
    return out


# ------------ #
# searchsorted #
# ------------ #


def searchsorted(arr, value, side="left", sorter=None):
    """
    Find indices where elements should be inserted to maintain order.

    .. versionadded:: 0.25.0

    Find the indices into a sorted array `arr` (a) such that, if the
    corresponding elements in `value` were inserted before the indices,
    the order of `arr` would be preserved.

    Assuming that `arr` is sorted:

    ======  ================================
    `side`  returned index `i` satisfies
    ======  ================================
    left    ``arr[i-1] < value <= self[i]``
    right   ``arr[i-1] <= value < self[i]``
    ======  ================================

    Parameters
    ----------
    arr : array-like
        Input array. If `sorter` is None, then it must be sorted in
        ascending order, otherwise `sorter` must be an array of indices
        that sort it.
    value : array_like
        Values to insert into `arr`.
    side : {'left', 'right'}, optional
        If 'left', the index of the first suitable location found is given.
        If 'right', return the last such index. If there is no suitable
        index, return either 0 or N (where N is the length of `self`).
    sorter : 1-D array_like, optional
        Optional array of integer indices that sort array a into ascending
        order. They are typically the result of argsort.

    Returns
    -------
    array of ints
        Array of insertion points with the same shape as `value`.

    See Also
    --------
    numpy.searchsorted : Similar method from NumPy.
    """
    if sorter is not None:
        sorter = ensure_platform_int(sorter)

    if (
        isinstance(arr, np.ndarray)
        and is_integer_dtype(arr)
        and (is_integer(value) or is_integer_dtype(value))
    ):
        # if `arr` and `value` have different dtypes, `arr` would be
        # recast by numpy, causing a slow search.
        # Before searching below, we therefore try to give `value` the
        # same dtype as `arr`, while guarding against integer overflows.
        iinfo = np.iinfo(arr.dtype.type)
        value_arr = np.array([value]) if is_scalar(value) else np.array(value)
        if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all():
            # value within bounds, so no overflow, so can convert value dtype
            # to dtype of arr
            dtype = arr.dtype
        else:
            dtype = value_arr.dtype

        if is_scalar(value):
            value = dtype.type(value)
        else:
            value = array(value, dtype=dtype)
    elif not (
        is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr)
    ):
        # E.g. if `arr` is an array with dtype='datetime64[ns]'
        # and `value` is a pd.Timestamp, we may need to convert value
        value_ser = array([value]) if is_scalar(value) else array(value)
        value = value_ser[0] if is_scalar(value) else value_ser
        if isinstance(value, Timestamp) and value.tzinfo is None:
            value = value.to_datetime64()

    result = arr.searchsorted(value, side=side, sorter=sorter)
    return result
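
# Usage sketch (illustrative, not a pandas doctest): the insertion point that
# keeps `arr` sorted:
#
#   >>> searchsorted(np.array([1, 3, 5]), 4)
#   2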


# ---- #
# diff #
# ---- #

_diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"}


def diff(arr, n: int, axis: int = 0, stacklevel=3):
    """
    difference of n between self,
    analogous to s - s.shift(n)

    Parameters
    ----------
    arr : ndarray
    n : int
        number of periods
    axis : int
        axis to shift on
    stacklevel : int
        The stacklevel for the lost dtype warning.

    Returns
    -------
    shifted
    """
    from pandas.core.arrays import PandasDtype

    n = int(n)
    na = np.nan
    dtype = arr.dtype

    if dtype.kind == "b":
        op = operator.xor
    else:
        op = operator.sub

    if isinstance(dtype, PandasDtype):
        # PandasArray cannot necessarily hold shifted versions of itself.
        arr = np.asarray(arr)
        dtype = arr.dtype

    if is_extension_array_dtype(dtype):
        if hasattr(arr, f"__{op.__name__}__"):
            return op(arr, arr.shift(n))
        else:
            warn(
                "dtype lost in 'diff()'. In the future this will raise a "
                "TypeError. Convert to a suitable dtype prior to calling 'diff'.",
                FutureWarning,
                stacklevel=stacklevel,
            )
            arr = np.asarray(arr)
            dtype = arr.dtype

    is_timedelta = False
    is_bool = False
    if needs_i8_conversion(arr):
        dtype = np.float64
        arr = arr.view("i8")
        na = iNaT
        is_timedelta = True

    elif is_bool_dtype(dtype):
        dtype = np.object_
        is_bool = True

    elif is_integer_dtype(dtype):
        dtype = np.float64

    dtype = np.dtype(dtype)
    out_arr = np.empty(arr.shape, dtype=dtype)

    na_indexer = [slice(None)] * arr.ndim
    na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None)
    out_arr[tuple(na_indexer)] = na

    if arr.ndim == 2 and arr.dtype.name in _diff_special:
        # TODO: can diff_2d dtype specialization troubles be fixed by defining
        # out_arr inside diff_2d?
        algos.diff_2d(arr, out_arr, n, axis)
    else:
        # To keep mypy happy, _res_indexer is a list while res_indexer is
        # a tuple, ditto for lag_indexer.
        _res_indexer = [slice(None)] * arr.ndim
        _res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n)
        res_indexer = tuple(_res_indexer)

        _lag_indexer = [slice(None)] * arr.ndim
        _lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None)
        lag_indexer = tuple(_lag_indexer)

        # need to make sure that we account for na for datelike/timedelta
        # we don't actually want to subtract these i8 numbers
        if is_timedelta:
            res = arr[res_indexer]
            lag = arr[lag_indexer]

            mask = (arr[res_indexer] == na) | (arr[lag_indexer] == na)
            if mask.any():
                res = res.copy()
                res[mask] = 0
                lag = lag.copy()
                lag[mask] = 0

            result = res - lag
            result[mask] = na
            out_arr[res_indexer] = result
        elif is_bool:
            out_arr[res_indexer] = arr[res_indexer] ^ arr[lag_indexer]
        else:
            out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer]

    if is_timedelta:
        out_arr = out_arr.astype("int64").view("timedelta64[ns]")

    return out_arr
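
# Usage sketch (illustrative, not a pandas doctest): integer input is upcast to
# float64 so the leading NaN can be represented:
#
#   >>> diff(np.array([1, 3, 6]), n=1)
#   array([nan,  2.,  3.])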


# --------------------------------------------------------------------
# Helper functions

# Note: safe_sort is in algorithms.py instead of sorting.py because it is
# low-dependency, is used in this module, and uses private methods from
# this module.
def safe_sort(
    values,
    codes=None,
    na_sentinel: int = -1,
    assume_unique: bool = False,
    verify: bool = True,
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    """
    Sort ``values`` and reorder corresponding ``codes``.

    ``values`` should be unique if ``codes`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``codes`` is not None.
    codes : list_like, optional
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``codes`` to mark "not found".
        Ignored when ``codes`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``codes`` is None.
    verify : bool, default True
        Check if codes are out of bound for the values and put out of bound
        codes equal to na_sentinel. If ``verify=False``, it is assumed there
        are no out of bound codes. Ignored when ``codes`` is None.

        .. versionadded:: 0.25.0

    Returns
    -------
    ordered : ndarray
        Sorted ``values``
    new_codes : ndarray
        Reordered ``codes``; returned when ``codes`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``codes`` is neither None
        nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``codes`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        raise TypeError(
            "Only list-like objects are allowed to be passed to safe_sort as values"
        )

    if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values):
        # don't convert to string types
        dtype, _ = infer_dtype_from_array(values)
        values = np.asarray(values, dtype=dtype)

    def sort_mixed(values):
        # order ints before strings, safe in py3
        str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
        nums = np.sort(values[~str_pos])
        strs = np.sort(values[str_pos])
        return np.concatenate([nums, np.asarray(strs, dtype=object)])

    sorter = None
    if (
        not is_extension_array_dtype(values)
        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
    ):
        # unorderable in py3 if mixed str/int
        ordered = sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # try this anyway
            ordered = sort_mixed(values)

    # codes:

    if codes is None:
        return ordered

    if not is_list_like(codes):
        raise TypeError(
            "Only list-like objects or None are allowed to "
            "be passed to safe_sort as codes"
        )
    codes = ensure_platform_int(np.asarray(codes))

    from pandas import Index

    if not assume_unique and not Index(values).is_unique:
        raise ValueError("values should be unique if codes is not None")

    if sorter is None:
        # mixed types
        hash_klass, values = _get_data_algo(values)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = ensure_platform_int(t.lookup(ordered))

    if na_sentinel == -1:
        # take_1d is faster, but only works for na_sentinels of -1
        order2 = sorter.argsort()
        new_codes = take_1d(order2, codes, fill_value=-1)
        if verify:
            mask = (codes < -len(values)) | (codes >= len(values))
        else:
            mask = None
    else:
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))
        # Out of bound indices will be masked with `na_sentinel` next, so we
        # may deal with them here without performance loss using `mode='wrap'`
        new_codes = reverse_indexer.take(codes, mode="wrap")

        mask = codes == na_sentinel
        if verify:
            mask = mask | (codes < -len(values)) | (codes >= len(values))

    if mask is not None:
        np.putmask(new_codes, mask, na_sentinel)

    return ordered, ensure_platform_int(new_codes)
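
# Usage sketch (illustrative, not a pandas doctest): sorting values and
# remapping codes together:
#
#   >>> ordered, new_codes = safe_sort(["b", "a"], codes=[0, 1, 0])
#   >>> ordered
#   array(['a', 'b'], dtype=object)
#   >>> new_codes
#   array([1, 0, 1])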