Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/core/reshape/tile.py : 12%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Quantilization functions and related stuff
3"""
4import numpy as np
6from pandas._libs import Timedelta, Timestamp
7from pandas._libs.lib import infer_dtype
9from pandas.core.dtypes.common import (
10 _NS_DTYPE,
11 ensure_int64,
12 is_bool_dtype,
13 is_categorical_dtype,
14 is_datetime64_dtype,
15 is_datetime64tz_dtype,
16 is_datetime_or_timedelta_dtype,
17 is_extension_array_dtype,
18 is_integer,
19 is_integer_dtype,
20 is_list_like,
21 is_scalar,
22 is_timedelta64_dtype,
23)
24from pandas.core.dtypes.generic import ABCSeries
25from pandas.core.dtypes.missing import isna
27from pandas import Categorical, Index, IntervalIndex, to_datetime, to_timedelta
28import pandas.core.algorithms as algos
29import pandas.core.nanops as nanops
def cut(
    x,
    bins,
    right: bool = True,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    include_lowest: bool = False,
    duplicates: str = "raise",
):
    """
    Bin values into discrete intervals.

    Use `cut` to segment and sort data values into bins, for example to turn
    a continuous variable (ages) into a categorical one (age ranges). The
    bins can be a number of equal-width bins or explicitly specified edges.

    Parameters
    ----------
    x : array-like
        The input array to be binned. Must be 1-dimensional.
    bins : int, sequence of scalars, or IntervalIndex
        The criteria to bin by.

        * int : Number of equal-width bins spanning the range of `x`. The
          open end of that range (the lowest edge when ``right=True``, the
          highest edge otherwise) is widened by 0.1% of the range so the
          extreme value of `x` still falls inside a bin.
        * sequence of scalars : Explicit bin edges, allowing non-uniform
          widths. The edges are used as given and must not decrease.
        * IntervalIndex : The exact bins to use. Must be non-overlapping.

    right : bool, default True
        Whether each bin includes its right edge. With ``right=True`` the
        edges ``[1, 2, 3, 4]`` mean (1,2], (2,3], (3,4]. Ignored when `bins`
        is an IntervalIndex.
    labels : array or False, default None
        Labels for the returned bins; must have one entry per bin. If False,
        only integer bin indicators are returned. Any other non-list-like
        value (for example True) raises an error. Ignored when `bins` is an
        IntervalIndex.
    retbins : bool, default False
        Whether to also return the bins. Useful when `bins` is an int.
    precision : int, default 3
        The precision at which to store and display the bin labels.
    include_lowest : bool, default False
        Whether the first interval should be left-inclusive or not.
    duplicates : {default 'raise', 'drop'}, optional
        If bin edges are not unique, raise ValueError or drop non-uniques.

        .. versionadded:: 0.23.0

    Returns
    -------
    out : Categorical, Series, or ndarray
        The bin of each value of `x`. The type depends on `labels`.

        * None (default) : a Series for Series `x`, otherwise a Categorical,
          whose categories are intervals.
        * sequence of scalars : a Series for Series `x`, otherwise a
          Categorical, whose categories are the given labels.
        * False : integer bin indicators (stored as floats when any value
          is NA or falls outside the bins); a Series for Series `x`,
          otherwise an ndarray.

    bins : numpy.ndarray or IntervalIndex
        Only returned when ``retbins=True``. For int or sequence `bins` these
        are the edges actually used (with duplicates removed when
        ``duplicates='drop'``); an IntervalIndex is returned unchanged.

    Raises
    ------
    ValueError
        If an int `bins` is smaller than 1, `x` is empty or contains
        infinity while `bins` is an int, an IntervalIndex `bins` overlaps,
        or explicit edges decrease.

    See Also
    --------
    qcut : Discretize variable into equal-sized buckets based on rank
        or based on sample quantiles.
    Categorical : Array type for storing data that come from a
        fixed set of values.
    Series : One-dimensional array with axis labels (including time series).
    IntervalIndex : Immutable Index implementing an ordered, sliceable set.

    Notes
    -----
    Any NA values will be NA in the result. Out of bounds values will be NA
    in the resulting Series or Categorical object.

    Examples
    --------
    Discretize into three equal-width bins.

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
    ... # doctest: +ELLIPSIS
    [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
    Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ...

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
    ... # doctest: +ELLIPSIS
    ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
    Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ...
    array([0.994, 3.   , 5.   , 7.   ]))

    The same bins with explicit labels; the resulting categories are the
    labels and are ordered.

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]),
    ...        3, labels=["bad", "medium", "good"])
    [bad, good, medium, medium, good, bad]
    Categories (3, object): [bad < medium < good]

    ``labels=False`` returns only the bin indicators.

    >>> pd.cut([0, 1, 1, 2], bins=4, labels=False)
    array([0, 1, 1, 3])

    A Series input gives a Series with categorical dtype.

    >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
    ...               index=['a', 'b', 'c', 'd', 'e'])
    >>> pd.cut(s, 3)
    ... # doctest: +ELLIPSIS
    a    (1.992, 4.667]
    b    (1.992, 4.667]
    c    (4.667, 7.333]
    d     (7.333, 10.0]
    e     (7.333, 10.0]
    dtype: category
    Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ...

    With ``labels=False`` a Series input gives a Series of bin indicators.
    Because the bins are left-closed here, 10 lies outside the last bin
    [8, 10) and becomes NaN.

    >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False)
    ... # doctest: +ELLIPSIS
    (a    1.0
    b    2.0
    c    3.0
    d    4.0
    e    NaN
    dtype: float64, array([ 0,  2,  4,  6,  8, 10]))

    Use ``duplicates='drop'`` when the bin edges are not unique.

    >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True,
    ...        right=False, duplicates='drop')
    ... # doctest: +ELLIPSIS
    (a    1.0
    b    2.0
    c    3.0
    d    3.0
    e    NaN
    dtype: float64, array([ 0,  2,  4,  6, 10]))

    An IntervalIndex for `bins` yields exactly those categories. Values not
    covered by it become NaN: 0 is left of the first (right-closed) bin and
    1.5 falls between two bins.

    >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
    >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
    [NaN, (0, 1], NaN, (2, 3], (4, 5]]
    Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]]
    """
    # NOTE: this binning code is changed a bit from histogram for var(x) == 0

    source = x
    x = _preprocess_for_cut(x)
    x, dtype = _coerce_to_type(x)

    if isinstance(bins, IntervalIndex):
        # Caller supplied the exact intervals; only sanity-check them.
        if bins.is_overlapping:
            raise ValueError("Overlapping IntervalIndex is not accepted.")

    elif np.iterable(bins):
        # Explicit edges: bring them to the same numeric representation as x.
        bins = (
            np.asarray(bins, dtype=_NS_DTYPE)
            if is_datetime64tz_dtype(bins)
            else np.asarray(bins)
        )
        bins = _convert_bin_to_numeric_type(bins, dtype)

        # GH 26045: cast to float64 to avoid an overflow
        steps = np.diff(bins.astype("float64"))
        if (steps < 0).any():
            raise ValueError("bins must increase monotonically.")

    else:
        # `bins` is a count: build equal-width edges over the data range.
        if is_scalar(bins) and bins < 1:
            raise ValueError("`bins` should be a positive integer.")

        try:  # for array-like
            n_values = x.size
        except AttributeError:
            x = np.asarray(x)
            n_values = x.size

        if n_values == 0:
            raise ValueError("Cannot cut empty array")

        extremes = (nanops.nanmin(x), nanops.nanmax(x))
        lo, hi = [bound + 0.0 for bound in extremes]

        if np.isinf(lo) or np.isinf(hi):
            # GH 24314
            raise ValueError(
                "cannot specify integer `bins` when input data contains infinity"
            )

        if lo == hi:
            # Degenerate range: widen both end points before binning.
            pad_lo = 0.001 * abs(lo) if lo != 0 else 0.001
            pad_hi = 0.001 * abs(hi) if hi != 0 else 0.001
            lo -= pad_lo
            hi += pad_hi
            bins = np.linspace(lo, hi, bins + 1, endpoint=True)
        else:
            # Regular range: widen the open end point after binning.
            edges = np.linspace(lo, hi, bins + 1, endpoint=True)
            widen = (hi - lo) * 0.001  # 0.1% of the range
            if right:
                edges[0] -= widen
            else:
                edges[-1] += widen
            bins = edges

    fac, bins = _bins_to_cuts(
        x,
        bins,
        right=right,
        labels=labels,
        precision=precision,
        include_lowest=include_lowest,
        dtype=dtype,
        duplicates=duplicates,
    )

    return _postprocess_for_cut(fac, bins, retbins, dtype, source)
def qcut(
    x,
    q,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    duplicates: str = "raise",
):
    """
    Quantile-based discretization function.

    Discretize a variable into equal-sized buckets based on rank or on
    sample quantiles. For example, 1000 values cut into 10 quantiles produce
    a Categorical indicating the quantile membership of each data point.

    Parameters
    ----------
    x : 1d ndarray or Series
    q : int or list-like of float
        Number of quantiles: 10 for deciles, 4 for quartiles, etc.
        Alternatively an array of quantiles, e.g. [0, .25, .5, .75, 1.] for
        quartiles.
    labels : array or False, default None
        Used as labels for the resulting bins. Must be of the same length as
        the resulting bins. If False, return only integer indicators of the
        bins. If True, raises an error.
    retbins : bool, optional
        Whether to also return the bin edges. Can be useful if `q` is given
        as an int.
    precision : int, optional
        The precision at which to store and display the bin labels.
    duplicates : {default 'raise', 'drop'}, optional
        If bin edges are not unique, raise ValueError or drop non-uniques.

    Returns
    -------
    out : Categorical or Series or array of integers if labels is False
        A Series of type category if the input is a Series, otherwise a
        Categorical. Bins are represented as categories when categorical
        data is returned.
    bins : ndarray of floats
        Returned only if `retbins` is True.

    Notes
    -----
    Out of bounds values will be NA in the resulting Categorical object

    Examples
    --------
    >>> pd.qcut(range(5), 4)
    ... # doctest: +ELLIPSIS
    [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
    Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] ...

    >>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"])
    ... # doctest: +SKIP
    [good, good, medium, bad, bad]
    Categories (3, object): [good < medium < bad]

    >>> pd.qcut(range(5), 4, labels=False)
    array([0, 0, 1, 2, 3])
    """
    source = x
    x = _preprocess_for_cut(x)
    x, dtype = _coerce_to_type(x)

    # An integer q means "that many equally spaced quantiles from 0 to 1".
    probabilities = np.linspace(0, 1, q + 1) if is_integer(q) else q
    edges = algos.quantile(x, probabilities)

    # The lowest quantile edge is the minimum of x, so the first bin must be
    # left-inclusive for that value to be assigned to it.
    fac, edges = _bins_to_cuts(
        x,
        edges,
        labels=labels,
        precision=precision,
        include_lowest=True,
        dtype=dtype,
        duplicates=duplicates,
    )

    return _postprocess_for_cut(fac, edges, retbins, dtype, source)
354def _bins_to_cuts(
355 x,
356 bins,
357 right: bool = True,
358 labels=None,
359 precision: int = 3,
360 include_lowest: bool = False,
361 dtype=None,
362 duplicates: str = "raise",
363):
365 if duplicates not in ["raise", "drop"]:
366 raise ValueError(
367 "invalid value for 'duplicates' parameter, "
368 "valid options are: raise, drop"
369 )
371 if isinstance(bins, IntervalIndex):
372 # we have a fast-path here
373 ids = bins.get_indexer(x)
374 result = Categorical.from_codes(ids, categories=bins, ordered=True)
375 return result, bins
377 unique_bins = algos.unique(bins)
378 if len(unique_bins) < len(bins) and len(bins) != 2:
379 if duplicates == "raise":
380 raise ValueError(
381 f"Bin edges must be unique: {repr(bins)}.\n"
382 f"You can drop duplicate edges by setting the 'duplicates' kwarg"
383 )
384 else:
385 bins = unique_bins
387 side = "left" if right else "right"
388 ids = ensure_int64(bins.searchsorted(x, side=side))
390 if include_lowest:
391 ids[x == bins[0]] = 1
393 na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
394 has_nas = na_mask.any()
396 if labels is not False:
397 if not (labels is None or is_list_like(labels)):
398 raise ValueError(
399 "Bin labels must either be False, None or passed in as a "
400 "list-like argument"
401 )
403 elif labels is None:
404 labels = _format_labels(
405 bins, precision, right=right, include_lowest=include_lowest, dtype=dtype
406 )
408 else:
409 if len(labels) != len(bins) - 1:
410 raise ValueError(
411 "Bin labels must be one fewer than the number of bin edges"
412 )
414 if not is_categorical_dtype(labels):
415 labels = Categorical(labels, categories=labels, ordered=True)
417 np.putmask(ids, na_mask, 0)
418 result = algos.take_nd(labels, ids - 1)
420 else:
421 result = ids - 1
422 if has_nas:
423 result = result.astype(np.float64)
424 np.putmask(result, na_mask, np.nan)
426 return result, bins
429def _coerce_to_type(x):
430 """
431 if the passed data is of datetime/timedelta, bool or nullable int type,
432 this method converts it to numeric so that cut or qcut method can
433 handle it
434 """
435 dtype = None
437 if is_datetime64tz_dtype(x):
438 dtype = x.dtype
439 elif is_datetime64_dtype(x):
440 x = to_datetime(x)
441 dtype = np.dtype("datetime64[ns]")
442 elif is_timedelta64_dtype(x):
443 x = to_timedelta(x)
444 dtype = np.dtype("timedelta64[ns]")
445 elif is_bool_dtype(x):
446 # GH 20303
447 x = x.astype(np.int64)
448 # To support cut and qcut for IntegerArray we convert to float dtype.
449 # Will properly support in the future.
450 # https://github.com/pandas-dev/pandas/pull/31290
451 # https://github.com/pandas-dev/pandas/issues/31389
452 elif is_extension_array_dtype(x) and is_integer_dtype(x):
453 x = x.to_numpy(dtype=np.float64, na_value=np.nan)
455 if dtype is not None:
456 # GH 19768: force NaT to NaN during integer conversion
457 x = np.where(x.notna(), x.view(np.int64), np.nan)
459 return x, dtype
462def _convert_bin_to_numeric_type(bins, dtype):
463 """
464 if the passed bin is of datetime/timedelta type,
465 this method converts it to integer
467 Parameters
468 ----------
469 bins : list-like of bins
470 dtype : dtype of data
472 Raises
473 ------
474 ValueError if bins are not of a compat dtype to dtype
475 """
476 bins_dtype = infer_dtype(bins, skipna=False)
477 if is_timedelta64_dtype(dtype):
478 if bins_dtype in ["timedelta", "timedelta64"]:
479 bins = to_timedelta(bins).view(np.int64)
480 else:
481 raise ValueError("bins must be of timedelta64 dtype")
482 elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
483 if bins_dtype in ["datetime", "datetime64"]:
484 bins = to_datetime(bins).view(np.int64)
485 else:
486 raise ValueError("bins must be of datetime64 dtype")
488 return bins
491def _convert_bin_to_datelike_type(bins, dtype):
492 """
493 Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is
494 datelike
496 Parameters
497 ----------
498 bins : list-like of bins
499 dtype : dtype of data
501 Returns
502 -------
503 bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is
504 datelike
505 """
506 if is_datetime64tz_dtype(dtype):
507 bins = to_datetime(bins.astype(np.int64), utc=True).tz_convert(dtype.tz)
508 elif is_datetime_or_timedelta_dtype(dtype):
509 bins = Index(bins.astype(np.int64), dtype=dtype)
510 return bins
513def _format_labels(
514 bins, precision: int, right: bool = True, include_lowest: bool = False, dtype=None
515):
516 """ based on the dtype, return our labels """
518 closed = "right" if right else "left"
520 if is_datetime64tz_dtype(dtype):
521 formatter = lambda x: Timestamp(x, tz=dtype.tz)
522 adjust = lambda x: x - Timedelta("1ns")
523 elif is_datetime64_dtype(dtype):
524 formatter = Timestamp
525 adjust = lambda x: x - Timedelta("1ns")
526 elif is_timedelta64_dtype(dtype):
527 formatter = Timedelta
528 adjust = lambda x: x - Timedelta("1ns")
529 else:
530 precision = _infer_precision(precision, bins)
531 formatter = lambda x: _round_frac(x, precision)
532 adjust = lambda x: x - 10 ** (-precision)
534 breaks = [formatter(b) for b in bins]
535 if right and include_lowest:
536 # adjust lhs of first interval by precision to account for being right closed
537 breaks[0] = adjust(breaks[0])
539 return IntervalIndex.from_breaks(breaks, closed=closed)
542def _preprocess_for_cut(x):
543 """
544 handles preprocessing for cut where we convert passed
545 input to array, strip the index information and store it
546 separately
547 """
549 # Check that the passed array is a Pandas or Numpy object
550 # We don't want to strip away a Pandas data-type here (e.g. datetimetz)
551 ndim = getattr(x, "ndim", None)
552 if ndim is None:
553 x = np.asarray(x)
554 if x.ndim != 1:
555 raise ValueError("Input array must be 1 dimensional")
557 return x
560def _postprocess_for_cut(fac, bins, retbins: bool, dtype, original):
561 """
562 handles post processing for the cut method where
563 we combine the index information if the originally passed
564 datatype was a series
565 """
566 if isinstance(original, ABCSeries):
567 fac = original._constructor(fac, index=original.index, name=original.name)
569 if not retbins:
570 return fac
572 bins = _convert_bin_to_datelike_type(bins, dtype)
574 return fac, bins
577def _round_frac(x, precision: int):
578 """
579 Round the fractional part of the given number
580 """
581 if not np.isfinite(x) or x == 0:
582 return x
583 else:
584 frac, whole = np.modf(x)
585 if whole == 0:
586 digits = -int(np.floor(np.log10(abs(frac)))) - 1 + precision
587 else:
588 digits = precision
589 return np.around(x, digits)
def _infer_precision(base_precision: int, bins) -> int:
    """
    Infer an appropriate precision for _round_frac.

    Returns the smallest precision from `base_precision` up to 19 at which
    the rounded bins all stay distinct, or `base_precision` when no such
    precision exists.
    """
    workable = (
        candidate
        for candidate in range(base_precision, 20)
        if algos.unique([_round_frac(edge, candidate) for edge in bins]).size
        == bins.size
    )
    return next(workable, base_precision)  # default