# pandas/core/nanops.py

import functools
import itertools
import operator
from typing import Any, Optional, Tuple, Union

import numpy as np

from pandas._config import get_option

from pandas._libs import NaT, Timedelta, Timestamp, iNaT, lib
from pandas.compat._optional import import_optional_dependency

from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask
from pandas.core.dtypes.common import (
    _get_dtype,
    is_any_int_dtype,
    is_bool_dtype,
    is_complex,
    is_datetime64_dtype,
    is_datetime64tz_dtype,
    is_datetime_or_timedelta_dtype,
    is_float,
    is_float_dtype,
    is_integer,
    is_integer_dtype,
    is_numeric_dtype,
    is_object_dtype,
    is_scalar,
    is_timedelta64_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna

bn = import_optional_dependency("bottleneck", raise_on_missing=False, on_version="warn")
_BOTTLENECK_INSTALLED = bn is not None
_USE_BOTTLENECK = False


def set_use_bottleneck(v=True):
    # set/unset to use bottleneck
    global _USE_BOTTLENECK
    if _BOTTLENECK_INSTALLED:
        _USE_BOTTLENECK = v


set_use_bottleneck(get_option("compute.use_bottleneck"))
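
# Illustrative sketch (not part of the original module): the toggle above is
# normally driven through the pandas option system rather than called directly,
# e.g.
#
#   >>> import pandas as pd
#   >>> pd.set_option("compute.use_bottleneck", False)
#
# and a direct set_use_bottleneck(True) only takes effect when bottleneck is
# actually installed (_BOTTLENECK_INSTALLED above).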


class disallow:
    def __init__(self, *dtypes):
        super().__init__()
        self.dtypes = tuple(pandas_dtype(dtype).type for dtype in dtypes)

    def check(self, obj) -> bool:
        return hasattr(obj, "dtype") and issubclass(obj.dtype.type, self.dtypes)

    def __call__(self, f):
        @functools.wraps(f)
        def _f(*args, **kwargs):
            obj_iter = itertools.chain(args, kwargs.values())
            if any(self.check(obj) for obj in obj_iter):
                f_name = f.__name__.replace("nan", "")
                raise TypeError(
                    f"reduction operation '{f_name}' not allowed for this dtype"
                )
            try:
                with np.errstate(invalid="ignore"):
                    return f(*args, **kwargs)
            except ValueError as e:
                # we want to transform an object array
                # ValueError message to the more typical TypeError
                # e.g. this is normally a disallowed function on
                # object arrays that contain strings
                if is_object_dtype(args[0]):
                    raise TypeError(e)
                raise

        return _f
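
# Illustrative sketch (not in the original source): ``disallow`` is used below as
# a decorator, e.g. ``@disallow("M8")`` on nansum rejects datetime64 inputs before
# the reduction runs, raising
# TypeError("reduction operation 'sum' not allowed for this dtype").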


class bottleneck_switch:
    def __init__(self, name=None, **kwargs):
        self.name = name
        self.kwargs = kwargs

    def __call__(self, alt):
        bn_name = self.name or alt.__name__

        try:
            bn_func = getattr(bn, bn_name)
        except (AttributeError, NameError):  # pragma: no cover
            bn_func = None

        @functools.wraps(alt)
        def f(values, axis=None, skipna=True, **kwds):
            if len(self.kwargs) > 0:
                for k, v in self.kwargs.items():
                    if k not in kwds:
                        kwds[k] = v

            if values.size == 0 and kwds.get("min_count") is None:
                # We are empty, returning NA for our type
                # Only applies for the default `min_count` of None
                # since that affects how empty arrays are handled.
                # TODO(GH-18976) update all the nanops methods to
                # correctly handle empty inputs and remove this check.
                # It *may* just be `var`
                return _na_for_min_count(values, axis)

            if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name):
                if kwds.get("mask", None) is None:
                    # `mask` is not recognised by bottleneck, would raise
                    # TypeError if called
                    kwds.pop("mask", None)
                    result = bn_func(values, axis=axis, **kwds)

                    # prefer to treat inf/-inf as NA, but must compute the func
                    # twice :(
                    if _has_infs(result):
                        result = alt(values, axis=axis, skipna=skipna, **kwds)
                else:
                    result = alt(values, axis=axis, skipna=skipna, **kwds)
            else:
                result = alt(values, axis=axis, skipna=skipna, **kwds)

            return result

        return f
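
# Illustrative sketch (not in the original source): ``bottleneck_switch`` wraps a
# reduction so that the bottleneck implementation is used only when it is safe,
# e.g.
#
#   @bottleneck_switch()
#   def nanmean(values, axis=None, skipna=True, mask=None): ...
#
# dispatches to ``bn.nanmean`` when _USE_BOTTLENECK is set, skipna is True and
# _bn_ok_dtype approves the dtype; otherwise the pure-numpy ``alt`` body runs.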


def _bn_ok_dtype(dt, name: str) -> bool:
    # Bottleneck chokes on datetime64
    if not is_object_dtype(dt) and not (
        is_datetime_or_timedelta_dtype(dt) or is_datetime64tz_dtype(dt)
    ):

        # GH 15507
        # bottleneck does not properly upcast during the sum
        # so can overflow

        # GH 9422
        # further we also want to preserve NaN when all elements
        # are NaN, unlike bottleneck/numpy which consider this
        # to be 0
        if name in ["nansum", "nanprod"]:
            return False

        return True
    return False


def _has_infs(result) -> bool:
    if isinstance(result, np.ndarray):
        if result.dtype == "f8":
            return lib.has_infs_f8(result.ravel())
        elif result.dtype == "f4":
            return lib.has_infs_f4(result.ravel())
    try:
        return np.isinf(result).any()
    except (TypeError, NotImplementedError):
        # if it doesn't support infs, then it can't have infs
        return False


def _get_fill_value(dtype, fill_value=None, fill_value_typ=None):
    """ return the correct fill value for the dtype of the values """
    if fill_value is not None:
        return fill_value
    if _na_ok_dtype(dtype):
        if fill_value_typ is None:
            return np.nan
        else:
            if fill_value_typ == "+inf":
                return np.inf
            else:
                return -np.inf
    else:
        if fill_value_typ is None:
            return iNaT
        else:
            if fill_value_typ == "+inf":
                # need the max int here
                return _int64_max
            else:
                return iNaT
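
# Illustrative sketch (not in the original source): NaN-capable dtypes get a float
# fill value, while integer and datetime-like dtypes fall back to int64 sentinels:
#
#   _get_fill_value(np.dtype("float64"), fill_value_typ="+inf")  -> np.inf
#   _get_fill_value(np.dtype("int64"), fill_value_typ="+inf")    -> _int64_max
#   _get_fill_value(np.dtype("M8[ns]"))                          -> iNaT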


def _maybe_get_mask(
    values: np.ndarray, skipna: bool, mask: Optional[np.ndarray]
) -> Optional[np.ndarray]:
    """
    Compute a mask if and only if necessary.

    This function computes a mask only when one is needed; otherwise it returns
    the provided mask (potentially None) unchanged.

    A mask is never necessary if the values array is of boolean or integer
    dtypes, as these are incapable of storing NaNs. If passing a NaN-capable
    dtype that is interpretable as either boolean or integer data (eg,
    timedelta64), a mask must be provided.

    If the skipna parameter is False, a new mask will not be computed.

    The mask is computed using isna().

    Parameters
    ----------
    values : ndarray
        input array to potentially compute mask for
    skipna : bool
        boolean for whether NaNs should be skipped
    mask : Optional[ndarray]
        nan-mask if known

    Returns
    -------
    Optional[np.ndarray]
    """

    if mask is None:
        if is_bool_dtype(values.dtype) or is_integer_dtype(values.dtype):
            # Boolean data cannot contain nulls, so signal via mask being None
            return None

        if skipna:
            mask = isna(values)

    return mask
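
# Illustrative sketch (not in the original source):
#
#   >>> _maybe_get_mask(np.array([1.0, np.nan]), skipna=True, mask=None)
#   array([False,  True])
#   >>> _maybe_get_mask(np.array([1, 2]), skipna=True, mask=None) is None
#   True
#
# integer and boolean arrays cannot hold NaN, so no mask is computed for them.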


def _get_values(
    values: np.ndarray,
    skipna: bool,
    fill_value: Any = None,
    fill_value_typ: Optional[str] = None,
    mask: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, np.dtype, Any]:
    """
    Utility to get the values view, mask, dtype, dtype_max, and fill_value.

    If both mask and fill_value/fill_value_typ are not None and skipna is True,
    the values array will be copied.

    For input arrays of boolean or integer dtypes, copies will only occur if a
    precomputed mask, a fill_value/fill_value_typ, and skipna=True are
    provided.

    Parameters
    ----------
    values : ndarray
        input array to potentially compute mask for
    skipna : bool
        boolean for whether NaNs should be skipped
    fill_value : Any
        value to fill NaNs with
    fill_value_typ : str
        Set to '+inf' or '-inf' to handle dtype-specific infinities
    mask : Optional[np.ndarray]
        nan-mask if known

    Returns
    -------
    values : ndarray
        Potential copy of input value array
    mask : Optional[ndarray[bool]]
        Mask for values, if deemed necessary to compute
    dtype : dtype
        dtype for values
    dtype_max : dtype
        platform independent dtype
    fill_value : Any
        fill value used
    """

    # _get_values is only called from within nanops, and in all cases
    # with scalar fill_value. This guarantee is important for the
    # maybe_upcast_putmask call below
    assert is_scalar(fill_value)

    mask = _maybe_get_mask(values, skipna, mask)

    if is_datetime64tz_dtype(values):
        # lib.values_from_object returns M8[ns] dtype instead of tz-aware,
        # so this case must be handled separately from the rest
        dtype = values.dtype
        values = getattr(values, "_values", values)
    else:
        values = lib.values_from_object(values)
        dtype = values.dtype

    if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values):
        # changing timedelta64/datetime64 to int64 needs to happen after
        # finding `mask` above
        values = getattr(values, "asi8", values)
        values = values.view(np.int64)

    dtype_ok = _na_ok_dtype(dtype)

    # get our fill value (in case we need to provide an alternative
    # dtype for it)
    fill_value = _get_fill_value(
        dtype, fill_value=fill_value, fill_value_typ=fill_value_typ
    )

    copy = (mask is not None) and (fill_value is not None)

    if skipna and copy:
        values = values.copy()
        if dtype_ok:
            np.putmask(values, mask, fill_value)

        # promote if needed
        else:
            values, _ = maybe_upcast_putmask(values, mask, fill_value)

    # return a platform independent precision dtype
    dtype_max = dtype
    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        dtype_max = np.int64
    elif is_float_dtype(dtype):
        dtype_max = np.float64

    return values, mask, dtype, dtype_max, fill_value


def _na_ok_dtype(dtype):
    # TODO: what about datetime64tz?  PeriodDtype?
    return not issubclass(dtype.type, (np.integer, np.timedelta64, np.datetime64))


def _wrap_results(result, dtype, fill_value=None):
    """ wrap our results if needed """

    if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
        if fill_value is None:
            # GH#24293
            fill_value = iNaT
        if not isinstance(result, np.ndarray):
            tz = getattr(dtype, "tz", None)
            assert not isna(fill_value), "Expected non-null fill_value"
            if result == fill_value:
                result = np.nan
            result = Timestamp(result, tz=tz)
        else:
            result = result.view(dtype)
    elif is_timedelta64_dtype(dtype):
        if not isinstance(result, np.ndarray):
            if result == fill_value:
                result = np.nan

            # raise if we have a timedelta64[ns] which is too large
            if np.fabs(result) > _int64_max:
                raise ValueError("overflow in timedelta operation")

            result = Timedelta(result, unit="ns")
        else:
            result = result.astype("m8[ns]").view(dtype)

    return result


def _na_for_min_count(values, axis: Optional[int]):
    """
    Return the missing value for `values`.

    Parameters
    ----------
    values : ndarray
    axis : int or None
        axis for the reduction, required if values.ndim > 1.

    Returns
    -------
    result : scalar or ndarray
        For 1-D values, returns a scalar of the correct missing type.
        For 2-D values, returns a 1-D array where each element is missing.
    """
    # we either return np.nan or pd.NaT
    if is_numeric_dtype(values):
        values = values.astype("float64")
    fill_value = na_value_for_dtype(values.dtype)

    if values.ndim == 1:
        return fill_value
    else:
        assert axis is not None  # assertion to make mypy happy
        result_shape = values.shape[:axis] + values.shape[axis + 1 :]
        result = np.empty(result_shape, dtype=values.dtype)
        result.fill(fill_value)
        return result


def nanany(values, axis=None, skipna: bool = True, mask=None):
    """
    Check if any elements along an axis evaluate to True.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : bool

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2])
    >>> nanops.nanany(s)
    True

    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([np.nan])
    >>> nanops.nanany(s)
    False
    """
    values, _, _, _, _ = _get_values(values, skipna, fill_value=False, mask=mask)
    return values.any(axis)


def nanall(values, axis=None, skipna: bool = True, mask=None):
    """
    Check if all elements along an axis evaluate to True.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : bool

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, np.nan])
    >>> nanops.nanall(s)
    True

    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 0])
    >>> nanops.nanall(s)
    False
    """
    values, _, _, _, _ = _get_values(values, skipna, fill_value=True, mask=mask)
    return values.all(axis)


@disallow("M8")
def nansum(values, axis=None, skipna=True, min_count=0, mask=None):
    """
    Sum the elements along an axis ignoring NaNs

    Parameters
    ----------
    values : ndarray[dtype]
    axis : int, optional
    skipna : bool, default True
    min_count : int, default 0
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : dtype

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, np.nan])
    >>> nanops.nansum(s)
    3.0
    """
    values, mask, dtype, dtype_max, _ = _get_values(
        values, skipna, fill_value=0, mask=mask
    )
    dtype_sum = dtype_max
    if is_float_dtype(dtype):
        dtype_sum = dtype
    elif is_timedelta64_dtype(dtype):
        dtype_sum = np.float64
    the_sum = values.sum(axis, dtype=dtype_sum)
    the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count)

    return _wrap_results(the_sum, dtype)


@disallow("M8", DatetimeTZDtype)
@bottleneck_switch()
def nanmean(values, axis=None, skipna=True, mask=None):
    """
    Compute the mean of the elements along an axis ignoring NaNs

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, np.nan])
    >>> nanops.nanmean(s)
    1.5
    """
    values, mask, dtype, dtype_max, _ = _get_values(
        values, skipna, fill_value=0, mask=mask
    )
    dtype_sum = dtype_max
    dtype_count = np.float64
    if (
        is_integer_dtype(dtype)
        or is_timedelta64_dtype(dtype)
        or is_datetime64_dtype(dtype)
        or is_datetime64tz_dtype(dtype)
    ):
        dtype_sum = np.float64
    elif is_float_dtype(dtype):
        dtype_sum = dtype
        dtype_count = dtype
    count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
    the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))

    if axis is not None and getattr(the_sum, "ndim", False):
        with np.errstate(all="ignore"):
            # suppress division by zero warnings
            the_mean = the_sum / count
        ct_mask = count == 0
        if ct_mask.any():
            the_mean[ct_mask] = np.nan
    else:
        the_mean = the_sum / count if count > 0 else np.nan

    return _wrap_results(the_mean, dtype)


@disallow("M8")
@bottleneck_switch()
def nanmedian(values, axis=None, skipna=True, mask=None):
    """
    Compute the median along the given axis while ignoring NaNs.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 2, 2])
    >>> nanops.nanmedian(s)
    2.0
    """

    def get_median(x):
        mask = notna(x)
        if not skipna and not mask.all():
            return np.nan
        return np.nanmedian(x[mask])

    values, mask, dtype, dtype_max, _ = _get_values(values, skipna, mask=mask)
    if not is_float_dtype(values):
        values = values.astype("f8")
        if mask is not None:
            values[mask] = np.nan

    if axis is None:
        values = values.ravel()

    notempty = values.size

    # an array from a frame
    if values.ndim > 1:

        # there's a non-empty array to apply over otherwise numpy raises
        if notempty:
            if not skipna:
                return _wrap_results(
                    np.apply_along_axis(get_median, axis, values), dtype
                )

            # fastpath for the skipna case
            return _wrap_results(np.nanmedian(values, axis), dtype)

        # must return the correct shape, but median is not defined for the
        # empty set so return nans of shape "everything but the passed axis"
        # since "axis" is where the reduction would occur if we had a nonempty
        # array
        shp = np.array(values.shape)
        dims = np.arange(values.ndim)
        ret = np.empty(shp[dims != axis])
        ret.fill(np.nan)
        return _wrap_results(ret, dtype)

    # otherwise return a scalar value
    return _wrap_results(get_median(values) if notempty else np.nan, dtype)


def _get_counts_nanvar(
    values_shape: Tuple[int],
    mask: Optional[np.ndarray],
    axis: Optional[int],
    ddof: int,
    dtype=float,
) -> Tuple[Union[int, np.ndarray], Union[int, np.ndarray]]:
    """ Get the count of non-null values along an axis, accounting
    for degrees of freedom.

    Parameters
    ----------
    values_shape : Tuple[int]
        shape tuple from values ndarray, used if mask is None
    mask : Optional[ndarray[bool]]
        locations in values that should be considered missing
    axis : Optional[int]
        axis to count along
    ddof : int
        degrees of freedom
    dtype : type, optional
        type to use for count

    Returns
    -------
    count : scalar or array
    d : scalar or array
    """
    dtype = _get_dtype(dtype)
    count = _get_counts(values_shape, mask, axis, dtype=dtype)
    d = count - dtype.type(ddof)

    # always return NaN, never inf
    if is_scalar(count):
        if count <= ddof:
            count = np.nan
            d = np.nan
    else:
        mask2: np.ndarray = count <= ddof
        if mask2.any():
            np.putmask(d, mask2, np.nan)
            np.putmask(count, mask2, np.nan)
    return count, d


@disallow("M8")
@bottleneck_switch(ddof=1)
def nanstd(values, axis=None, skipna=True, ddof=1, mask=None):
    """
    Compute the standard deviation along given axis while ignoring NaNs

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    ddof : int, default 1
        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
        where N represents the number of elements.
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 2, 3])
    >>> nanops.nanstd(s)
    1.0
    """
    orig_dtype = values.dtype
    values, mask, dtype, dtype_max, fill_value = _get_values(values, skipna, mask=mask)

    result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask))
    return _wrap_results(result, orig_dtype)


@disallow("M8", "m8")
@bottleneck_switch(ddof=1)
def nanvar(values, axis=None, skipna=True, ddof=1, mask=None):
    """
    Compute the variance along given axis while ignoring NaNs

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    ddof : int, default 1
        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
        where N represents the number of elements.
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 2, 3])
    >>> nanops.nanvar(s)
    1.0
    """
    values = lib.values_from_object(values)
    dtype = values.dtype
    mask = _maybe_get_mask(values, skipna, mask)
    if is_any_int_dtype(values):
        values = values.astype("f8")
        if mask is not None:
            values[mask] = np.nan

    if is_float_dtype(values):
        count, d = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
    else:
        count, d = _get_counts_nanvar(values.shape, mask, axis, ddof)

    if skipna and mask is not None:
        values = values.copy()
        np.putmask(values, mask, 0)

    # xref GH10242
    # Compute variance via two-pass algorithm, which is stable against
    # cancellation errors and relatively accurate for small numbers of
    # observations.
    #
    # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
    if axis is not None:
        avg = np.expand_dims(avg, axis)
    sqr = _ensure_numeric((avg - values) ** 2)
    if mask is not None:
        np.putmask(sqr, mask, 0)
    result = sqr.sum(axis=axis, dtype=np.float64) / d

    # Return variance as np.float64 (the datatype used in the accumulator),
    # unless we were dealing with a float array, in which case use the same
    # precision as the original values array.
    if is_float_dtype(dtype):
        result = result.astype(dtype)
    return _wrap_results(result, values.dtype)
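
# Illustrative sketch of the two-pass computation above (not in the original
# source), for non-missing values [1.0, 2.0, 3.0] with ddof=1:
#   pass 1: avg = (1 + 2 + 3) / 3 = 2.0
#   pass 2: sqr = (2 - 1)**2 + (2 - 2)**2 + (2 - 3)**2 = 2.0
#   result = sqr / (count - ddof) = 2.0 / 2 = 1.0
# which matches the docstring example for pd.Series([1, np.nan, 2, 3]).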


@disallow("M8", "m8")
def nansem(values, axis=None, skipna=True, ddof=1, mask=None):
    """
    Compute the standard error of the mean along given axis while ignoring NaNs

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    ddof : int, default 1
        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
        where N represents the number of elements.
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float64
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 2, 3])
    >>> nanops.nansem(s)
    0.5773502691896258
    """

    # This checks if non-numeric-like data is passed with numeric_only=False
    # and raises a TypeError otherwise
    nanvar(values, axis, skipna, ddof=ddof, mask=mask)

    mask = _maybe_get_mask(values, skipna, mask)
    if not is_float_dtype(values.dtype):
        values = values.astype("f8")

    count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
    var = nanvar(values, axis, skipna, ddof=ddof)

    return np.sqrt(var) / np.sqrt(count)


def _nanminmax(meth, fill_value_typ):
    @bottleneck_switch(name="nan" + meth)
    def reduction(values, axis=None, skipna=True, mask=None):

        values, mask, dtype, dtype_max, fill_value = _get_values(
            values, skipna, fill_value_typ=fill_value_typ, mask=mask
        )

        if (axis is not None and values.shape[axis] == 0) or values.size == 0:
            try:
                result = getattr(values, meth)(axis, dtype=dtype_max)
                result.fill(np.nan)
            except (AttributeError, TypeError, ValueError):
                result = np.nan
        else:
            result = getattr(values, meth)(axis)

        result = _wrap_results(result, dtype, fill_value)
        return _maybe_null_out(result, axis, mask, values.shape)

    return reduction


nanmin = _nanminmax("min", fill_value_typ="+inf")
nanmax = _nanminmax("max", fill_value_typ="-inf")
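
# Note (not in the original source): the fill values above are deliberately
# "inverted": nanmin replaces NaNs with +inf so they can never win the minimum,
# and nanmax replaces them with -inf so they can never win the maximum.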


@disallow("O")
def nanargmax(values, axis=None, skipna=True, mask=None):
    """
    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : int
        The index of max value in specified axis or -1 in the NA case

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, 3, np.nan, 4])
    >>> nanops.nanargmax(s)
    4
    """
    values, mask, dtype, _, _ = _get_values(
        values, True, fill_value_typ="-inf", mask=mask
    )
    result = values.argmax(axis)
    result = _maybe_arg_null_out(result, axis, mask, skipna)
    return result


@disallow("O")
def nanargmin(values, axis=None, skipna=True, mask=None):
    """
    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : int
        The index of min value in specified axis or -1 in the NA case

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, 3, np.nan, 4])
    >>> nanops.nanargmin(s)
    0
    """
    values, mask, dtype, _, _ = _get_values(
        values, True, fill_value_typ="+inf", mask=mask
    )
    result = values.argmin(axis)
    result = _maybe_arg_null_out(result, axis, mask, skipna)
    return result


@disallow("M8", "m8")
def nanskew(values, axis=None, skipna=True, mask=None):
    """ Compute the sample skewness.

    The statistic computed here is the adjusted Fisher-Pearson standardized
    moment coefficient G1. The algorithm computes this coefficient directly
    from the second and third central moment.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float64
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 1, 2])
    >>> nanops.nanskew(s)
    1.7320508075688787
    """
    values = lib.values_from_object(values)
    mask = _maybe_get_mask(values, skipna, mask)
    if not is_float_dtype(values.dtype):
        values = values.astype("f8")
        count = _get_counts(values.shape, mask, axis)
    else:
        count = _get_counts(values.shape, mask, axis, dtype=values.dtype)

    if skipna and mask is not None:
        values = values.copy()
        np.putmask(values, mask, 0)

    mean = values.sum(axis, dtype=np.float64) / count
    if axis is not None:
        mean = np.expand_dims(mean, axis)

    adjusted = values - mean
    if skipna and mask is not None:
        np.putmask(adjusted, mask, 0)
    adjusted2 = adjusted ** 2
    adjusted3 = adjusted2 * adjusted
    m2 = adjusted2.sum(axis, dtype=np.float64)
    m3 = adjusted3.sum(axis, dtype=np.float64)

    # floating point error
    #
    # GH#18044: calc_skew in _libs/windows.pyx follows this behavior
    # to fix the fperr, treating m2 < 1e-14 as zero
    m2 = _zero_out_fperr(m2)
    m3 = _zero_out_fperr(m3)

    with np.errstate(invalid="ignore", divide="ignore"):
        result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5)

    dtype = values.dtype
    if is_float_dtype(dtype):
        result = result.astype(dtype)

    if isinstance(result, np.ndarray):
        result = np.where(m2 == 0, 0, result)
        result[count < 3] = np.nan
        return result
    else:
        result = 0 if m2 == 0 else result
        if count < 3:
            return np.nan
        return result
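
# For reference (not in the original source): with m2 and m3 the *sums* of squared
# and cubed deviations computed above, the expression
#     count * (count - 1) ** 0.5 / (count - 2) * (m3 / m2 ** 1.5)
# is algebraically equivalent to the usual adjusted Fisher-Pearson statistic
#     G1 = sqrt(n * (n - 1)) / (n - 2) * (m3 / n) / (m2 / n) ** 1.5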


@disallow("M8", "m8")
def nankurt(values, axis=None, skipna=True, mask=None):
    """
    Compute the sample excess kurtosis

    The statistic computed here is the adjusted Fisher-Pearson standardized
    moment coefficient G2, computed directly from the second and fourth
    central moment.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float64
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 1, 3, 2])
    >>> nanops.nankurt(s)
    -1.2892561983471076
    """
    values = lib.values_from_object(values)
    mask = _maybe_get_mask(values, skipna, mask)
    if not is_float_dtype(values.dtype):
        values = values.astype("f8")
        count = _get_counts(values.shape, mask, axis)
    else:
        count = _get_counts(values.shape, mask, axis, dtype=values.dtype)

    if skipna and mask is not None:
        values = values.copy()
        np.putmask(values, mask, 0)

    mean = values.sum(axis, dtype=np.float64) / count
    if axis is not None:
        mean = np.expand_dims(mean, axis)

    adjusted = values - mean
    if skipna and mask is not None:
        np.putmask(adjusted, mask, 0)
    adjusted2 = adjusted ** 2
    adjusted4 = adjusted2 ** 2
    m2 = adjusted2.sum(axis, dtype=np.float64)
    m4 = adjusted4.sum(axis, dtype=np.float64)

    with np.errstate(invalid="ignore", divide="ignore"):
        adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
        numer = count * (count + 1) * (count - 1) * m4
        denom = (count - 2) * (count - 3) * m2 ** 2

    # floating point error
    #
    # GH#18044: calc_kurt in _libs/windows.pyx follows this behavior
    # to fix the fperr, treating denom < 1e-14 as zero
    numer = _zero_out_fperr(numer)
    denom = _zero_out_fperr(denom)

    if not isinstance(denom, np.ndarray):
        # if ``denom`` is a scalar, check these corner cases first before
        # doing division
        if count < 4:
            return np.nan
        if denom == 0:
            return 0

    with np.errstate(invalid="ignore", divide="ignore"):
        result = numer / denom - adj

    dtype = values.dtype
    if is_float_dtype(dtype):
        result = result.astype(dtype)

    if isinstance(result, np.ndarray):
        result = np.where(denom == 0, 0, result)
        result[count < 4] = np.nan

    return result


@disallow("M8", "m8")
def nanprod(values, axis=None, skipna=True, min_count=0, mask=None):
    """
    Compute the product of the elements along an axis ignoring NaNs.

    Parameters
    ----------
    values : ndarray[dtype]
    axis : int, optional
    skipna : bool, default True
    min_count : int, default 0
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : dtype
        The product of all elements on a given axis (NaNs are treated as 1).

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, 3, np.nan])
    >>> nanops.nanprod(s)
    6.0
    """
    mask = _maybe_get_mask(values, skipna, mask)

    if skipna and mask is not None:
        values = values.copy()
        values[mask] = 1
    result = values.prod(axis)
    return _maybe_null_out(result, axis, mask, values.shape, min_count=min_count)


def _maybe_arg_null_out(
    result: np.ndarray, axis: Optional[int], mask: Optional[np.ndarray], skipna: bool
) -> Union[np.ndarray, int]:
    # helper function for nanargmin/nanargmax
    if mask is None:
        return result

    if axis is None or not getattr(result, "ndim", False):
        if skipna:
            if mask.all():
                result = -1
        else:
            if mask.any():
                result = -1
    else:
        if skipna:
            na_mask = mask.all(axis)
        else:
            na_mask = mask.any(axis)
        if na_mask.any():
            result[na_mask] = -1
    return result


def _get_counts(
    values_shape: Tuple[int],
    mask: Optional[np.ndarray],
    axis: Optional[int],
    dtype=float,
) -> Union[int, np.ndarray]:
    """ Get the count of non-null values along an axis

    Parameters
    ----------
    values_shape : Tuple[int]
        shape tuple from values ndarray, used if mask is None
    mask : Optional[ndarray[bool]]
        locations in values that should be considered missing
    axis : Optional[int]
        axis to count along
    dtype : type, optional
        type to use for count

    Returns
    -------
    count : scalar or array
    """
    dtype = _get_dtype(dtype)
    if axis is None:
        if mask is not None:
            n = mask.size - mask.sum()
        else:
            n = np.prod(values_shape)
        return dtype.type(n)

    if mask is not None:
        count = mask.shape[axis] - mask.sum(axis)
    else:
        count = values_shape[axis]

    if is_scalar(count):
        return dtype.type(count)
    try:
        return count.astype(dtype)
    except AttributeError:
        return np.array(count, dtype=dtype)


def _maybe_null_out(
    result: np.ndarray,
    axis: Optional[int],
    mask: Optional[np.ndarray],
    shape: Tuple,
    min_count: int = 1,
) -> np.ndarray:
    if mask is not None and axis is not None and getattr(result, "ndim", False):
        null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0
        if np.any(null_mask):
            if is_numeric_dtype(result):
                if np.iscomplexobj(result):
                    result = result.astype("c16")
                else:
                    result = result.astype("f8")
                result[null_mask] = np.nan
            else:
                # GH12941, use None to auto cast null
                result[null_mask] = None
    elif result is not NaT:
        if mask is not None:
            null_mask = mask.size - mask.sum()
        else:
            null_mask = np.prod(shape)
        if null_mask < min_count:
            result = np.nan

    return result


def _zero_out_fperr(arg):
    # GH#18044: rolling skew/kurt reference this behavior to fix their fperr issue
    if isinstance(arg, np.ndarray):
        with np.errstate(invalid="ignore"):
            return np.where(np.abs(arg) < 1e-14, 0, arg)
    else:
        return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg
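
# Illustrative sketch (not in the original source): values whose magnitude is
# below 1e-14 are treated as exact zeros, e.g.
#
#   >>> _zero_out_fperr(np.array([1e-20, 1.0]))
#   array([0., 1.])
#
# which keeps the m2/m3/m4 moments in nanskew/nankurt from amplifying
# accumulated rounding error.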


@disallow("M8", "m8")
def nancorr(a, b, method="pearson", min_periods=None):
    """
    a, b: ndarrays
    """
    if len(a) != len(b):
        raise AssertionError("Operands to nancorr must have same size")

    if min_periods is None:
        min_periods = 1

    valid = notna(a) & notna(b)
    if not valid.all():
        a = a[valid]
        b = b[valid]

    if len(a) < min_periods:
        return np.nan

    f = get_corr_func(method)
    return f(a, b)


def get_corr_func(method):
    if method in ["kendall", "spearman"]:
        from scipy.stats import kendalltau, spearmanr
    elif method in ["pearson"]:
        pass
    elif callable(method):
        return method
    else:
        raise ValueError(
            f"Unknown method '{method}', expected one of "
            "'pearson', 'kendall', 'spearman' or a callable"
        )

    def _pearson(a, b):
        return np.corrcoef(a, b)[0, 1]

    def _kendall(a, b):
        # kendalltau returns a tuple of the tau statistic and pvalue
        rs = kendalltau(a, b)
        return rs[0]

    def _spearman(a, b):
        return spearmanr(a, b)[0]

    _cor_methods = {"pearson": _pearson, "kendall": _kendall, "spearman": _spearman}
    return _cor_methods[method]


@disallow("M8", "m8")
def nancov(a, b, min_periods=None):
    if len(a) != len(b):
        raise AssertionError("Operands to nancov must have same size")

    if min_periods is None:
        min_periods = 1

    valid = notna(a) & notna(b)
    if not valid.all():
        a = a[valid]
        b = b[valid]

    if len(a) < min_periods:
        return np.nan

    return np.cov(a, b)[0, 1]


def _ensure_numeric(x):
    if isinstance(x, np.ndarray):
        if is_integer_dtype(x) or is_bool_dtype(x):
            x = x.astype(np.float64)
        elif is_object_dtype(x):
            try:
                x = x.astype(np.complex128)
            except (TypeError, ValueError):
                x = x.astype(np.float64)
            else:
                if not np.any(np.imag(x)):
                    x = x.real
    elif not (is_float(x) or is_integer(x) or is_complex(x)):
        try:
            x = float(x)
        except ValueError:
            # e.g. "1+1j" or "foo"
            try:
                x = complex(x)
            except ValueError:
                # e.g. "foo"
                raise TypeError(f"Could not convert {x} to numeric")
    return x


# NA-friendly array comparisons


def make_nancomp(op):
    def f(x, y):
        xmask = isna(x)
        ymask = isna(y)
        mask = xmask | ymask

        with np.errstate(all="ignore"):
            result = op(x, y)

        if mask.any():
            if is_bool_dtype(result):
                result = result.astype("O")
            np.putmask(result, mask, np.nan)

        return result

    return f


nangt = make_nancomp(operator.gt)
nange = make_nancomp(operator.ge)
nanlt = make_nancomp(operator.lt)
nanle = make_nancomp(operator.le)
naneq = make_nancomp(operator.eq)
nanne = make_nancomp(operator.ne)
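
# Illustrative sketch (not in the original source): positions where either operand
# is missing compare as NaN rather than False, e.g.
#
#   >>> nangt(np.array([1.0, np.nan, 3.0]), np.array([0.0, 0.0, np.nan]))
#   array([True, nan, nan], dtype=object)
#
# the boolean result is upcast to object dtype so that NaN can be stored in it.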


def _nanpercentile_1d(values, mask, q, na_value, interpolation):
    """
    Wrapper for np.percentile that skips missing values, specialized to
    1-dimensional case.

    Parameters
    ----------
    values : array over which to find quantiles
    mask : ndarray[bool]
        locations in values that should be considered missing
    q : scalar or array of quantile indices to find
    na_value : scalar
        value to return for empty or all-null values
    interpolation : str

    Returns
    -------
    quantiles : scalar or array
    """
    # mask is Union[ExtensionArray, ndarray]
    values = values[~mask]

    if len(values) == 0:
        if lib.is_scalar(q):
            return na_value
        else:
            return np.array([na_value] * len(q), dtype=values.dtype)

    return np.percentile(values, q, interpolation=interpolation)


def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation):
    """
    Wrapper for np.percentile that skips missing values.

    Parameters
    ----------
    values : array over which to find quantiles
    q : scalar or array of quantile indices to find
    axis : {0, 1}
    na_value : scalar
        value to return for empty or all-null values
    mask : ndarray[bool]
        locations in values that should be considered missing
    ndim : {1, 2}
    interpolation : str

    Returns
    -------
    quantiles : scalar or array
    """
    if values.dtype.kind in ["m", "M"]:
        # need to cast to integer to avoid rounding errors in numpy
        result = nanpercentile(
            values.view("i8"), q, axis, na_value.view("i8"), mask, ndim, interpolation
        )

        # Note: we have to do `astype` and not view because in general we
        # have float result at this point, not i8
        return result.astype(values.dtype)

    if not lib.is_scalar(mask) and mask.any():
        if ndim == 1:
            return _nanpercentile_1d(
                values, mask, q, na_value, interpolation=interpolation
            )
        else:
            # for nonconsolidatable blocks mask is 1D, but values 2D
            if mask.ndim < values.ndim:
                mask = mask.reshape(values.shape)
            if axis == 0:
                values = values.T
                mask = mask.T
            result = [
                _nanpercentile_1d(val, m, q, na_value, interpolation=interpolation)
                for (val, m) in zip(list(values), list(mask))
            ]
            result = np.array(result, dtype=values.dtype, copy=False).T
            return result
    else:
        return np.percentile(values, q, axis=axis, interpolation=interpolation)