Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/core/missing.py : 10%

1"""
2Routines for filling missing data.
3"""
5import numpy as np
7from pandas._libs import algos, lib
8from pandas.compat._optional import import_optional_dependency
10from pandas.core.dtypes.cast import infer_dtype_from_array
11from pandas.core.dtypes.common import (
12 ensure_float64,
13 is_datetime64_dtype,
14 is_datetime64tz_dtype,
15 is_integer_dtype,
16 is_numeric_v_string_like,
17 is_scalar,
18 is_timedelta64_dtype,
19 needs_i8_conversion,
20)
21from pandas.core.dtypes.missing import isna
24def mask_missing(arr, values_to_mask):
25 """
26 Return a masking array of same size/shape as arr
27 with entries equaling any member of values_to_mask set to True
28 """
29 dtype, values_to_mask = infer_dtype_from_array(values_to_mask)
31 try:
32 values_to_mask = np.array(values_to_mask, dtype=dtype)
34 except Exception:
35 values_to_mask = np.array(values_to_mask, dtype=object)
37 na_mask = isna(values_to_mask)
38 nonna = values_to_mask[~na_mask]
40 mask = None
41 for x in nonna:
42 if mask is None:
43 if is_numeric_v_string_like(arr, x):
44 # GH#29553 prevent numpy deprecation warnings
45 mask = False
46 else:
47 mask = arr == x
49 # if x is a string and arr is not, then we get False and we must
50 # expand the mask to size arr.shape
51 if is_scalar(mask):
52 mask = np.zeros(arr.shape, dtype=bool)
53 else:
54 if is_numeric_v_string_like(arr, x):
55 # GH#29553 prevent numpy deprecation warnings
56 mask |= False
57 else:
58 mask |= arr == x
60 if na_mask.any():
61 if mask is None:
62 mask = isna(arr)
63 else:
64 mask |= isna(arr)
66 # GH 21977
67 if mask is None:
68 mask = np.zeros(arr.shape, dtype=bool)
70 return mask
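# A minimal usage sketch for mask_missing (hypothetical example, not part of
# pandas; the sample values are assumptions chosen for illustration).
def _example_mask_missing():  # hypothetical helper
    arr = np.array([1.0, 2.0, np.nan, 4.0])
    # Mask entries equal to 2.0 or missing; expected: [False, True, True, False]
    return mask_missing(arr, [2.0, np.nan])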
def clean_fill_method(method, allow_nearest=False):
    # asfreq is compat for resampling
    if method in [None, "asfreq"]:
        return None

    if isinstance(method, str):
        method = method.lower()
        if method == "ffill":
            method = "pad"
        elif method == "bfill":
            method = "backfill"

    valid_methods = ["pad", "backfill"]
    expecting = "pad (ffill) or backfill (bfill)"
    if allow_nearest:
        valid_methods.append("nearest")
        expecting = "pad (ffill), backfill (bfill) or nearest"
    if method not in valid_methods:
        raise ValueError(f"Invalid fill method. Expecting {expecting}. Got {method}")
    return method
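# A minimal sketch of how clean_fill_method normalizes aliases (hypothetical
# example, not part of pandas; inputs are assumptions for illustration).
def _example_clean_fill_method():  # hypothetical helper
    assert clean_fill_method("ffill") == "pad"
    assert clean_fill_method("BFILL") == "backfill"
    assert clean_fill_method("nearest", allow_nearest=True) == "nearest"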
def clean_interp_method(method, **kwargs):
    order = kwargs.get("order")
    valid = [
        "linear",
        "time",
        "index",
        "values",
        "nearest",
        "zero",
        "slinear",
        "quadratic",
        "cubic",
        "barycentric",
        "polynomial",
        "krogh",
        "piecewise_polynomial",
        "pchip",
        "akima",
        "spline",
        "from_derivatives",
    ]
    if method in ("spline", "polynomial") and order is None:
        raise ValueError("You must specify the order of the spline or polynomial.")
    if method not in valid:
        raise ValueError(f"method must be one of {valid}. Got '{method}' instead.")

    return method
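# A minimal sketch of clean_interp_method validation (hypothetical example, not
# part of pandas): valid names pass through, "spline"/"polynomial" need an order.
def _example_clean_interp_method():  # hypothetical helper
    assert clean_interp_method("linear") == "linear"
    try:
        clean_interp_method("spline")  # no order given -> expected to raise
    except ValueError:
        pass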
def find_valid_index(values, how: str):
    """
    Retrieves the index of the first (or last) valid value.

    Parameters
    ----------
    values : ndarray or ExtensionArray
    how : {'first', 'last'}
        Use this parameter to change between the first or last valid index.

    Returns
    -------
    int or None
    """
    assert how in ["first", "last"]

    if len(values) == 0:  # early stop
        return None

    is_valid = ~isna(values)

    if values.ndim == 2:
        is_valid = is_valid.any(1)  # reduce axis 1

    if how == "first":
        idxpos = is_valid[::].argmax()

    if how == "last":
        idxpos = len(values) - 1 - is_valid[::-1].argmax()

    chk_notna = is_valid[idxpos]

    if not chk_notna:
        return None
    return idxpos
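# A minimal sketch of find_valid_index on an array with leading/trailing NaNs
# (hypothetical example, not part of pandas; values are assumptions).
def _example_find_valid_index():  # hypothetical helper
    values = np.array([np.nan, np.nan, 1.0, 2.0, np.nan])
    first = find_valid_index(values, "first")  # expected: 2
    last = find_valid_index(values, "last")  # expected: 3
    return first, last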
def interpolate_1d(
    xvalues,
    yvalues,
    method="linear",
    limit=None,
    limit_direction="forward",
    limit_area=None,
    fill_value=None,
    bounds_error=False,
    order=None,
    **kwargs,
):
    """
    Logic for the 1-d interpolation. The result should be 1-d; the inputs
    xvalues and yvalues will each be 1-d arrays of the same length.

    bounds_error is currently hardcoded to False since the non-scipy
    interpolators don't take it as an argument.
    """
    # Treat the original, non-scipy methods first.

    invalid = isna(yvalues)
    valid = ~invalid

    if not valid.any():
        # have to call np.asarray(xvalues) since xvalues could be an Index
        # which can't be mutated
        result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
        result.fill(np.nan)
        return result

    if valid.all():
        return yvalues

    if method == "time":
        if not getattr(xvalues, "is_all_dates", None):
            # if not issubclass(xvalues.dtype.type, np.datetime64):
            raise ValueError(
                "time-weighted interpolation only works "
                "on Series or DataFrames with a "
                "DatetimeIndex"
            )
        method = "values"

    valid_limit_directions = ["forward", "backward", "both"]
    limit_direction = limit_direction.lower()
    if limit_direction not in valid_limit_directions:
        raise ValueError(
            "Invalid limit_direction: expecting one of "
            f"{valid_limit_directions}, got '{limit_direction}'."
        )

    if limit_area is not None:
        valid_limit_areas = ["inside", "outside"]
        limit_area = limit_area.lower()
        if limit_area not in valid_limit_areas:
            raise ValueError(
                f"Invalid limit_area: expecting one of {valid_limit_areas}, got "
                f"{limit_area}."
            )

    # default limit is unlimited GH #16282
    limit = algos._validate_limit(nobs=None, limit=limit)

    # These are sets of index pointers to invalid values, i.e. {0, 1, ...}
    all_nans = set(np.flatnonzero(invalid))
    start_nans = set(range(find_valid_index(yvalues, "first")))
    end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid)))
    mid_nans = all_nans - start_nans - end_nans

    # Like the sets above, preserve_nans contains indices of invalid values,
    # but in this case, it is the final set of indices that need to be
    # preserved as NaN after the interpolation.

    # For example if limit_direction='forward' then preserve_nans will
    # contain indices of NaNs at the beginning of the series, and NaNs that
    # are more than 'limit' away from the prior non-NaN.

    # set preserve_nans based on direction using _interp_limit
    if limit_direction == "forward":
        preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
    elif limit_direction == "backward":
        preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
    else:
        # both directions... just use _interp_limit
        preserve_nans = set(_interp_limit(invalid, limit, limit))

    # if limit_area is set, add either mid or outside indices
    # to preserve_nans GH #16284
    if limit_area == "inside":
        # preserve NaNs on the outside
        preserve_nans |= start_nans | end_nans
    elif limit_area == "outside":
        # preserve NaNs on the inside
        preserve_nans |= mid_nans

    # sort preserve_nans and convert to list
    preserve_nans = sorted(preserve_nans)

    xvalues = getattr(xvalues, "values", xvalues)
    yvalues = getattr(yvalues, "values", yvalues)
    result = yvalues.copy()

    if method in ["linear", "time", "index", "values"]:
        if method in ("values", "index"):
            inds = np.asarray(xvalues)
            # hack for DatetimeIndex, #1646
            if needs_i8_conversion(inds.dtype.type):
                inds = inds.view(np.int64)
            if inds.dtype == np.object_:
                inds = lib.maybe_convert_objects(inds)
        else:
            inds = xvalues
        # np.interp requires sorted X values, #21037
        indexer = np.argsort(inds[valid])
        result[invalid] = np.interp(
            inds[invalid], inds[valid][indexer], yvalues[valid][indexer]
        )
        result[preserve_nans] = np.nan
        return result

    sp_methods = [
        "nearest",
        "zero",
        "slinear",
        "quadratic",
        "cubic",
        "barycentric",
        "krogh",
        "spline",
        "polynomial",
        "from_derivatives",
        "piecewise_polynomial",
        "pchip",
        "akima",
    ]

    if method in sp_methods:
        inds = np.asarray(xvalues)
        # hack for DatetimeIndex, #1646
        if issubclass(inds.dtype.type, np.datetime64):
            inds = inds.view(np.int64)
        result[invalid] = _interpolate_scipy_wrapper(
            inds[valid],
            yvalues[valid],
            inds[invalid],
            method=method,
            fill_value=fill_value,
            bounds_error=bounds_error,
            order=order,
            **kwargs,
        )
        result[preserve_nans] = np.nan
        return result
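# A minimal sketch of interpolate_1d filling interior NaNs linearly (hypothetical
# example, not part of pandas; real callers pass Series/Index internals).
def _example_interpolate_1d():  # hypothetical helper
    xvalues = np.arange(5, dtype=np.float64)
    yvalues = np.array([0.0, np.nan, 2.0, np.nan, 4.0])
    # expected: [0.0, 1.0, 2.0, 3.0, 4.0]
    return interpolate_1d(xvalues, yvalues, method="linear")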
def _interpolate_scipy_wrapper(
    x, y, new_x, method, fill_value=None, bounds_error=False, order=None, **kwargs
):
    """
    Passed off to scipy.interpolate.interp1d. method is scipy's kind.
    Returns an array interpolated at new_x. Add any new methods to
    the list in clean_interp_method.
    """
    extra = f"{method} interpolation requires SciPy."
    import_optional_dependency("scipy", extra=extra)
    from scipy import interpolate

    new_x = np.asarray(new_x)

    # ignores some kwargs that could be passed along.
    alt_methods = {
        "barycentric": interpolate.barycentric_interpolate,
        "krogh": interpolate.krogh_interpolate,
        "from_derivatives": _from_derivatives,
        "piecewise_polynomial": _from_derivatives,
    }

    if getattr(x, "is_all_dates", False):
        # GH 5975, scipy.interp1d can't handle datetime64s
        x, new_x = x._values.astype("i8"), new_x.astype("i8")

    if method == "pchip":
        try:
            alt_methods["pchip"] = interpolate.pchip_interpolate
        except AttributeError:
            raise ImportError(
                "Your version of Scipy does not support PCHIP interpolation."
            )
    elif method == "akima":
        alt_methods["akima"] = _akima_interpolate

    interp1d_methods = [
        "nearest",
        "zero",
        "slinear",
        "quadratic",
        "cubic",
        "polynomial",
    ]
    if method in interp1d_methods:
        if method == "polynomial":
            method = order
        terp = interpolate.interp1d(
            x, y, kind=method, fill_value=fill_value, bounds_error=bounds_error
        )
        new_y = terp(new_x)
    elif method == "spline":
        # GH #10633, #24014
        if isna(order) or (order <= 0):
            raise ValueError(
                f"order needs to be specified and greater than 0; got order: {order}"
            )
        terp = interpolate.UnivariateSpline(x, y, k=order, **kwargs)
        new_y = terp(new_x)
    else:
        # GH 7295: need to be able to write for some reason
        # in some circumstances: check all three
        if not x.flags.writeable:
            x = x.copy()
        if not y.flags.writeable:
            y = y.copy()
        if not new_x.flags.writeable:
            new_x = new_x.copy()
        method = alt_methods[method]
        new_y = method(x, y, new_x, **kwargs)
    return new_y
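# A minimal sketch of the scipy wrapper with an interp1d-based method
# (hypothetical example, not part of pandas; requires SciPy).
def _example_interpolate_scipy_wrapper():  # hypothetical helper
    x = np.array([0.0, 1.0, 2.0, 3.0])
    y = x ** 2
    # Quadratic spline through points of x**2; expected roughly [2.25]
    return _interpolate_scipy_wrapper(x, y, np.array([1.5]), method="quadratic")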
def _from_derivatives(xi, yi, x, order=None, der=0, extrapolate=False):
    """
    Convenience function for interpolate.BPoly.from_derivatives.

    Construct a piecewise polynomial in the Bernstein basis, compatible
    with the specified values and derivatives at breakpoints.

    Parameters
    ----------
    xi : array_like
        sorted 1D array of x-coordinates
    yi : array_like or list of array-likes
        yi[i][j] is the j-th derivative known at xi[i]
    order : None or int or array_like of ints. Default: None.
        Specifies the degree of local polynomials. If not None, some
        derivatives are ignored.
    der : int or list
        How many derivatives to extract; None for all potentially nonzero
        derivatives (that is a number equal to the number of points), or a
        list of derivatives to extract. This number includes the function
        value as the 0th derivative.
    extrapolate : bool, optional
        Whether to extrapolate to out-of-bounds points based on first and last
        intervals, or to return NaNs. Default: False.

    See Also
    --------
    scipy.interpolate.BPoly.from_derivatives

    Returns
    -------
    y : scalar or array_like
        The result, of length R or length M or M by R.
    """
    from scipy import interpolate

    # return the method for compat with scipy version & backwards compat
    method = interpolate.BPoly.from_derivatives
    m = method(xi, yi.reshape(-1, 1), orders=order, extrapolate=extrapolate)

    return m(x)
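# A minimal sketch of _from_derivatives when only function values are known
# (hypothetical example, not part of pandas; requires SciPy).
def _example_from_derivatives():  # hypothetical helper
    xi = np.array([0.0, 1.0, 2.0])
    yi = np.array([0.0, 1.0, 4.0])
    # With one value per breakpoint the result is piecewise linear;
    # expected roughly [0.5] at x=0.5
    return _from_derivatives(xi, yi, np.array([0.5]))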
def _akima_interpolate(xi, yi, x, der=0, axis=0):
    """
    Convenience function for akima interpolation.
    xi and yi are arrays of values used to approximate some function f,
    with ``yi = f(xi)``.

    See `Akima1DInterpolator` for details.

    Parameters
    ----------
    xi : array_like
        A sorted list of x-coordinates, of length N.
    yi : array_like
        A 1-D array of real values. `yi`'s length along the interpolation
        axis must be equal to the length of `xi`. If N-D array, use axis
        parameter to select correct axis.
    x : scalar or array_like
        Of length M.
    der : int or list, optional
        How many derivatives to extract; None for all potentially
        nonzero derivatives (that is a number equal to the number
        of points), or a list of derivatives to extract. This number
        includes the function value as the 0th derivative.
    axis : int, optional
        Axis in the yi array corresponding to the x-coordinate values.

    See Also
    --------
    scipy.interpolate.Akima1DInterpolator

    Returns
    -------
    y : scalar or array_like
        The result, of length R or length M or M by R.
    """
    from scipy import interpolate

    P = interpolate.Akima1DInterpolator(xi, yi, axis=axis)

    if der == 0:
        return P(x)
    elif interpolate._isscalar(der):
        return P(x, der=der)
    else:
        return [P(x, nu) for nu in der]
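# A minimal sketch of _akima_interpolate on smooth sample data (hypothetical
# example, not part of pandas; requires SciPy).
def _example_akima_interpolate():  # hypothetical helper
    xi = np.arange(5, dtype=np.float64)
    yi = xi ** 2
    # Evaluate the Akima interpolant between samples; expected roughly [6.25]
    return _akima_interpolate(xi, yi, np.array([2.5]))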
def interpolate_2d(
    values, method="pad", axis=0, limit=None, fill_value=None, dtype=None
):
    """
    Perform an actual interpolation of values; values will be made 2-d if
    needed. Fills inplace and returns the result.
    """
    orig_values = values

    transf = (lambda x: x) if axis == 0 else (lambda x: x.T)

    # reshape a 1 dim if needed
    ndim = values.ndim
    if values.ndim == 1:
        if axis != 0:  # pragma: no cover
            raise AssertionError("cannot interpolate on a ndim == 1 with axis != 0")
        values = values.reshape(tuple((1,) + values.shape))

    if fill_value is None:
        mask = None
    else:  # todo create faster fill func without masking
        mask = mask_missing(transf(values), fill_value)

    method = clean_fill_method(method)
    if method == "pad":
        values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype))
    else:
        values = transf(
            backfill_2d(transf(values), limit=limit, mask=mask, dtype=dtype)
        )

    # reshape back
    if ndim == 1:
        values = values[0]

    if orig_values.dtype.kind == "M":
        # convert float back to datetime64
        values = values.astype(orig_values.dtype)

    return values
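# A minimal sketch of interpolate_2d forward-filling a 1-d array (hypothetical
# example, not part of pandas; note that the fill happens in place).
def _example_interpolate_2d():  # hypothetical helper
    values = np.array([1.0, np.nan, np.nan, 4.0, np.nan])
    # method="pad" forward-fills; expected: [1.0, 1.0, 1.0, 4.0, 4.0]
    return interpolate_2d(values, method="pad")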
def _cast_values_for_fillna(values, dtype):
    """
    Cast values to a dtype that algos.pad and algos.backfill can handle.
    """
    # TODO: for int-dtypes we make a copy, but for everything else this
    # alters the values in-place. Is this intentional?

    if (
        is_datetime64_dtype(dtype)
        or is_datetime64tz_dtype(dtype)
        or is_timedelta64_dtype(dtype)
    ):
        values = values.view(np.int64)

    elif is_integer_dtype(values):
        # NB: this check needs to come after the datetime64 check above
        values = ensure_float64(values)

    return values


def _fillna_prep(values, mask=None, dtype=None):
    # boilerplate for pad_1d, backfill_1d, pad_2d, backfill_2d
    if dtype is None:
        dtype = values.dtype

    if mask is None:
        # This needs to occur before datetime/timedeltas are cast to int64
        mask = isna(values)

    values = _cast_values_for_fillna(values, dtype)

    mask = mask.view(np.uint8)
    return values, mask


def pad_1d(values, limit=None, mask=None, dtype=None):
    values, mask = _fillna_prep(values, mask, dtype)
    algos.pad_inplace(values, mask, limit=limit)
    return values


def backfill_1d(values, limit=None, mask=None, dtype=None):
    values, mask = _fillna_prep(values, mask, dtype)
    algos.backfill_inplace(values, mask, limit=limit)
    return values


def pad_2d(values, limit=None, mask=None, dtype=None):
    values, mask = _fillna_prep(values, mask, dtype)

    if np.all(values.shape):
        algos.pad_2d_inplace(values, mask, limit=limit)
    else:
        # for test coverage
        pass
    return values


def backfill_2d(values, limit=None, mask=None, dtype=None):
    values, mask = _fillna_prep(values, mask, dtype)

    if np.all(values.shape):
        algos.backfill_2d_inplace(values, mask, limit=limit)
    else:
        # for test coverage
        pass
    return values
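# A minimal sketch of the 1-d fill helpers (hypothetical example, not part of
# pandas; note that both fill the passed array in place and return it).
def _example_pad_backfill_1d():  # hypothetical helper
    forward = pad_1d(np.array([np.nan, 1.0, np.nan, 3.0]))  # expected: [nan, 1, 1, 3]
    backward = backfill_1d(np.array([np.nan, 1.0, np.nan, 3.0]))  # expected: [1, 1, 3, 3]
    return forward, backward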
_fill_methods = {"pad": pad_1d, "backfill": backfill_1d}


def get_fill_func(method):
    method = clean_fill_method(method)
    return _fill_methods[method]
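# A minimal sketch of get_fill_func resolving a method alias to a fill function
# (hypothetical example, not part of pandas).
def _example_get_fill_func():  # hypothetical helper
    fill = get_fill_func("ffill")  # resolves to pad_1d
    return fill(np.array([np.nan, 1.0, np.nan]))  # expected: [nan, 1.0, 1.0]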
def clean_reindex_fill_method(method):
    return clean_fill_method(method, allow_nearest=True)


def _interp_limit(invalid, fw_limit, bw_limit):
    """
    Get indexers of values that won't be filled
    because they exceed the limits.

    Parameters
    ----------
    invalid : boolean ndarray
    fw_limit : int or None
        forward limit to index
    bw_limit : int or None
        backward limit to index

    Returns
    -------
    set of indexers

    Notes
    -----
    This is equivalent to the more readable, but slower

    .. code-block:: python

        def _interp_limit(invalid, fw_limit, bw_limit):
            for x in np.where(invalid)[0]:
                if invalid[max(0, x - fw_limit):x + bw_limit + 1].all():
                    yield x
    """
    # handle forward first; the backward direction is the same except
    # 1. operate on the reversed array
    # 2. subtract the returned indices from N - 1
    N = len(invalid)
    f_idx = set()
    b_idx = set()

    def inner(invalid, limit):
        limit = min(limit, N)
        windowed = _rolling_window(invalid, limit + 1).all(1)
        idx = set(np.where(windowed)[0] + limit) | set(
            np.where((~invalid[: limit + 1]).cumsum() == 0)[0]
        )
        return idx

    if fw_limit is not None:

        if fw_limit == 0:
            f_idx = set(np.where(invalid)[0])
        else:
            f_idx = inner(invalid, fw_limit)

    if bw_limit is not None:

        if bw_limit == 0:
            # then we don't even need to care about backwards
            # just use forwards
            return f_idx
        else:
            b_idx = list(inner(invalid[::-1], bw_limit))
            b_idx = set(N - 1 - np.asarray(b_idx))
            if fw_limit == 0:
                return b_idx

    return f_idx & b_idx
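# A minimal sketch of _interp_limit with a forward limit (hypothetical example,
# not part of pandas; the boolean pattern is an assumption for illustration).
def _example_interp_limit():  # hypothetical helper
    invalid = np.array([False, True, True, True, False])
    # With fw_limit=1 only the first NaN of the run may be filled forward;
    # expected indices that stay NaN: {2, 3}
    return _interp_limit(invalid, 1, 0)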
def _rolling_window(a, window):
    """
    [True, True, False, True, False], 2 ->

    [
        [True, True],
        [True, False],
        [False, True],
        [True, False],
    ]
    """
    # https://stackoverflow.com/a/6811241
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
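# A minimal sketch reproducing the docstring example of _rolling_window
# (hypothetical example, not part of pandas).
def _example_rolling_window():  # hypothetical helper
    a = np.array([True, True, False, True, False])
    # expected shape (4, 2): [[T, T], [T, F], [F, T], [T, F]]
    return _rolling_window(a, 2)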