Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2Routines for filling missing data. 

3""" 

4 

5import numpy as np 

6 

7from pandas._libs import algos, lib 

8from pandas.compat._optional import import_optional_dependency 

9 

10from pandas.core.dtypes.cast import infer_dtype_from_array 

11from pandas.core.dtypes.common import ( 

12 ensure_float64, 

13 is_datetime64_dtype, 

14 is_datetime64tz_dtype, 

15 is_integer_dtype, 

16 is_numeric_v_string_like, 

17 is_scalar, 

18 is_timedelta64_dtype, 

19 needs_i8_conversion, 

20) 

21from pandas.core.dtypes.missing import isna 

22 

23 

def mask_missing(arr, values_to_mask):
    """
    Return a boolean ndarray of the same size/shape as ``arr`` with entries
    equaling any member of ``values_to_mask`` set to True (NA members of
    ``values_to_mask`` match NA positions of ``arr``).
    """
    dtype, values_to_mask = infer_dtype_from_array(values_to_mask)

    try:
        values_to_mask = np.array(values_to_mask, dtype=dtype)
    except Exception:
        # inferred dtype cannot hold the values; fall back to object
        values_to_mask = np.array(values_to_mask, dtype=object)

    na_mask = isna(values_to_mask)
    nonna = values_to_mask[~na_mask]

    mask = None
    for val in nonna:
        if mask is None:
            if is_numeric_v_string_like(arr, val):
                # GH#29553 prevent numpy deprecation warnings
                mask = False
            else:
                mask = arr == val

            # if val is a string and arr is not, then we get False and we must
            # expand the mask to size arr.shape
            if is_scalar(mask):
                mask = np.zeros(arr.shape, dtype=bool)
        else:
            if is_numeric_v_string_like(arr, val):
                # GH#29553 prevent numpy deprecation warnings
                mask |= False
            else:
                mask |= arr == val

    if na_mask.any():
        # NA members of values_to_mask match NA positions of arr
        mask = isna(arr) if mask is None else mask | isna(arr)

    # GH 21977: nothing to mask at all -> all-False mask
    if mask is None:
        mask = np.zeros(arr.shape, dtype=bool)

    return mask

71 

72 

def clean_fill_method(method, allow_nearest=False):
    """
    Normalize a fill-method name: map the 'ffill'/'bfill' aliases onto
    'pad'/'backfill', pass None (and 'asfreq', for resampling compat)
    through as None, and raise ValueError on anything unrecognized.
    """
    # asfreq is compat for resampling
    if method in [None, "asfreq"]:
        return None

    if isinstance(method, str):
        method = method.lower()
        # canonicalize the common aliases
        method = {"ffill": "pad", "bfill": "backfill"}.get(method, method)

    valid_methods = ["pad", "backfill"]
    expecting = "pad (ffill) or backfill (bfill)"
    if allow_nearest:
        valid_methods.append("nearest")
        expecting = "pad (ffill), backfill (bfill) or nearest"
    if method not in valid_methods:
        raise ValueError(f"Invalid fill method. Expecting {expecting}. Got {method}")
    return method

93 

94 

def clean_interp_method(method, **kwargs):
    """
    Validate an interpolation-method name, returning it unchanged.

    Raises ValueError when the method is unknown, or when 'spline' or
    'polynomial' is requested without an ``order`` keyword.
    """
    valid = [
        "linear",
        "time",
        "index",
        "values",
        "nearest",
        "zero",
        "slinear",
        "quadratic",
        "cubic",
        "barycentric",
        "polynomial",
        "krogh",
        "piecewise_polynomial",
        "pchip",
        "akima",
        "spline",
        "from_derivatives",
    ]
    # spline/polynomial are meaningless without a degree
    if method in ("spline", "polynomial") and kwargs.get("order") is None:
        raise ValueError("You must specify the order of the spline or polynomial.")
    if method not in valid:
        raise ValueError(f"method must be one of {valid}. Got '{method}' instead.")

    return method

122 

123 

def find_valid_index(values, how: str):
    """
    Retrieve the position of the first (or last) valid value.

    Parameters
    ----------
    values : ndarray or ExtensionArray
    how : {'first', 'last'}
        Use this parameter to change between the first or last valid index.

    Returns
    -------
    int or None
        Position of the requested valid value, or None when the array is
        empty or contains no valid values.
    """
    assert how in ["first", "last"]

    if len(values) == 0:  # early stop
        return None

    is_valid = ~isna(values)

    if values.ndim == 2:
        # a row counts as valid when any of its entries is valid
        is_valid = is_valid.any(1)  # reduce axis 1

    if how == "first":
        # dropped the redundant no-op slice is_valid[::]
        idxpos = is_valid.argmax()
    else:  # how == "last"; guaranteed by the assert above
        idxpos = len(values) - 1 - is_valid[::-1].argmax()

    if not is_valid[idxpos]:
        # argmax on an all-False mask returns 0: no valid entry exists
        return None
    return idxpos

159 

160 

def interpolate_1d(
    xvalues,
    yvalues,
    method="linear",
    limit=None,
    limit_direction="forward",
    limit_area=None,
    fill_value=None,
    bounds_error=False,
    order=None,
    **kwargs,
):
    """
    Logic for the 1-d interpolation. The result should be 1-d, inputs
    xvalues and yvalues will each be 1-d arrays of the same length.

    Bounds_error is currently hardcoded to False since non-scipy ones don't
    take it as an argument.

    Parameters
    ----------
    xvalues : array-like or Index
        x-coordinates; for method='time' this is presumably a DatetimeIndex
        (only its ``is_all_dates`` attribute is checked here — TODO confirm).
    yvalues : array-like
        y-values containing the NaNs to interpolate.
    method : str, default 'linear'
        Expected to already be validated by ``clean_interp_method``.
    limit : int, optional
        Maximum number of consecutive NaNs to fill.
    limit_direction : {'forward', 'backward', 'both'}, default 'forward'
    limit_area : {'inside', 'outside'}, optional
    fill_value, bounds_error, order, **kwargs
        Passed through to ``_interpolate_scipy_wrapper`` for scipy methods.
    """
    # Treat the original, non-scipy methods first.

    invalid = isna(yvalues)
    valid = ~invalid

    if not valid.any():
        # have to call np.asarray(xvalues) since xvalues could be an Index
        # which can't be mutated
        result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
        result.fill(np.nan)
        return result

    if valid.all():
        # nothing to interpolate
        return yvalues

    if method == "time":
        if not getattr(xvalues, "is_all_dates", None):
            # if not issubclass(xvalues.dtype.type, np.datetime64):
            raise ValueError(
                "time-weighted interpolation only works "
                "on Series or DataFrames with a "
                "DatetimeIndex"
            )
        # time-weighted interpolation is just 'values' interpolation over
        # the datetime x-coordinates
        method = "values"

    valid_limit_directions = ["forward", "backward", "both"]
    limit_direction = limit_direction.lower()
    if limit_direction not in valid_limit_directions:
        raise ValueError(
            "Invalid limit_direction: expecting one of "
            f"{valid_limit_directions}, got '{limit_direction}'."
        )

    if limit_area is not None:
        valid_limit_areas = ["inside", "outside"]
        limit_area = limit_area.lower()
        if limit_area not in valid_limit_areas:
            raise ValueError(
                f"Invalid limit_area: expecting one of {valid_limit_areas}, got "
                f"{limit_area}."
            )

    # default limit is unlimited GH #16282
    limit = algos._validate_limit(nobs=None, limit=limit)

    # These are sets of index pointers to invalid values... i.e. {0, 1, etc...
    all_nans = set(np.flatnonzero(invalid))
    # positions before the first valid value / after the last valid value;
    # find_valid_index cannot return None here because valid.any() held above
    start_nans = set(range(find_valid_index(yvalues, "first")))
    end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid)))
    mid_nans = all_nans - start_nans - end_nans

    # Like the sets above, preserve_nans contains indices of invalid values,
    # but in this case, it is the final set of indices that need to be
    # preserved as NaN after the interpolation.

    # For example if limit_direction='forward' then preserve_nans will
    # contain indices of NaNs at the beginning of the series, and NaNs that
    # are more than 'limit' away from the prior non-NaN.

    # set preserve_nans based on direction using _interp_limit
    if limit_direction == "forward":
        preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
    elif limit_direction == "backward":
        preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
    else:
        # both directions... just use _interp_limit
        preserve_nans = set(_interp_limit(invalid, limit, limit))

    # if limit_area is set, add either mid or outside indices
    # to preserve_nans GH #16284
    if limit_area == "inside":
        # preserve NaNs on the outside
        preserve_nans |= start_nans | end_nans
    elif limit_area == "outside":
        # preserve NaNs on the inside
        preserve_nans |= mid_nans

    # sort preserve_nans and convert to list
    preserve_nans = sorted(preserve_nans)

    xvalues = getattr(xvalues, "values", xvalues)
    yvalues = getattr(yvalues, "values", yvalues)
    result = yvalues.copy()

    if method in ["linear", "time", "index", "values"]:
        if method in ("values", "index"):
            inds = np.asarray(xvalues)
            # hack for DatetimeIndex, #1646
            if needs_i8_conversion(inds.dtype.type):
                inds = inds.view(np.int64)
            if inds.dtype == np.object_:
                inds = lib.maybe_convert_objects(inds)
        else:
            inds = xvalues
        # np.interp requires sorted X values, #21037
        indexer = np.argsort(inds[valid])
        result[invalid] = np.interp(
            inds[invalid], inds[valid][indexer], yvalues[valid][indexer]
        )
        result[preserve_nans] = np.nan
        return result

    sp_methods = [
        "nearest",
        "zero",
        "slinear",
        "quadratic",
        "cubic",
        "barycentric",
        "krogh",
        "spline",
        "polynomial",
        "from_derivatives",
        "piecewise_polynomial",
        "pchip",
        "akima",
    ]

    if method in sp_methods:
        inds = np.asarray(xvalues)
        # hack for DatetimeIndex, #1646
        if issubclass(inds.dtype.type, np.datetime64):
            inds = inds.view(np.int64)
        result[invalid] = _interpolate_scipy_wrapper(
            inds[valid],
            yvalues[valid],
            inds[invalid],
            method=method,
            fill_value=fill_value,
            bounds_error=bounds_error,
            order=order,
            **kwargs,
        )
        result[preserve_nans] = np.nan
        return result

    # NOTE(review): a method outside both lists falls off the end and returns
    # None implicitly; callers are expected to have validated `method` via
    # clean_interp_method first — confirm no call path bypasses that.

315 

316 

def _interpolate_scipy_wrapper(
    x, y, new_x, method, fill_value=None, bounds_error=False, order=None, **kwargs
):
    """
    Passed off to scipy.interpolate.interp1d. method is scipy's kind.
    Returns an array interpolated at new_x. Add any new methods to
    the list in clean_interp_method.

    Dispatch: interp1d-style kinds go through ``interpolate.interp1d``,
    'spline' through ``UnivariateSpline``, and everything else through the
    ``alt_methods`` table below.
    """
    extra = f"{method} interpolation requires SciPy."
    import_optional_dependency("scipy", extra=extra)
    from scipy import interpolate

    new_x = np.asarray(new_x)

    # ignores some kwargs that could be passed along.
    alt_methods = {
        "barycentric": interpolate.barycentric_interpolate,
        "krogh": interpolate.krogh_interpolate,
        "from_derivatives": _from_derivatives,
        "piecewise_polynomial": _from_derivatives,
    }

    if getattr(x, "is_all_dates", False):
        # GH 5975, scipy.interp1d can't handle datetime64s
        x, new_x = x._values.astype("i8"), new_x.astype("i8")

    if method == "pchip":
        try:
            # attribute lookup fails on very old SciPy without PCHIP
            alt_methods["pchip"] = interpolate.pchip_interpolate
        except AttributeError:
            raise ImportError(
                "Your version of Scipy does not support PCHIP interpolation."
            )
    elif method == "akima":
        alt_methods["akima"] = _akima_interpolate

    interp1d_methods = [
        "nearest",
        "zero",
        "slinear",
        "quadratic",
        "cubic",
        "polynomial",
    ]
    if method in interp1d_methods:
        if method == "polynomial":
            # interp1d accepts an integer `kind` meaning polynomial order
            method = order
        terp = interpolate.interp1d(
            x, y, kind=method, fill_value=fill_value, bounds_error=bounds_error
        )
        new_y = terp(new_x)
    elif method == "spline":
        # GH #10633, #24014
        if isna(order) or (order <= 0):
            raise ValueError(
                f"order needs to be specified and greater than 0; got order: {order}"
            )
        terp = interpolate.UnivariateSpline(x, y, k=order, **kwargs)
        new_y = terp(new_x)
    else:
        # GH 7295: need to be able to write for some reason
        # in some circumstances: check all three
        if not x.flags.writeable:
            x = x.copy()
        if not y.flags.writeable:
            y = y.copy()
        if not new_x.flags.writeable:
            new_x = new_x.copy()
        method = alt_methods[method]
        new_y = method(x, y, new_x, **kwargs)
    return new_y

388 

389 

def _from_derivatives(xi, yi, x, order=None, der=0, extrapolate=False):
    """
    Convenience function for interpolate.BPoly.from_derivatives.

    Construct a piecewise polynomial in the Bernstein basis, compatible
    with the specified values and derivatives at breakpoints.

    Parameters
    ----------
    xi : array_like
        sorted 1D array of x-coordinates
    yi : array_like or list of array-likes
        yi[i][j] is the j-th derivative known at xi[i]
    order: None or int or array_like of ints. Default: None.
        Specifies the degree of local polynomials. If not None, some
        derivatives are ignored.
    der : int or list
        How many derivatives to extract; None for all potentially nonzero
        derivatives (that is a number equal to the number of points), or a
        list of derivatives to extract. This number includes the function
        value as 0th derivative.
        NOTE(review): `der` is accepted for API symmetry with the other
        interpolators but is unused in this body — confirm intended.
    extrapolate : bool, optional
        Whether to extrapolate to out-of-bounds points based on first and last
        intervals, or to return NaNs. Default: False.

    See Also
    --------
    scipy.interpolate.BPoly.from_derivatives

    Returns
    -------
    y : scalar or array_like
        The result, of length R or length M or M by R.
    """
    from scipy import interpolate

    # return the method for compat with scipy version & backwards compat
    method = interpolate.BPoly.from_derivatives
    m = method(xi, yi.reshape(-1, 1), orders=order, extrapolate=extrapolate)

    return m(x)

431 

432 

433def _akima_interpolate(xi, yi, x, der=0, axis=0): 

434 """ 

435 Convenience function for akima interpolation. 

436 xi and yi are arrays of values used to approximate some function f, 

437 with ``yi = f(xi)``. 

438 

439 See `Akima1DInterpolator` for details. 

440 

441 Parameters 

442 ---------- 

443 xi : array_like 

444 A sorted list of x-coordinates, of length N. 

445 yi : array_like 

446 A 1-D array of real values. `yi`'s length along the interpolation 

447 axis must be equal to the length of `xi`. If N-D array, use axis 

448 parameter to select correct axis. 

449 x : scalar or array_like 

450 Of length M. 

451 der : int or list, optional 

452 How many derivatives to extract; None for all potentially 

453 nonzero derivatives (that is a number equal to the number 

454 of points), or a list of derivatives to extract. This number 

455 includes the function value as 0th derivative. 

456 axis : int, optional 

457 Axis in the yi array corresponding to the x-coordinate values. 

458 

459 See Also 

460 -------- 

461 scipy.interpolate.Akima1DInterpolator 

462 

463 Returns 

464 ------- 

465 y : scalar or array_like 

466 The result, of length R or length M or M by R, 

467 

468 """ 

469 from scipy import interpolate 

470 

471 P = interpolate.Akima1DInterpolator(xi, yi, axis=axis) 

472 

473 if der == 0: 

474 return P(x) 

475 elif interpolate._isscalar(der): 

476 return P(x, der=der) 

477 else: 

478 return [P(x, nu) for nu in der] 

479 

480 

def interpolate_2d(
    values, method="pad", axis=0, limit=None, fill_value=None, dtype=None
):
    """
    Perform an actual interpolation of values; values will be made 2-d if
    needed, fills inplace, returns the result.

    Parameters
    ----------
    values : np.ndarray
        1-D or 2-D array to fill.
    method : str, default "pad"
        Fill method; normalized via clean_fill_method ('pad' or 'backfill').
    axis : int, default 0
        Axis to fill along; handled by transposing when axis != 0.
    limit : int, optional
        Maximum number of consecutive NaNs to fill.
    mask-related:
    fill_value : scalar or sequence, optional
        Value(s) treated as missing in addition to NaN (via mask_missing).
    dtype : dtype, optional
        dtype passed through to the fill routines.
    """
    orig_values = values

    transf = (lambda x: x) if axis == 0 else (lambda x: x.T)

    # reshape a 1 dim if needed
    ndim = values.ndim
    if ndim == 1:
        if axis != 0:  # pragma: no cover
            raise AssertionError("cannot interpolate on a ndim == 1 with axis != 0")
        # (1,) + shape is already a tuple; no tuple() wrapper needed
        values = values.reshape((1,) + values.shape)

    if fill_value is None:
        mask = None
    else:  # todo create faster fill func without masking
        mask = mask_missing(transf(values), fill_value)

    method = clean_fill_method(method)
    if method == "pad":
        values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype))
    else:
        values = transf(
            backfill_2d(transf(values), limit=limit, mask=mask, dtype=dtype)
        )

    # reshape back
    if ndim == 1:
        values = values[0]

    if orig_values.dtype.kind == "M":
        # convert float back to datetime64
        values = values.astype(orig_values.dtype)

    return values

521 

522 

523def _cast_values_for_fillna(values, dtype): 

524 """ 

525 Cast values to a dtype that algos.pad and algos.backfill can handle. 

526 """ 

527 # TODO: for int-dtypes we make a copy, but for everything else this 

528 # alters the values in-place. Is this intentional? 

529 

530 if ( 

531 is_datetime64_dtype(dtype) 

532 or is_datetime64tz_dtype(dtype) 

533 or is_timedelta64_dtype(dtype) 

534 ): 

535 values = values.view(np.int64) 

536 

537 elif is_integer_dtype(values): 

538 # NB: this check needs to come after the datetime64 check above 

539 values = ensure_float64(values) 

540 

541 return values 

542 

543 

def _fillna_prep(values, mask=None, dtype=None):
    # boilerplate for pad_1d, backfill_1d, pad_2d, backfill_2d:
    # resolve the dtype, build the NA mask, cast values for the fill algos,
    # and expose the mask as uint8 for the cython routines.
    if dtype is None:
        dtype = values.dtype

    if mask is None:
        # This needs to occur before datetime/timedeltas are cast to int64
        mask = isna(values)

    values = _cast_values_for_fillna(values, dtype)

    # the cython fill routines take the mask as uint8
    mask = mask.view(np.uint8)
    return values, mask

557 

558 

def pad_1d(values, limit=None, mask=None, dtype=None):
    """
    Forward-fill ('pad') missing values in a 1-D array, in place.

    Parameters
    ----------
    values : np.ndarray
    limit : int, optional
        Maximum number of consecutive NaNs to fill.
    mask : np.ndarray of bool, optional
        Precomputed missing-value mask; computed via isna when None.
    dtype : dtype, optional
        dtype used when casting values for the fill routine.

    Returns
    -------
    np.ndarray
        The (possibly cast) filled array.
    """
    values, mask = _fillna_prep(values, mask, dtype)
    algos.pad_inplace(values, mask, limit=limit)
    return values

563 

564 

def backfill_1d(values, limit=None, mask=None, dtype=None):
    """
    Backward-fill ('backfill') missing values in a 1-D array, in place.

    Parameters
    ----------
    values : np.ndarray
    limit : int, optional
        Maximum number of consecutive NaNs to fill.
    mask : np.ndarray of bool, optional
        Precomputed missing-value mask; computed via isna when None.
    dtype : dtype, optional
        dtype used when casting values for the fill routine.

    Returns
    -------
    np.ndarray
        The (possibly cast) filled array.
    """
    values, mask = _fillna_prep(values, mask, dtype)
    algos.backfill_inplace(values, mask, limit=limit)
    return values

569 

570 

def pad_2d(values, limit=None, mask=None, dtype=None):
    """
    Forward-fill ('pad') missing values in a 2-D array, in place.

    Empty arrays (any zero-length dimension) are returned untouched.
    """
    values, mask = _fillna_prep(values, mask, dtype)

    if all(values.shape):
        algos.pad_2d_inplace(values, mask, limit=limit)
    else:
        # no rows or no columns: nothing to do (branch kept for test coverage)
        pass
    return values

580 

581 

def backfill_2d(values, limit=None, mask=None, dtype=None):
    """
    Backward-fill ('backfill') missing values in a 2-D array, in place.

    Empty arrays (any zero-length dimension) are returned untouched.
    """
    values, mask = _fillna_prep(values, mask, dtype)

    if all(values.shape):
        algos.backfill_2d_inplace(values, mask, limit=limit)
    else:
        # no rows or no columns: nothing to do (branch kept for test coverage)
        pass
    return values

591 

592 

# Mapping from cleaned fill-method name to its 1-D implementation.
_fill_methods = {"pad": pad_1d, "backfill": backfill_1d}


def get_fill_func(method):
    """Return the 1-D fill function for *method* ('pad'/'ffill' or 'backfill'/'bfill')."""
    return _fill_methods[clean_fill_method(method)]

599 

600 

def clean_reindex_fill_method(method):
    """Normalize a reindex fill method: like clean_fill_method, but 'nearest' is also valid."""
    return clean_fill_method(method, allow_nearest=True)

603 

604 

def _interp_limit(invalid, fw_limit, bw_limit):
    """
    Get indexers of values that won't be filled
    because they exceed the limits.

    Parameters
    ----------
    invalid : boolean ndarray
    fw_limit : int or None
        forward limit to index
    bw_limit : int or None
        backward limit to index

    Returns
    -------
    set of indexers

    Notes
    -----
    This is equivalent to the more readable, but slower

    .. code-block:: python

        def _interp_limit(invalid, fw_limit, bw_limit):
            for x in np.where(invalid)[0]:
                if invalid[max(0, x - fw_limit):x + bw_limit + 1].all():
                    yield x
    """
    # handle forward first; the backward direction is the same except
    # 1. operate on the reversed array
    # 2. subtract the returned indices from N - 1
    N = len(invalid)
    f_idx = set()
    b_idx = set()

    def inner(invalid, limit):
        # Indices more than `limit` positions beyond the last valid value:
        # either the tail of a run of more than `limit` consecutive NaNs
        # (windowed all-True), or inside an all-NaN prefix longer than
        # `limit` (cumsum of valid entries still zero).
        limit = min(limit, N)
        windowed = _rolling_window(invalid, limit + 1).all(1)
        idx = set(np.where(windowed)[0] + limit) | set(
            np.where((~invalid[: limit + 1]).cumsum() == 0)[0]
        )
        return idx

    if fw_limit is not None:

        if fw_limit == 0:
            # limit 0 forward: every invalid position exceeds the limit
            f_idx = set(np.where(invalid)[0])
        else:
            f_idx = inner(invalid, fw_limit)

    if bw_limit is not None:

        if bw_limit == 0:
            # then we don't even need to care about backwards
            # just use forwards
            return f_idx
        else:
            # reverse, reuse the forward logic, then map indices back
            b_idx = list(inner(invalid[::-1], bw_limit))
            b_idx = set(N - 1 - np.asarray(b_idx))
            if fw_limit == 0:
                return b_idx

    # 'both' case: a position is preserved only if it exceeds both limits
    return f_idx & b_idx

668 

669 

670def _rolling_window(a, window): 

671 """ 

672 [True, True, False, True, False], 2 -> 

673 

674 [ 

675 [True, True], 

676 [True, False], 

677 [False, True], 

678 [True, False], 

679 ] 

680 """ 

681 # https://stackoverflow.com/a/6811241 

682 shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 

683 strides = a.strides + (a.strides[-1],) 

684 return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)