Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1"""An interface for extending pandas with custom arrays. 

2 

3.. warning:: 

4 

5 This is an experimental API and subject to breaking changes 

6 without warning. 

7""" 

8import operator 

9from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Union 

10 

11import numpy as np 

12 

13from pandas._libs import lib 

14from pandas._typing import ArrayLike 

15from pandas.compat import set_function_name 

16from pandas.compat.numpy import function as nv 

17from pandas.errors import AbstractMethodError 

18from pandas.util._decorators import Appender, Substitution 

19from pandas.util._validators import validate_fillna_kwargs 

20 

21from pandas.core.dtypes.common import is_array_like, is_list_like 

22from pandas.core.dtypes.dtypes import ExtensionDtype 

23from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries 

24from pandas.core.dtypes.missing import isna 

25 

26from pandas.core import ops 

27from pandas.core.algorithms import _factorize_array, unique 

28from pandas.core.missing import backfill_1d, pad_1d 

29from pandas.core.sorting import nargsort 

30 

# Docstring templates keyed by method name. Entries are registered further
# down in this module and attached to methods with ``Appender``.
_extension_array_shared_docs: Dict[str, str] = {}

32 

33 

def try_cast_to_ea(cls_or_instance, obj, dtype=None):
    """
    Try ``_from_sequence`` on `obj`; hand `obj` back untouched if it fails.

    Parameters
    ----------
    cls_or_instance : ExtensionArray subclass or instance
        Provider of the ``_from_sequence`` constructor.
    obj : arraylike
        Values forwarded to ``_from_sequence``.
    dtype : ExtensionDtype, optional
        Forwarded to ``_from_sequence`` as the ``dtype`` keyword.

    Returns
    -------
    ExtensionArray or obj
        The constructed array, or `obj` itself when construction raised
        any ``Exception``.
    """
    try:
        return cls_or_instance._from_sequence(obj, dtype=dtype)
    except Exception:
        # Downstream EA constructors can raise anything, so every
        # Exception is treated as "could not cast".
        return obj

55 

56 

57class ExtensionArray: 

58 """ 

59 Abstract base class for custom 1-D array types. 

60 

61 pandas will recognize instances of this class as proper arrays 

62 with a custom type and will not attempt to coerce them to objects. They 

63 may be stored directly inside a :class:`DataFrame` or :class:`Series`. 

64 

65 .. versionadded:: 0.23.0 

66 

67 Attributes 

68 ---------- 

69 dtype 

70 nbytes 

71 ndim 

72 shape 

73 

74 Methods 

75 ------- 

76 argsort 

77 astype 

78 copy 

79 dropna 

80 factorize 

81 fillna 

82 isna 

83 ravel 

84 repeat 

85 searchsorted 

86 shift 

87 take 

88 unique 

89 view 

90 _concat_same_type 

91 _formatter 

92 _from_factorized 

93 _from_sequence 

94 _from_sequence_of_strings 

95 _ndarray_values 

96 _reduce 

97 _values_for_argsort 

98 _values_for_factorize 

99 

100 Notes 

101 ----- 

102 The interface includes the following abstract methods that must be 

103 implemented by subclasses: 

104 

105 * _from_sequence 

106 * _from_factorized 

107 * __getitem__ 

108 * __len__ 

109 * dtype 

110 * nbytes 

111 * isna 

112 * take 

113 * copy 

114 * _concat_same_type 

115 

116 A default repr displaying the type, (truncated) data, length, 

117 and dtype is provided. It can be customized or replaced by 

118 overriding: 

119 

120 * __repr__ : A default repr for the ExtensionArray. 

121 * _formatter : Print scalars inside a Series or DataFrame. 

122 

123 Some methods require casting the ExtensionArray to an ndarray of Python 

124 objects with ``self.astype(object)``, which may be expensive. When 

125 performance is a concern, we highly recommend overriding the following 

126 methods: 

127 

128 * fillna 

129 * dropna 

130 * unique 

131 * factorize / _values_for_factorize 

132 * argsort / _values_for_argsort 

133 * searchsorted 

134 

135 The remaining methods implemented on this class should be performant, 

136 as they only compose abstract methods. Still, a more efficient 

137 implementation may be available, and these methods can be overridden. 

138 

139 One can implement methods to handle array reductions. 

140 

141 * _reduce 

142 

143 One can implement methods to handle parsing from strings that will be used 

144 in methods such as ``pandas.io.parsers.read_csv``. 

145 

146 * _from_sequence_of_strings 

147 

148 This class does not inherit from 'abc.ABCMeta' for performance reasons. 

149 Methods and properties required by the interface raise 

150 ``pandas.errors.AbstractMethodError`` and no ``register`` method is 

151 provided for registering virtual subclasses. 

152 

153 ExtensionArrays are limited to 1 dimension. 

154 

155 They may be backed by none, one, or many NumPy arrays. For example, 

156 ``pandas.Categorical`` is an extension array backed by two arrays, 

157 one for codes and one for categories. An array of IPv6 addresses may 

158 be backed by a NumPy structured array with two fields, one for the 

159 lower 64 bits and one for the upper 64 bits. Or they may be backed 

160 by some other storage type, like Python lists. Pandas makes no 

161 assumptions on how the data are stored, just that it can be converted 

162 to a NumPy array. 

163 The ExtensionArray interface does not impose any rules on how this data 

164 is stored. However, currently, the backing data cannot be stored in 

165 attributes called ``.values`` or ``._values`` to ensure full compatibility 

166 with pandas internals. But other names such as ``.data``, ``._data``, 

167 ``._items``, ... can be freely used. 

168 

169 If implementing NumPy's ``__array_ufunc__`` interface, pandas expects 

170 that 

171 

172 1. You defer by returning ``NotImplemented`` when any Series are present 

173 in `inputs`. Pandas will extract the arrays and call the ufunc again. 

174 2. You define a ``_HANDLED_TYPES`` tuple as an attribute on the class. 

175 Pandas inspects this to determine whether the ufunc is valid for the 

176 types present. 

177 

178 See :ref:`extending.extension.ufunc` for more. 

179 """ 

180 

# Marker used by pandas.core.dtypes.generic.ABCExtensionArray.
# Subclasses must not override it.
_typ = "extension"

184 

185 # ------------------------------------------------------------------------ 

186 # Constructors 

187 # ------------------------------------------------------------------------ 

188 

@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
    """
    Build an ExtensionArray of this type from a sequence of scalars.

    Abstract: subclasses must supply the implementation.

    Parameters
    ----------
    scalars : Sequence
        Items to store. Each one is an instance of the array's scalar
        type, ``cls.dtype.type``.
    dtype : dtype, optional
        Dtype to construct the array for. It should be a Dtype that is
        compatible with the ExtensionArray.
    copy : bool, default False
        Whether the underlying data should be copied.

    Returns
    -------
    ExtensionArray

    Raises
    ------
    AbstractMethodError
        Always, in this base implementation.
    """
    raise AbstractMethodError(cls)

210 

@classmethod
def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
    """
    Build an ExtensionArray of this type from a sequence of strings.

    Abstract: subclasses that support construction from strings must
    supply the implementation.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    strings : Sequence
        The strings to build the array from.
    dtype : dtype, optional
        Dtype to construct the array for. It should be a Dtype that is
        compatible with the ExtensionArray.
    copy : bool, default False
        Whether the underlying data should be copied.

    Returns
    -------
    ExtensionArray

    Raises
    ------
    AbstractMethodError
        Always, in this base implementation.
    """
    raise AbstractMethodError(cls)

233 

@classmethod
def _from_factorized(cls, values, original):
    """
    Rebuild an ExtensionArray once factorization has run.

    Abstract: subclasses must supply the implementation.

    Parameters
    ----------
    values : ndarray
        An integer ndarray with the factorized values.
    original : ExtensionArray
        The ExtensionArray on which factorize was called.

    Raises
    ------
    AbstractMethodError
        Always, in this base implementation.

    See Also
    --------
    factorize
    ExtensionArray.factorize
    """
    raise AbstractMethodError(cls)

252 

253 # ------------------------------------------------------------------------ 

254 # Must be a Sequence 

255 # ------------------------------------------------------------------------ 

256 

def __getitem__(self, item):
    """
    Return the part of the array selected by `item`.

    Abstract: subclasses must supply the implementation. Both `item` and
    the return value are loosely typed (``Any``).

    Parameters
    ----------
    item : int, slice, or ndarray
        * int: the position in 'self' to fetch.

        * slice: a slice object whose 'start', 'stop', and 'step' are
          integers or None.

        * ndarray: a 1-d boolean NumPy ndarray with the same length as
          'self'.

    Returns
    -------
    item : scalar or ExtensionArray

    Raises
    ------
    AbstractMethodError
        Always, in this base implementation.

    Notes
    -----
    A scalar ``item`` yields a scalar value suitable for the array's
    type, which should be an instance of ``self.dtype.type``.

    A slice ``item`` yields an instance of ``ExtensionArray``, even when
    the slice has length 0 or 1.

    A boolean mask yields an instance of ``ExtensionArray`` filtered to
    the positions where ``item`` is True.
    """
    raise AbstractMethodError(self)

288 

def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None:
    """
    Assign one or more values in place.

    Implementing this method is optional; the pandas extension array
    interface does not require it.

    Parameters
    ----------
    key : int, ndarray, or slice
        When the call originates from e.g. ``Series.__setitem__``,
        ``key`` is one of

        * scalar int
        * ndarray of integers.
        * boolean ndarray
        * slice object

    value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
        The value or values to store at ``key``.

    Returns
    -------
    None

    Raises
    ------
    NotImplementedError
        Always, in this base implementation.
    """
    # Guidance for ExtensionArray authors who do decide to implement
    # __setitem__ (it stays optional for the interface):
    #
    # * Setting multiple values: several values can be set at once;
    #   'key' is then a sequence of integers and 'value' a sequence of
    #   the same length.
    #
    # * Broadcasting: with a sequence 'key' and a scalar 'value', every
    #   position in 'key' is set to 'value'.
    #
    # * Coercion: most users expect basic coercion to work, e.g. a string
    #   like '2018-01-01' becomes a datetime when set on a datetime64ns
    #   array. As a rule, whatever __init__ coerces, __setitem__ should
    #   coerce too.
    #
    # Also note that Series/DataFrame.where uses __setitem__ internally,
    # on a copy of the data.
    message = f"{type(self)} does not implement __setitem__."
    raise NotImplementedError(message)

333 

def __len__(self) -> int:
    """
    Number of elements held by the array.

    Abstract: subclasses must supply the implementation.

    Returns
    -------
    length : int

    Raises
    ------
    AbstractMethodError
        Always, in this base implementation.
    """
    raise AbstractMethodError(self)

343 

def __iter__(self):
    """
    Yield the elements of the array one position at a time.
    """
    # Defining __iter__ is what lets pandas treat extension arrays as
    # list-like. This default goes through ``__getitem__`` once per
    # position, which can be slower than a specialised implementation.
    length = len(self)
    for position in range(length):
        yield self[position]

353 

def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default):
    """
    Convert the array to a NumPy ndarray.

    .. versionadded:: 1.0.0

    Comparable to :meth:`numpy.asarray`, with extra control over how the
    conversion happens.

    Parameters
    ----------
    dtype : str or numpy.dtype, optional
        Passed to :meth:`numpy.asarray` as the dtype.
    copy : bool, default False
        Whether to guarantee that the result is not a view on another
        array. ``copy=False`` does not *ensure* a no-copy conversion;
        ``copy=True`` ensures that a copy is made even when one is not
        strictly necessary.
    na_value : Any, optional
        Value written at the missing positions. The default depends on
        `dtype` and on the type of the array.

    Returns
    -------
    numpy.ndarray
    """
    fill_requested = na_value is not lib.no_default
    converted = np.asarray(self, dtype=dtype)
    if copy or fill_requested:
        # Filling assigns into the result, so it is always done on a copy.
        converted = converted.copy()
    if fill_requested:
        converted[self.isna()] = na_value
    return converted

386 

387 # ------------------------------------------------------------------------ 

388 # Required attributes 

389 # ------------------------------------------------------------------------ 

390 

@property
def dtype(self) -> ExtensionDtype:
    """
    The 'ExtensionDtype' instance describing this array.

    Abstract: subclasses must supply the implementation; this base
    version always raises ``AbstractMethodError``.
    """
    raise AbstractMethodError(self)

397 

@property
def shape(self) -> Tuple[int, ...]:
    """
    Dimensions of the array as a tuple: ``(len(self),)``.
    """
    length = len(self)
    return (length,)

404 

@property
def size(self) -> int:
    """
    The number of elements in the array.

    Computed as the product of the entries of ``self.shape``.

    Returns
    -------
    int
        A built-in ``int``.
    """
    # ``np.prod`` returns a NumPy scalar; convert it so the value matches
    # the declared ``int`` return type.
    return int(np.prod(self.shape))

411 

@property
def ndim(self) -> int:
    """
    Number of dimensions, always 1.

    Extension Arrays are only allowed to be 1-dimensional.
    """
    return 1

418 

@property
def nbytes(self) -> int:
    """
    Number of bytes required to hold this object in memory.

    Abstract: subclasses must supply the implementation; this base
    version always raises ``AbstractMethodError``.
    """
    # Implementations for which the exact figure is expensive to compute
    # may return an approximate lower bound on the bytes needed.
    raise AbstractMethodError(self)

427 

428 # ------------------------------------------------------------------------ 

429 # Additional Methods 

430 # ------------------------------------------------------------------------ 

431 

def astype(self, dtype, copy=True):
    """
    Cast to a NumPy array with 'dtype'.

    Parameters
    ----------
    dtype : str or dtype
        Typecode or data-type to which the array is cast.
    copy : bool, default True
        Whether to copy the data, even if not necessary. If False,
        a copy is made only when the conversion requires one.

    Returns
    -------
    array : ndarray
        NumPy ndarray with 'dtype' for its dtype.
    """
    if not copy:
        # ``np.array(..., copy=False)`` raises ValueError on NumPy >= 2.0
        # whenever a copy cannot be avoided. ``np.asarray`` copies only
        # when needed on every NumPy version, which is the documented
        # meaning of ``copy=False`` here.
        return np.asarray(self, dtype=dtype)
    return np.array(self, dtype=dtype, copy=True)

451 

def isna(self) -> ArrayLike:
    """
    Flag which values are missing, as a 1-D array.

    Abstract: subclasses must supply the implementation.

    Returns
    -------
    na_values : Union[np.ndarray, ExtensionArray]
        Usually a NumPy ndarray. In exceptional cases such as
        ``SparseArray``, where producing an ndarray would be expensive,
        an ExtensionArray may be returned instead.

    Raises
    ------
    AbstractMethodError
        Always, in this base implementation.

    Notes
    -----
    When an ExtensionArray is returned, then

    * ``na_values._is_boolean`` should be True
    * `na_values` should implement :func:`ExtensionArray._reduce`
    * ``na_values.any`` and ``na_values.all`` should be implemented
    """
    raise AbstractMethodError(self)

473 

def _values_for_argsort(self) -> np.ndarray:
    """
    Produce the values that sorting should operate on.

    Returns
    -------
    ndarray
        The transformed values, which should keep the ordering between
        the values within the array. This default is ``np.array(self)``.

    See Also
    --------
    ExtensionArray.argsort
    """
    # Hook intended for `ExtensionArray.argsort`.
    sortable = np.array(self)
    return sortable

490 

def argsort(
    self, ascending: bool = True, kind: str = "quicksort", *args, **kwargs
) -> np.ndarray:
    """
    Compute the indices that put this array in sorted order.

    Parameters
    ----------
    ascending : bool, default True
        Whether the resulting indices describe an ascending or a
        descending sort.
    kind : {'quicksort', 'mergesort', 'heapsort'}, optional
        Sorting algorithm.
    *args, **kwargs:
        passed through to :func:`numpy.argsort`.

    Returns
    -------
    ndarray
        Array of indices that sort ``self``. If NaN values are contained,
        NaN values are placed at the end.

    See Also
    --------
    numpy.argsort : Sorting implementation used internally.
    """
    # Note for implementors: argsort behaviour can be customised at two
    # levels.
    # 1. _values_for_argsort : build the values handed to np.argsort
    # 2. argsort : take over the sorting entirely.
    ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs)
    return nargsort(self, kind=kind, ascending=ascending, na_position="last")

525 

def fillna(self, value=None, method=None, limit=None):
    """
    Replace NA/NaN values, either with `value` or by propagation.

    Parameters
    ----------
    value : scalar, array-like
        A scalar is used to fill every missing value. An array-like is
        expected to have the same length as 'self'.
    method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        Method to use for filling holes in reindexed Series
        pad / ffill: propagate last valid observation forward to next valid
        backfill / bfill: use NEXT valid observation to fill gap.
    limit : int, default None
        If method is specified, this is the maximum number of consecutive
        NaN values to forward/backward fill. In other words, if there is
        a gap with more than this number of consecutive NaNs, it will only
        be partially filled. If method is not specified, this is the
        maximum number of entries along the entire axis where NaNs will be
        filled.

    Returns
    -------
    ExtensionArray
        With NA/NaN filled.

    Raises
    ------
    ValueError
        If `value` is array-like and its length differs from
        ``len(self)``.
    """
    value, method = validate_fillna_kwargs(value, method)
    missing = self.isna()

    if is_array_like(value):
        if len(value) != len(self):
            raise ValueError(
                f"Length of 'value' does not match. Got ({len(value)}) "
                f"expected {len(self)}"
            )
        # Keep only the replacements that line up with missing slots.
        value = value[missing]

    if not missing.any():
        # Nothing to fill; the caller still receives a new object.
        return self.copy()

    if method is None:
        # fill with value
        filled = self.copy()
        filled[missing] = value
        return filled

    propagate = pad_1d if method == "pad" else backfill_1d
    propagated = propagate(self.astype(object), limit=limit, mask=missing)
    return self._from_sequence(propagated, dtype=self.dtype)

577 

def dropna(self):
    """
    Build an ExtensionArray holding only the non-NA values.

    Returns
    -------
    valid : ExtensionArray
    """
    present = ~self.isna()
    return self[present]

587 

def shift(self, periods: int = 1, fill_value: object = None) -> ABCExtensionArray:
    """
    Move the values by `periods` positions.

    Positions vacated by the move are filled with
    ``self.dtype.na_value`` unless `fill_value` says otherwise.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    periods : int, default 1
        The number of periods to shift. Negative values are allowed
        for shifting backwards.

    fill_value : object, optional
        The scalar value to use for newly introduced missing values.
        The default is ``self.dtype.na_value``.

        .. versionadded:: 0.24.0

    Returns
    -------
    ExtensionArray
        Shifted.

    Notes
    -----
    If ``self`` is empty or ``periods`` is 0, a copy of ``self`` is
    returned.

    If ``periods > len(self)``, then an array of size
    len(self) is returned, with all values filled with
    ``self.dtype.na_value``.
    """
    # This implementation relies on `self.dtype.na_value` being storable
    # in an instance of the ExtensionArray with `self.dtype`.
    if not len(self) or periods == 0:
        return self.copy()

    if isna(fill_value):
        fill_value = self.dtype.na_value

    # Never pad with more elements than the array holds.
    n_fill = min(abs(periods), len(self))
    padding = self._from_sequence([fill_value] * n_fill, dtype=self.dtype)
    if periods > 0:
        pieces = [padding, self[:-periods]]
    else:
        pieces = [self[abs(periods) :], padding]
    return self._concat_same_type(pieces)

641 

def unique(self):
    """
    Build the ExtensionArray of the distinct values in this array.

    Returns
    -------
    uniques : ExtensionArray
    """
    # Imported under an alias so the helper cannot be confused with this
    # method, which carries the same name.
    from pandas.core.algorithms import unique as unique_values

    distinct = unique_values(self.astype(object))
    return self._from_sequence(distinct, dtype=self.dtype)

652 

def searchsorted(self, value, side="left", sorter=None):
    """
    Locate the insertion points that keep the array ordered.

    .. versionadded:: 0.24.0

    Find the indices into a sorted array `self` (a) such that, if the
    corresponding elements in `value` were inserted before the indices,
    the order of `self` would be preserved.

    Assuming that `self` is sorted:

    ======  ================================
    `side`  returned index `i` satisfies
    ======  ================================
    left    ``self[i-1] < value <= self[i]``
    right   ``self[i-1] <= value < self[i]``
    ======  ================================

    Parameters
    ----------
    value : array_like
        Values to insert into `self`.
    side : {'left', 'right'}, optional
        If 'left', the index of the first suitable location found is given.
        If 'right', return the last such index. If there is no suitable
        index, return either 0 or N (where N is the length of `self`).
    sorter : 1-D array_like, optional
        Optional array of integer indices that sort array a into ascending
        order. They are typically the result of argsort.

    Returns
    -------
    array of ints
        Array of insertion points with the same shape as `value`.

    See Also
    --------
    numpy.searchsorted : Similar method from NumPy.
    """
    # The base tests shipped with pandas cover only the basics. Not
    # covered are
    # 1. Values outside the range of the `data_for_sorting` fixture
    # 2. Values between the values in the `data_for_sorting` fixture
    # 3. Missing values.
    as_objects = self.astype(object)
    return as_objects.searchsorted(value, side=side, sorter=sorter)

700 

def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
    """
    Provide the array and missing-value marker used for factorization.

    Returns
    -------
    values : ndarray

        An array suitable for factorization. This should maintain order
        and be a supported dtype (Float64, Int64, UInt64, String, Object).
        By default, the extension array is cast to object dtype.
    na_value : object
        The value in `values` to consider missing. This will be treated
        as NA in the factorization routines, so it will be coded as
        `na_sentinel` and not included in `uniques`. By default,
        ``np.nan`` is used.

    Notes
    -----
    The values returned by this method are also used in
    :func:`pandas.util.hash_pandas_object`.
    """
    values = self.astype(object)
    return values, np.nan

724 

def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArray]:
    """
    Encode the extension array as an enumerated type.

    Parameters
    ----------
    na_sentinel : int, default -1
        Value placed in the `codes` array to mark missing values.

    Returns
    -------
    codes : ndarray
        An integer NumPy array that's an indexer into the original
        ExtensionArray.
    uniques : ExtensionArray
        An ExtensionArray containing the unique values of `self`.

        .. note::

           uniques will *not* contain an entry for the NA value of
           the ExtensionArray if there are any missing values present
           in `self`.

    See Also
    --------
    factorize : Top-level factorize method that dispatches here.

    Notes
    -----
    :meth:`pandas.factorize` offers a `sort` keyword as well.
    """
    # Note for implementers: pandas.factorize can be customised at two
    # levels.
    # 1. _values_for_factorize and _from_factorized.
    #    Choose the values handed to pandas' internal factorization
    #    routines, and how those values are converted back to the
    #    original ExtensionArray.
    # 2. ExtensionArray.factorize.
    #    Take over factorization entirely.
    values, missing_marker = self._values_for_factorize()
    codes, raw_uniques = _factorize_array(
        values, na_sentinel=na_sentinel, na_value=missing_marker
    )
    return codes, self._from_factorized(raw_uniques, self)

772 

# Docstring template for ``repeat``; ``%(klass)s`` is filled in by
# ``Substitution`` for each class that reuses it.
_extension_array_shared_docs["repeat"] = """
    Repeat elements of a %(klass)s.

    Returns a new %(klass)s where each element of the current %(klass)s
    is repeated consecutively a given number of times.

    Parameters
    ----------
    repeats : int or array of ints
        The number of repetitions for each element. This should be a
        non-negative integer. Repeating 0 times will return an empty
        %(klass)s.
    axis : None
        Must be ``None``. Has no effect but is accepted for compatibility
        with numpy.

    Returns
    -------
    repeated_array : %(klass)s
        Newly created %(klass)s with repeated elements.

    See Also
    --------
    Series.repeat : Equivalent function for Series.
    Index.repeat : Equivalent function for Index.
    numpy.repeat : Similar method for :class:`numpy.ndarray`.
    ExtensionArray.take : Take arbitrary positions.

    Examples
    --------
    >>> cat = pd.Categorical(['a', 'b', 'c'])
    >>> cat
    [a, b, c]
    Categories (3, object): [a, b, c]
    >>> cat.repeat(2)
    [a, a, b, b, c, c]
    Categories (3, object): [a, b, c]
    >>> cat.repeat([1, 2, 3])
    [a, b, b, c, c, c]
    Categories (3, object): [a, b, c]
    """

@Substitution(klass="ExtensionArray")
@Appender(_extension_array_shared_docs["repeat"])
def repeat(self, repeats, axis=None):
    # The docstring comes from the shared template above.
    # ``axis`` is validated through the numpy-compat helper.
    nv.validate_repeat((), {"axis": axis})
    # Repeat the positions, then gather the elements with ``take``.
    positions = np.arange(len(self)).repeat(repeats)
    return self.take(positions)

823 

824 # ------------------------------------------------------------------------ 

825 # Indexing methods 

826 # ------------------------------------------------------------------------ 

827 

def take(
    self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None
) -> ABCExtensionArray:
    """
    Select elements of the array by position.

    Abstract: subclasses must supply the implementation.

    Parameters
    ----------
    indices : sequence of int
        Positions of the elements to take.
    allow_fill : bool, default False
        Decides what negative entries in `indices` mean.

        * False: negative entries are positions counted from the right
          (the default). This is similar to :func:`numpy.take`.

        * True: negative entries in `indices` mark missing values, which
          are set to `fill_value`. Any negative entry other than ``-1``
          raises a ``ValueError``.

    fill_value : any, optional
        Value used for the NA-indices when `allow_fill` is True.
        ``None`` selects the default NA value for the type,
        ``self.dtype.na_value``.

        Many ExtensionArrays have two representations of `fill_value`:
        a user-facing "boxed" scalar, and a low-level physical NA value.
        `fill_value` should be the user-facing version; the
        implementation translates it to the physical version for the
        take if that is necessary.

    Returns
    -------
    ExtensionArray

    Raises
    ------
    IndexError
        When the indices are out of bounds for the array.
    ValueError
        When `indices` contains negative values other than ``-1``
        and `allow_fill` is True.
    AbstractMethodError
        Always, in this base implementation.

    See Also
    --------
    numpy.take
    api.extensions.take

    Notes
    -----
    ``Series.__getitem__``, ``.loc`` and ``iloc`` call ExtensionArray.take
    when `indices` is a sequence of values. :meth:`Series.reindex`, and
    any other method that causes realignment, calls it as well, with a
    `fill_value`.

    Examples
    --------
    The implementation below casts the extension array to object dtype
    and delegates to the helper :func:`pandas.api.extensions.take`.

    .. code-block:: python

       def take(self, indices, allow_fill=False, fill_value=None):
           from pandas.core.algorithms import take

           # If the ExtensionArray is backed by an ndarray, then
           # just pass that here instead of coercing to object.
           data = self.astype(object)

           if allow_fill and fill_value is None:
               fill_value = self.dtype.na_value

           # fill value should always be translated from the scalar
           # type for the array, to the physical storage type for
           # the data, before passing to take.

           result = take(data, indices, fill_value=fill_value,
                         allow_fill=allow_fill)
           return self._from_sequence(result, dtype=self.dtype)
    """
    # Note for implementers: `fill_value` is a user-facing value, i.e. an
    # instance of self.dtype.type, and `fill_value=None` means "use
    # `self.dtype.na_value`". That may differ from the physical storage
    # type of the ExtensionArray, in which case the implementation has to
    # cast the user-facing type to the storage type before calling
    # pandas.api.extensions.take
    raise AbstractMethodError(self)

918 

def copy(self) -> ABCExtensionArray:
    """
    Produce a copy of the array.

    Abstract: subclasses must supply the implementation.

    Returns
    -------
    ExtensionArray

    Raises
    ------
    AbstractMethodError
        Always, in this base implementation.
    """
    raise AbstractMethodError(self)

928 

def view(self, dtype=None) -> Union[ABCExtensionArray, np.ndarray]:
    """
    Produce a view on the array.

    Parameters
    ----------
    dtype : str, np.dtype, or ExtensionDtype, optional
        Default None.

    Returns
    -------
    ExtensionArray
        A view of the :class:`ExtensionArray`.

    Raises
    ------
    NotImplementedError
        If `dtype` is anything other than None.
    """
    # NB:
    # - The result has to be a *new* object referencing the same data,
    #   never self.
    # - Only dtype=None, giving a view with the same dtype as self,
    #   *must* be supported.
    if dtype is None:
        return self[:]
    raise NotImplementedError(dtype)

950 

951 # ------------------------------------------------------------------------ 

952 # Printing 

953 # ------------------------------------------------------------------------ 

954 

def __repr__(self) -> str:
    """
    Default repr: class name, formatted data, length and dtype.
    """
    from pandas.io.formats.printing import format_object_summary

    # The short repr has no trailing newline while the truncated repr
    # does. The newline therefore lives in the template below, and any
    # trailing newlines from format_object_summary are stripped.
    summary = format_object_summary(self, self._formatter(), indent_for_name=False)
    body = summary.rstrip(", \n")
    header = f"<{type(self).__name__}>"
    return f"{header}\n{body}\nLength: {len(self)}, dtype: {self.dtype}"

966 

def _formatter(self, boxed: bool = False) -> Callable[[Any], Optional[str]]:
    """
    Choose the function used to format scalar values.

    The default '__repr__' uses it. The returned formatting function
    receives instances of your scalar type.

    Parameters
    ----------
    boxed : bool, default False
        Indicates whether the array is being printed within a Series,
        DataFrame, or Index (True), or just by itself (False). This may
        be useful if you want scalar values to appear differently within
        a Series versus on its own (e.g. quoted or not).

    Returns
    -------
    Callable[[Any], str]
        A callable that gets instances of the scalar type and
        returns a string. By default, :func:`repr` is used
        when ``boxed=False`` and :func:`str` is used when
        ``boxed=True``.
    """
    return str if boxed else repr

993 

994 # ------------------------------------------------------------------------ 

995 # Reshaping 

996 # ------------------------------------------------------------------------ 

997 

def ravel(self, order="C") -> ABCExtensionArray:
    """
    Flatten this array.

    Parameters
    ----------
    order : {None, 'C', 'F', 'A', 'K'}, default 'C'
        Accepted only for NumPy compatibility; it has no effect.

    Returns
    -------
    ExtensionArray
        The very same object this was called on.

    Notes
    -----
    - ExtensionArrays are 1D-only, so there is nothing to flatten and
      this is a no-op.
    - The "order" argument is ignored.
    """
    return self

1016 

1017 @classmethod 

1018 def _concat_same_type( 

1019 cls, to_concat: Sequence[ABCExtensionArray] 

1020 ) -> ABCExtensionArray: 

1021 """ 

1022 Concatenate multiple array. 

1023 

1024 Parameters 

1025 ---------- 

1026 to_concat : sequence of this type 

1027 

1028 Returns 

1029 ------- 

1030 ExtensionArray 

1031 """ 

1032 raise AbstractMethodError(cls) 

1033 

# The _can_hold_na attribute is set to True so that pandas internals
# will use the ExtensionDtype.na_value as the NA value in operations
# such as take(), reindex(), shift(), etc. In addition, the results of
# those operations will then be of the ExtensionArray subclass rather
# than an array of objects.
_can_hold_na = True

1040 

1041 @property 

1042 def _ndarray_values(self) -> np.ndarray: 

1043 """ 

1044 Internal pandas method for lossy conversion to a NumPy ndarray. 

1045 

1046 This method is not part of the pandas interface. 

1047 

1048 The expectation is that this is cheap to compute, and is primarily 

1049 used for interacting with our indexers. 

1050 

1051 Returns 

1052 ------- 

1053 array : ndarray 

1054 """ 

1055 return np.array(self) 

1056 

1057 def _reduce(self, name, skipna=True, **kwargs): 

1058 """ 

1059 Return a scalar result of performing the reduction operation. 

1060 

1061 Parameters 

1062 ---------- 

1063 name : str 

1064 Name of the function, supported values are: 

1065 { any, all, min, max, sum, mean, median, prod, 

1066 std, var, sem, kurt, skew }. 

1067 skipna : bool, default True 

1068 If True, skip NaN values. 

1069 **kwargs 

1070 Additional keyword arguments passed to the reduction function. 

1071 Currently, `ddof` is the only supported kwarg. 

1072 

1073 Returns 

1074 ------- 

1075 scalar 

1076 

1077 Raises 

1078 ------ 

1079 TypeError : subclass does not define reductions 

1080 """ 

1081 raise TypeError(f"cannot perform {name} with type {self.dtype}") 

1082 

1083 

class ExtensionOpsMixin:
    """
    Mixin that attaches operator implementations under their dunder names.

    Each ``_add_*_ops`` classmethod asks the matching ``_create_*_method``
    factory for one method per operator and stores it on the class. The
    factories are not defined here; the class using this mixin has to
    provide them.

    .. note::

       You may want to set ``__array_priority__`` if you want your
       implementation to be called when involved in binary operations
       with NumPy arrays.
    """

    @classmethod
    def _add_arithmetic_ops(cls):
        """Install the forward and reflected arithmetic dunders on ``cls``."""
        arithmetic_table = (
            ("__add__", operator.add),
            ("__radd__", ops.radd),
            ("__sub__", operator.sub),
            ("__rsub__", ops.rsub),
            ("__mul__", operator.mul),
            ("__rmul__", ops.rmul),
            ("__pow__", operator.pow),
            ("__rpow__", ops.rpow),
            ("__mod__", operator.mod),
            ("__rmod__", ops.rmod),
            ("__floordiv__", operator.floordiv),
            ("__rfloordiv__", ops.rfloordiv),
            ("__truediv__", operator.truediv),
            ("__rtruediv__", ops.rtruediv),
            ("__divmod__", divmod),
            ("__rdivmod__", ops.rdivmod),
        )
        for dunder, func in arithmetic_table:
            setattr(cls, dunder, cls._create_arithmetic_method(func))

    @classmethod
    def _add_comparison_ops(cls):
        """Install the six rich-comparison dunders on ``cls``."""
        comparison_table = (
            ("__eq__", operator.eq),
            ("__ne__", operator.ne),
            ("__lt__", operator.lt),
            ("__gt__", operator.gt),
            ("__le__", operator.le),
            ("__ge__", operator.ge),
        )
        for dunder, func in comparison_table:
            setattr(cls, dunder, cls._create_comparison_method(func))

    @classmethod
    def _add_logical_ops(cls):
        """Install the forward and reflected and/or/xor dunders on ``cls``."""
        logical_table = (
            ("__and__", operator.and_),
            ("__rand__", ops.rand_),
            ("__or__", operator.or_),
            ("__ror__", ops.ror_),
            ("__xor__", operator.xor),
            ("__rxor__", ops.rxor),
        )
        for dunder, func in logical_table:
            setattr(cls, dunder, cls._create_logical_method(func))

1131 

1132 

class ExtensionScalarOpsMixin(ExtensionOpsMixin):
    """
    A mixin for defining ops on an ExtensionArray.

    It is assumed that the underlying scalar objects have the operators
    already defined.

    Notes
    -----
    If you have defined a subclass MyExtensionArray(ExtensionArray), then
    use MyExtensionArray(ExtensionArray, ExtensionScalarOpsMixin) to
    get the arithmetic operators.  After the definition of MyExtensionArray,
    insert the lines

    MyExtensionArray._add_arithmetic_ops()
    MyExtensionArray._add_comparison_ops()

    to link the operators to your class.

    .. note::

       You may want to set ``__array_priority__`` if you want your
       implementation to be called when involved in binary operations
       with NumPy arrays.
    """

    @classmethod
    def _create_method(cls, op, coerce_to_dtype=True):
        """
        A class method that returns a method that will correspond to an
        operator for an ExtensionArray subclass, by dispatching to the
        relevant operator defined on the individual elements of the
        ExtensionArray.

        Parameters
        ----------
        op : function
            An operator that takes arguments op(a, b)
        coerce_to_dtype : bool, default True
            boolean indicating whether to attempt to convert
            the result to the underlying ExtensionArray dtype.
            If it's not possible to create a new ExtensionArray with the
            values, an ndarray is returned instead.

        Returns
        -------
        Callable[[Any, Any], Union[ndarray, ExtensionArray]]
            A method that can be bound to a class. When used, the method
            receives the two arguments, one of which is the instance of
            this class, and should return an ExtensionArray or an ndarray.

            Returning an ndarray may be necessary when the result of the
            `op` cannot be stored in the ExtensionArray. The dtype of the
            ndarray uses NumPy's normal inference rules.

            For ``divmod``/``rdivmod`` the method returns a 2-tuple of
            such results (quotients, remainders); for a zero-length
            array both members are empty.

        Examples
        --------
        Given an ExtensionArray subclass called MyExtensionArray, use

        >>> __add__ = cls._create_method(operator.add)

        in the class definition of MyExtensionArray to create the operator
        for addition, that will be based on the operator implementation
        of the underlying elements of the ExtensionArray
        """

        def _binop(self, other):
            def convert_values(param):
                # Array-likes are paired element by element as they are;
                # anything else is treated as a scalar and repeated once
                # per element of ``self``.
                if isinstance(param, ExtensionArray) or is_list_like(param):
                    ovalues = param
                else:  # Assume it's an object
                    ovalues = [param] * len(self)
                return ovalues

            if isinstance(other, (ABCSeries, ABCIndexClass)):
                # rely on pandas to unbox and dispatch to us
                return NotImplemented

            lvalues = self
            rvalues = convert_values(other)

            # If the operator is not defined for the underlying objects,
            # a TypeError should be raised
            res = [op(a, b) for (a, b) in zip(lvalues, rvalues)]

            def _maybe_convert(arr):
                if coerce_to_dtype:
                    # https://github.com/pandas-dev/pandas/issues/22850
                    # We catch all regular exceptions here, and fall back
                    # to an ndarray.
                    res = try_cast_to_ea(self, arr)
                    if not isinstance(res, type(self)):
                        # exception raised in _from_sequence; ensure we have ndarray
                        res = np.asarray(arr)
                else:
                    res = np.asarray(arr)
                return res

            if op.__name__ in {"divmod", "rdivmod"}:
                # Each element of ``res`` is a (quotient, remainder) pair;
                # transpose into two parallel sequences.  ``zip(*[])``
                # yields nothing, so unpacking it would raise ValueError
                # for a zero-length array -- hand back two empty results
                # in that case instead.
                if res:
                    a, b = zip(*res)
                else:
                    a, b = (), ()
                return _maybe_convert(a), _maybe_convert(b)

            return _maybe_convert(res)

        op_name = ops._get_op_name(op, True)
        return set_function_name(_binop, op_name, cls)

    @classmethod
    def _create_arithmetic_method(cls, op):
        """Build an arithmetic method whose result is coerced back to the array type when possible."""
        return cls._create_method(op)

    @classmethod
    def _create_comparison_method(cls, op):
        """Build a comparison method; its result is always returned as an ndarray."""
        return cls._create_method(op, coerce_to_dtype=False)