1""" 

2Base and utility classes for pandas objects. 

3""" 

4import builtins 

5import textwrap 

6from typing import Dict, FrozenSet, List, Optional 

7 

8import numpy as np 

9 

10import pandas._libs.lib as lib 

11from pandas.compat import PYPY 

12from pandas.compat.numpy import function as nv 

13from pandas.errors import AbstractMethodError 

14from pandas.util._decorators import Appender, Substitution, cache_readonly 

15from pandas.util._validators import validate_bool_kwarg 

16 

17from pandas.core.dtypes.cast import is_nested_object 

18from pandas.core.dtypes.common import ( 

19 is_categorical_dtype, 

20 is_dict_like, 

21 is_extension_array_dtype, 

22 is_list_like, 

23 is_object_dtype, 

24 is_scalar, 

25 needs_i8_conversion, 

26) 

27from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries 

28from pandas.core.dtypes.missing import isna 

29 

30from pandas.core import algorithms, common as com 

31from pandas.core.accessor import DirNamesMixin 

32from pandas.core.algorithms import duplicated, unique1d, value_counts 

33from pandas.core.arrays import ExtensionArray 

34from pandas.core.construction import create_series_with_explicit_dtype 

35import pandas.core.nanops as nanops 

36 

37_shared_docs: Dict[str, str] = dict() 

38_indexops_doc_kwargs = dict( 

39 klass="IndexOpsMixin", 

40 inplace="", 

41 unique="IndexOpsMixin", 

42 duplicated="IndexOpsMixin", 

43) 

44 

45 

46class PandasObject(DirNamesMixin): 

47 """baseclass for various pandas objects""" 

48 

49 @property 

50 def _constructor(self): 

51 """class constructor (for this class it's just `__class__`""" 

52 return type(self) 

53 

54 def __repr__(self) -> str: 

55 """ 

56 Return a string representation for a particular object. 

57 """ 

58 # Should be overwritten by base classes 

59 return object.__repr__(self) 

60 

61 def _reset_cache(self, key=None): 

62 """ 

63 Reset cached properties. If ``key`` is passed, only clears that key. 

64 """ 

65 if getattr(self, "_cache", None) is None: 

66 return 

67 if key is None: 

68 self._cache.clear() 

69 else: 

70 self._cache.pop(key, None) 

71 

72 def __sizeof__(self): 

73 """ 

74 Generates the total memory usage for an object that returns 

75 either a value or Series of values 

76 """ 

77 if hasattr(self, "memory_usage"): 

78 mem = self.memory_usage(deep=True) 

79 if not is_scalar(mem): 

80 mem = mem.sum() 

81 return int(mem) 

82 

83 # no memory_usage attribute, so fall back to 

84 # object's 'sizeof' 

85 return super().__sizeof__() 

86 

87 

88class NoNewAttributesMixin: 

89 """Mixin which prevents adding new attributes. 

90 

91 Prevents additional attributes via xxx.attribute = "something" after a 

92 call to `self.__freeze()`. Mainly used to prevent the user from using 

93 wrong attributes on an accessor (`Series.cat/.str/.dt`). 

94 

95 If you really want to add a new attribute at a later time, you need to use 

96 `object.__setattr__(self, key, value)`. 

97 """ 

98 

99 def _freeze(self): 

100 """Prevents setting additional attributes""" 

101 object.__setattr__(self, "__frozen", True) 

102 

103 # prevent adding any attribute via s.xxx.new_attribute = ... 

104 def __setattr__(self, key, value): 

105 # _cache is used by a decorator 

106 # We need to check both 1.) cls.__dict__ and 2.) getattr(self, key) 

107 # because 

108 # 1.) getattr is false for attributes that raise errors 

109 # 2.) cls.__dict__ doesn't traverse into base classes 

110 if getattr(self, "__frozen", False) and not ( 

111 key == "_cache" 

112 or key in type(self).__dict__ 

113 or getattr(self, key, None) is not None 

114 ): 

115 raise AttributeError(f"You cannot add any new attribute '{key}'") 

116 object.__setattr__(self, key, value) 
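
# Usage sketch (illustrative only, not executed as part of this module): a
# minimal hypothetical subclass showing the freeze behavior described above.
#
#   class MyAccessor(NoNewAttributesMixin):
#       def __init__(self):
#           self.existing = 1
#           self._freeze()
#
#   acc = MyAccessor()
#   acc.existing = 2   # ok: the attribute already existed before the freeze
#   acc.brand_new = 3  # AttributeError: You cannot add any new attribute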


class GroupByError(Exception):
    pass


class DataError(GroupByError):
    pass


class SpecificationError(GroupByError):
    pass


class SelectionMixin:
    """
    Mixin implementing the selection & aggregation interface on a group-like
    object; sub-classes need to define: obj, exclusions.
    """

    _selection = None
    _internal_names = ["_cache", "__setstate__"]
    _internal_names_set = set(_internal_names)

    _builtin_table = {builtins.sum: np.sum, builtins.max: np.max, builtins.min: np.min}

    _cython_table = {
        builtins.sum: "sum",
        builtins.max: "max",
        builtins.min: "min",
        np.all: "all",
        np.any: "any",
        np.sum: "sum",
        np.nansum: "sum",
        np.mean: "mean",
        np.nanmean: "mean",
        np.prod: "prod",
        np.nanprod: "prod",
        np.std: "std",
        np.nanstd: "std",
        np.var: "var",
        np.nanvar: "var",
        np.median: "median",
        np.nanmedian: "median",
        np.max: "max",
        np.nanmax: "max",
        np.min: "min",
        np.nanmin: "min",
        np.cumprod: "cumprod",
        np.nancumprod: "cumprod",
        np.cumsum: "cumsum",
        np.nancumsum: "cumsum",
    }
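
    # Illustrative note (hypothetical names, not executed here): these tables
    # let ``agg`` reroute common callables to pandas' optimized reductions.
    #
    #   >>> df.groupby("key").agg(np.sum)  # looked up in _cython_table -> "sum"
    #   >>> df.groupby("key").agg(sum)     # builtin sum -> np.sum via _builtin_table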

    @property
    def _selection_name(self):
        """
        Return a name for myself; this would ideally be called
        the 'name' property, but we cannot conflict with the
        Series.name property which can be set.
        """
        if self._selection is None:
            return None  # 'result'
        else:
            return self._selection

    @property
    def _selection_list(self):
        if not isinstance(
            self._selection, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)
        ):
            return [self._selection]
        return self._selection

    @cache_readonly
    def _selected_obj(self):
        if self._selection is None or isinstance(self.obj, ABCSeries):
            return self.obj
        else:
            return self.obj[self._selection]

    @cache_readonly
    def ndim(self) -> int:
        return self._selected_obj.ndim

    @cache_readonly
    def _obj_with_exclusions(self):
        if self._selection is not None and isinstance(self.obj, ABCDataFrame):
            return self.obj.reindex(columns=self._selection_list)

        if len(self.exclusions) > 0:
            return self.obj.drop(self.exclusions, axis=1)
        else:
            return self.obj

    def __getitem__(self, key):
        if self._selection is not None:
            raise IndexError(f"Column(s) {self._selection} already selected")

        if isinstance(key, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)):
            if len(self.obj.columns.intersection(key)) != len(key):
                bad_keys = list(set(key).difference(self.obj.columns))
                raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}")
            return self._gotitem(list(key), ndim=2)

        elif not getattr(self, "as_index", False):
            if key not in self.obj.columns:
                raise KeyError(f"Column not found: {key}")
            return self._gotitem(key, ndim=2)

        else:
            if key not in self.obj:
                raise KeyError(f"Column not found: {key}")
            return self._gotitem(key, ndim=1)
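
    # Usage sketch (hypothetical frame, not executed here): column selection
    # on a group-like object goes through ``__getitem__`` -> ``_gotitem``.
    #
    #   >>> df = pd.DataFrame({"A": ["x", "y"], "B": [1, 2], "C": [3, 4]})
    #   >>> df.groupby("A")["B"]         # ndim=1 selection (SeriesGroupBy)
    #   >>> df.groupby("A")[["B", "C"]]  # ndim=2 selection (DataFrameGroupBy)
    #   >>> df.groupby("A")["Z"]         # KeyError: Column not found: Z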

    def _gotitem(self, key, ndim, subset=None):
        """
        Sub-classes to define; return a sliced object.

        Parameters
        ----------
        key : str / list of selections
        ndim : 1, 2
            requested ndim of result
        subset : object, default None
            subset to act on
        """
        raise AbstractMethodError(self)

    def aggregate(self, func, *args, **kwargs):
        raise AbstractMethodError(self)

    agg = aggregate

    def _try_aggregate_string_function(self, arg: str, *args, **kwargs):
        """
        If arg is a string, then try to operate on it:
        - try to find a function (or attribute) on ourselves
        - try to find a numpy function
        - raise
        """
        assert isinstance(arg, str)

        f = getattr(self, arg, None)
        if f is not None:
            if callable(f):
                return f(*args, **kwargs)

            # people may try to aggregate on a non-callable attribute
            # but don't let them think they can pass args to it
            assert len(args) == 0
            assert len([kwarg for kwarg in kwargs if kwarg not in ["axis"]]) == 0
            return f

        f = getattr(np, arg, None)
        if f is not None:
            if hasattr(self, "__array__"):
                # in particular exclude Window
                return f(self, *args, **kwargs)

        raise AttributeError(
            f"'{arg}' is not a valid function for '{type(self).__name__}' object"
        )
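
    # Illustrative note (hypothetical session): string aggregations resolve
    # against ``self`` first, then against the numpy namespace.
    #
    #   >>> s = pd.Series([1, 2, 3])
    #   >>> s.agg("mean")  # found as Series.mean -> 2.0
    #   >>> s.agg("ptp")   # no Series.ptp, so falls back to np.ptp -> 2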

    def _aggregate(self, arg, *args, **kwargs):
        """
        Provide an implementation for the aggregators.

        Parameters
        ----------
        arg : string, dict, function
        *args : args to pass on to the function
        **kwargs : kwargs to pass on to the function

        Returns
        -------
        tuple of result, how

        Notes
        -----
        how can be a string describing the required post-processing, or
        None if not required.
        """
        is_aggregator = lambda x: isinstance(x, (list, tuple, dict))

        _axis = kwargs.pop("_axis", None)
        if _axis is None:
            _axis = getattr(self, "axis", 0)

        if isinstance(arg, str):
            return self._try_aggregate_string_function(arg, *args, **kwargs), None

        if isinstance(arg, dict):
            # aggregate based on the passed dict
            if _axis != 0:  # pragma: no cover
                raise ValueError("Can only pass dict with axis=0")

            obj = self._selected_obj

            # if we have a dict of any non-scalars
            # eg. {'A' : ['mean']}, normalize all to
            # be list-likes
            if any(is_aggregator(x) for x in arg.values()):
                new_arg = {}
                for k, v in arg.items():
                    if not isinstance(v, (tuple, list, dict)):
                        new_arg[k] = [v]
                    else:
                        new_arg[k] = v

                    # the keys must be in the columns
                    # for ndim=2, or renamers for ndim=1

                    # ok for now, but deprecated
                    # {'A': { 'ra': 'mean' }}
                    # {'A': { 'ra': ['mean'] }}
                    # {'ra': ['mean']}

                    # not ok
                    # {'ra' : { 'A' : 'mean' }}
                    if isinstance(v, dict):
                        raise SpecificationError("nested renamer is not supported")
                    elif isinstance(obj, ABCSeries):
                        raise SpecificationError("nested renamer is not supported")
                    elif isinstance(obj, ABCDataFrame) and k not in obj.columns:
                        raise KeyError(f"Column '{k}' does not exist!")

                arg = new_arg

            else:
                # deprecation of renaming keys
                # GH 15931
                keys = list(arg.keys())
                if isinstance(obj, ABCDataFrame) and len(
                    obj.columns.intersection(keys)
                ) != len(keys):
                    raise SpecificationError("nested renamer is not supported")

            from pandas.core.reshape.concat import concat

            def _agg_1dim(name, how, subset=None):
                """
                aggregate a 1-dim with how
                """
                colg = self._gotitem(name, ndim=1, subset=subset)
                if colg.ndim != 1:
                    raise SpecificationError(
                        "nested dictionary is ambiguous in aggregation"
                    )
                return colg.aggregate(how)

            def _agg_2dim(name, how):
                """
                aggregate a 2-dim with how
                """
                colg = self._gotitem(self._selection, ndim=2, subset=obj)
                return colg.aggregate(how)

            def _agg(arg, func):
                """
                run the aggregations over the arg with func
                return a dict
                """
                result = {}
                for fname, agg_how in arg.items():
                    result[fname] = func(fname, agg_how)
                return result

            # set the final keys
            keys = list(arg.keys())
            result = {}

            if self._selection is not None:
                sl = set(self._selection_list)

                # we are a Series like object,
                # but may have multiple aggregations
                if len(sl) == 1:
                    result = _agg(
                        arg, lambda fname, agg_how: _agg_1dim(self._selection, agg_how)
                    )

                # we are selecting the same set as we are aggregating
                elif not len(sl - set(keys)):
                    result = _agg(arg, _agg_1dim)

                # we are a DataFrame, with possibly multiple aggregations
                else:
                    result = _agg(arg, _agg_2dim)

            # no selection
            else:
                try:
                    result = _agg(arg, _agg_1dim)
                except SpecificationError:
                    # we are aggregating expecting all 1d-returns
                    # but we have 2d
                    result = _agg(arg, _agg_2dim)

            # combine results

            def is_any_series() -> bool:
                # return a boolean if we have *any* nested series
                return any(isinstance(r, ABCSeries) for r in result.values())

            def is_any_frame() -> bool:
                # return a boolean if we have *any* nested frames
                return any(isinstance(r, ABCDataFrame) for r in result.values())

            if isinstance(result, list):
                return concat(result, keys=keys, axis=1, sort=True), True

            elif is_any_frame():
                # we have a dict of DataFrames
                # return a MI DataFrame
                return concat([result[k] for k in keys], keys=keys, axis=1), True

            elif isinstance(self, ABCSeries) and is_any_series():
                # we have a dict of Series
                # return a MI Series
                try:
                    result = concat(result)
                except TypeError:
                    # we want to give a nice error here if
                    # we have non-same sized objects, so
                    # we don't automatically broadcast
                    raise ValueError(
                        "cannot perform both aggregation "
                        "and transformation operations "
                        "simultaneously"
                    )

                return result, True

            # fall thru
            from pandas import DataFrame, Series

            try:
                result = DataFrame(result)
            except ValueError:
                # we have a dict of scalars
                result = Series(result, name=getattr(self, "name", None))

            return result, True
        elif is_list_like(arg):
            # we require a list, but not an 'str'
            return self._aggregate_multiple_funcs(arg, _axis=_axis), None
        else:
            result = None

        f = self._get_cython_func(arg)
        if f and not args and not kwargs:
            return getattr(self, f)(), None

        # caller can react
        return result, True
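
    # Illustrative note: dict arguments with any list-like values are first
    # normalized so that every value is a list, e.g. (hypothetical input)
    #
    #   {"A": "mean", "B": ["min", "max"]} -> {"A": ["mean"], "B": ["min", "max"]}
    #
    # while nested renamers such as {"A": {"ra": "mean"}} raise
    # SpecificationError("nested renamer is not supported").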

    def _aggregate_multiple_funcs(self, arg, _axis):
        from pandas.core.reshape.concat import concat

        if _axis != 0:
            raise NotImplementedError("axis other than 0 is not supported")

        if self._selected_obj.ndim == 1:
            obj = self._selected_obj
        else:
            obj = self._obj_with_exclusions

        results = []
        keys = []

        # degenerate case
        if obj.ndim == 1:
            for a in arg:
                colg = self._gotitem(obj.name, ndim=1, subset=obj)
                try:
                    new_res = colg.aggregate(a)
                except TypeError:
                    pass
                else:
                    results.append(new_res)

                    # make sure we find a good name
                    name = com.get_callable_name(a) or a
                    keys.append(name)

        # multiples
        else:
            for index, col in enumerate(obj):
                colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, index])
                try:
                    new_res = colg.aggregate(arg)
                except (TypeError, DataError):
                    pass
                except ValueError as err:
                    # cannot aggregate
                    if "Must produce aggregated value" in str(err):
                        # raised directly in _aggregate_named
                        pass
                    elif "no results" in str(err):
                        # raised directly in _aggregate_multiple_funcs
                        pass
                    else:
                        raise
                else:
                    results.append(new_res)
                    keys.append(col)

        # if we are empty
        if not len(results):
            raise ValueError("no results")

        try:
            return concat(results, keys=keys, axis=1, sort=False)
        except TypeError:
            # we are concatting non-NDFrame objects,
            # e.g. a list of scalars
            from pandas import Series

            result = Series(results, index=keys, name=self.name)
            if is_nested_object(result):
                raise ValueError("cannot combine transform and aggregation operations")
            return result

    def _get_cython_func(self, arg: str) -> Optional[str]:
        """
        If we define an internal function for this argument, return it.
        """
        return self._cython_table.get(arg)

    def _is_builtin_func(self, arg):
        """
        If we define a builtin function for this argument, return it,
        otherwise return the arg.
        """
        return self._builtin_table.get(arg, arg)


class ShallowMixin:
    _attributes: List[str] = []

    def _shallow_copy(self, obj=None, **kwargs):
        """
        Return a new object with the replacement attributes.
        """
        if obj is None:
            obj = self._selected_obj.copy()

        if isinstance(obj, self._constructor):
            obj = obj.obj
        for attr in self._attributes:
            if attr not in kwargs:
                kwargs[attr] = getattr(self, attr)
        return self._constructor(obj, **kwargs)
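
# Usage sketch (illustrative only; assumes the mixing class also provides
# ``_constructor`` and ``_selected_obj``, as pandas' window objects do):
#
#   r = df.rolling(window=2)        # hypothetical rolling object over df
#   r2 = r._shallow_copy(window=3)  # same underlying data, replaced attribute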


class IndexOpsMixin:
    """
    Common ops mixin to support a unified interface / docs for Series / Index.
    """

    # ndarray compatibility
    __array_priority__ = 1000
    _deprecations: FrozenSet[str] = frozenset(
        ["tolist"]  # tolist is not deprecated, just suppressed in the __dir__
    )

    def transpose(self, *args, **kwargs):
        """
        Return the transpose, which is by definition self.

        Returns
        -------
        %(klass)s
        """
        nv.validate_transpose(args, kwargs)
        return self

    T = property(
        transpose,
        doc="""
        Return the transpose, which is by definition self.
        """,
    )

    @property
    def shape(self):
        """
        Return a tuple of the shape of the underlying data.
        """
        return self._values.shape

    @property
    def ndim(self) -> int:
        """
        Number of dimensions of the underlying data, by definition 1.
        """
        return 1

    def item(self):
        """
        Return the first element of the underlying data as a Python scalar.

        Returns
        -------
        scalar
            The first element of %(klass)s.

        Raises
        ------
        ValueError
            If the data is not length-1.
        """
        if not (
            is_extension_array_dtype(self.dtype) or needs_i8_conversion(self.dtype)
        ):
            # numpy returns ints instead of datetime64/timedelta64 objects,
            # which we need to wrap in Timestamp/Timedelta/Period regardless.
            return self.values.item()

        if len(self) == 1:
            return next(iter(self))
        else:
            raise ValueError("can only convert an array of size 1 to a Python scalar")

    @property
    def nbytes(self):
        """
        Return the number of bytes in the underlying data.
        """
        return self._values.nbytes

    @property
    def size(self):
        """
        Return the number of elements in the underlying data.
        """
        return len(self._values)

    @property
    def array(self) -> ExtensionArray:
        """
        The ExtensionArray of the data backing this Series or Index.

        .. versionadded:: 0.24.0

        Returns
        -------
        ExtensionArray
            An ExtensionArray of the values stored within. For extension
            types, this is the actual array. For NumPy native types, this
            is a thin (no copy) wrapper around :class:`numpy.ndarray`.

            ``.array`` differs from ``.values``, which may require converting
            the data to a different form.

        See Also
        --------
        Index.to_numpy : Similar method that always returns a NumPy array.
        Series.to_numpy : Similar method that always returns a NumPy array.

        Notes
        -----
        This table lays out the different array types for each extension
        dtype within pandas.

        ================== =============================
        dtype              array type
        ================== =============================
        category           Categorical
        period             PeriodArray
        interval           IntervalArray
        IntegerNA          IntegerArray
        string             StringArray
        boolean            BooleanArray
        datetime64[ns, tz] DatetimeArray
        ================== =============================

        For any 3rd-party extension types, the array type will be an
        ExtensionArray.

        For all remaining dtypes ``.array`` will be a
        :class:`arrays.NumpyExtensionArray` wrapping the actual ndarray
        stored within. If you absolutely need a NumPy array (possibly with
        copying / coercing data), then use :meth:`Series.to_numpy` instead.

        Examples
        --------
        For regular NumPy types like int, and float, a PandasArray
        is returned.

        >>> pd.Series([1, 2, 3]).array
        <PandasArray>
        [1, 2, 3]
        Length: 3, dtype: int64

        For extension types, like Categorical, the actual ExtensionArray
        is returned.

        >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
        >>> ser.array
        [a, b, a]
        Categories (2, object): [a, b]
        """
        raise AbstractMethodError(self)

    def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default, **kwargs):
        """
        A NumPy ndarray representing the values in this Series or Index.

        .. versionadded:: 0.24.0

        Parameters
        ----------
        dtype : str or numpy.dtype, optional
            The dtype to pass to :meth:`numpy.asarray`.
        copy : bool, default False
            Whether to ensure that the returned value is not a view on
            another array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
            a copy is made, even if not strictly necessary.
        na_value : Any, optional
            The value to use for missing values. The default value depends
            on `dtype` and the type of the array.

            .. versionadded:: 1.0.0

        **kwargs
            Additional keywords passed through to the ``to_numpy`` method
            of the underlying array (for extension arrays).

            .. versionadded:: 1.0.0

        Returns
        -------
        numpy.ndarray

        See Also
        --------
        Series.array : Get the actual data stored within.
        Index.array : Get the actual data stored within.
        DataFrame.to_numpy : Similar method for DataFrame.

        Notes
        -----
        The returned array will be the same up to equality (values equal
        in `self` will be equal in the returned array; likewise for values
        that are not equal). When `self` contains an ExtensionArray, the
        dtype may be different. For example, for a category-dtype Series,
        ``to_numpy()`` will return a NumPy array and the categorical dtype
        will be lost.

        For NumPy dtypes, this will be a reference to the actual data stored
        in this Series or Index (assuming ``copy=False``). Modifying the result
        in place will modify the data stored in the Series or Index (not that
        we recommend doing that).

        For extension types, ``to_numpy()`` *may* require copying data and
        coercing the result to a NumPy type (possibly object), which may be
        expensive. When you need a no-copy reference to the underlying data,
        :attr:`Series.array` should be used instead.

        This table lays out the different dtypes and default return types of
        ``to_numpy()`` for various dtypes within pandas.

        ================== ================================
        dtype              array type
        ================== ================================
        category[T]        ndarray[T] (same dtype as input)
        period             ndarray[object] (Periods)
        interval           ndarray[object] (Intervals)
        IntegerNA          ndarray[object]
        datetime64[ns]     datetime64[ns]
        datetime64[ns, tz] ndarray[object] (Timestamps)
        ================== ================================

        Examples
        --------
        >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
        >>> ser.to_numpy()
        array(['a', 'b', 'a'], dtype=object)

        Specify the `dtype` to control how datetime-aware data is represented.
        Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp`
        objects, each with the correct ``tz``.

        >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
        >>> ser.to_numpy(dtype=object)
        array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'),
               Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')],
              dtype=object)

        Or ``dtype='datetime64[ns]'`` to return an ndarray of native
        datetime64 values. The values are converted to UTC and the timezone
        info is dropped.

        >>> ser.to_numpy(dtype="datetime64[ns]")
        ... # doctest: +ELLIPSIS
        array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
              dtype='datetime64[ns]')
        """
        if is_extension_array_dtype(self.dtype):
            return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs)
        else:
            if kwargs:
                msg = "to_numpy() got an unexpected keyword argument '{}'".format(
                    list(kwargs.keys())[0]
                )
                raise TypeError(msg)

        result = np.asarray(self._values, dtype=dtype)
        # TODO(GH-24345): Avoid potential double copy
        if copy or na_value is not lib.no_default:
            result = result.copy()
            if na_value is not lib.no_default:
                result[self.isna()] = na_value
        return result
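
    # Illustrative note (hypothetical session): ``na_value`` picks the
    # missing-value sentinel used in the resulting ndarray.
    #
    #   >>> pd.Series([1.0, None]).to_numpy(na_value=0.0)
    #   array([1., 0.])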

    @property
    def _ndarray_values(self) -> np.ndarray:
        """
        The data as an ndarray, possibly losing information.

        The expectation is that this is cheap to compute, and is primarily
        used for interacting with our indexers.

        - categorical -> codes
        """
        if is_extension_array_dtype(self):
            return self.array._ndarray_values
        # As a mixin, we depend on the mixing class having values.
        # Special mixin syntax may be developed in the future:
        # https://github.com/python/typing/issues/246
        return self.values  # type: ignore

    @property
    def empty(self):
        return not self.size

    def max(self, axis=None, skipna=True, *args, **kwargs):
        """
        Return the maximum value of the Index.

        Parameters
        ----------
        axis : int, optional
            For compatibility with NumPy. Only 0 or None are allowed.
        skipna : bool, default True

        Returns
        -------
        scalar
            Maximum value.

        See Also
        --------
        Index.min : Return the minimum value in an Index.
        Series.max : Return the maximum value in a Series.
        DataFrame.max : Return the maximum values in a DataFrame.

        Examples
        --------
        >>> idx = pd.Index([3, 2, 1])
        >>> idx.max()
        3

        >>> idx = pd.Index(['c', 'b', 'a'])
        >>> idx.max()
        'c'

        For a MultiIndex, the maximum is determined lexicographically.

        >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
        >>> idx.max()
        ('b', 2)
        """
        nv.validate_minmax_axis(axis)
        nv.validate_max(args, kwargs)
        return nanops.nanmax(self._values, skipna=skipna)

    def argmax(self, axis=None, skipna=True, *args, **kwargs):
        """
        Return an ndarray of the maximum argument indexer.

        Parameters
        ----------
        axis : {None}
            Dummy argument for consistency with Series.
        skipna : bool, default True

        Returns
        -------
        numpy.ndarray
            Indices of the maximum values.

        See Also
        --------
        numpy.ndarray.argmax
        """
        nv.validate_minmax_axis(axis)
        nv.validate_argmax_with_skipna(skipna, args, kwargs)
        return nanops.nanargmax(self._values, skipna=skipna)

    def min(self, axis=None, skipna=True, *args, **kwargs):
        """
        Return the minimum value of the Index.

        Parameters
        ----------
        axis : {None}
            Dummy argument for consistency with Series.
        skipna : bool, default True

        Returns
        -------
        scalar
            Minimum value.

        See Also
        --------
        Index.max : Return the maximum value of the object.
        Series.min : Return the minimum value in a Series.
        DataFrame.min : Return the minimum values in a DataFrame.

        Examples
        --------
        >>> idx = pd.Index([3, 2, 1])
        >>> idx.min()
        1

        >>> idx = pd.Index(['c', 'b', 'a'])
        >>> idx.min()
        'a'

        For a MultiIndex, the minimum is determined lexicographically.

        >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
        >>> idx.min()
        ('a', 1)
        """
        nv.validate_minmax_axis(axis)
        nv.validate_min(args, kwargs)
        return nanops.nanmin(self._values, skipna=skipna)

    def argmin(self, axis=None, skipna=True, *args, **kwargs):
        """
        Return an ndarray of the minimum argument indexer.

        Parameters
        ----------
        axis : {None}
            Dummy argument for consistency with Series.
        skipna : bool, default True

        Returns
        -------
        numpy.ndarray

        See Also
        --------
        numpy.ndarray.argmin
        """
        nv.validate_minmax_axis(axis)
        nv.validate_argmax_with_skipna(skipna, args, kwargs)
        return nanops.nanargmin(self._values, skipna=skipna)

    def tolist(self):
        """
        Return a list of the values.

        These are each a scalar type, which is a Python scalar
        (for str, int, float) or a pandas scalar
        (for Timestamp/Timedelta/Interval/Period).

        Returns
        -------
        list

        See Also
        --------
        numpy.ndarray.tolist
        """
        if self.dtype.kind in ["m", "M"]:
            return [com.maybe_box_datetimelike(x) for x in self._values]
        elif is_extension_array_dtype(self._values):
            return list(self._values)
        else:
            return self._values.tolist()

    to_list = tolist

    def __iter__(self):
        """
        Return an iterator of the values.

        These are each a scalar type, which is a Python scalar
        (for str, int, float) or a pandas scalar
        (for Timestamp/Timedelta/Interval/Period).

        Returns
        -------
        iterator
        """
        # We are explicitly making element iterators.
        if self.dtype.kind in ["m", "M"]:
            return map(com.maybe_box_datetimelike, self._values)
        elif is_extension_array_dtype(self._values):
            return iter(self._values)
        else:
            return map(self._values.item, range(self._values.size))
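
    # Illustrative note (hypothetical session): iterating a datetime64 Series
    # yields boxed pandas Timestamps rather than raw numpy values.
    #
    #   >>> next(iter(pd.Series(pd.date_range("2000", periods=1))))
    #   Timestamp('2000-01-01 00:00:00')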

    @cache_readonly
    def hasnans(self):
        """
        Return if I have any nans; enables various perf speedups.
        """
        return bool(isna(self).any())

    def _reduce(
        self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
    ):
        """Perform the reduction type operation if we can."""
        func = getattr(self, name, None)
        if func is None:
            raise TypeError(
                f"{type(self).__name__} cannot perform the operation {name}"
            )
        return func(skipna=skipna, **kwds)

    def _map_values(self, mapper, na_action=None):
        """
        An internal function that maps values using the input
        correspondence (which can be a dict, Series, or function).

        Parameters
        ----------
        mapper : function, dict, or Series
            The input correspondence object.
        na_action : {None, 'ignore'}
            If 'ignore', propagate NA values, without passing them to the
            mapping function.

        Returns
        -------
        Union[Index, MultiIndex], inferred
            The output of the mapping function applied to the index.
            If the function returns a tuple with more than one element
            a MultiIndex will be returned.
        """
        # we can fastpath dict/Series to an efficient map
        # as we know that we are not going to have to yield
        # python types
        if is_dict_like(mapper):
            if isinstance(mapper, dict) and hasattr(mapper, "__missing__"):
                # If a dictionary subclass defines a default value method,
                # convert mapper to a lookup function (GH #15999).
                dict_with_default = mapper
                mapper = lambda x: dict_with_default[x]
            else:
                # Dictionary does not have a default. Thus it's safe to
                # convert to a Series for efficiency.
                # we specify the keys here to handle the
                # possibility that they are tuples

                # The return value of mapping with an empty mapper is
                # expected to be pd.Series(np.nan, ...). As np.nan is
                # of dtype float64 the return value of this method should
                # be float64 as well
                mapper = create_series_with_explicit_dtype(
                    mapper, dtype_if_empty=np.float64
                )

        if isinstance(mapper, ABCSeries):
            # Since values were input this means we came from either
            # a dict or a series and mapper should be an index
            if is_categorical_dtype(self._values):
                # use the built in categorical series mapper which saves
                # time by mapping the categories instead of all values
                return self._values.map(mapper)
            if is_extension_array_dtype(self.dtype):
                values = self._values
            else:
                values = self.values

            indexer = mapper.index.get_indexer(values)
            new_values = algorithms.take_1d(mapper._values, indexer)

            return new_values

        # we must convert to python types
        if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"):
            # GH#23179 some EAs do not have `map`
            values = self._values
            if na_action is not None:
                raise NotImplementedError
            map_f = lambda values, f: values.map(f)
        else:
            values = self.astype(object)
            values = getattr(values, "values", values)
            if na_action == "ignore":

                def map_f(values, f):
                    return lib.map_infer_mask(values, f, isna(values).view(np.uint8))

            else:
                map_f = lib.map_infer

        # mapper is a function
        new_values = map_f(values, mapper)

        return new_values
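
    # Illustrative note (hypothetical session): a dict subclass defining
    # ``__missing__`` (e.g. collections.defaultdict) is applied as a lookup
    # function, so unmatched values get the default instead of NaN.
    #
    #   >>> from collections import defaultdict
    #   >>> s = pd.Series(["cat", "dog"])
    #   >>> s.map(defaultdict(lambda: "unknown", cat="feline"))
    #   0     feline
    #   1    unknown
    #   dtype: object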

    def value_counts(
        self, normalize=False, sort=True, ascending=False, bins=None, dropna=True
    ):
        """
        Return a Series containing counts of unique values.

        The resulting object will be in descending order so that the
        first element is the most frequently-occurring element.
        Excludes NA values by default.

        Parameters
        ----------
        normalize : bool, default False
            If True then the object returned will contain the relative
            frequencies of the unique values.
        sort : bool, default True
            Sort by frequencies.
        ascending : bool, default False
            Sort in ascending order.
        bins : int, optional
            Rather than count values, group them into half-open bins,
            a convenience for ``pd.cut``, only works with numeric data.
        dropna : bool, default True
            Don't include counts of NaN.

        Returns
        -------
        Series

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.
        DataFrame.count: Number of non-NA elements in a DataFrame.

        Examples
        --------
        >>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
        >>> index.value_counts()
        3.0    2
        4.0    1
        2.0    1
        1.0    1
        dtype: int64

        With `normalize` set to `True`, returns the relative frequency by
        dividing all values by the sum of values.

        >>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
        >>> s.value_counts(normalize=True)
        3.0    0.4
        4.0    0.2
        2.0    0.2
        1.0    0.2
        dtype: float64

        **bins**

        Bins can be useful for going from a continuous variable to a
        categorical variable; instead of counting unique
        occurrences of values, divide the index in the specified
        number of half-open bins.

        >>> s.value_counts(bins=3)
        (2.0, 3.0]      2
        (0.996, 2.0]    2
        (3.0, 4.0]      1
        dtype: int64

        **dropna**

        With `dropna` set to `False` we can also see NaN index values.

        >>> s.value_counts(dropna=False)
        3.0    2
        NaN    1
        4.0    1
        2.0    1
        1.0    1
        dtype: int64
        """
        result = value_counts(
            self,
            sort=sort,
            ascending=ascending,
            normalize=normalize,
            bins=bins,
            dropna=dropna,
        )
        return result

    def unique(self):
        values = self._values

        if hasattr(values, "unique"):
            result = values.unique()
        else:
            result = unique1d(values)

        return result

    def nunique(self, dropna=True):
        """
        Return number of unique elements in the object.

        Excludes NA values by default.

        Parameters
        ----------
        dropna : bool, default True
            Don't include NaN in the count.

        Returns
        -------
        int

        See Also
        --------
        DataFrame.nunique: Method nunique for DataFrame.
        Series.count: Count non-NA/null observations in the Series.

        Examples
        --------
        >>> s = pd.Series([1, 3, 5, 7, 7])
        >>> s
        0    1
        1    3
        2    5
        3    7
        4    7
        dtype: int64

        >>> s.nunique()
        4
        """
        uniqs = self.unique()
        n = len(uniqs)
        if dropna and isna(uniqs).any():
            n -= 1
        return n

    @property
    def is_unique(self):
        """
        Return boolean if values in the object are unique.

        Returns
        -------
        bool
        """
        return self.nunique(dropna=False) == len(self)

    @property
    def is_monotonic(self):
        """
        Return boolean if values in the object are
        monotonic_increasing.

        Returns
        -------
        bool
        """
        from pandas import Index

        return Index(self).is_monotonic

    is_monotonic_increasing = is_monotonic

    @property
    def is_monotonic_decreasing(self) -> bool:
        """
        Return boolean if values in the object are
        monotonic_decreasing.

        Returns
        -------
        bool
        """
        from pandas import Index

        return Index(self).is_monotonic_decreasing

    def memory_usage(self, deep=False):
        """
        Memory usage of the values.

        Parameters
        ----------
        deep : bool
            Introspect the data deeply, interrogate
            `object` dtypes for system-level memory consumption.

        Returns
        -------
        bytes used

        See Also
        --------
        numpy.ndarray.nbytes

        Notes
        -----
        Memory usage does not include memory consumed by elements that
        are not components of the array if deep=False or if used on PyPy.
        """
        if hasattr(self.array, "memory_usage"):
            return self.array.memory_usage(deep=deep)

        v = self.array.nbytes
        if deep and is_object_dtype(self) and not PYPY:
            v += lib.memory_usage_of_objects(self._values)
        return v
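
    # Illustrative note (hypothetical byte counts): for object dtype,
    # ``deep=True`` also counts the Python objects the array points to.
    #
    #   >>> idx = pd.Index(["a", "bb", "ccc"])
    #   >>> idx.memory_usage()           # pointer storage only
    #   24
    #   >>> idx.memory_usage(deep=True)  # plus per-string object overhead
    #   180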

    @Substitution(
        values="",
        order="",
        size_hint="",
        sort=textwrap.dedent(
            """\
            sort : bool, default False
                Sort `uniques` and shuffle `codes` to maintain the
                relationship.
            """
        ),
    )
    @Appender(algorithms._shared_docs["factorize"])
    def factorize(self, sort=False, na_sentinel=-1):
        return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)

    _shared_docs[
        "searchsorted"
    ] = """
        Find indices where elements should be inserted to maintain order.

        Find the indices into a sorted %(klass)s `self` such that, if the
        corresponding elements in `value` were inserted before the indices,
        the order of `self` would be preserved.

        .. note::

            The %(klass)s *must* be monotonically sorted, otherwise
            wrong locations will likely be returned. Pandas does *not*
            check this for you.

        Parameters
        ----------
        value : array_like
            Values to insert into `self`.
        side : {'left', 'right'}, optional
            If 'left', the index of the first suitable location found is given.
            If 'right', return the last such index. If there is no suitable
            index, return either 0 or N (where N is the length of `self`).
        sorter : 1-D array_like, optional
            Optional array of integer indices that sort `self` into ascending
            order. They are typically the result of ``np.argsort``.

        Returns
        -------
        int or array of int
            A scalar or array of insertion points with the
            same shape as `value`.

            .. versionchanged:: 0.24.0
                If `value` is a scalar, an int is now always returned.
                Previously, scalar inputs returned a 1-item array for
                :class:`Series` and :class:`Categorical`.

        See Also
        --------
        sort_values
        numpy.searchsorted

        Notes
        -----
        Binary search is used to find the required insertion points.

        Examples
        --------
        >>> x = pd.Series([1, 2, 3])
        >>> x
        0    1
        1    2
        2    3
        dtype: int64

        >>> x.searchsorted(4)
        3

        >>> x.searchsorted([0, 4])
        array([0, 3])

        >>> x.searchsorted([1, 3], side='left')
        array([0, 2])

        >>> x.searchsorted([1, 3], side='right')
        array([1, 3])

        >>> x = pd.Categorical(['apple', 'bread', 'bread',
        ...                     'cheese', 'milk'], ordered=True)
        >>> x
        [apple, bread, bread, cheese, milk]
        Categories (4, object): [apple < bread < cheese < milk]

        >>> x.searchsorted('bread')
        1

        >>> x.searchsorted(['bread'], side='right')
        array([3])

        If the values are not monotonically sorted, wrong locations
        may be returned:

        >>> x = pd.Series([2, 1, 3])
        >>> x.searchsorted(1)
        0  # wrong result, correct would be 1
        """

    @Substitution(klass="Index")
    @Appender(_shared_docs["searchsorted"])
    def searchsorted(self, value, side="left", sorter=None):
        return algorithms.searchsorted(self._values, value, side=side, sorter=sorter)

    def drop_duplicates(self, keep="first", inplace=False):
        inplace = validate_bool_kwarg(inplace, "inplace")
        if isinstance(self, ABCIndexClass):
            if self.is_unique:
                return self._shallow_copy()

        duplicated = self.duplicated(keep=keep)
        result = self[np.logical_not(duplicated)]
        if inplace:
            return self._update_inplace(result)
        else:
            return result

    def duplicated(self, keep="first"):
        if isinstance(self, ABCIndexClass):
            if self.is_unique:
                return np.zeros(len(self), dtype=bool)
            return duplicated(self, keep=keep)
        else:
            return self._constructor(
                duplicated(self, keep=keep), index=self.index
            ).__finalize__(self)
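
    # Illustrative note (hypothetical session): ``keep`` controls which
    # occurrence is marked by ``duplicated`` / kept by ``drop_duplicates``.
    #
    #   >>> s = pd.Series([1, 2, 2, 3])
    #   >>> s.duplicated(keep="first").tolist()
    #   [False, False, True, False]
    #   >>> s.drop_duplicates(keep="last").tolist()
    #   [1, 2, 3]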

    # ----------------------------------------------------------------------
    # abstracts

    def _update_inplace(self, result, verify_is_copy=True, **kwargs):
        raise AbstractMethodError(self)