Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import operator 

2from shutil import get_terminal_size 

3from typing import Dict, Hashable, List, Type, Union, cast 

4from warnings import warn 

5 

6import numpy as np 

7 

8from pandas._config import get_option 

9 

10from pandas._libs import algos as libalgos, hashtable as htable 

11from pandas._typing import ArrayLike, Dtype, Ordered, Scalar 

12from pandas.compat.numpy import function as nv 

13from pandas.util._decorators import ( 

14 Appender, 

15 Substitution, 

16 cache_readonly, 

17 deprecate_kwarg, 

18) 

19from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs 

20 

21from pandas.core.dtypes.cast import coerce_indexer_dtype, maybe_infer_to_datetimelike 

22from pandas.core.dtypes.common import ( 

23 ensure_int64, 

24 ensure_object, 

25 ensure_platform_int, 

26 is_categorical_dtype, 

27 is_datetime64_dtype, 

28 is_dict_like, 

29 is_dtype_equal, 

30 is_extension_array_dtype, 

31 is_integer_dtype, 

32 is_iterator, 

33 is_list_like, 

34 is_object_dtype, 

35 is_scalar, 

36 is_sequence, 

37 is_timedelta64_dtype, 

38 needs_i8_conversion, 

39) 

40from pandas.core.dtypes.dtypes import CategoricalDtype 

41from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries 

42from pandas.core.dtypes.inference import is_hashable 

43from pandas.core.dtypes.missing import isna, notna 

44 

45from pandas.core import ops 

46from pandas.core.accessor import PandasDelegate, delegate_names 

47import pandas.core.algorithms as algorithms 

48from pandas.core.algorithms import _get_data_algo, factorize, take, take_1d, unique1d 

49from pandas.core.arrays.base import ( 

50 ExtensionArray, 

51 _extension_array_shared_docs, 

52 try_cast_to_ea, 

53) 

54from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs 

55import pandas.core.common as com 

56from pandas.core.construction import array, extract_array, sanitize_array 

57from pandas.core.indexers import check_array_indexer, deprecate_ndim_indexing 

58from pandas.core.missing import interpolate_2d 

59from pandas.core.ops.common import unpack_zerodim_and_defer 

60from pandas.core.sorting import nargsort 

61 

62from pandas.io.formats import console 

63 

64 

def _cat_compare_op(op):
    """
    Build the rich-comparison method ``__<op>__`` for :class:`Categorical`.

    Parameters
    ----------
    op : callable
        A comparison function such as ``operator.eq`` or ``operator.lt``.
        Its ``__name__`` selects the ndarray dunder applied to the codes.

    Returns
    -------
    callable
        ``func(self, other)`` returning a boolean ndarray.

    Raises
    ------
    ValueError
        (from ``func``) if ``other`` is list-like with a different length.
    TypeError
        (from ``func``) for ordering comparisons on unordered data, for
        Categoricals with different categories / orderedness, for ordering
        comparisons against a scalar that is not a category, or against an
        unsupported type.

    Notes
    -----
    A missing value (code ``-1``) compares False under every operator except
    ``!=``, where it compares True. This matches the result already returned
    when comparing against a scalar that is not one of the categories.
    """
    opname = f"__{op.__name__}__"
    # Result to store wherever either operand is missing.
    fill_value = opname == "__ne__"

    @unpack_zerodim_and_defer(opname)
    def func(self, other):
        if is_list_like(other) and len(other) != len(self):
            # TODO: Could this fail if the categories are listlike objects?
            raise ValueError("Lengths must match.")

        if not self.ordered:
            if opname in ["__lt__", "__gt__", "__le__", "__ge__"]:
                raise TypeError(
                    "Unordered Categoricals can only compare equality or not"
                )
        if isinstance(other, Categorical):
            # Two Categoricals can only be compared if the categories are
            # the same (maybe up to ordering, depending on ordered)

            msg = "Categoricals can only be compared if 'categories' are the same."
            if len(self.categories) != len(other.categories):
                raise TypeError(msg + " Categories are different lengths")
            elif self.ordered and not (self.categories == other.categories).all():
                raise TypeError(msg)
            elif not set(self.categories) == set(other.categories):
                raise TypeError(msg)

            if not (self.ordered == other.ordered):
                raise TypeError(
                    "Categoricals can only be compared if 'ordered' is the same"
                )
            if not self.ordered and not self.categories.equals(other.categories):
                # both unordered and different order: express ``other`` in
                # terms of our own categories before comparing codes
                other_codes = _get_codes_for_values(other, self.categories)
            else:
                other_codes = other._codes

            f = getattr(self._codes, opname)
            ret = f(other_codes)
            mask = (self._codes == -1) | (other_codes == -1)
            if mask.any():
                # Missing values: False for everything but ``!=``.
                ret[mask] = fill_value
            return ret

        if is_scalar(other):
            if other in self.categories:
                i = self.categories.get_loc(other)
                ret = getattr(self._codes, opname)(i)

                if opname not in {"__eq__", "__ge__", "__gt__"}:
                    # ``i`` is always >= 0, so for ==, >= and > the code -1
                    # already yields False. For !=, <= and < it yields True
                    # and has to be overwritten explicitly.
                    mask = self._codes == -1
                    ret[mask] = fill_value
                return ret
            else:
                if opname == "__eq__":
                    return np.zeros(len(self), dtype=bool)
                elif opname == "__ne__":
                    return np.ones(len(self), dtype=bool)
                else:
                    raise TypeError(
                        f"Cannot compare a Categorical for op {opname} with a "
                        "scalar, which is not a category."
                    )
        else:

            # allow categorical vs object dtype array comparisons for equality
            # these are only positional comparisons
            if opname in ["__eq__", "__ne__"]:
                return getattr(np.array(self), opname)(np.array(other))

            raise TypeError(
                f"Cannot compare a Categorical for op {opname} with "
                f"type {type(other)}.\nIf you want to compare values, "
                "use 'np.asarray(cat) <op> other'."
            )

    func.__name__ = opname

    return func

145 

146 

def contains(cat, key, container):
    """
    Check whether ``key`` is one of the values referenced by ``container``.

    Shared helper behind :meth:`Categorical.__contains__` and
    :meth:`CategoricalIndex.__contains__`.

    Parameters
    ----------
    cat : :class:`Categorical` or :class:`CategoricalIndex`
        Object whose ``categories`` are searched for ``key``.
    key : hashable
        The key to check membership for.
    container : Container (e.g. list-like or mapping)
        Collection of category positions to look the result up in.

    Returns
    -------
    bool
        True if ``key`` is found in ``cat.categories`` and its position in
        the categories is present in ``container``; otherwise False.

    Raises
    ------
    TypeError
        If ``key`` is not hashable.

    Notes
    -----
    NaN is not handled here; callers are expected to deal with it before
    calling this helper.
    """
    # Fail fast on unhashable keys.
    hash(key)

    # A failed lookup means the key is not a category at all, so it cannot
    # be referenced by ``container`` either.
    try:
        position = cat.categories.get_loc(key)
    except (KeyError, TypeError):
        return False

    # Being a category is not enough: the position must actually be used.
    #   'b' in Categorical(['a'], categories=['a', 'b'])  # False
    if not is_scalar(position):
        # e.g. IntervalIndex categories give back an array of positions
        return any(item in container for item in position)
    return position in container

196 

197 

198_codes_doc = """ 

199The category codes of this categorical. 

200 

201Level codes are an array of integers which are the positions of the real

202values in the categories array. 

203 

204There is no setter, use the other categorical methods and the normal item

205setter to change values in the categorical. 

206""" 

207 

208 

209class Categorical(ExtensionArray, PandasObject): 

210 """ 

211 Represent a categorical variable in classic R / S-plus fashion. 

212 

213 `Categoricals` can take on only a limited, and usually fixed, number

214 of possible values (`categories`). In contrast to statistical categorical 

215 variables, a `Categorical` might have an order, but numerical operations 

216 (additions, divisions, ...) are not possible. 

217 

218 All values of the `Categorical` are either in `categories` or `np.nan`. 

219 Assigning values outside of `categories` will raise a `ValueError`. Order 

220 is defined by the order of the `categories`, not lexical order of the 

221 values. 

222 

223 Parameters 

224 ---------- 

225 values : list-like 

226 The values of the categorical. If categories are given, values not in 

227 categories will be replaced with NaN. 

228 categories : Index-like (unique), optional 

229 The unique categories for this categorical. If not given, the 

230 categories are assumed to be the unique values of `values` (sorted, if 

231 possible, otherwise in the order in which they appear). 

232 ordered : bool, default False 

233 Whether or not this categorical is treated as an ordered categorical.

234 If True, the resulting categorical will be ordered. 

235 An ordered categorical respects, when sorted, the order of its 

236 `categories` attribute (which in turn is the `categories` argument, if 

237 provided). 

238 dtype : CategoricalDtype 

239 An instance of ``CategoricalDtype`` to use for this categorical. 

240 

241 .. versionadded:: 0.21.0 

242 

243 Attributes 

244 ---------- 

245 categories : Index 

246 The categories of this categorical 

247 codes : ndarray 

248 The codes (integer positions, which point to the categories) of this 

249 categorical, read only. 

250 ordered : bool 

251 Whether or not this Categorical is ordered. 

252 dtype : CategoricalDtype 

253 The instance of ``CategoricalDtype`` storing the ``categories`` 

254 and ``ordered``. 

255 

256 .. versionadded:: 0.21.0 

257 

258 Methods 

259 ------- 

260 from_codes 

261 __array__ 

262 

263 Raises 

264 ------ 

265 ValueError 

266 If the categories do not validate. 

267 TypeError 

268 If an explicit ``ordered=True`` is given but no `categories` and the 

269 `values` are not sortable. 

270 

271 See Also 

272 -------- 

273 CategoricalDtype : Type for categorical data. 

274 CategoricalIndex : An Index with an underlying ``Categorical``. 

275 

276 Notes 

277 ----- 

278 See the `user guide 

279 <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`_ 

280 for more. 

281 

282 Examples 

283 -------- 

284 >>> pd.Categorical([1, 2, 3, 1, 2, 3]) 

285 [1, 2, 3, 1, 2, 3] 

286 Categories (3, int64): [1, 2, 3] 

287 

288 >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']) 

289 [a, b, c, a, b, c] 

290 Categories (3, object): [a, b, c] 

291 

292 Ordered `Categoricals` can be sorted according to the custom order 

293 of the categories and can have a min and max value. 

294 

295 >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True, 

296 ... categories=['c', 'b', 'a']) 

297 >>> c 

298 [a, b, c, a, b, c] 

299 Categories (3, object): [c < b < a] 

300 >>> c.min() 

301 'c' 

302 """ 

303 

304 # For comparisons, so that numpy uses our implementation if the compare 

305 # ops, which raise 

306 __array_priority__ = 1000 

307 _dtype = CategoricalDtype(ordered=False) 

308 # tolist is not actually deprecated, just suppressed in the __dir__ 

309 _deprecations = PandasObject._deprecations | frozenset(["tolist"]) 

310 _typ = "categorical" 

311 

312 def __init__( 

313 self, values, categories=None, ordered=None, dtype=None, fastpath=False 

314 ): 

315 

316 dtype = CategoricalDtype._from_values_or_dtype( 

317 values, categories, ordered, dtype 

318 ) 

319 # At this point, dtype is always a CategoricalDtype, but 

320 # we may have dtype.categories be None, and we need to 

321 # infer categories in a factorization step further below 

322 

323 if fastpath: 

324 self._codes = coerce_indexer_dtype(values, dtype.categories) 

325 self._dtype = self._dtype.update_dtype(dtype) 

326 return 

327 

328 # null_mask indicates missing values we want to exclude from inference. 

329 # This means: only missing values in list-likes (not arrays/ndframes). 

330 null_mask = np.array(False) 

331 

332 # sanitize input 

333 if is_categorical_dtype(values): 

334 if dtype.categories is None: 

335 dtype = CategoricalDtype(values.categories, dtype.ordered) 

336 elif not isinstance(values, (ABCIndexClass, ABCSeries)): 

337 # sanitize_array coerces np.nan to a string under certain versions 

338 # of numpy 

339 values = maybe_infer_to_datetimelike(values, convert_dates=True) 

340 if not isinstance(values, np.ndarray): 

341 values = _convert_to_list_like(values) 

342 

343 # By convention, empty lists result in object dtype: 

344 if len(values) == 0: 

345 sanitize_dtype = "object" 

346 else: 

347 sanitize_dtype = None 

348 null_mask = isna(values) 

349 if null_mask.any(): 

350 values = [values[idx] for idx in np.where(~null_mask)[0]] 

351 values = sanitize_array(values, None, dtype=sanitize_dtype) 

352 

353 if dtype.categories is None: 

354 try: 

355 codes, categories = factorize(values, sort=True) 

356 except TypeError: 

357 codes, categories = factorize(values, sort=False) 

358 if dtype.ordered: 

359 # raise, as we don't have a sortable data structure and so 

360 # the user should give us one by specifying categories 

361 raise TypeError( 

362 "'values' is not ordered, please " 

363 "explicitly specify the categories order " 

364 "by passing in a categories argument." 

365 ) 

366 except ValueError: 

367 

368 # FIXME 

369 raise NotImplementedError( 

370 "> 1 ndim Categorical are not supported at this time" 

371 ) 

372 

373 # we're inferring from values 

374 dtype = CategoricalDtype(categories, dtype.ordered) 

375 

376 elif is_categorical_dtype(values): 

377 old_codes = ( 

378 values._values.codes if isinstance(values, ABCSeries) else values.codes 

379 ) 

380 codes = _recode_for_categories( 

381 old_codes, values.dtype.categories, dtype.categories 

382 ) 

383 

384 else: 

385 codes = _get_codes_for_values(values, dtype.categories) 

386 

387 if null_mask.any(): 

388 # Reinsert -1 placeholders for previously removed missing values 

389 full_codes = -np.ones(null_mask.shape, dtype=codes.dtype) 

390 full_codes[~null_mask] = codes 

391 codes = full_codes 

392 

393 self._dtype = self._dtype.update_dtype(dtype) 

394 self._codes = coerce_indexer_dtype(codes, dtype.categories) 

395 

396 @property 

397 def categories(self): 

398 """ 

399 The categories of this categorical. 

400 

401 Setting assigns new values to each category (effectively a rename of 

402 each individual category). 

403 

404 The assigned value has to be a list-like object. All items must be 

405 unique and the number of items in the new categories must be the same 

406 as the number of items in the old categories. 

407 

408 Assigning to `categories` is a inplace operation! 

409 

410 Raises 

411 ------ 

412 ValueError 

413 If the new categories do not validate as categories or if the 

414 number of new categories is unequal the number of old categories 

415 

416 See Also 

417 -------- 

418 rename_categories 

419 reorder_categories 

420 add_categories 

421 remove_categories 

422 remove_unused_categories 

423 set_categories 

424 """ 

425 return self.dtype.categories 

426 

427 @categories.setter 

428 def categories(self, categories): 

429 new_dtype = CategoricalDtype(categories, ordered=self.ordered) 

430 if self.dtype.categories is not None and len(self.dtype.categories) != len( 

431 new_dtype.categories 

432 ): 

433 raise ValueError( 

434 "new categories need to have the same number of " 

435 "items as the old categories!" 

436 ) 

437 self._dtype = new_dtype 

438 

439 @property 

440 def ordered(self) -> Ordered: 

441 """ 

442 Whether the categories have an ordered relationship. 

443 """ 

444 return self.dtype.ordered 

445 

446 @property 

447 def dtype(self) -> CategoricalDtype: 

448 """ 

449 The :class:`~pandas.api.types.CategoricalDtype` for this instance. 

450 """ 

451 return self._dtype 

452 

453 @property 

454 def _ndarray_values(self) -> np.ndarray: 

455 return self.codes 

456 

457 @property 

458 def _constructor(self) -> Type["Categorical"]: 

459 return Categorical 

460 

461 @classmethod 

462 def _from_sequence(cls, scalars, dtype=None, copy=False): 

463 return Categorical(scalars, dtype=dtype) 

464 

465 def _formatter(self, boxed=False): 

466 # Defer to CategoricalFormatter's formatter. 

467 return None 

468 

469 def copy(self) -> "Categorical": 

470 """ 

471 Copy constructor. 

472 """ 

473 return self._constructor( 

474 values=self._codes.copy(), dtype=self.dtype, fastpath=True 

475 ) 

476 

477 def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: 

478 """ 

479 Coerce this type to another dtype 

480 

481 Parameters 

482 ---------- 

483 dtype : numpy dtype or pandas type 

484 copy : bool, default True 

485 By default, astype always returns a newly allocated object. 

486 If copy is set to False and dtype is categorical, the original 

487 object is returned. 

488 """ 

489 if is_categorical_dtype(dtype): 

490 dtype = cast(Union[str, CategoricalDtype], dtype) 

491 

492 # GH 10696/18593 

493 dtype = self.dtype.update_dtype(dtype) 

494 self = self.copy() if copy else self 

495 if dtype == self.dtype: 

496 return self 

497 return self._set_dtype(dtype) 

498 if is_extension_array_dtype(dtype): 

499 return array(self, dtype=dtype, copy=copy) # type: ignore # GH 28770 

500 if is_integer_dtype(dtype) and self.isna().any(): 

501 raise ValueError("Cannot convert float NaN to integer") 

502 return np.array(self, dtype=dtype, copy=copy) 

503 

504 @cache_readonly 

505 def size(self) -> int: 

506 """ 

507 Return the len of myself. 

508 """ 

509 return self._codes.size 

510 

511 @cache_readonly 

512 def itemsize(self) -> int: 

513 """ 

514 return the size of a single category 

515 """ 

516 return self.categories.itemsize 

517 

518 def tolist(self) -> List[Scalar]: 

519 """ 

520 Return a list of the values. 

521 

522 These are each a scalar type, which is a Python scalar 

523 (for str, int, float) or a pandas scalar 

524 (for Timestamp/Timedelta/Interval/Period) 

525 """ 

526 return list(self) 

527 

528 to_list = tolist 

529 

530 @classmethod 

531 def _from_inferred_categories( 

532 cls, inferred_categories, inferred_codes, dtype, true_values=None 

533 ): 

534 """ 

535 Construct a Categorical from inferred values. 

536 

537 For inferred categories (`dtype` is None) the categories are sorted. 

538 For explicit `dtype`, the `inferred_categories` are cast to the 

539 appropriate type. 

540 

541 Parameters 

542 ---------- 

543 inferred_categories : Index 

544 inferred_codes : Index 

545 dtype : CategoricalDtype or 'category' 

546 true_values : list, optional 

547 If none are provided, the default ones are 

548 "True", "TRUE", and "true." 

549 

550 Returns 

551 ------- 

552 Categorical 

553 """ 

554 from pandas import Index, to_numeric, to_datetime, to_timedelta 

555 

556 cats = Index(inferred_categories) 

557 known_categories = ( 

558 isinstance(dtype, CategoricalDtype) and dtype.categories is not None 

559 ) 

560 

561 if known_categories: 

562 # Convert to a specialized type with `dtype` if specified. 

563 if dtype.categories.is_numeric(): 

564 cats = to_numeric(inferred_categories, errors="coerce") 

565 elif is_datetime64_dtype(dtype.categories): 

566 cats = to_datetime(inferred_categories, errors="coerce") 

567 elif is_timedelta64_dtype(dtype.categories): 

568 cats = to_timedelta(inferred_categories, errors="coerce") 

569 elif dtype.categories.is_boolean(): 

570 if true_values is None: 

571 true_values = ["True", "TRUE", "true"] 

572 

573 cats = cats.isin(true_values) 

574 

575 if known_categories: 

576 # Recode from observation order to dtype.categories order. 

577 categories = dtype.categories 

578 codes = _recode_for_categories(inferred_codes, cats, categories) 

579 elif not cats.is_monotonic_increasing: 

580 # Sort categories and recode for unknown categories. 

581 unsorted = cats.copy() 

582 categories = cats.sort_values() 

583 

584 codes = _recode_for_categories(inferred_codes, unsorted, categories) 

585 dtype = CategoricalDtype(categories, ordered=False) 

586 else: 

587 dtype = CategoricalDtype(cats, ordered=False) 

588 codes = inferred_codes 

589 

590 return cls(codes, dtype=dtype, fastpath=True) 

591 

592 @classmethod 

593 def from_codes(cls, codes, categories=None, ordered=None, dtype=None): 

594 """ 

595 Make a Categorical type from codes and categories or dtype. 

596 

597 This constructor is useful if you already have codes and 

598 categories/dtype and so do not need the (computation intensive) 

599 factorization step, which is usually done on the constructor. 

600 

601 If your data does not follow this convention, please use the normal 

602 constructor. 

603 

604 Parameters 

605 ---------- 

606 codes : array-like of int 

607 An integer array, where each integer points to a category in 

608 categories or dtype.categories, or else is -1 for NaN. 

609 categories : index-like, optional 

610 The categories for the categorical. Items need to be unique. 

611 If the categories are not given here, then they must be provided 

612 in `dtype`. 

613 ordered : bool, optional 

614 Whether or not this categorical is treated as an ordered 

615 categorical. If not given here or in `dtype`, the resulting 

616 categorical will be unordered. 

617 dtype : CategoricalDtype or "category", optional 

618 If :class:`CategoricalDtype`, cannot be used together with 

619 `categories` or `ordered`. 

620 

621 .. versionadded:: 0.24.0 

622 

623 When `dtype` is provided, neither `categories` nor `ordered` 

624 should be provided. 

625 

626 Returns 

627 ------- 

628 Categorical 

629 

630 Examples 

631 -------- 

632 >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True) 

633 >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype) 

634 [a, b, a, b] 

635 Categories (2, object): [a < b] 

636 """ 

637 dtype = CategoricalDtype._from_values_or_dtype( 

638 categories=categories, ordered=ordered, dtype=dtype 

639 ) 

640 if dtype.categories is None: 

641 msg = ( 

642 "The categories must be provided in 'categories' or " 

643 "'dtype'. Both were None." 

644 ) 

645 raise ValueError(msg) 

646 

647 if is_extension_array_dtype(codes) and is_integer_dtype(codes): 

648 # Avoid the implicit conversion of Int to object 

649 if isna(codes).any(): 

650 raise ValueError("codes cannot contain NA values") 

651 codes = codes.to_numpy(dtype=np.int64) 

652 else: 

653 codes = np.asarray(codes) 

654 if len(codes) and not is_integer_dtype(codes): 

655 raise ValueError("codes need to be array-like integers") 

656 

657 if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1): 

658 raise ValueError("codes need to be between -1 and len(categories)-1") 

659 

660 return cls(codes, dtype=dtype, fastpath=True) 

661 

662 def _get_codes(self): 

663 """ 

664 Get the codes. 

665 

666 Returns 

667 ------- 

668 codes : integer array view 

669 A non writable view of the `codes` array. 

670 """ 

671 v = self._codes.view() 

672 v.flags.writeable = False 

673 return v 

674 

675 def _set_codes(self, codes): 

676 """ 

677 Not settable by the user directly 

678 """ 

679 raise ValueError("cannot set Categorical codes directly") 

680 

681 codes = property(fget=_get_codes, fset=_set_codes, doc=_codes_doc) 

682 

683 def _set_categories(self, categories, fastpath=False): 

684 """ 

685 Sets new categories inplace 

686 

687 Parameters 

688 ---------- 

689 fastpath : bool, default False 

690 Don't perform validation of the categories for uniqueness or nulls 

691 

692 Examples 

693 -------- 

694 >>> c = pd.Categorical(['a', 'b']) 

695 >>> c 

696 [a, b] 

697 Categories (2, object): [a, b] 

698 

699 >>> c._set_categories(pd.Index(['a', 'c'])) 

700 >>> c 

701 [a, c] 

702 Categories (2, object): [a, c] 

703 """ 

704 

705 if fastpath: 

706 new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered) 

707 else: 

708 new_dtype = CategoricalDtype(categories, ordered=self.ordered) 

709 if ( 

710 not fastpath 

711 and self.dtype.categories is not None 

712 and len(new_dtype.categories) != len(self.dtype.categories) 

713 ): 

714 raise ValueError( 

715 "new categories need to have the same number of " 

716 "items than the old categories!" 

717 ) 

718 

719 self._dtype = new_dtype 

720 

721 def _set_dtype(self, dtype: CategoricalDtype) -> "Categorical": 

722 """ 

723 Internal method for directly updating the CategoricalDtype 

724 

725 Parameters 

726 ---------- 

727 dtype : CategoricalDtype 

728 

729 Notes 

730 ----- 

731 We don't do any validation here. It's assumed that the dtype is 

732 a (valid) instance of `CategoricalDtype`. 

733 """ 

734 codes = _recode_for_categories(self.codes, self.categories, dtype.categories) 

735 return type(self)(codes, dtype=dtype, fastpath=True) 

736 

737 def set_ordered(self, value, inplace=False): 

738 """ 

739 Set the ordered attribute to the boolean value. 

740 

741 Parameters 

742 ---------- 

743 value : bool 

744 Set whether this categorical is ordered (True) or not (False). 

745 inplace : bool, default False 

746 Whether or not to set the ordered attribute in-place or return 

747 a copy of this categorical with ordered set to the value. 

748 """ 

749 inplace = validate_bool_kwarg(inplace, "inplace") 

750 new_dtype = CategoricalDtype(self.categories, ordered=value) 

751 cat = self if inplace else self.copy() 

752 cat._dtype = new_dtype 

753 if not inplace: 

754 return cat 

755 

756 def as_ordered(self, inplace=False): 

757 """ 

758 Set the Categorical to be ordered. 

759 

760 Parameters 

761 ---------- 

762 inplace : bool, default False 

763 Whether or not to set the ordered attribute in-place or return 

764 a copy of this categorical with ordered set to True. 

765 

766 Returns 

767 ------- 

768 Categorical 

769 Ordered Categorical. 

770 """ 

771 inplace = validate_bool_kwarg(inplace, "inplace") 

772 return self.set_ordered(True, inplace=inplace) 

773 

774 def as_unordered(self, inplace=False): 

775 """ 

776 Set the Categorical to be unordered. 

777 

778 Parameters 

779 ---------- 

780 inplace : bool, default False 

781 Whether or not to set the ordered attribute in-place or return 

782 a copy of this categorical with ordered set to False. 

783 

784 Returns 

785 ------- 

786 Categorical 

787 Unordered Categorical. 

788 """ 

789 inplace = validate_bool_kwarg(inplace, "inplace") 

790 return self.set_ordered(False, inplace=inplace) 

791 

792 def set_categories(self, new_categories, ordered=None, rename=False, inplace=False): 

793 """ 

794 Set the categories to the specified new_categories. 

795 

796 `new_categories` can include new categories (which will result in 

797 unused categories) or remove old categories (which results in values 

798 set to NaN). If `rename==True`, the categories will simple be renamed 

799 (less or more items than in old categories will result in values set to 

800 NaN or in unused categories respectively). 

801 

802 This method can be used to perform more than one action of adding, 

803 removing, and reordering simultaneously and is therefore faster than 

804 performing the individual steps via the more specialised methods. 

805 

806 On the other hand this methods does not do checks (e.g., whether the 

807 old categories are included in the new categories on a reorder), which 

808 can result in surprising changes, for example when using special string 

809 dtypes, which does not considers a S1 string equal to a single char 

810 python string. 

811 

812 Parameters 

813 ---------- 

814 new_categories : Index-like 

815 The categories in new order. 

816 ordered : bool, default False 

817 Whether or not the categorical is treated as a ordered categorical. 

818 If not given, do not change the ordered information. 

819 rename : bool, default False 

820 Whether or not the new_categories should be considered as a rename 

821 of the old categories or as reordered categories. 

822 inplace : bool, default False 

823 Whether or not to reorder the categories in-place or return a copy 

824 of this categorical with reordered categories. 

825 

826 Returns 

827 ------- 

828 Categorical with reordered categories or None if inplace. 

829 

830 Raises 

831 ------ 

832 ValueError 

833 If new_categories does not validate as categories 

834 

835 See Also 

836 -------- 

837 rename_categories 

838 reorder_categories 

839 add_categories 

840 remove_categories 

841 remove_unused_categories 

842 """ 

843 inplace = validate_bool_kwarg(inplace, "inplace") 

844 if ordered is None: 

845 ordered = self.dtype.ordered 

846 new_dtype = CategoricalDtype(new_categories, ordered=ordered) 

847 

848 cat = self if inplace else self.copy() 

849 if rename: 

850 if cat.dtype.categories is not None and len(new_dtype.categories) < len( 

851 cat.dtype.categories 

852 ): 

853 # remove all _codes which are larger and set to -1/NaN 

854 cat._codes[cat._codes >= len(new_dtype.categories)] = -1 

855 else: 

856 codes = _recode_for_categories( 

857 cat.codes, cat.categories, new_dtype.categories 

858 ) 

859 cat._codes = codes 

860 cat._dtype = new_dtype 

861 

862 if not inplace: 

863 return cat 

864 

865 def rename_categories(self, new_categories, inplace=False): 

866 """ 

867 Rename categories. 

868 

869 Parameters 

870 ---------- 

871 new_categories : list-like, dict-like or callable 

872 

873 New categories which will replace old categories. 

874 

875 * list-like: all items must be unique and the number of items in 

876 the new categories must match the existing number of categories. 

877 

878 * dict-like: specifies a mapping from 

879 old categories to new. Categories not contained in the mapping 

880 are passed through and extra categories in the mapping are 

881 ignored. 

882 

883 .. versionadded:: 0.21.0. 

884 

885 * callable : a callable that is called on all items in the old 

886 categories and whose return values comprise the new categories. 

887 

888 .. versionadded:: 0.23.0. 

889 

890 inplace : bool, default False 

891 Whether or not to rename the categories inplace or return a copy of 

892 this categorical with renamed categories. 

893 

894 Returns 

895 ------- 

896 cat : Categorical or None 

897 With ``inplace=False``, the new categorical is returned. 

898 With ``inplace=True``, there is no return value. 

899 

900 Raises 

901 ------ 

902 ValueError 

903 If new categories are list-like and do not have the same number of 

904 items than the current categories or do not validate as categories 

905 

906 See Also 

907 -------- 

908 reorder_categories 

909 add_categories 

910 remove_categories 

911 remove_unused_categories 

912 set_categories 

913 

914 Examples 

915 -------- 

916 >>> c = pd.Categorical(['a', 'a', 'b']) 

917 >>> c.rename_categories([0, 1]) 

918 [0, 0, 1] 

919 Categories (2, int64): [0, 1] 

920 

921 For dict-like ``new_categories``, extra keys are ignored and 

922 categories not in the dictionary are passed through 

923 

924 >>> c.rename_categories({'a': 'A', 'c': 'C'}) 

925 [A, A, b] 

926 Categories (2, object): [A, b] 

927 

928 You may also provide a callable to create the new categories 

929 

930 >>> c.rename_categories(lambda x: x.upper()) 

931 [A, A, B] 

932 Categories (2, object): [A, B] 

933 """ 

934 inplace = validate_bool_kwarg(inplace, "inplace") 

935 cat = self if inplace else self.copy() 

936 

937 if is_dict_like(new_categories): 

938 cat.categories = [new_categories.get(item, item) for item in cat.categories] 

939 elif callable(new_categories): 

940 cat.categories = [new_categories(item) for item in cat.categories] 

941 else: 

942 cat.categories = new_categories 

943 if not inplace: 

944 return cat 

945 

def reorder_categories(self, new_categories, ordered=None, inplace=False):
    """
    Put the existing categories into a new order.

    ``new_categories`` has to be a rearrangement of the current
    categories: every old category must appear and nothing new may be
    introduced.

    Parameters
    ----------
    new_categories : Index-like
        The current categories, listed in the desired order.
    ordered : bool, optional
        Whether the result is treated as an ordered categorical. When
        omitted, the value is forwarded as ``None`` to ``set_categories``.
    inplace : bool, default False
        Reorder this object instead of returning a reordered copy.

    Returns
    -------
    cat : Categorical with reordered categories or None if inplace.

    Raises
    ------
    ValueError
        If the set of ``new_categories`` differs from the set of current
        categories.

    See Also
    --------
    rename_categories
    add_categories
    remove_categories
    remove_unused_categories
    set_categories
    """
    inplace = validate_bool_kwarg(inplace, "inplace")

    current = set(self.dtype.categories)
    requested = set(new_categories)
    if current == requested:
        # Same members, so only the order (and maybe `ordered`) changes.
        return self.set_categories(new_categories, ordered=ordered, inplace=inplace)

    raise ValueError("items in new_categories are not the same as in old categories")

988 

def add_categories(self, new_categories, inplace=False):
    """
    Add new categories.

    `new_categories` will be included at the last/highest place in the
    categories and will be unused directly after this call.

    Parameters
    ----------
    new_categories : category or list-like of category
        The new categories to be included. Any iterable is accepted,
        including single-pass iterators such as generators.
    inplace : bool, default False
        Whether or not to add the categories inplace or return a copy of
        this categorical with added categories.

    Returns
    -------
    cat : Categorical with new categories added or None if inplace.

    Raises
    ------
    ValueError
        If the new categories include old categories or do not validate as
        categories

    See Also
    --------
    rename_categories
    reorder_categories
    remove_categories
    remove_unused_categories
    set_categories
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    if not is_list_like(new_categories):
        new_categories = [new_categories]

    # Materialize once. The input is walked twice below (overlap check, then
    # concatenation); a generator would be exhausted by the first pass and
    # silently contribute no categories.
    new_categories = list(new_categories)

    already_included = set(new_categories) & set(self.dtype.categories)
    if already_included:
        raise ValueError(
            f"new categories must not include old categories: {already_included}"
        )

    combined = list(self.dtype.categories) + new_categories
    new_dtype = CategoricalDtype(combined, self.ordered)

    cat = self if inplace else self.copy()
    cat._dtype = new_dtype
    # More categories may need a wider integer type for the codes.
    cat._codes = coerce_indexer_dtype(cat._codes, new_dtype.categories)
    if not inplace:
        return cat

1038 

def remove_categories(self, removals, inplace=False):
    """
    Remove the specified categories.

    `removals` must be included in the old categories. Values which were in
    the removed categories will be set to NaN

    Parameters
    ----------
    removals : category or list-like of categories
        The categories which should be removed. Lists, tuples, sets and
        single-pass iterators are all accepted.
    inplace : bool, default False
        Whether or not to remove the categories inplace or return a copy of
        this categorical with removed categories.

    Returns
    -------
    cat : Categorical with removed categories or None if inplace.

    Raises
    ------
    ValueError
        If the removals are not contained in the categories

    See Also
    --------
    rename_categories
    reorder_categories
    add_categories
    remove_unused_categories
    set_categories
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    if not is_list_like(removals):
        removals = [removals]

    # Normalize to a list. `removals` is consumed twice (set construction and
    # the NA scan), and ``isna`` only works element-wise on list/array input;
    # for a tuple, set or generator it returns one scalar bool, which the
    # ``any(...)`` call below cannot iterate.
    removals = list(removals)

    removal_set = set(removals)
    not_included = removal_set - set(self.dtype.categories)
    new_categories = [c for c in self.dtype.categories if c not in removal_set]

    # GH 10156
    # NA is never a real category, so asking to remove it is not an error:
    # drop NA from both the "unknown" set and the remaining categories.
    if any(isna(removals)):
        not_included = {x for x in not_included if notna(x)}
        new_categories = [x for x in new_categories if notna(x)]

    if len(not_included) != 0:
        raise ValueError(f"removals must all be in old categories: {not_included}")

    return self.set_categories(
        new_categories, ordered=self.ordered, rename=False, inplace=inplace
    )

1090 

def remove_unused_categories(self, inplace=False):
    """
    Drop every category that no value refers to.

    Parameters
    ----------
    inplace : bool, default False
        Modify this object instead of returning a trimmed copy.

    Returns
    -------
    cat : Categorical with unused categories dropped or None if inplace.

    See Also
    --------
    rename_categories
    reorder_categories
    add_categories
    remove_categories
    set_categories
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    cat = self if inplace else self.copy()

    # `used_codes` are the distinct codes present (sorted); `recoded` maps
    # every element to its position within `used_codes`.
    used_codes, recoded = np.unique(cat._codes, return_inverse=True)

    has_na = used_codes.size != 0 and used_codes[0] == -1
    if has_na:
        # The NA sentinel (-1) sorts first: discard it from the used codes
        # and shift the recoded values so NA maps back to -1.
        used_codes = used_codes[1:]
        recoded = recoded - 1

    kept_categories = cat.dtype.categories.take(used_codes)
    new_dtype = CategoricalDtype._from_fastpath(kept_categories, ordered=self.ordered)
    cat._dtype = new_dtype
    cat._codes = coerce_indexer_dtype(recoded, new_dtype.categories)

    if not inplace:
        return cat

1129 

def map(self, mapper):
    """
    Translate the categories through a dict, Series, or function.

    The mapper is applied to the categories, not to each element. If
    building a Categorical from the mapped categories succeeds (for
    example because the mapping is one-to-one), a
    :class:`~pandas.Categorical` with the same ``ordered`` flag is
    returned. If that construction raises ``ValueError``, the result
    falls back to a :class:`~pandas.Index` of the mapped values, with
    missing entries kept as NaN.

    With a `dict` or :class:`~pandas.Series` mapper, categories that have
    no entry are mapped to `NaN`; in that case an :class:`~pandas.Index`
    is returned.

    Parameters
    ----------
    mapper : function, dict, or Series
        Mapping correspondence.

    Returns
    -------
    pandas.Categorical or pandas.Index
        Mapped categorical.

    See Also
    --------
    CategoricalIndex.map : Apply a mapping correspondence on a
        :class:`~pandas.CategoricalIndex`.
    Index.map : Apply a mapping correspondence on an
        :class:`~pandas.Index`.
    Series.map : Apply a mapping correspondence on a
        :class:`~pandas.Series`.
    Series.apply : Apply more complex functions on a
        :class:`~pandas.Series`.

    Examples
    --------
    >>> cat = pd.Categorical(['a', 'b', 'c'])
    >>> cat.map(lambda x: x.upper())
    [A, B, C]
    Categories (3, object): [A, B, C]
    >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'})
    [first, second, third]
    Categories (3, object): [first, second, third]

    A mapping that is not one-to-one yields an :class:`~pandas.Index`:

    >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'})
    Index(['first', 'second', 'first'], dtype='object')

    So does a `dict` that leaves a category unmapped:

    >>> cat.map({'a': 'first', 'b': 'second'})
    Index(['first', 'second', nan], dtype='object')
    """
    mapped = self.categories.map(mapper)
    try:
        return self.from_codes(
            self._codes.copy(), categories=mapped, ordered=self.ordered
        )
    except ValueError:
        # The mapped categories could not form a Categorical; fall back to
        # a plain take over them.
        # NA values are stored as code -1, and np.take with -1 picks the
        # final element, so append NaN there for missing entries to land on.
        if np.any(self._codes == -1):
            mapped = mapped.insert(len(mapped), np.nan)
        return np.take(mapped, self._codes)

1210 

# Rich comparison dunders. Each one is produced by the ``_cat_compare_op``
# factory (defined elsewhere in this module) from the matching function in
# the standard-library ``operator`` module.
__eq__ = _cat_compare_op(operator.eq)
__ne__ = _cat_compare_op(operator.ne)
__lt__ = _cat_compare_op(operator.lt)
__gt__ = _cat_compare_op(operator.gt)
__le__ = _cat_compare_op(operator.le)
__ge__ = _cat_compare_op(operator.ge)

1217 

# for Series/ndarray like compat
@property
def shape(self):
    """
    One-element shape tuple of the Categorical.

    Provided for internal compatibility with numpy arrays.

    Returns
    -------
    shape : tuple
        ``(n,)`` where ``n`` is the number of codes.
    """
    return (len(self._codes),)

1232 

def shift(self, periods, fill_value=None):
    """
    Move the values by ``periods`` positions.

    Parameters
    ----------
    periods : int
        Number of periods to move, can be positive or negative
    fill_value : object, optional
        Scalar used for the positions vacated by the shift. It must be
        missing (NA) or one of the categories.

        .. versionadded:: 0.24.0

    Returns
    -------
    shifted : Categorical

    Raises
    ------
    ValueError
        If ``fill_value`` is neither NA nor one of the categories (only
        checked when a shift actually takes place).
    NotImplementedError
        If the codes have more than one dimension.
    """
    # Categoricals are always 1-dimensional, so there is no axis argument.
    codes = self.codes
    if codes.ndim > 1:
        raise NotImplementedError("Categorical with ndim > 1.")

    nothing_to_do = not (np.prod(codes.shape) and (periods != 0))
    if nothing_to_do:
        return self.from_codes(codes, dtype=self.dtype)

    # np.roll wraps values around; the wrapped-in slots are overwritten below.
    rolled = np.roll(codes, ensure_platform_int(periods), axis=0)

    if isna(fill_value):
        fill_code = -1
    elif fill_value in self.categories:
        fill_code = self.categories.get_loc(fill_value)
    else:
        raise ValueError(
            f"'fill_value={fill_value}' is not present "
            "in this Categorical's categories"
        )

    vacated = slice(None, periods) if periods > 0 else slice(periods, None)
    rolled[vacated] = fill_code

    return self.from_codes(rolled, dtype=self.dtype)

1272 

def __array__(self, dtype=None) -> np.ndarray:
    """
    Implement the numpy array interface.

    Returns
    -------
    numpy.array
        The category values looked up by code. The result uses ``dtype``
        when one is given and it differs from the categories' dtype;
        otherwise it keeps the dtype of ``categorical.categories``.
    """
    looked_up = take_1d(self.categories.values, self._codes)

    wants_other_dtype = dtype and not is_dtype_equal(dtype, self.categories.dtype)
    if wants_other_dtype:
        return np.asarray(looked_up, dtype)

    # For a Categorical backed by an ExtensionArray (e.g. Interval), make
    # sure __array__ still ends up returning a real ndarray.
    return np.asarray(looked_up) if is_extension_array_dtype(looked_up) else looked_up

1293 

def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
    """
    Handle numpy ufuncs applied to this object.

    The call is first offered to ``ops.maybe_dispatch_ufunc_to_dunder_op``
    so that binary ops use the class's own dunder methods. Anything that
    helper declines (returns ``NotImplemented``) raises ``TypeError``.
    """
    dispatched = ops.maybe_dispatch_ufunc_to_dunder_op(
        self, ufunc, method, *inputs, **kwargs
    )
    if dispatched is NotImplemented:
        # for all other cases, raise for now (similarly as what happens in
        # Series.__array_prepare__)
        raise TypeError(
            f"Object with dtype {self.dtype} cannot perform "
            f"the numpy op {ufunc.__name__}"
        )
    return dispatched

1308 

def __setstate__(self, state):
    """
    Restore the object from its pickled ``state`` dict.

    Every key/value pair in ``state`` is set as an attribute on the
    instance. States without a ``"_dtype"`` entry (pickles from before the
    0.21.0 CategoricalDtype change) get one built from their
    ``"_categories"`` and ``"_ordered"`` entries first.

    Raises
    ------
    Exception
        If ``state`` is not a dict.
    """
    if not isinstance(state, dict):
        raise Exception("invalid pickle state")

    # compat with pre 0.21.0 CategoricalDtype change
    if "_dtype" not in state:
        legacy_categories = state["_categories"]
        legacy_ordered = state["_ordered"]
        state["_dtype"] = CategoricalDtype(legacy_categories, legacy_ordered)

    for attr_name in state:
        setattr(self, attr_name, state[attr_name])

1320 

@property
def T(self):
    """
    Return the object itself.

    No transposition is performed here: the property hands back ``self``
    unchanged.
    """
    return self

1327 

@property
def nbytes(self):
    """
    Total bytes held by the codes array plus the categories' values.
    """
    codes_bytes = self._codes.nbytes
    categories_bytes = self.dtype.categories.values.nbytes
    return codes_bytes + categories_bytes

1331 

def memory_usage(self, deep=False):
    """
    Memory consumed by the codes and the categories.

    Parameters
    ----------
    deep : bool
        Forwarded to the categories' ``memory_usage``: introspect the data
        deeply, interrogating `object` dtypes for system-level memory
        consumption.

    Returns
    -------
    bytes used

    Notes
    -----
    Memory usage does not include memory consumed by elements that
    are not components of the array if deep=False

    See Also
    --------
    numpy.ndarray.nbytes
    """
    codes_bytes = self._codes.nbytes
    categories_bytes = self.dtype.categories.memory_usage(deep=deep)
    return codes_bytes + categories_bytes

1356 

@Substitution(klass="Categorical")
@Appender(_shared_docs["searchsorted"])
def searchsorted(self, value, side="left", sorter=None):
    # NOTE: deliberately no docstring here; the decorators above build it
    # from the shared "searchsorted" template.
    #
    # searchsorted is very performance sensitive. Translating the looked-up
    # value(s) into codes of the same dtype as self.codes is much faster.
    codes_dtype = self.codes.dtype
    if is_scalar(value):
        location = self.categories.get_loc(value)
        needle = codes_dtype.type(location)
    else:
        locations = [self.categories.get_loc(item) for item in value]
        needle = np.array(locations, dtype=codes_dtype)
    return self.codes.searchsorted(needle, side=side, sorter=sorter)

1369 

def isna(self):
    """
    Flag the missing values.

    An element is missing when its code is -1.

    Returns
    -------
    a boolean array of whether my values are null

    See Also
    --------
    isna : Top-level isna.
    isnull : Alias of isna.
    Categorical.notna : Boolean inverse of Categorical.isna.

    """
    return self._codes == -1

isnull = isna

1392 

def notna(self):
    """
    Flag the values that are not missing.

    This is the element-wise negation of :meth:`isna`.

    Returns
    -------
    a boolean array of whether my values are not null

    See Also
    --------
    notna : Top-level notna.
    notnull : Alias of notna.
    Categorical.isna : Boolean inverse of Categorical.notna.

    """
    missing = self.isna()
    return ~missing

notnull = notna

1414 

def put(self, *args, **kwargs):
    """
    Replace specific elements in the Categorical with given values.

    Not supported yet: every call raises ``NotImplementedError``.
    """
    raise NotImplementedError("'put' is not yet implemented for Categorical")

1420 

def dropna(self):
    """
    Return the Categorical with the missing values removed.

    The object is indexed with the boolean mask from :meth:`notna`.

    Returns
    -------
    valid : Categorical
    """
    return self[self.notna()]

1434 

def value_counts(self, dropna=True):
    """
    Count the occurrences of each category and return them as a Series.

    Every category will have an entry, even those with a count of 0.

    Parameters
    ----------
    dropna : bool, default True
        Don't include counts of NaN.

    Returns
    -------
    counts : Series

    See Also
    --------
    Series.value_counts
    """
    from pandas import Series, CategoricalIndex

    code = self._codes
    cat = self.categories
    ncat = len(cat)
    mask = 0 <= code  # True where the element is not missing
    ix = np.arange(ncat)
    clean = mask.all()

    if dropna or clean:
        obs = code if clean else code[mask]
        count = np.bincount(obs, minlength=ncat or 0)
    else:
        # Count missing values in an extra bin at position `ncat`, and give
        # that bin the NA code (-1) in the result index.
        count = np.bincount(np.where(mask, code, ncat))
        ix = np.append(ix, -1)

    index_values = self._constructor(ix, dtype=self.dtype, fastpath=True)

    return Series(count, index=CategoricalIndex(index_values), dtype="int64")

1470 

def _internal_get_values(self):
    """
    Return the values.

    Used for internal compatibility with pandas formatting.

    Returns
    -------
    np.ndarray or Index
        A numpy array of the same dtype as categorical.categories.dtype or
        Index if datetime / periods.
    """
    categories = self.categories

    # Datetime-like and period categories: return an Index to keep metadata.
    if needs_i8_conversion(categories):
        return categories.take(self._codes, fill_value=np.nan)

    # Integer categories with missing values: go through object dtype so
    # the NaN fill is possible.
    if is_integer_dtype(categories) and -1 in self._codes:
        return categories.astype("object").take(self._codes, fill_value=np.nan)

    return np.array(self)

1489 

def check_for_ordered(self, op):
    """
    Raise ``TypeError`` naming ``op`` unless this Categorical is ordered.
    """
    if self.ordered:
        return

    raise TypeError(
        f"Categorical is not ordered for operation {op}\n"
        "you can use .as_ordered() to change the "
        "Categorical to an ordered one\n"
    )

1498 

def _values_for_argsort(self):
    """
    Return a copy of the codes as the values to arg-sort on.
    """
    # A copy is handed out rather than the internal ``_codes`` array itself.
    return self._codes.copy()

1501 

def argsort(self, ascending=True, kind="quicksort", *args, **kwargs):
    """
    Return the indices that would sort the Categorical.

    .. versionchanged:: 0.25.0

       Changed to sort missing values at the end.

    Parameters
    ----------
    ascending : bool, default True
        Whether the indices should result in an ascending
        or descending sort.
    kind : {'quicksort', 'mergesort', 'heapsort'}, optional
        Sorting algorithm.
    *args, **kwargs:
        passed through to :func:`numpy.argsort`.

    Returns
    -------
    numpy.array

    See Also
    --------
    numpy.ndarray.argsort

    Notes
    -----
    While an ordering is applied to the category values, arg-sorting
    in this context refers more to organizing and grouping together
    based on matching category values. Thus, this function can be
    called on an unordered Categorical instance unlike the functions
    'Categorical.min' and 'Categorical.max'.

    Examples
    --------
    >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort()
    array([2, 0, 1, 3])

    >>> cat = pd.Categorical(['b', 'b', 'a', 'c'],
    ...                      categories=['c', 'b', 'a'],
    ...                      ordered=True)
    >>> cat.argsort()
    array([3, 0, 1, 2])

    Missing values are placed at the end

    >>> cat = pd.Categorical([2, None, 1])
    >>> cat.argsort()
    array([2, 0, 1])
    """
    # Pure delegation to the parent class; this override appears to exist
    # only to carry the Categorical-specific docstring above.
    return super().argsort(ascending=ascending, kind=kind, *args, **kwargs)

1554 

def sort_values(self, inplace=False, ascending=True, na_position="last"):
    """
    Sort the Categorical by category value, returning a new Categorical
    unless ``inplace`` is set.

    While an ordering is applied to the category values, sorting in this
    context refers more to organizing and grouping together based on
    matching category values. Thus, this function can be called on an
    unordered Categorical instance unlike the functions 'Categorical.min'
    and 'Categorical.max'.

    Parameters
    ----------
    inplace : bool, default False
        Do operation in place.
    ascending : bool, default True
        Order ascending. Passing False orders descending.
    na_position : {'first', 'last'} (optional, default='last')
        'first' puts NaNs at the beginning
        'last' puts NaNs at the end

    Returns
    -------
    Categorical or None
        The sorted Categorical, or None when ``inplace=True``.

    Raises
    ------
    ValueError
        If ``na_position`` is neither ``'first'`` nor ``'last'``.

    See Also
    --------
    Series.sort_values

    Examples
    --------
    >>> c = pd.Categorical([1, 2, 2, 1, 5])
    >>> c.sort_values()
    [1, 1, 2, 2, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, 1, 1]
    Categories (3, int64): [1, 2, 5]

    Inplace sorting can be done as well:

    >>> c.sort_values(inplace=True)
    >>> c
    [1, 1, 2, 2, 5]
    Categories (3, int64): [1, 2, 5]
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    if na_position not in ("last", "first"):
        raise ValueError(f"invalid na_position: {na_position!r}")

    order = nargsort(self, ascending=ascending, na_position=na_position)
    reordered_codes = self._codes[order]

    if inplace:
        self._codes = reordered_codes
        return None

    return self._constructor(values=reordered_codes, dtype=self.dtype, fastpath=True)

1641 

def _values_for_rank(self):
    """
    Produce the values used to rank this Categorical. See GH#15420

    Ordered data is ranked by its codes, with the -1 sentinel turned into
    NaN. Unordered data with numeric categories is ranked by its values.
    Otherwise the categories are replaced by their own ranks so that
    ranking works on floats rather than on an object array.

    Returns
    -------
    numpy.array

    """
    from pandas import Series

    if self.ordered:
        codes = self.codes
        missing = codes == -1
        if not missing.any():
            return codes
        # A float copy is needed to be able to hold NaN.
        as_float = codes.astype("float64")
        as_float[missing] = np.nan
        return as_float

    if self.categories.is_numeric():
        return np.array(self)

    # reorder the categories (so rank can use the float codes)
    # instead of passing an object array to rank
    category_ranks = Series(self.categories).rank().values
    return np.array(self.rename_categories(category_ranks))

1671 

def view(self, dtype=None):
    """
    Return a new object built on the same codes and dtype.

    Raises
    ------
    NotImplementedError
        If any ``dtype`` is requested.
    """
    if dtype is None:
        return self._constructor(values=self._codes, dtype=self.dtype, fastpath=True)
    raise NotImplementedError(dtype)

1676 

def to_dense(self):
    """
    Return my 'dense' representation

    For internal compatibility with numpy arrays. The conversion is
    simply ``np.asarray(self)``.

    Returns
    -------
    dense : array
    """
    return np.asarray(self)

1688 

def fillna(self, value=None, method=None, limit=None):
    """
    Fill NA/NaN values using the specified method.

    The receiver is never modified; a new Categorical is returned.

    Parameters
    ----------
    value : scalar, dict, Series
        If a scalar value is passed it is used to fill all missing values.
        Alternatively, a Series or dict can be used to fill in different
        values for each index. The value should not be a list. The
        value(s) passed should either be in the categories or should be
        NaN.
    method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        Method to use for filling holes in reindexed Series
        pad / ffill: propagate last valid observation forward to next valid
        backfill / bfill: use NEXT valid observation to fill gap
    limit : int, default None
        (Not implemented yet for Categorical!)
        If method is specified, this is the maximum number of consecutive
        NaN values to forward/backward fill. In other words, if there is
        a gap with more than this number of consecutive NaNs, it will only
        be partially filled. If method is not specified, this is the
        maximum number of entries along the entire axis where NaNs will be
        filled.

    Returns
    -------
    filled : Categorical with NA/NaN filled

    Raises
    ------
    NotImplementedError
        If ``limit`` is given.
    ValueError
        If a fill value is neither NaN nor one of the categories.
    TypeError
        If ``value`` is not a scalar, dict or Series.
    """
    value, method = validate_fillna_kwargs(
        value, method, validate_scalar_dict_value=False
    )

    if value is None:
        value = np.nan
    if limit is not None:
        raise NotImplementedError(
            "specifying a limit for fillna has not been implemented yet"
        )

    codes = self._codes

    # pad / bfill
    if method is not None:

        values = self.to_dense().reshape(-1, len(self))
        values = interpolate_2d(values, method, 0, None, value).astype(
            self.categories.dtype
        )[0]
        codes = _get_codes_for_values(values, self.categories)

    else:

        # If value is a dict or a Series (a dict value has already
        # been converted to a Series)
        if isinstance(value, ABCSeries):
            if not value[~value.isin(self.categories)].isna().all():
                raise ValueError("fill value must be in categories")

            values_codes = _get_codes_for_values(value, self.categories)
            indexer = np.where(codes == -1)
            # Work on a copy: `codes` still aliases self._codes here, and
            # assigning into it would silently fill the original object too.
            codes = codes.copy()
            codes[indexer] = values_codes[indexer]

        # If value is not a dict or Series it should be a scalar
        elif is_hashable(value):
            if not isna(value) and value not in self.categories:
                raise ValueError("fill value must be in categories")

            mask = codes == -1
            if mask.any():
                codes = codes.copy()
                if isna(value):
                    codes[mask] = -1
                else:
                    codes[mask] = self.categories.get_loc(value)

        else:
            raise TypeError(
                f"'value' parameter must be a scalar, dict "
                f"or Series, but you passed a {type(value).__name__}"
            )

    return self._constructor(codes, dtype=self.dtype, fastpath=True)

1772 

def take(self, indexer, allow_fill: bool = False, fill_value=None):
    """
    Take elements from the Categorical.

    Parameters
    ----------
    indexer : sequence of int
        The indices in `self` to take. The meaning of negative values in
        `indexer` depends on the value of `allow_fill`.
    allow_fill : bool, default False
        How to handle negative values in `indexer`.

        * False: negative values in `indexer` indicate positional indices
          from the right. This is similar to
          :func:`numpy.take`.

        * True: negative values in `indexer` indicate missing values.
          These values are set to `fill_value`. Any other negative
          values raise a ``ValueError``.

        .. versionchanged:: 1.0.0

           Default value changed from ``True`` to ``False``.

    fill_value : object
        The value to use for `indexer` entries that are missing (-1), when
        ``allow_fill=True``. This should be the category, i.e. a value
        in ``self.categories``, not a code.

    Returns
    -------
    Categorical
        This Categorical will have the same categories and ordered as
        `self`.

    Raises
    ------
    TypeError
        If ``allow_fill=True`` and ``fill_value`` is neither NA nor one of
        the categories.

    See Also
    --------
    Series.take : Similar method for Series.
    numpy.ndarray.take : Similar method for NumPy arrays.

    Examples
    --------
    >>> cat = pd.Categorical(['a', 'a', 'b'])
    >>> cat
    [a, a, b]
    Categories (2, object): [a, b]

    Specify ``allow_fill==False`` to have negative indices mean indexing
    from the right.

    >>> cat.take([0, -1, -2], allow_fill=False)
    [a, b, a]
    Categories (2, object): [a, b]

    With ``allow_fill=True``, indices equal to ``-1`` mean "missing"
    values that should be filled with the `fill_value`, which is
    ``np.nan`` by default.

    >>> cat.take([0, -1, -1], allow_fill=True)
    [a, NaN, NaN]
    Categories (2, object): [a, b]

    The fill value can be specified.

    >>> cat.take([0, -1, -1], allow_fill=True, fill_value='a')
    [a, a, a]
    Categories (2, object): [a, b]

    Specifying a fill value that's not in ``self.categories``
    will raise a ``TypeError``.
    """
    positions = np.asarray(indexer, dtype=np.intp)
    dtype = self.dtype

    # Translate the user-facing fill value into a code.
    if isna(fill_value):
        fill_code = -1
    elif not allow_fill:
        # Without filling, the value is forwarded exactly as given.
        fill_code = fill_value
    elif fill_value in self.categories:
        fill_code = self.categories.get_loc(fill_value)
    else:
        raise TypeError(
            f"'fill_value' ('{fill_value}') is not in this "
            "Categorical's categories."
        )

    new_codes = take(
        self._codes, positions, allow_fill=allow_fill, fill_value=fill_code
    )
    return type(self).from_codes(new_codes, dtype=dtype)

1864 

def take_nd(self, indexer, allow_fill: bool = False, fill_value=None):
    """
    Deprecated alias of :meth:`take`.

    Emits a ``FutureWarning`` and forwards all arguments to ``take``.
    """
    # GH#27745 deprecate alias that other EAs dont have
    deprecation_message = (
        "Categorical.take_nd is deprecated, use Categorical.take instead"
    )
    warn(deprecation_message, FutureWarning, stacklevel=2)
    return self.take(indexer, allow_fill=allow_fill, fill_value=fill_value)

1873 

def __len__(self) -> int:
    """
    The length of this Categorical, i.e. the number of entries in its
    codes array.
    """
    return len(self._codes)

1879 

def __iter__(self):
    """
    Return an iterator over the values of this Categorical.

    The values come from ``_internal_get_values()`` and are turned into a
    plain list before iterating.
    """
    values = self._internal_get_values()
    return iter(values.tolist())

1885 

def __contains__(self, key) -> bool:
    """
    Return True if `key` is in this Categorical.

    A missing scalar key (e.g. NaN) is "contained" exactly when the
    Categorical has at least one missing value; any other key is looked up
    through the module-level ``contains`` helper against the codes.
    """
    key_is_missing = is_scalar(key) and isna(key)
    if key_is_missing:
        return self.isna().any()

    return contains(self, key, container=self._codes)

1895 

def _tidy_repr(self, max_vals=10, footer=True) -> str:
    """
    Build a shortened repr showing at most ``max_vals`` values.

    The first ``max_vals // 2`` and the last ``max_vals - max_vals // 2``
    values are shown, joined by ``", ..., "``. The footer is appended on
    its own line unless ``footer`` is false.
    """
    n_head = max_vals // 2
    n_tail = max_vals - n_head

    head_repr = self[:n_head]._get_repr(length=False, footer=False)
    tail_repr = self[-n_tail:]._get_repr(length=False, footer=False)

    # Strip the closing bracket of the head and the opening bracket of the
    # tail so the two pieces merge into a single bracketed list.
    text = f"{head_repr[:-1]}, ..., {tail_repr[1:]}"
    if footer:
        text = f"{text}\n{self._repr_footer()}"

    return str(text)

1909 

def _repr_categories(self):
    """
    Return the formatted category labels used by the repr.

    When there are more categories than the ``display.max_categories``
    option allows (10 if the option is 0), only the leading and trailing
    halves are kept, separated by a literal ``"..."`` entry.
    """
    configured = get_option("display.max_categories")
    max_categories = 10 if configured == 0 else configured

    from pandas.io.formats import format as fmt

    categories = self.categories
    if len(categories) > max_categories:
        half = max_categories // 2
        leading = fmt.format_array(categories[:half], None)
        trailing = fmt.format_array(categories[-half:], None)
        labels = leading + ["..."] + trailing
    else:
        labels = fmt.format_array(categories, None)

    # Strip all leading spaces, which format_array adds for columns...
    return [label.strip() for label in labels]

1932 

def _repr_categories_info(self) -> str:
    """
    Return the "Categories (n, dtype): [...]" line(s) of the repr footer.

    Labels are joined with ``", "`` (or ``" < "`` when ordered) and wrapped
    onto continuation lines once the running width would exceed the
    ``display.width`` option (falling back to the terminal width). No
    wrapping happens inside an IPython frontend.
    """
    labels = self._repr_categories()
    header = f"Categories ({len(self.categories)}, {self.categories.dtype!s}): "

    terminal_width = get_terminal_size()[0]
    line_limit = get_option("display.width") or terminal_width
    if console.in_ipython_frontend():
        # 0 = no breaks
        line_limit = 0

    sep = " < " if self.ordered else ", "
    sep_len = len(sep)
    # A wrapped line ends with the separator minus its trailing space, and
    # the next line is indented one column past the header.
    continuation = sep.rstrip() + "\n" + " " * (len(header) + 1)

    pieces = []
    col = len(header)
    for position, label in enumerate(labels):
        if line_limit != 0 and col + sep_len + len(label) > line_limit:
            pieces.append(continuation)
            col = len(header) + 1  # header + a whitespace
        elif position:
            pieces.append(sep)
            col += len(label)
        pieces.append(label)

    joined = "".join(pieces)
    # An ordered ellipsis reads better without the "<" signs around it.
    return header + "[" + joined.replace(" < ... < ", " ... ") + "]"

1962 

1963 def _repr_footer(self) -> str: 

1964 info = self._repr_categories_info() 

1965 return f"Length: {len(self)}\n{info}" 

1966 

def _get_repr(self, length=True, na_rep="NaN", footer=True) -> str:
    """
    Render this object through ``CategoricalFormatter`` and return the text.

    ``length``, ``na_rep`` and ``footer`` are forwarded unchanged to the
    formatter.
    """
    from pandas.io.formats import format as fmt

    formatter_options = {"length": length, "na_rep": na_rep, "footer": footer}
    rendered = fmt.CategoricalFormatter(self, **formatter_options).to_string()
    return str(rendered)

1975 

1976 def __repr__(self) -> str: 

1977 """ 

1978 String representation. 

1979 """ 

1980 _maxlen = 10 

1981 if len(self._codes) > _maxlen: 

1982 result = self._tidy_repr(_maxlen) 

1983 elif len(self._codes) > 0: 

1984 result = self._get_repr(length=len(self) > _maxlen) 

1985 else: 

1986 msg = self._get_repr(length=False, footer=True).replace("\n", ", ") 

1987 result = f"[], {msg}" 

1988 

1989 return result 

1990 

1991 def _maybe_coerce_indexer(self, indexer): 

1992 """ 

1993 return an indexer coerced to the codes dtype 

1994 """ 

1995 if isinstance(indexer, np.ndarray) and indexer.dtype.kind == "i": 

1996 indexer = indexer.astype(self._codes.dtype) 

1997 return indexer 

1998 

1999 def __getitem__(self, key): 

2000 """ 

2001 Return an item. 

2002 """ 

2003 if isinstance(key, (int, np.integer)): 

2004 i = self._codes[key] 

2005 if i == -1: 

2006 return np.nan 

2007 else: 

2008 return self.categories[i] 

2009 

2010 key = check_array_indexer(self, key) 

2011 

2012 result = self._codes[key] 

2013 if result.ndim > 1: 

2014 deprecate_ndim_indexing(result) 

2015 return result 

2016 return self._constructor(result, dtype=self.dtype, fastpath=True) 

2017 

def __setitem__(self, key, value):
    """
    Item assignment.

    Raises
    ------
    ValueError
        If (one or more) value is not in categories or if an assigned
        `Categorical` does not have the same categories.
    AssertionError
        If ``key`` is a tuple that is not a valid 1-dimensional indexer.
    """
    from pandas import Index

    value = extract_array(value, extract_numpy=True)

    # a Categorical value must share this object's dtype; if only the
    # category order differs, recode it against our categories
    if isinstance(value, Categorical):
        if not is_dtype_equal(self, value):
            raise ValueError(
                "Cannot set a Categorical with another, "
                "without identical categories"
            )
        if not self.categories.equals(value.categories):
            recoded = _recode_for_categories(
                value.codes, value.categories, self.categories
            )
            value = Categorical.from_codes(recoded, dtype=self.dtype)

    rvalue = value if is_list_like(value) else [value]

    # no assignments of values not in categories, but it's always ok to set
    # something to np.nan
    unknown = Index(rvalue).difference(self.categories)
    if len(unknown) and not isna(unknown).all():
        raise ValueError(
            "Cannot setitem on a Categorical with a new "
            "category, set the categories first"
        )

    # Integer positions, slices and boolean/array keys pass through as-is.
    # Only a tuple of indexers (dataframe) needs unpacking: 1-dimensional
    # slicing is all that is allowed, but in a 2-d case the key can be
    # passed as (slice(None), ...).
    if not isinstance(key, (int, np.integer)) and isinstance(key, tuple):
        n_indexers = len(key)
        if n_indexers == 1:
            key = key[0]
        elif n_indexers == 2 and com.is_null_slice(key[0]):
            key = key[1]
        else:
            raise AssertionError("invalid slicing for a 1-ndim categorical")

    new_codes = self._maybe_coerce_indexer(self.categories.get_indexer(rvalue))

    key = check_array_indexer(self, key)
    self._codes[key] = new_codes

2085 

2086 def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: 

2087 """ 

2088 Compute the inverse of a categorical, returning 

2089 a dict of categories -> indexers. 

2090 

2091 *This is an internal function* 

2092 

2093 Returns 

2094 ------- 

2095 dict of categories -> indexers 

2096 

2097 Examples 

2098 -------- 

2099 >>> c = pd.Categorical(list('aabca')) 

2100 >>> c 

2101 [a, a, b, c, a] 

2102 Categories (3, object): [a, b, c] 

2103 >>> c.categories 

2104 Index(['a', 'b', 'c'], dtype='object') 

2105 >>> c.codes 

2106 array([0, 0, 1, 2, 0], dtype=int8) 

2107 >>> c._reverse_indexer() 

2108 {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])} 

2109 

2110 """ 

2111 categories = self.categories 

2112 r, counts = libalgos.groupsort_indexer( 

2113 self.codes.astype("int64"), categories.size 

2114 ) 

2115 counts = counts.cumsum() 

2116 _result = (r[start:end] for start, end in zip(counts, counts[1:])) 

2117 result = dict(zip(categories, _result)) 

2118 return result 

2119 

2120 # reduction ops # 

2121 def _reduce(self, name, axis=0, **kwargs): 

2122 func = getattr(self, name, None) 

2123 if func is None: 

2124 raise TypeError(f"Categorical cannot perform the operation {name}") 

2125 return func(**kwargs) 

2126 

@deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna")
def min(self, skipna=True, **kwargs):
    """
    The minimum value of the object.

    Only ordered `Categoricals` have a minimum!

    .. versionchanged:: 1.0.0

       Returns an NA value on empty arrays

    Parameters
    ----------
    skipna : bool, default True
        If True, missing values (code -1) are ignored; if False, any
        missing value makes the result NaN.

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.

    Returns
    -------
    min : the minimum of this `Categorical`
    """
    nv.validate_min((), kwargs)
    self.check_for_ordered("min")

    codes = self._codes
    if len(codes) == 0:
        return self.dtype.na_value

    present = codes != -1
    if present.all():
        return self.categories[codes.min()]
    # at least one missing value from here on
    if not (skipna and present.any()):
        return np.nan
    return self.categories[codes[present].min()]

2162 

@deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna")
def max(self, skipna=True, **kwargs):
    """
    The maximum value of the object.

    Only ordered `Categoricals` have a maximum!

    .. versionchanged:: 1.0.0

       Returns an NA value on empty arrays

    Parameters
    ----------
    skipna : bool, default True
        If True, missing values (code -1) are ignored; if False, any
        missing value makes the result NaN.

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.

    Returns
    -------
    max : the maximum of this `Categorical`
    """
    nv.validate_max((), kwargs)
    self.check_for_ordered("max")

    codes = self._codes
    if len(codes) == 0:
        return self.dtype.na_value

    present = codes != -1
    if present.all():
        return self.categories[codes.max()]
    # at least one missing value from here on
    if not (skipna and present.any()):
        return np.nan
    return self.categories[codes[present].max()]

2198 

def mode(self, dropna=True):
    """
    Returns the mode(s) of the Categorical.

    Always returns `Categorical` even if only one value.

    Parameters
    ----------
    dropna : bool, default True
        Don't consider counts of NaN/NaT.

        .. versionadded:: 0.24.0

    Returns
    -------
    modes : `Categorical` (sorted)
    """
    # code -1 marks a missing value; filter those out up front when asked to
    candidates = self._codes[self._codes != -1] if dropna else self._codes
    mode_codes = sorted(htable.mode_int64(ensure_int64(candidates), dropna))
    return self._constructor(values=mode_codes, dtype=self.dtype, fastpath=True)

2222 

def unique(self):
    """
    Return the ``Categorical`` whose ``categories`` and ``codes`` are
    unique. Unused categories are NOT returned.

    - unordered category: values and categories are sorted by appearance
      order.
    - ordered category: values are sorted by appearance order, categories
      keep their existing order.

    Returns
    -------
    unique values : ``Categorical``

    Examples
    --------
    An unordered Categorical will return categories in the
    order of appearance.

    >>> pd.Categorical(list('baabc')).unique()
    [b, a, c]
    Categories (3, object): [b, a, c]

    >>> pd.Categorical(list('baabc'), categories=list('abc')).unique()
    [b, a, c]
    Categories (3, object): [b, a, c]

    An ordered Categorical preserves the category ordering.

    >>> pd.Categorical(list('baabc'),
    ...                categories=list('abc'),
    ...                ordered=True).unique()
    [b, a, c]
    Categories (3, object): [a < b < c]

    See Also
    --------
    unique
    CategoricalIndex.unique
    Series.unique

    """
    # unlike np.unique, unique1d does not sort
    seen_codes = unique1d(self.codes)

    # the result keeps every distinct code, including -1 (nan)
    result = self.copy()
    result._codes = seen_codes

    # -1 cannot index into the categories, so leave it out of the take
    used_codes = seen_codes[seen_codes != -1]
    if self.ordered:
        used_codes = np.sort(used_codes)
    kept_categories = result.categories.take(used_codes)
    return result.set_categories(kept_categories)

2278 

2279 def _values_for_factorize(self): 

2280 codes = self.codes.astype("int64") 

2281 return codes, -1 

2282 

2283 @classmethod 

2284 def _from_factorized(cls, uniques, original): 

2285 return original._constructor( 

2286 original.categories.take(uniques), dtype=original.dtype 

2287 ) 

2288 

def equals(self, other):
    """
    Returns True if categorical arrays are equal.

    Two arrays are equal when their dtypes match (per ``is_dtype_equal``)
    and their codes agree once ``other`` is expressed in terms of this
    object's categories.

    Parameters
    ----------
    other : `Categorical`

    Returns
    -------
    bool
    """
    if not self.is_dtype_equal(other):
        return False

    if self.categories.equals(other.categories):
        # fastpath to avoid re-coding
        comparable_codes = other._codes
    else:
        comparable_codes = _recode_for_categories(
            other.codes, other.categories, self.categories
        )
    return np.array_equal(self._codes, comparable_codes)

2311 

def is_dtype_equal(self, other):
    """
    Returns True if categoricals are the same dtype
    (same categories, and same ordered).

    The comparison is done on the hashes of the two ``dtype`` attributes.

    Parameters
    ----------
    other : Categorical

    Returns
    -------
    bool
        False as well when ``other`` has no ``dtype`` or a dtype cannot be
        hashed.
    """
    try:
        own_hash, other_hash = hash(self.dtype), hash(other.dtype)
    except (AttributeError, TypeError):
        return False
    return own_hash == other_hash

2330 

def describe(self):
    """
    Describes this Categorical

    Returns
    -------
    description: `DataFrame`
        A dataframe with frequency and counts by category; columns are
        ``counts`` and ``freqs`` and the index is named ``categories``.
    """
    from pandas.core.reshape.concat import concat

    # missing values are counted too (dropna=False)
    counts = self.value_counts(dropna=False)
    total = float(counts.sum())

    summary = concat([counts, counts / total], axis=1)
    summary.columns = ["counts", "freqs"]
    summary.index.name = "categories"
    return summary

2350 

@Substitution(klass="Categorical")
@Appender(_extension_array_shared_docs["repeat"])
def repeat(self, repeats, axis=None):
    # The docstring is supplied by the decorators above, so none is written
    # here. ``axis`` is only checked by the numpy-compat validator; the
    # repetition itself is applied to the codes.
    nv.validate_repeat((), {"axis": axis})
    repeated_codes = self._codes.repeat(repeats)
    return self._constructor(values=repeated_codes, dtype=self.dtype, fastpath=True)

2357 

2358 # Implement the ExtensionArray interface 

2359 @property 

2360 def _can_hold_na(self): 

2361 return True 

2362 

@classmethod
def _concat_same_type(cls, to_concat):
    """
    Concatenate several arrays of this type.

    Delegates to ``pandas.core.dtypes.concat.concat_categorical``; the
    import is done locally inside the method.

    Parameters
    ----------
    to_concat : sequence
        The arrays to concatenate, passed through unchanged.
    """
    # first parameter is named ``cls`` (not ``self``): this is a classmethod
    from pandas.core.dtypes.concat import concat_categorical

    return concat_categorical(to_concat)

2368 

def isin(self, values):
    """
    Check whether `values` are contained in Categorical.

    Return a boolean NumPy Array showing whether each element in
    the Categorical matches an element in the passed sequence of
    `values` exactly.

    Parameters
    ----------
    values : set or list-like
        The sequence of values to test. Passing in a single string will
        raise a ``TypeError``. Instead, turn a single string into a
        list of one element.

    Returns
    -------
    isin : numpy.ndarray (bool dtype)

    Raises
    ------
    TypeError
      * If `values` is not a set or list-like

    See Also
    --------
    pandas.Series.isin : Equivalent method on Series.

    Examples
    --------

    >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama',
    ...                'hippo'])
    >>> s.isin(['cow', 'lama'])
    array([ True,  True,  True, False,  True, False])

    Passing a single string as ``s.isin('lama')`` will raise an error. Use
    a list of one element instead:

    >>> s.isin(['lama'])
    array([ True, False,  True, False,  True, False])
    """
    if not is_list_like(values):
        values_type = type(values).__name__
        raise TypeError(
            "only list-like objects are allowed to be passed"
            f" to isin(), you passed a [{values_type}]"
        )

    values = sanitize_array(values, None, None)
    is_null = np.asarray(isna(values))
    positions = self.categories.get_indexer(values)
    # keep the codes of values found among the categories, and keep the
    # -1 produced by null inputs so that missing entries can match too
    wanted_codes = positions[is_null | (positions >= 0)]
    return algorithms.isin(self.codes, wanted_codes)

2422 

def replace(self, to_replace, value, inplace: bool = False):
    """
    Replaces all instances of one value with another

    Parameters
    ----------
    to_replace: object
        The value to be replaced; a list-like replaces each of its
        elements with ``value``.

    value: object
        The value to replace it with

    inplace: bool
        Whether the operation is done in-place

    Returns
    -------
    None if inplace is True, otherwise the new Categorical after replacement


    Examples
    --------
    >>> s = pd.Categorical([1, 2, 1, 3])
    >>> s.replace(1, 3)
    [3, 2, 3, 3]
    Categories (2, int64): [2, 3]
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    cat = self if inplace else self.copy()

    # Build a dict of (to replace -> value) pairs. Only a scalar ``value``
    # is handled here; other cases, like if both to_replace and value are
    # list-like or if to_replace is a dict, are handled separately in NDFrame
    if is_list_like(to_replace):
        replace_dict = dict.fromkeys(to_replace, value)
    else:
        replace_dict = {to_replace: value}

    for old_value, new_value in replace_dict.items():
        # nothing to do for a no-op pair or a value that is not a category
        if new_value == old_value or old_value not in cat.categories:
            continue

        if isna(new_value):
            # replacing with a missing value: just drop the category
            cat.remove_categories(old_value, inplace=True)
            continue

        categories = cat.categories.tolist()
        old_position = categories.index(old_value)
        if new_value in cat.categories:
            # target already exists: repoint the codes, then drop the old one
            cat._codes[cat._codes == old_position] = categories.index(new_value)
            cat.remove_categories(old_value, inplace=True)
        else:
            # target is new: rename the old category in place
            categories[old_position] = new_value
            cat.rename_categories(categories, inplace=True)

    return None if inplace else cat

2481 

2482 

2483# The Series.cat accessor 

2484 

2485 

@delegate_names(
    delegate=Categorical, accessors=["categories", "ordered"], typ="property"
)
@delegate_names(
    delegate=Categorical,
    accessors=[
        "rename_categories",
        "reorder_categories",
        "add_categories",
        "remove_categories",
        "remove_unused_categories",
        "set_categories",
        "as_ordered",
        "as_unordered",
    ],
    typ="method",
)
class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
    """
    Accessor object for categorical properties of the Series values.

    Be aware that assigning to `categories` is an inplace operation, while
    all methods return new categorical data per default (but can be called
    with `inplace=True`).

    Parameters
    ----------
    data : Series or CategoricalIndex

    Examples
    --------
    >>> s.cat.categories
    >>> s.cat.categories = list('abc')
    >>> s.cat.rename_categories(list('cab'))
    >>> s.cat.reorder_categories(list('cab'))
    >>> s.cat.add_categories(['d','e'])
    >>> s.cat.remove_categories(['d'])
    >>> s.cat.remove_unused_categories()
    >>> s.cat.set_categories(list('abcde'))
    >>> s.cat.as_ordered()
    >>> s.cat.as_unordered()
    """

    _deprecations = PandasObject._deprecations | frozenset(
        ["categorical", "index", "name"]
    )

    def __init__(self, data):
        self._validate(data)
        # keep the underlying values plus the index/name needed to wrap
        # method results back into a Series
        self._parent, self._index, self._name = data.values, data.index, data.name
        self._freeze()

    @staticmethod
    def _validate(data):
        """Raise AttributeError unless ``data`` has a categorical dtype."""
        if is_categorical_dtype(data.dtype):
            return
        raise AttributeError("Can only use .cat accessor with a 'category' dtype")

    def _delegate_property_get(self, name):
        """Read the delegated property ``name`` from the underlying values."""
        parent = self._parent
        return getattr(parent, name)

    def _delegate_property_set(self, name, new_values):
        """Assign the delegated property ``name`` on the underlying values."""
        parent = self._parent
        return setattr(parent, name, new_values)

    @property
    def codes(self):
        """
        Return Series of codes as well as the index.
        """
        from pandas import Series

        return Series(self._parent.codes, index=self._index)

    def _delegate_method(self, name, *args, **kwargs):
        """
        Call the delegated method ``name`` on the underlying values.

        A non-None result is wrapped in a Series carrying the original
        index and name; a None result is returned as None.
        """
        from pandas import Series

        outcome = getattr(self._parent, name)(*args, **kwargs)
        if outcome is None:
            return None
        return Series(outcome, index=self._index, name=self._name)

2567 

2568 

2569# utility routines 

2570 

2571 

def _get_codes_for_values(values, categories):
    """
    utility routine to turn values into codes given the specified categories

    A hash table is populated with the positions of ``categories`` and each
    value is then looked up in it; the result is passed through
    ``coerce_indexer_dtype``.
    """
    if is_dtype_equal(values.dtype, categories.dtype):
        # To prevent erroneous dtype coercion in _get_data_algo, retrieve
        # the underlying numpy array. gh-22702
        values = getattr(values, "_ndarray_values", values)
        categories = getattr(categories, "_ndarray_values", categories)
    else:
        compare_as_object = True
        if is_extension_array_dtype(categories.dtype) and is_object_dtype(values):
            # Support inferring the correct extension dtype from an array of
            # scalar objects. e.g.
            # Categorical(array[Period, Period], categories=PeriodIndex(...))
            cls = categories.dtype.construct_array_type()
            values = try_cast_to_ea(cls, values)
            # a failed cast (exception raised in _from_sequence) does not
            # produce an instance of ``cls``, so fall back to object
            compare_as_object = not isinstance(values, cls)
        if compare_as_object:
            values = ensure_object(values)
            categories = ensure_object(categories)

    hash_klass, vals = _get_data_algo(values)
    _, cats = _get_data_algo(categories)
    table = hash_klass(len(cats))
    table.map_locations(cats)
    return coerce_indexer_dtype(table.lookup(vals), cats)

2602 

2603 

2604def _recode_for_categories(codes: np.ndarray, old_categories, new_categories): 

2605 """ 

2606 Convert a set of codes for to a new set of categories 

2607 

2608 Parameters 

2609 ---------- 

2610 codes : np.ndarray 

2611 old_categories, new_categories : Index 

2612 

2613 Returns 

2614 ------- 

2615 new_codes : np.ndarray[np.int64] 

2616 

2617 Examples 

2618 -------- 

2619 >>> old_cat = pd.Index(['b', 'a', 'c']) 

2620 >>> new_cat = pd.Index(['a', 'b']) 

2621 >>> codes = np.array([0, 1, 1, 2]) 

2622 >>> _recode_for_categories(codes, old_cat, new_cat) 

2623 array([ 1, 0, 0, -1]) 

2624 """ 

2625 if len(old_categories) == 0: 

2626 # All null anyway, so just retain the nulls 

2627 return codes.copy() 

2628 elif new_categories.equals(old_categories): 

2629 # Same categories, so no need to actually recode 

2630 return codes.copy() 

2631 indexer = coerce_indexer_dtype( 

2632 new_categories.get_indexer(old_categories), new_categories 

2633 ) 

2634 new_codes = take_1d(indexer, codes.copy(), fill_value=-1) 

2635 return new_codes 

2636 

2637 

2638def _convert_to_list_like(list_like): 

2639 if hasattr(list_like, "dtype"): 

2640 return list_like 

2641 if isinstance(list_like, list): 

2642 return list_like 

2643 if is_sequence(list_like) or isinstance(list_like, tuple) or is_iterator(list_like): 

2644 return list(list_like) 

2645 elif is_scalar(list_like): 

2646 return [list_like] 

2647 else: 

2648 # TODO: is this reached? 

2649 return [list_like] 

2650 

2651 

def factorize_from_iterable(values):
    """
    Factorize an input `values` into `categories` and `codes`. Preserves
    categorical dtype in `categories`.

    *This is an internal function*

    Parameters
    ----------
    values : list-like

    Returns
    -------
    codes : ndarray
    categories : Index or Categorical
        If `values` has a categorical dtype, then `categories` is built
        with ``Categorical.from_codes`` using the dtype of `values`, so it
        keeps the categories and order of `values`. Otherwise it is the
        ``categories`` of a new unordered Categorical.

    Raises
    ------
    TypeError
        If `values` is not list-like.
    """
    if not is_list_like(values):
        raise TypeError("Input must be list-like")

    if not is_categorical_dtype(values):
        # The value of ordered is irrelevant since we don't use cat as such,
        # but only the resulting categories, the order of which is independent
        # from ordered. Set ordered to False as default. See GH #15457
        cat = Categorical(values, ordered=False)
        return cat.codes, cat.categories

    values = extract_array(values)
    # The Categorical we want to build has the same categories
    # as values but its codes are by definition [0, ..., n_categories - 1]
    n_categories = len(values.categories)
    cat_codes = np.arange(n_categories, dtype=values.codes.dtype)
    categories = Categorical.from_codes(cat_codes, dtype=values.dtype)
    return values.codes, categories

2688 

2689 

def factorize_from_iterables(iterables):
    """
    A higher-level wrapper over `factorize_from_iterable`.

    *This is an internal function*

    Parameters
    ----------
    iterables : list-like of list-likes

    Returns
    -------
    list of two lists
        ``[codes_list, categories_list]``, holding the codes and the
        categories produced by `factorize_from_iterable` for each input,
        in input order. Both are empty lists when `iterables` is empty.

    Notes
    -----
    See `factorize_from_iterable` for more info.
    """
    if len(iterables) == 0:
        # For consistency, it should return a list of 2 lists.
        return [[], []]

    # Transpose the per-input (codes, categories) pairs, and materialize the
    # result so that both branches return the same kind of object (a list of
    # two lists) rather than a lazy, single-use ``map`` iterator.
    transposed = zip(*(factorize_from_iterable(it) for it in iterables))
    return [list(group) for group in transposed]