1""" 

2DataFrame 

3--------- 

4An efficient 2D container for potentially mixed-type time series or other 

5labeled data series. 

6 

7Similar to its R counterpart, data.frame, except providing automatic data 

8alignment and a host of useful data manipulation methods having to do with the 

9labeling information 

10""" 

import collections
from collections import abc
from io import StringIO
import itertools
import sys
from textwrap import dedent
from typing import (
    IO,
    TYPE_CHECKING,
    Any,
    FrozenSet,
    Hashable,
    Iterable,
    List,
    Optional,
    Sequence,
    Set,
    Tuple,
    Type,
    Union,
    cast,
)
import warnings

import numpy as np
import numpy.ma as ma

from pandas._config import get_option

from pandas._libs import algos as libalgos, lib
from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Level, Renamer
from pandas.compat import PY37
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.util._decorators import (
    Appender,
    Substitution,
    deprecate_kwarg,
    rewrite_axis_style_signature,
)
from pandas.util._validators import (
    validate_axis_style_args,
    validate_bool_kwarg,
    validate_percentile,
)

from pandas.core.dtypes.cast import (
    cast_scalar_to_array,
    coerce_to_dtypes,
    find_common_type,
    infer_dtype_from_scalar,
    invalidate_string_dtypes,
    maybe_cast_to_datetime,
    maybe_convert_platform,
    maybe_downcast_to_dtype,
    maybe_infer_to_datetimelike,
    maybe_upcast,
    maybe_upcast_putmask,
)
from pandas.core.dtypes.common import (
    ensure_float64,
    ensure_int64,
    ensure_platform_int,
    infer_dtype_from_object,
    is_bool_dtype,
    is_dict_like,
    is_dtype_equal,
    is_extension_array_dtype,
    is_float_dtype,
    is_hashable,
    is_integer,
    is_integer_dtype,
    is_iterator,
    is_list_like,
    is_named_tuple,
    is_object_dtype,
    is_scalar,
    is_sequence,
    needs_i8_conversion,
)
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCIndexClass,
    ABCMultiIndex,
    ABCSeries,
)
from pandas.core.dtypes.missing import isna, notna

from pandas.core import algorithms, common as com, nanops, ops
from pandas.core.accessor import CachedAccessor
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.groupby import generic as groupby_generic
from pandas.core.indexes import base as ibase
from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.multi import maybe_droplevels
from pandas.core.indexes.period import PeriodIndex
from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
from pandas.core.internals import BlockManager
from pandas.core.internals.construction import (
    arrays_to_mgr,
    get_names_from_index,
    init_dict,
    init_ndarray,
    masked_rec_array_to_mgr,
    reorder_arrays,
    sanitize_index,
    to_arrays,
)
from pandas.core.ops.missing import dispatch_fill_zeros
from pandas.core.series import Series

from pandas.io.common import get_filepath_or_buffer
from pandas.io.formats import console, format as fmt
from pandas.io.formats.printing import pprint_thing
import pandas.plotting

if TYPE_CHECKING:
    from pandas.io.formats.style import Styler

# ---------------------------------------------------------------------
# Docstring templates

_shared_doc_kwargs = dict(
    axes="index, columns",
    klass="DataFrame",
    axes_single_arg="{0 or 'index', 1 or 'columns'}",
    axis="""axis : {0 or 'index', 1 or 'columns'}, default 0
        If 0 or 'index': apply function to each column.
        If 1 or 'columns': apply function to each row.""",
    optional_by="""
        by : str or list of str
            Name or list of names to sort by.

            - if `axis` is 0 or `'index'` then `by` may contain index
              levels and/or column labels.
            - if `axis` is 1 or `'columns'` then `by` may contain column
              levels and/or index labels.

            .. versionchanged:: 0.23.0

               Allow specifying index or column level names.""",
    versionadded_to_excel="",
    optional_labels="""labels : array-like, optional
        New labels / index to conform the axis specified by 'axis' to.""",
    optional_axis="""axis : int or str, optional
        Axis to target. Can be either the axis name ('index', 'columns')
        or number (0, 1).""",
)

_numeric_only_doc = """numeric_only : boolean, default None
    Include only float, int, boolean data. If None, will attempt to use
    everything, then use only numeric data
"""

_merge_doc = """
Merge DataFrame or named Series objects with a database-style join.

The join is done on columns or indexes. If joining columns on
columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
on indexes or indexes on a column or columns, the index will be passed on.

Parameters
----------%s
right : DataFrame or named Series
    Object to merge with.
how : {'left', 'right', 'outer', 'inner'}, default 'inner'
    Type of merge to be performed.

    * left: use only keys from left frame, similar to a SQL left outer join;
      preserve key order.
    * right: use only keys from right frame, similar to a SQL right outer join;
      preserve key order.
    * outer: use union of keys from both frames, similar to a SQL full outer
      join; sort keys lexicographically.
    * inner: use intersection of keys from both frames, similar to a SQL inner
      join; preserve the order of the left keys.
on : label or list
    Column or index level names to join on. These must be found in both
    DataFrames. If `on` is None and not merging on indexes then this defaults
    to the intersection of the columns in both DataFrames.
left_on : label or list, or array-like
    Column or index level names to join on in the left DataFrame. Can also
    be an array or list of arrays of the length of the left DataFrame.
    These arrays are treated as if they are columns.
right_on : label or list, or array-like
    Column or index level names to join on in the right DataFrame. Can also
    be an array or list of arrays of the length of the right DataFrame.
    These arrays are treated as if they are columns.
left_index : bool, default False
    Use the index from the left DataFrame as the join key(s). If it is a
    MultiIndex, the number of keys in the other DataFrame (either the index
    or a number of columns) must match the number of levels.
right_index : bool, default False
    Use the index from the right DataFrame as the join key. Same caveats as
    left_index.
sort : bool, default False
    Sort the join keys lexicographically in the result DataFrame. If False,
    the order of the join keys depends on the join type (how keyword).
suffixes : tuple of (str, str), default ('_x', '_y')
    Suffix to apply to overlapping column names in the left and right
    side, respectively. To raise an exception on overlapping columns use
    (False, False).
copy : bool, default True
    If False, avoid copy if possible.
indicator : bool or str, default False
    If True, adds a column to the output DataFrame called "_merge" with
    information on the source of each row.
    If a string, a column with information on the source of each row will be
    added to the output DataFrame, and the column will be named the value of
    the string.
    The information column is Categorical-type and takes on a value of
    "left_only" for observations whose merge key only appears in the 'left'
    DataFrame, "right_only" for observations whose merge key only appears in
    the 'right' DataFrame, and "both" if the observation's merge key is found
    in both.

validate : str, optional
    If specified, checks if merge is of specified type.

    * "one_to_one" or "1:1": check if merge keys are unique in both
      left and right datasets.
    * "one_to_many" or "1:m": check if merge keys are unique in left
      dataset.
    * "many_to_one" or "m:1": check if merge keys are unique in right
      dataset.
    * "many_to_many" or "m:m": allowed, but does not result in checks.

    .. versionadded:: 0.21.0

Returns
-------
DataFrame
    A DataFrame of the two merged objects.

See Also
--------
merge_ordered : Merge with optional filling/interpolation.
merge_asof : Merge on nearest keys.
DataFrame.join : Similar method using indices.

Notes
-----
Support for specifying index levels as the `on`, `left_on`, and
`right_on` parameters was added in version 0.23.0.
Support for merging named Series objects was added in version 0.24.0.

Examples
--------

>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [1, 2, 3, 5]})
>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [5, 6, 7, 8]})
>>> df1
  lkey  value
0  foo      1
1  bar      2
2  baz      3
3  foo      5
>>> df2
  rkey  value
0  foo      5
1  bar      6
2  baz      7
3  foo      8

Merge df1 and df2 on the lkey and rkey columns. The value columns have
the default suffixes, _x and _y, appended.

>>> df1.merge(df2, left_on='lkey', right_on='rkey')
  lkey  value_x rkey  value_y
0  foo        1  foo        5
1  foo        1  foo        8
2  foo        5  foo        5
3  foo        5  foo        8
4  bar        2  bar        6
5  baz        3  baz        7

Merge DataFrames df1 and df2 with specified left and right suffixes
appended to any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey',
...           suffixes=('_left', '_right'))
  lkey  value_left rkey  value_right
0  foo           1  foo            5
1  foo           1  foo            8
2  foo           5  foo            5
3  foo           5  foo            8
4  bar           2  bar            6
5  baz           3  baz            7

Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
Traceback (most recent call last):
...
ValueError: columns overlap but no suffix specified:
    Index(['value'], dtype='object')
"""


# -----------------------------------------------------------------------
# DataFrame class


class DataFrame(NDFrame):
    """
    Two-dimensional, size-mutable, potentially heterogeneous tabular data.

    Data structure also contains labeled axes (rows and columns).
    Arithmetic operations align on both row and column labels. Can be
    thought of as a dict-like container for Series objects. The primary
    pandas data structure.

    Parameters
    ----------
    data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
        Dict can contain Series, arrays, constants, or list-like objects.

        .. versionchanged:: 0.23.0
           If data is a dict, column order follows insertion-order for
           Python 3.6 and later.

        .. versionchanged:: 0.25.0
           If data is a list of dicts, column order follows insertion-order
           for Python 3.6 and later.

    index : Index or array-like
        Index to use for resulting frame. Will default to RangeIndex if
        no indexing information part of input data and no index provided.
    columns : Index or array-like
        Column labels to use for resulting frame. Will default to
        RangeIndex (0, 1, 2, ..., n) if no column labels are provided.
    dtype : dtype, default None
        Data type to force. Only a single dtype is allowed. If None, infer.
    copy : bool, default False
        Copy data from inputs. Only affects DataFrame / 2d ndarray input.

    See Also
    --------
    DataFrame.from_records : Constructor from tuples, also record arrays.
    DataFrame.from_dict : From dicts of Series, arrays, or dicts.
    read_csv : Read a comma-separated values (csv) file into DataFrame.
    read_table : Read general delimited file into DataFrame.
    read_clipboard : Read text from clipboard into DataFrame.

    Examples
    --------
    Constructing DataFrame from a dictionary.

    >>> d = {'col1': [1, 2], 'col2': [3, 4]}
    >>> df = pd.DataFrame(data=d)
    >>> df
       col1  col2
    0     1     3
    1     2     4

    Notice that the inferred dtype is int64.

    >>> df.dtypes
    col1    int64
    col2    int64
    dtype: object

    To enforce a single dtype:

    >>> df = pd.DataFrame(data=d, dtype=np.int8)
    >>> df.dtypes
    col1    int8
    col2    int8
    dtype: object

    Constructing DataFrame from numpy ndarray:

    >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
    ...                    columns=['a', 'b', 'c'])
    >>> df2
       a  b  c
    0  1  2  3
    1  4  5  6
    2  7  8  9
    """

    _typ = "dataframe"

    @property
    def _constructor(self) -> Type["DataFrame"]:
        return DataFrame

    _constructor_sliced: Type[Series] = Series
    _deprecations: FrozenSet[str] = NDFrame._deprecations | frozenset([])
    _accessors: Set[str] = {"sparse"}

    @property
    def _constructor_expanddim(self):
        raise NotImplementedError("Not supported for DataFrames!")

    # ----------------------------------------------------------------------
    # Constructors

    def __init__(
        self,
        data=None,
        index: Optional[Axes] = None,
        columns: Optional[Axes] = None,
        dtype: Optional[Dtype] = None,
        copy: bool = False,
    ):
        if data is None:
            data = {}
        if dtype is not None:
            dtype = self._validate_dtype(dtype)

        if isinstance(data, DataFrame):
            data = data._data

        if isinstance(data, BlockManager):
            mgr = self._init_mgr(
                data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
            )
        elif isinstance(data, dict):
            mgr = init_dict(data, index, columns, dtype=dtype)
        elif isinstance(data, ma.MaskedArray):
            import numpy.ma.mrecords as mrecords

            # masked recarray
            if isinstance(data, mrecords.MaskedRecords):
                mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy)

            # a masked array
            else:
                mask = ma.getmaskarray(data)
                if mask.any():
                    data, fill_value = maybe_upcast(data, copy=True)
                    data.soften_mask()  # set hardmask False if it was True
                    data[mask] = fill_value
                else:
                    data = data.copy()
                mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)

        elif isinstance(data, (np.ndarray, Series, Index)):
            if data.dtype.names:
                data_columns = list(data.dtype.names)
                data = {k: data[k] for k in data_columns}
                if columns is None:
                    columns = data_columns
                mgr = init_dict(data, index, columns, dtype=dtype)
            elif getattr(data, "name", None) is not None:
                mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
            else:
                mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)

        # For data is list-like, or Iterable (will consume into list)
        elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
            if not isinstance(data, (abc.Sequence, ExtensionArray)):
                data = list(data)
            if len(data) > 0:
                if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
                    if is_named_tuple(data[0]) and columns is None:
                        columns = data[0]._fields
                    arrays, columns = to_arrays(data, columns, dtype=dtype)
                    columns = ensure_index(columns)

                    # set the index
                    if index is None:
                        if isinstance(data[0], Series):
                            index = get_names_from_index(data)
                        elif isinstance(data[0], Categorical):
                            index = ibase.default_index(len(data[0]))
                        else:
                            index = ibase.default_index(len(data))

                    mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
                else:
                    mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
            else:
                mgr = init_dict({}, index, columns, dtype=dtype)
        else:
            try:
                arr = np.array(data, dtype=dtype, copy=copy)
            except (ValueError, TypeError) as e:
                exc = TypeError(
                    "DataFrame constructor called with "
                    f"incompatible data and dtype: {e}"
                )
                raise exc from e

            if arr.ndim == 0 and index is not None and columns is not None:
                values = cast_scalar_to_array(
                    (len(index), len(columns)), data, dtype=dtype
                )
                mgr = init_ndarray(
                    values, index, columns, dtype=values.dtype, copy=False
                )
            else:
                raise ValueError("DataFrame constructor not properly called!")

        NDFrame.__init__(self, mgr, fastpath=True)
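
    # Illustrative sketch (not part of the original source): the MaskedArray
    # branch above upcasts when any entries are masked and fills them, so a
    # masked integer array surfaces as floats with NaN. Assuming standard
    # numpy.ma behavior:
    #
    #   >>> marr = ma.masked_array([[1, 2], [3, 4]],
    #   ...                        mask=[[False, True], [False, False]])
    #   >>> pd.DataFrame(marr)
    #        0    1
    #   0  1.0  NaN
    #   1  3.0  4.0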

    # ----------------------------------------------------------------------

    @property
    def axes(self) -> List[Index]:
        """
        Return a list representing the axes of the DataFrame.

        It has the row axis labels and column axis labels as the only members.
        They are returned in that order.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.axes
        [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
        dtype='object')]
        """
        return [self.index, self.columns]

    @property
    def shape(self) -> Tuple[int, int]:
        """
        Return a tuple representing the dimensionality of the DataFrame.

        See Also
        --------
        ndarray.shape

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.shape
        (2, 2)

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
        ...                    'col3': [5, 6]})
        >>> df.shape
        (2, 3)
        """
        return len(self.index), len(self.columns)

    @property
    def _is_homogeneous_type(self) -> bool:
        """
        Whether all the columns in a DataFrame have the same type.

        Returns
        -------
        bool

        See Also
        --------
        Index._is_homogeneous_type : Whether the object has a single
            dtype.
        MultiIndex._is_homogeneous_type : Whether all the levels of a
            MultiIndex have the same dtype.

        Examples
        --------
        >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
        True
        >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
        False

        Items with the same type but different sizes are considered
        different types.

        >>> DataFrame({
        ...     "A": np.array([1, 2], dtype=np.int32),
        ...     "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
        False
        """
        if self._data.any_extension_types:
            return len({block.dtype for block in self._data.blocks}) == 1
        else:
            return not self._data.is_mixed_type

    # ----------------------------------------------------------------------
    # Rendering Methods

    def _repr_fits_vertical_(self) -> bool:
        """
        Check length against max_rows.
        """
        max_rows = get_option("display.max_rows")
        return len(self) <= max_rows

    def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool:
        """
        Check if the full repr fits in the horizontal boundaries imposed by
        the display options width and max_columns.

        In case of a non-interactive session, no boundaries apply.

        `ignore_width` is here so ipynb+HTML output can behave the way
        users expect. display.max_columns remains in effect.
        GH3541, GH3573
        """
        width, height = console.get_console_size()
        max_columns = get_option("display.max_columns")
        nb_columns = len(self.columns)

        # exceed max columns
        if (max_columns and nb_columns > max_columns) or (
            (not ignore_width) and width and nb_columns > (width // 2)
        ):
            return False

        # used by repr_html under IPython notebook or scripts ignore terminal
        # dims
        if ignore_width or not console.in_interactive_session():
            return True

        if get_option("display.width") is not None or console.in_ipython_frontend():
            # check at least the column row for excessive width
            max_rows = 1
        else:
            max_rows = get_option("display.max_rows")

        # when auto-detecting, so width=None and not in ipython front end
        # check whether repr fits horizontal by actually checking
        # the width of the rendered repr
        buf = StringIO()

        # only care about the stuff we'll actually print out
        # and to_string on entire frame may be expensive
        d = self

        if max_rows is not None:
            # min of two, where one may be None
            d = d.iloc[: min(max_rows, len(d))]
        else:
            # unlimited rows: rendering the whole frame could be expensive,
            # so assume it fits
            return True

        d.to_string(buf=buf)
        value = buf.getvalue()
        repr_width = max(len(line) for line in value.split("\n"))

        return repr_width < width
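
    # Illustrative sketch (not part of the original source): a hedged example
    # of the early-exit width check above, assuming an interactive terminal
    # reported as 80 columns wide with default display options; 60 columns
    # exceeds ``width // 2``:
    #
    #   >>> wide = pd.DataFrame(np.ones((2, 60)))
    #   >>> wide._repr_fits_horizontal_()
    #   False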

    def _info_repr(self) -> bool:
        """
        True if the repr should show the info view.
        """
        info_repr_option = get_option("display.large_repr") == "info"
        return info_repr_option and not (
            self._repr_fits_horizontal_() and self._repr_fits_vertical_()
        )

    def __repr__(self) -> str:
        """
        Return a string representation for a particular DataFrame.
        """
        buf = StringIO("")
        if self._info_repr():
            self.info(buf=buf)
            return buf.getvalue()

        max_rows = get_option("display.max_rows")
        min_rows = get_option("display.min_rows")
        max_cols = get_option("display.max_columns")
        max_colwidth = get_option("display.max_colwidth")
        show_dimensions = get_option("display.show_dimensions")
        if get_option("display.expand_frame_repr"):
            width, _ = console.get_console_size()
        else:
            width = None
        self.to_string(
            buf=buf,
            max_rows=max_rows,
            min_rows=min_rows,
            max_cols=max_cols,
            line_width=width,
            max_colwidth=max_colwidth,
            show_dimensions=show_dimensions,
        )

        return buf.getvalue()

    def _repr_html_(self) -> Optional[str]:
        """
        Return a html representation for a particular DataFrame.

        Mainly for IPython notebook.
        """
        if self._info_repr():
            buf = StringIO("")
            self.info(buf=buf)
            # need to escape the <class>, should be the first line.
            val = buf.getvalue().replace("<", r"&lt;", 1)
            val = val.replace(">", r"&gt;", 1)
            return "<pre>" + val + "</pre>"

        if get_option("display.notebook_repr_html"):
            max_rows = get_option("display.max_rows")
            min_rows = get_option("display.min_rows")
            max_cols = get_option("display.max_columns")
            show_dimensions = get_option("display.show_dimensions")

            formatter = fmt.DataFrameFormatter(
                self,
                columns=None,
                col_space=None,
                na_rep="NaN",
                formatters=None,
                float_format=None,
                sparsify=None,
                justify=None,
                index_names=True,
                header=True,
                index=True,
                bold_rows=True,
                escape=True,
                max_rows=max_rows,
                min_rows=min_rows,
                max_cols=max_cols,
                show_dimensions=show_dimensions,
                decimal=".",
                table_id=None,
                render_links=False,
            )
            return formatter.to_html(notebook=True)
        else:
            return None

    @Substitution(
        header_type="bool or sequence",
        header="Write out the column names. If a list of strings "
        "is given, it is assumed to be aliases for the "
        "column names",
        col_space_type="int",
        col_space="The minimum width of each column",
    )
    @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
    def to_string(
        self,
        buf: Optional[FilePathOrBuffer[str]] = None,
        columns: Optional[Sequence[str]] = None,
        col_space: Optional[int] = None,
        header: Union[bool, Sequence[str]] = True,
        index: bool = True,
        na_rep: str = "NaN",
        formatters: Optional[fmt.formatters_type] = None,
        float_format: Optional[fmt.float_format_type] = None,
        sparsify: Optional[bool] = None,
        index_names: bool = True,
        justify: Optional[str] = None,
        max_rows: Optional[int] = None,
        min_rows: Optional[int] = None,
        max_cols: Optional[int] = None,
        show_dimensions: bool = False,
        decimal: str = ".",
        line_width: Optional[int] = None,
        max_colwidth: Optional[int] = None,
        encoding: Optional[str] = None,
    ) -> Optional[str]:
        """
        Render a DataFrame to a console-friendly tabular output.
        %(shared_params)s
        line_width : int, optional
            Width to wrap a line in characters.
        max_colwidth : int, optional
            Max width to truncate each column in characters. By default, no limit.

            .. versionadded:: 1.0.0
        encoding : str, default "utf-8"
            Set character encoding.

            .. versionadded:: 1.0
        %(returns)s
        See Also
        --------
        to_html : Convert DataFrame to HTML.

        Examples
        --------
        >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
        >>> df = pd.DataFrame(d)
        >>> print(df.to_string())
           col1  col2
        0     1     4
        1     2     5
        2     3     6
        """

        from pandas import option_context

        with option_context("display.max_colwidth", max_colwidth):
            formatter = fmt.DataFrameFormatter(
                self,
                columns=columns,
                col_space=col_space,
                na_rep=na_rep,
                formatters=formatters,
                float_format=float_format,
                sparsify=sparsify,
                justify=justify,
                index_names=index_names,
                header=header,
                index=index,
                min_rows=min_rows,
                max_rows=max_rows,
                max_cols=max_cols,
                show_dimensions=show_dimensions,
                decimal=decimal,
                line_width=line_width,
            )
            return formatter.to_string(buf=buf, encoding=encoding)

    # ----------------------------------------------------------------------

    @property
    def style(self) -> "Styler":
        """
        Returns a Styler object.

        Contains methods for building a styled HTML representation
        of the DataFrame.

        See Also
        --------
        io.formats.style.Styler
        """
        from pandas.io.formats.style import Styler

        return Styler(self)

    _shared_docs[
        "items"
    ] = r"""
    Iterate over (column name, Series) pairs.

    Iterates over the DataFrame columns, returning a tuple with
    the column name and the content as a Series.

    Yields
    ------
    label : object
        The column names for the DataFrame being iterated over.
    content : Series
        The column entries belonging to each label, as a Series.

    See Also
    --------
    DataFrame.iterrows : Iterate over DataFrame rows as
        (index, Series) pairs.
    DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
        of the values.

    Examples
    --------
    >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
    ...                    'population': [1864, 22000, 80000]},
    ...                   index=['panda', 'polar', 'koala'])
    >>> df
             species  population
    panda       bear        1864
    polar       bear       22000
    koala  marsupial       80000
    >>> for label, content in df.items():
    ...     print('label:', label)
    ...     print('content:', content, sep='\n')
    ...
    label: species
    content:
    panda         bear
    polar         bear
    koala    marsupial
    Name: species, dtype: object
    label: population
    content:
    panda     1864
    polar    22000
    koala    80000
    Name: population, dtype: int64
    """

    @Appender(_shared_docs["items"])
    def items(self) -> Iterable[Tuple[Optional[Hashable], Series]]:
        if self.columns.is_unique and hasattr(self, "_item_cache"):
            for k in self.columns:
                yield k, self._get_item_cache(k)
        else:
            for i, k in enumerate(self.columns):
                yield k, self._ixs(i, axis=1)

    @Appender(_shared_docs["items"])
    def iteritems(self) -> Iterable[Tuple[Optional[Hashable], Series]]:
        yield from self.items()

    def iterrows(self) -> Iterable[Tuple[Optional[Hashable], Series]]:
        """
        Iterate over DataFrame rows as (index, Series) pairs.

        Yields
        ------
        index : label or tuple of label
            The index of the row. A tuple for a `MultiIndex`.
        data : Series
            The data of the row as a Series.

        See Also
        --------
        DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
        DataFrame.items : Iterate over (column name, Series) pairs.

        Notes
        -----
        1. Because ``iterrows`` returns a Series for each row,
           it does **not** preserve dtypes across the rows (dtypes are
           preserved across columns for DataFrames). For example,

           >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
           >>> row = next(df.iterrows())[1]
           >>> row
           int      1.0
           float    1.5
           Name: 0, dtype: float64
           >>> print(row['int'].dtype)
           float64
           >>> print(df['int'].dtype)
           int64

           To preserve dtypes while iterating over the rows, it is better
           to use :meth:`itertuples` which returns namedtuples of the values
           and which is generally faster than ``iterrows``.

        2. You should **never modify** something you are iterating over.
           This is not guaranteed to work in all cases. Depending on the
           data types, the iterator returns a copy and not a view, and writing
           to it will have no effect.
        """
        columns = self.columns
        klass = self._constructor_sliced
        for k, v in zip(self.index, self.values):
            s = klass(v, index=columns, name=k)
            yield k, s

    def itertuples(self, index=True, name="Pandas"):
        """
        Iterate over DataFrame rows as namedtuples.

        Parameters
        ----------
        index : bool, default True
            If True, return the index as the first element of the tuple.
        name : str or None, default "Pandas"
            The name of the returned namedtuples or None to return regular
            tuples.

        Returns
        -------
        iterator
            An object to iterate over namedtuples for each row in the
            DataFrame with the first field possibly being the index and
            following fields being the column values.

        See Also
        --------
        DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
            pairs.
        DataFrame.items : Iterate over (column name, Series) pairs.

        Notes
        -----
        The column names will be renamed to positional names if they are
        invalid Python identifiers, repeated, or start with an underscore.
        On Python versions < 3.7 regular tuples are returned for DataFrames
        with a large number of columns (>254).

        Examples
        --------
        >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
        ...                   index=['dog', 'hawk'])
        >>> df
              num_legs  num_wings
        dog          4          0
        hawk         2          2
        >>> for row in df.itertuples():
        ...     print(row)
        ...
        Pandas(Index='dog', num_legs=4, num_wings=0)
        Pandas(Index='hawk', num_legs=2, num_wings=2)

        By setting the `index` parameter to False we can remove the index
        as the first element of the tuple:

        >>> for row in df.itertuples(index=False):
        ...     print(row)
        ...
        Pandas(num_legs=4, num_wings=0)
        Pandas(num_legs=2, num_wings=2)

        With the `name` parameter set we set a custom name for the yielded
        namedtuples:

        >>> for row in df.itertuples(name='Animal'):
        ...     print(row)
        ...
        Animal(Index='dog', num_legs=4, num_wings=0)
        Animal(Index='hawk', num_legs=2, num_wings=2)
        """
        arrays = []
        fields = list(self.columns)
        if index:
            arrays.append(self.index)
            fields.insert(0, "Index")

        # use integer indexing because of possible duplicate column names
        arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))

        # Python versions before 3.7 support at most 255 arguments to constructors
        can_return_named_tuples = PY37 or len(self.columns) + index < 255
        if name is not None and can_return_named_tuples:
            itertuple = collections.namedtuple(name, fields, rename=True)
            return map(itertuple._make, zip(*arrays))

        # fallback to regular tuples
        return zip(*arrays)
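
    # Illustrative sketch (not part of the original source): ``rename=True``
    # in the namedtuple call above replaces invalid, duplicate, or
    # underscore-prefixed field names with positional names, which is why
    # duplicate columns still iterate cleanly. Assuming standard library
    # behavior:
    #
    #   >>> collections.namedtuple("Pandas", ["a", "a", "def"], rename=True)._fields
    #   ('a', '_1', '_2')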

    def __len__(self) -> int:
        """
        Returns length of info axis, but here we use the index.
        """
        return len(self.index)

    def dot(self, other):
        """
        Compute the matrix multiplication between the DataFrame and other.

        This method computes the matrix product between the DataFrame and the
        values of another Series, DataFrame or a numpy array.

        It can also be called using ``self @ other`` in Python >= 3.5.

        Parameters
        ----------
        other : Series, DataFrame or array-like
            The other object to compute the matrix product with.

        Returns
        -------
        Series or DataFrame
            If other is a Series, return the matrix product between self and
            other as a Series. If other is a DataFrame or a numpy.array, return
            the matrix product of self and other in a DataFrame or a np.array.

        See Also
        --------
        Series.dot : Similar method for Series.

        Notes
        -----
        The dimensions of DataFrame and other must be compatible in order to
        compute the matrix multiplication. In addition, the column names of
        DataFrame and the index of other must contain the same values, as they
        will be aligned prior to the multiplication.

        The dot method for Series computes the inner product, instead of the
        matrix product here.

        Examples
        --------
        Here we multiply a DataFrame with a Series.

        >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
        >>> s = pd.Series([1, 1, 2, 1])
        >>> df.dot(s)
        0    -4
        1     5
        dtype: int64

        Here we multiply a DataFrame with another DataFrame.

        >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
        >>> df.dot(other)
           0  1
        0  1  4
        1  2  2

        Note that the dot method gives the same result as @

        >>> df @ other
           0  1
        0  1  4
        1  2  2

        The dot method also works if other is an np.array.

        >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
        >>> df.dot(arr)
           0  1
        0  1  4
        1  2  2

        Note how shuffling of the objects does not change the result.

        >>> s2 = s.reindex([1, 0, 2, 3])
        >>> df.dot(s2)
        0    -4
        1     5
        dtype: int64
        """
        if isinstance(other, (Series, DataFrame)):
            common = self.columns.union(other.index)
            if len(common) > len(self.columns) or len(common) > len(other.index):
                raise ValueError("matrices are not aligned")

            left = self.reindex(columns=common, copy=False)
            right = other.reindex(index=common, copy=False)
            lvals = left.values
            rvals = right.values
        else:
            left = self
            lvals = self.values
            rvals = np.asarray(other)
            if lvals.shape[1] != rvals.shape[0]:
                raise ValueError(
                    f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
                )

        if isinstance(other, DataFrame):
            return self._constructor(
                np.dot(lvals, rvals), index=left.index, columns=other.columns
            )
        elif isinstance(other, Series):
            return Series(np.dot(lvals, rvals), index=left.index)
        elif isinstance(rvals, (np.ndarray, Index)):
            result = np.dot(lvals, rvals)
            if result.ndim == 2:
                return self._constructor(result, index=left.index)
            else:
                return Series(result, index=left.index)
        else:  # pragma: no cover
            raise TypeError(f"unsupported type: {type(other)}")

    def __matmul__(self, other):
        """
        Matrix multiplication using binary `@` operator in Python>=3.5.
        """
        return self.dot(other)

    def __rmatmul__(self, other):
        """
        Matrix multiplication using binary `@` operator in Python>=3.5.
        """
        return self.T.dot(np.transpose(other)).T
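
    # Illustrative sketch (not part of the original source): ``__rmatmul__``
    # above makes ``ndarray @ DataFrame`` work by transposing both operands,
    # reusing ``dot``, and transposing back:
    #
    #   >>> df = pd.DataFrame([[1, 2], [3, 4]])
    #   >>> np.array([[1, 0], [0, 1]]) @ df
    #      0  1
    #   0  1  2
    #   1  3  4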

    # ----------------------------------------------------------------------
    # IO methods (to / from other formats)

    @classmethod
    def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFrame":
        """
        Construct DataFrame from dict of array-like or dicts.

        Creates DataFrame object from dictionary by columns or by index
        allowing dtype specification.

        Parameters
        ----------
        data : dict
            Of the form {field : array-like} or {field : dict}.
        orient : {'columns', 'index'}, default 'columns'
            The "orientation" of the data. If the keys of the passed dict
            should be the columns of the resulting DataFrame, pass 'columns'
            (default). Otherwise if the keys should be rows, pass 'index'.
        dtype : dtype, default None
            Data type to force, otherwise infer.
        columns : list, default None
            Column labels to use when ``orient='index'``. Raises a ValueError
            if used with ``orient='columns'``.

            .. versionadded:: 0.23.0

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.from_records : DataFrame from ndarray (structured
            dtype), list of tuples, dict, or DataFrame.
        DataFrame : DataFrame object creation using constructor.

        Examples
        --------
        By default the keys of the dict become the DataFrame columns:

        >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
        >>> pd.DataFrame.from_dict(data)
           col_1 col_2
        0      3     a
        1      2     b
        2      1     c
        3      0     d

        Specify ``orient='index'`` to create the DataFrame using dictionary
        keys as rows:

        >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
        >>> pd.DataFrame.from_dict(data, orient='index')
               0  1  2  3
        row_1  3  2  1  0
        row_2  a  b  c  d

        When using the 'index' orientation, the column names can be
        specified manually:

        >>> pd.DataFrame.from_dict(data, orient='index',
        ...                        columns=['A', 'B', 'C', 'D'])
               A  B  C  D
        row_1  3  2  1  0
        row_2  a  b  c  d
        """
        index = None
        orient = orient.lower()
        if orient == "index":
            if len(data) > 0:
                # TODO speed up Series case
                if isinstance(list(data.values())[0], (Series, dict)):
                    data = _from_nested_dict(data)
                else:
                    data, index = list(data.values()), list(data.keys())
        elif orient == "columns":
            if columns is not None:
                raise ValueError("cannot use columns parameter with orient='columns'")
        else:  # pragma: no cover
            raise ValueError("only recognize index or columns for orient")

        return cls(data, index=index, columns=columns, dtype=dtype)

    def to_numpy(self, dtype=None, copy=False) -> np.ndarray:
        """
        Convert the DataFrame to a NumPy array.

        .. versionadded:: 0.24.0

        By default, the dtype of the returned array will be the common NumPy
        dtype of all types in the DataFrame. For example, if the dtypes are
        ``float16`` and ``float32``, the resulting dtype will be ``float32``.
        This may require copying data and coercing values, which may be
        expensive.

        Parameters
        ----------
        dtype : str or numpy.dtype, optional
            The dtype to pass to :meth:`numpy.asarray`.
        copy : bool, default False
            Whether to ensure that the returned value is not a view on
            another array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
            a copy is made, even if not strictly necessary.

        Returns
        -------
        numpy.ndarray

        See Also
        --------
        Series.to_numpy : Similar method for Series.

        Examples
        --------
        >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
        array([[1, 3],
               [2, 4]])

        With heterogeneous data, the lowest common type will have to
        be used.

        >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
        >>> df.to_numpy()
        array([[1. , 3. ],
               [2. , 4.5]])

        For a mix of numeric and non-numeric types, the output array will
        have object dtype.

        >>> df['C'] = pd.date_range('2000', periods=2)
        >>> df.to_numpy()
        array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
               [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
        """
        result = np.array(self.values, dtype=dtype, copy=copy)
        return result

    def to_dict(self, orient="dict", into=dict):
        """
        Convert the DataFrame to a dictionary.

        The type of the key-value pairs can be customized with the parameters
        (see below).

        Parameters
        ----------
        orient : str {'dict', 'list', 'series', 'split', 'records', 'index'}
            Determines the type of the values of the dictionary.

            - 'dict' (default) : dict like {column -> {index -> value}}
            - 'list' : dict like {column -> [values]}
            - 'series' : dict like {column -> Series(values)}
            - 'split' : dict like
              {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
            - 'records' : list like
              [{column -> value}, ... , {column -> value}]
            - 'index' : dict like {index -> {column -> value}}

            Abbreviations are allowed. `s` indicates `series` and `sp`
            indicates `split`.

        into : class, default dict
            The collections.abc.Mapping subclass used for all Mappings
            in the return value. Can be the actual class or an empty
            instance of the mapping type you want. If you want a
            collections.defaultdict, you must pass it initialized.

            .. versionadded:: 0.21.0

        Returns
        -------
        dict, list or collections.abc.Mapping
            Return a collections.abc.Mapping object representing the DataFrame.
            The resulting transformation depends on the `orient` parameter.

        See Also
        --------
        DataFrame.from_dict : Create a DataFrame from a dictionary.
        DataFrame.to_json : Convert a DataFrame to JSON format.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2],
        ...                    'col2': [0.5, 0.75]},
        ...                   index=['row1', 'row2'])
        >>> df
              col1  col2
        row1     1  0.50
        row2     2  0.75
        >>> df.to_dict()
        {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}

        You can specify the return orientation.

        >>> df.to_dict('series')
        {'col1': row1    1
                 row2    2
        Name: col1, dtype: int64,
        'col2': row1    0.50
                row2    0.75
        Name: col2, dtype: float64}

        >>> df.to_dict('split')
        {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
         'data': [[1, 0.5], [2, 0.75]]}

        >>> df.to_dict('records')
        [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]

        >>> df.to_dict('index')
        {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}

        You can also specify the mapping type.

        >>> from collections import OrderedDict, defaultdict
        >>> df.to_dict(into=OrderedDict)
        OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
                     ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])

        If you want a `defaultdict`, you need to initialize it:

        >>> dd = defaultdict(list)
        >>> df.to_dict('records', into=dd)
        [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
         defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
        """
        if not self.columns.is_unique:
            warnings.warn(
                "DataFrame columns are not unique, some columns will be omitted.",
                UserWarning,
                stacklevel=2,
            )
        # GH16122
        into_c = com.standardize_mapping(into)
        if orient.lower().startswith("d"):
            return into_c((k, v.to_dict(into)) for k, v in self.items())
        elif orient.lower().startswith("l"):
            return into_c((k, v.tolist()) for k, v in self.items())
        elif orient.lower().startswith("sp"):
            return into_c(
                (
                    ("index", self.index.tolist()),
                    ("columns", self.columns.tolist()),
                    (
                        "data",
                        [
                            list(map(com.maybe_box_datetimelike, t))
                            for t in self.itertuples(index=False, name=None)
                        ],
                    ),
                )
            )
        elif orient.lower().startswith("s"):
            return into_c((k, com.maybe_box_datetimelike(v)) for k, v in self.items())
        elif orient.lower().startswith("r"):
            columns = self.columns.tolist()
            rows = (
                dict(zip(columns, row))
                for row in self.itertuples(index=False, name=None)
            )
            return [
                into_c((k, com.maybe_box_datetimelike(v)) for k, v in row.items())
                for row in rows
            ]
        elif orient.lower().startswith("i"):
            if not self.index.is_unique:
                raise ValueError("DataFrame index must be unique for orient='index'.")
            return into_c(
                (t[0], dict(zip(self.columns, t[1:])))
                for t in self.itertuples(name=None)
            )
        else:
            raise ValueError(f"orient '{orient}' not understood")
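
    # Illustrative sketch (not part of the original source): the prefix
    # matching above is what makes abbreviations work -- 'sp' is tested
    # before 's', so 'sp' selects 'split' while a bare 's' selects 'series':
    #
    #   >>> df = pd.DataFrame({"a": [1]})
    #   >>> df.to_dict("sp") == df.to_dict("split")
    #   True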

    def to_gbq(
        self,
        destination_table,
        project_id=None,
        chunksize=None,
        reauth=False,
        if_exists="fail",
        auth_local_webserver=False,
        table_schema=None,
        location=None,
        progress_bar=True,
        credentials=None,
    ) -> None:
        """
        Write a DataFrame to a Google BigQuery table.

        This function requires the `pandas-gbq package
        <https://pandas-gbq.readthedocs.io>`__.

        See the `How to authenticate with Google BigQuery
        <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
        guide for authentication instructions.

        Parameters
        ----------
        destination_table : str
            Name of table to be written, in the form ``dataset.tablename``.
        project_id : str, optional
            Google BigQuery Account project ID. Optional when available from
            the environment.
        chunksize : int, optional
            Number of rows to be inserted in each chunk from the dataframe.
            Set to ``None`` to load the whole dataframe at once.
        reauth : bool, default False
            Force Google BigQuery to re-authenticate the user. This is useful
            if multiple accounts are used.
        if_exists : str, default 'fail'
            Behavior when the destination table exists. Value can be one of:

            ``'fail'``
                If table exists raise pandas_gbq.gbq.TableCreationError.
            ``'replace'``
                If table exists, drop it, recreate it, and insert data.
            ``'append'``
                If table exists, insert data. Create if does not exist.
        auth_local_webserver : bool, default False
            Use the `local webserver flow`_ instead of the `console flow`_
            when getting user credentials.

            .. _local webserver flow:
                http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
            .. _console flow:
                http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console

            *New in version 0.2.0 of pandas-gbq*.
        table_schema : list of dicts, optional
            List of BigQuery table fields to which the DataFrame columns
            conform, e.g. ``[{'name': 'col1', 'type': 'STRING'},...]``. If a
            schema is not provided, it will be generated according to the
            dtypes of the DataFrame columns. See BigQuery API documentation
            on available names of a field.

            *New in version 0.3.1 of pandas-gbq*.
        location : str, optional
            Location where the load job should run. See the `BigQuery locations
            documentation
            <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
            list of available locations. The location must match that of the
            target dataset.

            *New in version 0.5.0 of pandas-gbq*.
        progress_bar : bool, default True
            Use the library `tqdm` to show the progress bar for the upload,
            chunk by chunk.

            *New in version 0.5.0 of pandas-gbq*.
        credentials : google.auth.credentials.Credentials, optional
            Credentials for accessing Google APIs. Use this parameter to
            override default credentials, such as to use Compute Engine
            :class:`google.auth.compute_engine.Credentials` or Service
            Account :class:`google.oauth2.service_account.Credentials`
            directly.

            *New in version 0.8.0 of pandas-gbq*.

            .. versionadded:: 0.24.0

        See Also
        --------
        pandas_gbq.to_gbq : This function in the pandas-gbq library.
        read_gbq : Read a DataFrame from Google BigQuery.
        """
        from pandas.io import gbq

        gbq.to_gbq(
            self,
            destination_table,
            project_id=project_id,
            chunksize=chunksize,
            reauth=reauth,
            if_exists=if_exists,
            auth_local_webserver=auth_local_webserver,
            table_schema=table_schema,
            location=location,
            progress_bar=progress_bar,
            credentials=credentials,
        )
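
    # Illustrative sketch (not part of the original source): a hedged usage
    # example for the thin wrapper above, assuming the optional pandas-gbq
    # package is installed and credentials are available from the
    # environment; the table and project names are hypothetical:
    #
    #   >>> df.to_gbq("my_dataset.my_table",
    #   ...           project_id="my-project", if_exists="append")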

    @classmethod
    def from_records(
        cls,
        data,
        index=None,
        exclude=None,
        columns=None,
        coerce_float=False,
        nrows=None,
    ) -> "DataFrame":
        """
        Convert structured or record ndarray to DataFrame.

        Parameters
        ----------
        data : ndarray (structured dtype), list of tuples, dict, or DataFrame
        index : str, list of fields, array-like
            Field of array to use as the index, alternately a specific set of
            input labels to use.
        exclude : sequence, default None
            Columns or fields to exclude.
        columns : sequence, default None
            Column names to use. If the passed data do not have names
            associated with them, this argument provides names for the
            columns. Otherwise this argument indicates the order of the columns
            in the result (any names not found in the data will become all-NA
            columns).
        coerce_float : bool, default False
            Attempt to convert values of non-string, non-numeric objects (like
            decimal.Decimal) to floating point, useful for SQL result sets.
        nrows : int, default None
            Number of rows to read if data is an iterator.

        Returns
        -------
        DataFrame
        """

        # Make a copy of the input columns so we can modify it
        if columns is not None:
            columns = ensure_index(columns)

        if is_iterator(data):
            if nrows == 0:
                return cls()

            try:
                first_row = next(data)
            except StopIteration:
                return cls(index=index, columns=columns)

            dtype = None
            if hasattr(first_row, "dtype") and first_row.dtype.names:
                dtype = first_row.dtype

            values = [first_row]

            if nrows is None:
                values += data
            else:
                values.extend(itertools.islice(data, nrows - 1))

            if dtype is not None:
                data = np.array(values, dtype=dtype)
            else:
                data = values

        if isinstance(data, dict):
            if columns is None:
                columns = arr_columns = ensure_index(sorted(data))
                arrays = [data[k] for k in columns]
            else:
                arrays = []
                arr_columns = []
                for k, v in data.items():
                    if k in columns:
                        arr_columns.append(k)
                        arrays.append(v)

                arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns)

        elif isinstance(data, (np.ndarray, DataFrame)):
            arrays, columns = to_arrays(data, columns)
            if columns is not None:
                columns = ensure_index(columns)
            arr_columns = columns
        else:
            arrays, arr_columns = to_arrays(data, columns, coerce_float=coerce_float)

            arr_columns = ensure_index(arr_columns)
            if columns is not None:
                columns = ensure_index(columns)
            else:
                columns = arr_columns

        if exclude is None:
            exclude = set()
        else:
            exclude = set(exclude)

        result_index = None
        if index is not None:
            if isinstance(index, str) or not hasattr(index, "__iter__"):
                i = columns.get_loc(index)
                exclude.add(index)
                if len(arrays) > 0:
                    result_index = Index(arrays[i], name=index)
                else:
                    result_index = Index([], name=index)
            else:
                try:
                    index_data = [arrays[arr_columns.get_loc(field)] for field in index]
                except (KeyError, TypeError):
                    # raised by get_loc, see GH#29258
                    result_index = index
                else:
                    result_index = ensure_index_from_sequences(index_data, names=index)
                    exclude.update(index)

        if any(exclude):
            arr_exclude = [x for x in exclude if x in arr_columns]
            to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
            arrays = [v for i, v in enumerate(arrays) if i not in to_remove]

            arr_columns = arr_columns.drop(arr_exclude)
            columns = columns.drop(exclude)

        mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns)

        return cls(mgr)
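
    # Illustrative sketch (not part of the original source): a minimal
    # ``from_records`` round trip with a structured ndarray, assuming default
    # behavior:
    #
    #   >>> data = np.array([(1, 2.0), (3, 4.0)],
    #   ...                 dtype=[("x", "i8"), ("y", "f8")])
    #   >>> pd.DataFrame.from_records(data, index="x")
    #        y
    #   x
    #   1  2.0
    #   3  4.0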

1680 def to_records( 

1681 self, index=True, column_dtypes=None, index_dtypes=None 

1682 ) -> np.recarray: 

1683 """ 

1684 Convert DataFrame to a NumPy record array. 

1685 

1686 Index will be included as the first field of the record array if 

1687 requested. 

1688 

1689 Parameters 

1690 ---------- 

1691 index : bool, default True 

1692 Include index in resulting record array, stored in 'index' 

1693 field or using the index label, if set. 

1694 column_dtypes : str, type, dict, default None 

1695 .. versionadded:: 0.24.0 

1696 

1697 If a string or type, the data type to store all columns. If 

1698 a dictionary, a mapping of column names and indices (zero-indexed) 

1699 to specific data types. 

1700 index_dtypes : str, type, dict, default None 

1701 .. versionadded:: 0.24.0 

1702 

1703 If a string or type, the data type to store all index levels. If 

1704 a dictionary, a mapping of index level names and indices 

1705 (zero-indexed) to specific data types. 

1706 

1707 This mapping is applied only if `index=True`. 

1708 

1709 Returns 

1710 ------- 

1711 numpy.recarray 

1712 NumPy ndarray with the DataFrame labels as fields and each row 

1713 of the DataFrame as entries. 

1714 

1715 See Also 

1716 -------- 

1717 DataFrame.from_records: Convert structured or record ndarray 

1718 to DataFrame. 

1719 numpy.recarray: An ndarray that allows field access using 

1720 attributes, analogous to typed columns in a 

1721 spreadsheet. 

1722 

1723 Examples 

1724 -------- 

1725 >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]}, 

1726 ... index=['a', 'b']) 

1727 >>> df 

1728 A B 

1729 a 1 0.50 

1730 b 2 0.75 

1731 >>> df.to_records() 

1732 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], 

1733 dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')]) 

1734 

1735 If the DataFrame index has no label then the recarray field name 

1736 is set to 'index'. If the index has a label then this is used as the 

1737 field name: 

1738 

1739 >>> df.index = df.index.rename("I") 

1740 >>> df.to_records() 

1741 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], 

1742 dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')]) 

1743 

1744 The index can be excluded from the record array: 

1745 

1746 >>> df.to_records(index=False) 

1747 rec.array([(1, 0.5 ), (2, 0.75)], 

1748 dtype=[('A', '<i8'), ('B', '<f8')]) 

1749 

1750 Data types can be specified for the columns: 

1751 

1752 >>> df.to_records(column_dtypes={"A": "int32"}) 

1753 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], 

1754 dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')]) 

1755 

1756 As well as for the index: 

1757 

1758 >>> df.to_records(index_dtypes="<S2") 

1759 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)], 

1760 dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')]) 

1761 

1762 >>> index_dtypes = f"<S{df.index.str.len().max()}" 

1763 >>> df.to_records(index_dtypes=index_dtypes) 

1764 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)], 

1765 dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')]) 

1766 """ 

1767 

1768 if index: 

1769 if isinstance(self.index, ABCMultiIndex): 

1770 # array of tuples to numpy cols; this makes several copies

1771 ix_vals = list(map(np.array, zip(*self.index.values))) 

1772 else: 

1773 ix_vals = [self.index.values] 

1774 

1775 arrays = ix_vals + [self[c]._internal_get_values() for c in self.columns] 

1776 

1777 count = 0 

1778 index_names = list(self.index.names) 

1779 

1780 if isinstance(self.index, ABCMultiIndex): 

1781 for i, n in enumerate(index_names): 

1782 if n is None: 

1783 index_names[i] = f"level_{count}" 

1784 count += 1 

1785 elif index_names[0] is None: 

1786 index_names = ["index"] 

1787 

1788 names = [str(name) for name in itertools.chain(index_names, self.columns)] 

1789 else: 

1790 arrays = [self[c]._internal_get_values() for c in self.columns] 

1791 names = [str(c) for c in self.columns] 

1792 index_names = [] 

1793 

1794 index_len = len(index_names) 

1795 formats = [] 

1796 

1797 for i, v in enumerate(arrays): 

1798 index = i 

1799 

1800 # When the names and arrays are collected, we 

1801 # first collect those in the DataFrame's index, 

1802 # followed by those in its columns. 

1803 # 

1804 # Thus, the total length of the array is: 

1805 # len(index_names) + len(DataFrame.columns). 

1806 # 

1807 # This check allows us to see whether we are 

1808 # handling a name / array in the index or column. 

1809 if index < index_len: 

1810 dtype_mapping = index_dtypes 

1811 name = index_names[index] 

1812 else: 

1813 index -= index_len 

1814 dtype_mapping = column_dtypes 

1815 name = self.columns[index] 

1816 

1817 # We have a dictionary, so we get the data type 

1818 # associated with the index or column (which can 

1819 # be denoted by its name in the DataFrame or its 

1820 # position in DataFrame's array of indices or 

1821 # columns, whichever is applicable. 

1822 if is_dict_like(dtype_mapping): 

1823 if name in dtype_mapping: 

1824 dtype_mapping = dtype_mapping[name] 

1825 elif index in dtype_mapping: 

1826 dtype_mapping = dtype_mapping[index] 

1827 else: 

1828 dtype_mapping = None 

1829 

1830 # If no mapping can be found, use the array's 

1831 # dtype attribute for formatting. 

1832 # 

1833 # A valid dtype must either be a type or 

1834 # string naming a type. 

1835 if dtype_mapping is None: 

1836 formats.append(v.dtype) 

1837 elif isinstance(dtype_mapping, (type, np.dtype, str)): 

1838 formats.append(dtype_mapping) 

1839 else: 

1840 element = "row" if i < index_len else "column" 

1841 msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}" 

1842 raise ValueError(msg) 

1843 

1844 return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats}) 

1845 
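
# Editor's sketch (hedged): the dtype-resolution loop above tries the label
# first, then the positional index, when ``column_dtypes`` is dict-like.
#
#   import pandas as pd
#   df = pd.DataFrame({"A": [1, 2], "B": [0.5, 0.75]}, index=["a", "b"])
#   rec = df.to_records(column_dtypes={"A": "int32", 1: "float32"})
#   # "A" matches by name; column 1 ("B") matches by position, giving
#   # dtype=[('index', 'O'), ('A', '<i4'), ('B', '<f4')].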

1846 @classmethod 

1847 def _from_arrays(cls, arrays, columns, index, dtype=None) -> "DataFrame": 

1848 mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) 

1849 return cls(mgr) 

1850 

1851 @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") 

1852 def to_stata( 

1853 self, 

1854 path, 

1855 convert_dates=None, 

1856 write_index=True, 

1857 byteorder=None, 

1858 time_stamp=None, 

1859 data_label=None, 

1860 variable_labels=None, 

1861 version=114, 

1862 convert_strl=None, 

1863 ): 

1864 """ 

1865 Export DataFrame object to Stata dta format. 

1866 

1867 Writes the DataFrame to a Stata dataset file. 

1868 "dta" files contain a Stata dataset. 

1869 

1870 Parameters 

1871 ---------- 

1872 path : str, buffer or path object 

1873 String, path object (pathlib.Path or py._path.local.LocalPath) or 

1874 object implementing a binary write() function. If using a buffer 

1875 then the buffer will not be automatically closed after the file 

1876 data has been written. 

1877 

1878 .. versionchanged:: 1.0.0 

1879 

1880 Previously this was "fname" 

1881 

1882 convert_dates : dict 

1883 Dictionary mapping columns containing datetime types to Stata 

1884 internal format to use when writing the dates. Options are 'tc', 

1885 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer 

1886 or a name. Datetime columns that do not have a conversion type 

1887 specified will be converted to 'tc'. Raises NotImplementedError if 

1888 a datetime column has timezone information. 

1889 write_index : bool 

1890 Write the index to Stata dataset. 

1891 byteorder : str 

1892 Can be ">", "<", "little", or "big". default is `sys.byteorder`. 

1893 time_stamp : datetime 

1894 A datetime to use as file creation date. Default is the current 

1895 time. 

1896 data_label : str, optional 

1897 A label for the data set. Must be 80 characters or smaller. 

1898 variable_labels : dict 

1899 Dictionary containing columns as keys and variable labels as 

1900 values. Each label must be 80 characters or smaller. 

1901 version : {114, 117, 118, 119, None}, default 114 

1902 Version to use in the output dta file. Set to None to let pandas 

1903 decide between 118 or 119 formats depending on the number of 

1904 columns in the frame. Version 114 can be read by Stata 10 and 

1905 later. Version 117 can be read by Stata 13 or later. Version 118 

1906 is supported in Stata 14 and later. Version 119 is supported in 

1907 Stata 15 and later. Version 114 limits string variables to 244 

1908 characters or fewer while versions 117 and later allow strings 

1909 with lengths up to 2,000,000 characters. Versions 118 and 119 

1910 support Unicode characters, and version 119 supports more than 

1911 32,767 variables. 

1912 

1913 .. versionadded:: 0.23.0 

1914 .. versionchanged:: 1.0.0 

1915 

1916 Added support for formats 118 and 119. 

1917 

1918 convert_strl : list, optional 

1919 List of column names to convert to string columns to Stata StrL 

1920 format. Only available if version is 117. Storing strings in the 

1921 StrL format can produce smaller dta files if strings have more than 

1922 8 characters and values are repeated. 

1923 

1924 .. versionadded:: 0.23.0 

1925 

1926 Raises 

1927 ------ 

1928 NotImplementedError 

1929 * If datetimes contain timezone information 

1930 * Column dtype is not representable in Stata 

1931 ValueError 

1932 * Columns listed in convert_dates are neither datetime64[ns] 

1933 nor datetime.datetime 

1934 * Column listed in convert_dates is not in DataFrame 

1935 * Categorical label contains more than 32,000 characters 

1936 

1937 See Also 

1938 -------- 

1939 read_stata : Import Stata data files. 

1940 io.stata.StataWriter : Low-level writer for Stata data files. 

1941 io.stata.StataWriter117 : Low-level writer for version 117 files. 

1942 

1943 Examples 

1944 -------- 

1945 >>> df = pd.DataFrame({'animal': ['falcon', 'parrot', 'falcon', 

1946 ... 'parrot'], 

1947 ... 'speed': [350, 18, 361, 15]}) 

1948 >>> df.to_stata('animals.dta') # doctest: +SKIP 

1949 """ 

1950 if version not in (114, 117, 118, 119, None): 

1951 raise ValueError("Only formats 114, 117, 118 and 119 are supported.") 

1952 if version == 114: 

1953 if convert_strl is not None: 

1954 raise ValueError("strl is not supported in format 114") 

1955 from pandas.io.stata import StataWriter as statawriter 

1956 elif version == 117: 

1957 from pandas.io.stata import StataWriter117 as statawriter 

1958 else: # versions 118 and 119 

1959 from pandas.io.stata import StataWriterUTF8 as statawriter 

1960 

1961 kwargs = {} 

1962 if version is None or version >= 117: 

1963 # strl conversion is only supported >= 117 

1964 kwargs["convert_strl"] = convert_strl 

1965 if version is None or version >= 118: 

1966 # Specifying the version is only supported for UTF8 (118 or 119) 

1967 kwargs["version"] = version 

1968 

1969 writer = statawriter( 

1970 path, 

1971 self, 

1972 convert_dates=convert_dates, 

1973 byteorder=byteorder, 

1974 time_stamp=time_stamp, 

1975 data_label=data_label, 

1976 write_index=write_index, 

1977 variable_labels=variable_labels, 

1978 **kwargs, 

1979 ) 

1980 writer.write_file() 

1981 
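
# Editor's sketch (hedged): the version dispatch above selects the writer
# class. A made-up call that needs a 117+ format for long strings and StrL:
#
#   import pandas as pd
#   df = pd.DataFrame({"name": ["falcon", "parrot"], "notes": ["x" * 300] * 2})
#   # version 114 caps strings at 244 characters, so request 117 with StrL:
#   df.to_stata("birds.dta", version=117, convert_strl=["notes"])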

1982 @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") 

1983 def to_feather(self, path) -> None: 

1984 """ 

1985 Write out the binary feather-format for DataFrames. 

1986 

1987 Parameters 

1988 ---------- 

1989 path : str 

1990 String file path. 

1991 """ 

1992 from pandas.io.feather_format import to_feather 

1993 

1994 to_feather(self, path) 

1995 

1996 @Appender( 

1997 """ 

1998 Examples 

1999 -------- 

2000 >>> df = pd.DataFrame( 

2001 ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]} 

2002 ... ) 

2003 >>> print(df.to_markdown()) 

2004 | | animal_1 | animal_2 | 

2005 |---:|:-----------|:-----------| 

2006 | 0 | elk | dog | 

2007 | 1 | pig | quetzal | 

2008 """ 

2009 ) 

2010 @Substitution(klass="DataFrame") 

2011 @Appender(_shared_docs["to_markdown"]) 

2012 def to_markdown( 

2013 self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs 

2014 ) -> Optional[str]: 

2015 kwargs.setdefault("headers", "keys") 

2016 kwargs.setdefault("tablefmt", "pipe") 

2017 tabulate = import_optional_dependency("tabulate") 

2018 result = tabulate.tabulate(self, **kwargs) 

2019 if buf is None: 

2020 return result 

2021 buf, _, _, _ = get_filepath_or_buffer(buf, mode=mode) 

2022 assert buf is not None # Help mypy. 

2023 buf.writelines(result) 

2024 return None 

2025 
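
# Editor's sketch (hedged; requires the optional ``tabulate`` dependency):
# when ``buf`` is provided the rendered table is written to it and ``None``
# is returned, per the branch above.
#
#   import io
#   import pandas as pd
#   df = pd.DataFrame({"animal_1": ["elk", "pig"]})
#   buf = io.StringIO()
#   df.to_markdown(buf=buf)      # returns None
#   text = buf.getvalue()        # the pipe-format table as a string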

2026 @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") 

2027 def to_parquet( 

2028 self, 

2029 path, 

2030 engine="auto", 

2031 compression="snappy", 

2032 index=None, 

2033 partition_cols=None, 

2034 **kwargs, 

2035 ) -> None: 

2036 """ 

2037 Write a DataFrame to the binary parquet format. 

2038 

2039 .. versionadded:: 0.21.0 

2040 

2041 This function writes the dataframe as a `parquet file 

2042 <https://parquet.apache.org/>`_. You can choose different parquet 

2043 backends, and have the option of compression. See 

2044 :ref:`the user guide <io.parquet>` for more details. 

2045 

2046 Parameters 

2047 ---------- 

2048 path : str 

2049 File path or Root Directory path. Will be used as Root Directory 

2050 path while writing a partitioned dataset. 

2051 

2052 .. versionchanged:: 1.0.0 

2053 

2054 Previously this was "fname" 

2055 

2056 engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' 

2057 Parquet library to use. If 'auto', then the option 

2058 ``io.parquet.engine`` is used. The default ``io.parquet.engine`` 

2059 behavior is to try 'pyarrow', falling back to 'fastparquet' if 

2060 'pyarrow' is unavailable. 

2061 compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy' 

2062 Name of the compression to use. Use ``None`` for no compression. 

2063 index : bool, default None 

2064 If ``True``, include the dataframe's index(es) in the file output. 

2065 If ``False``, they will not be written to the file. 

2066 If ``None``, similar to ``True`` the dataframe's index(es) 

2067 will be saved. However, instead of being saved as values, 

2068 the RangeIndex will be stored as a range in the metadata so it 

2069 doesn't require much space and is faster. Other indexes will 

2070 be included as columns in the file output. 

2071 

2072 .. versionadded:: 0.24.0 

2073 

2074 partition_cols : list, optional, default None 

2075 Column names by which to partition the dataset. 

2076 Columns are partitioned in the order they are given. 

2077 

2078 .. versionadded:: 0.24.0 

2079 

2080 **kwargs 

2081 Additional arguments passed to the parquet library. See 

2082 :ref:`pandas io <io.parquet>` for more details. 

2083 

2084 See Also 

2085 -------- 

2086 read_parquet : Read a parquet file. 

2087 DataFrame.to_csv : Write a csv file. 

2088 DataFrame.to_sql : Write to a sql table. 

2089 DataFrame.to_hdf : Write to hdf. 

2090 

2091 Notes 

2092 ----- 

2093 This function requires either the `fastparquet 

2094 <https://pypi.org/project/fastparquet>`_ or `pyarrow 

2095 <https://arrow.apache.org/docs/python/>`_ library. 

2096 

2097 Examples 

2098 -------- 

2099 >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]}) 

2100 >>> df.to_parquet('df.parquet.gzip', 

2101 ... compression='gzip') # doctest: +SKIP 

2102 >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP 

2103 col1 col2 

2104 0 1 3 

2105 1 2 4 

2106 """ 

2107 from pandas.io.parquet import to_parquet 

2108 

2109 to_parquet( 

2110 self, 

2111 path, 

2112 engine, 

2113 compression=compression, 

2114 index=index, 

2115 partition_cols=partition_cols, 

2116 **kwargs, 

2117 ) 

2118 
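
# Editor's sketch (hedged; requires pyarrow or fastparquet): with
# ``partition_cols`` the ``path`` is treated as a root directory, one
# subdirectory per partition value. Paths are made up:
#
#   import pandas as pd
#   df = pd.DataFrame({"year": [2019, 2020], "value": [1.0, 2.0]})
#   df.to_parquet("dataset_root", partition_cols=["year"])
#   # writes dataset_root/year=2019/... and dataset_root/year=2020/...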

2119 @Substitution( 

2120 header_type="bool", 

2121 header="Whether to print column labels, default True", 

2122 col_space_type="str or int", 

2123 col_space="The minimum width of each column in CSS length " 

2124 "units. An int is assumed to be px units.\n\n" 

2125 " .. versionadded:: 0.25.0\n" 

2126 " Ability to use str", 

2127 ) 

2128 @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) 

2129 def to_html( 

2130 self, 

2131 buf=None, 

2132 columns=None, 

2133 col_space=None, 

2134 header=True, 

2135 index=True, 

2136 na_rep="NaN", 

2137 formatters=None, 

2138 float_format=None, 

2139 sparsify=None, 

2140 index_names=True, 

2141 justify=None, 

2142 max_rows=None, 

2143 max_cols=None, 

2144 show_dimensions=False, 

2145 decimal=".", 

2146 bold_rows=True, 

2147 classes=None, 

2148 escape=True, 

2149 notebook=False, 

2150 border=None, 

2151 table_id=None, 

2152 render_links=False, 

2153 encoding=None, 

2154 ): 

2155 """ 

2156 Render a DataFrame as an HTML table. 

2157 %(shared_params)s 

2158 bold_rows : bool, default True 

2159 Make the row labels bold in the output. 

2160 classes : str or list or tuple, default None 

2161 CSS class(es) to apply to the resulting html table. 

2162 escape : bool, default True 

2163 Convert the characters <, >, and & to HTML-safe sequences. 

2164 notebook : {True, False}, default False 

2165 Whether the generated HTML is for IPython Notebook. 

2166 border : int 

2167 A ``border=border`` attribute is included in the opening 

2168 `<table>` tag. Default ``pd.options.display.html.border``. 

2169 encoding : str, default "utf-8" 

2170 Set character encoding. 

2171 

2172 .. versionadded:: 1.0 

2173 

2174 table_id : str, optional 

2175 A css id is included in the opening `<table>` tag if specified. 

2176 

2177 .. versionadded:: 0.23.0 

2178 

2179 render_links : bool, default False 

2180 Convert URLs to HTML links. 

2181 

2182 .. versionadded:: 0.24.0 

2183 %(returns)s 

2184 See Also 

2185 -------- 

2186 to_string : Convert DataFrame to a string. 

2187 """ 

2188 

2189 if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS: 

2190 raise ValueError("Invalid value for justify parameter") 

2191 

2192 formatter = fmt.DataFrameFormatter( 

2193 self, 

2194 columns=columns, 

2195 col_space=col_space, 

2196 na_rep=na_rep, 

2197 formatters=formatters, 

2198 float_format=float_format, 

2199 sparsify=sparsify, 

2200 justify=justify, 

2201 index_names=index_names, 

2202 header=header, 

2203 index=index, 

2204 bold_rows=bold_rows, 

2205 escape=escape, 

2206 max_rows=max_rows, 

2207 max_cols=max_cols, 

2208 show_dimensions=show_dimensions, 

2209 decimal=decimal, 

2210 table_id=table_id, 

2211 render_links=render_links, 

2212 ) 

2213 # TODO: a generic formatter would be in DataFrameFormatter 

2214 return formatter.to_html( 

2215 buf=buf, 

2216 classes=classes, 

2217 notebook=notebook, 

2218 border=border, 

2219 encoding=encoding, 

2220 ) 

2221 

2222 # ---------------------------------------------------------------------- 

2223 

2224 def info( 

2225 self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None 

2226 ) -> None: 

2227 """ 

2228 Print a concise summary of a DataFrame. 

2229 

2230 This method prints information about a DataFrame including 

2231 the index dtype and column dtypes, non-null values and memory usage. 

2232 

2233 Parameters 

2234 ---------- 

2235 verbose : bool, optional 

2236 Whether to print the full summary. By default, the setting in 

2237 ``pandas.options.display.max_info_columns`` is followed. 

2238 buf : writable buffer, defaults to sys.stdout 

2239 Where to send the output. By default, the output is printed to 

2240 sys.stdout. Pass a writable buffer if you need to further process 

2241 the output. 

2242 max_cols : int, optional 

2243 When to switch from the verbose to the truncated output. If the 

2244 DataFrame has more than `max_cols` columns, the truncated output 

2245 is used. By default, the setting in 

2246 ``pandas.options.display.max_info_columns`` is used. 

2247 memory_usage : bool, str, optional 

2248 Specifies whether total memory usage of the DataFrame 

2249 elements (including the index) should be displayed. By default, 

2250 this follows the ``pandas.options.display.memory_usage`` setting. 

2251 

2252 True always shows memory usage. False never shows memory usage. 

2253 A value of 'deep' is equivalent to "True with deep introspection". 

2254 Memory usage is shown in human-readable units (base-2 

2255 representation). Without deep introspection a memory estimation is 

2256 made based on column dtype and number of rows, assuming values 

2257 consume the same memory amount for corresponding dtypes. With deep 

2258 memory introspection, a real memory usage calculation is performed 

2259 at the cost of computational resources. 

2260 null_counts : bool, optional 

2261 Whether to show the non-null counts. By default, this is shown 

2262 only if the frame is smaller than 

2263 ``pandas.options.display.max_info_rows`` and 

2264 ``pandas.options.display.max_info_columns``. A value of True always 

2265 shows the counts, and False never shows the counts. 

2266 

2267 Returns 

2268 ------- 

2269 None 

2270 This method prints a summary of a DataFrame and returns None. 

2271 

2272 See Also 

2273 -------- 

2274 DataFrame.describe: Generate descriptive statistics of DataFrame 

2275 columns. 

2276 DataFrame.memory_usage: Memory usage of DataFrame columns. 

2277 

2278 Examples 

2279 -------- 

2280 >>> int_values = [1, 2, 3, 4, 5] 

2281 >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] 

2282 >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] 

2283 >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values, 

2284 ... "float_col": float_values}) 

2285 >>> df 

2286 int_col text_col float_col 

2287 0 1 alpha 0.00 

2288 1 2 beta 0.25 

2289 2 3 gamma 0.50 

2290 3 4 delta 0.75 

2291 4 5 epsilon 1.00 

2292 

2293 Prints information of all columns: 

2294 

2295 >>> df.info(verbose=True) 

2296 <class 'pandas.core.frame.DataFrame'> 

2297 RangeIndex: 5 entries, 0 to 4 

2298 Data columns (total 3 columns): 

2299 # Column Non-Null Count Dtype 

2300 --- ------ -------------- ----- 

2301 0 int_col 5 non-null int64 

2302 1 text_col 5 non-null object 

2303 2 float_col 5 non-null float64 

2304 dtypes: float64(1), int64(1), object(1) 

2305 memory usage: 248.0+ bytes 

2306 

2307 Prints a summary of the column count and dtypes but not per-column 

2308 information: 

2309 

2310 >>> df.info(verbose=False) 

2311 <class 'pandas.core.frame.DataFrame'> 

2312 RangeIndex: 5 entries, 0 to 4 

2313 Columns: 3 entries, int_col to float_col 

2314 dtypes: float64(1), int64(1), object(1) 

2315 memory usage: 248.0+ bytes 

2316 

2317 Pipe the output of DataFrame.info to a buffer instead of sys.stdout, get 

2318 the buffer content and write it to a text file: 

2319 

2320 >>> import io 

2321 >>> buffer = io.StringIO() 

2322 >>> df.info(buf=buffer) 

2323 >>> s = buffer.getvalue() 

2324 >>> with open("df_info.txt", "w", 

2325 ... encoding="utf-8") as f: # doctest: +SKIP 

2326 ... f.write(s) 

2327 260 

2328 

2329 The `memory_usage` parameter allows deep introspection mode, especially 

2330 useful for big DataFrames and fine-tuning memory optimization: 

2331 

2332 >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) 

2333 >>> df = pd.DataFrame({ 

2334 ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), 

2335 ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), 

2336 ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) 

2337 ... }) 

2338 >>> df.info() 

2339 <class 'pandas.core.frame.DataFrame'> 

2340 RangeIndex: 1000000 entries, 0 to 999999 

2341 Data columns (total 3 columns): 

2342 # Column Non-Null Count Dtype 

2343 --- ------ -------------- ----- 

2344 0 column_1 1000000 non-null object 

2345 1 column_2 1000000 non-null object 

2346 2 column_3 1000000 non-null object 

2347 dtypes: object(3) 

2348 memory usage: 22.9+ MB 

2349 

2350 >>> df.info(memory_usage='deep') 

2351 <class 'pandas.core.frame.DataFrame'> 

2352 RangeIndex: 1000000 entries, 0 to 999999 

2353 Data columns (total 3 columns): 

2354 # Column Non-Null Count Dtype 

2355 --- ------ -------------- ----- 

2356 0 column_1 1000000 non-null object 

2357 1 column_2 1000000 non-null object 

2358 2 column_3 1000000 non-null object 

2359 dtypes: object(3) 

2360 memory usage: 188.8 MB 

2361 """ 

2362 

2363 if buf is None: # pragma: no cover 

2364 buf = sys.stdout 

2365 

2366 lines = [] 

2367 

2368 lines.append(str(type(self))) 

2369 lines.append(self.index._summary()) 

2370 

2371 if len(self.columns) == 0: 

2372 lines.append(f"Empty {type(self).__name__}") 

2373 fmt.buffer_put_lines(buf, lines) 

2374 return 

2375 

2376 cols = self.columns 

2377 col_count = len(self.columns) 

2378 

2379 # hack 

2380 if max_cols is None: 

2381 max_cols = get_option("display.max_info_columns", len(self.columns) + 1) 

2382 

2383 max_rows = get_option("display.max_info_rows", len(self) + 1) 

2384 

2385 if null_counts is None: 

2386 show_counts = (col_count <= max_cols) and (len(self) < max_rows) 

2387 else: 

2388 show_counts = null_counts 

2389 exceeds_info_cols = col_count > max_cols 

2390 

2391 def _verbose_repr(): 

2392 lines.append(f"Data columns (total {len(self.columns)} columns):") 

2393 

2394 id_head = " # " 

2395 column_head = "Column" 

2396 col_space = 2 

2397 

2398 max_col = max(len(pprint_thing(k)) for k in cols) 

2399 len_column = len(pprint_thing(column_head)) 

2400 space = max(max_col, len_column) + col_space 

2401 

2402 max_id = len(pprint_thing(col_count)) 

2403 len_id = len(pprint_thing(id_head)) 

2404 space_num = max(max_id, len_id) + col_space 

2405 counts = None 

2406 

2407 header = _put_str(id_head, space_num) + _put_str(column_head, space) 

2408 if show_counts: 

2409 counts = self.count() 

2410 if len(cols) != len(counts): # pragma: no cover 

2411 raise AssertionError( 

2412 f"Columns must equal counts ({len(cols)} != {len(counts)})" 

2413 ) 

2414 count_header = "Non-Null Count" 

2415 len_count = len(count_header) 

2416 non_null = " non-null" 

2417 max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) 

2418 space_count = max(len_count, max_count) + col_space 

2419 count_temp = "{count}" + non_null 

2420 else: 

2421 count_header = "" 

2422 space_count = len(count_header) 

2423 len_count = space_count 

2424 count_temp = "{count}" 

2425 

2426 dtype_header = "Dtype" 

2427 len_dtype = len(dtype_header) 

2428 max_dtypes = max(len(pprint_thing(k)) for k in self.dtypes) 

2429 space_dtype = max(len_dtype, max_dtypes) 

2430 header += _put_str(count_header, space_count) + _put_str( 

2431 dtype_header, space_dtype 

2432 ) 

2433 

2434 lines.append(header) 

2435 lines.append( 

2436 _put_str("-" * len_id, space_num) 

2437 + _put_str("-" * len_column, space) 

2438 + _put_str("-" * len_count, space_count) 

2439 + _put_str("-" * len_dtype, space_dtype) 

2440 ) 

2441 

2442 for i, col in enumerate(self.columns): 

2443 dtype = self.dtypes.iloc[i] 

2444 col = pprint_thing(col) 

2445 

2446 line_no = _put_str(f" {i}", space_num) 

2447 count = "" 

2448 if show_counts: 

2449 count = counts.iloc[i] 

2450 

2451 lines.append( 

2452 line_no 

2453 + _put_str(col, space) 

2454 + _put_str(count_temp.format(count=count), space_count) 

2455 + _put_str(dtype, space_dtype) 

2456 ) 

2457 

2458 def _non_verbose_repr(): 

2459 lines.append(self.columns._summary(name="Columns")) 

2460 

2461 def _sizeof_fmt(num, size_qualifier): 

2462 # returns size in human readable format 

2463 for x in ["bytes", "KB", "MB", "GB", "TB"]: 

2464 if num < 1024.0: 

2465 return f"{num:3.1f}{size_qualifier} {x}" 

2466 num /= 1024.0 

2467 return f"{num:3.1f}{size_qualifier} PB" 

2468 

2469 if verbose: 

2470 _verbose_repr() 

2471 elif verbose is False: # specifically set to False, not None 

2472 _non_verbose_repr() 

2473 else: 

2474 if exceeds_info_cols: 

2475 _non_verbose_repr() 

2476 else: 

2477 _verbose_repr() 

2478 

2479 counts = self._data.get_dtype_counts() 

2480 dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] 

2481 lines.append(f"dtypes: {', '.join(dtypes)}") 

2482 

2483 if memory_usage is None: 

2484 memory_usage = get_option("display.memory_usage") 

2485 if memory_usage: 

2486 # append memory usage of df to display 

2487 size_qualifier = "" 

2488 if memory_usage == "deep": 

2489 deep = True 

2490 else: 

2491 # size_qualifier is just a best effort; not guaranteed to catch 

2492 # all cases (e.g., it misses categorical data even with object 

2493 # categories) 

2494 deep = False 

2495 if "object" in counts or self.index._is_memory_usage_qualified(): 

2496 size_qualifier = "+" 

2497 mem_usage = self.memory_usage(index=True, deep=deep).sum() 

2498 lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") 

2499 fmt.buffer_put_lines(buf, lines) 

2500 
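
# Editor's sketch (hedged): the verbose/truncated switch above compares the
# column count against ``max_cols`` when ``verbose`` is left as None.
#
#   import pandas as pd
#   df = pd.DataFrame({f"c{i}": range(3) for i in range(5)})
#   df.info(max_cols=3)                 # 5 > 3 columns -> truncated summary
#   df.info(max_cols=3, verbose=True)   # explicit verbose wins over the switch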

2501 def memory_usage(self, index=True, deep=False) -> Series: 

2502 """ 

2503 Return the memory usage of each column in bytes. 

2504 

2505 The memory usage can optionally include the contribution of 

2506 the index and elements of `object` dtype. 

2507 

2508 This value is displayed in `DataFrame.info` by default. This can be 

2509 suppressed by setting ``pandas.options.display.memory_usage`` to False. 

2510 

2511 Parameters 

2512 ---------- 

2513 index : bool, default True 

2514 Specifies whether to include the memory usage of the DataFrame's 

2515 index in returned Series. If ``index=True``, the memory usage of 

2516 the index is the first item in the output. 

2517 deep : bool, default False 

2518 If True, introspect the data deeply by interrogating 

2519 `object` dtypes for system-level memory consumption, and include 

2520 it in the returned values. 

2521 

2522 Returns 

2523 ------- 

2524 Series 

2525 A Series whose index is the original column names and whose values 

2526 are the memory usage of each column in bytes. 

2527 

2528 See Also 

2529 -------- 

2530 numpy.ndarray.nbytes : Total bytes consumed by the elements of an 

2531 ndarray. 

2532 Series.memory_usage : Bytes consumed by a Series. 

2533 Categorical : Memory-efficient array for string values with 

2534 many repeated values. 

2535 DataFrame.info : Concise summary of a DataFrame. 

2536 

2537 Examples 

2538 -------- 

2539 >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool'] 

2540 >>> data = dict([(t, np.ones(shape=5000).astype(t)) 

2541 ... for t in dtypes]) 

2542 >>> df = pd.DataFrame(data) 

2543 >>> df.head() 

2544 int64 float64 complex128 object bool 

2545 0 1 1.0 1.000000+0.000000j 1 True 

2546 1 1 1.0 1.000000+0.000000j 1 True 

2547 2 1 1.0 1.000000+0.000000j 1 True 

2548 3 1 1.0 1.000000+0.000000j 1 True 

2549 4 1 1.0 1.000000+0.000000j 1 True 

2550 

2551 >>> df.memory_usage() 

2552 Index 128 

2553 int64 40000 

2554 float64 40000 

2555 complex128 80000 

2556 object 40000 

2557 bool 5000 

2558 dtype: int64 

2559 

2560 >>> df.memory_usage(index=False) 

2561 int64 40000 

2562 float64 40000 

2563 complex128 80000 

2564 object 40000 

2565 bool 5000 

2566 dtype: int64 

2567 

2568 The memory footprint of `object` dtype columns is ignored by default: 

2569 

2570 >>> df.memory_usage(deep=True) 

2571 Index 128 

2572 int64 40000 

2573 float64 40000 

2574 complex128 80000 

2575 object 160000 

2576 bool 5000 

2577 dtype: int64 

2578 

2579 Use a Categorical for efficient storage of an object-dtype column with 

2580 many repeated values. 

2581 

2582 >>> df['object'].astype('category').memory_usage(deep=True) 

2583 5216 

2584 """ 

2585 result = Series( 

2586 [c.memory_usage(index=False, deep=deep) for col, c in self.items()], 

2587 index=self.columns, 

2588 ) 

2589 if index: 

2590 result = Series(self.index.memory_usage(deep=deep), index=["Index"]).append( 

2591 result 

2592 ) 

2593 return result 

2594 
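
# Editor's sketch (hedged): ``deep=True`` interrogates object values, so only
# object-dtype columns should differ between the two calls.
#
#   import pandas as pd
#   df = pd.DataFrame({"s": ["ab"] * 1000, "n": range(1000)})
#   shallow = df.memory_usage()         # itemsize-based estimate per column
#   deep = df.memory_usage(deep=True)   # adds the string payloads of "s"
#   assert deep["s"] > shallow["s"] and deep["n"] == shallow["n"]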

2595 def transpose(self, *args, copy: bool = False) -> "DataFrame": 

2596 """ 

2597 Transpose index and columns. 

2598 

2599 Reflect the DataFrame over its main diagonal by writing rows as columns 

2600 and vice-versa. The property :attr:`.T` is an accessor to the method 

2601 :meth:`transpose`. 

2602 

2603 Parameters 

2604 ---------- 

2605 *args : tuple, optional 

2606 Accepted for compatibility with NumPy. 

2607 copy : bool, default False 

2608 Whether to copy the data after transposing, even for DataFrames 

2609 with a single dtype. 

2610 

2611 Note that a copy is always required for mixed dtype DataFrames, 

2612 or for DataFrames with any extension types. 

2613 

2614 Returns 

2615 ------- 

2616 DataFrame 

2617 The transposed DataFrame. 

2618 

2619 See Also 

2620 -------- 

2621 numpy.transpose : Permute the dimensions of a given array. 

2622 

2623 Notes 

2624 ----- 

2625 Transposing a DataFrame with mixed dtypes will result in a homogeneous 

2626 DataFrame with the `object` dtype. In such a case, a copy of the data 

2627 is always made. 

2628 

2629 Examples 

2630 -------- 

2631 **Square DataFrame with homogeneous dtype** 

2632 

2633 >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} 

2634 >>> df1 = pd.DataFrame(data=d1) 

2635 >>> df1 

2636 col1 col2 

2637 0 1 3 

2638 1 2 4 

2639 

2640 >>> df1_transposed = df1.T # or df1.transpose() 

2641 >>> df1_transposed 

2642 0 1 

2643 col1 1 2 

2644 col2 3 4 

2645 

2646 When the dtype is homogeneous in the original DataFrame, we get a 

2647 transposed DataFrame with the same dtype: 

2648 

2649 >>> df1.dtypes 

2650 col1 int64 

2651 col2 int64 

2652 dtype: object 

2653 >>> df1_transposed.dtypes 

2654 0 int64 

2655 1 int64 

2656 dtype: object 

2657 

2658 **Non-square DataFrame with mixed dtypes** 

2659 

2660 >>> d2 = {'name': ['Alice', 'Bob'], 

2661 ... 'score': [9.5, 8], 

2662 ... 'employed': [False, True], 

2663 ... 'kids': [0, 0]} 

2664 >>> df2 = pd.DataFrame(data=d2) 

2665 >>> df2 

2666 name score employed kids 

2667 0 Alice 9.5 False 0 

2668 1 Bob 8.0 True 0 

2669 

2670 >>> df2_transposed = df2.T # or df2.transpose() 

2671 >>> df2_transposed 

2672 0 1 

2673 name Alice Bob 

2674 score 9.5 8 

2675 employed False True 

2676 kids 0 0 

2677 

2678 When the DataFrame has mixed dtypes, we get a transposed DataFrame with 

2679 the `object` dtype: 

2680 

2681 >>> df2.dtypes 

2682 name object 

2683 score float64 

2684 employed bool 

2685 kids int64 

2686 dtype: object 

2687 >>> df2_transposed.dtypes 

2688 0 object 

2689 1 object 

2690 dtype: object 

2691 """ 

2692 nv.validate_transpose(args, dict()) 

2693 # construct the args 

2694 

2695 dtypes = list(self.dtypes) 

2696 if self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0]): 

2697 # We have EAs with the same dtype. We can preserve that dtype in transpose. 

2698 dtype = dtypes[0] 

2699 arr_type = dtype.construct_array_type() 

2700 values = self.values 

2701 

2702 new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values] 

2703 result = self._constructor( 

2704 dict(zip(self.index, new_values)), index=self.columns 

2705 ) 

2706 

2707 else: 

2708 new_values = self.values.T 

2709 if copy: 

2710 new_values = new_values.copy() 

2711 result = self._constructor( 

2712 new_values, index=self.columns, columns=self.index 

2713 ) 

2714 

2715 return result.__finalize__(self) 

2716 

2717 T = property(transpose) 

2718 
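
# Editor's sketch (hedged): the extension-array fast path in transpose()
# preserves a single shared EA dtype instead of falling back to object.
#
#   import pandas as pd
#   df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, dtype="Int64")
#   assert (df.T.dtypes == "Int64").all()   # dtype survives the transpose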

2719 # ---------------------------------------------------------------------- 

2720 # Indexing Methods 

2721 

2722 def _ixs(self, i: int, axis: int = 0): 

2723 """ 

2724 Parameters 

2725 ---------- 

2726 i : int 

2727 axis : int 

2728 

2729 Notes 

2730 ----- 

2731 If slice passed, the resulting data will be a view. 

2732 """ 

2733 # irow 

2734 if axis == 0: 

2735 new_values = self._data.fast_xs(i) 

2736 

2737 # if we are a copy, mark as such 

2738 copy = isinstance(new_values, np.ndarray) and new_values.base is None 

2739 result = self._constructor_sliced( 

2740 new_values, 

2741 index=self.columns, 

2742 name=self.index[i], 

2743 dtype=new_values.dtype, 

2744 ) 

2745 result._set_is_copy(self, copy=copy) 

2746 return result 

2747 

2748 # icol 

2749 else: 

2750 label = self.columns[i] 

2751 

2752 # if the values returned are not the same length 

2753 # as the index (in other words, a not-found value), iget returns 

2754 # a 0-len ndarray. This is effectively catching 

2755 # a numpy error (as numpy should really raise) 

2756 values = self._data.iget(i) 

2757 

2758 if len(self.index) and not len(values): 

2759 values = np.array([np.nan] * len(self.index), dtype=object) 

2760 result = self._box_col_values(values, label) 

2761 

2762 # this is a cached value, mark it so 

2763 result._set_as_cached(label, self) 

2764 

2765 return result 

2766 

2767 def __getitem__(self, key): 

2768 key = lib.item_from_zerodim(key) 

2769 key = com.apply_if_callable(key, self) 

2770 

2771 if is_hashable(key): 

2772 # shortcut if the key is in columns 

2773 if self.columns.is_unique and key in self.columns: 

2774 if self.columns.nlevels > 1: 

2775 return self._getitem_multilevel(key) 

2776 return self._get_item_cache(key) 

2777 

2778 # Do we have a slicer (on rows)? 

2779 indexer = convert_to_index_sliceable(self, key) 

2780 if indexer is not None: 

2781 # either we have a slice or we have a string that can be converted 

2782 # to a slice for partial-string date indexing 

2783 return self._slice(indexer, axis=0) 

2784 

2785 # Do we have a (boolean) DataFrame? 

2786 if isinstance(key, DataFrame): 

2787 return self.where(key) 

2788 

2789 # Do we have a (boolean) 1d indexer? 

2790 if com.is_bool_indexer(key): 

2791 return self._getitem_bool_array(key) 

2792 

2793 # We are left with two options: a single key, and a collection of keys, 

2794 # We interpret tuples as collections only for non-MultiIndex 

2795 is_single_key = isinstance(key, tuple) or not is_list_like(key) 

2796 

2797 if is_single_key: 

2798 if self.columns.nlevels > 1: 

2799 return self._getitem_multilevel(key) 

2800 indexer = self.columns.get_loc(key) 

2801 if is_integer(indexer): 

2802 indexer = [indexer] 

2803 else: 

2804 if is_iterator(key): 

2805 key = list(key) 

2806 indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1] 

2807 

2808 # take() does not accept boolean indexers 

2809 if getattr(indexer, "dtype", None) == bool: 

2810 indexer = np.where(indexer)[0] 

2811 

2812 data = self._take_with_is_copy(indexer, axis=1) 

2813 

2814 if is_single_key: 

2815 # What does looking for a single key in a non-unique index return? 

2816 # The behavior is inconsistent. It returns a Series, except when 

2817 # - the key itself is repeated (test on data.shape, #9519), or 

2818 # - we have a MultiIndex on columns (test on self.columns, #21309) 

2819 if data.shape[1] == 1 and not isinstance(self.columns, ABCMultiIndex): 

2820 data = data[key] 

2821 

2822 return data 

2823 
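
# Editor's sketch (hedged): the __getitem__ dispatch above, in the order the
# key kinds are checked.
#
#   import pandas as pd
#   df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
#   df["a"]            # hashable key in unique columns -> cached Series
#   df[0:2]            # row slicer -> positional slice along axis 0
#   df[df["a"] > 1]    # 1d boolean indexer -> _getitem_bool_array
#   df[["a", "b"]]     # collection of keys -> column take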

2824 def _getitem_bool_array(self, key): 

2825 # also raises Exception if object array with NA values 

2826 # warning here just in case -- previously __setitem__ was 

2827 # reindexing but __getitem__ was not; it seems more reasonable to 

2828 # go with the __setitem__ behavior since that is more consistent 

2829 # with all other indexing behavior 

2830 if isinstance(key, Series) and not key.index.equals(self.index): 

2831 warnings.warn( 

2832 "Boolean Series key will be reindexed to match DataFrame index.", 

2833 UserWarning, 

2834 stacklevel=3, 

2835 ) 

2836 elif len(key) != len(self.index): 

2837 raise ValueError( 

2838 f"Item wrong length {len(key)} instead of {len(self.index)}." 

2839 ) 

2840 

2841 # check_bool_indexer will throw exception if Series key cannot 

2842 # be reindexed to match DataFrame rows 

2843 key = check_bool_indexer(self.index, key) 

2844 indexer = key.nonzero()[0] 

2845 return self._take_with_is_copy(indexer, axis=0) 

2846 

2847 def _getitem_multilevel(self, key): 

2848 # self.columns is a MultiIndex 

2849 loc = self.columns.get_loc(key) 

2850 if isinstance(loc, (slice, Series, np.ndarray, Index)): 

2851 new_columns = self.columns[loc] 

2852 result_columns = maybe_droplevels(new_columns, key) 

2853 if self._is_mixed_type: 

2854 result = self.reindex(columns=new_columns) 

2855 result.columns = result_columns 

2856 else: 

2857 new_values = self.values[:, loc] 

2858 result = self._constructor( 

2859 new_values, index=self.index, columns=result_columns 

2860 ) 

2861 result = result.__finalize__(self) 

2862 

2863 # If there is only one column being returned, and its name is 

2864 # either an empty string, or a tuple with an empty string as its 

2865 # first element, then treat the empty string as a placeholder 

2866 # and return the column as if the user had provided that empty 

2867 # string in the key. If the result is a Series, exclude the 

2868 # implied empty string from its name. 

2869 if len(result.columns) == 1: 

2870 top = result.columns[0] 

2871 if isinstance(top, tuple): 

2872 top = top[0] 

2873 if top == "": 

2874 result = result[""] 

2875 if isinstance(result, Series): 

2876 result = self._constructor_sliced( 

2877 result, index=self.index, name=key 

2878 ) 

2879 

2880 result._set_is_copy(self) 

2881 return result 

2882 else: 

2883 return self._get_item_cache(key) 

2884 

2885 def _get_value(self, index, col, takeable: bool = False): 

2886 """ 

2887 Quickly retrieve single value at passed column and index. 

2888 

2889 Parameters 

2890 ---------- 

2891 index : row label 

2892 col : column label 

2893 takeable : interpret the index/col as indexers, default False 

2894 

2895 Returns 

2896 ------- 

2897 scalar 

2898 """ 

2899 if takeable: 

2900 series = self._iget_item_cache(col) 

2901 return com.maybe_box_datetimelike(series._values[index]) 

2902 

2903 series = self._get_item_cache(col) 

2904 engine = self.index._engine 

2905 

2906 try: 

2907 return engine.get_value(series._values, index) 

2908 except KeyError: 

2909 # GH 20629 

2910 if self.index.nlevels > 1: 

2911 # partial indexing forbidden 

2912 raise 

2913 except (TypeError, ValueError): 

2914 pass 

2915 

2916 # we cannot handle direct indexing 

2917 # use positional 

2918 col = self.columns.get_loc(col) 

2919 index = self.index.get_loc(index) 

2920 return self._get_value(index, col, takeable=True) 

2921 

2922 def __setitem__(self, key, value): 

2923 key = com.apply_if_callable(key, self) 

2924 

2925 # see if we can slice the rows 

2926 indexer = convert_to_index_sliceable(self, key) 

2927 if indexer is not None: 

2928 # either we have a slice or we have a string that can be converted 

2929 # to a slice for partial-string date indexing 

2930 return self._setitem_slice(indexer, value) 

2931 

2932 if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2: 

2933 self._setitem_frame(key, value) 

2934 elif isinstance(key, (Series, np.ndarray, list, Index)): 

2935 self._setitem_array(key, value) 

2936 else: 

2937 # set column 

2938 self._set_item(key, value) 

2939 

2940 def _setitem_slice(self, key, value): 

2941 # NB: we can't just use self.loc[key] = value because that 

2942 # operates on labels and we need to operate positional for 

2943 # backwards-compat, xref GH#31469 

2944 self._check_setitem_copy() 

2945 self.loc._setitem_with_indexer(key, value) 

2946 

2947 def _setitem_array(self, key, value): 

2948 # also raises Exception if object array with NA values 

2949 if com.is_bool_indexer(key): 

2950 if len(key) != len(self.index): 

2951 raise ValueError( 

2952 f"Item wrong length {len(key)} instead of {len(self.index)}!" 

2953 ) 

2954 key = check_bool_indexer(self.index, key) 

2955 indexer = key.nonzero()[0] 

2956 self._check_setitem_copy() 

2957 self.loc._setitem_with_indexer(indexer, value) 

2958 else: 

2959 if isinstance(value, DataFrame): 

2960 if len(value.columns) != len(key): 

2961 raise ValueError("Columns must be same length as key") 

2962 for k1, k2 in zip(key, value.columns): 

2963 self[k1] = value[k2] 

2964 else: 

2965 indexer = self.loc._get_listlike_indexer( 

2966 key, axis=1, raise_missing=False 

2967 )[1] 

2968 self._check_setitem_copy() 

2969 self.loc._setitem_with_indexer((slice(None), indexer), value) 

2970 

2971 def _setitem_frame(self, key, value): 

2972 # support boolean setting with DataFrame input, e.g. 

2973 # df[df > df2] = 0 

2974 if isinstance(key, np.ndarray): 

2975 if key.shape != self.shape: 

2976 raise ValueError("Array conditional must be same shape as self") 

2977 key = self._constructor(key, **self._construct_axes_dict()) 

2978 

2979 if key.values.size and not is_bool_dtype(key.values): 

2980 raise TypeError( 

2981 "Must pass DataFrame or 2-d ndarray with boolean values only" 

2982 ) 

2983 

2984 self._check_inplace_setting(value) 

2985 self._check_setitem_copy() 

2986 self._where(-key, value, inplace=True) 

2987 

2988 def _set_item(self, key, value): 

2989 """ 

2990 Add series to DataFrame in specified column. 

2991 

2992 If the value is a numpy array (not a Series), it must be the 

2993 same length as the DataFrame's index or an error will be raised. 

2994 

2995 A Series will be conformed to the DataFrame's index to 

2996 ensure homogeneity. 

2997 """ 

2998 

2999 self._ensure_valid_index(value) 

3000 value = self._sanitize_column(key, value) 

3001 NDFrame._set_item(self, key, value) 

3002 

3003 # check if we are modifying a copy 

3004 # try to set first as we want an invalid 

3005 # value exception to occur first 

3006 if len(self): 

3007 self._check_setitem_copy() 

3008 

3009 def _set_value(self, index, col, value, takeable: bool = False): 

3010 """ 

3011 Put single value at passed column and index. 

3012 

3013 Parameters 

3014 ---------- 

3015 index : row label 

3016 col : column label 

3017 value : scalar 

3018 takeable : interpret the index/col as indexers, default False 

3019 

3020 Returns 

3021 ------- 

3022 DataFrame 

3023 If label pair is contained, will be reference to calling DataFrame, 

3024 otherwise a new object. 

3025 """ 

3026 try: 

3027 if takeable is True: 

3028 series = self._iget_item_cache(col) 

3029 return series._set_value(index, value, takeable=True) 

3030 

3031 series = self._get_item_cache(col) 

3032 engine = self.index._engine 

3033 engine.set_value(series._values, index, value) 

3034 return self 

3035 except (KeyError, TypeError): 

3036 

3037 # set using a non-recursive method & reset the cache 

3038 if takeable: 

3039 self.iloc[index, col] = value 

3040 else: 

3041 self.loc[index, col] = value 

3042 self._item_cache.pop(col, None) 

3043 

3044 return self 

3045 

3046 def _ensure_valid_index(self, value): 

3047 """ 

3048 Ensure that if we don't have an index, that we can create one from the 

3049 passed value. 

3050 """ 

3051 # GH5632, make sure that we are a Series convertible 

3052 if not len(self.index) and is_list_like(value) and len(value): 

3053 try: 

3054 value = Series(value) 

3055 except (ValueError, NotImplementedError, TypeError): 

3056 raise ValueError( 

3057 "Cannot set a frame with no defined index " 

3058 "and a value that cannot be converted to a " 

3059 "Series" 

3060 ) 

3061 

3062 self._data = self._data.reindex_axis( 

3063 value.index.copy(), axis=1, fill_value=np.nan 

3064 ) 

3065 

3066 def _box_item_values(self, key, values): 

3067 items = self.columns[self.columns.get_loc(key)] 

3068 if values.ndim == 2: 

3069 return self._constructor(values.T, columns=items, index=self.index) 

3070 else: 

3071 return self._box_col_values(values, items) 

3072 

3073 def _box_col_values(self, values, items): 

3074 """ 

3075 Provide boxed values for a column. 

3076 """ 

3077 klass = self._constructor_sliced 

3078 return klass(values, index=self.index, name=items, fastpath=True) 

3079 

3080 # ---------------------------------------------------------------------- 

3081 # Unsorted 

3082 

3083 def query(self, expr, inplace=False, **kwargs): 

3084 """ 

3085 Query the columns of a DataFrame with a boolean expression. 

3086 

3087 Parameters 

3088 ---------- 

3089 expr : str 

3090 The query string to evaluate. 

3091 

3092 You can refer to variables 

3093 in the environment by prefixing them with an '@' character like 

3094 ``@a + b``. 

3095 

3096 You can refer to column names that contain spaces or operators by 

3097 surrounding them in backticks. This way you can also escape 

3098 names that start with a digit, or those that are a Python keyword. 

3099 Basically, use backticks whenever the name is not a valid Python 

3100 identifier. See the Notes section below for more details. 

3101 

3102 For example, if one of your columns is called ``a a`` and you want 

3103 to sum it with ``b``, your query should be ```a a` + b``. 

3104 

3105 .. versionadded:: 0.25.0 

3106 Backtick quoting introduced. 

3107 

3108 .. versionadded:: 1.0.0 

3109 Expanding functionality of backtick quoting for more than only spaces. 

3110 

3111 inplace : bool 

3112 Whether the query should modify the data in place or return 

3113 a modified copy. 

3114 **kwargs 

3115 See the documentation for :func:`eval` for complete details 

3116 on the keyword arguments accepted by :meth:`DataFrame.query`. 

3117 

3118 Returns 

3119 ------- 

3120 DataFrame 

3121 DataFrame resulting from the provided query expression. 

3122 

3123 See Also 

3124 -------- 

3125 eval : Evaluate a string describing operations on 

3126 DataFrame columns. 

3127 DataFrame.eval : Evaluate a string describing operations on 

3128 DataFrame columns. 

3129 

3130 Notes 

3131 ----- 

3132 The result of the evaluation of this expression is first passed to 

3133 :attr:`DataFrame.loc` and if that fails because of a 

3134 multidimensional key (e.g., a DataFrame) then the result will be passed 

3135 to :meth:`DataFrame.__getitem__`. 

3136 

3137 This method uses the top-level :func:`eval` function to 

3138 evaluate the passed query. 

3139 

3140 The :meth:`~pandas.DataFrame.query` method uses a slightly 

3141 modified Python syntax by default. For example, the ``&`` and ``|`` 

3142 (bitwise) operators have the precedence of their boolean cousins, 

3143 :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python; 

3144 however, the semantics are different. 

3145 

3146 You can change the semantics of the expression by passing the keyword 

3147 argument ``parser='python'``. This enforces the same semantics as 

3148 evaluation in Python space. Likewise, you can pass ``engine='python'`` 

3149 to evaluate an expression using Python itself as a backend. This is not 

3150 recommended as it is inefficient compared to using ``numexpr`` as the 

3151 engine. 

3152 

3153 The :attr:`DataFrame.index` and 

3154 :attr:`DataFrame.columns` attributes of the 

3155 :class:`~pandas.DataFrame` instance are placed in the query namespace 

3156 by default, which allows you to treat both the index and columns of the 

3157 frame as a column in the frame. 

3158 The identifier ``index`` is used for the frame index; you can also 

3159 use the name of the index to identify it in a query. Please note that 

3160 Python keywords may not be used as identifiers. 

3161 

3162 For further details and examples see the ``query`` documentation in 

3163 :ref:`indexing <indexing.query>`. 

3164 

3165 *Backtick quoted variables* 

3166 

3167 Backtick quoted variables are parsed as literal Python code and 

3168 are converted internally to a valid Python identifier. 

3169 This can lead to the following problems. 

3170 

3171 During parsing a number of disallowed characters inside the backtick 

3172 quoted string are replaced by strings that are allowed as a Python identifier. 

3173 These characters include all operators in Python, the space character, the 

3174 question mark, the exclamation mark, the dollar sign, and the euro sign. 

3175 For other characters that fall outside the ASCII range (U+0001..U+007F) 

3176 and those that are not further specified in PEP 3131, 

3177 the query parser will raise an error. 

3178 This excludes whitespace other than the space character, 

3179 as well as the hash character (as it is used for comments) and the backtick 

3180 itself (the backtick cannot be escaped). 

3181 

3182 In a special case, quotes that make a pair around a backtick can 

3183 confuse the parser. 

3184 For example, ```it's` > `that's``` will raise an error, 

3185 as it forms a quoted string (``'s > `that'``) with a backtick inside. 

3186 

3187 See also the Python documentation about lexical analysis 

3188 (https://docs.python.org/3/reference/lexical_analysis.html) 

3189 in combination with the source code in :mod:`pandas.core.computation.parsing`. 

3190 

3191 Examples 

3192 -------- 

3193 >>> df = pd.DataFrame({'A': range(1, 6), 

3194 ... 'B': range(10, 0, -2), 

3195 ... 'C C': range(10, 5, -1)}) 

3196 >>> df 

3197 A B C C 

3198 0 1 10 10 

3199 1 2 8 9 

3200 2 3 6 8 

3201 3 4 4 7 

3202 4 5 2 6 

3203 >>> df.query('A > B') 

3204 A B C C 

3205 4 5 2 6 

3206 

3207 The previous expression is equivalent to 

3208 

3209 >>> df[df.A > df.B] 

3210 A B C C 

3211 4 5 2 6 

3212 

3213 For columns with spaces in their name, you can use backtick quoting. 

3214 

3215 >>> df.query('B == `C C`') 

3216 A B C C 

3217 0 1 10 10 

3218 

3219 The previous expression is equivalent to 

3220 

3221 >>> df[df.B == df['C C']] 

3222 A B C C 

3223 0 1 10 10 

3224 """ 

3225 inplace = validate_bool_kwarg(inplace, "inplace") 

3226 if not isinstance(expr, str): 

3227 msg = f"expr must be a string to be evaluated, {type(expr)} given" 

3228 raise ValueError(msg) 

3229 kwargs["level"] = kwargs.pop("level", 0) + 1 

3230 kwargs["target"] = None 

3231 res = self.eval(expr, **kwargs) 

3232 

3233 try: 

3234 new_data = self.loc[res] 

3235 except ValueError: 

3236 # when res is multi-dimensional loc raises, but this is sometimes a 

3237 # valid query 

3238 new_data = self[res] 

3239 

3240 if inplace: 

3241 self._update_inplace(new_data) 

3242 else: 

3243 return new_data 

3244 
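
# Editor's sketch (hedged): backtick quoting and ``@``-prefixed environment
# variables combined in one query. The frame is made up:
#
#   import pandas as pd
#   df = pd.DataFrame({"a a": [1, 2, 3], "b": [3, 2, 1]})
#   threshold = 2
#   df.query("`a a` >= @threshold and b < @threshold")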

3245 def eval(self, expr, inplace=False, **kwargs): 

3246 """ 

3247 Evaluate a string describing operations on DataFrame columns. 

3248 

3249 Operates on columns only, not specific rows or elements. This allows 

3250 `eval` to run arbitrary code, which can make you vulnerable to code 

3251 injection if you pass user input to this function. 

3252 

3253 Parameters 

3254 ---------- 

3255 expr : str 

3256 The expression string to evaluate. 

3257 inplace : bool, default False 

3258 If the expression contains an assignment, whether to perform the 

3259 operation inplace and mutate the existing DataFrame. Otherwise, 

3260 a new DataFrame is returned. 

3261 **kwargs 

3262 See the documentation for :func:`eval` for complete details 

3263 on the keyword arguments accepted by 

3264 :meth:`~pandas.DataFrame.query`. 

3265 

3266 Returns 

3267 ------- 

3268 ndarray, scalar, or pandas object 

3269 The result of the evaluation. 

3270 

3271 See Also 

3272 -------- 

3273 DataFrame.query : Evaluates a boolean expression to query the columns 

3274 of a frame. 

3275 DataFrame.assign : Can evaluate an expression or function to create new 

3276 values for a column. 

3277 eval : Evaluate a Python expression as a string using various 

3278 backends. 

3279 

3280 Notes 

3281 ----- 

3282 For more details see the API documentation for :func:`~eval`. 

3283 For detailed examples see :ref:`enhancing performance with eval 

3284 <enhancingperf.eval>`. 

3285 

3286 Examples 

3287 -------- 

3288 >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) 

3289 >>> df 

3290 A B 

3291 0 1 10 

3292 1 2 8 

3293 2 3 6 

3294 3 4 4 

3295 4 5 2 

3296 >>> df.eval('A + B') 

3297 0 11 

3298 1 10 

3299 2 9 

3300 3 8 

3301 4 7 

3302 dtype: int64 

3303 

3304 Assignment is allowed though by default the original DataFrame is not 

3305 modified. 

3306 

3307 >>> df.eval('C = A + B') 

3308 A B C 

3309 0 1 10 11 

3310 1 2 8 10 

3311 2 3 6 9 

3312 3 4 4 8 

3313 4 5 2 7 

3314 >>> df 

3315 A B 

3316 0 1 10 

3317 1 2 8 

3318 2 3 6 

3319 3 4 4 

3320 4 5 2 

3321 

3322 Use ``inplace=True`` to modify the original DataFrame. 

3323 

3324 >>> df.eval('C = A + B', inplace=True) 

3325 >>> df 

3326 A B C 

3327 0 1 10 11 

3328 1 2 8 10 

3329 2 3 6 9 

3330 3 4 4 8 

3331 4 5 2 7 

3332 """ 

3333 from pandas.core.computation.eval import eval as _eval 

3334 

3335 inplace = validate_bool_kwarg(inplace, "inplace") 

3336 resolvers = kwargs.pop("resolvers", None) 

3337 kwargs["level"] = kwargs.pop("level", 0) + 1 

3338 if resolvers is None: 

3339 index_resolvers = self._get_index_resolvers() 

3340 column_resolvers = self._get_cleaned_column_resolvers() 

3341 resolvers = column_resolvers, index_resolvers 

3342 if "target" not in kwargs: 

3343 kwargs["target"] = self 

3344 kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers) 

3345 

3346 return _eval(expr, inplace=inplace, **kwargs) 

3347 

3348 def select_dtypes(self, include=None, exclude=None) -> "DataFrame": 

3349 """ 

3350 Return a subset of the DataFrame's columns based on the column dtypes. 

3351 

3352 Parameters 

3353 ---------- 

3354 include, exclude : scalar or list-like 

3355 A selection of dtypes or strings to be included/excluded. At least 

3356 one of these parameters must be supplied. 

3357 

3358 Returns 

3359 ------- 

3360 DataFrame 

3361 The subset of the frame including the dtypes in ``include`` and 

3362 excluding the dtypes in ``exclude``. 

3363 

3364 Raises 

3365 ------ 

3366 ValueError 

3367 * If both of ``include`` and ``exclude`` are empty 

3368 * If ``include`` and ``exclude`` have overlapping elements 

3369 * If any kind of string dtype is passed in. 

3370 

3371 Notes 

3372 ----- 

3373 * To select all *numeric* types, use ``np.number`` or ``'number'`` 

3374 * To select strings you must use the ``object`` dtype, but note that 

3375 this will return *all* object dtype columns 

3376 * See the `numpy dtype hierarchy 

3377 <http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html>`__ 

3378 * To select datetimes, use ``np.datetime64``, ``'datetime'`` or 

3379 ``'datetime64'`` 

3380 * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or 

3381 ``'timedelta64'`` 

3382 * To select Pandas categorical dtypes, use ``'category'`` 

3383 * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in 

3384 0.20.0) or ``'datetime64[ns, tz]'`` 

3385 

3386 Examples 

3387 -------- 

3388 >>> df = pd.DataFrame({'a': [1, 2] * 3, 

3389 ... 'b': [True, False] * 3, 

3390 ... 'c': [1.0, 2.0] * 3}) 

3391 >>> df 

3392 a b c 

3393 0 1 True 1.0 

3394 1 2 False 2.0 

3395 2 1 True 1.0 

3396 3 2 False 2.0 

3397 4 1 True 1.0 

3398 5 2 False 2.0 

3399 

3400 >>> df.select_dtypes(include='bool') 

3401 b 

3402 0 True 

3403 1 False 

3404 2 True 

3405 3 False 

3406 4 True 

3407 5 False 

3408 

3409 >>> df.select_dtypes(include=['float64']) 

3410 c 

3411 0 1.0 

3412 1 2.0 

3413 2 1.0 

3414 3 2.0 

3415 4 1.0 

3416 5 2.0 

3417 

3418 >>> df.select_dtypes(exclude=['int']) 

3419 b c 

3420 0 True 1.0 

3421 1 False 2.0 

3422 2 True 1.0 

3423 3 False 2.0 

3424 4 True 1.0 

3425 5 False 2.0 

3426 """ 

3427 

3428 if not is_list_like(include): 

3429 include = (include,) if include is not None else () 

3430 if not is_list_like(exclude): 

3431 exclude = (exclude,) if exclude is not None else () 

3432 

3433 selection = (frozenset(include), frozenset(exclude)) 

3434 

3435 if not any(selection): 

3436 raise ValueError("at least one of include or exclude must be nonempty") 

3437 

3438 # convert the myriad of valid dtype objects to a single representation 

3439 include = frozenset(infer_dtype_from_object(x) for x in include) 

3440 exclude = frozenset(infer_dtype_from_object(x) for x in exclude) 

3441 for dtypes in (include, exclude): 

3442 invalidate_string_dtypes(dtypes) 

3443 

3444 # can't both include AND exclude! 

3445 if not include.isdisjoint(exclude): 

3446 raise ValueError(f"include and exclude overlap on {(include & exclude)}") 

3447 

3448 # We raise when both include and exclude are empty 

3449 # Hence, we can just shrink the columns we want to keep 

3450 keep_these = np.full(self.shape[1], True) 

3451 

3452 def extract_unique_dtypes_from_dtypes_set( 

3453 dtypes_set: FrozenSet[Dtype], unique_dtypes: np.ndarray 

3454 ) -> List[Dtype]: 

3455 extracted_dtypes = [ 

3456 unique_dtype 

3457 for unique_dtype in unique_dtypes 

3458 if issubclass(unique_dtype.type, tuple(dtypes_set)) # type: ignore 

3459 ] 

3460 return extracted_dtypes 

3461 

3462 unique_dtypes = self.dtypes.unique() 

3463 

3464 if include: 

3465 included_dtypes = extract_unique_dtypes_from_dtypes_set( 

3466 include, unique_dtypes 

3467 ) 

3468 keep_these &= self.dtypes.isin(included_dtypes) 

3469 

3470 if exclude: 

3471 excluded_dtypes = extract_unique_dtypes_from_dtypes_set( 

3472 exclude, unique_dtypes 

3473 ) 

3474 keep_these &= ~self.dtypes.isin(excluded_dtypes) 

3475 

3476 return self.iloc[:, keep_these.values] 

3477 

3478 def insert(self, loc, column, value, allow_duplicates=False) -> None: 

3479 """ 

3480 Insert column into DataFrame at specified location. 

3481 

3482 Raises a ValueError if `column` is already contained in the DataFrame, 

3483 unless `allow_duplicates` is set to True. 

3484 

3485 Parameters 

3486 ---------- 

3487 loc : int 

3488 Insertion index. Must satisfy 0 <= loc <= len(columns). 

3489 column : str, number, or hashable object 

3490 Label of the inserted column. 

3491 value : scalar, Series, or array-like 

3492 allow_duplicates : bool, optional 
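
Examples
--------
A minimal illustration (the frame and values below are arbitrary):

>>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
>>> df.insert(1, 'C', [5, 6])
>>> df
   A  C  B
0  1  5  3
1  2  6  4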

3493 """ 

3494 self._ensure_valid_index(value) 

3495 value = self._sanitize_column(column, value, broadcast=False) 

3496 self._data.insert(loc, column, value, allow_duplicates=allow_duplicates) 

3497 

3498 def assign(self, **kwargs) -> "DataFrame": 

3499 r""" 

3500 Assign new columns to a DataFrame. 

3501 

3502 Returns a new object with all original columns in addition to new ones. 

3503 Existing columns that are re-assigned will be overwritten. 

3504 

3505 Parameters 

3506 ---------- 

3507 **kwargs : dict of {str: callable or Series} 

3508 The column names are keywords. If the values are 

3509 callable, they are computed on the DataFrame and 

3510 assigned to the new columns. The callable must not 

3511 change the input DataFrame (though pandas doesn't check this). 

3512 If the values are not callable, (e.g. a Series, scalar, or array), 

3513 they are simply assigned. 

3514 

3515 Returns 

3516 ------- 

3517 DataFrame 

3518 A new DataFrame with the new columns in addition to 

3519 all the existing columns. 

3520 

3521 Notes 

3522 ----- 

3523 Assigning multiple columns within the same ``assign`` is possible. 

3524 Later items in '\*\*kwargs' may refer to newly created or modified 

3525 columns in 'df'; items are computed and assigned into 'df' in order. 

3526 

3527 .. versionchanged:: 0.23.0 

3528 

3529 Keyword argument order is maintained. 

3530 

3531 Examples 

3532 -------- 

3533 >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]}, 

3534 ... index=['Portland', 'Berkeley']) 

3535 >>> df 

3536 temp_c 

3537 Portland 17.0 

3538 Berkeley 25.0 

3539 

3540 Where the value is a callable, evaluated on `df`: 

3541 

3542 >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32) 

3543 temp_c temp_f 

3544 Portland 17.0 62.6 

3545 Berkeley 25.0 77.0 

3546 

3547 Alternatively, the same behavior can be achieved by directly 

3548 referencing an existing Series or sequence: 

3549 

3550 >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32) 

3551 temp_c temp_f 

3552 Portland 17.0 62.6 

3553 Berkeley 25.0 77.0 

3554 

3555 You can create multiple columns within the same assign where one 

3556 of the columns depends on another one defined within the same assign: 

3557 

3558 >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32, 

3559 ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9) 

3560 temp_c temp_f temp_k 

3561 Portland 17.0 62.6 290.15 

3562 Berkeley 25.0 77.0 298.15 

3563 """ 

3564 data = self.copy() 

3565 

3566 for k, v in kwargs.items(): 

3567 data[k] = com.apply_if_callable(v, data) 

3568 return data 

3569 

3570 def _sanitize_column(self, key, value, broadcast=True): 

3571 """ 

3572 Ensures new columns (which go into the BlockManager as new blocks) are 

3573 always copied and converted into an array. 

3574 

3575 Parameters 

3576 ---------- 

3577 key : object 

3578 value : scalar, Series, or array-like 

3579 broadcast : bool, default True 

3580 If ``key`` matches multiple duplicate column names in the 

3581 DataFrame, this parameter indicates whether ``value`` should be 

3582 tiled so that the returned array contains a (duplicated) column for 

3583 each occurrence of the key. If False, ``value`` will not be tiled. 

3584 

3585 Returns 

3586 ------- 

3587 numpy.ndarray 
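
Examples
--------
A sketch of the ``broadcast`` behavior with duplicate column labels
(the frame below is arbitrary; assignment via ``__setitem__`` is
assumed to route through this helper with ``broadcast=True``):

>>> df = pd.DataFrame([[1, 2]], columns=['a', 'a'])
>>> df['a'] = [9]   # the 1-D value is tiled across both 'a' columns
>>> df
   a  a
0  9  9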

3588 """ 

3589 

3590 def reindexer(value): 

3591 # reindex if necessary 

3592 

3593 if value.index.equals(self.index) or not len(self.index): 

3594 value = value._values.copy() 

3595 else: 

3596 

3597 # GH 4107 

3598 try: 

3599 value = value.reindex(self.index)._values 

3600 except ValueError as err: 

3601 # raised in MultiIndex.from_tuples, see test_insert_error_msmgs 

3602 if not value.index.is_unique: 

3603 # duplicate axis 

3604 raise err 

3605 

3606 # other 

3607 raise TypeError( 

3608 "incompatible index of inserted column with frame index" 

3609 ) 

3610 return value 

3611 

3612 if isinstance(value, Series): 

3613 value = reindexer(value) 

3614 

3615 elif isinstance(value, DataFrame): 

3616 # align right-hand-side columns if self.columns 

3617 # is multi-index and self[key] is a sub-frame 

3618 if isinstance(self.columns, ABCMultiIndex) and key in self.columns: 

3619 loc = self.columns.get_loc(key) 

3620 if isinstance(loc, (slice, Series, np.ndarray, Index)): 

3621 cols = maybe_droplevels(self.columns[loc], key) 

3622 if len(cols) and not cols.equals(value.columns): 

3623 value = value.reindex(cols, axis=1) 

3624 # now align rows 

3625 value = reindexer(value).T 

3626 

3627 elif isinstance(value, ExtensionArray): 

3628 # Explicitly copy here, instead of in sanitize_index, 

3629 # as sanitize_index won't copy an EA, even with copy=True 

3630 value = value.copy() 

3631 value = sanitize_index(value, self.index, copy=False) 

3632 

3633 elif isinstance(value, Index) or is_sequence(value): 

3634 

3635 # turn me into an ndarray 

3636 value = sanitize_index(value, self.index, copy=False) 

3637 if not isinstance(value, (np.ndarray, Index)): 

3638 if isinstance(value, list) and len(value) > 0: 

3639 value = maybe_convert_platform(value) 

3640 else: 

3641 value = com.asarray_tuplesafe(value) 

3642 elif value.ndim == 2: 

3643 value = value.copy().T 

3644 elif isinstance(value, Index): 

3645 value = value.copy(deep=True) 

3646 else: 

3647 value = value.copy() 

3648 

3649 # possibly infer to datetimelike 

3650 if is_object_dtype(value.dtype): 

3651 value = maybe_infer_to_datetimelike(value) 

3652 

3653 else: 

3654 # cast ignores pandas dtypes. so save the dtype first 

3655 infer_dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) 

3656 

3657 # upcast 

3658 value = cast_scalar_to_array(len(self.index), value) 

3659 value = maybe_cast_to_datetime(value, infer_dtype) 

3660 

3661 # return internal types directly 

3662 if is_extension_array_dtype(value): 

3663 return value 

3664 

3665 # broadcast across multiple columns if necessary 

3666 if broadcast and key in self.columns and value.ndim == 1: 

3667 if not self.columns.is_unique or isinstance(self.columns, ABCMultiIndex): 

3668 existing_piece = self[key] 

3669 if isinstance(existing_piece, DataFrame): 

3670 value = np.tile(value, (len(existing_piece.columns), 1)) 

3671 

3672 return np.atleast_2d(np.asarray(value)) 

3673 

3674 @property 

3675 def _series(self): 

3676 return { 

3677 item: Series(self._data.iget(idx), index=self.index, name=item) 

3678 for idx, item in enumerate(self.columns) 

3679 } 

3680 

3681 def lookup(self, row_labels, col_labels) -> np.ndarray: 

3682 """ 

3683 Label-based "fancy indexing" function for DataFrame. 

3684 

3685 Given equal-length arrays of row and column labels, return an 

3686 array of the values corresponding to each (row, col) pair. 

3687 

3688 Parameters 

3689 ---------- 

3690 row_labels : sequence 

3691 The row labels to use for lookup. 

3692 col_labels : sequence 

3693 The column labels to use for lookup. 

3694 

3695 Returns 

3696 ------- 

3697 numpy.ndarray 

3698 The found values. 

3699 

3700 Examples 

3701 -------- 

3702 
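A short illustrative example (the frame and labels below are arbitrary):

>>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
...                   index=['a', 'b', 'c'])
>>> df.lookup(['a', 'c'], ['B', 'A'])
array([4, 3])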

3703 """ 

3704 n = len(row_labels) 

3705 if n != len(col_labels): 

3706 raise ValueError("Row labels must have same size as column labels") 

3707 

3708 thresh = 1000 

3709 if not self._is_mixed_type or n > thresh: 

3710 values = self.values 

3711 ridx = self.index.get_indexer(row_labels) 

3712 cidx = self.columns.get_indexer(col_labels) 

3713 if (ridx == -1).any(): 

3714 raise KeyError("One or more row labels was not found") 

3715 if (cidx == -1).any(): 

3716 raise KeyError("One or more column labels was not found") 

3717 flat_index = ridx * len(self.columns) + cidx 

3718 result = values.flat[flat_index] 

3719 else: 

3720 result = np.empty(n, dtype="O") 

3721 for i, (r, c) in enumerate(zip(row_labels, col_labels)): 

3722 result[i] = self._get_value(r, c) 

3723 

3724 if is_object_dtype(result): 

3725 result = lib.maybe_convert_objects(result) 

3726 

3727 return result 

3728 

3729 # ---------------------------------------------------------------------- 

3730 # Reindexing and alignment 

3731 

3732 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy): 

3733 frame = self 

3734 

3735 columns = axes["columns"] 

3736 if columns is not None: 

3737 frame = frame._reindex_columns( 

3738 columns, method, copy, level, fill_value, limit, tolerance 

3739 ) 

3740 

3741 index = axes["index"] 

3742 if index is not None: 

3743 frame = frame._reindex_index( 

3744 index, method, copy, level, fill_value, limit, tolerance 

3745 ) 

3746 

3747 return frame 

3748 

3749 def _reindex_index( 

3750 self, 

3751 new_index, 

3752 method, 

3753 copy, 

3754 level, 

3755 fill_value=np.nan, 

3756 limit=None, 

3757 tolerance=None, 

3758 ): 

3759 new_index, indexer = self.index.reindex( 

3760 new_index, method=method, level=level, limit=limit, tolerance=tolerance 

3761 ) 

3762 return self._reindex_with_indexers( 

3763 {0: [new_index, indexer]}, 

3764 copy=copy, 

3765 fill_value=fill_value, 

3766 allow_dups=False, 

3767 ) 

3768 

3769 def _reindex_columns( 

3770 self, 

3771 new_columns, 

3772 method, 

3773 copy, 

3774 level, 

3775 fill_value=None, 

3776 limit=None, 

3777 tolerance=None, 

3778 ): 

3779 new_columns, indexer = self.columns.reindex( 

3780 new_columns, method=method, level=level, limit=limit, tolerance=tolerance 

3781 ) 

3782 return self._reindex_with_indexers( 

3783 {1: [new_columns, indexer]}, 

3784 copy=copy, 

3785 fill_value=fill_value, 

3786 allow_dups=False, 

3787 ) 

3788 

3789 def _reindex_multi(self, axes, copy, fill_value) -> "DataFrame": 

3790 """ 

3791 We are guaranteed non-Nones in the axes. 

3792 """ 

3793 

3794 new_index, row_indexer = self.index.reindex(axes["index"]) 

3795 new_columns, col_indexer = self.columns.reindex(axes["columns"]) 

3796 

3797 if row_indexer is not None and col_indexer is not None: 

3798 indexer = row_indexer, col_indexer 

3799 new_values = algorithms.take_2d_multi( 

3800 self.values, indexer, fill_value=fill_value 

3801 ) 

3802 return self._constructor(new_values, index=new_index, columns=new_columns) 

3803 else: 

3804 return self._reindex_with_indexers( 

3805 {0: [new_index, row_indexer], 1: [new_columns, col_indexer]}, 

3806 copy=copy, 

3807 fill_value=fill_value, 

3808 ) 

3809 

3810 @Appender(_shared_docs["align"] % _shared_doc_kwargs) 

3811 def align( 

3812 self, 

3813 other, 

3814 join="outer", 

3815 axis=None, 

3816 level=None, 

3817 copy=True, 

3818 fill_value=None, 

3819 method=None, 

3820 limit=None, 

3821 fill_axis=0, 

3822 broadcast_axis=None, 

3823 ) -> "DataFrame": 

3824 return super().align( 

3825 other, 

3826 join=join, 

3827 axis=axis, 

3828 level=level, 

3829 copy=copy, 

3830 fill_value=fill_value, 

3831 method=method, 

3832 limit=limit, 

3833 fill_axis=fill_axis, 

3834 broadcast_axis=broadcast_axis, 

3835 ) 

3836 

3837 @Substitution(**_shared_doc_kwargs) 

3838 @Appender(NDFrame.reindex.__doc__) 

3839 @rewrite_axis_style_signature( 

3840 "labels", 

3841 [ 

3842 ("method", None), 

3843 ("copy", True), 

3844 ("level", None), 

3845 ("fill_value", np.nan), 

3846 ("limit", None), 

3847 ("tolerance", None), 

3848 ], 

3849 ) 

3850 def reindex(self, *args, **kwargs) -> "DataFrame": 

3851 axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex") 

3852 kwargs.update(axes) 

3853 # Pop these, since the values are in `kwargs` under different names 

3854 kwargs.pop("axis", None) 

3855 kwargs.pop("labels", None) 

3856 return super().reindex(**kwargs) 

3857 

3858 def drop( 

3859 self, 

3860 labels=None, 

3861 axis=0, 

3862 index=None, 

3863 columns=None, 

3864 level=None, 

3865 inplace=False, 

3866 errors="raise", 

3867 ): 

3868 """ 

3869 Drop specified labels from rows or columns. 

3870 

3871 Remove rows or columns by specifying label names and corresponding 

3872 axis, or by directly specifying index or column names. When using a 

3873 multi-index, labels on different levels can be removed by specifying 

3874 the level. 

3875 

3876 Parameters 

3877 ---------- 

3878 labels : single label or list-like 

3879 Index or column labels to drop. 

3880 axis : {0 or 'index', 1 or 'columns'}, default 0 

3881 Whether to drop labels from the index (0 or 'index') or 

3882 columns (1 or 'columns'). 

3883 index : single label or list-like 

3884 Alternative to specifying axis (``labels, axis=0`` 

3885 is equivalent to ``index=labels``). 

3886 

3887 .. versionadded:: 0.21.0 

3888 columns : single label or list-like 

3889 Alternative to specifying axis (``labels, axis=1`` 

3890 is equivalent to ``columns=labels``). 

3891 

3892 .. versionadded:: 0.21.0 

3893 level : int or level name, optional 

3894 For MultiIndex, level from which the labels will be removed. 

3895 inplace : bool, default False 

3896 If True, do operation inplace and return None. 

3897 errors : {'ignore', 'raise'}, default 'raise' 

3898 If 'ignore', suppress the error and drop only the labels that 

3899 exist. 

3900 

3901 Returns 

3902 ------- 

3903 DataFrame 

3904 DataFrame without the removed index or column labels. 

3905 

3906 Raises 

3907 ------ 

3908 KeyError 

3909 If any of the labels is not found in the selected axis. 

3910 

3911 See Also 

3912 -------- 

3913 DataFrame.loc : Label-location based indexer for selection by label. 

3914 DataFrame.dropna : Return DataFrame with labels on given axis omitted 

3915 where (all or any) data are missing. 

3916 DataFrame.drop_duplicates : Return DataFrame with duplicate rows 

3917 removed, optionally only considering certain columns. 

3918 Series.drop : Return Series with specified index labels removed. 

3919 

3920 Examples 

3921 -------- 

3922 >>> df = pd.DataFrame(np.arange(12).reshape(3, 4), 

3923 ... columns=['A', 'B', 'C', 'D']) 

3924 >>> df 

3925 A B C D 

3926 0 0 1 2 3 

3927 1 4 5 6 7 

3928 2 8 9 10 11 

3929 

3930 Drop columns 

3931 

3932 >>> df.drop(['B', 'C'], axis=1) 

3933 A D 

3934 0 0 3 

3935 1 4 7 

3936 2 8 11 

3937 

3938 >>> df.drop(columns=['B', 'C']) 

3939 A D 

3940 0 0 3 

3941 1 4 7 

3942 2 8 11 

3943 

3944 Drop a row by index 

3945 

3946 >>> df.drop([0, 1]) 

3947 A B C D 

3948 2 8 9 10 11 

3949 

3950 Drop columns and/or rows of MultiIndex DataFrame 

3951 

3952 >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'], 

3953 ... ['speed', 'weight', 'length']], 

3954 ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], 

3955 ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) 

3956 >>> df = pd.DataFrame(index=midx, columns=['big', 'small'], 

3957 ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], 

3958 ... [250, 150], [1.5, 0.8], [320, 250], 

3959 ... [1, 0.8], [0.3, 0.2]]) 

3960 >>> df 

3961 big small 

3962 lama speed 45.0 30.0 

3963 weight 200.0 100.0 

3964 length 1.5 1.0 

3965 cow speed 30.0 20.0 

3966 weight 250.0 150.0 

3967 length 1.5 0.8 

3968 falcon speed 320.0 250.0 

3969 weight 1.0 0.8 

3970 length 0.3 0.2 

3971 

3972 >>> df.drop(index='cow', columns='small') 

3973 big 

3974 lama speed 45.0 

3975 weight 200.0 

3976 length 1.5 

3977 falcon speed 320.0 

3978 weight 1.0 

3979 length 0.3 

3980 

3981 >>> df.drop(index='length', level=1) 

3982 big small 

3983 lama speed 45.0 30.0 

3984 weight 200.0 100.0 

3985 cow speed 30.0 20.0 

3986 weight 250.0 150.0 

3987 falcon speed 320.0 250.0 

3988 weight 1.0 0.8 

3989 """ 

3990 return super().drop( 

3991 labels=labels, 

3992 axis=axis, 

3993 index=index, 

3994 columns=columns, 

3995 level=level, 

3996 inplace=inplace, 

3997 errors=errors, 

3998 ) 

3999 

4000 @rewrite_axis_style_signature( 

4001 "mapper", 

4002 [("copy", True), ("inplace", False), ("level", None), ("errors", "ignore")], 

4003 ) 

4004 def rename( 

4005 self, 

4006 mapper: Optional[Renamer] = None, 

4007 *, 

4008 index: Optional[Renamer] = None, 

4009 columns: Optional[Renamer] = None, 

4010 axis: Optional[Axis] = None, 

4011 copy: bool = True, 

4012 inplace: bool = False, 

4013 level: Optional[Level] = None, 

4014 errors: str = "ignore", 

4015 ) -> Optional["DataFrame"]: 

4016 

4017 """ 

4018 Alter axes labels. 

4019 

4020 Function / dict values must be unique (1-to-1). Labels not contained in 

4021 a dict / Series will be left as-is. Extra labels listed don't raise an 

4022 error. 

4023 

4024 See the :ref:`user guide <basics.rename>` for more. 

4025 

4026 Parameters 

4027 ---------- 

4028 mapper : dict-like or function 

4029 Dict-like or functions transformations to apply to 

4030 that axis' values. Use either ``mapper`` and ``axis`` to 

4031 specify the axis to target with ``mapper``, or ``index`` and 

4032 ``columns``. 

4033 index : dict-like or function 

4034 Alternative to specifying axis (``mapper, axis=0`` 

4035 is equivalent to ``index=mapper``). 

4036 columns : dict-like or function 

4037 Alternative to specifying axis (``mapper, axis=1`` 

4038 is equivalent to ``columns=mapper``). 

4039 axis : int or str 

4040 Axis to target with ``mapper``. Can be either the axis name 

4041 ('index', 'columns') or number (0, 1). The default is 'index'. 

4042 copy : bool, default True 

4043 Also copy underlying data. 

4044 inplace : bool, default False 

4045 Whether to modify the DataFrame in place rather than returning a 

4046 new one. If True, the value of ``copy`` is ignored. 

4047 level : int or level name, default None 

4048 In case of a MultiIndex, only rename labels in the specified 

4049 level. 

4050 errors : {'ignore', 'raise'}, default 'ignore' 

4051 If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`, 

4052 or `columns` contains labels that are not present in the Index 

4053 being transformed. 

4054 If 'ignore', existing keys will be renamed and extra keys will be 

4055 ignored. 

4056 

4057 Returns 

4058 ------- 

4059 DataFrame 

4060 DataFrame with the renamed axis labels. 

4061 

4062 Raises 

4063 ------ 

4064 KeyError 

4065 If any of the labels is not found in the selected axis and 

4066 "errors='raise'". 

4067 

4068 See Also 

4069 -------- 

4070 DataFrame.rename_axis : Set the name of the axis. 

4071 

4072 Examples 

4073 -------- 

4074 

4075 ``DataFrame.rename`` supports two calling conventions 

4076 

4077 * ``(index=index_mapper, columns=columns_mapper, ...)`` 

4078 * ``(mapper, axis={'index', 'columns'}, ...)`` 

4079 

4080 We *highly* recommend using keyword arguments to clarify your 

4081 intent. 

4082 

4083 Rename columns using a mapping: 

4084 

4085 >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) 

4086 >>> df.rename(columns={"A": "a", "B": "c"}) 

4087 a c 

4088 0 1 4 

4089 1 2 5 

4090 2 3 6 

4091 

4092 Rename index using a mapping: 

4093 

4094 >>> df.rename(index={0: "x", 1: "y", 2: "z"}) 

4095 A B 

4096 x 1 4 

4097 y 2 5 

4098 z 3 6 

4099 

4100 Cast index labels to a different type: 

4101 

4102 >>> df.index 

4103 RangeIndex(start=0, stop=3, step=1) 

4104 >>> df.rename(index=str).index 

4105 Index(['0', '1', '2'], dtype='object') 

4106 

4107 >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise") 

4108 Traceback (most recent call last): 

4109 KeyError: ['C'] not found in axis 

4110 

4111 Using axis-style parameters 

4112 

4113 >>> df.rename(str.lower, axis='columns') 

4114 a b 

4115 0 1 4 

4116 1 2 5 

4117 2 3 6 

4118 

4119 >>> df.rename({1: 2, 2: 4}, axis='index') 

4120 A B 

4121 0 1 4 

4122 2 2 5 

4123 4 3 6 

4124 """ 

4125 return super().rename( 

4126 mapper=mapper, 

4127 index=index, 

4128 columns=columns, 

4129 axis=axis, 

4130 copy=copy, 

4131 inplace=inplace, 

4132 level=level, 

4133 errors=errors, 

4134 ) 

4135 

4136 @Substitution(**_shared_doc_kwargs) 

4137 @Appender(NDFrame.fillna.__doc__) 

4138 def fillna( 

4139 self, 

4140 value=None, 

4141 method=None, 

4142 axis=None, 

4143 inplace=False, 

4144 limit=None, 

4145 downcast=None, 

4146 ) -> Optional["DataFrame"]: 

4147 return super().fillna( 

4148 value=value, 

4149 method=method, 

4150 axis=axis, 

4151 inplace=inplace, 

4152 limit=limit, 

4153 downcast=downcast, 

4154 ) 

4155 

4156 @Appender(_shared_docs["replace"] % _shared_doc_kwargs) 

4157 def replace( 

4158 self, 

4159 to_replace=None, 

4160 value=None, 

4161 inplace=False, 

4162 limit=None, 

4163 regex=False, 

4164 method="pad", 

4165 ): 

4166 return super().replace( 

4167 to_replace=to_replace, 

4168 value=value, 

4169 inplace=inplace, 

4170 limit=limit, 

4171 regex=regex, 

4172 method=method, 

4173 ) 

4174 

4175 @Appender(_shared_docs["shift"] % _shared_doc_kwargs) 

4176 def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "DataFrame": 

4177 return super().shift( 

4178 periods=periods, freq=freq, axis=axis, fill_value=fill_value 

4179 ) 

4180 

4181 def set_index( 

4182 self, keys, drop=True, append=False, inplace=False, verify_integrity=False 

4183 ): 

4184 """ 

4185 Set the DataFrame index using existing columns. 

4186 

4187 Set the DataFrame index (row labels) using one or more existing 

4188 columns or arrays (of the correct length). The index can replace the 

4189 existing index or expand on it. 

4190 

4191 Parameters 

4192 ---------- 

4193 keys : label or array-like or list of labels/arrays 

4194 This parameter can be either a single column key, a single array of 

4195 the same length as the calling DataFrame, or a list containing an 

4196 arbitrary combination of column keys and arrays. Here, "array" 

4197 encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and 

4198 instances of :class:`~collections.abc.Iterator`. 

4199 drop : bool, default True 

4200 Delete columns to be used as the new index. 

4201 append : bool, default False 

4202 Whether to append columns to existing index. 

4203 inplace : bool, default False 

4204 Modify the DataFrame in place (do not create a new object). 

4205 verify_integrity : bool, default False 

4206 Check the new index for duplicates. Otherwise defer the check until 

4207 necessary. Setting to False will improve the performance of this 

4208 method. 

4209 

4210 Returns 

4211 ------- 

4212 DataFrame 

4213 Changed row labels. 

4214 

4215 See Also 

4216 -------- 

4217 DataFrame.reset_index : Opposite of set_index. 

4218 DataFrame.reindex : Change to new indices or expand indices. 

4219 DataFrame.reindex_like : Change to same indices as other DataFrame. 

4220 

4221 Examples 

4222 -------- 

4223 >>> df = pd.DataFrame({'month': [1, 4, 7, 10], 

4224 ... 'year': [2012, 2014, 2013, 2014], 

4225 ... 'sale': [55, 40, 84, 31]}) 

4226 >>> df 

4227 month year sale 

4228 0 1 2012 55 

4229 1 4 2014 40 

4230 2 7 2013 84 

4231 3 10 2014 31 

4232 

4233 Set the index to become the 'month' column: 

4234 

4235 >>> df.set_index('month') 

4236 year sale 

4237 month 

4238 1 2012 55 

4239 4 2014 40 

4240 7 2013 84 

4241 10 2014 31 

4242 

4243 Create a MultiIndex using columns 'year' and 'month': 

4244 

4245 >>> df.set_index(['year', 'month']) 

4246 sale 

4247 year month 

4248 2012 1 55 

4249 2014 4 40 

4250 2013 7 84 

4251 2014 10 31 

4252 

4253 Create a MultiIndex using an Index and a column: 

4254 

4255 >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year']) 

4256 month sale 

4257 year 

4258 1 2012 1 55 

4259 2 2014 4 40 

4260 3 2013 7 84 

4261 4 2014 10 31 

4262 

4263 Create a MultiIndex using two Series: 

4264 

4265 >>> s = pd.Series([1, 2, 3, 4]) 

4266 >>> df.set_index([s, s**2]) 

4267 month year sale 

4268 1 1 1 2012 55 

4269 2 4 4 2014 40 

4270 3 9 7 2013 84 

4271 4 16 10 2014 31 

4272 """ 

4273 inplace = validate_bool_kwarg(inplace, "inplace") 

4274 if not isinstance(keys, list): 

4275 keys = [keys] 

4276 

4277 err_msg = ( 

4278 'The parameter "keys" may be a column key, one-dimensional ' 

4279 "array, or a list containing only valid column keys and " 

4280 "one-dimensional arrays." 

4281 ) 

4282 

4283 missing: List[Optional[Hashable]] = [] 

4284 for col in keys: 

4285 if isinstance( 

4286 col, (ABCIndexClass, ABCSeries, np.ndarray, list, abc.Iterator) 

4287 ): 

4288 # arrays are fine as long as they are one-dimensional 

4289 # iterators get converted to list below 

4290 if getattr(col, "ndim", 1) != 1: 

4291 raise ValueError(err_msg) 

4292 else: 

4293 # everything else gets tried as a key; see GH 24969 

4294 try: 

4295 found = col in self.columns 

4296 except TypeError: 

4297 raise TypeError(f"{err_msg}. Received column of type {type(col)}") 

4298 else: 

4299 if not found: 

4300 missing.append(col) 

4301 

4302 if missing: 

4303 raise KeyError(f"None of {missing} are in the columns") 

4304 

4305 if inplace: 

4306 frame = self 

4307 else: 

4308 frame = self.copy() 

4309 

4310 arrays = [] 

4311 names = [] 

4312 if append: 

4313 names = list(self.index.names) 

4314 if isinstance(self.index, ABCMultiIndex): 

4315 for i in range(self.index.nlevels): 

4316 arrays.append(self.index._get_level_values(i)) 

4317 else: 

4318 arrays.append(self.index) 

4319 

4320 to_remove: List[Optional[Hashable]] = [] 

4321 for col in keys: 

4322 if isinstance(col, ABCMultiIndex): 

4323 for n in range(col.nlevels): 

4324 arrays.append(col._get_level_values(n)) 

4325 names.extend(col.names) 

4326 elif isinstance(col, (ABCIndexClass, ABCSeries)): 

4327 # if Index then not MultiIndex (treated above) 

4328 arrays.append(col) 

4329 names.append(col.name) 

4330 elif isinstance(col, (list, np.ndarray)): 

4331 arrays.append(col) 

4332 names.append(None) 

4333 elif isinstance(col, abc.Iterator): 

4334 arrays.append(list(col)) 

4335 names.append(None) 

4336 # from here, col can only be a column label 

4337 else: 

4338 arrays.append(frame[col]._values) 

4339 names.append(col) 

4340 if drop: 

4341 to_remove.append(col) 

4342 

4343 if len(arrays[-1]) != len(self): 

4344 # check newest element against length of calling frame, since 

4345 # ensure_index_from_sequences would not raise for append=False. 

4346 raise ValueError( 

4347 f"Length mismatch: Expected {len(self)} rows, " 

4348 f"received array of length {len(arrays[-1])}" 

4349 ) 

4350 

4351 index = ensure_index_from_sequences(arrays, names) 

4352 

4353 if verify_integrity and not index.is_unique: 

4354 duplicates = index[index.duplicated()].unique() 

4355 raise ValueError(f"Index has duplicate keys: {duplicates}") 

4356 

4357 # use set to handle duplicate column names gracefully in case of drop 

4358 for c in set(to_remove): 

4359 del frame[c] 

4360 

4361 # clear up memory usage 

4362 index._cleanup() 

4363 

4364 frame.index = index 

4365 

4366 if not inplace: 

4367 return frame 

4368 

4369 def reset_index( 

4370 self, 

4371 level: Optional[Union[Hashable, Sequence[Hashable]]] = None, 

4372 drop: bool = False, 

4373 inplace: bool = False, 

4374 col_level: Hashable = 0, 

4375 col_fill: Optional[Hashable] = "", 

4376 ) -> Optional["DataFrame"]: 

4377 """ 

4378 Reset the index, or a level of it. 

4379 

4380 Reset the index of the DataFrame, and use the default one instead. 

4381 If the DataFrame has a MultiIndex, this method can remove one or more 

4382 levels. 

4383 

4384 Parameters 

4385 ---------- 

4386 level : int, str, tuple, or list, default None 

4387 Only remove the given levels from the index. Removes all levels by 

4388 default. 

4389 drop : bool, default False 

4390 Do not try to insert index into dataframe columns. This resets 

4391 the index to the default integer index. 

4392 inplace : bool, default False 

4393 Modify the DataFrame in place (do not create a new object). 

4394 col_level : int or str, default 0 

4395 If the columns have multiple levels, determines which level the 

4396 labels are inserted into. By default it is inserted into the first 

4397 level. 

4398 col_fill : object, default '' 

4399 If the columns have multiple levels, determines how the other 

4400 levels are named. If None then the index name is repeated. 

4401 

4402 Returns 

4403 ------- 

4404 DataFrame or None 

4405 DataFrame with the new index or None if ``inplace=True``. 

4406 

4407 See Also 

4408 -------- 

4409 DataFrame.set_index : Opposite of reset_index. 

4410 DataFrame.reindex : Change to new indices or expand indices. 

4411 DataFrame.reindex_like : Change to same indices as other DataFrame. 

4412 

4413 Examples 

4414 -------- 

4415 >>> df = pd.DataFrame([('bird', 389.0), 

4416 ... ('bird', 24.0), 

4417 ... ('mammal', 80.5), 

4418 ... ('mammal', np.nan)], 

4419 ... index=['falcon', 'parrot', 'lion', 'monkey'], 

4420 ... columns=('class', 'max_speed')) 

4421 >>> df 

4422 class max_speed 

4423 falcon bird 389.0 

4424 parrot bird 24.0 

4425 lion mammal 80.5 

4426 monkey mammal NaN 

4427 

4428 When we reset the index, the old index is added as a column, and a 

4429 new sequential index is used: 

4430 

4431 >>> df.reset_index() 

4432 index class max_speed 

4433 0 falcon bird 389.0 

4434 1 parrot bird 24.0 

4435 2 lion mammal 80.5 

4436 3 monkey mammal NaN 

4437 

4438 We can use the `drop` parameter to avoid the old index being added as 

4439 a column: 

4440 

4441 >>> df.reset_index(drop=True) 

4442 class max_speed 

4443 0 bird 389.0 

4444 1 bird 24.0 

4445 2 mammal 80.5 

4446 3 mammal NaN 

4447 

4448 You can also use `reset_index` with `MultiIndex`. 

4449 

4450 >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), 

4451 ... ('bird', 'parrot'), 

4452 ... ('mammal', 'lion'), 

4453 ... ('mammal', 'monkey')], 

4454 ... names=['class', 'name']) 

4455 >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'), 

4456 ... ('species', 'type')]) 

4457 >>> df = pd.DataFrame([(389.0, 'fly'), 

4458 ... ( 24.0, 'fly'), 

4459 ... ( 80.5, 'run'), 

4460 ... (np.nan, 'jump')], 

4461 ... index=index, 

4462 ... columns=columns) 

4463 >>> df 

4464 speed species 

4465 max type 

4466 class name 

4467 bird falcon 389.0 fly 

4468 parrot 24.0 fly 

4469 mammal lion 80.5 run 

4470 monkey NaN jump 

4471 

4472 If the index has multiple levels, we can reset a subset of them: 

4473 

4474 >>> df.reset_index(level='class') 

4475 class speed species 

4476 max type 

4477 name 

4478 falcon bird 389.0 fly 

4479 parrot bird 24.0 fly 

4480 lion mammal 80.5 run 

4481 monkey mammal NaN jump 

4482 

4483 If we are not dropping the index, by default, it is placed in the top 

4484 level. We can place it in another level: 

4485 

4486 >>> df.reset_index(level='class', col_level=1) 

4487 speed species 

4488 class max type 

4489 name 

4490 falcon bird 389.0 fly 

4491 parrot bird 24.0 fly 

4492 lion mammal 80.5 run 

4493 monkey mammal NaN jump 

4494 

4495 When the index is inserted under another level, we can specify under 

4496 which one with the parameter `col_fill`: 

4497 

4498 >>> df.reset_index(level='class', col_level=1, col_fill='species') 

4499 species speed species 

4500 class max type 

4501 name 

4502 falcon bird 389.0 fly 

4503 parrot bird 24.0 fly 

4504 lion mammal 80.5 run 

4505 monkey mammal NaN jump 

4506 

4507 If we specify a nonexistent level for `col_fill`, it is created: 

4508 

4509 >>> df.reset_index(level='class', col_level=1, col_fill='genus') 

4510 genus speed species 

4511 class max type 

4512 name 

4513 falcon bird 389.0 fly 

4514 parrot bird 24.0 fly 

4515 lion mammal 80.5 run 

4516 monkey mammal NaN jump 

4517 """ 

4518 inplace = validate_bool_kwarg(inplace, "inplace") 

4519 if inplace: 

4520 new_obj = self 

4521 else: 

4522 new_obj = self.copy() 

4523 

4524 def _maybe_casted_values(index, labels=None): 

4525 values = index._values 

4526 if not isinstance(index, (PeriodIndex, DatetimeIndex)): 

4527 if values.dtype == np.object_: 

4528 values = lib.maybe_convert_objects(values) 

4529 

4530 # if we have the labels, extract the values with a mask 

4531 if labels is not None: 

4532 mask = labels == -1 

4533 

4534 # we can have situations where every entry of labels is -1, 

4535 # meaning nothing was found in labels, so make everything NaN 

4536 if mask.all(): 

4537 values = np.empty(len(mask)) 

4538 values.fill(np.nan) 

4539 else: 

4540 values = values.take(labels) 

4541 

4542 # TODO(https://github.com/pandas-dev/pandas/issues/24206) 

4543 # Push this into maybe_upcast_putmask? 

4544 # We can't pass EAs there right now. Looks a bit 

4545 # complicated. 

4546 # So we unbox the ndarray_values, op, re-box. 

4547 values_type = type(values) 

4548 values_dtype = values.dtype 

4549 

4550 if issubclass(values_type, DatetimeLikeArray): 

4551 values = values._data 

4552 

4553 if mask.any(): 

4554 values, _ = maybe_upcast_putmask(values, mask, np.nan) 

4555 

4556 if issubclass(values_type, DatetimeLikeArray): 

4557 values = values_type(values, dtype=values_dtype) 

4558 

4559 return values 

4560 

4561 new_index = ibase.default_index(len(new_obj)) 

4562 if level is not None: 

4563 if not isinstance(level, (tuple, list)): 

4564 level = [level] 

4565 level = [self.index._get_level_number(lev) for lev in level] 

4566 if len(level) < self.index.nlevels: 

4567 new_index = self.index.droplevel(level) 

4568 

4569 if not drop: 

4570 to_insert: Iterable[Tuple[Any, Optional[Any]]] 

4571 if isinstance(self.index, ABCMultiIndex): 

4572 names = [ 

4573 (n if n is not None else f"level_{i}") 

4574 for i, n in enumerate(self.index.names) 

4575 ] 

4576 to_insert = zip(self.index.levels, self.index.codes) 

4577 else: 

4578 default = "index" if "index" not in self else "level_0" 

4579 names = [default] if self.index.name is None else [self.index.name] 

4580 to_insert = ((self.index, None),) 

4581 

4582 multi_col = isinstance(self.columns, ABCMultiIndex) 

4583 for i, (lev, lab) in reversed(list(enumerate(to_insert))): 

4584 if not (level is None or i in level): 

4585 continue 

4586 name = names[i] 

4587 if multi_col: 

4588 col_name = list(name) if isinstance(name, tuple) else [name] 

4589 if col_fill is None: 

4590 if len(col_name) not in (1, self.columns.nlevels): 

4591 raise ValueError( 

4592 "col_fill=None is incompatible " 

4593 f"with incomplete column name {name}" 

4594 ) 

4595 col_fill = col_name[0] 

4596 

4597 lev_num = self.columns._get_level_number(col_level) 

4598 name_lst = [col_fill] * lev_num + col_name 

4599 missing = self.columns.nlevels - len(name_lst) 

4600 name_lst += [col_fill] * missing 

4601 name = tuple(name_lst) 

4602 # to ndarray and maybe infer different dtype 

4603 level_values = _maybe_casted_values(lev, lab) 

4604 new_obj.insert(0, name, level_values) 

4605 

4606 new_obj.index = new_index 

4607 if not inplace: 

4608 return new_obj 

4609 

4610 return None 

4611 

4612 # ---------------------------------------------------------------------- 

4613 # Reindex-based selection methods 

4614 

4615 @Appender(_shared_docs["isna"] % _shared_doc_kwargs) 

4616 def isna(self) -> "DataFrame": 

4617 return super().isna() 

4618 

4619 @Appender(_shared_docs["isna"] % _shared_doc_kwargs) 

4620 def isnull(self) -> "DataFrame": 

4621 return super().isnull() 

4622 

4623 @Appender(_shared_docs["notna"] % _shared_doc_kwargs) 

4624 def notna(self) -> "DataFrame": 

4625 return super().notna() 

4626 

4627 @Appender(_shared_docs["notna"] % _shared_doc_kwargs) 

4628 def notnull(self) -> "DataFrame": 

4629 return super().notnull() 

4630 

4631 def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): 

4632 """ 

4633 Remove missing values. 

4634 

4635 See the :ref:`User Guide <missing_data>` for more on which values are 

4636 considered missing, and how to work with missing data. 

4637 

4638 Parameters 

4639 ---------- 

4640 axis : {0 or 'index', 1 or 'columns'}, default 0 

4641 Determine if rows or columns which contain missing values are 

4642 removed. 

4643 

4644 * 0, or 'index' : Drop rows which contain missing values. 

4645 * 1, or 'columns' : Drop columns which contain missing values. 

4646 

4647 .. versionchanged:: 1.0.0 

4648 

4649 Passing a tuple or list to drop on multiple axes is 

4650 no longer supported; only a single axis is allowed. 

4651 

4652 how : {'any', 'all'}, default 'any' 

4653 Determine if row or column is removed from DataFrame, when we have 

4654 at least one NA or all NA. 

4655 

4656 * 'any' : If any NA values are present, drop that row or column. 

4657 * 'all' : If all values are NA, drop that row or column. 

4658 

4659 thresh : int, optional 

4660 Require that many non-NA values. 

4661 subset : array-like, optional 

4662 Labels along other axis to consider, e.g. if you are dropping rows 

4663 these would be a list of columns to include. 

4664 inplace : bool, default False 

4665 If True, do operation inplace and return None. 

4666 

4667 Returns 

4668 ------- 

4669 DataFrame 

4670 DataFrame with NA entries dropped from it. 

4671 

4672 See Also 

4673 -------- 

4674 DataFrame.isna: Indicate missing values. 

4675 DataFrame.notna : Indicate existing (non-missing) values. 

4676 DataFrame.fillna : Replace missing values. 

4677 Series.dropna : Drop missing values. 

4678 Index.dropna : Drop missing indices. 

4679 

4680 Examples 

4681 -------- 

4682 >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], 

4683 ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], 

4684 ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), 

4685 ... pd.NaT]}) 

4686 >>> df 

4687 name toy born 

4688 0 Alfred NaN NaT 

4689 1 Batman Batmobile 1940-04-25 

4690 2 Catwoman Bullwhip NaT 

4691 

4692 Drop the rows where at least one element is missing. 

4693 

4694 >>> df.dropna() 

4695 name toy born 

4696 1 Batman Batmobile 1940-04-25 

4697 

4698 Drop the columns where at least one element is missing. 

4699 

4700 >>> df.dropna(axis='columns') 

4701 name 

4702 0 Alfred 

4703 1 Batman 

4704 2 Catwoman 

4705 

4706 Drop the rows where all elements are missing. 

4707 

4708 >>> df.dropna(how='all') 

4709 name toy born 

4710 0 Alfred NaN NaT 

4711 1 Batman Batmobile 1940-04-25 

4712 2 Catwoman Bullwhip NaT 

4713 

4714 Keep only the rows with at least 2 non-NA values. 

4715 

4716 >>> df.dropna(thresh=2) 

4717 name toy born 

4718 1 Batman Batmobile 1940-04-25 

4719 2 Catwoman Bullwhip NaT 

4720 

4721 Define in which columns to look for missing values. 

4722 

4723 >>> df.dropna(subset=['name', 'born']) 

4724 name toy born 

4725 1 Batman Batmobile 1940-04-25 

4726 

4727 Keep the DataFrame with valid entries in the same variable. 

4728 

4729 >>> df.dropna(inplace=True) 

4730 >>> df 

4731 name toy born 

4732 1 Batman Batmobile 1940-04-25 

4733 """ 

4734 inplace = validate_bool_kwarg(inplace, "inplace") 

4735 if isinstance(axis, (tuple, list)): 

4736 # GH20987 

4737 raise TypeError("supplying multiple axes to axis is no longer supported.") 

4738 

4739 axis = self._get_axis_number(axis) 

4740 agg_axis = 1 - axis 

4741 

4742 agg_obj = self 

4743 if subset is not None: 

4744 ax = self._get_axis(agg_axis) 

4745 indices = ax.get_indexer_for(subset) 

4746 check = indices == -1 

4747 if check.any(): 

4748 raise KeyError(list(np.compress(check, subset))) 

4749 agg_obj = self.take(indices, axis=agg_axis) 

4750 
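# Count the non-NA values along the opposite axis; the mask below keeps
# the labels that satisfy the ``thresh``/``how`` criterion.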

4751 count = agg_obj.count(axis=agg_axis) 

4752 

4753 if thresh is not None: 

4754 mask = count >= thresh 

4755 elif how == "any": 

4756 mask = count == len(agg_obj._get_axis(agg_axis)) 

4757 elif how == "all": 

4758 mask = count > 0 

4759 else: 

4760 if how is not None: 

4761 raise ValueError(f"invalid how option: {how}") 

4762 else: 

4763 raise TypeError("must specify how or thresh") 

4764 

4765 result = self.loc(axis=axis)[mask] 

4766 

4767 if inplace: 

4768 self._update_inplace(result) 

4769 else: 

4770 return result 

4771 

4772 def drop_duplicates( 

4773 self, 

4774 subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, 

4775 keep: Union[str, bool] = "first", 

4776 inplace: bool = False, 

4777 ignore_index: bool = False, 

4778 ) -> Optional["DataFrame"]: 

4779 """ 

4780 Return DataFrame with duplicate rows removed. 

4781 

4782 Considering certain columns is optional. Indexes, including time indexes, 

4783 are ignored. 

4784 

4785 Parameters 

4786 ---------- 

4787 subset : column label or sequence of labels, optional 

4788 Only consider certain columns for identifying duplicates, by 

4789 default use all of the columns. 

4790 keep : {'first', 'last', False}, default 'first' 

4791 Determines which duplicates (if any) to keep. 

4792 - ``first`` : Drop duplicates except for the first occurrence. 

4793 - ``last`` : Drop duplicates except for the last occurrence. 

4794 - False : Drop all duplicates. 

4795 inplace : bool, default False 

4796 Whether to drop duplicates in place or to return a copy. 

4797 ignore_index : bool, default False 

4798 If True, the resulting axis will be labeled 0, 1, …, n - 1. 

4799 

4800 .. versionadded:: 1.0.0 

4801 

4802 Returns 

4803 ------- 

4804 DataFrame 

4805 DataFrame with duplicates removed or None if ``inplace=True``. 
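
Examples
--------
A short illustrative sketch (arbitrary data):

>>> df = pd.DataFrame({'a': [1, 1, 2], 'b': [3, 3, 4]})
>>> df.drop_duplicates()
   a  b
0  1  3
2  2  4
>>> df.drop_duplicates(keep=False)
   a  b
2  2  4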

4806 """ 

4807 if self.empty: 

4808 return self.copy() 

4809 

4810 inplace = validate_bool_kwarg(inplace, "inplace") 

4811 duplicated = self.duplicated(subset, keep=keep) 

4812 

4813 if inplace: 

4814 (inds,) = (-duplicated)._ndarray_values.nonzero() 

4815 new_data = self._data.take(inds) 

4816 

4817 if ignore_index: 

4818 new_data.axes[1] = ibase.default_index(len(inds)) 

4819 self._update_inplace(new_data) 

4820 else: 

4821 result = self[-duplicated] 

4822 

4823 if ignore_index: 

4824 result.index = ibase.default_index(len(result)) 

4825 return result 

4826 

4827 return None 

4828 

4829 def duplicated( 

4830 self, 

4831 subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, 

4832 keep: Union[str, bool] = "first", 

4833 ) -> "Series": 

4834 """ 

4835 Return boolean Series denoting duplicate rows. 

4836 

4837 Considering certain columns is optional. 

4838 

4839 Parameters 

4840 ---------- 

4841 subset : column label or sequence of labels, optional 

4842 Only consider certain columns for identifying duplicates, by 

4843 default use all of the columns. 

4844 keep : {'first', 'last', False}, default 'first' 

4845 Determines which duplicates (if any) to mark. 

4846 

4847 - ``first`` : Mark duplicates as ``True`` except for the first occurrence. 

4848 - ``last`` : Mark duplicates as ``True`` except for the last occurrence. 

4849 - False : Mark all duplicates as ``True``. 

4850 

4851 Returns 

4852 ------- 

4853 Series 
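
Examples
--------
A short example (arbitrary data):

>>> df = pd.DataFrame({'a': [1, 1, 2], 'b': [3, 3, 4]})
>>> df.duplicated()
0    False
1     True
2    False
dtype: bool
>>> df.duplicated(keep=False)
0     True
1     True
2    False
dtype: bool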

4854 """ 

4855 from pandas.core.sorting import get_group_index 

4856 from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT 

4857 

4858 if self.empty: 

4859 return Series(dtype=bool) 

4860 
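# Factorize each column into integer labels, then combine the per-column
# labels into a single group index; repeated group ids mark duplicate rows.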

4861 def f(vals): 

4862 labels, shape = algorithms.factorize( 

4863 vals, size_hint=min(len(self), _SIZE_HINT_LIMIT) 

4864 ) 

4865 return labels.astype("i8", copy=False), len(shape) 

4866 

4867 if subset is None: 

4868 subset = self.columns 

4869 elif ( 

4870 not np.iterable(subset) 

4871 or isinstance(subset, str) 

4872 or isinstance(subset, tuple) 

4873 and subset in self.columns 

4874 ): 

4875 subset = (subset,) 

4876 

4877 # needed for mypy since can't narrow types using np.iterable 

4878 subset = cast(Iterable, subset) 

4879 

4880 # Verify all columns in subset exist in the queried dataframe 

4881 # Otherwise, raise a KeyError, same as if you try to __getitem__ with a 

4882 # key that doesn't exist. 

4883 diff = Index(subset).difference(self.columns) 

4884 if not diff.empty: 

4885 raise KeyError(diff) 

4886 

4887 vals = (col.values for name, col in self.items() if name in subset) 

4888 labels, shape = map(list, zip(*map(f, vals))) 

4889 

4890 ids = get_group_index(labels, shape, sort=False, xnull=False) 

4891 return Series(duplicated_int64(ids, keep), index=self.index) 

4892 

4893 # ---------------------------------------------------------------------- 

4894 # Sorting 

4895 

4896 @Substitution(**_shared_doc_kwargs) 

4897 @Appender(NDFrame.sort_values.__doc__) 

4898 def sort_values( 

4899 self, 

4900 by, 

4901 axis=0, 

4902 ascending=True, 

4903 inplace=False, 

4904 kind="quicksort", 

4905 na_position="last", 

4906 ignore_index=False, 

4907 ): 

4908 inplace = validate_bool_kwarg(inplace, "inplace") 

4909 axis = self._get_axis_number(axis) 

4910 

4911 if not isinstance(by, list): 

4912 by = [by] 

4913 if is_sequence(ascending) and len(by) != len(ascending): 

4914 raise ValueError( 

4915 f"Length of ascending ({len(ascending)}) != length of by ({len(by)})" 

4916 ) 
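
# With multiple sort keys, build a stable lexicographic indexer from the
# factorized keys; a single key falls through to ``nargsort`` below.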

4917 if len(by) > 1: 

4918 from pandas.core.sorting import lexsort_indexer 

4919 

4920 keys = [self._get_label_or_level_values(x, axis=axis) for x in by] 

4921 indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position) 

4922 indexer = ensure_platform_int(indexer) 

4923 else: 

4924 from pandas.core.sorting import nargsort 

4925 

4926 by = by[0] 

4927 k = self._get_label_or_level_values(by, axis=axis) 

4928 

4929 if isinstance(ascending, (tuple, list)): 

4930 ascending = ascending[0] 

4931 

4932 indexer = nargsort( 

4933 k, kind=kind, ascending=ascending, na_position=na_position 

4934 ) 

4935 

4936 new_data = self._data.take( 

4937 indexer, axis=self._get_block_manager_axis(axis), verify=False 

4938 ) 

4939 

4940 if ignore_index: 

4941 new_data.axes[1] = ibase.default_index(len(indexer)) 

4942 

4943 if inplace: 

4944 return self._update_inplace(new_data) 

4945 else: 

4946 return self._constructor(new_data).__finalize__(self) 

4947 

4948 @Substitution(**_shared_doc_kwargs) 

4949 @Appender(NDFrame.sort_index.__doc__) 

4950 def sort_index( 

4951 self, 

4952 axis=0, 

4953 level=None, 

4954 ascending=True, 

4955 inplace=False, 

4956 kind="quicksort", 

4957 na_position="last", 

4958 sort_remaining=True, 

4959 ignore_index: bool = False, 

4960 ): 

4961 

4962 # TODO: this can be combined with Series.sort_index impl as 

4963 # almost identical 

4964 

4965 inplace = validate_bool_kwarg(inplace, "inplace") 

4966 

4967 axis = self._get_axis_number(axis) 

4968 labels = self._get_axis(axis) 

4969 

4970 # make sure that the axis is lexsorted to start 

4971 # if not we need to reconstruct to get the correct indexer 

4972 labels = labels._sort_levels_monotonic() 

4973 if level is not None: 

4974 

4975 new_axis, indexer = labels.sortlevel( 

4976 level, ascending=ascending, sort_remaining=sort_remaining 

4977 ) 

4978 

4979 elif isinstance(labels, ABCMultiIndex): 

4980 from pandas.core.sorting import lexsort_indexer 

4981 

4982 indexer = lexsort_indexer( 

4983 labels._get_codes_for_sorting(), 

4984 orders=ascending, 

4985 na_position=na_position, 

4986 ) 

4987 else: 

4988 from pandas.core.sorting import nargsort 

4989 

4990 # Check monotonicity before sorting the index 

4991 # GH11080 

4992 if (ascending and labels.is_monotonic_increasing) or ( 

4993 not ascending and labels.is_monotonic_decreasing 

4994 ): 

4995 if inplace: 

4996 return 

4997 else: 

4998 return self.copy() 

4999 

5000 indexer = nargsort( 

5001 labels, kind=kind, ascending=ascending, na_position=na_position 

5002 ) 

5003 

5004 baxis = self._get_block_manager_axis(axis) 

5005 new_data = self._data.take(indexer, axis=baxis, verify=False) 

5006 

5007 # reconstruct axis if needed 

5008 new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic() 

5009 

5010 if ignore_index: 

5011 new_data.axes[1] = ibase.default_index(len(indexer)) 

5012 

5013 if inplace: 

5014 return self._update_inplace(new_data) 

5015 else: 

5016 return self._constructor(new_data).__finalize__(self) 

5017 

5018 def nlargest(self, n, columns, keep="first") -> "DataFrame": 

5019 """ 

5020 Return the first `n` rows ordered by `columns` in descending order. 

5021 

5022 Return the first `n` rows with the largest values in `columns`, in 

5023 descending order. The columns that are not specified are returned as 

5024 well, but not used for ordering. 

5025 

5026 This method is equivalent to 

5027 ``df.sort_values(columns, ascending=False).head(n)``, but more 

5028 performant. 

5029 

5030 Parameters 

5031 ---------- 

5032 n : int 

5033 Number of rows to return. 

5034 columns : label or list of labels 

5035 Column label(s) to order by. 

5036 keep : {'first', 'last', 'all'}, default 'first' 

5037 Where there are duplicate values: 

5038 

5039 - ``first`` : prioritize the first occurrence(s) 

5040 - ``last`` : prioritize the last occurrence(s) 

5041 - ``all`` : do not drop any duplicates, even if it means 

5042 selecting more than `n` items. 

5043 

5044 .. versionadded:: 0.24.0 

5045 

5046 Returns 

5047 ------- 

5048 DataFrame 

5049 The first `n` rows ordered by the given columns in descending 

5050 order. 

5051 

5052 See Also 

5053 -------- 

5054 DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in 

5055 ascending order. 

5056 DataFrame.sort_values : Sort DataFrame by the values. 

5057 DataFrame.head : Return the first `n` rows without re-ordering. 

5058 

5059 Notes 

5060 ----- 

5061 This function cannot be used with all column types. For example, when 

5062 specifying columns with `object` or `category` dtypes, ``TypeError`` is 

5063 raised. 

5064 

5065 Examples 

5066 -------- 

5067 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, 

5068 ... 434000, 434000, 337000, 11300, 

5069 ... 11300, 11300], 

5070 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, 

5071 ... 17036, 182, 38, 311], 

5072 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", 

5073 ... "IS", "NR", "TV", "AI"]}, 

5074 ... index=["Italy", "France", "Malta", 

5075 ... "Maldives", "Brunei", "Iceland", 

5076 ... "Nauru", "Tuvalu", "Anguilla"]) 

5077 >>> df 

5078 population GDP alpha-2 

5079 Italy 59000000 1937894 IT 

5080 France 65000000 2583560 FR 

5081 Malta 434000 12011 MT 

5082 Maldives 434000 4520 MV 

5083 Brunei 434000 12128 BN 

5084 Iceland 337000 17036 IS 

5085 Nauru 11300 182 NR 

5086 Tuvalu 11300 38 TV 

5087 Anguilla 11300 311 AI 

5088 

5089 In the following example, we will use ``nlargest`` to select the three 

5090 rows having the largest values in column "population". 

5091 

5092 >>> df.nlargest(3, 'population') 

5093 population GDP alpha-2 

5094 France 65000000 2583560 FR 

5095 Italy 59000000 1937894 IT 

5096 Malta 434000 12011 MT 

5097 

5098 When using ``keep='last'``, ties are resolved in reverse order: 

5099 

5100 >>> df.nlargest(3, 'population', keep='last') 

5101 population GDP alpha-2 

5102 France 65000000 2583560 FR 

5103 Italy 59000000 1937894 IT 

5104 Brunei 434000 12128 BN 

5105 

5106 When using ``keep='all'``, all duplicate items are maintained: 

5107 

5108 >>> df.nlargest(3, 'population', keep='all') 

5109 population GDP alpha-2 

5110 France 65000000 2583560 FR 

5111 Italy 59000000 1937894 IT 

5112 Malta 434000 12011 MT 

5113 Maldives 434000 4520 MV 

5114 Brunei 434000 12128 BN 

5115 

5116 To order by the largest values in column "population" and then "GDP", 

5117 we can specify multiple columns like in the next example. 

5118 

5119 >>> df.nlargest(3, ['population', 'GDP']) 

5120 population GDP alpha-2 

5121 France 65000000 2583560 FR 

5122 Italy 59000000 1937894 IT 

5123 Brunei 434000 12128 BN 

5124 """ 

5125 return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() 

5126 

5127 def nsmallest(self, n, columns, keep="first") -> "DataFrame": 

5128 """ 

5129 Return the first `n` rows ordered by `columns` in ascending order. 

5130 

5131 Return the first `n` rows with the smallest values in `columns`, in 

5132 ascending order. The columns that are not specified are returned as 

5133 well, but not used for ordering. 

5134 

5135 This method is equivalent to 

5136 ``df.sort_values(columns, ascending=True).head(n)``, but more 

5137 performant. 

5138 

5139 Parameters 

5140 ---------- 

5141 n : int 

5142 Number of items to retrieve. 

5143 columns : list or str 

5144 Column name or names to order by. 

5145 keep : {'first', 'last', 'all'}, default 'first' 

5146 Where there are duplicate values: 

5147 

5148 - ``first`` : take the first occurrence. 

5149 - ``last`` : take the last occurrence. 

5150 - ``all`` : do not drop any duplicates, even if it means 

5151 selecting more than `n` items. 

5152 

5153 .. versionadded:: 0.24.0 

5154 

5155 Returns 

5156 ------- 

5157 DataFrame 

5158 

5159 See Also 

5160 -------- 

5161 DataFrame.nlargest : Return the first `n` rows ordered by `columns` in 

5162 descending order. 

5163 DataFrame.sort_values : Sort DataFrame by the values. 

5164 DataFrame.head : Return the first `n` rows without re-ordering. 

5165 

5166 Examples 

5167 -------- 

5168 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, 

5169 ... 434000, 434000, 337000, 11300, 

5170 ... 11300, 11300], 

5171 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, 

5172 ... 17036, 182, 38, 311], 

5173 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", 

5174 ... "IS", "NR", "TV", "AI"]}, 

5175 ... index=["Italy", "France", "Malta", 

5176 ... "Maldives", "Brunei", "Iceland", 

5177 ... "Nauru", "Tuvalu", "Anguilla"]) 

5178 >>> df 

5179 population GDP alpha-2 

5180 Italy 59000000 1937894 IT 

5181 France 65000000 2583560 FR 

5182 Malta 434000 12011 MT 

5183 Maldives 434000 4520 MV 

5184 Brunei 434000 12128 BN 

5185 Iceland 337000 17036 IS 

5186 Nauru 11300 182 NR 

5187 Tuvalu 11300 38 TV 

5188 Anguilla 11300 311 AI 

5189 

5190 In the following example, we will use ``nsmallest`` to select the 

5191 three rows having the smallest values in column "population". 

5192 

5193 >>> df.nsmallest(3, 'population') 

5194 population GDP alpha-2 

5195 Nauru 11300 182 NR 

5196 Tuvalu 11300 38 TV 

5197 Anguilla 11300 311 AI 

5198 

5199 When using ``keep='last'``, ties are resolved in reverse order: 

5200 

5201 >>> df.nsmallest(3, 'population', keep='last') 

5202 population GDP alpha-2 

5203 Anguilla 11300 311 AI 

5204 Tuvalu 11300 38 TV 

5205 Nauru 11300 182 NR 

5206 

5207 When using ``keep='all'``, all duplicate items are maintained: 

5208 

5209 >>> df.nsmallest(3, 'population', keep='all') 

5210 population GDP alpha-2 

5211 Nauru 11300 182 NR 

5212 Tuvalu 11300 38 TV 

5213 Anguilla 11300 311 AI 

5214 

5215 To order by the smallest values in column "population" and then "GDP", we can 

5216 specify multiple columns like in the next example. 

5217 

5218 >>> df.nsmallest(3, ['population', 'GDP']) 

5219 population GDP alpha-2 

5220 Tuvalu 11300 38 TV 

5221 Nauru 11300 182 NR 

5222 Anguilla 11300 311 AI 

5223 """ 

5224 return algorithms.SelectNFrame( 

5225 self, n=n, keep=keep, columns=columns 

5226 ).nsmallest() 

5227 

5228 def swaplevel(self, i=-2, j=-1, axis=0) -> "DataFrame": 

5229 """ 

5230 Swap levels i and j in a MultiIndex on a particular axis. 

5231 

5232 Parameters 

5233 ---------- 

5234 i, j : int or str 

5235 Levels of the indices to be swapped. Can pass level name as string. 

5236 

5237 Returns 

5238 ------- 

5239 DataFrame 
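
        Examples
        --------
        A minimal sketch, assuming a hypothetical two-level index:

        >>> midx = pd.MultiIndex.from_tuples([('a', 1), ('b', 2)],
        ...                                  names=['x', 'y'])
        >>> df = pd.DataFrame({'v': [10, 20]}, index=midx)
        >>> df.swaplevel('x', 'y').index.names
        FrozenList(['y', 'x'])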

5240 """ 

5241 result = self.copy() 

5242 

5243 axis = self._get_axis_number(axis) 

5244 if axis == 0: 

5245 result.index = result.index.swaplevel(i, j) 

5246 else: 

5247 result.columns = result.columns.swaplevel(i, j) 

5248 return result 

5249 

5250 def reorder_levels(self, order, axis=0) -> "DataFrame": 

5251 """ 

5252 Rearrange index levels using input order. May not drop or duplicate levels. 

5253 

5254 Parameters 

5255 ---------- 

5256 order : list of int or list of str 

5257 List representing new level order. Reference level by number 

5258 (position) or by key (label). 

5259 axis : int 

5260 Where to reorder levels. 

5261 

5262 Returns 

5263 ------- 

5264 DataFrame 
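
        Examples
        --------
        A minimal sketch, assuming a hypothetical two-level index:

        >>> midx = pd.MultiIndex.from_tuples([('a', 1), ('b', 2)],
        ...                                  names=['x', 'y'])
        >>> df = pd.DataFrame({'v': [10, 20]}, index=midx)
        >>> df.reorder_levels(['y', 'x']).index.names
        FrozenList(['y', 'x'])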

5265 """ 

5266 axis = self._get_axis_number(axis) 

5267 if not isinstance(self._get_axis(axis), ABCMultiIndex): # pragma: no cover 

5268 raise TypeError("Can only reorder levels on a hierarchical axis.") 

5269 

5270 result = self.copy() 

5271 

5272 if axis == 0: 

5273 result.index = result.index.reorder_levels(order) 

5274 else: 

5275 result.columns = result.columns.reorder_levels(order) 

5276 return result 

5277 

5278 # ---------------------------------------------------------------------- 

5279 # Arithmetic / combination related 

5280 

5281 def _combine_frame(self, other, func, fill_value=None, level=None): 

5282 # at this point we have `self._indexed_same(other)` 

5283 

5284 if fill_value is None: 

5285 # since _arith_op may be called in a loop, avoid function call 

5286 # overhead if possible by doing this check once 

5287 _arith_op = func 

5288 

5289 else: 

5290 

5291 def _arith_op(left, right): 

5292 # for the mixed_type case where we iterate over columns, 

5293 # _arith_op(left, right) is equivalent to 

5294 # left._binop(right, func, fill_value=fill_value) 

5295 left, right = ops.fill_binop(left, right, fill_value) 

5296 return func(left, right) 

5297 

5298 if ops.should_series_dispatch(self, other, func): 

5299 # iterate over columns 

5300 new_data = ops.dispatch_to_series(self, other, _arith_op) 

5301 else: 

5302 with np.errstate(all="ignore"): 

5303 res_values = _arith_op(self.values, other.values) 

5304 new_data = dispatch_fill_zeros(func, self.values, other.values, res_values) 

5305 

5306 return new_data 
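
    # The user-visible effect of ``fill_value`` in the flex-arithmetic
    # methods that funnel through ``_combine_frame`` (a sketch with
    # hypothetical frames):
    #
    # >>> df1 = pd.DataFrame({'A': [1.0, np.nan]})
    # >>> df2 = pd.DataFrame({'A': [10.0, 10.0]})
    # >>> df1.add(df2, fill_value=0)
    #       A
    # 0  11.0
    # 1  10.0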

5307 

5308 def _combine_match_index(self, other, func): 

5309 # at this point we have `self.index.equals(other.index)` 

5310 

5311 if ops.should_series_dispatch(self, other, func): 

5312 # operate column-wise; avoid costly object-casting in `.values` 

5313 new_data = ops.dispatch_to_series(self, other, func) 

5314 else: 

5315 # fastpath --> operate directly on values 

5316 with np.errstate(all="ignore"): 

5317 new_data = func(self.values.T, other.values).T 

5318 return new_data 

5319 

5320 def _construct_result(self, result) -> "DataFrame": 

5321 """ 

5322 Wrap the result of an arithmetic, comparison, or logical operation. 

5323 

5324 Parameters 

5325 ---------- 

5326 result : DataFrame 

5327 

5328 Returns 

5329 ------- 

5330 DataFrame 

5331 """ 

5332 out = self._constructor(result, index=self.index, copy=False) 

5333 # Pin columns instead of passing to constructor for compat with 

5334 # non-unique columns case 

5335 out.columns = self.columns 

5336 return out 

5337 

5338 def combine( 

5339 self, other: "DataFrame", func, fill_value=None, overwrite=True 

5340 ) -> "DataFrame": 

5341 """ 

5342 Perform column-wise combine with another DataFrame. 

5343 

5344 Combines a DataFrame with `other` DataFrame using `func` 

5345 to element-wise combine columns. The row and column indexes of the 

5346 resulting DataFrame will be the union of the two. 

5347 

5348 Parameters 

5349 ---------- 

5350 other : DataFrame 

5351 The DataFrame to merge column-wise. 

5352 func : function 

5353 Function that takes two series as inputs and return a Series or a 

5354 scalar. Used to merge the two dataframes column by columns. 

5355 fill_value : scalar value, default None 

5356 The value to fill NaNs with prior to passing any column to the 

5357 merge func. 

5358 overwrite : bool, default True 

5359 If True, columns in `self` that do not exist in `other` will be 

5360 overwritten with NaNs. 

5361 

5362 Returns 

5363 ------- 

5364 DataFrame 

5365 Combination of the provided DataFrames. 

5366 

5367 See Also 

5368 -------- 

5369 DataFrame.combine_first : Combine two DataFrame objects and default to 

5370 non-null values in the frame calling the method. 

5371 

5372 Examples 

5373 -------- 

5374 Combine using a simple function that chooses the smaller column. 

5375 

5376 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) 

5377 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) 

5378 >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 

5379 >>> df1.combine(df2, take_smaller) 

5380 A B 

5381 0 0 3 

5382 1 0 3 

5383 

5384 Example using a true element-wise combine function. 

5385 

5386 >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]}) 

5387 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) 

5388 >>> df1.combine(df2, np.minimum) 

5389 A B 

5390 0 1 2 

5391 1 0 3 

5392 

5393 Using `fill_value` fills Nones prior to passing the column to the 

5394 merge function. 

5395 

5396 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) 

5397 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) 

5398 >>> df1.combine(df2, take_smaller, fill_value=-5) 

5399 A B 

5400 0 0 -5.0 

5401 1 0 4.0 

5402 

5403 However, if the same element in both dataframes is None, that None 

5404 is preserved. 

5405 

5406 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) 

5407 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]}) 

5408 >>> df1.combine(df2, take_smaller, fill_value=-5) 

5409 A B 

5410 0 0 -5.0 

5411 1 0 3.0 

5412 

5413 Example that demonstrates the use of `overwrite` and behavior when 

5414 the axes differ between the dataframes. 

5415 

5416 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) 

5417 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2]) 

5418 >>> df1.combine(df2, take_smaller) 

5419 A B C 

5420 0 NaN NaN NaN 

5421 1 NaN 3.0 -10.0 

5422 2 NaN 3.0 1.0 

5423 

5424 >>> df1.combine(df2, take_smaller, overwrite=False) 

5425 A B C 

5426 0 0.0 NaN NaN 

5427 1 0.0 3.0 -10.0 

5428 2 NaN 3.0 1.0 

5429 

5430 Demonstrating the precedence of the passed-in dataframe. 

5431 

5432 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2]) 

5433 >>> df2.combine(df1, take_smaller) 

5434 A B C 

5435 0 0.0 NaN NaN 

5436 1 0.0 3.0 NaN 

5437 2 NaN 3.0 NaN 

5438 

5439 >>> df2.combine(df1, take_smaller, overwrite=False) 

5440 A B C 

5441 0 0.0 NaN NaN 

5442 1 0.0 3.0 1.0 

5443 2 NaN 3.0 1.0 

5444 """ 

5445 other_idxlen = len(other.index) # save for compare 

5446 

5447 this, other = self.align(other, copy=False) 

5448 new_index = this.index 

5449 

5450 if other.empty and len(new_index) == len(self.index): 

5451 return self.copy() 

5452 

5453 if self.empty and len(other) == other_idxlen: 

5454 return other.copy() 

5455 

5456 # sorts if possible 

5457 new_columns = this.columns.union(other.columns) 

5458 do_fill = fill_value is not None 

5459 result = {} 

5460 for col in new_columns: 

5461 series = this[col] 

5462 otherSeries = other[col] 

5463 

5464 this_dtype = series.dtype 

5465 other_dtype = otherSeries.dtype 

5466 

5467 this_mask = isna(series) 

5468 other_mask = isna(otherSeries) 

5469 

5470 # don't overwrite columns unnecessarily 

5471 # DO propagate if this column is not in the intersection 

5472 if not overwrite and other_mask.all(): 

5473 result[col] = this[col].copy() 

5474 continue 

5475 

5476 if do_fill: 

5477 series = series.copy() 

5478 otherSeries = otherSeries.copy() 

5479 series[this_mask] = fill_value 

5480 otherSeries[other_mask] = fill_value 

5481 

5482 if col not in self.columns: 

5483 # If self does not have this column, the aligned series is all NaN; 

5484 # try to promote it to the other frame's dtype. 

5485 new_dtype = other_dtype 

5486 try: 

5487 series = series.astype(new_dtype, copy=False) 

5488 except ValueError: 

5489 # e.g. new_dtype is integer types 

5490 pass 

5491 else: 

5492 # if we have different dtypes, possibly promote 

5493 new_dtype = find_common_type([this_dtype, other_dtype]) 

5494 if not is_dtype_equal(this_dtype, new_dtype): 

5495 series = series.astype(new_dtype) 

5496 if not is_dtype_equal(other_dtype, new_dtype): 

5497 otherSeries = otherSeries.astype(new_dtype) 

5498 

5499 arr = func(series, otherSeries) 

5500 arr = maybe_downcast_to_dtype(arr, this_dtype) 

5501 

5502 result[col] = arr 

5503 

5504 # let the constructor handle any remaining dtype inference 

5505 return self._constructor(result, index=new_index, columns=new_columns) 

5506 

5507 def combine_first(self, other: "DataFrame") -> "DataFrame": 

5508 """ 

5509 Update null elements with value in the same location in `other`. 

5510 

5511 Combine two DataFrame objects by filling null values in one DataFrame 

5512 with non-null values from other DataFrame. The row and column indexes 

5513 of the resulting DataFrame will be the union of the two. 

5514 

5515 Parameters 

5516 ---------- 

5517 other : DataFrame 

5518 Provided DataFrame to use to fill null values. 

5519 

5520 Returns 

5521 ------- 

5522 DataFrame 

5523 

5524 See Also 

5525 -------- 

5526 DataFrame.combine : Perform series-wise operation on two DataFrames 

5527 using a given function. 

5528 

5529 Examples 

5530 -------- 

5531 

5532 >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]}) 

5533 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) 

5534 >>> df1.combine_first(df2) 

5535 A B 

5536 0 1.0 3.0 

5537 1 0.0 4.0 

5538 

5539 Null values still persist if the location of that null value 

5540 does not exist in `other`. 

5541 

5542 >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]}) 

5543 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2]) 

5544 >>> df1.combine_first(df2) 

5545 A B C 

5546 0 NaN 4.0 NaN 

5547 1 0.0 3.0 1.0 

5548 2 NaN 3.0 1.0 

5549 """ 

5550 import pandas.core.computation.expressions as expressions 

5551 

5552 def extract_values(arr): 

5553 # Does two things: 

5554 # 1. maybe gets the values from the Series / Index 

5555 # 2. converts datelike values to i8 

5556 if isinstance(arr, (ABCIndexClass, ABCSeries)): 

5557 arr = arr._values 

5558 

5559 if needs_i8_conversion(arr): 

5560 if is_extension_array_dtype(arr.dtype): 

5561 arr = arr.asi8 

5562 else: 

5563 arr = arr.view("i8") 

5564 return arr 

5565 

5566 def combiner(x, y): 

5567 mask = isna(x) 

5568 if isinstance(mask, (ABCIndexClass, ABCSeries)): 

5569 mask = mask._values 

5570 

5571 x_values = extract_values(x) 

5572 y_values = extract_values(y) 

5573 

5574 # If column y of the other DataFrame is not in this DataFrame, 

5575 # just return y_values. 

5576 if y.name not in self.columns: 

5577 return y_values 

5578 

5579 return expressions.where(mask, y_values, x_values) 

5580 

5581 return self.combine(other, combiner, overwrite=False) 

5582 

5583 def update( 

5584 self, other, join="left", overwrite=True, filter_func=None, errors="ignore" 

5585 ) -> None: 

5586 """ 

5587 Modify in place using non-NA values from another DataFrame. 

5588 

5589 Aligns on indices. There is no return value. 

5590 

5591 Parameters 

5592 ---------- 

5593 other : DataFrame, or object coercible into a DataFrame 

5594 Should have at least one matching index/column label 

5595 with the original DataFrame. If a Series is passed, 

5596 its name attribute must be set, and that will be 

5597 used as the column name to align with the original DataFrame. 

5598 join : {'left'}, default 'left' 

5599 Only left join is implemented, keeping the index and columns of the 

5600 original object. 

5601 overwrite : bool, default True 

5602 How to handle non-NA values for overlapping keys: 

5603 

5604 * True: overwrite original DataFrame's values 

5605 with values from `other`. 

5606 * False: only update values that are NA in 

5607 the original DataFrame. 

5608 

5609 filter_func : callable(1d-array) -> bool 1d-array, optional 

5610 Can choose to replace values other than NA. Return True for values 

5611 that should be updated. 

5612 errors : {'raise', 'ignore'}, default 'ignore' 

5613 If 'raise', will raise a ValueError if the DataFrame and `other` 

5614 both contain non-NA data in the same place. 

5615 

5616 .. versionchanged:: 0.24.0 

5617 Changed from `raise_conflict=False|True` 

5618 to `errors='ignore'|'raise'`. 

5619 

5620 Returns 

5621 ------- 

5622 None : method directly changes calling object 

5623 

5624 Raises 

5625 ------ 

5626 ValueError 

5627 * When `errors='raise'` and there's overlapping non-NA data. 

5628 * When `errors` is not either `'ignore'` or `'raise'` 

5629 NotImplementedError 

5630 * If `join != 'left'` 

5631 

5632 See Also 

5633 -------- 

5634 dict.update : Similar method for dictionaries. 

5635 DataFrame.merge : For column(s)-on-columns(s) operations. 

5636 

5637 Examples 

5638 -------- 

5639 >>> df = pd.DataFrame({'A': [1, 2, 3], 

5640 ... 'B': [400, 500, 600]}) 

5641 >>> new_df = pd.DataFrame({'B': [4, 5, 6], 

5642 ... 'C': [7, 8, 9]}) 

5643 >>> df.update(new_df) 

5644 >>> df 

5645 A B 

5646 0 1 4 

5647 1 2 5 

5648 2 3 6 

5649 

5650 The DataFrame's length does not increase as a result of the update, 

5651 only values at matching index/column labels are updated. 

5652 

5653 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], 

5654 ... 'B': ['x', 'y', 'z']}) 

5655 >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']}) 

5656 >>> df.update(new_df) 

5657 >>> df 

5658 A B 

5659 0 a d 

5660 1 b e 

5661 2 c f 

5662 

5663 For Series, its name attribute must be set. 

5664 

5665 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], 

5666 ... 'B': ['x', 'y', 'z']}) 

5667 >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2]) 

5668 >>> df.update(new_column) 

5669 >>> df 

5670 A B 

5671 0 a d 

5672 1 b y 

5673 2 c e 

5674 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], 

5675 ... 'B': ['x', 'y', 'z']}) 

5676 >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2]) 

5677 >>> df.update(new_df) 

5678 >>> df 

5679 A B 

5680 0 a x 

5681 1 b d 

5682 2 c e 

5683 

5684 If `other` contains NaNs the corresponding values are not updated 

5685 in the original dataframe. 

5686 

5687 >>> df = pd.DataFrame({'A': [1, 2, 3], 

5688 ... 'B': [400, 500, 600]}) 

5689 >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]}) 

5690 >>> df.update(new_df) 

5691 >>> df 

5692 A B 

5693 0 1 4.0 

5694 1 2 500.0 

5695 2 3 6.0 
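
        A ``filter_func`` restricts which existing values may be replaced.
        A minimal sketch (hypothetical data) that only updates values equal
        to 500:

        >>> df = pd.DataFrame({'A': [1, 2, 3],
        ...                    'B': [400, 500, 600]})
        >>> new_df = pd.DataFrame({'B': [4, 5, 6]})
        >>> df.update(new_df, filter_func=lambda x: x == 500)
        >>> df
           A    B
        0  1  400
        1  2    5
        2  3  600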

5696 """ 

5697 import pandas.core.computation.expressions as expressions 

5698 

5699 # TODO: Support other joins 

5700 if join != "left": # pragma: no cover 

5701 raise NotImplementedError("Only left join is supported") 

5702 if errors not in ["ignore", "raise"]: 

5703 raise ValueError("The parameter errors must be either 'ignore' or 'raise'") 

5704 

5705 if not isinstance(other, DataFrame): 

5706 other = DataFrame(other) 

5707 

5708 other = other.reindex_like(self) 

5709 
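        # for each column build a boolean ``mask`` that is True wherever the
        # original value must be kept, then select with ``expressions.where``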

5710 for col in self.columns: 

5711 this = self[col]._values 

5712 that = other[col]._values 

5713 if filter_func is not None: 

5714 with np.errstate(all="ignore"): 

5715 mask = ~filter_func(this) | isna(that) 

5716 else: 

5717 if errors == "raise": 

5718 mask_this = notna(that) 

5719 mask_that = notna(this) 

5720 if any(mask_this & mask_that): 

5721 raise ValueError("Data overlaps.") 

5722 

5723 if overwrite: 

5724 mask = isna(that) 

5725 else: 

5726 mask = notna(this) 

5727 

5728 # don't overwrite columns unnecessarily 

5729 if mask.all(): 

5730 continue 

5731 

5732 self[col] = expressions.where(mask, this, that) 

5733 

5734 # ---------------------------------------------------------------------- 

5735 # Data reshaping 

5736 @Appender( 

5737 """ 

5738Examples 

5739-------- 

5740>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', 

5741... 'Parrot', 'Parrot'], 

5742... 'Max Speed': [380., 370., 24., 26.]}) 

5743>>> df 

5744 Animal Max Speed 

57450 Falcon 380.0 

57461 Falcon 370.0 

57472 Parrot 24.0 

57483 Parrot 26.0 

5749>>> df.groupby(['Animal']).mean() 

5750 Max Speed 

5751Animal 

5752Falcon 375.0 

5753Parrot 25.0 

5754 

5755**Hierarchical Indexes** 

5756 

5757We can group by different levels of a hierarchical index 

5758using the `level` parameter: 

5759 

5760>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], 

5761... ['Captive', 'Wild', 'Captive', 'Wild']] 

5762>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) 

5763>>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, 

5764... index=index) 

5765>>> df 

5766 Max Speed 

5767Animal Type 

5768Falcon Captive 390.0 

5769 Wild 350.0 

5770Parrot Captive 30.0 

5771 Wild 20.0 

5772>>> df.groupby(level=0).mean() 

5773 Max Speed 

5774Animal 

5775Falcon 370.0 

5776Parrot 25.0 

5777>>> df.groupby(level="Type").mean() 

5778 Max Speed 

5779Type 

5780Captive 210.0 

5781Wild 185.0 

5782""" 

5783 ) 

5784 @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) 

5785 def groupby( 

5786 self, 

5787 by=None, 

5788 axis=0, 

5789 level=None, 

5790 as_index: bool = True, 

5791 sort: bool = True, 

5792 group_keys: bool = True, 

5793 squeeze: bool = False, 

5794 observed: bool = False, 

5795 ) -> "groupby_generic.DataFrameGroupBy": 

5796 

5797 if level is None and by is None: 

5798 raise TypeError("You have to supply one of 'by' and 'level'") 

5799 axis = self._get_axis_number(axis) 

5800 

5801 return groupby_generic.DataFrameGroupBy( 

5802 obj=self, 

5803 keys=by, 

5804 axis=axis, 

5805 level=level, 

5806 as_index=as_index, 

5807 sort=sort, 

5808 group_keys=group_keys, 

5809 squeeze=squeeze, 

5810 observed=observed, 

5811 ) 

5812 

5813 _shared_docs[ 

5814 "pivot" 

5815 ] = """ 

5816 Return reshaped DataFrame organized by given index / column values. 

5817 

5818 Reshape data (produce a "pivot" table) based on column values. Uses 

5819 unique values from specified `index` / `columns` to form axes of the 

5820 resulting DataFrame. This function does not support data 

5821 aggregation, multiple values will result in a MultiIndex in the 

5822 columns. See the :ref:`User Guide <reshaping>` for more on reshaping. 

5823 

5824 Parameters 

5825 ----------%s 

5826 index : str or object, optional 

5827 Column to use to make new frame's index. If None, uses 

5828 existing index. 

5829 columns : str or object 

5830 Column to use to make new frame's columns. 

5831 values : str, object or a list of the previous, optional 

5832 Column(s) to use for populating new frame's values. If not 

5833 specified, all remaining columns will be used and the result will 

5834 have hierarchically indexed columns. 

5835 

5836 .. versionchanged:: 0.23.0 

5837 Also accept list of column names. 

5838 

5839 Returns 

5840 ------- 

5841 DataFrame 

5842 Returns reshaped DataFrame. 

5843 

5844 Raises 

5845 ------ 

5846 ValueError: 

5847 When there are any `index`, `columns` combinations with multiple 

5848 values. Use `DataFrame.pivot_table` when you need to aggregate. 

5849 

5850 See Also 

5851 -------- 

5852 DataFrame.pivot_table : Generalization of pivot that can handle 

5853 duplicate values for one index/column pair. 

5854 DataFrame.unstack : Pivot based on the index values instead of a 

5855 column. 

5856 

5857 Notes 

5858 ----- 

5859 For finer-tuned control, see hierarchical indexing documentation along 

5860 with the related stack/unstack methods. 

5861 

5862 Examples 

5863 -------- 

5864 >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 

5865 ... 'two'], 

5866 ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], 

5867 ... 'baz': [1, 2, 3, 4, 5, 6], 

5868 ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']}) 

5869 >>> df 

5870 foo bar baz zoo 

5871 0 one A 1 x 

5872 1 one B 2 y 

5873 2 one C 3 z 

5874 3 two A 4 q 

5875 4 two B 5 w 

5876 5 two C 6 t 

5877 

5878 >>> df.pivot(index='foo', columns='bar', values='baz') 

5879 bar A B C 

5880 foo 

5881 one 1 2 3 

5882 two 4 5 6 

5883 

5884 >>> df.pivot(index='foo', columns='bar')['baz'] 

5885 bar A B C 

5886 foo 

5887 one 1 2 3 

5888 two 4 5 6 

5889 

5890 >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo']) 

5891 baz zoo 

5892 bar A B C A B C 

5893 foo 

5894 one 1 2 3 x y z 

5895 two 4 5 6 q w t 

5896 

5897 A ValueError is raised if there are any duplicates. 

5898 

5899 >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'], 

5900 ... "bar": ['A', 'A', 'B', 'C'], 

5901 ... "baz": [1, 2, 3, 4]}) 

5902 >>> df 

5903 foo bar baz 

5904 0 one A 1 

5905 1 one A 2 

5906 2 two B 3 

5907 3 two C 4 

5908 

5909 Notice that the first two rows are the same for our `index` 

5910 and `columns` arguments. 

5911 

5912 >>> df.pivot(index='foo', columns='bar', values='baz') 

5913 Traceback (most recent call last): 

5914 ... 

5915 ValueError: Index contains duplicate entries, cannot reshape 

5916 """ 

5917 

5918 @Substitution("") 

5919 @Appender(_shared_docs["pivot"]) 

5920 def pivot(self, index=None, columns=None, values=None) -> "DataFrame": 

5921 from pandas.core.reshape.pivot import pivot 

5922 

5923 return pivot(self, index=index, columns=columns, values=values) 

5924 

5925 _shared_docs[ 

5926 "pivot_table" 

5927 ] = """ 

5928 Create a spreadsheet-style pivot table as a DataFrame. 

5929 

5930 The levels in the pivot table will be stored in MultiIndex objects 

5931 (hierarchical indexes) on the index and columns of the result DataFrame. 

5932 

5933 Parameters 

5934 ----------%s 

5935 values : column to aggregate, optional 

5936 index : column, Grouper, array, or list of the previous 

5937 If an array is passed, it must be the same length as the data. The 

5938 list can contain any of the other types (except list). 

5939 Keys to group by on the pivot table index. If an array is passed, 

5940 it is used in the same manner as column values. 

5941 columns : column, Grouper, array, or list of the previous 

5942 If an array is passed, it must be the same length as the data. The 

5943 list can contain any of the other types (except list). 

5944 Keys to group by on the pivot table column. If an array is passed, 

5945 it is used in the same manner as column values. 

5946 aggfunc : function, list of functions, dict, default numpy.mean 

5947 If list of functions passed, the resulting pivot table will have 

5948 hierarchical columns whose top level are the function names 

5949 (inferred from the function objects themselves) 

5950 If dict is passed, the key is column to aggregate and value 

5951 is function or list of functions. 

5952 fill_value : scalar, default None 

5953 Value to replace missing values with. 

5954 margins : bool, default False 

5955 Add all rows / columns (e.g. for subtotal / grand totals). 

5956 dropna : bool, default True 

5957 Do not include columns whose entries are all NaN. 

5958 margins_name : str, default 'All' 

5959 Name of the row / column that will contain the totals 

5960 when margins is True. 

5961 observed : bool, default False 

5962 This only applies if any of the groupers are Categoricals. 

5963 If True: only show observed values for categorical groupers. 

5964 If False: show all values for categorical groupers. 

5965 

5966 .. versionchanged:: 0.25.0 

5967 

5968 Returns 

5969 ------- 

5970 DataFrame 

5971 An Excel style pivot table. 

5972 

5973 See Also 

5974 -------- 

5975 DataFrame.pivot : Pivot without aggregation that can handle 

5976 non-numeric data. 

5977 

5978 Examples 

5979 -------- 

5980 >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", 

5981 ... "bar", "bar", "bar", "bar"], 

5982 ... "B": ["one", "one", "one", "two", "two", 

5983 ... "one", "one", "two", "two"], 

5984 ... "C": ["small", "large", "large", "small", 

5985 ... "small", "large", "small", "small", 

5986 ... "large"], 

5987 ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], 

5988 ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]}) 

5989 >>> df 

5990 A B C D E 

5991 0 foo one small 1 2 

5992 1 foo one large 2 4 

5993 2 foo one large 2 5 

5994 3 foo two small 3 5 

5995 4 foo two small 3 6 

5996 5 bar one large 4 6 

5997 6 bar one small 5 8 

5998 7 bar two small 6 9 

5999 8 bar two large 7 9 

6000 

6001 This first example aggregates values by taking the sum. 

6002 

6003 >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], 

6004 ... columns=['C'], aggfunc=np.sum) 

6005 >>> table 

6006 C large small 

6007 A B 

6008 bar one 4.0 5.0 

6009 two 7.0 6.0 

6010 foo one 4.0 1.0 

6011 two NaN 6.0 

6012 

6013 We can also fill missing values using the `fill_value` parameter. 

6014 

6015 >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], 

6016 ... columns=['C'], aggfunc=np.sum, fill_value=0) 

6017 >>> table 

6018 C large small 

6019 A B 

6020 bar one 4 5 

6021 two 7 6 

6022 foo one 4 1 

6023 two 0 6 

6024 

6025 The next example aggregates by taking the mean across multiple columns. 

6026 

6027 >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], 

6028 ... aggfunc={'D': np.mean, 

6029 ... 'E': np.mean}) 

6030 >>> table 

6031 D E 

6032 A C 

6033 bar large 5.500000 7.500000 

6034 small 5.500000 8.500000 

6035 foo large 2.000000 4.500000 

6036 small 2.333333 4.333333 

6037 

6038 We can also calculate multiple types of aggregations for any given 

6039 value column. 

6040 

6041 >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], 

6042 ... aggfunc={'D': np.mean, 

6043 ... 'E': [min, max, np.mean]}) 

6044 >>> table 

6045 D E 

6046 mean max mean min 

6047 A C 

6048 bar large 5.500000 9.0 7.500000 6.0 

6049 small 5.500000 9.0 8.500000 8.0 

6050 foo large 2.000000 5.0 4.500000 4.0 

6051 small 2.333333 6.0 4.333333 2.0 

6052 """ 

6053 

6054 @Substitution("") 

6055 @Appender(_shared_docs["pivot_table"]) 

6056 def pivot_table( 

6057 self, 

6058 values=None, 

6059 index=None, 

6060 columns=None, 

6061 aggfunc="mean", 

6062 fill_value=None, 

6063 margins=False, 

6064 dropna=True, 

6065 margins_name="All", 

6066 observed=False, 

6067 ) -> "DataFrame": 

6068 from pandas.core.reshape.pivot import pivot_table 

6069 

6070 return pivot_table( 

6071 self, 

6072 values=values, 

6073 index=index, 

6074 columns=columns, 

6075 aggfunc=aggfunc, 

6076 fill_value=fill_value, 

6077 margins=margins, 

6078 dropna=dropna, 

6079 margins_name=margins_name, 

6080 observed=observed, 

6081 ) 

6082 

6083 def stack(self, level=-1, dropna=True): 

6084 """ 

6085 Stack the prescribed level(s) from columns to index. 

6086 

6087 Return a reshaped DataFrame or Series having a multi-level 

6088 index with one or more new inner-most levels compared to the current 

6089 DataFrame. The new inner-most levels are created by pivoting the 

6090 columns of the current dataframe: 

6091 

6092 - if the columns have a single level, the output is a Series; 

6093 - if the columns have multiple levels, the new index 

6094 level(s) is (are) taken from the prescribed level(s) and 

6095 the output is a DataFrame. 

6096 

6097 The new index levels are sorted. 

6098 

6099 Parameters 

6100 ---------- 

6101 level : int, str, list, default -1 

6102 Level(s) to stack from the column axis onto the index 

6103 axis, defined as one index or label, or a list of indices 

6104 or labels. 

6105 dropna : bool, default True 

6106 Whether to drop rows in the resulting Frame/Series with 

6107 missing values. Stacking a column level onto the index 

6108 axis can create combinations of index and column values 

6109 that are missing from the original dataframe. See Examples 

6110 section. 

6111 

6112 Returns 

6113 ------- 

6114 DataFrame or Series 

6115 Stacked dataframe or series. 

6116 

6117 See Also 

6118 -------- 

6119 DataFrame.unstack : Unstack prescribed level(s) from index axis 

6120 onto column axis. 

6121 DataFrame.pivot : Reshape dataframe from long format to wide 

6122 format. 

6123 DataFrame.pivot_table : Create a spreadsheet-style pivot table 

6124 as a DataFrame. 

6125 

6126 Notes 

6127 ----- 

6128 The function is named by analogy with a collection of books 

6129 being reorganized from being side by side on a horizontal 

6130 position (the columns of the dataframe) to being stacked 

6131 vertically on top of each other (in the index of the 

6132 dataframe). 

6133 

6134 Examples 

6135 -------- 

6136 **Single level columns** 

6137 

6138 >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]], 

6139 ... index=['cat', 'dog'], 

6140 ... columns=['weight', 'height']) 

6141 

6142 Stacking a dataframe with a single level column axis returns a Series: 

6143 

6144 >>> df_single_level_cols 

6145 weight height 

6146 cat 0 1 

6147 dog 2 3 

6148 >>> df_single_level_cols.stack() 

6149 cat weight 0 

6150 height 1 

6151 dog weight 2 

6152 height 3 

6153 dtype: int64 

6154 

6155 **Multi level columns: simple case** 

6156 

6157 >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'), 

6158 ... ('weight', 'pounds')]) 

6159 >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]], 

6160 ... index=['cat', 'dog'], 

6161 ... columns=multicol1) 

6162 

6163 Stacking a dataframe with a multi-level column axis: 

6164 

6165 >>> df_multi_level_cols1 

6166 weight 

6167 kg pounds 

6168 cat 1 2 

6169 dog 2 4 

6170 >>> df_multi_level_cols1.stack() 

6171 weight 

6172 cat kg 1 

6173 pounds 2 

6174 dog kg 2 

6175 pounds 4 

6176 

6177 **Missing values** 

6178 

6179 >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'), 

6180 ... ('height', 'm')]) 

6181 >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], 

6182 ... index=['cat', 'dog'], 

6183 ... columns=multicol2) 

6184 

6185 It is common to have missing values when stacking a dataframe 

6186 with multi-level columns, as the stacked dataframe typically 

6187 has more values than the original dataframe. Missing values 

6188 are filled with NaNs: 

6189 

6190 >>> df_multi_level_cols2 

6191 weight height 

6192 kg m 

6193 cat 1.0 2.0 

6194 dog 3.0 4.0 

6195 >>> df_multi_level_cols2.stack() 

6196 height weight 

6197 cat kg NaN 1.0 

6198 m 2.0 NaN 

6199 dog kg NaN 3.0 

6200 m 4.0 NaN 

6201 

6202 **Prescribing the level(s) to be stacked** 

6203 

6204 The first parameter controls which level or levels are stacked: 

6205 

6206 >>> df_multi_level_cols2.stack(0) 

6207 kg m 

6208 cat height NaN 2.0 

6209 weight 1.0 NaN 

6210 dog height NaN 4.0 

6211 weight 3.0 NaN 

6212 >>> df_multi_level_cols2.stack([0, 1]) 

6213 cat height m 2.0 

6214 weight kg 1.0 

6215 dog height m 4.0 

6216 weight kg 3.0 

6217 dtype: float64 

6218 

6219 **Dropping missing values** 

6220 

6221 >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]], 

6222 ... index=['cat', 'dog'], 

6223 ... columns=multicol2) 

6224 

6225 Note that rows where all values are missing are dropped by 

6226 default but this behaviour can be controlled via the dropna 

6227 keyword parameter: 

6228 

6229 >>> df_multi_level_cols3 

6230 weight height 

6231 kg m 

6232 cat NaN 1.0 

6233 dog 2.0 3.0 

6234 >>> df_multi_level_cols3.stack(dropna=False) 

6235 height weight 

6236 cat kg NaN NaN 

6237 m 1.0 NaN 

6238 dog kg NaN 2.0 

6239 m 3.0 NaN 

6240 >>> df_multi_level_cols3.stack(dropna=True) 

6241 height weight 

6242 cat m 1.0 NaN 

6243 dog kg NaN 2.0 

6244 m 3.0 NaN 

6245 """ 

6246 from pandas.core.reshape.reshape import stack, stack_multiple 

6247 

6248 if isinstance(level, (tuple, list)): 

6249 return stack_multiple(self, level, dropna=dropna) 

6250 else: 

6251 return stack(self, level, dropna=dropna) 

6252 

6253 def explode(self, column: Union[str, Tuple]) -> "DataFrame": 

6254 """ 

6255 Transform each element of a list-like to a row, replicating index values. 

6256 

6257 .. versionadded:: 0.25.0 

6258 

6259 Parameters 

6260 ---------- 

6261 column : str or tuple 

6262 Column to explode. 

6263 

6264 Returns 

6265 ------- 

6266 DataFrame 

6267 Exploded lists to rows of the subset columns; 

6268 index will be duplicated for these rows. 

6269 

6270 Raises 

6271 ------ 

6272 ValueError : 

6273 if columns of the frame are not unique. 

6274 

6275 See Also 

6276 -------- 

6277 DataFrame.unstack : Pivot a level of the (necessarily hierarchical) 

6278 index labels. 

6279 DataFrame.melt : Unpivot a DataFrame from wide format to long format. 

6280 Series.explode : Explode a Series from list-like entries to long format. 

6281 

6282 Notes 

6283 ----- 

6284 This routine will explode list-likes including lists, tuples, 

6285 Series, and np.ndarray. The result dtype of the subset rows will 

6286 be object. Scalars will be returned unchanged. Empty list-likes will 

6287 result in a np.nan for that row. 

6288 

6289 Examples 

6290 -------- 

6291 >>> df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1}) 

6292 >>> df 

6293 A B 

6294 0 [1, 2, 3] 1 

6295 1 foo 1 

6296 2 [] 1 

6297 3 [3, 4] 1 

6298 

6299 >>> df.explode('A') 

6300 A B 

6301 0 1 1 

6302 0 2 1 

6303 0 3 1 

6304 1 foo 1 

6305 2 NaN 1 

6306 3 3 1 

6307 3 4 1 

6308 """ 

6309 

6310 if not (is_scalar(column) or isinstance(column, tuple)): 

6311 raise ValueError("column must be a scalar") 

6312 if not self.columns.is_unique: 

6313 raise ValueError("columns must be unique") 

6314 
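        # explode the target column on a default integer index, then map the
        # duplicated positions back onto the original index with ``take``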

6315 df = self.reset_index(drop=True) 

6316 # TODO: use overload to refine return type of reset_index 

6317 assert df is not None # needed for mypy 

6318 result = df[column].explode() 

6319 result = df.drop([column], axis=1).join(result) 

6320 result.index = self.index.take(result.index) 

6321 result = result.reindex(columns=self.columns, copy=False) 

6322 

6323 return result 

6324 

6325 def unstack(self, level=-1, fill_value=None): 

6326 """ 

6327 Pivot a level of the (necessarily hierarchical) index labels. 

6328 

6329 Returns a DataFrame having a new level of column labels whose inner-most level 

6330 consists of the pivoted index labels. 

6331 

6332 If the index is not a MultiIndex, the output will be a Series 

6333 (the analogue of stack when the columns are not a MultiIndex). 

6334 

6335 The level involved will automatically get sorted. 

6336 

6337 Parameters 

6338 ---------- 

6339 level : int, str, or list of these, default -1 (last level) 

6340 Level(s) of index to unstack, can pass level name. 

6341 fill_value : int, str or dict 

6342 Replace NaN with this value if the unstack produces missing values. 

6343 

6344 Returns 

6345 ------- 

6346 Series or DataFrame 

6347 

6348 See Also 

6349 -------- 

6350 DataFrame.pivot : Pivot a table based on column values. 

6351 DataFrame.stack : Pivot a level of the column labels (inverse operation 

6352 from `unstack`). 

6353 

6354 Examples 

6355 -------- 

6356 >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), 

6357 ... ('two', 'a'), ('two', 'b')]) 

6358 >>> s = pd.Series(np.arange(1.0, 5.0), index=index) 

6359 >>> s 

6360 one a 1.0 

6361 b 2.0 

6362 two a 3.0 

6363 b 4.0 

6364 dtype: float64 

6365 

6366 >>> s.unstack(level=-1) 

6367 a b 

6368 one 1.0 2.0 

6369 two 3.0 4.0 

6370 

6371 >>> s.unstack(level=0) 

6372 one two 

6373 a 1.0 3.0 

6374 b 2.0 4.0 

6375 

6376 >>> df = s.unstack(level=0) 

6377 >>> df.unstack() 

6378 one a 1.0 

6379 b 2.0 

6380 two a 3.0 

6381 b 4.0 

6382 dtype: float64 

6383 """ 

6384 from pandas.core.reshape.reshape import unstack 

6385 

6386 return unstack(self, level, fill_value) 

6387 

6388 _shared_docs[ 

6389 "melt" 

6390 ] = """ 

6391 Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. 

6392 

6393 This function is useful to massage a DataFrame into a format where one 

6394 or more columns are identifier variables (`id_vars`), while all other 

6395 columns, considered measured variables (`value_vars`), are "unpivoted" to 

6396 the row axis, leaving just two non-identifier columns, 'variable' and 

6397 'value'. 

6398 %(versionadded)s 

6399 Parameters 

6400 ---------- 

6401 id_vars : tuple, list, or ndarray, optional 

6402 Column(s) to use as identifier variables. 

6403 value_vars : tuple, list, or ndarray, optional 

6404 Column(s) to unpivot. If not specified, uses all columns that 

6405 are not set as `id_vars`. 

6406 var_name : scalar 

6407 Name to use for the 'variable' column. If None it uses 

6408 ``frame.columns.name`` or 'variable'. 

6409 value_name : scalar, default 'value' 

6410 Name to use for the 'value' column. 

6411 col_level : int or str, optional 

6412 If columns are a MultiIndex then use this level to melt. 

6413 

6414 Returns 

6415 ------- 

6416 DataFrame 

6417 Unpivoted DataFrame. 

6418 

6419 See Also 

6420 -------- 

6421 %(other)s 

6422 pivot_table 

6423 DataFrame.pivot 

6424 Series.explode 

6425 

6426 Examples 

6427 -------- 

6428 >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, 

6429 ... 'B': {0: 1, 1: 3, 2: 5}, 

6430 ... 'C': {0: 2, 1: 4, 2: 6}}) 

6431 >>> df 

6432 A B C 

6433 0 a 1 2 

6434 1 b 3 4 

6435 2 c 5 6 

6436 

6437 >>> %(caller)sid_vars=['A'], value_vars=['B']) 

6438 A variable value 

6439 0 a B 1 

6440 1 b B 3 

6441 2 c B 5 

6442 

6443 >>> %(caller)sid_vars=['A'], value_vars=['B', 'C']) 

6444 A variable value 

6445 0 a B 1 

6446 1 b B 3 

6447 2 c B 5 

6448 3 a C 2 

6449 4 b C 4 

6450 5 c C 6 

6451 

6452 The names of 'variable' and 'value' columns can be customized: 

6453 

6454 >>> %(caller)sid_vars=['A'], value_vars=['B'], 

6455 ... var_name='myVarname', value_name='myValname') 

6456 A myVarname myValname 

6457 0 a B 1 

6458 1 b B 3 

6459 2 c B 5 

6460 

6461 If you have multi-index columns: 

6462 

6463 >>> df.columns = [list('ABC'), list('DEF')] 

6464 >>> df 

6465 A B C 

6466 D E F 

6467 0 a 1 2 

6468 1 b 3 4 

6469 2 c 5 6 

6470 

6471 >>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B']) 

6472 A variable value 

6473 0 a B 1 

6474 1 b B 3 

6475 2 c B 5 

6476 

6477 >>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')]) 

6478 (A, D) variable_0 variable_1 value 

6479 0 a B E 1 

6480 1 b B E 3 

6481 2 c B E 5 

6482 """ 

6483 

6484 @Appender( 

6485 _shared_docs["melt"] 

6486 % dict( 

6487 caller="df.melt(", versionadded=".. versionadded:: 0.20.0\n", other="melt" 

6488 ) 

6489 ) 

6490 def melt( 

6491 self, 

6492 id_vars=None, 

6493 value_vars=None, 

6494 var_name=None, 

6495 value_name="value", 

6496 col_level=None, 

6497 ) -> "DataFrame": 

6498 from pandas.core.reshape.melt import melt 

6499 

6500 return melt( 

6501 self, 

6502 id_vars=id_vars, 

6503 value_vars=value_vars, 

6504 var_name=var_name, 

6505 value_name=value_name, 

6506 col_level=col_level, 

6507 ) 

6508 

6509 # ---------------------------------------------------------------------- 

6510 # Time series-related 

6511 

6512 def diff(self, periods=1, axis=0) -> "DataFrame": 

6513 """ 

6514 First discrete difference of element. 

6515 

6516 Calculates the difference of a DataFrame element compared with another 

6517 element in the DataFrame (default is the element in the same column 

6518 of the previous row). 

6519 

6520 Parameters 

6521 ---------- 

6522 periods : int, default 1 

6523 Periods to shift for calculating difference, accepts negative 

6524 values. 

6525 axis : {0 or 'index', 1 or 'columns'}, default 0 

6526 Take difference over rows (0) or columns (1). 

6527 

6528 Returns 

6529 ------- 

6530 DataFrame 

6531 

6532 See Also 

6533 -------- 

6534 Series.diff: First discrete difference for a Series. 

6535 DataFrame.pct_change: Percent change over given number of periods. 

6536 DataFrame.shift: Shift index by desired number of periods with an 

6537 optional time freq. 

6538 

6539 Notes 

6540 ----- 

6541 For boolean dtypes, this uses :meth:`operator.xor` rather than 

6542 :meth:`operator.sub`. 

6543 

6544 Examples 

6545 -------- 

6546 Difference with previous row 

6547 

6548 >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6], 

6549 ... 'b': [1, 1, 2, 3, 5, 8], 

6550 ... 'c': [1, 4, 9, 16, 25, 36]}) 

6551 >>> df 

6552 a b c 

6553 0 1 1 1 

6554 1 2 1 4 

6555 2 3 2 9 

6556 3 4 3 16 

6557 4 5 5 25 

6558 5 6 8 36 

6559 

6560 >>> df.diff() 

6561 a b c 

6562 0 NaN NaN NaN 

6563 1 1.0 0.0 3.0 

6564 2 1.0 1.0 5.0 

6565 3 1.0 1.0 7.0 

6566 4 1.0 2.0 9.0 

6567 5 1.0 3.0 11.0 

6568 

6569 Difference with previous column 

6570 

6571 >>> df.diff(axis=1) 

6572 a b c 

6573 0 NaN 0.0 0.0 

6574 1 NaN -1.0 3.0 

6575 2 NaN -1.0 7.0 

6576 3 NaN -1.0 13.0 

6577 4 NaN 0.0 20.0 

6578 5 NaN 2.0 28.0 

6579 

6580 Difference with 3rd previous row 

6581 

6582 >>> df.diff(periods=3) 

6583 a b c 

6584 0 NaN NaN NaN 

6585 1 NaN NaN NaN 

6586 2 NaN NaN NaN 

6587 3 3.0 2.0 15.0 

6588 4 3.0 4.0 21.0 

6589 5 3.0 6.0 27.0 

6590 

6591 Difference with following row 

6592 

6593 >>> df.diff(periods=-1) 

6594 a b c 

6595 0 -1.0 0.0 -3.0 

6596 1 -1.0 -1.0 -5.0 

6597 2 -1.0 -1.0 -7.0 

6598 3 -1.0 -2.0 -9.0 

6599 4 -1.0 -3.0 -11.0 

6600 5 NaN NaN NaN 

6601 """ 
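        # the BlockManager stores a DataFrame transposed, so translate the
        # user-facing axis into the internal block-manager axis first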

6602 bm_axis = self._get_block_manager_axis(axis) 

6603 new_data = self._data.diff(n=periods, axis=bm_axis) 

6604 return self._constructor(new_data) 

6605 

6606 # ---------------------------------------------------------------------- 

6607 # Function application 

6608 

6609 def _gotitem( 

6610 self, 

6611 key: Union[str, List[str]], 

6612 ndim: int, 

6613 subset: Optional[Union[Series, ABCDataFrame]] = None, 

6614 ) -> Union[Series, ABCDataFrame]: 

6615 """ 

6616 Sub-classes to define. Return a sliced object. 

6617 

6618 Parameters 

6619 ---------- 

6620 key : string / list of selections 

6621 ndim : 1,2 

6622 requested ndim of result 

6623 subset : object, default None 

6624 subset to act on 

6625 """ 

6626 if subset is None: 

6627 subset = self 

6628 elif subset.ndim == 1: # is Series 

6629 return subset 

6630 

6631 # TODO: _shallow_copy(subset)? 

6632 return subset[key] 

6633 

6634 _agg_summary_and_see_also_doc = dedent( 

6635 """ 

6636 The aggregation operations are always performed over an axis, either the 

6637 index (default) or the column axis. This behavior is different from 

6638 `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`, 

6639 `var`), where the default is to compute the aggregation of the flattened 

6640 array, e.g., ``numpy.mean(arr_2d)`` as opposed to 

6641 ``numpy.mean(arr_2d, axis=0)``. 

6642 

6643 `agg` is an alias for `aggregate`. Use the alias. 

6644 

6645 See Also 

6646 -------- 

6647 DataFrame.apply : Perform any type of operations. 

6648 DataFrame.transform : Perform transformation type operations. 

6649 core.groupby.GroupBy : Perform operations over groups. 

6650 core.resample.Resampler : Perform operations over resampled bins. 

6651 core.window.Rolling : Perform operations over rolling window. 

6652 core.window.Expanding : Perform operations over expanding window. 

6653 core.window.EWM : Perform operation over exponential weighted 

6654 window. 

6655 """ 

6656 ) 

6657 

6658 _agg_examples_doc = dedent( 

6659 """ 

6660 Examples 

6661 -------- 

6662 >>> df = pd.DataFrame([[1, 2, 3], 

6663 ... [4, 5, 6], 

6664 ... [7, 8, 9], 

6665 ... [np.nan, np.nan, np.nan]], 

6666 ... columns=['A', 'B', 'C']) 

6667 

6668 Aggregate these functions over the rows. 

6669 

6670 >>> df.agg(['sum', 'min']) 

6671 A B C 

6672 sum 12.0 15.0 18.0 

6673 min 1.0 2.0 3.0 

6674 

6675 Different aggregations per column. 

6676 

6677 >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']}) 

6678 A B 

6679 max NaN 8.0 

6680 min 1.0 2.0 

6681 sum 12.0 NaN 

6682 

6683 Aggregate over the columns. 

6684 

6685 >>> df.agg("mean", axis="columns") 

6686 0 2.0 

6687 1 5.0 

6688 2 8.0 

6689 3 NaN 

6690 dtype: float64 

6691 """ 

6692 ) 

6693 

6694 @Substitution( 

6695 see_also=_agg_summary_and_see_also_doc, 

6696 examples=_agg_examples_doc, 

6697 versionadded="\n.. versionadded:: 0.20.0\n", 

6698 **_shared_doc_kwargs, 

6699 ) 

6700 @Appender(_shared_docs["aggregate"]) 

6701 def aggregate(self, func, axis=0, *args, **kwargs): 

6702 axis = self._get_axis_number(axis) 

6703 
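        # try the dedicated aggregation machinery first; a TypeError there
        # (e.g. an unsupported func/dtype combination) falls back to apply below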

6704 result = None 

6705 try: 

6706 result, how = self._aggregate(func, axis=axis, *args, **kwargs) 

6707 except TypeError: 

6708 pass 

6709 if result is None: 

6710 return self.apply(func, axis=axis, args=args, **kwargs) 

6711 return result 

6712 

6713 def _aggregate(self, arg, axis=0, *args, **kwargs): 

6714 if axis == 1: 

6715 # NDFrame.aggregate returns a tuple, and we need to transpose 

6716 # only the result 

6717 result, how = self.T._aggregate(arg, *args, **kwargs) 

6718 result = result.T if result is not None else result 

6719 return result, how 

6720 return super()._aggregate(arg, *args, **kwargs) 

6721 

6722 agg = aggregate 

6723 

6724 @Appender(_shared_docs["transform"] % _shared_doc_kwargs) 

6725 def transform(self, func, axis=0, *args, **kwargs) -> "DataFrame": 

6726 axis = self._get_axis_number(axis) 

6727 if axis == 1: 

6728 return self.T.transform(func, *args, **kwargs).T 

6729 return super().transform(func, *args, **kwargs) 

6730 
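    # A minimal usage sketch for ``transform`` (its docstring is injected
    # above from ``_shared_docs``); assuming a small numeric frame:
    #
    # >>> df = pd.DataFrame({'A': [1, 4, 9]})
    # >>> df.transform(np.sqrt)
    #      A
    # 0  1.0
    # 1  2.0
    # 2  3.0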

6731 def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): 

6732 """ 

6733 Apply a function along an axis of the DataFrame. 

6734 

6735 Objects passed to the function are Series objects whose index is 

6736 either the DataFrame's index (``axis=0``) or the DataFrame's columns 

6737 (``axis=1``). By default (``result_type=None``), the final return type 

6738 is inferred from the return type of the applied function. Otherwise, 

6739 it depends on the `result_type` argument. 

6740 

6741 Parameters 

6742 ---------- 

6743 func : function 

6744 Function to apply to each column or row. 

6745 axis : {0 or 'index', 1 or 'columns'}, default 0 

6746 Axis along which the function is applied: 

6747 

6748 * 0 or 'index': apply function to each column. 

6749 * 1 or 'columns': apply function to each row. 

6750 

6751 raw : bool, default False 

6752 Determines if row or column is passed as a Series or ndarray object: 

6753 

6754 * ``False`` : passes each row or column as a Series to the 

6755 function. 

6756 * ``True`` : the passed function will receive ndarray objects 

6757 instead. 

6758 If you are just applying a NumPy reduction function this will 

6759 achieve much better performance. 

6760 

6761 result_type : {'expand', 'reduce', 'broadcast', None}, default None 

6762 These only act when ``axis=1`` (columns): 

6763 

6764 * 'expand' : list-like results will be turned into columns. 

6765 * 'reduce' : returns a Series if possible rather than expanding 

6766 list-like results. This is the opposite of 'expand'. 

6767 * 'broadcast' : results will be broadcast to the original shape 

6768 of the DataFrame, the original index and columns will be 

6769 retained. 

6770 

6771 The default behaviour (None) depends on the return value of the 

6772 applied function: list-like results will be returned as a Series 

6773 of those. However, if the applied function returns a Series, these 

6774 are expanded to columns. 

6775 

6776 .. versionadded:: 0.23.0 

6777 

6778 args : tuple 

6779 Positional arguments to pass to `func` in addition to the 

6780 array/series. 

6781 **kwds 

6782 Additional keyword arguments to pass as keywords arguments to 

6783 `func`. 

6784 

6785 Returns 

6786 ------- 

6787 Series or DataFrame 

6788 Result of applying ``func`` along the given axis of the 

6789 DataFrame. 

6790 

6791 See Also 

6792 -------- 

6793 DataFrame.applymap: For elementwise operations. 

6794 DataFrame.aggregate: Only perform aggregating type operations. 

6795 DataFrame.transform: Only perform transforming type operations. 

6796 

6797 Examples 

6798 -------- 

6799 

6800 >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B']) 

6801 >>> df 

6802 A B 

6803 0 4 9 

6804 1 4 9 

6805 2 4 9 

6806 

6807 Using a numpy universal function (in this case the same as 

6808 ``np.sqrt(df)``): 

6809 

6810 >>> df.apply(np.sqrt) 

6811 A B 

6812 0 2.0 3.0 

6813 1 2.0 3.0 

6814 2 2.0 3.0 

6815 

6816 Using a reducing function on either axis 

6817 

6818 >>> df.apply(np.sum, axis=0) 

6819 A 12 

6820 B 27 

6821 dtype: int64 

6822 

6823 >>> df.apply(np.sum, axis=1) 

6824 0 13 

6825 1 13 

6826 2 13 

6827 dtype: int64 

6828 

6829 Returning a list-like will result in a Series 

6830 

6831 >>> df.apply(lambda x: [1, 2], axis=1) 

6832 0 [1, 2] 

6833 1 [1, 2] 

6834 2 [1, 2] 

6835 dtype: object 

6836 

6837 Passing result_type='expand' will expand list-like results 

6838 to columns of a DataFrame 

6839 

6840 >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand') 

6841 0 1 

6842 0 1 2 

6843 1 1 2 

6844 2 1 2 

6845 

6846 Returning a Series inside the function is similar to passing 

6847 ``result_type='expand'``. The resulting column names 

6848 will be the Series index. 

6849 

6850 >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1) 

6851 foo bar 

6852 0 1 2 

6853 1 1 2 

6854 2 1 2 

6855 

6856 Passing ``result_type='broadcast'`` will ensure the same shape 

6857 result, whether list-like or scalar is returned by the function, 

6858 and broadcast it along the axis. The resulting column names will 

6859 be the originals. 

6860 

6861 >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast') 

6862 A B 

6863 0 1 2 

6864 1 1 2 

6865 2 1 2 

6866 """ 

6867 from pandas.core.apply import frame_apply 

6868 

6869 op = frame_apply( 

6870 self, 

6871 func=func, 

6872 axis=axis, 

6873 raw=raw, 

6874 result_type=result_type, 

6875 args=args, 

6876 kwds=kwds, 

6877 ) 

6878 return op.get_result() 

6879 

6880 def applymap(self, func) -> "DataFrame": 

6881 """ 

6882 Apply a function to a DataFrame elementwise. 

6883 

6884 This method applies a function that accepts and returns a scalar 

6885 to every element of a DataFrame. 

6886 

6887 Parameters 

6888 ---------- 

6889 func : callable 

6890 Python function, returns a single value from a single value. 

6891 

6892 Returns 

6893 ------- 

6894 DataFrame 

6895 Transformed DataFrame. 

6896 

6897 See Also 

6898 -------- 

6899 DataFrame.apply : Apply a function along input axis of DataFrame. 

6900 

6901 Notes 

6902 ----- 

6903 In the current implementation applymap calls `func` twice on the 

6904 first column/row to decide whether it can take a fast or slow 

6905 code path. This can lead to unexpected behavior if `func` has 

6906 side-effects, as they will take effect twice for the first 

6907 column/row. 

6908 

6909 Examples 

6910 -------- 

6911 >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]]) 

6912 >>> df 

6913 0 1 

6914 0 1.000 2.120 

6915 1 3.356 4.567 

6916 

6917 >>> df.applymap(lambda x: len(str(x))) 

6918 0 1 

6919 0 3 4 

6920 1 5 5 

6921 

6922 Note that a vectorized version of `func` often exists, which will 

6923 be much faster. You could square each number elementwise. 

6924 

6925 >>> df.applymap(lambda x: x**2) 

6926 0 1 

6927 0 1.000000 4.494400 

6928 1 11.262736 20.857489 

6929 

6930 But it's better to avoid applymap in that case. 

6931 

6932 >>> df ** 2 

6933 0 1 

6934 0 1.000000 4.494400 

6935 1 11.262736 20.857489 

6936 """ 

6937 

6938 # if we have a dtype == 'M8[ns]', provide boxed values 

6939 def infer(x): 

6940 if x.empty: 

6941 return lib.map_infer(x, func) 

6942 return lib.map_infer(x.astype(object).values, func) 

6943 

6944 return self.apply(infer) 

6945 

6946 # ---------------------------------------------------------------------- 

6947 # Merging / joining methods 

6948 

6949 def append( 

6950 self, other, ignore_index=False, verify_integrity=False, sort=False 

6951 ) -> "DataFrame": 

6952 """ 

6953 Append rows of `other` to the end of caller, returning a new object. 

6954 

6955 Columns in `other` that are not in the caller are added as new columns. 

6956 

6957 Parameters 

6958 ---------- 

6959 other : DataFrame or Series/dict-like object, or list of these 

6960 The data to append. 

6961 ignore_index : bool, default False 

6962 If True, do not use the index labels. 

6963 verify_integrity : bool, default False 

6964 If True, raise ValueError on creating index with duplicates. 

6965 sort : bool, default False 

6966 Sort columns if the columns of `self` and `other` are not aligned. 

6967 

6968 .. versionadded:: 0.23.0 

6969 .. versionchanged:: 1.0.0 

6970 

6971 Changed to not sort by default. 

6972 

6973 Returns 

6974 ------- 

6975 DataFrame 

6976 

6977 See Also 

6978 -------- 

6979 concat : General function to concatenate DataFrame or Series objects. 

6980 

6981 Notes 

6982 ----- 

6983 If a list of dict/series is passed and the keys are all contained in 

6984 the DataFrame's index, the order of the columns in the resulting 

6985 DataFrame will be unchanged. 

6986 

6987 Iteratively appending rows to a DataFrame can be more computationally 

6988 intensive than a single concatenate. A better solution is to append 

6989 those rows to a list and then concatenate the list with the original 

6990 DataFrame all at once. 

6991 

6992 Examples 

6993 -------- 

6994 

6995 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB')) 

6996 >>> df 

6997 A B 

6998 0 1 2 

6999 1 3 4 

7000 >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB')) 

7001 >>> df.append(df2) 

7002 A B 

7003 0 1 2 

7004 1 3 4 

7005 0 5 6 

7006 1 7 8 

7007 

7008 With `ignore_index` set to True: 

7009 

7010 >>> df.append(df2, ignore_index=True) 

7011 A B 

7012 0 1 2 

7013 1 3 4 

7014 2 5 6 

7015 3 7 8 

7016 

7017 The following examples, while not recommended ways of generating a DataFrame, 

7018 show two approaches to building one from multiple data sources. 

7019 

7020 Less efficient: 

7021 

7022 >>> df = pd.DataFrame(columns=['A']) 

7023 >>> for i in range(5): 

7024 ... df = df.append({'A': i}, ignore_index=True) 

7025 >>> df 

7026 A 

7027 0 0 

7028 1 1 

7029 2 2 

7030 3 3 

7031 4 4 

7032 

7033 More efficient: 

7034 

7035 >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)], 

7036 ... ignore_index=True) 

7037 A 

7038 0 0 

7039 1 1 

7040 2 2 

7041 3 3 

7042 4 4 

7043 """ 

7044 if isinstance(other, (Series, dict)): 

7045 if isinstance(other, dict): 

7046 other = Series(other) 

7047 if other.name is None and not ignore_index: 

7048 raise TypeError( 

7049 "Can only append a Series if ignore_index=True " 

7050 "or if the Series has a name" 

7051 ) 

7052 

7053 index = Index([other.name], name=self.index.name) 

7054 idx_diff = other.index.difference(self.columns) 

7055 try: 

7056 combined_columns = self.columns.append(idx_diff) 

7057 except TypeError: 

7058 combined_columns = self.columns.astype(object).append(idx_diff) 

7059 other = ( 

7060 other.reindex(combined_columns, copy=False) 

7061 .to_frame() 

7062 .T.infer_objects() 

7063 .rename_axis(index.names, copy=False) 

7064 ) 

7065 if not self.columns.equals(combined_columns): 

7066 self = self.reindex(columns=combined_columns) 

7067 elif isinstance(other, list): 

7068 if not other: 

7069 pass 

7070 elif not isinstance(other[0], DataFrame): 

7071 other = DataFrame(other) 

7072 if (self.columns.get_indexer(other.columns) >= 0).all(): 

7073 other = other.reindex(columns=self.columns) 

7074 

7075 from pandas.core.reshape.concat import concat 

7076 

7077 if isinstance(other, (list, tuple)): 

7078 to_concat = [self, *other] 

7079 else: 

7080 to_concat = [self, other] 

7081 return concat( 

7082 to_concat, 

7083 ignore_index=ignore_index, 

7084 verify_integrity=verify_integrity, 

7085 sort=sort, 

7086 ) 

7087 

7088 def join( 

7089 self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False 

7090 ) -> "DataFrame": 

7091 """ 

7092 Join columns of another DataFrame. 

7093 

7094 Join columns with `other` DataFrame either on index or on a key 

7095 column. Efficiently join multiple DataFrame objects by index at once by 

7096 passing a list. 

7097 

7098 Parameters 

7099 ---------- 

7100 other : DataFrame, Series, or list of DataFrame 

7101 Index should be similar to one of the columns in this one. If a 

7102 Series is passed, its name attribute must be set, and that will be 

7103 used as the column name in the resulting joined DataFrame. 

7104 on : str, list of str, or array-like, optional 

7105 Column or index level name(s) in the caller to join on the index 

7106 in `other`, otherwise joins index-on-index. If multiple 

7107 values given, the `other` DataFrame must have a MultiIndex. Can 

7108 pass an array as the join key if it is not already contained in 

7109 the calling DataFrame. Like an Excel VLOOKUP operation. 

7110 how : {'left', 'right', 'outer', 'inner'}, default 'left' 

7111 How to handle the operation of the two objects. 

7112 

7113 * left: use calling frame's index (or column if on is specified) 

7114 * right: use `other`'s index. 

7115 * outer: form union of calling frame's index (or column if on is

7116 specified) with `other`'s index, and sort it

7117 lexicographically.

7118 * inner: form intersection of calling frame's index (or column if

7119 on is specified) with `other`'s index, preserving the order

7120 of the calling frame's index.

7121 lsuffix : str, default '' 

7122 Suffix to use for the left frame's overlapping columns.

7123 rsuffix : str, default ''

7124 Suffix to use for the right frame's overlapping columns.

7125 sort : bool, default False 

7126 Order result DataFrame lexicographically by the join key. If False, 

7127 the order of the join key depends on the join type (how keyword). 

7128 

7129 Returns 

7130 ------- 

7131 DataFrame 

7132 A dataframe containing columns from both the caller and `other`. 

7133 

7134 See Also 

7135 -------- 

7136 DataFrame.merge : For column(s)-on-columns(s) operations. 

7137 

7138 Notes 

7139 ----- 

7140 Parameters `on`, `lsuffix`, and `rsuffix` are not supported when 

7141 passing a list of `DataFrame` objects. 

7142 

7143 Support for specifying index levels as the `on` parameter was added 

7144 in version 0.23.0. 

7145 

7146 Examples 

7147 -------- 

7148 >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], 

7149 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) 

7150 

7151 >>> df 

7152 key A 

7153 0 K0 A0 

7154 1 K1 A1 

7155 2 K2 A2 

7156 3 K3 A3 

7157 4 K4 A4 

7158 5 K5 A5 

7159 

7160 >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], 

7161 ... 'B': ['B0', 'B1', 'B2']}) 

7162 

7163 >>> other 

7164 key B 

7165 0 K0 B0 

7166 1 K1 B1 

7167 2 K2 B2 

7168 

7169 Join DataFrames using their indexes. 

7170 

7171 >>> df.join(other, lsuffix='_caller', rsuffix='_other') 

7172 key_caller A key_other B 

7173 0 K0 A0 K0 B0 

7174 1 K1 A1 K1 B1 

7175 2 K2 A2 K2 B2 

7176 3 K3 A3 NaN NaN 

7177 4 K4 A4 NaN NaN 

7178 5 K5 A5 NaN NaN 

7179 

7180 If we want to join using the key columns, we need to set key to be 

7181 the index in both `df` and `other`. The joined DataFrame will have 

7182 key as its index. 

7183 

7184 >>> df.set_index('key').join(other.set_index('key')) 

7185 A B 

7186 key 

7187 K0 A0 B0 

7188 K1 A1 B1 

7189 K2 A2 B2 

7190 K3 A3 NaN 

7191 K4 A4 NaN 

7192 K5 A5 NaN 

7193 

7194 Another option to join using the key columns is to use the `on` 

7195 parameter. DataFrame.join always uses `other`'s index but we can use 

7196 any column in `df`. This method preserves the original DataFrame's 

7197 index in the result. 

7198 

7199 >>> df.join(other.set_index('key'), on='key') 

7200 key A B 

7201 0 K0 A0 B0 

7202 1 K1 A1 B1 

7203 2 K2 A2 B2 

7204 3 K3 A3 NaN 

7205 4 K4 A4 NaN 

7206 5 K5 A5 NaN 
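As noted above, `on`, `lsuffix`, and `rsuffix` are not supported when
joining a list of DataFrames; the frames are simply aligned on their
indexes. A minimal sketch (``extra`` and ``extra2`` are hypothetical
frames with non-overlapping column names; output alignment indicative):

>>> extra = pd.DataFrame({'B': ['B0', 'B1', 'B2']})
>>> extra2 = pd.DataFrame({'C': ['C0', 'C1']})
>>> df.join([extra, extra2])
  key   A    B    C
0  K0  A0   B0   C0
1  K1  A1   B1   C1
2  K2  A2   B2  NaN
3  K3  A3  NaN  NaN
4  K4  A4  NaN  NaN
5  K5  A5  NaN  NaN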

7207 """ 

7208 return self._join_compat( 

7209 other, on=on, how=how, lsuffix=lsuffix, rsuffix=rsuffix, sort=sort 

7210 ) 

7211 

7212 def _join_compat( 

7213 self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False 

7214 ): 

7215 from pandas.core.reshape.merge import merge 

7216 from pandas.core.reshape.concat import concat 

7217 

7218 if isinstance(other, Series): 

7219 if other.name is None: 

7220 raise ValueError("Other Series must have a name") 

7221 other = DataFrame({other.name: other}) 

7222 

7223 if isinstance(other, DataFrame): 

7224 return merge( 

7225 self, 

7226 other, 

7227 left_on=on, 

7228 how=how, 

7229 left_index=on is None, 

7230 right_index=True, 

7231 suffixes=(lsuffix, rsuffix), 

7232 sort=sort, 

7233 ) 

7234 else: 

7235 if on is not None: 

7236 raise ValueError( 

7237 "Joining multiple DataFrames only supported for joining on index" 

7238 ) 

7239 

7240 frames = [self] + list(other) 

7241 

7242 can_concat = all(df.index.is_unique for df in frames) 

7243 

7244 # join indexes only using concat 

7245 if can_concat: 

7246 if how == "left": 

7247 res = concat( 

7248 frames, axis=1, join="outer", verify_integrity=True, sort=sort 

7249 ) 

7250 return res.reindex(self.index, copy=False) 

7251 else: 

7252 return concat( 

7253 frames, axis=1, join=how, verify_integrity=True, sort=sort 

7254 ) 

7255 

7256 joined = frames[0] 

7257 

7258 for frame in frames[1:]: 

7259 joined = merge( 

7260 joined, frame, how=how, left_index=True, right_index=True 

7261 ) 

7262 

7263 return joined 

7264 

7265 @Substitution("") 

7266 @Appender(_merge_doc, indents=2) 

7267 def merge( 

7268 self, 

7269 right, 

7270 how="inner", 

7271 on=None, 

7272 left_on=None, 

7273 right_on=None, 

7274 left_index=False, 

7275 right_index=False, 

7276 sort=False, 

7277 suffixes=("_x", "_y"), 

7278 copy=True, 

7279 indicator=False, 

7280 validate=None, 

7281 ) -> "DataFrame": 

7282 from pandas.core.reshape.merge import merge 

7283 

7284 return merge( 

7285 self, 

7286 right, 

7287 how=how, 

7288 on=on, 

7289 left_on=left_on, 

7290 right_on=right_on, 

7291 left_index=left_index, 

7292 right_index=right_index, 

7293 sort=sort, 

7294 suffixes=suffixes, 

7295 copy=copy, 

7296 indicator=indicator, 

7297 validate=validate, 

7298 ) 
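# The full parameter documentation for ``merge`` is injected from the
# shared ``_merge_doc`` template via the ``Appender`` decorator above.
# A minimal usage sketch of the default column-on-column inner join
# (only the shared key 'a' survives):
#
# >>> left = pd.DataFrame({'key': ['a', 'b'], 'lval': [1, 2]})
# >>> right = pd.DataFrame({'key': ['a', 'c'], 'rval': [3, 4]})
# >>> left.merge(right, on='key')
#   key  lval  rval
# 0   a     1     3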

7299 

7300 def round(self, decimals=0, *args, **kwargs) -> "DataFrame": 

7301 """ 

7302 Round a DataFrame to a variable number of decimal places. 

7303 

7304 Parameters 

7305 ---------- 

7306 decimals : int, dict, Series 

7307 Number of decimal places to round each column to. If an int is 

7308 given, round each column to the same number of places. 

7309 Otherwise dict and Series round to variable numbers of places. 

7310 Column names should be in the keys if `decimals` is a 

7311 dict-like, or in the index if `decimals` is a Series. Any 

7312 columns not included in `decimals` will be left as is. Elements 

7313 of `decimals` which are not columns of the input will be 

7314 ignored. 

7315 *args 

7316 Additional keywords have no effect but might be accepted for 

7317 compatibility with numpy. 

7318 **kwargs 

7319 Additional keywords have no effect but might be accepted for 

7320 compatibility with numpy. 

7321 

7322 Returns 

7323 ------- 

7324 DataFrame 

7325 A DataFrame with the affected columns rounded to the specified 

7326 number of decimal places. 

7327 

7328 See Also 

7329 -------- 

7330 numpy.around : Round a numpy array to the given number of decimals. 

7331 Series.round : Round a Series to the given number of decimals. 

7332 

7333 Examples 

7334 -------- 

7335 >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], 

7336 ... columns=['dogs', 'cats']) 

7337 >>> df 

7338 dogs cats 

7339 0 0.21 0.32 

7340 1 0.01 0.67 

7341 2 0.66 0.03 

7342 3 0.21 0.18 

7343 

7344 By providing an integer each column is rounded to the same number 

7345 of decimal places 

7346 

7347 >>> df.round(1) 

7348 dogs cats 

7349 0 0.2 0.3 

7350 1 0.0 0.7 

7351 2 0.7 0.0 

7352 3 0.2 0.2 

7353 

7354 With a dict, the number of places for specific columns can be 

7355 specified with the column names as key and the number of decimal 

7356 places as value 

7357 

7358 >>> df.round({'dogs': 1, 'cats': 0}) 

7359 dogs cats 

7360 0 0.2 0.0 

7361 1 0.0 1.0 

7362 2 0.7 0.0 

7363 3 0.2 0.0 

7364 

7365 Using a Series, the number of places for specific columns can be 

7366 specified with the column names as index and the number of 

7367 decimal places as value 

7368 

7369 >>> decimals = pd.Series([0, 1], index=['cats', 'dogs']) 

7370 >>> df.round(decimals) 

7371 dogs cats 

7372 0 0.2 0.0 

7373 1 0.0 1.0 

7374 2 0.7 0.0 

7375 3 0.2 0.0 

7376 """ 

7377 from pandas.core.reshape.concat import concat 

7378 

7379 def _dict_round(df, decimals): 

7380 for col, vals in df.items(): 

7381 try: 

7382 yield _series_round(vals, decimals[col]) 

7383 except KeyError: 

7384 yield vals 

7385 

7386 def _series_round(s, decimals): 

7387 if is_integer_dtype(s) or is_float_dtype(s): 

7388 return s.round(decimals) 

7389 return s 

7390 

7391 nv.validate_round(args, kwargs) 

7392 

7393 if isinstance(decimals, (dict, Series)): 

7394 if isinstance(decimals, Series): 

7395 if not decimals.index.is_unique: 

7396 raise ValueError("Index of decimals must be unique") 

7397 new_cols = list(_dict_round(self, decimals)) 

7398 elif is_integer(decimals): 

7399 # Dispatch to Series.round 

7400 new_cols = [_series_round(v, decimals) for _, v in self.items()] 

7401 else: 

7402 raise TypeError("decimals must be an integer, a dict-like or a Series") 

7403 

7404 if len(new_cols) > 0: 

7405 return self._constructor( 

7406 concat(new_cols, axis=1), index=self.index, columns=self.columns 

7407 ) 

7408 else: 

7409 return self 

7410 

7411 # ---------------------------------------------------------------------- 

7412 # Statistical methods, etc. 

7413 

7414 def corr(self, method="pearson", min_periods=1) -> "DataFrame": 

7415 """ 

7416 Compute pairwise correlation of columns, excluding NA/null values. 

7417 

7418 Parameters 

7419 ---------- 

7420 method : {'pearson', 'kendall', 'spearman'} or callable 

7421 Method of correlation: 

7422 

7423 * pearson : standard correlation coefficient 

7424 * kendall : Kendall Tau correlation coefficient 

7425 * spearman : Spearman rank correlation 

7426 * callable: callable with input two 1d ndarrays 

7427 and returning a float. Note that the returned matrix from corr 

7428 will have 1 along the diagonals and will be symmetric 

7429 regardless of the callable's behavior. 

7430 

7431 .. versionadded:: 0.24.0 

7432 

7433 min_periods : int, optional 

7434 Minimum number of observations required per pair of columns 

7435 to have a valid result. Currently only available for Pearson 

7436 and Spearman correlation. 

7437 

7438 Returns 

7439 ------- 

7440 DataFrame 

7441 Correlation matrix. 

7442 

7443 See Also 

7444 -------- 

7445 DataFrame.corrwith 

7446 Series.corr 

7447 

7448 Examples 

7449 -------- 

7450 >>> def histogram_intersection(a, b): 

7451 ... v = np.minimum(a, b).sum().round(decimals=1) 

7452 ... return v 

7453 >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], 

7454 ... columns=['dogs', 'cats']) 

7455 >>> df.corr(method=histogram_intersection) 

7456 dogs cats 

7457 dogs 1.0 0.3 

7458 cats 0.3 1.0 
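As a further sketch, a built-in method on the same frame (Spearman is
Pearson computed on ranks; values shown to the default six decimals):

>>> df.corr(method='spearman')
          dogs      cats
dogs  1.000000 -0.948683
cats -0.948683  1.000000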

7459 """ 

7460 numeric_df = self._get_numeric_data() 

7461 cols = numeric_df.columns 

7462 idx = cols.copy() 

7463 mat = numeric_df.values 

7464 

7465 if method == "pearson": 

7466 correl = libalgos.nancorr(ensure_float64(mat), minp=min_periods) 

7467 elif method == "spearman": 

7468 correl = libalgos.nancorr_spearman(ensure_float64(mat), minp=min_periods) 

7469 elif method == "kendall" or callable(method): 

7470 if min_periods is None: 

7471 min_periods = 1 

7472 mat = ensure_float64(mat).T 

7473 corrf = nanops.get_corr_func(method) 

7474 K = len(cols) 

7475 correl = np.empty((K, K), dtype=float) 

7476 mask = np.isfinite(mat) 

7477 for i, ac in enumerate(mat): 

7478 for j, bc in enumerate(mat): 

7479 if i > j: 

7480 continue 

7481 

7482 valid = mask[i] & mask[j] 

7483 if valid.sum() < min_periods: 

7484 c = np.nan 

7485 elif i == j: 

7486 c = 1.0 

7487 elif not valid.all(): 

7488 c = corrf(ac[valid], bc[valid]) 

7489 else: 

7490 c = corrf(ac, bc) 

7491 correl[i, j] = c 

7492 correl[j, i] = c 

7493 else: 

7494 raise ValueError( 

7495 "method must be either 'pearson', " 

7496 "'spearman', 'kendall', or a callable, " 

7497 f"'{method}' was supplied" 

7498 ) 

7499 

7500 return self._constructor(correl, index=idx, columns=cols) 

7501 

7502 def cov(self, min_periods=None) -> "DataFrame": 

7503 """ 

7504 Compute pairwise covariance of columns, excluding NA/null values. 

7505 

7506 Compute the pairwise covariance among the series of a DataFrame. 

7507 The returned data frame is the `covariance matrix 

7508 <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns 

7509 of the DataFrame. 

7510 

7511 Both NA and null values are automatically excluded from the 

7512 calculation. (See the note below about bias from missing values.) 

7513 A threshold can be set for the minimum number of 

7514 observations for each value created. Comparisons with observations 

7515 below this threshold will be returned as ``NaN``. 

7516 

7517 This method is generally used for the analysis of time series data to 

7518 understand the relationship between different measures 

7519 across time. 

7520 

7521 Parameters 

7522 ---------- 

7523 min_periods : int, optional 

7524 Minimum number of observations required per pair of columns 

7525 to have a valid result. 

7526 

7527 Returns 

7528 ------- 

7529 DataFrame 

7530 The covariance matrix of the series of the DataFrame. 

7531 

7532 See Also 

7533 -------- 

7534 Series.cov : Compute covariance with another Series. 

7535 core.window.EWM.cov: Exponential weighted sample covariance. 

7536 core.window.Expanding.cov : Expanding sample covariance. 

7537 core.window.Rolling.cov : Rolling sample covariance. 

7538 

7539 Notes 

7540 ----- 

7541 Returns the covariance matrix of the DataFrame's time series. 

7542 The covariance is normalized by N-1. 

7543 

7544 For DataFrames that have Series that are missing data (assuming that 

7545 data is `missing at random 

7546 <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__) 

7547 the returned covariance matrix will be an unbiased estimate 

7548 of the variance and covariance between the member Series. 

7549 

7550 However, for many applications this estimate may not be acceptable

7551 because the estimated covariance matrix is not guaranteed to be

7552 positive semi-definite. This could lead to estimated correlations

7553 having absolute values which are greater than one, and/or a

7554 non-invertible covariance matrix. See `Estimation of covariance matrices

7555 <https://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_matrices>`__

7556 for more details.

7557 

7558 Examples 

7559 -------- 

7560 >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], 

7561 ... columns=['dogs', 'cats']) 

7562 >>> df.cov() 

7563 dogs cats 

7564 dogs 0.666667 -1.000000 

7565 cats -1.000000 1.666667 

7566 

7567 >>> np.random.seed(42) 

7568 >>> df = pd.DataFrame(np.random.randn(1000, 5), 

7569 ... columns=['a', 'b', 'c', 'd', 'e']) 

7570 >>> df.cov() 

7571 a b c d e 

7572 a 0.998438 -0.020161 0.059277 -0.008943 0.014144 

7573 b -0.020161 1.059352 -0.008543 -0.024738 0.009826 

7574 c 0.059277 -0.008543 1.010670 -0.001486 -0.000271 

7575 d -0.008943 -0.024738 -0.001486 0.921297 -0.013692 

7576 e 0.014144 0.009826 -0.000271 -0.013692 0.977795 

7577 

7578 **Minimum number of periods** 

7579 

7580 This method also supports an optional ``min_periods`` keyword 

7581 that specifies the required minimum number of non-NA observations for 

7582 each column pair in order to have a valid result: 

7583 

7584 >>> np.random.seed(42) 

7585 >>> df = pd.DataFrame(np.random.randn(20, 3), 

7586 ... columns=['a', 'b', 'c']) 

7587 >>> df.loc[df.index[:5], 'a'] = np.nan 

7588 >>> df.loc[df.index[5:10], 'b'] = np.nan 

7589 >>> df.cov(min_periods=12) 

7590 a b c 

7591 a 0.316741 NaN -0.150812 

7592 b NaN 1.248003 0.191417 

7593 c -0.150812 0.191417 0.895202 

7594 """ 

7595 numeric_df = self._get_numeric_data() 

7596 cols = numeric_df.columns 

7597 idx = cols.copy() 

7598 mat = numeric_df.values 

7599 

7600 if notna(mat).all(): 

7601 if min_periods is not None and min_periods > len(mat): 

7602 baseCov = np.empty((mat.shape[1], mat.shape[1])) 

7603 baseCov.fill(np.nan) 

7604 else: 

7605 baseCov = np.cov(mat.T) 

7606 baseCov = baseCov.reshape((len(cols), len(cols))) 

7607 else: 

7608 baseCov = libalgos.nancorr(ensure_float64(mat), cov=True, minp=min_periods) 

7609 

7610 return self._constructor(baseCov, index=idx, columns=cols) 

7611 

7612 def corrwith(self, other, axis=0, drop=False, method="pearson") -> Series: 

7613 """ 

7614 Compute pairwise correlation. 

7615 

7616 Pairwise correlation is computed between rows or columns of 

7617 DataFrame with rows or columns of Series or DataFrame. DataFrames 

7618 are first aligned along both axes before computing the 

7619 correlations. 

7620 

7621 Parameters 

7622 ---------- 

7623 other : DataFrame, Series 

7624 Object with which to compute correlations. 

7625 axis : {0 or 'index', 1 or 'columns'}, default 0 

7626 The axis to use. 0 or 'index' to compute column-wise, 1 or 'columns' for 

7627 row-wise. 

7628 drop : bool, default False 

7629 Drop missing indices from result. 

7630 method : {'pearson', 'kendall', 'spearman'} or callable 

7631 Method of correlation: 

7632 

7633 * pearson : standard correlation coefficient 

7634 * kendall : Kendall Tau correlation coefficient 

7635 * spearman : Spearman rank correlation 

7636 * callable: callable with input two 1d ndarrays 

7637 and returning a float. 

7638 

7639 .. versionadded:: 0.24.0 

7640 

7641 Returns 

7642 ------- 

7643 Series 

7644 Pairwise correlations. 

7645 

7646 See Also 

7647 -------- 

7648 DataFrame.corr : Compute pairwise correlation of columns.
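
Examples
--------
A minimal sketch on two deterministic frames (the last row of ``df1``
has no counterpart in ``df2`` and is excluded from the computation):

>>> index = ['a', 'b', 'c', 'd', 'e']
>>> columns = ['one', 'two']
>>> df1 = pd.DataFrame(np.arange(10).reshape(5, 2),
...                    index=index, columns=columns)
>>> df2 = pd.DataFrame(np.arange(8).reshape(4, 2),
...                    index=index[:4], columns=columns)
>>> df1.corrwith(df2)
one    1.0
two    1.0
dtype: float64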

7649 """ 

7650 axis = self._get_axis_number(axis) 

7651 this = self._get_numeric_data() 

7652 

7653 if isinstance(other, Series): 

7654 return this.apply(lambda x: other.corr(x, method=method), axis=axis) 

7655 

7656 other = other._get_numeric_data() 

7657 left, right = this.align(other, join="inner", copy=False) 

7658 

7659 if axis == 1: 

7660 left = left.T 

7661 right = right.T 

7662 

7663 if method == "pearson": 

7664 # mask missing values 

7665 left = left + right * 0 

7666 right = right + left * 0 

7667 

7668 # demeaned data 

7669 ldem = left - left.mean() 

7670 rdem = right - right.mean() 

7671 

7672 num = (ldem * rdem).sum() 

7673 dom = (left.count() - 1) * left.std() * right.std() 

7674 

7675 correl = num / dom 

7676 

7677 elif method in ["kendall", "spearman"] or callable(method): 

7678 

7679 def c(x): 

7680 return nanops.nancorr(x[0], x[1], method=method) 

7681 

7682 correl = Series( 

7683 map(c, zip(left.values.T, right.values.T)), index=left.columns 

7684 ) 

7685 

7686 else: 

7687 raise ValueError( 

7688 f"Invalid method {method} was passed, " 

7689 "valid methods are: 'pearson', 'kendall', " 

7690 "'spearman', or callable" 

7691 ) 

7692 

7693 if not drop: 

7694 # Find non-matching labels along the given axis 

7695 # and append missing correlations (GH 22375) 

7696 raxis = 1 if axis == 0 else 0 

7697 result_index = this._get_axis(raxis).union(other._get_axis(raxis)) 

7698 idx_diff = result_index.difference(correl.index) 

7699 

7700 if len(idx_diff) > 0: 

7701 correl = correl.append(Series([np.nan] * len(idx_diff), index=idx_diff)) 

7702 

7703 return correl 

7704 

7705 # ---------------------------------------------------------------------- 

7706 # ndarray-like stats methods 

7707 

7708 def count(self, axis=0, level=None, numeric_only=False): 

7709 """ 

7710 Count non-NA cells for each column or row. 

7711 

7712 The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending 

7713 on `pandas.options.mode.use_inf_as_na`) are considered NA. 

7714 

7715 Parameters 

7716 ---------- 

7717 axis : {0 or 'index', 1 or 'columns'}, default 0 

7718 If 0 or 'index' counts are generated for each column. 

7719 If 1 or 'columns' counts are generated for each **row**. 

7720 level : int or str, optional 

7721 If the axis is a `MultiIndex` (hierarchical), count along a 

7722 particular `level`, collapsing into a `DataFrame`. 

7723 A `str` specifies the level name. 

7724 numeric_only : bool, default False 

7725 Include only `float`, `int` or `boolean` data. 

7726 

7727 Returns 

7728 ------- 

7729 Series or DataFrame 

7730 For each column/row the number of non-NA/null entries. 

7731 If `level` is specified returns a `DataFrame`. 

7732 

7733 See Also 

7734 -------- 

7735 Series.count: Number of non-NA elements in a Series. 

7736 DataFrame.shape: Number of DataFrame rows and columns (including NA 

7737 elements). 

7738 DataFrame.isna: Boolean same-sized DataFrame showing places of NA 

7739 elements. 

7740 

7741 Examples 

7742 -------- 

7743 Constructing DataFrame from a dictionary: 

7744 

7745 >>> df = pd.DataFrame({"Person": 

7746 ... ["John", "Myla", "Lewis", "John", "Myla"], 

7747 ... "Age": [24., np.nan, 21., 33, 26], 

7748 ... "Single": [False, True, True, True, False]}) 

7749 >>> df 

7750 Person Age Single 

7751 0 John 24.0 False 

7752 1 Myla NaN True 

7753 2 Lewis 21.0 True 

7754 3 John 33.0 True 

7755 4 Myla 26.0 False 

7756 

7757 Notice the uncounted NA values: 

7758 

7759 >>> df.count() 

7760 Person 5 

7761 Age 4 

7762 Single 5 

7763 dtype: int64 

7764 

7765 Counts for each **row**: 

7766 

7767 >>> df.count(axis='columns') 

7768 0 3 

7769 1 2 

7770 2 3 

7771 3 3 

7772 4 3 

7773 dtype: int64 

7774 

7775 Counts for one level of a `MultiIndex`: 

7776 

7777 >>> df.set_index(["Person", "Single"]).count(level="Person") 

7778 Age 

7779 Person 

7780 John 2 

7781 Lewis 1 

7782 Myla 1 

7783 """ 

7784 axis = self._get_axis_number(axis) 

7785 if level is not None: 

7786 return self._count_level(level, axis=axis, numeric_only=numeric_only) 

7787 

7788 if numeric_only: 

7789 frame = self._get_numeric_data() 

7790 else: 

7791 frame = self 

7792 

7793 # GH #423 

7794 if len(frame._get_axis(axis)) == 0: 

7795 result = Series(0, index=frame._get_agg_axis(axis)) 

7796 else: 

7797 if frame._is_mixed_type or frame._data.any_extension_types: 

7798 # the or any_extension_types is really only hit for single- 

7799 # column frames with an extension array 

7800 result = notna(frame).sum(axis=axis) 

7801 else: 

7802 # GH13407 

7803 series_counts = notna(frame).sum(axis=axis) 

7804 counts = series_counts.values 

7805 result = Series(counts, index=frame._get_agg_axis(axis)) 

7806 

7807 return result.astype("int64") 

7808 

7809 def _count_level(self, level, axis=0, numeric_only=False): 

7810 if numeric_only: 

7811 frame = self._get_numeric_data() 

7812 else: 

7813 frame = self 

7814 

7815 count_axis = frame._get_axis(axis) 

7816 agg_axis = frame._get_agg_axis(axis) 

7817 

7818 if not isinstance(count_axis, ABCMultiIndex): 

7819 raise TypeError( 

7820 f"Can only count levels on hierarchical {self._get_axis_name(axis)}." 

7821 ) 

7822 

7823 if frame._is_mixed_type: 

7824 # Since we have mixed types, calling notna(frame.values) might 

7825 # upcast everything to object 

7826 mask = notna(frame).values 

7827 else: 

7828 # But use the speedup when we have homogeneous dtypes 

7829 mask = notna(frame.values) 

7830 

7831 if axis == 1: 

7832 # We're transposing the mask rather than frame to avoid potential 

7833 # upcasts to object, which induces a ~20x slowdown 

7834 mask = mask.T 

7835 

7836 if isinstance(level, str): 

7837 level = count_axis._get_level_number(level) 

7838 

7839 level_name = count_axis._names[level] 

7840 level_index = count_axis.levels[level]._shallow_copy(name=level_name) 

7841 level_codes = ensure_int64(count_axis.codes[level]) 

7842 counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=0) 

7843 

7844 result = DataFrame(counts, index=level_index, columns=agg_axis) 

7845 

7846 if axis == 1: 

7847 # Undo our earlier transpose 

7848 return result.T 

7849 else: 

7850 return result 

7851 

7852 def _reduce( 

7853 self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds 

7854 ): 

7855 if axis is None and filter_type == "bool": 

7856 labels = None 

7857 constructor = None 

7858 else: 

7859 # TODO: Make other agg func handle axis=None properly 

7860 axis = self._get_axis_number(axis) 

7861 labels = self._get_agg_axis(axis) 

7862 constructor = self._constructor 

7863 

7864 def f(x): 

7865 return op(x, axis=axis, skipna=skipna, **kwds) 

7866 

7867 def _get_data(axis_matters): 

7868 if filter_type is None or filter_type == "numeric": 

7869 data = self._get_numeric_data() 

7870 elif filter_type == "bool": 

7871 if axis_matters: 

7872 # GH#25101, GH#24434 

7873 data = self._get_bool_data() if axis == 0 else self 

7874 else: 

7875 data = self._get_bool_data() 

7876 else: # pragma: no cover 

7877 msg = ( 

7878 f"Generating numeric_only data with filter_type {filter_type} " 

7879 "not supported." 

7880 ) 

7881 raise NotImplementedError(msg) 

7882 return data 

7883 

7884 if numeric_only is not None and axis in [0, 1]: 

7885 df = self 

7886 if numeric_only is True: 

7887 df = _get_data(axis_matters=True) 

7888 if axis == 1: 

7889 df = df.T 

7890 axis = 0 

7891 

7892 out_dtype = "bool" if filter_type == "bool" else None 

7893 

7894 def blk_func(values): 

7895 if isinstance(values, ExtensionArray): 

7896 return values._reduce(name, skipna=skipna, **kwds) 

7897 else: 

7898 return op(values, axis=1, skipna=skipna, **kwds) 

7899 

7900 # After possibly _get_data and transposing, we are now in the 

7901 # simple case where we can use BlockManager._reduce 

7902 res = df._data.reduce(blk_func) 

7903 assert isinstance(res, dict) 

7904 if len(res): 

7905 assert len(res) == max(list(res.keys())) + 1, res.keys() 

7906 out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) 

7907 out.index = df.columns 

7908 return out 

7909 

7910 if numeric_only is None: 

7911 values = self.values 

7912 try: 

7913 result = f(values) 

7914 

7915 if filter_type == "bool" and is_object_dtype(values) and axis is None: 

7916 # work around https://github.com/numpy/numpy/issues/10489 

7917 # TODO: combine with hasattr(result, 'dtype') further down 

7918 # hard since we don't have `values` down there. 

7919 result = np.bool_(result) 

7920 except TypeError: 

7921 # e.g. in nanops trying to convert strs to float 

7922 

7923 # try by-column first 

7924 if filter_type is None and axis == 0: 

7925 # this can end up with a non-reduction,

7926 # but not always: if the types are mixed

7927 # with datetimelike, we need to make sure we return a Series

7928 

7929 # we only end up here if we have not specified 

7930 # numeric_only and yet we have tried a 

7931 # column-by-column reduction, where we have mixed type. 

7932 # So let's just do what we can 

7933 from pandas.core.apply import frame_apply 

7934 

7935 opa = frame_apply( 

7936 self, func=f, result_type="expand", ignore_failures=True 

7937 ) 

7938 result = opa.get_result() 

7939 if result.ndim == self.ndim: 

7940 result = result.iloc[0] 

7941 return result 

7942 

7943 # TODO: why doesn't axis matter here?

7944 data = _get_data(axis_matters=False) 

7945 with np.errstate(all="ignore"): 

7946 result = f(data.values) 

7947 labels = data._get_agg_axis(axis) 

7948 else: 

7949 if numeric_only: 

7950 data = _get_data(axis_matters=True) 

7951 

7952 values = data.values 

7953 labels = data._get_agg_axis(axis) 

7954 else: 

7955 values = self.values 

7956 result = f(values) 

7957 

7958 if hasattr(result, "dtype") and is_object_dtype(result.dtype): 

7959 try: 

7960 if filter_type is None or filter_type == "numeric": 

7961 result = result.astype(np.float64) 

7962 elif filter_type == "bool" and notna(result).all(): 

7963 result = result.astype(np.bool_) 

7964 except (ValueError, TypeError): 

7965 

7966 # try to coerce to the original dtypes item by item if we can 

7967 if axis == 0: 

7968 result = coerce_to_dtypes(result, self.dtypes) 

7969 

7970 if constructor is not None: 

7971 result = Series(result, index=labels) 

7972 return result 

7973 

7974 def nunique(self, axis=0, dropna=True) -> Series: 

7975 """ 

7976 Count distinct observations over requested axis. 

7977 

7978 Return Series with number of distinct observations. Can ignore NaN 

7979 values. 

7980 

7981 Parameters 

7982 ---------- 

7983 axis : {0 or 'index', 1 or 'columns'}, default 0 

7984 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for 

7985 column-wise. 

7986 dropna : bool, default True 

7987 Don't include NaN in the counts. 

7988 

7989 Returns 

7990 ------- 

7991 Series 

7992 

7993 See Also 

7994 -------- 

7995 Series.nunique: Method nunique for Series. 

7996 DataFrame.count: Count non-NA cells for each column or row. 

7997 

7998 Examples 

7999 -------- 

8000 >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]}) 

8001 >>> df.nunique() 

8002 A 3 

8003 B 1 

8004 dtype: int64 

8005 

8006 >>> df.nunique(axis=1) 

8007 0 1 

8008 1 2 

8009 2 2 

8010 dtype: int64 

8011 """ 

8012 return self.apply(Series.nunique, axis=axis, dropna=dropna) 

8013 

8014 def idxmin(self, axis=0, skipna=True) -> Series: 

8015 """ 

8016 Return index of first occurrence of minimum over requested axis. 

8017 

8018 NA/null values are excluded. 

8019 

8020 Parameters 

8021 ---------- 

8022 axis : {0 or 'index', 1 or 'columns'}, default 0 

8023 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. 

8024 skipna : bool, default True 

8025 Exclude NA/null values. If an entire row/column is NA, the result 

8026 will be NA. 

8027 

8028 Returns 

8029 ------- 

8030 Series 

8031 Indexes of minima along the specified axis. 

8032 

8033 Raises 

8034 ------ 

8035 ValueError 

8036 * If the row/column is empty 

8037 

8038 See Also 

8039 -------- 

8040 Series.idxmin 

8041 

8042 Notes 

8043 ----- 

8044 This method is the DataFrame version of ``ndarray.argmin``. 
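
Examples
--------
A small sketch on a numeric frame:

>>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
...                    'co2_emissions': [37.2, 19.66, 1712]},
...                   index=['Pork', 'Wheat Products', 'Beef'])
>>> df.idxmin()
consumption                Pork
co2_emissions    Wheat Products
dtype: object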

8045 """ 

8046 axis = self._get_axis_number(axis) 

8047 indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna) 

8048 index = self._get_axis(axis) 

8049 result = [index[i] if i >= 0 else np.nan for i in indices] 

8050 return Series(result, index=self._get_agg_axis(axis)) 

8051 

8052 def idxmax(self, axis=0, skipna=True) -> Series: 

8053 """ 

8054 Return index of first occurrence of maximum over requested axis. 

8055 

8056 NA/null values are excluded. 

8057 

8058 Parameters 

8059 ---------- 

8060 axis : {0 or 'index', 1 or 'columns'}, default 0 

8061 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. 

8062 skipna : bool, default True 

8063 Exclude NA/null values. If an entire row/column is NA, the result 

8064 will be NA. 

8065 

8066 Returns 

8067 ------- 

8068 Series 

8069 Indexes of maxima along the specified axis. 

8070 

8071 Raises 

8072 ------ 

8073 ValueError 

8074 * If the row/column is empty 

8075 

8076 See Also 

8077 -------- 

8078 Series.idxmax 

8079 

8080 Notes 

8081 ----- 

8082 This method is the DataFrame version of ``ndarray.argmax``. 
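
Examples
--------
A small sketch on the same kind of numeric frame:

>>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
...                    'co2_emissions': [37.2, 19.66, 1712]},
...                   index=['Pork', 'Wheat Products', 'Beef'])
>>> df.idxmax()
consumption      Wheat Products
co2_emissions              Beef
dtype: object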

8083 """ 

8084 axis = self._get_axis_number(axis) 

8085 indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna) 

8086 index = self._get_axis(axis) 

8087 result = [index[i] if i >= 0 else np.nan for i in indices] 

8088 return Series(result, index=self._get_agg_axis(axis)) 

8089 

8090 def _get_agg_axis(self, axis_num): 

8091 """ 

8092 Return the labels of the aggregated axis: ``columns`` for axis 0, ``index`` for axis 1.

8093 """ 

8094 if axis_num == 0: 

8095 return self.columns 

8096 elif axis_num == 1: 

8097 return self.index 

8098 else: 

8099 raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})") 

8100 

8101 def mode(self, axis=0, numeric_only=False, dropna=True) -> "DataFrame": 

8102 """ 

8103 Get the mode(s) of each element along the selected axis. 

8104 

8105 The mode of a set of values is the value that appears most often. 

8106 It can be multiple values. 

8107 

8108 Parameters 

8109 ---------- 

8110 axis : {0 or 'index', 1 or 'columns'}, default 0 

8111 The axis to iterate over while searching for the mode: 

8112 

8113 * 0 or 'index' : get mode of each column 

8114 * 1 or 'columns' : get mode of each row. 

8115 

8116 numeric_only : bool, default False 

8117 If True, only apply to numeric columns. 

8118 dropna : bool, default True 

8119 Don't consider counts of NaN/NaT. 

8120 

8121 .. versionadded:: 0.24.0 

8122 

8123 Returns 

8124 ------- 

8125 DataFrame 

8126 The modes of each column or row. 

8127 

8128 See Also 

8129 -------- 

8130 Series.mode : Return the highest frequency value in a Series. 

8131 Series.value_counts : Return the counts of values in a Series. 

8132 

8133 Examples 

8134 -------- 

8135 >>> df = pd.DataFrame([('bird', 2, 2), 

8136 ... ('mammal', 4, np.nan), 

8137 ... ('arthropod', 8, 0), 

8138 ... ('bird', 2, np.nan)], 

8139 ... index=('falcon', 'horse', 'spider', 'ostrich'), 

8140 ... columns=('species', 'legs', 'wings')) 

8141 >>> df 

8142 species legs wings 

8143 falcon bird 2 2.0 

8144 horse mammal 4 NaN 

8145 spider arthropod 8 0.0 

8146 ostrich bird 2 NaN 

8147 

8148 By default, missing values are not considered, and the modes of wings

8149 are both 0 and 2. The second row of species and legs contains ``NaN``

8150 because those columns have only one mode, but the DataFrame has two rows.

8151 

8152 >>> df.mode() 

8153 species legs wings 

8154 0 bird 2.0 0.0 

8155 1 NaN NaN 2.0 

8156 

8157 Setting ``dropna=False``, ``NaN`` values are considered, and they can be

8158 the mode (as for wings).

8159 

8160 >>> df.mode(dropna=False) 

8161 species legs wings 

8162 0 bird 2 NaN 

8163 

8164 Setting ``numeric_only=True``, only the mode of numeric columns is 

8165 computed, and columns of other types are ignored. 

8166 

8167 >>> df.mode(numeric_only=True) 

8168 legs wings 

8169 0 2.0 0.0 

8170 1 NaN 2.0 

8171 

8172 To compute the mode over columns and not rows, use the axis parameter: 

8173 

8174 >>> df.mode(axis='columns', numeric_only=True) 

8175 0 1 

8176 falcon 2.0 NaN 

8177 horse 4.0 NaN 

8178 spider 0.0 8.0 

8179 ostrich 2.0 NaN 

8180 """ 

8181 data = self if not numeric_only else self._get_numeric_data() 

8182 

8183 def f(s): 

8184 return s.mode(dropna=dropna) 

8185 

8186 return data.apply(f, axis=axis) 

8187 

8188 def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): 

8189 """ 

8190 Return values at the given quantile over requested axis. 

8191 

8192 Parameters 

8193 ---------- 

8194 q : float or array-like, default 0.5 (50% quantile) 

8195 Value between 0 and 1 inclusive, the quantile(s) to compute.

8196 axis : {0, 1, 'index', 'columns'} (default 0) 

8197 Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise. 

8198 numeric_only : bool, default True 

8199 If False, the quantile of datetime and timedelta data will be 

8200 computed as well. 

8201 interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} 

8202 This optional parameter specifies the interpolation method to use, 

8203 when the desired quantile lies between two data points `i` and `j`: 

8204 

8205 * linear: `i + (j - i) * fraction`, where `fraction` is the 

8206 fractional part of the index surrounded by `i` and `j`. 

8207 * lower: `i`. 

8208 * higher: `j`. 

8209 * nearest: `i` or `j` whichever is nearest. 

8210 * midpoint: (`i` + `j`) / 2. 

8211 

8212 Returns 

8213 ------- 

8214 Series or DataFrame 

8215 

8216 If ``q`` is an array, a DataFrame will be returned where the 

8217 index is ``q``, the columns are the columns of self, and the 

8218 values are the quantiles. 

8219 If ``q`` is a float, a Series will be returned where the 

8220 index is the columns of self and the values are the quantiles. 

8221 

8222 See Also 

8223 -------- 

8224 core.window.Rolling.quantile: Rolling quantile. 

8225 numpy.percentile: Numpy function to compute the percentile. 

8226 

8227 Examples 

8228 -------- 

8229 >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), 

8230 ... columns=['a', 'b']) 

8231 >>> df.quantile(.1) 

8232 a 1.3 

8233 b 3.7 

8234 Name: 0.1, dtype: float64 

8235 >>> df.quantile([.1, .5]) 

8236 a b 

8237 0.1 1.3 3.7 

8238 0.5 2.5 55.0 

8239 

8240 Specifying `numeric_only=False` will also compute the quantile of 

8241 datetime and timedelta data. 

8242 

8243 >>> df = pd.DataFrame({'A': [1, 2], 

8244 ... 'B': [pd.Timestamp('2010'), 

8245 ... pd.Timestamp('2011')], 

8246 ... 'C': [pd.Timedelta('1 days'), 

8247 ... pd.Timedelta('2 days')]}) 

8248 >>> df.quantile(0.5, numeric_only=False) 

8249 A 1.5 

8250 B 2010-07-02 12:00:00 

8251 C 1 days 12:00:00 

8252 Name: 0.5, dtype: object 

8253 """ 

8254 validate_percentile(q) 

8255 

8256 data = self._get_numeric_data() if numeric_only else self 

8257 axis = self._get_axis_number(axis) 

8258 is_transposed = axis == 1 

8259 

8260 if is_transposed: 

8261 data = data.T 

8262 

8263 if len(data.columns) == 0: 

8264 # GH#23925 _get_numeric_data may have dropped all columns 

8265 cols = Index([], name=self.columns.name) 

8266 if is_list_like(q): 

8267 return self._constructor([], index=q, columns=cols) 

8268 return self._constructor_sliced([], index=cols, name=q, dtype=np.float64) 

8269 

8270 result = data._data.quantile( 

8271 qs=q, axis=1, interpolation=interpolation, transposed=is_transposed 

8272 ) 

8273 

8274 if result.ndim == 2: 

8275 result = self._constructor(result) 

8276 else: 

8277 result = self._constructor_sliced(result, name=q) 

8278 

8279 if is_transposed: 

8280 result = result.T 

8281 

8282 return result 

8283 

8284 def to_timestamp(self, freq=None, how="start", axis=0, copy=True) -> "DataFrame": 

8285 """ 

8286 Cast to DatetimeIndex of timestamps, at *beginning* of period. 

8287 

8288 Parameters 

8289 ---------- 

8290 freq : str, default frequency of PeriodIndex 

8291 Desired frequency. 

8292 how : {'s', 'e', 'start', 'end'} 

8293 Convention for converting period to timestamp; start of period 

8294 vs. end. 

8295 axis : {0 or 'index', 1 or 'columns'}, default 0 

8296 The axis to convert (the index by default). 

8297 copy : bool, default True 

8298 If False then underlying input data is not copied. 

8299 

8300 Returns 

8301 ------- 

8302 DataFrame with DatetimeIndex 
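
Examples
--------
A minimal sketch (quarterly periods cast to start-of-period timestamps;
output alignment indicative):

>>> idx = pd.period_range('2023-01', periods=2, freq='Q')
>>> df = pd.DataFrame({'val': [1, 2]}, index=idx)
>>> df.to_timestamp(how='start')
            val
2023-01-01    1
2023-04-01    2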

8303 """ 

8304 new_data = self._data 

8305 if copy: 

8306 new_data = new_data.copy() 

8307 

8308 axis = self._get_axis_number(axis) 

8309 if axis == 0: 

8310 new_data.set_axis(1, self.index.to_timestamp(freq=freq, how=how)) 

8311 elif axis == 1: 

8312 new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how)) 

8313 else: # pragma: no cover 

8314 raise AssertionError(f"Axis must be 0 or 1. Got {axis}") 

8315 

8316 return self._constructor(new_data) 

8317 

8318 def to_period(self, freq=None, axis=0, copy=True) -> "DataFrame": 

8319 """ 

8320 Convert DataFrame from DatetimeIndex to PeriodIndex. 

8321 

8322 Convert DataFrame from DatetimeIndex to PeriodIndex with desired 

8323 frequency (inferred from index if not passed). 

8324 

8325 Parameters 

8326 ---------- 

8327 freq : str, optional

8328 Frequency of the PeriodIndex; inferred from the index if not passed.

8329 axis : {0 or 'index', 1 or 'columns'}, default 0 

8330 The axis to convert (the index by default). 

8331 copy : bool, default True 

8332 If False then underlying input data is not copied. 

8333 

8334 Returns 

8335 ------- 

8336 DataFrame with PeriodIndex
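
Examples
--------
A minimal sketch (daily timestamps down-converted to monthly periods):

>>> idx = pd.to_datetime(['2001-01-31', '2001-02-28'])
>>> df = pd.DataFrame({'val': [1, 2]}, index=idx)
>>> df.to_period(freq='M').index
PeriodIndex(['2001-01', '2001-02'], dtype='period[M]', freq='M')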

8337 """ 

8338 new_data = self._data 

8339 if copy: 

8340 new_data = new_data.copy() 

8341 

8342 axis = self._get_axis_number(axis) 

8343 if axis == 0: 

8344 new_data.set_axis(1, self.index.to_period(freq=freq)) 

8345 elif axis == 1: 

8346 new_data.set_axis(0, self.columns.to_period(freq=freq)) 

8347 else: # pragma: no cover 

8348 raise AssertionError(f"Axis must be 0 or 1. Got {axis}") 

8349 

8350 return self._constructor(new_data) 

8351 

8352 def isin(self, values) -> "DataFrame": 

8353 """ 

8354 Whether each element in the DataFrame is contained in values. 

8355 

8356 Parameters 

8357 ---------- 

8358 values : iterable, Series, DataFrame or dict 

8359 The result will only be true at a location if all the 

8360 labels match. If `values` is a Series, that's the index. If 

8361 `values` is a dict, the keys must be the column names, 

8362 which must match. If `values` is a DataFrame, 

8363 then both the index and column labels must match. 

8364 

8365 Returns 

8366 ------- 

8367 DataFrame 

8368 DataFrame of booleans showing whether each element in the DataFrame 

8369 is contained in values. 

8370 

8371 See Also 

8372 -------- 

8373 DataFrame.eq: Equality test for DataFrame. 

8374 Series.isin: Equivalent method on Series. 

8375 Series.str.contains: Test if pattern or regex is contained within a 

8376 string of a Series or Index. 

8377 

8378 Examples 

8379 -------- 

8380 

8381 >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, 

8382 ... index=['falcon', 'dog']) 

8383 >>> df 

8384 num_legs num_wings 

8385 falcon 2 2 

8386 dog 4 0 

8387 

8388 When ``values`` is a list, check whether every value in the DataFrame

8389 is present in the list (which animals have 0 or 2 legs or wings).

8390 

8391 >>> df.isin([0, 2]) 

8392 num_legs num_wings 

8393 falcon True True 

8394 dog False True 

8395 

8396 When ``values`` is a dict, we can pass values to check for each 

8397 column separately: 

8398 

8399 >>> df.isin({'num_wings': [0, 3]}) 

8400 num_legs num_wings 

8401 falcon False False 

8402 dog False True 

8403 

8404 When ``values`` is a Series or DataFrame, the index and columns must

8405 match. Note that 'falcon' does not match based on the number of legs

8406 in ``other``.

8407 

8408 >>> other = pd.DataFrame({'num_legs': [8, 2], 'num_wings': [0, 2]}, 

8409 ... index=['spider', 'falcon']) 

8410 >>> df.isin(other) 

8411 num_legs num_wings 

8412 falcon True True 

8413 dog False False 

8414 """ 

8415 if isinstance(values, dict): 

8416 from pandas.core.reshape.concat import concat 

8417 

8418 values = collections.defaultdict(list, values) 

8419 return concat( 

8420 ( 

8421 self.iloc[:, [i]].isin(values[col]) 

8422 for i, col in enumerate(self.columns) 

8423 ), 

8424 axis=1, 

8425 ) 

8426 elif isinstance(values, Series): 

8427 if not values.index.is_unique: 

8428 raise ValueError("cannot compute isin with a duplicate axis.") 

8429 return self.eq(values.reindex_like(self), axis="index") 

8430 elif isinstance(values, DataFrame): 

8431 if not (values.columns.is_unique and values.index.is_unique): 

8432 raise ValueError("cannot compute isin with a duplicate axis.") 

8433 return self.eq(values.reindex_like(self)) 

8434 else: 

8435 if not is_list_like(values): 

8436 raise TypeError( 

8437 "only list-like or dict-like objects are allowed " 

8438 "to be passed to DataFrame.isin(), " 

8439 f"you passed a {repr(type(values).__name__)}" 

8440 ) 

8441 return DataFrame( 

8442 algorithms.isin(self.values.ravel(), values).reshape(self.shape), 

8443 self.index, 

8444 self.columns, 

8445 ) 

8446 

8447 # ---------------------------------------------------------------------- 

8448 # Add plotting methods to DataFrame 

8449 plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) 

8450 hist = pandas.plotting.hist_frame 

8451 boxplot = pandas.plotting.boxplot_frame 

8452 sparse = CachedAccessor("sparse", SparseFrameAccessor) 

8453 

8454 

8455DataFrame._setup_axes( 

8456 ["index", "columns"], 

8457 docs={ 

8458 "index": "The index (row labels) of the DataFrame.", 

8459 "columns": "The column labels of the DataFrame.", 

8460 }, 

8461) 

8462DataFrame._add_numeric_operations() 

8463DataFrame._add_series_or_dataframe_operations() 

8464 

8465ops.add_flex_arithmetic_methods(DataFrame) 

8466ops.add_special_arithmetic_methods(DataFrame) 

8467 

8468 

8469def _from_nested_dict(data): 

8470 # TODO: this should be seriously cythonized 
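# Flip the nesting so the inner keys become the outer dict's keys:
# {index: {col: val}} -> {col: {index: val}}, e.g.
# {'r1': {'c1': 1}, 'r2': {'c1': 2}} -> {'c1': {'r1': 1, 'r2': 2}}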

8471 new_data = {} 

8472 for index, s in data.items(): 

8473 for col, v in s.items(): 

8474 new_data[col] = new_data.get(col, {}) 

8475 new_data[col][index] = v 

8476 return new_data 

8477 

8478 

8479def _put_str(s, space): 
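# Render ``s`` into a fixed-width cell: truncate to ``space`` characters,
# then left-justify, e.g. _put_str('abcdef', 4) -> 'abcd' and
# _put_str('ab', 4) -> 'ab  '.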

8480 return str(s)[:space].ljust(space)