"""
Provide the groupby split-apply-combine paradigm. Define the GroupBy
class providing the base-class of operations.

The SeriesGroupBy and DataFrameGroupBy sub-classes
(defined in pandas.core.groupby.generic)
expose these user-facing objects to provide specific functionality.
"""

from contextlib import contextmanager
import datetime
from functools import partial, wraps
import inspect
import re
import types
from typing import (
    Callable,
    Dict,
    FrozenSet,
    Hashable,
    Iterable,
    List,
    Mapping,
    Optional,
    Tuple,
    Type,
    Union,
)

import numpy as np

from pandas._config.config import option_context

from pandas._libs import Timestamp
import pandas._libs.groupby as libgroupby
from pandas._typing import FrameOrSeries, Scalar
from pandas.compat import set_function_name
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
from pandas.util._decorators import Appender, Substitution, cache_readonly

from pandas.core.dtypes.cast import maybe_downcast_to_dtype
from pandas.core.dtypes.common import (
    ensure_float,
    is_categorical_dtype,
    is_datetime64_dtype,
    is_extension_array_dtype,
    is_integer_dtype,
    is_numeric_dtype,
    is_object_dtype,
    is_scalar,
)
from pandas.core.dtypes.missing import isna, notna

from pandas.core import nanops
import pandas.core.algorithms as algorithms
from pandas.core.arrays import Categorical, DatetimeArray, try_cast_to_ea
from pandas.core.base import DataError, PandasObject, SelectionMixin
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.groupby import base, ops
from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex
from pandas.core.series import Series
from pandas.core.sorting import get_group_index_sorter

_common_see_also = """
See Also
--------
Series.%(name)s
DataFrame.%(name)s
"""
_apply_docs = dict(
    template="""
    Apply function `func` group-wise and combine the results together.

    The function passed to `apply` must take a {input} as its first
    argument and return a DataFrame, Series or scalar. `apply` will
    then take care of combining the results back together into a single
    dataframe or series. `apply` is therefore a highly flexible
    grouping method.

    While `apply` is a very flexible method, its downside is that
    using it can be quite a bit slower than using more specific methods
    like `agg` or `transform`. Pandas offers a wide range of methods that will
    be much faster than using `apply` for their specific purposes, so try to
    use them before reaching for `apply`.

    Parameters
    ----------
    func : callable
        A callable that takes a {input} as its first argument, and
        returns a dataframe, a series or a scalar. In addition the
        callable may take positional and keyword arguments.
    args, kwargs : tuple and dict
        Optional positional and keyword arguments to pass to `func`.

    Returns
    -------
    applied : Series or DataFrame

    See Also
    --------
    pipe : Apply function to the full GroupBy object instead of to each
        group.
    aggregate : Apply aggregate function to the GroupBy object.
    transform : Apply function column-by-column to the GroupBy object.
    Series.apply : Apply a function to a Series.
    DataFrame.apply : Apply a function to each row or column of a DataFrame.
    """,
    dataframe_examples="""
    >>> df = pd.DataFrame({'A': 'a a b'.split(),
    ...                    'B': [1, 2, 3],
    ...                    'C': [4, 6, 5]})
    >>> g = df.groupby('A')

    Notice that ``g`` has two groups, ``a`` and ``b``.
    Calling `apply` in various ways, we can get different grouping results:

    Example 1: below the function passed to `apply` takes a DataFrame as
    its argument and returns a DataFrame. `apply` combines the result for
    each group together into a new DataFrame:

    >>> g[['B', 'C']].apply(lambda x: x / x.sum())
              B    C
    0  0.333333  0.4
    1  0.666667  0.6
    2  1.000000  1.0

    Example 2: The function passed to `apply` takes a DataFrame as
    its argument and returns a Series. `apply` combines the result for
    each group together into a new DataFrame:

    >>> g[['B', 'C']].apply(lambda x: x.max() - x.min())
       B  C
    A
    a  1  2
    b  0  0

    Example 3: The function passed to `apply` takes a DataFrame as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g.apply(lambda x: x.C.max() - x.B.min())
    A
    a    5
    b    2
    dtype: int64
    """,
    series_examples="""
    >>> s = pd.Series([0, 1, 2], index='a a b'.split())
    >>> g = s.groupby(s.index)

    From ``s`` above we can see that ``g`` has two groups, ``a`` and ``b``.
    Calling `apply` in various ways, we can get different grouping results:

    Example 1: The function passed to `apply` takes a Series as
    its argument and returns a Series. `apply` combines the result for
    each group together into a new Series:

    >>> g.apply(lambda x: x*2 if x.name == 'b' else x/2)
    0    0.0
    1    0.5
    2    4.0
    dtype: float64

    Example 2: The function passed to `apply` takes a Series as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64

    Notes
    -----
    In the current implementation `apply` calls `func` twice on the
    first group to decide whether it can take a fast or slow code
    path. This can lead to unexpected behavior if `func` has
    side-effects, as they will take effect twice for the first
    group.

    Examples
    --------
    {examples}
    """,
)

_pipe_template = """
Apply a function `func` with arguments to this %(klass)s object and return
the function's result.

%(versionadded)s

Use `.pipe` when you want to improve readability by chaining together
functions that expect Series, DataFrames, GroupBy or Resampler objects.
Instead of writing

>>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c)

You can write

>>> (df.groupby('group')
...    .pipe(f)
...    .pipe(g, arg1=a)
...    .pipe(h, arg2=b, arg3=c))

which is much more readable.

Parameters
----------
func : callable or tuple of (callable, str)
    Function to apply to this %(klass)s object or, alternatively,
    a `(callable, data_keyword)` tuple where `data_keyword` is a
    string indicating the keyword of `callable` that expects the
    %(klass)s object.
args : iterable, optional
    Positional arguments passed into `func`.
kwargs : dict, optional
    A dictionary of keyword arguments passed into `func`.

Returns
-------
object : the return type of `func`.

See Also
--------
Series.pipe : Apply a function with arguments to a series.
DataFrame.pipe : Apply a function with arguments to a dataframe.
apply : Apply function to each group instead of to the
    full %(klass)s object.

Notes
-----
See more `here
<https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#piping-function-calls>`_

Examples
--------
%(examples)s
"""

_transform_template = """
Call function producing a like-indexed %(klass)s on each group and
return a %(klass)s having the same indexes as the original object
filled with the transformed values.

Parameters
----------
f : function
    Function to apply to each group.

Returns
-------
%(klass)s

See Also
--------
aggregate, transform

Notes
-----
Each group is endowed the attribute 'name' in case you need to know
which group you are working on.

The current implementation imposes three requirements on f:

* f must return a value that either has the same shape as the input
  subframe or can be broadcast to the shape of the input subframe.
  For example, if `f` returns a scalar it will be broadcast to have the
  same shape as the input subframe.
* if this is a DataFrame, f must support application column-by-column
  in the subframe. If f also supports application to the entire subframe,
  then a fast path is used starting from the second chunk.
* f must not mutate groups. Mutation is not supported and may
  produce unexpected results.

Examples
--------

# Same shape
>>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
...                           'foo', 'bar'],
...                    'B' : ['one', 'one', 'two', 'three',
...                           'two', 'two'],
...                    'C' : [1, 5, 5, 2, 5, 5],
...                    'D' : [2.0, 5., 8., 1., 2., 9.]})
>>> grouped = df.groupby('A')
>>> grouped.transform(lambda x: (x - x.mean()) / x.std())
          C         D
0 -1.154701 -0.577350
1  0.577350  0.000000
2  0.577350  1.154701
3 -1.154701 -1.000000
4  0.577350 -0.577350
5  0.577350  1.000000

# Broadcastable
>>> grouped.transform(lambda x: x.max() - x.min())
   C    D
0  4  6.0
1  3  8.0
2  4  6.0
3  3  8.0
4  4  6.0
5  3  8.0
"""

class GroupByPlot(PandasObject):
    """
    Class implementing the .plot attribute for groupby objects.
    """

    def __init__(self, groupby):
        self._groupby = groupby

    def __call__(self, *args, **kwargs):
        def f(self):
            return self.plot(*args, **kwargs)

        f.__name__ = "plot"
        return self._groupby.apply(f)

    def __getattr__(self, name: str):
        def attr(*args, **kwargs):
            def f(self):
                return getattr(self.plot, name)(*args, **kwargs)

            return self._groupby.apply(f)

        return attr
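
    # Dispatch note (illustrative): ``df.groupby("A").plot.hist()`` resolves
    # ``hist`` through ``__getattr__`` above, so each group's frame ends up
    # calling ``group.plot.hist(...)`` via ``GroupBy.apply``; ``__call__``
    # covers the bare ``.plot()`` form.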



@contextmanager
def _group_selection_context(groupby):
    """
    Set / reset the _group_selection_context.
    """
    groupby._set_group_selection()
    yield groupby
    groupby._reset_group_selection()
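
    # Typical usage (as seen below in this module):
    #
    #     with _group_selection_context(self):
    #         return self._python_agg_general(f)
    #
    # Note that this is not a try/finally, so an exception inside the ``with``
    # block skips the reset.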



_KeysArgType = Union[
    Hashable,
    List[Hashable],
    Callable[[Hashable], Hashable],
    List[Callable[[Hashable], Hashable]],
    Mapping[Hashable, Hashable],
]
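
# The union above covers the accepted ``by``/``keys`` forms, e.g.
# (illustrative):
#
#     df.groupby("a")                      # Hashable
#     df.groupby(["a", "b"])               # List[Hashable]
#     df.groupby(len)                      # Callable[[Hashable], Hashable]
#     df.groupby({"x": "g1", "y": "g2"})   # Mapping[Hashable, Hashable]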



class _GroupBy(PandasObject, SelectionMixin):
    _group_selection = None
    _apply_whitelist: FrozenSet[str] = frozenset()

    def __init__(
        self,
        obj: NDFrame,
        keys: Optional[_KeysArgType] = None,
        axis: int = 0,
        level=None,
        grouper: "Optional[ops.BaseGrouper]" = None,
        exclusions=None,
        selection=None,
        as_index: bool = True,
        sort: bool = True,
        group_keys: bool = True,
        squeeze: bool = False,
        observed: bool = False,
        mutated: bool = False,
    ):

        self._selection = selection

        assert isinstance(obj, NDFrame), type(obj)
        obj._consolidate_inplace()

        self.level = level

        if not as_index:
            if not isinstance(obj, DataFrame):
                raise TypeError("as_index=False only valid with DataFrame")
            if axis != 0:
                raise ValueError("as_index=False only valid for axis=0")

        self.as_index = as_index
        self.keys = keys
        self.sort = sort
        self.group_keys = group_keys
        self.squeeze = squeeze
        self.observed = observed
        self.mutated = mutated

        if grouper is None:
            from pandas.core.groupby.grouper import get_grouper

            grouper, exclusions, obj = get_grouper(
                obj,
                keys,
                axis=axis,
                level=level,
                sort=sort,
                observed=observed,
                mutated=self.mutated,
            )

        self.obj = obj
        self.axis = obj._get_axis_number(axis)
        self.grouper = grouper
        self.exclusions = set(exclusions) if exclusions else set()

    def __len__(self) -> int:
        return len(self.groups)

    def __repr__(self) -> str:
        # TODO: Better repr for GroupBy object
        return object.__repr__(self)

    def _assure_grouper(self):
        """
        We create the grouper on instantiation; sub-classes may have a
        different policy.
        """
        pass

    @property
    def groups(self):
        """
        Dict {group name -> group labels}.
        """
        self._assure_grouper()
        return self.grouper.groups

    @property
    def ngroups(self):
        self._assure_grouper()
        return self.grouper.ngroups

    @property
    def indices(self):
        """
        Dict {group name -> group indices}.
        """
        self._assure_grouper()
        return self.grouper.indices

    def _get_indices(self, names):
        """
        Safely get multiple indices, translating keys for
        datelike values to the underlying repr.
        """

        def get_converter(s):
            # possibly convert to the actual key types
            # in the indices, could be a Timestamp or a np.datetime64
            if isinstance(s, datetime.datetime):
                return lambda key: Timestamp(key)
            elif isinstance(s, np.datetime64):
                return lambda key: Timestamp(key).asm8
            else:
                return lambda key: key

        if len(names) == 0:
            return []

        if len(self.indices) > 0:
            index_sample = next(iter(self.indices))
        else:
            index_sample = None  # Dummy sample

        name_sample = names[0]
        if isinstance(index_sample, tuple):
            if not isinstance(name_sample, tuple):
                msg = "must supply a tuple to get_group with multiple grouping keys"
                raise ValueError(msg)
            if not len(name_sample) == len(index_sample):
                try:
                    # If the original grouper was a tuple
                    return [self.indices[name] for name in names]
                except KeyError:
                    # turns out it wasn't a tuple
                    msg = (
                        "must supply a same-length tuple to get_group "
                        "with multiple grouping keys"
                    )
                    raise ValueError(msg)

            converters = [get_converter(s) for s in index_sample]
            names = (tuple(f(n) for f, n in zip(converters, name)) for name in names)

        else:
            converter = get_converter(index_sample)
            names = (converter(name) for name in names)

        return [self.indices.get(name, []) for name in names]
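
        # Worked example (illustrative): if the group keys are Timestamps but
        # the caller passes ``datetime.datetime(2020, 1, 1)``, ``get_converter``
        # normalizes the key so the ``self.indices`` lookup still hits.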


    def _get_index(self, name):
        """
        Safely get an index, translating a datelike key to the underlying repr.
        """
        return self._get_indices([name])[0]

    @cache_readonly
    def _selected_obj(self):
        # Note: _selected_obj is always just `self.obj` for SeriesGroupBy

        if self._selection is None or isinstance(self.obj, Series):
            if self._group_selection is not None:
                return self.obj[self._group_selection]
            return self.obj
        else:
            return self.obj[self._selection]

    def _reset_group_selection(self):
        """
        Clear group based selection.

        Used for methods needing to return info on each group regardless of
        whether a group selection was previously set.
        """
        if self._group_selection is not None:
            # GH12839 clear cached selection too when changing group selection
            self._group_selection = None
            self._reset_cache("_selected_obj")

    def _set_group_selection(self):
        """
        Create group based selection.

        Used when selection is not passed directly but instead via a grouper.

        NOTE: this should be paired with a call to _reset_group_selection
        """
        grp = self.grouper
        if not (
            self.as_index
            and getattr(grp, "groupings", None) is not None
            and self.obj.ndim > 1
            and self._group_selection is None
        ):
            return

        ax = self.obj._info_axis
        groupers = [g.name for g in grp.groupings if g.level is None and g.in_axis]

        if len(groupers):
            # GH12839 clear selected obj cache when group selection changes
            self._group_selection = ax.difference(Index(groupers), sort=False).tolist()
            self._reset_cache("_selected_obj")

    def _set_result_index_ordered(self, result):
        # set the result index on the passed values object and
        # return the new object, xref 8046

        # the values/counts are repeated according to the group index
        # shortcut if we have an already ordered grouper
        if not self.grouper.is_monotonic:
            index = Index(np.concatenate(self._get_indices(self.grouper.result_index)))
            result.set_axis(index, axis=self.axis, inplace=True)
            result = result.sort_index(axis=self.axis)

        result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
        return result

    def _dir_additions(self):
        return self.obj._dir_additions() | self._apply_whitelist

    def __getattr__(self, attr: str):
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        if attr in self.obj:
            return self[attr]

        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{attr}'"
        )

    @Substitution(
        klass="GroupBy",
        versionadded=".. versionadded:: 0.21.0",
        examples="""\
>>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
>>> df
   A  B
0  a  1
1  b  2
2  a  3
3  b  4

To get the difference between each group's maximum and minimum value in one
pass, you can do

>>> df.groupby('A').pipe(lambda x: x.max() - x.min())
   B
A
a  2
b  2""",
    )
    @Appender(_pipe_template)
    def pipe(self, func, *args, **kwargs):
        return com.pipe(self, func, *args, **kwargs)

    plot = property(GroupByPlot)

    def _make_wrapper(self, name):
        assert name in self._apply_whitelist

        self._set_group_selection()

        # need to set up the selection,
        # as it is not passed directly but in the grouper
        f = getattr(self._selected_obj, name)
        if not isinstance(f, types.MethodType):
            return self.apply(lambda self: getattr(self, name))

        f = getattr(type(self._selected_obj), name)
        sig = inspect.signature(f)

        def wrapper(*args, **kwargs):
            # a little trickery for aggregation functions that need an axis
            # argument
            if "axis" in sig.parameters:
                if kwargs.get("axis", None) is None:
                    kwargs["axis"] = self.axis

            def curried(x):
                return f(x, *args, **kwargs)

            # preserve the name so we can detect it when calling plot methods,
            # to avoid duplicates
            curried.__name__ = name

            # special case otherwise extra plots are created when catching the
            # exception below
            if name in base.plotting_methods:
                return self.apply(curried)

            try:
                return self.apply(curried)
            except TypeError as err:
                if not re.search(
                    "reduction operation '.*' not allowed for this dtype", str(err)
                ):
                    # We don't have a cython implementation
                    # TODO: is the above comment accurate?
                    raise

            if self.obj.ndim == 1:
                # this can be called recursively, so need to raise ValueError
                raise ValueError

            # GH#3688 try to operate item-by-item
            result = self._aggregate_item_by_item(name, *args, **kwargs)
            return result

        wrapper.__name__ = name
        return wrapper

    def get_group(self, name, obj=None):
        """
        Construct DataFrame from group with provided name.

        Parameters
        ----------
        name : object
            The name of the group to get as a DataFrame.
        obj : DataFrame, default None
            The DataFrame to take the DataFrame out of. If
            it is None, the object groupby was called on will
            be used.

        Returns
        -------
        group : same type as obj
        """
        if obj is None:
            obj = self._selected_obj

        inds = self._get_index(name)
        if not len(inds):
            raise KeyError(name)

        return obj._take_with_is_copy(inds, axis=self.axis)

    def __iter__(self):
        """
        Groupby iterator.

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group
        """
        return self.grouper.get_iterator(self.obj, axis=self.axis)

    @Appender(
        _apply_docs["template"].format(
            input="dataframe", examples=_apply_docs["dataframe_examples"]
        )
    )
    def apply(self, func, *args, **kwargs):

        func = self._is_builtin_func(func)

        # this is needed so we don't try and wrap strings. If we could
        # resolve functions to their callable functions prior, this
        # wouldn't be needed
        if args or kwargs:
            if callable(func):

                @wraps(func)
                def f(g):
                    with np.errstate(all="ignore"):
                        return func(g, *args, **kwargs)

            elif hasattr(nanops, "nan" + func):
                # TODO: should we wrap this in to e.g. _is_builtin_func?
                f = getattr(nanops, "nan" + func)

            else:
                raise ValueError(
                    "func must be a callable if args or kwargs are supplied"
                )
        else:
            f = func

        # ignore SettingWithCopy here in case the user mutates
        with option_context("mode.chained_assignment", None):
            try:
                result = self._python_apply_general(f)
            except TypeError:
                # gh-20949
                # try again, with .apply acting as a filtering
                # operation, by excluding the grouping column
                # This would normally not be triggered
                # except if the udf is trying an operation that
                # fails on *some* columns, e.g. a numeric operation
                # on a string grouper column

                with _group_selection_context(self):
                    return self._python_apply_general(f)

        return result
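
        # Sketch of the double-call caveat documented in _apply_docs
        # (illustrative, not executed):
        #
        #     >>> calls = []
        #     >>> df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
        #     >>> df.groupby("a").apply(lambda g: calls.append(g.name))
        #     >>> calls  # first group evaluated twice (fast/slow path probe)
        #     [1, 1, 2]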


    def _python_apply_general(self, f):
        keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis)

        return self._wrap_applied_output(
            keys, values, not_indexed_same=mutated or self.mutated
        )

    def _iterate_slices(self) -> Iterable[Series]:
        raise AbstractMethodError(self)

    def transform(self, func, *args, **kwargs):
        raise AbstractMethodError(self)

    def _cumcount_array(self, ascending: bool = True):
        """
        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from length of group - 1 to 0.

        Notes
        -----
        this is currently implementing sort=False
        (though the default is sort=True) for groupby in general
        """
        ids, _, ngroups = self.grouper.group_info
        sorter = get_group_index_sorter(ids, ngroups)
        ids, count = ids[sorter], len(ids)

        if count == 0:
            return np.empty(0, dtype=np.int64)

        run = np.r_[True, ids[:-1] != ids[1:]]
        rep = np.diff(np.r_[np.nonzero(run)[0], count])
        out = (~run).cumsum()

        if ascending:
            out -= np.repeat(out[run], rep)
        else:
            out = np.repeat(out[np.r_[run[1:], True]], rep) - out

        rev = np.empty(count, dtype=np.intp)
        rev[sorter] = np.arange(count, dtype=np.intp)
        return out[rev].astype(np.int64, copy=False)
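
        # Worked example of the run-length logic above (illustrative): for
        # sorted ids [0, 0, 1, 1, 1], ``run`` is [True, False, True, False,
        # False], ``rep`` is [2, 3], and the ascending count is [0, 1, 0, 1, 2].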


    def _try_cast(self, result, obj, numeric_only: bool = False):
        """
        Try to cast the result back to our obj's original type;
        we may have roundtripped through object in the meantime.

        If numeric_only is True, then only try to cast numerics
        and not datetimelikes.
        """
        if obj.ndim > 1:
            dtype = obj._values.dtype
        else:
            dtype = obj.dtype

        if not is_scalar(result):
            if (
                is_extension_array_dtype(dtype)
                and not is_categorical_dtype(dtype)
                and dtype.kind != "M"
            ):
                # We have to special case categorical so as not to upcast
                # things like counts back to categorical
                cls = dtype.construct_array_type()
                result = try_cast_to_ea(cls, result, dtype=dtype)

            elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
                result = maybe_downcast_to_dtype(result, dtype)

        return result
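
        # Note the precedence in the final ``elif`` above: it parses as
        # ``(numeric_only and is_numeric_dtype(dtype)) or (not numeric_only)``,
        # i.e. we downcast unless numeric_only is set and the dtype is
        # non-numeric.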


    def _transform_should_cast(self, func_nm: str) -> bool:
        """
        Parameters
        ----------
        func_nm : str
            The name of the aggregation function being performed

        Returns
        -------
        bool
            Whether transform should attempt to cast the result of aggregation
        """
        return (self.size().fillna(0) > 0).any() and (
            func_nm not in base.cython_cast_blacklist
        )

    def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs):
        output: Dict[base.OutputKey, np.ndarray] = {}
        for idx, obj in enumerate(self._iterate_slices()):
            name = obj.name
            is_numeric = is_numeric_dtype(obj.dtype)
            if numeric_only and not is_numeric:
                continue

            try:
                result, _ = self.grouper.transform(obj.values, how, **kwargs)
            except NotImplementedError:
                continue

            if self._transform_should_cast(how):
                result = self._try_cast(result, obj)

            key = base.OutputKey(label=name, position=idx)
            output[key] = result

        if len(output) == 0:
            raise DataError("No numeric types to aggregate")

        return self._wrap_transformed_output(output)

    def _wrap_aggregated_output(self, output: Mapping[base.OutputKey, np.ndarray]):
        raise AbstractMethodError(self)

    def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]):
        raise AbstractMethodError(self)

    def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False):
        raise AbstractMethodError(self)

    def _cython_agg_general(
        self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
    ):
        output: Dict[base.OutputKey, Union[np.ndarray, DatetimeArray]] = {}
        # Ideally we would be able to enumerate self._iterate_slices and use
        # the index from enumeration as the key of output, but ohlc in particular
        # returns a (n x 4) array. Output requires 1D ndarrays as values, so we
        # need to slice that up into 1D arrays
        idx = 0
        for obj in self._iterate_slices():
            name = obj.name
            is_numeric = is_numeric_dtype(obj.dtype)
            if numeric_only and not is_numeric:
                continue

            result, agg_names = self.grouper.aggregate(
                obj._values, how, min_count=min_count
            )

            if agg_names:
                # e.g. ohlc
                assert len(agg_names) == result.shape[1]
                for result_column, result_name in zip(result.T, agg_names):
                    key = base.OutputKey(label=result_name, position=idx)
                    output[key] = self._try_cast(result_column, obj)
                    idx += 1
            else:
                assert result.ndim == 1
                key = base.OutputKey(label=name, position=idx)
                output[key] = self._try_cast(result, obj)
                idx += 1

        if len(output) == 0:
            raise DataError("No numeric types to aggregate")

        return self._wrap_aggregated_output(output)

    def _python_agg_general(self, func, *args, **kwargs):
        func = self._is_builtin_func(func)
        f = lambda x: func(x, *args, **kwargs)

        # iterate through "columns" ex exclusions to populate output dict
        output: Dict[base.OutputKey, np.ndarray] = {}

        for idx, obj in enumerate(self._iterate_slices()):
            name = obj.name
            if self.grouper.ngroups == 0:
                # agg_series below assumes ngroups > 0
                continue

            try:
                # if this function is invalid for this dtype, we will ignore it.
                result, counts = self.grouper.agg_series(obj, f)
            except TypeError:
                continue

            assert result is not None
            key = base.OutputKey(label=name, position=idx)
            output[key] = self._try_cast(result, obj, numeric_only=True)

        if len(output) == 0:
            return self._python_apply_general(f)

        if self.grouper._filter_empty_groups:

            mask = counts.ravel() > 0
            for key, result in output.items():

                # since we are masking, make sure that we have a float object
                values = result
                if is_numeric_dtype(values.dtype):
                    values = ensure_float(values)

                output[key] = self._try_cast(values[mask], result)

        return self._wrap_aggregated_output(output)

    def _concat_objects(self, keys, values, not_indexed_same: bool = False):
        from pandas.core.reshape.concat import concat

        def reset_identity(values):
            # reset the identities of the components
            # of the values to prevent aliasing
            for v in com.not_none(*values):
                ax = v._get_axis(self.axis)
                ax._reset_identity()
            return values

        if not not_indexed_same:
            result = concat(values, axis=self.axis)
            ax = self._selected_obj._get_axis(self.axis)

            if isinstance(result, Series):
                result = result.reindex(ax)
            else:

                # this is a very unfortunate situation
                # we have a multi-index that is NOT lexsorted
                # and we have a result which is duplicated
                # we can't reindex, so we resort to this
                # GH 14776
                if isinstance(ax, MultiIndex) and not ax.is_unique:
                    indexer = algorithms.unique1d(
                        result.index.get_indexer_for(ax.values)
                    )
                    result = result.take(indexer, axis=self.axis)
                else:
                    result = result.reindex(ax, axis=self.axis)

        elif self.group_keys:

            values = reset_identity(values)
            if self.as_index:

                # possible MI return case
                group_keys = keys
                group_levels = self.grouper.levels
                group_names = self.grouper.names

                result = concat(
                    values,
                    axis=self.axis,
                    keys=group_keys,
                    levels=group_levels,
                    names=group_names,
                    sort=False,
                )
            else:

                # GH5610, returns a MI, with the first level being a
                # range index
                keys = list(range(len(values)))
                result = concat(values, axis=self.axis, keys=keys)
        else:
            values = reset_identity(values)
            result = concat(values, axis=self.axis)

        if isinstance(result, Series) and self._selection_name is not None:

            result.name = self._selection_name

        return result

    def _apply_filter(self, indices, dropna):
        if len(indices) == 0:
            indices = np.array([], dtype="int64")
        else:
            indices = np.sort(np.concatenate(indices))
        if dropna:
            filtered = self._selected_obj.take(indices, axis=self.axis)
        else:
            mask = np.empty(len(self._selected_obj.index), dtype=bool)
            mask.fill(False)
            mask[indices.astype(int)] = True
            # mask fails to broadcast when passed to where; broadcast manually.
            mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
            filtered = self._selected_obj.where(mask)  # Fill with NaNs.
        return filtered


class GroupBy(_GroupBy):
    """
    Class for grouping and aggregating relational data.

    See aggregate, transform, and apply functions on this object.

    It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:

    ::

        grouped = groupby(obj, ...)

    Parameters
    ----------
    obj : pandas object
    axis : int, default 0
    level : int, default None
        Level of MultiIndex
    groupings : list of Grouping objects
        Most users should ignore this
    exclusions : array-like, optional
        List of columns to exclude
    name : str
        Most users should ignore this

    Returns
    -------
    **Attributes**
    groups : dict
        {group name -> group labels}
    len(grouped) : int
        Number of groups

    Notes
    -----
    After grouping, see aggregate, apply, and transform functions. Here are
    some other brief notes about usage. When grouping by multiple groups, the
    result index will be a MultiIndex (hierarchical) by default.

    Iteration produces (key, group) tuples, i.e. chunking the data by group. So
    you can write code like:

    ::

        grouped = obj.groupby(keys, axis=axis)
        for key, group in grouped:
            # do something with the data

    Function calls on GroupBy, if not specially implemented, "dispatch" to the
    grouped data. So if you group a DataFrame and wish to invoke the std()
    method on each group, you can simply do:

    ::

        df.groupby(mapper).std()

    rather than

    ::

        df.groupby(mapper).aggregate(np.std)

    You can pass arguments to these "wrapped" functions, too.

    See the online documentation for full exposition on these topics and much
    more.
    """

    def _bool_agg(self, val_test, skipna):
        """
        Shared func to call any / all Cython GroupBy implementations.
        """

        def objs_to_bool(vals: np.ndarray) -> Tuple[np.ndarray, Type]:
            if is_object_dtype(vals):
                vals = np.array([bool(x) for x in vals])
            else:
                vals = vals.astype(np.bool)

            return vals.view(np.uint8), np.bool

        def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray:
            return result.astype(inference, copy=False)

        return self._get_cythonized_result(
            "group_any_all",
            aggregate=True,
            cython_dtype=np.dtype(np.uint8),
            needs_values=True,
            needs_mask=True,
            pre_processing=objs_to_bool,
            post_processing=result_to_bool,
            val_test=val_test,
            skipna=skipna,
        )
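
        # The uint8 round-trip matches the Cython kernel's buffer type:
        # ``objs_to_bool`` packs the values for ``group_any_all`` and
        # ``result_to_bool`` restores a boolean dtype afterwards.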


    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def any(self, skipna: bool = True):
        """
        Return True if any value in the group is truthy, else False.

        Parameters
        ----------
        skipna : bool, default True
            Flag to ignore nan values during truth testing.

        Returns
        -------
        bool
        """
        return self._bool_agg("any", skipna)

    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def all(self, skipna: bool = True):
        """
        Return True if all values in the group are truthy, else False.

        Parameters
        ----------
        skipna : bool, default True
            Flag to ignore nan values during truth testing.

        Returns
        -------
        bool
        """
        return self._bool_agg("all", skipna)

    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def count(self):
        """
        Compute count of group, excluding missing values.

        Returns
        -------
        Series or DataFrame
            Count of values within each group.
        """

        # defined here for API doc
        raise NotImplementedError

    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def mean(self, *args, **kwargs):
        """
        Compute mean of groups, excluding missing values.

        Returns
        -------
        pandas.Series or pandas.DataFrame
        %(see_also)s
        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
        ...                    'B': [np.nan, 2, 3, 4, 5],
        ...                    'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])

        Groupby one column and return the mean of the remaining columns in
        each group.

        >>> df.groupby('A').mean()
             B         C
        A
        1  3.0  1.333333
        2  4.0  1.500000

        Groupby two columns and return the mean of the remaining column.

        >>> df.groupby(['A', 'B']).mean()
               C
        A B
        1 2.0  2
          4.0  1
        2 3.0  1
          5.0  2

        Groupby one column and return the mean of only particular column in
        the group.

        >>> df.groupby('A')['B'].mean()
        A
        1    3.0
        2    4.0
        Name: B, dtype: float64
        """
        nv.validate_groupby_func("mean", args, kwargs, ["numeric_only"])
        return self._cython_agg_general(
            "mean", alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs
        )

    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def median(self, **kwargs):
        """
        Compute median of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Returns
        -------
        Series or DataFrame
            Median of values within each group.
        """
        return self._cython_agg_general(
            "median",
            alt=lambda x, axis: Series(x).median(axis=axis, **kwargs),
            **kwargs,
        )

    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def std(self, ddof: int = 1, *args, **kwargs):
        """
        Compute standard deviation of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Parameters
        ----------
        ddof : int, default 1
            Degrees of freedom.

        Returns
        -------
        Series or DataFrame
            Standard deviation of values within each group.
        """

        # TODO: implement at Cython level?
        nv.validate_groupby_func("std", args, kwargs)
        return np.sqrt(self.var(ddof=ddof, **kwargs))

    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def var(self, ddof: int = 1, *args, **kwargs):
        """
        Compute variance of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Parameters
        ----------
        ddof : int, default 1
            Degrees of freedom.

        Returns
        -------
        Series or DataFrame
            Variance of values within each group.
        """
        nv.validate_groupby_func("var", args, kwargs)
        if ddof == 1:
            return self._cython_agg_general(
                "var", alt=lambda x, axis: Series(x).var(ddof=ddof, **kwargs), **kwargs
            )
        else:
            f = lambda x: x.var(ddof=ddof, **kwargs)
            with _group_selection_context(self):
                return self._python_agg_general(f)

    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def sem(self, ddof: int = 1):
        """
        Compute standard error of the mean of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Parameters
        ----------
        ddof : int, default 1
            Degrees of freedom.

        Returns
        -------
        Series or DataFrame
            Standard error of the mean of values within each group.
        """
        return self.std(ddof=ddof) / np.sqrt(self.count())
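
        # Numeric check (illustrative): for a group [1, 2, 3], std(ddof=1) is
        # 1.0 and count is 3, so sem = 1.0 / sqrt(3), about 0.577.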


    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def size(self):
        """
        Compute group sizes.

        Returns
        -------
        Series
            Number of rows in each group.
        """
        result = self.grouper.size()

        if isinstance(self.obj, Series):
            result.name = self.obj.name
        return self._reindex_output(result, fill_value=0)

    @classmethod
    def _add_numeric_operations(cls):
        """
        Add numeric operations to the GroupBy generically.
        """

        def groupby_function(
            name: str,
            alias: str,
            npfunc,
            numeric_only: bool = True,
            min_count: int = -1,
        ):

            _local_template = """
            Compute %(f)s of group values.

            Returns
            -------
            Series or DataFrame
                Computed %(f)s of values within each group.
            """

            @Substitution(name="groupby", f=name)
            @Appender(_common_see_also)
            @Appender(_local_template)
            def f(self, **kwargs):
                if "numeric_only" not in kwargs:
                    kwargs["numeric_only"] = numeric_only
                if "min_count" not in kwargs:
                    kwargs["min_count"] = min_count

                self._set_group_selection()

                # try a cython aggregation if we can
                try:
                    return self._cython_agg_general(alias, alt=npfunc, **kwargs)
                except DataError:
                    pass
                except NotImplementedError as err:
                    if "function is not implemented for this dtype" in str(
                        err
                    ) or "category dtype not supported" in str(err):
                        # raised in _get_cython_function, in some cases can
                        # be trimmed by implementing cython funcs for more dtypes
                        pass
                    else:
                        raise

                # apply a non-cython aggregation
                result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
                return result

            set_function_name(f, name, cls)

            return f

        def first_compat(x, axis=0):
            def first(x):
                x = x.to_numpy()

                x = x[notna(x)]
                if len(x) == 0:
                    return np.nan
                return x[0]

            if isinstance(x, DataFrame):
                return x.apply(first, axis=axis)
            else:
                return first(x)

        def last_compat(x, axis=0):
            def last(x):
                x = x.to_numpy()
                x = x[notna(x)]
                if len(x) == 0:
                    return np.nan
                return x[-1]

            if isinstance(x, DataFrame):
                return x.apply(last, axis=axis)
            else:
                return last(x)

        cls.sum = groupby_function("sum", "add", np.sum, min_count=0)
        cls.prod = groupby_function("prod", "prod", np.prod, min_count=0)
        cls.min = groupby_function("min", "min", np.min, numeric_only=False)
        cls.max = groupby_function("max", "max", np.max, numeric_only=False)
        cls.first = groupby_function("first", "first", first_compat, numeric_only=False)
        cls.last = groupby_function("last", "last", last_compat, numeric_only=False)

    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def ohlc(self) -> DataFrame:
        """
        Compute open, high, low and close values of a group, excluding
        missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Returns
        -------
        DataFrame
            Open, high, low and close values within each group.
        """

        return self._apply_to_column_groupbys(lambda x: x._cython_agg_general("ohlc"))

    @Appender(DataFrame.describe.__doc__)
    def describe(self, **kwargs):
        with _group_selection_context(self):
            result = self.apply(lambda x: x.describe(**kwargs))
            if self.axis == 1:
                return result.T
            return result.unstack()

    def resample(self, rule, *args, **kwargs):
        """
        Provide resampling when using a TimeGrouper.

        Given a grouper, the function resamples it according to a
        frequency string, e.g. "3T" for 3-minute bins.

        See the :ref:`frequency aliases <timeseries.offset_aliases>`
        documentation for more details.

        Parameters
        ----------
        rule : str or DateOffset
            The offset string or object representing target grouper conversion.
        *args, **kwargs
            Possible arguments are `how`, `fill_method`, `limit`, `kind` and
            `on`, and other arguments of `TimeGrouper`.

        Returns
        -------
        Grouper
            Return a new grouper with our resampler appended.

        See Also
        --------
        Grouper : Specify a frequency to resample with when
            grouping by a key.
        DatetimeIndex.resample : Frequency conversion and resampling of
            time series.

        Examples
        --------
        >>> idx = pd.date_range('1/1/2000', periods=4, freq='T')
        >>> df = pd.DataFrame(data=4 * [range(2)],
        ...                   index=idx,
        ...                   columns=['a', 'b'])
        >>> df.iloc[2, 0] = 5
        >>> df
                             a  b
        2000-01-01 00:00:00  0  1
        2000-01-01 00:01:00  0  1
        2000-01-01 00:02:00  5  1
        2000-01-01 00:03:00  0  1

        Downsample the DataFrame into 3 minute bins and sum the values of
        the timestamps falling into a bin.

        >>> df.groupby('a').resample('3T').sum()
                                 a  b
        a
        0   2000-01-01 00:00:00  0  2
            2000-01-01 00:03:00  0  1
        5   2000-01-01 00:00:00  5  1

        Upsample the series into 30 second bins.

        >>> df.groupby('a').resample('30S').sum()
                                 a  b
        a
        0   2000-01-01 00:00:00  0  1
            2000-01-01 00:00:30  0  0
            2000-01-01 00:01:00  0  1
            2000-01-01 00:01:30  0  0
            2000-01-01 00:02:00  0  0
            2000-01-01 00:02:30  0  0
            2000-01-01 00:03:00  0  1
        5   2000-01-01 00:02:00  5  1

        Resample by month. Values are assigned to the month of the period.

        >>> df.groupby('a').resample('M').sum()
                       a  b
        a
        0   2000-01-31  0  3
        5   2000-01-31  5  1

        Downsample the series into 3 minute bins as above, but close the right
        side of the bin interval.

        >>> df.groupby('a').resample('3T', closed='right').sum()
                                 a  b
        a
        0   1999-12-31 23:57:00  0  1
            2000-01-01 00:00:00  0  2
        5   2000-01-01 00:00:00  5  1

        Downsample the series into 3 minute bins and close the right side of
        the bin interval, but label each bin using the right edge instead of
        the left.

        >>> df.groupby('a').resample('3T', closed='right', label='right').sum()
                                 a  b
        a
        0   2000-01-01 00:00:00  0  1
            2000-01-01 00:03:00  0  2
        5   2000-01-01 00:03:00  5  1

        Add an offset of twenty seconds.

        >>> df.groupby('a').resample('3T', loffset='20s').sum()
                                 a  b
        a
        0   2000-01-01 00:00:20  0  2
            2000-01-01 00:03:20  0  1
        5   2000-01-01 00:00:20  5  1
        """
        from pandas.core.resample import get_resampler_for_grouping

        return get_resampler_for_grouping(self, rule, *args, **kwargs)

    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def rolling(self, *args, **kwargs):
        """
        Return a rolling grouper, providing rolling functionality per group.
        """
        from pandas.core.window import RollingGroupby

        return RollingGroupby(self, *args, **kwargs)

    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def expanding(self, *args, **kwargs):
        """
        Return an expanding grouper, providing expanding
        functionality per group.
        """
        from pandas.core.window import ExpandingGroupby

        return ExpandingGroupby(self, *args, **kwargs)

    def _fill(self, direction, limit=None):
        """
        Shared function for `pad` and `backfill` to call Cython method.

        Parameters
        ----------
        direction : {'ffill', 'bfill'}
            Direction passed to underlying Cython function. `bfill` will cause
            values to be filled backwards. `ffill` and any other values will
            default to a forward fill.
        limit : int, default None
            Maximum number of consecutive values to fill. If `None`, this
            method will convert to -1 prior to passing to Cython.

        Returns
        -------
        `Series` or `DataFrame` with filled values

        See Also
        --------
        pad
        backfill
        """
        # Need int value for Cython
        if limit is None:
            limit = -1

        return self._get_cythonized_result(
            "group_fillna_indexer",
            needs_mask=True,
            cython_dtype=np.dtype(np.int64),
            result_is_index=True,
            direction=direction,
            limit=limit,
        )
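
        # Shared fill behavior (illustrative): within one group, values
        # [NaN, 1, NaN, NaN] with direction='ffill' and limit=1 become
        # [NaN, 1, 1, NaN]; ``result_is_index=True`` because the Cython kernel
        # returns positions to take rather than filled values.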


    @Substitution(name="groupby")
    def pad(self, limit=None):
        """
        Forward fill the values.

        Parameters
        ----------
        limit : int, optional
            Limit of how many values to fill.

        Returns
        -------
        Series or DataFrame
            Object with missing values filled.

        See Also
        --------
        Series.pad
        DataFrame.pad
        Series.fillna
        DataFrame.fillna
        """
        return self._fill("ffill", limit=limit)

    ffill = pad

    @Substitution(name="groupby")
    def backfill(self, limit=None):
        """
        Backward fill the values.

        Parameters
        ----------
        limit : int, optional
            Limit of how many values to fill.

        Returns
        -------
        Series or DataFrame
            Object with missing values filled.

        See Also
        --------
        Series.backfill
        DataFrame.backfill
        Series.fillna
        DataFrame.fillna
        """
        return self._fill("bfill", limit=limit)

    bfill = backfill

    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFrame:
        """
        Take the nth row from each group if n is an int, or a subset of rows
        if n is a list of ints.

        If dropna, will take the nth non-null row; dropna is either
        'all' or 'any', and this is equivalent to calling dropna(how=dropna)
        before the groupby.

        Parameters
        ----------
        n : int or list of ints
            A single nth value for the row or a list of nth values.
        dropna : None or str, optional
            Apply the specified dropna operation before counting which row is
            the nth row. Needs to be None, 'any' or 'all'.

        Returns
        -------
        Series or DataFrame
            N-th value within each group.
        %(see_also)s
        Examples
        --------

        >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
        ...                    'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])
        >>> g = df.groupby('A')
        >>> g.nth(0)
             B
        A
        1  NaN
        2  3.0
        >>> g.nth(1)
             B
        A
        1  2.0
        2  5.0
        >>> g.nth(-1)
             B
        A
        1  4.0
        2  5.0
        >>> g.nth([0, 1])
             B
        A
        1  NaN
        1  2.0
        2  3.0
        2  5.0

        Specifying `dropna` allows counting to ignore ``NaN``

        >>> g.nth(0, dropna='any')
             B
        A
        1  2.0
        2  3.0

        NaNs denote group exhausted when using dropna

        >>> g.nth(3, dropna='any')
            B
        A
        1 NaN
        2 NaN

        Specifying `as_index=False` in `groupby` keeps the original index.

        >>> df.groupby('A', as_index=False).nth(1)
           A    B
        1  1  2.0
        4  2  5.0
        """

        valid_containers = (set, list, tuple)
        if not isinstance(n, (valid_containers, int)):
            raise TypeError("n needs to be an int or a list/set/tuple of ints")

        if not dropna:

            if isinstance(n, int):
                nth_values = [n]
            elif isinstance(n, valid_containers):
                nth_values = list(set(n))

            nth_array = np.array(nth_values, dtype=np.intp)
            self._set_group_selection()

            mask_left = np.in1d(self._cumcount_array(), nth_array)
            mask_right = np.in1d(self._cumcount_array(ascending=False) + 1, -nth_array)
            mask = mask_left | mask_right

            ids, _, _ = self.grouper.group_info

            # Drop NA values in grouping
            mask = mask & (ids != -1)

            out = self._selected_obj[mask]
            if not self.as_index:
                return out

            result_index = self.grouper.result_index
            out.index = result_index[ids[mask]]

            if not self.observed and isinstance(result_index, CategoricalIndex):
                out = out.reindex(result_index)

            out = self._reindex_output(out)
            return out.sort_index() if self.sort else out

        # dropna is truthy
        if isinstance(n, valid_containers):
            raise ValueError("dropna option with a list of nth values is not supported")

        if dropna not in ["any", "all"]:
            # Note: when agg-ing picker doesn't raise this, just returns NaN
            raise ValueError(
                "For a DataFrame groupby, dropna must be "
                "either None, 'any' or 'all', "
                f"(was passed {dropna})."
            )

        # old behaviour, but with all and any support for DataFrames.
        # modified in GH 7559 to have better perf
        max_len = n if n >= 0 else -1 - n
        dropped = self.obj.dropna(how=dropna, axis=self.axis)

        # get a new grouper for our dropped obj
        if self.keys is None and self.level is None:

            # we don't have the grouper info available
            # (e.g. we have selected out
            # a column that is not in the current object)
            axis = self.grouper.axis
            grouper = axis[axis.isin(dropped.index)]

        else:

            # create a grouper with the original parameters, but on dropped
            # object
            from pandas.core.groupby.grouper import get_grouper

            grouper, _, _ = get_grouper(
                dropped,
                key=self.keys,
                axis=self.axis,
                level=self.level,
                sort=self.sort,
                mutated=self.mutated,
            )

        grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort)
        sizes, result = grb.size(), grb.nth(n)
        mask = (sizes < max_len).values

        # set the results which don't meet the criteria
        if len(result) and mask.any():
            result.loc[mask] = np.nan

        # reset/reindex to the original groups
        if len(self.obj) == len(dropped) or len(result) == len(
            self.grouper.result_index
        ):
            result.index = self.grouper.result_index
        else:
            result = result.reindex(self.grouper.result_index)

        return result

    def quantile(self, q=0.5, interpolation: str = "linear"):
        """
        Return group values at the given quantile, a la numpy.percentile.

        Parameters
        ----------
        q : float or array-like, default 0.5 (50% quantile)
            Value(s) between 0 and 1 providing the quantile(s) to compute.
        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
            Method to use when the desired quantile falls between two points.

        Returns
        -------
        Series or DataFrame
            Return type determined by caller of GroupBy object.

        See Also
        --------
        Series.quantile : Similar method for Series.
        DataFrame.quantile : Similar method for DataFrame.
        numpy.percentile : NumPy method to compute qth percentile.

        Examples
        --------
        >>> df = pd.DataFrame([
        ...     ['a', 1], ['a', 2], ['a', 3],
        ...     ['b', 1], ['b', 3], ['b', 5]
        ... ], columns=['key', 'val'])
        >>> df.groupby('key').quantile()
             val
        key
        a    2.0
        b    3.0
        """
        from pandas import concat

        def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]:
            if is_object_dtype(vals):
                raise TypeError(
                    "'quantile' cannot be performed against 'object' dtypes!"
                )

            inference = None
            if is_integer_dtype(vals):
                inference = np.int64
            elif is_datetime64_dtype(vals):
                inference = "datetime64[ns]"
                vals = vals.astype(np.float)

            return vals, inference

        def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray:
            if inference:
                # Check for edge case
                if not (
                    is_integer_dtype(inference)
                    and interpolation in {"linear", "midpoint"}
                ):
                    vals = vals.astype(inference)

            return vals

        if is_scalar(q):
            return self._get_cythonized_result(
                "group_quantile",
                aggregate=True,
                needs_values=True,
                needs_mask=True,
                cython_dtype=np.dtype(np.float64),
                pre_processing=pre_processor,
                post_processing=post_processor,
                q=q,
                interpolation=interpolation,
            )
        else:
            results = [
                self._get_cythonized_result(
                    "group_quantile",
                    aggregate=True,
                    needs_values=True,
                    needs_mask=True,
                    cython_dtype=np.dtype(np.float64),
                    pre_processing=pre_processor,
                    post_processing=post_processor,
                    q=qi,
                    interpolation=interpolation,
                )
                for qi in q
            ]
            result = concat(results, axis=0, keys=q)
            # fix levels to place quantiles on the inside
            # TODO(GH-10710): Ideally, we could write this as
            #  >>> result.stack(0).loc[pd.IndexSlice[:, ..., q], :]
            #  but this hits https://github.com/pandas-dev/pandas/issues/10710
            #  which doesn't reorder the list-like `q` on the inner level.
            order = list(range(1, result.index.nlevels)) + [0]

            # temporarily saves the index names
            index_names = np.array(result.index.names)

            # set index names to positions to avoid confusion
            result.index.names = np.arange(len(index_names))

            # place quantiles on the inside
            result = result.reorder_levels(order)

            # restore the index names in order
            result.index.names = index_names[order]

            # reorder rows to keep things sorted
            indices = np.arange(len(result)).reshape([len(q), self.ngroups]).T.flatten()
            return result.take(indices)

1954 

1955 @Substitution(name="groupby") 

1956 def ngroup(self, ascending: bool = True): 

1957 """ 

1958 Number each group from 0 to the number of groups - 1. 

1959 

1960 This is the enumerative complement of cumcount. Note that the 

1961 numbers given to the groups match the order in which the groups 

1962 would be seen when iterating over the groupby object, not the 

1963 order they are first observed. 

1964 

1965 Parameters 

1966 ---------- 

1967 ascending : bool, default True 

1968 If False, number in reverse, from number of groups - 1 to 0. 

1969 

1970 Returns 

1971 ------- 

1972 Series 

1973 Unique numbers for each group. 

1974 

1975 See Also 

1976 -------- 

1977 .cumcount : Number the rows in each group. 

1978 

1979 Examples 

1980 -------- 

1981 

1982 >>> df = pd.DataFrame({"A": list("aaabba")}) 

1983 >>> df 

1984 A 

1985 0 a 

1986 1 a 

1987 2 a 

1988 3 b 

1989 4 b 

1990 5 a 

1991 >>> df.groupby('A').ngroup() 

1992 0 0 

1993 1 0 

1994 2 0 

1995 3 1 

1996 4 1 

1997 5 0 

1998 dtype: int64 

1999 >>> df.groupby('A').ngroup(ascending=False) 

2000 0 1 

2001 1 1 

2002 2 1 

2003 3 0 

2004 4 0 

2005 5 1 

2006 dtype: int64 

2007 >>> df.groupby(["A", [1,1,2,3,2,1]]).ngroup() 

2008 0 0 

2009 1 0 

2010 2 1 

2011 3 3 

2012 4 2 

2013 5 0 

2014 dtype: int64 

2015 """ 

2016 

2017 with _group_selection_context(self): 

2018 index = self._selected_obj.index 

2019 result = Series(self.grouper.group_info[0], index) 

2020 if not ascending: 

2021 result = self.ngroups - 1 - result 

2022 return result 

2023 

2024 @Substitution(name="groupby") 

2025 def cumcount(self, ascending: bool = True): 

2026 """ 

2027 Number each item in each group from 0 to the length of that group - 1. 

2028 

2029 Essentially this is equivalent to 

2030 

2031 >>> self.apply(lambda x: pd.Series(np.arange(len(x)), x.index)) 

2032 

2033 Parameters 

2034 ---------- 

2035 ascending : bool, default True 

2036 If False, number in reverse, from length of group - 1 to 0. 

2037 

2038 Returns 

2039 ------- 

2040 Series 

2041 Sequence number of each element within each group. 

2042 

2043 See Also 

2044 -------- 

2045 .ngroup : Number the groups themselves. 

2046 

2047 Examples 

2048 -------- 

2049 

2050 >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']], 

2051 ... columns=['A']) 

2052 >>> df 

2053 A 

2054 0 a 

2055 1 a 

2056 2 a 

2057 3 b 

2058 4 b 

2059 5 a 

2060 >>> df.groupby('A').cumcount() 

2061 0 0 

2062 1 1 

2063 2 2 

2064 3 0 

2065 4 1 

2066 5 3 

2067 dtype: int64 

2068 >>> df.groupby('A').cumcount(ascending=False) 

2069 0 3 

2070 1 2 

2071 2 1 

2072 3 1 

2073 4 0 

2074 5 0 

2075 dtype: int64 

2076 """ 

2077 

2078 with _group_selection_context(self): 

2079 index = self._selected_obj.index 

2080 cumcounts = self._cumcount_array(ascending=ascending) 

2081 return Series(cumcounts, index) 

2082 

2083 @Substitution(name="groupby") 

2084 @Appender(_common_see_also) 

2085 def rank( 

2086 self, 

2087 method: str = "average", 

2088 ascending: bool = True, 

2089 na_option: str = "keep", 

2090 pct: bool = False, 

2091 axis: int = 0, 

2092 ): 

2093 """ 

2094 Provide the rank of values within each group. 

2095 

2096 Parameters 

2097 ---------- 

2098 method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' 

2099 * average: average rank of group. 

2100 * min: lowest rank in group. 

2101 * max: highest rank in group. 

2102 * first: ranks assigned in order they appear in the array. 

2103 * dense: like 'min', but rank always increases by 1 between groups. 

2104 ascending : bool, default True 

2105 False for ranks by high (1) to low (N). 

2106 na_option : {'keep', 'top', 'bottom'}, default 'keep' 

2107 * keep: leave NA values where they are. 

2108 * top: smallest rank if ascending. 

2109 * bottom: smallest rank if descending. 

2110 pct : bool, default False 

2111 Compute percentage rank of data within each group. 

2112 axis : int, default 0 

2113 The axis of the object over which to compute the rank. 

2114 

2115 Returns 

2116 ------- 

2117 DataFrame 
    Ranking of values within each group. 
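
 Examples
 --------
 A minimal sketch (the frame below is assumed for illustration):

 >>> df = pd.DataFrame({"group": ["a", "a", "a", "b", "b"],
 ...                    "value": [2, 4, 2, 3, 5]})
 >>> df.groupby("group")["value"].rank(method="min")
 0    1.0
 1    3.0
 2    1.0
 3    1.0
 4    2.0
 Name: value, dtype: float64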

2118 """ 

2119 if na_option not in {"keep", "top", "bottom"}: 

2120 msg = "na_option must be one of 'keep', 'top', or 'bottom'" 

2121 raise ValueError(msg) 

2122 return self._cython_transform( 

2123 "rank", 

2124 numeric_only=False, 

2125 ties_method=method, 

2126 ascending=ascending, 

2127 na_option=na_option, 

2128 pct=pct, 

2129 axis=axis, 

2130 ) 

2131 

2132 @Substitution(name="groupby") 

2133 @Appender(_common_see_also) 

2134 def cumprod(self, axis=0, *args, **kwargs): 

2135 """ 

2136 Cumulative product for each group. 

2137 

2138 Returns 

2139 ------- 

2140 Series or DataFrame 
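
 Examples
 --------
 A minimal sketch (the frame below is assumed for illustration):

 >>> df = pd.DataFrame({"key": ["a", "a", "a", "b", "b"],
 ...                    "val": [1, 2, 3, 2, 5]})
 >>> df.groupby("key")["val"].cumprod()
 0     1
 1     2
 2     6
 3     2
 4    10
 Name: val, dtype: int64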

2141 """ 

2142 nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"]) 

2143 if axis != 0: 

2144 return self.apply(lambda x: x.cumprod(axis=axis, **kwargs)) 

2145 

2146 return self._cython_transform("cumprod", **kwargs) 

2147 

2148 @Substitution(name="groupby") 

2149 @Appender(_common_see_also) 

2150 def cumsum(self, axis=0, *args, **kwargs): 

2151 """ 

2152 Cumulative sum for each group. 

2153 

2154 Returns 

2155 ------- 

2156 Series or DataFrame 
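
 Examples
 --------
 A minimal sketch (the frame below is assumed for illustration):

 >>> df = pd.DataFrame({"key": ["a", "a", "b", "b"],
 ...                    "val": [1, 2, 3, 4]})
 >>> df.groupby("key")["val"].cumsum()
 0    1
 1    3
 2    3
 3    7
 Name: val, dtype: int64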

2157 """ 

2158 nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"]) 

2159 if axis != 0: 

2160 return self.apply(lambda x: x.cumsum(axis=axis, **kwargs)) 

2161 

2162 return self._cython_transform("cumsum", **kwargs) 

2163 

2164 @Substitution(name="groupby") 

2165 @Appender(_common_see_also) 

2166 def cummin(self, axis=0, **kwargs): 

2167 """ 

2168 Cumulative min for each group. 

2169 

2170 Returns 

2171 ------- 

2172 Series or DataFrame 
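
 Examples
 --------
 A minimal sketch (the frame below is assumed for illustration):

 >>> df = pd.DataFrame({"key": ["a", "a", "a", "b", "b"],
 ...                    "val": [3, 1, 2, 4, 2]})
 >>> df.groupby("key")["val"].cummin()
 0    3
 1    1
 2    1
 3    4
 4    2
 Name: val, dtype: int64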

2173 """ 

2174 if axis != 0: 

2175 return self.apply(lambda x: np.minimum.accumulate(x, axis)) 

2176 

2177 return self._cython_transform("cummin", numeric_only=False) 

2178 

2179 @Substitution(name="groupby") 

2180 @Appender(_common_see_also) 

2181 def cummax(self, axis=0, **kwargs): 

2182 """ 

2183 Cumulative max for each group. 

2184 

2185 Returns 

2186 ------- 

2187 Series or DataFrame 

2188 """ 

2189 if axis != 0: 

2190 return self.apply(lambda x: np.maximum.accumulate(x, axis)) 

2191 

2192 return self._cython_transform("cummax", numeric_only=False) 

2193 

2194 def _get_cythonized_result( 

2195 self, 

2196 how: str, 

2197 cython_dtype: np.dtype, 

2198 aggregate: bool = False, 

2199 needs_values: bool = False, 

2200 needs_mask: bool = False, 

2201 needs_ngroups: bool = False, 

2202 result_is_index: bool = False, 

2203 pre_processing=None, 

2204 post_processing=None, 

2205 **kwargs, 

2206 ): 

2207 """ 

2208 Get result for Cythonized functions. 

2209 

2210 Parameters 

2211 ---------- 

2212 how : str 
    Cythonized function name to be called. 

2213 cython_dtype : np.dtype 

2214 Type of the array that will be modified by the Cython call. 

2215 aggregate : bool, default False 

2216 Whether the result should be aggregated to match the number of 

2217 groups 

2218 needs_values : bool, default False 

2219 Whether the values should be a part of the Cython call 

2220 signature 

2221 needs_mask : bool, default False 

2222 Whether boolean mask needs to be part of the Cython call 

2223 signature 

2224 needs_ngroups : bool, default False 

2225 Whether number of groups is part of the Cython call signature 

2226 result_is_index : bool, default False 

2227 Whether the result of the Cython operation is an index of 

2228 values to be retrieved, instead of the actual values themselves 

2229 pre_processing : function, default None 

2230 Function to be applied to `values` prior to passing to Cython. 

2231 Function should return a tuple where the first element is the 

2232 values to be passed to Cython and the second element is an optional 

2233 type which the values should be converted to after being returned 

2234 by the Cython operation. Raises if `needs_values` is False. 

2235 post_processing : function, default None 

2236 Function to be applied to result of Cython function. Should accept 

2237 an array of values as the first argument and type inferences as its 

2238 second argument, i.e. the signature should be 

2239 (ndarray, Type). 

2240 **kwargs : dict 

2241 Extra arguments to be passed back to Cython funcs 

2242 

2243 Returns 

2244 ------- 

2245 `Series` or `DataFrame` with filled values 
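
 Notes
 -----
 A ``pre_processing``/``post_processing`` pair typically round-trips a
 dtype through the float64 buffer. The helpers below are hypothetical,
 shown only to illustrate the documented contract:

 >>> def pre(vals):
 ...     # cast to the Cython buffer dtype, remember the original
 ...     return vals.astype(np.float64), vals.dtype
 >>> def post(vals, inference):
 ...     # restore the remembered dtype, if any
 ...     return vals.astype(inference) if inference else vals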

2246 """ 

2247 if result_is_index and aggregate: 

2248 raise ValueError("'result_is_index' and 'aggregate' cannot both be True!") 

2249 if post_processing: 

2250 if not callable(post_processing): 

2251 raise ValueError("'post_processing' must be a callable!") 

2252 if pre_processing: 

2253 if not callable(pre_processing): 

2254 raise ValueError("'pre_processing' must be a callable!") 

2255 if not needs_values: 

2256 raise ValueError( 

2257 "Cannot use 'pre_processing' without specifying 'needs_values'!" 

2258 ) 

2259 

2260 grouper = self.grouper 

2261 

2262 labels, _, ngroups = grouper.group_info 

2263 output: Dict[base.OutputKey, np.ndarray] = {} 

2264 base_func = getattr(libgroupby, how) 

2265 

2266 for idx, obj in enumerate(self._iterate_slices()): 

2267 name = obj.name 

2268 values = obj._data._values 

2269 

2270 if aggregate: 

2271 result_sz = ngroups 

2272 else: 

2273 result_sz = len(values) 

2274 

2275 result = np.zeros(result_sz, dtype=cython_dtype) 

2276 func = partial(base_func, result, labels) 

2277 inferences = None 

2278 

2279 if needs_values: 

2280 vals = values 

2281 if pre_processing: 

2282 vals, inferences = pre_processing(vals) 

2283 func = partial(func, vals) 

2284 

2285 if needs_mask: 

2286 mask = isna(values).view(np.uint8) 

2287 func = partial(func, mask) 

2288 

2289 if needs_ngroups: 

2290 func = partial(func, ngroups) 

2291 

2292 func(**kwargs) # Call func to modify indexer values in place 

2293 

2294 if result_is_index: 

2295 result = algorithms.take_nd(values, result) 

2296 

2297 if post_processing: 

2298 result = post_processing(result, inferences) 

2299 

2300 key = base.OutputKey(label=name, position=idx) 

2301 output[key] = result 

2302 

2303 if aggregate: 

2304 return self._wrap_aggregated_output(output) 

2305 else: 

2306 return self._wrap_transformed_output(output) 

2307 

2308 @Substitution(name="groupby") 

2309 @Appender(_common_see_also) 

2310 def shift(self, periods=1, freq=None, axis=0, fill_value=None): 

2311 """ 

2312 Shift each group by periods observations. 

2313 

2314 Parameters 

2315 ---------- 

2316 periods : int, default 1 

2317 Number of periods to shift. 

2318 freq : str, optional 
    Frequency string. 

2319 axis : int, default 0 
    Axis to shift. 

2320 fill_value : optional 
    The scalar value to use for newly introduced missing values. 

2321 

2322 .. versionadded:: 0.24.0 

2323 

2324 Returns 

2325 ------- 

2326 Series or DataFrame 

2327 Object shifted within each group. 
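
 Examples
 --------
 A minimal sketch (the frame below is assumed for illustration):

 >>> df = pd.DataFrame({"key": ["a", "a", "b", "b"],
 ...                    "val": [1, 2, 3, 4]})
 >>> df.groupby("key")["val"].shift(1)
 0    NaN
 1    1.0
 2    NaN
 3    3.0
 Name: val, dtype: float64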

2328 """ 

2329 

2330 if freq is not None or axis != 0 or not isna(fill_value): 

2331 return self.apply(lambda x: x.shift(periods, freq, axis, fill_value)) 

2332 

2333 return self._get_cythonized_result( 

2334 "group_shift_indexer", 

2335 cython_dtype=np.dtype(np.int64), 

2336 needs_ngroups=True, 

2337 result_is_index=True, 

2338 periods=periods, 

2339 ) 

2340 

2341 @Substitution(name="groupby") 

2342 @Appender(_common_see_also) 

2343 def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0): 

2344 """ 

2345 Calculate pct_change of each value to previous entry in group. 

2346 

2347 Returns 

2348 ------- 

2349 Series or DataFrame 

2350 Percentage changes within each group. 
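
 Examples
 --------
 A minimal sketch (the frame below is assumed for illustration); the
 first entry of each group has no previous value and is NaN:

 >>> df = pd.DataFrame({"key": ["a", "a", "a", "b", "b"],
 ...                    "val": [1.0, 2.0, 3.0, 4.0, 6.0]})
 >>> df.groupby("key")["val"].pct_change()
 0    NaN
 1    1.0
 2    0.5
 3    NaN
 4    0.5
 Name: val, dtype: float64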

2351 """ 

2352 if freq is not None or axis != 0: 

2353 return self.apply( 

2354 lambda x: x.pct_change( 

2355 periods=periods, 

2356 fill_method=fill_method, 

2357 limit=limit, 

2358 freq=freq, 

2359 axis=axis, 

2360 ) 

2361 ) 

2362 if fill_method is None: # GH30463 

2363 fill_method = "pad" 

2364 limit = 0 

2365 filled = getattr(self, fill_method)(limit=limit) 

2366 fill_grp = filled.groupby(self.grouper.codes) 

2367 shifted = fill_grp.shift(periods=periods, freq=freq) 

2368 return (filled / shifted) - 1 

2369 

2370 @Substitution(name="groupby") 

2371 @Substitution(see_also=_common_see_also) 

2372 def head(self, n=5): 

2373 """ 

2374 Return first n rows of each group. 

2375 

2376 Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows 

2377 from the original DataFrame with original index and order preserved 

2378 (``as_index`` flag is ignored). 

2379 

2380 Does not work for negative values of `n` (an empty subset is returned). 

2381 

2382 Returns 

2383 ------- 

2384 Series or DataFrame 

2385 %(see_also)s 

2386 Examples 

2387 -------- 

2388 

2389 >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], 

2390 ... columns=['A', 'B']) 

2391 >>> df.groupby('A').head(1) 

2392 A B 

2393 0 1 2 

2394 2 5 6 

2395 >>> df.groupby('A').head(-1) 

2396 Empty DataFrame 

2397 Columns: [A, B] 

2398 Index: [] 

2399 """ 

2400 self._reset_group_selection() 

2401 mask = self._cumcount_array() < n 

2402 return self._selected_obj[mask] 

2403 

2404 @Substitution(name="groupby") 

2405 @Substitution(see_also=_common_see_also) 

2406 def tail(self, n=5): 

2407 """ 

2408 Return last n rows of each group. 

2409 

2410 Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows 

2411 from the original DataFrame with original index and order preserved 

2412 (``as_index`` flag is ignored). 

2413 

2414 Does not work for negative values of `n` (an empty subset is returned). 

2415 

2416 Returns 

2417 ------- 

2418 Series or DataFrame 

2419 %(see_also)s 

2420 Examples 

2421 -------- 

2422 

2423 >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]], 

2424 ... columns=['A', 'B']) 

2425 >>> df.groupby('A').tail(1) 

2426 A B 

2427 1 a 2 

2428 3 b 2 

2429 >>> df.groupby('A').tail(-1) 

2430 Empty DataFrame 

2431 Columns: [A, B] 

2432 Index: [] 

2433 """ 

2434 self._reset_group_selection() 

2435 mask = self._cumcount_array(ascending=False) < n 

2436 return self._selected_obj[mask] 

2437 

2438 def _reindex_output( 

2439 self, output: FrameOrSeries, fill_value: Scalar = np.NaN 

2440 ) -> FrameOrSeries: 

2441 """ 

2442 If we have categorical groupers, then we might want to make sure that 

2443 we have a fully re-indexed output to the levels. This means expanding 

2444 the output space to accommodate all values in the cartesian product of 

2445 our groups, regardless of whether they were observed in the data or 

2446 not. This will expand the output space if there are missing groups. 

2447 

2448 The method returns early without modifying the input if the number of 

2449 groupings is less than 2, ``self.observed`` is True, or none of the groupers 

2450 are categorical. 

2451 

2452 Parameters 

2453 ---------- 

2454 output : Series or DataFrame 

2455 Object resulting from grouping and applying an operation. 

2456 fill_value : scalar, default np.NaN 

2457 Value to use for unobserved categories if self.observed is False. 

2458 

2459 Returns 

2460 ------- 

2461 Series or DataFrame 

2462 Object (potentially) re-indexed to include all possible groups. 
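
 Notes
 -----
 Illustrative only: with ``observed=False``, a categorical grouper
 alongside a second grouper yields the full cartesian product, with
 unobserved combinations filled in:

 >>> df = pd.DataFrame({
 ...     "cat": pd.Categorical(["a", "a"], categories=["a", "b"]),
 ...     "num": [1, 2],
 ...     "val": [10, 20],
 ... })
 >>> df.groupby(["cat", "num"])["val"].mean()
 cat  num
 a    1      10.0
      2      20.0
 b    1       NaN
      2       NaN
 Name: val, dtype: float64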

2463 """ 

2464 groupings = self.grouper.groupings 

2465 if groupings is None: 

2466 return output 

2467 elif len(groupings) == 1: 

2468 return output 

2469 

2470 # if we only care about the observed values 

2471 # we are done 

2472 elif self.observed: 

2473 return output 

2474 

2475 # reindexing only applies to a Categorical grouper 

2476 elif not any( 

2477 isinstance(ping.grouper, (Categorical, CategoricalIndex)) 

2478 for ping in groupings 

2479 ): 

2480 return output 

2481 

2482 levels_list = [ping.group_index for ping in groupings] 

2483 index, _ = MultiIndex.from_product( 

2484 levels_list, names=self.grouper.names 

2485 ).sortlevel() 

2486 

2487 if self.as_index: 

2488 d = { 

2489 self.obj._get_axis_name(self.axis): index, 

2490 "copy": False, 

2491 "fill_value": fill_value, 

2492 } 

2493 return output.reindex(**d) 

2494 

2495 # GH 13204 

2496 # Here, the categorical in-axis groupers, which need to be fully 

2497 # expanded, are columns in `output`. An idea is to do: 

2498 # output = output.set_index(self.grouper.names) 

2499 # .reindex(index).reset_index() 

2500 # but special care has to be taken because of possible not-in-axis 

2501 # groupers. 

2502 # So, we manually select and drop the in-axis grouper columns, 

2503 # reindex `output`, and then reset the in-axis grouper columns. 

2504 

2505 # Select in-axis groupers 

2506 in_axis_grps = ( 

2507 (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis 

2508 ) 

2509 g_nums, g_names = zip(*in_axis_grps) 

2510 

2511 output = output.drop(labels=list(g_names), axis=1) 

2512 

2513 # Set a temp index and reindex (possibly expanding) 

2514 output = output.set_index(self.grouper.result_index).reindex( 

2515 index, copy=False, fill_value=fill_value 

2516 ) 

2517 

2518 # Reset in-axis grouper columns 

2519 # (using level numbers `g_nums` because level names may not be unique) 

2520 output = output.reset_index(level=g_nums) 

2521 

2522 return output.reset_index(drop=True) 

2523 

2524 

2525GroupBy._add_numeric_operations() 

2526 

2527 

2528@Appender(GroupBy.__doc__) 

2529def get_groupby( 

2530 obj: NDFrame, 

2531 by: Optional[_KeysArgType] = None, 

2532 axis: int = 0, 

2533 level=None, 

2534 grouper: "Optional[ops.BaseGrouper]" = None, 

2535 exclusions=None, 

2536 selection=None, 

2537 as_index: bool = True, 

2538 sort: bool = True, 

2539 group_keys: bool = True, 

2540 squeeze: bool = False, 

2541 observed: bool = False, 

2542 mutated: bool = False, 

2543) -> GroupBy: 

2544 

2545 klass: Type[GroupBy] 

2546 if isinstance(obj, Series): 

2547 from pandas.core.groupby.generic import SeriesGroupBy 

2548 

2549 klass = SeriesGroupBy 

2550 elif isinstance(obj, DataFrame): 

2551 from pandas.core.groupby.generic import DataFrameGroupBy 

2552 

2553 klass = DataFrameGroupBy 

2554 else: 

2555 raise TypeError(f"invalid type: {obj}") 

2556 

2557 return klass( 

2558 obj=obj, 

2559 keys=by, 

2560 axis=axis, 

2561 level=level, 

2562 grouper=grouper, 

2563 exclusions=exclusions, 

2564 selection=selection, 

2565 as_index=as_index, 

2566 sort=sort, 

2567 group_keys=group_keys, 

2568 squeeze=squeeze, 

2569 observed=observed, 

2570 mutated=mutated, 

2571 )