1"""
2Provide the groupby split-apply-combine paradigm. Define the GroupBy
3class providing the base-class of operations.
5The SeriesGroupBy and DataFrameGroupBy sub-class
6(defined in pandas.core.groupby.generic)
7expose these user-facing objects to provide specific functionality.
8"""
10from contextlib import contextmanager
11import datetime
12from functools import partial, wraps
13import inspect
14import re
15import types
16from typing import (
17 Callable,
18 Dict,
19 FrozenSet,
20 Hashable,
21 Iterable,
22 List,
23 Mapping,
24 Optional,
25 Tuple,
26 Type,
27 Union,
28)

import numpy as np

from pandas._config.config import option_context

from pandas._libs import Timestamp
import pandas._libs.groupby as libgroupby
from pandas._typing import FrameOrSeries, Scalar
from pandas.compat import set_function_name
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
from pandas.util._decorators import Appender, Substitution, cache_readonly

from pandas.core.dtypes.cast import maybe_downcast_to_dtype
from pandas.core.dtypes.common import (
    ensure_float,
    is_categorical_dtype,
    is_datetime64_dtype,
    is_extension_array_dtype,
    is_integer_dtype,
    is_numeric_dtype,
    is_object_dtype,
    is_scalar,
)
from pandas.core.dtypes.missing import isna, notna

from pandas.core import nanops
import pandas.core.algorithms as algorithms
from pandas.core.arrays import Categorical, DatetimeArray, try_cast_to_ea
from pandas.core.base import DataError, PandasObject, SelectionMixin
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.groupby import base, ops
from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex
from pandas.core.series import Series
from pandas.core.sorting import get_group_index_sorter

_common_see_also = """
        See Also
        --------
        Series.%(name)s
        DataFrame.%(name)s
"""

_apply_docs = dict(
    template="""
    Apply function `func` group-wise and combine the results together.

    The function passed to `apply` must take a {input} as its first
    argument and return a DataFrame, Series or scalar. `apply` will
    then take care of combining the results back together into a single
    dataframe or series. `apply` is therefore a highly flexible
    grouping method.

    While `apply` is a very flexible method, its downside is that
    using it can be quite a bit slower than using more specific methods
    like `agg` or `transform`. Pandas offers a wide range of methods that
    will be much faster than using `apply` for their specific purposes, so
    try to use them before reaching for `apply`.

    Parameters
    ----------
    func : callable
        A callable that takes a {input} as its first argument, and
        returns a dataframe, a series or a scalar. In addition the
        callable may take positional and keyword arguments.
    args, kwargs : tuple and dict
        Optional positional and keyword arguments to pass to `func`.

    Returns
    -------
    applied : Series or DataFrame

    See Also
    --------
    pipe : Apply function to the full GroupBy object instead of to each
        group.
    aggregate : Apply aggregate function to the GroupBy object.
    transform : Apply function column-by-column to the GroupBy object.
    Series.apply : Apply a function to a Series.
    DataFrame.apply : Apply a function to each row or column of a DataFrame.
    """,
    dataframe_examples="""
    >>> df = pd.DataFrame({'A': 'a a b'.split(),
    ...                    'B': [1, 2, 3],
    ...                    'C': [4, 6, 5]})
    >>> g = df.groupby('A')

    Notice that ``g`` has two groups, ``a`` and ``b``.
    Calling `apply` in various ways, we can get different grouping results:

    Example 1: The function passed to `apply` takes a DataFrame as
    its argument and returns a DataFrame. `apply` combines the result for
    each group together into a new DataFrame:

    >>> g[['B', 'C']].apply(lambda x: x / x.sum())
              B    C
    0  0.333333  0.4
    1  0.666667  0.6
    2  1.000000  1.0

    Example 2: The function passed to `apply` takes a DataFrame as
    its argument and returns a Series. `apply` combines the result for
    each group together into a new DataFrame:

    >>> g[['B', 'C']].apply(lambda x: x.max() - x.min())
       B  C
    A
    a  1  2
    b  0  0

    Example 3: The function passed to `apply` takes a DataFrame as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g.apply(lambda x: x.C.max() - x.B.min())
    A
    a    5
    b    2
    dtype: int64
    """,
    series_examples="""
    >>> s = pd.Series([0, 1, 2], index='a a b'.split())
    >>> g = s.groupby(s.index)

    From ``s`` above we can see that ``g`` has two groups, ``a`` and ``b``.
    Calling `apply` in various ways, we can get different grouping results:

    Example 1: The function passed to `apply` takes a Series as
    its argument and returns a Series. `apply` combines the result for
    each group together into a new Series:

    >>> g.apply(lambda x: x*2 if x.name == 'b' else x/2)
    0    0.0
    1    0.5
    2    4.0
    dtype: float64

    Example 2: The function passed to `apply` takes a Series as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64

    Notes
    -----
    In the current implementation `apply` calls `func` twice on the
    first group to decide whether it can take a fast or slow code
    path. This can lead to unexpected behavior if `func` has
    side-effects, as they will take effect twice for the first
    group.

    Examples
    --------
    {examples}
    """,
)

_pipe_template = """
Apply a function `func` with arguments to this %(klass)s object and return
the function's result.

%(versionadded)s

Use `.pipe` when you want to improve readability by chaining together
functions that expect Series, DataFrames, GroupBy or Resampler objects.
Instead of writing

>>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c)

You can write

>>> (df.groupby('group')
...    .pipe(f)
...    .pipe(g, arg1=a)
...    .pipe(h, arg2=b, arg3=c))

which is much more readable.

Parameters
----------
func : callable or tuple of (callable, string)
    Function to apply to this %(klass)s object or, alternatively,
    a `(callable, data_keyword)` tuple where `data_keyword` is a
    string indicating the keyword of `callable` that expects the
    %(klass)s object.
args : iterable, optional
    Positional arguments passed into `func`.
kwargs : dict, optional
    A dictionary of keyword arguments passed into `func`.

Returns
-------
object : the return type of `func`.

See Also
--------
Series.pipe : Apply a function with arguments to a series.
DataFrame.pipe: Apply a function with arguments to a dataframe.
apply : Apply function to each group instead of to the
    full %(klass)s object.

Notes
-----
See more `here
<https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#piping-function-calls>`_

Examples
--------
%(examples)s
"""

_transform_template = """
Call function producing a like-indexed %(klass)s on each group and
return a %(klass)s having the same indexes as the original object
filled with the transformed values.

Parameters
----------
f : function
    Function to apply to each group.

Returns
-------
%(klass)s

See Also
--------
aggregate, transform

Notes
-----
Each group is endowed with the attribute 'name' in case you need to know
which group you are working on.

The current implementation imposes three requirements on f:

* f must return a value that either has the same shape as the input
  subframe or can be broadcast to the shape of the input subframe.
  For example, if `f` returns a scalar it will be broadcast to have the
  same shape as the input subframe.
* if this is a DataFrame, f must support application column-by-column
  in the subframe. If f also supports application to the entire subframe,
  then a fast path is used starting from the second chunk.
* f must not mutate groups. Mutation is not supported and may
  produce unexpected results.

Examples
--------

# Same shape
>>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
...                           'foo', 'bar'],
...                    'B' : ['one', 'one', 'two', 'three',
...                           'two', 'two'],
...                    'C' : [1, 5, 5, 2, 5, 5],
...                    'D' : [2.0, 5., 8., 1., 2., 9.]})
>>> grouped = df.groupby('A')
>>> grouped.transform(lambda x: (x - x.mean()) / x.std())
          C         D
0 -1.154701 -0.577350
1  0.577350  0.000000
2  0.577350  1.154701
3 -1.154701 -1.000000
4  0.577350 -0.577350
5  0.577350  1.000000

# Broadcastable
>>> grouped.transform(lambda x: x.max() - x.min())
   C    D
0  4  6.0
1  3  8.0
2  4  6.0
3  3  8.0
4  4  6.0
5  3  8.0
"""


class GroupByPlot(PandasObject):
    """
    Class implementing the .plot attribute for groupby objects.
    """

    def __init__(self, groupby):
        self._groupby = groupby

    def __call__(self, *args, **kwargs):
        def f(self):
            return self.plot(*args, **kwargs)

        f.__name__ = "plot"
        return self._groupby.apply(f)

    def __getattr__(self, name: str):
        def attr(*args, **kwargs):
            def f(self):
                return getattr(self.plot, name)(*args, **kwargs)

            return self._groupby.apply(f)

        return attr


@contextmanager
def _group_selection_context(groupby):
    """
    Set / reset the _group_selection_context.
    """
    groupby._set_group_selection()
    yield groupby
    groupby._reset_group_selection()
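

# Usage note (illustrative): methods that must see the full set of columns
# regardless of any prior group-based column selection wrap themselves in
# the context manager above, e.g., as ``GroupBy.var`` does further below:
#
#     with _group_selection_context(self):
#         return self._python_agg_general(f)
#
# which restores the previous group-selection state on exit.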


_KeysArgType = Union[
    Hashable,
    List[Hashable],
    Callable[[Hashable], Hashable],
    List[Callable[[Hashable], Hashable]],
    Mapping[Hashable, Hashable],
]


class _GroupBy(PandasObject, SelectionMixin):
    _group_selection = None
    _apply_whitelist: FrozenSet[str] = frozenset()

    def __init__(
        self,
        obj: NDFrame,
        keys: Optional[_KeysArgType] = None,
        axis: int = 0,
        level=None,
        grouper: "Optional[ops.BaseGrouper]" = None,
        exclusions=None,
        selection=None,
        as_index: bool = True,
        sort: bool = True,
        group_keys: bool = True,
        squeeze: bool = False,
        observed: bool = False,
        mutated: bool = False,
    ):

        self._selection = selection

        assert isinstance(obj, NDFrame), type(obj)
        obj._consolidate_inplace()

        self.level = level

        if not as_index:
            if not isinstance(obj, DataFrame):
                raise TypeError("as_index=False only valid with DataFrame")
            if axis != 0:
                raise ValueError("as_index=False only valid for axis=0")

        self.as_index = as_index
        self.keys = keys
        self.sort = sort
        self.group_keys = group_keys
        self.squeeze = squeeze
        self.observed = observed
        self.mutated = mutated

        if grouper is None:
            from pandas.core.groupby.grouper import get_grouper

            grouper, exclusions, obj = get_grouper(
                obj,
                keys,
                axis=axis,
                level=level,
                sort=sort,
                observed=observed,
                mutated=self.mutated,
            )

        self.obj = obj
        self.axis = obj._get_axis_number(axis)
        self.grouper = grouper
        self.exclusions = set(exclusions) if exclusions else set()

    def __len__(self) -> int:
        return len(self.groups)

    def __repr__(self) -> str:
        # TODO: Better repr for GroupBy object
        return object.__repr__(self)

    def _assure_grouper(self):
        """
        We create the grouper on instantiation; sub-classes may have a
        different policy.
        """
        pass

    @property
    def groups(self):
        """
        Dict {group name -> group labels}.
        """
        self._assure_grouper()
        return self.grouper.groups

    @property
    def ngroups(self):
        self._assure_grouper()
        return self.grouper.ngroups

    @property
    def indices(self):
        """
        Dict {group name -> group indices}.
        """
        self._assure_grouper()
        return self.grouper.indices

    def _get_indices(self, names):
        """
        Safely get multiple indices, translating keys for
        datelike values to their underlying repr.
        """

        def get_converter(s):
            # possibly convert to the actual key types
            # in the indices, could be a Timestamp or a np.datetime64
            if isinstance(s, datetime.datetime):
                return lambda key: Timestamp(key)
            elif isinstance(s, np.datetime64):
                return lambda key: Timestamp(key).asm8
            else:
                return lambda key: key

        if len(names) == 0:
            return []

        if len(self.indices) > 0:
            index_sample = next(iter(self.indices))
        else:
            index_sample = None  # Dummy sample

        name_sample = names[0]
        if isinstance(index_sample, tuple):
            if not isinstance(name_sample, tuple):
                msg = "must supply a tuple to get_group with multiple grouping keys"
                raise ValueError(msg)
            if not len(name_sample) == len(index_sample):
                try:
                    # If the original grouper was a tuple
                    return [self.indices[name] for name in names]
                except KeyError:
                    # turns out it wasn't a tuple
                    msg = (
                        "must supply a same-length tuple to get_group "
                        "with multiple grouping keys"
                    )
                    raise ValueError(msg)

            converters = [get_converter(s) for s in index_sample]
            names = (tuple(f(n) for f, n in zip(converters, name)) for name in names)

        else:
            converter = get_converter(index_sample)
            names = (converter(name) for name in names)

        return [self.indices.get(name, []) for name in names]

    def _get_index(self, name):
        """
        Safely get the index for a single name, translating datelike
        keys to their underlying repr.
        """
        return self._get_indices([name])[0]

    @cache_readonly
    def _selected_obj(self):
        # Note: _selected_obj is always just `self.obj` for SeriesGroupBy

        if self._selection is None or isinstance(self.obj, Series):
            if self._group_selection is not None:
                return self.obj[self._group_selection]
            return self.obj
        else:
            return self.obj[self._selection]

    def _reset_group_selection(self):
        """
        Clear group based selection.

        Used for methods needing to return info on each group regardless of
        whether a group selection was previously set.
        """
        if self._group_selection is not None:
            # GH12839 clear cached selection too when changing group selection
            self._group_selection = None
            self._reset_cache("_selected_obj")

    def _set_group_selection(self):
        """
        Create group based selection.

        Used when selection is not passed directly but instead via a grouper.

        NOTE: this should be paired with a call to _reset_group_selection
        """
        grp = self.grouper
        if not (
            self.as_index
            and getattr(grp, "groupings", None) is not None
            and self.obj.ndim > 1
            and self._group_selection is None
        ):
            return

        ax = self.obj._info_axis
        groupers = [g.name for g in grp.groupings if g.level is None and g.in_axis]

        if len(groupers):
            # GH12839 clear selected obj cache when group selection changes
            self._group_selection = ax.difference(Index(groupers), sort=False).tolist()
            self._reset_cache("_selected_obj")

    def _set_result_index_ordered(self, result):
        # set the result index on the passed values object and
        # return the new object, xref 8046

        # the values/counts are repeated according to the group index
        # shortcut if we have an already ordered grouper
        if not self.grouper.is_monotonic:
            index = Index(np.concatenate(self._get_indices(self.grouper.result_index)))
            result.set_axis(index, axis=self.axis, inplace=True)
            result = result.sort_index(axis=self.axis)

        result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
        return result

    def _dir_additions(self):
        return self.obj._dir_additions() | self._apply_whitelist

    def __getattr__(self, attr: str):
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        if attr in self.obj:
            return self[attr]

        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{attr}'"
        )

    @Substitution(
        klass="GroupBy",
        versionadded=".. versionadded:: 0.21.0",
        examples="""\
>>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
>>> df
   A  B
0  a  1
1  b  2
2  a  3
3  b  4

To get the difference between each group's maximum and minimum value in one
pass, you can do

>>> df.groupby('A').pipe(lambda x: x.max() - x.min())
   B
A
a  2
b  2""",
    )
    @Appender(_pipe_template)
    def pipe(self, func, *args, **kwargs):
        return com.pipe(self, func, *args, **kwargs)

    plot = property(GroupByPlot)

    def _make_wrapper(self, name):
        assert name in self._apply_whitelist

        self._set_group_selection()

        # need to set up the selection, as it is not passed
        # directly but comes in via the grouper
        f = getattr(self._selected_obj, name)
        if not isinstance(f, types.MethodType):
            return self.apply(lambda self: getattr(self, name))

        f = getattr(type(self._selected_obj), name)
        sig = inspect.signature(f)

        def wrapper(*args, **kwargs):
            # a little trickery for aggregation functions that need an axis
            # argument
            if "axis" in sig.parameters:
                if kwargs.get("axis", None) is None:
                    kwargs["axis"] = self.axis

            def curried(x):
                return f(x, *args, **kwargs)

            # preserve the name so we can detect it when calling plot methods,
            # to avoid duplicates
            curried.__name__ = name

            # special case otherwise extra plots are created when catching the
            # exception below
            if name in base.plotting_methods:
                return self.apply(curried)

            try:
                return self.apply(curried)
            except TypeError as err:
                if not re.search(
                    "reduction operation '.*' not allowed for this dtype", str(err)
                ):
                    # We don't have a cython implementation
                    # TODO: is the above comment accurate?
                    raise

            if self.obj.ndim == 1:
                # this can be called recursively, so need to raise ValueError
                raise ValueError

            # GH#3688 try to operate item-by-item
            result = self._aggregate_item_by_item(name, *args, **kwargs)
            return result

        wrapper.__name__ = name
        return wrapper

    def get_group(self, name, obj=None):
        """
        Construct DataFrame from group with provided name.

        Parameters
        ----------
        name : object
            The name of the group to get as a DataFrame.
        obj : DataFrame, default None
            The DataFrame from which to take the group. If it is None,
            the object on which groupby was called will be used.

        Returns
        -------
        group : same type as obj
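
        Examples
        --------
        A minimal illustration (hypothetical data):

        >>> df = pd.DataFrame({"A": ["x", "y"], "B": [1, 2]})
        >>> df.groupby("A").get_group("x")
           A  B
        0  x  1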
681 """
682 if obj is None:
683 obj = self._selected_obj
685 inds = self._get_index(name)
686 if not len(inds):
687 raise KeyError(name)
689 return obj._take_with_is_copy(inds, axis=self.axis)
691 def __iter__(self):
692 """
693 Groupby iterator.
695 Returns
696 -------
697 Generator yielding sequence of (name, subsetted object)
698 for each group
699 """
700 return self.grouper.get_iterator(self.obj, axis=self.axis)

    @Appender(
        _apply_docs["template"].format(
            input="dataframe", examples=_apply_docs["dataframe_examples"]
        )
    )
    def apply(self, func, *args, **kwargs):

        func = self._is_builtin_func(func)

        # this is needed so we don't try and wrap strings. If we could
        # resolve functions to their callable functions prior, this
        # wouldn't be needed
        if args or kwargs:
            if callable(func):

                @wraps(func)
                def f(g):
                    with np.errstate(all="ignore"):
                        return func(g, *args, **kwargs)

            elif hasattr(nanops, "nan" + func):
                # TODO: should we wrap this in to e.g. _is_builtin_func?
                f = getattr(nanops, "nan" + func)

            else:
                raise ValueError(
                    "func must be a callable if args or kwargs are supplied"
                )
        else:
            f = func

        # ignore SettingWithCopy here in case the user mutates
        with option_context("mode.chained_assignment", None):
            try:
                result = self._python_apply_general(f)
            except TypeError:
                # gh-20949
                # try again, with .apply acting as a filtering
                # operation, by excluding the grouping column
                # This would normally not be triggered
                # except if the udf is trying an operation that
                # fails on *some* columns, e.g. a numeric operation
                # on a string grouper column

                with _group_selection_context(self):
                    return self._python_apply_general(f)

        return result

    def _python_apply_general(self, f):
        keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis)

        return self._wrap_applied_output(
            keys, values, not_indexed_same=mutated or self.mutated
        )

    def _iterate_slices(self) -> Iterable[Series]:
        raise AbstractMethodError(self)

    def transform(self, func, *args, **kwargs):
        raise AbstractMethodError(self)

    def _cumcount_array(self, ascending: bool = True):
        """
        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from length of group - 1 to 0.

        Notes
        -----
        this is currently implementing sort=False
        (though the default is sort=True) for groupby in general
        """
        ids, _, ngroups = self.grouper.group_info
        sorter = get_group_index_sorter(ids, ngroups)
        ids, count = ids[sorter], len(ids)

        if count == 0:
            return np.empty(0, dtype=np.int64)

        run = np.r_[True, ids[:-1] != ids[1:]]
        rep = np.diff(np.r_[np.nonzero(run)[0], count])
        out = (~run).cumsum()
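        # `run` flags the first row of each group in the sorted order, `rep`
        # holds each run's length, and `out` numbers rows cumulatively across
        # runs; the branches below rebase each run so counting starts at 0
        # (or at group length - 1 when descending) within every group.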

        if ascending:
            out -= np.repeat(out[run], rep)
        else:
            out = np.repeat(out[np.r_[run[1:], True]], rep) - out

        rev = np.empty(count, dtype=np.intp)
        rev[sorter] = np.arange(count, dtype=np.intp)
        return out[rev].astype(np.int64, copy=False)

    def _try_cast(self, result, obj, numeric_only: bool = False):
        """
        Try to cast the result to our obj original type;
        we may have roundtripped through object in the meantime.

        If numeric_only is True, then only try to cast numerics
        and not datetimelikes.

        """
        if obj.ndim > 1:
            dtype = obj._values.dtype
        else:
            dtype = obj.dtype

        if not is_scalar(result):
            if (
                is_extension_array_dtype(dtype)
                and not is_categorical_dtype(dtype)
                and dtype.kind != "M"
            ):
                # We have to special case categorical so as not to upcast
                # things like counts back to categorical
                cls = dtype.construct_array_type()
                result = try_cast_to_ea(cls, result, dtype=dtype)

            elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
                result = maybe_downcast_to_dtype(result, dtype)

        return result

    def _transform_should_cast(self, func_nm: str) -> bool:
        """
        Parameters
        ----------
        func_nm: str
            The name of the aggregation function being performed

        Returns
        -------
        bool
            Whether transform should attempt to cast the result of aggregation
        """
        return (self.size().fillna(0) > 0).any() and (
            func_nm not in base.cython_cast_blacklist
        )

    def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs):
        output: Dict[base.OutputKey, np.ndarray] = {}
        for idx, obj in enumerate(self._iterate_slices()):
            name = obj.name
            is_numeric = is_numeric_dtype(obj.dtype)
            if numeric_only and not is_numeric:
                continue

            try:
                result, _ = self.grouper.transform(obj.values, how, **kwargs)
            except NotImplementedError:
                continue

            if self._transform_should_cast(how):
                result = self._try_cast(result, obj)

            key = base.OutputKey(label=name, position=idx)
            output[key] = result

        if len(output) == 0:
            raise DataError("No numeric types to aggregate")

        return self._wrap_transformed_output(output)

    def _wrap_aggregated_output(self, output: Mapping[base.OutputKey, np.ndarray]):
        raise AbstractMethodError(self)

    def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]):
        raise AbstractMethodError(self)

    def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False):
        raise AbstractMethodError(self)

    def _cython_agg_general(
        self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
    ):
        output: Dict[base.OutputKey, Union[np.ndarray, DatetimeArray]] = {}
        # Ideally we would be able to enumerate self._iterate_slices and use
        # the index from enumeration as the key of output, but ohlc in particular
        # returns a (n x 4) array. Output requires 1D ndarrays as values, so we
        # need to slice that up into 1D arrays
        idx = 0
        for obj in self._iterate_slices():
            name = obj.name
            is_numeric = is_numeric_dtype(obj.dtype)
            if numeric_only and not is_numeric:
                continue

            result, agg_names = self.grouper.aggregate(
                obj._values, how, min_count=min_count
            )

            if agg_names:
                # e.g. ohlc
                assert len(agg_names) == result.shape[1]
                for result_column, result_name in zip(result.T, agg_names):
                    key = base.OutputKey(label=result_name, position=idx)
                    output[key] = self._try_cast(result_column, obj)
                    idx += 1
            else:
                assert result.ndim == 1
                key = base.OutputKey(label=name, position=idx)
                output[key] = self._try_cast(result, obj)
                idx += 1

        if len(output) == 0:
            raise DataError("No numeric types to aggregate")

        return self._wrap_aggregated_output(output)

    def _python_agg_general(self, func, *args, **kwargs):
        func = self._is_builtin_func(func)
        f = lambda x: func(x, *args, **kwargs)

        # iterate through "columns" (excluding any exclusions) to populate
        # the output dict
        output: Dict[base.OutputKey, np.ndarray] = {}

        for idx, obj in enumerate(self._iterate_slices()):
            name = obj.name
            if self.grouper.ngroups == 0:
                # agg_series below assumes ngroups > 0
                continue

            try:
                # if this function is invalid for this dtype, we will ignore it.
                result, counts = self.grouper.agg_series(obj, f)
            except TypeError:
                continue

            assert result is not None
            key = base.OutputKey(label=name, position=idx)
            output[key] = self._try_cast(result, obj, numeric_only=True)

        if len(output) == 0:
            return self._python_apply_general(f)

        if self.grouper._filter_empty_groups:

            mask = counts.ravel() > 0
            for key, result in output.items():

                # since we are masking, make sure that we have a float object
                values = result
                if is_numeric_dtype(values.dtype):
                    values = ensure_float(values)

                output[key] = self._try_cast(values[mask], result)

        return self._wrap_aggregated_output(output)

    def _concat_objects(self, keys, values, not_indexed_same: bool = False):
        from pandas.core.reshape.concat import concat

        def reset_identity(values):
            # reset the identities of the components
            # of the values to prevent aliasing
            for v in com.not_none(*values):
                ax = v._get_axis(self.axis)
                ax._reset_identity()
            return values

        if not not_indexed_same:
            result = concat(values, axis=self.axis)
            ax = self._selected_obj._get_axis(self.axis)

            if isinstance(result, Series):
                result = result.reindex(ax)
            else:

                # this is a very unfortunate situation
                # we have a multi-index that is NOT lexsorted
                # and we have a result which is duplicated
                # we can't reindex, so we resort to this
                # GH 14776
                if isinstance(ax, MultiIndex) and not ax.is_unique:
                    indexer = algorithms.unique1d(
                        result.index.get_indexer_for(ax.values)
                    )
                    result = result.take(indexer, axis=self.axis)
                else:
                    result = result.reindex(ax, axis=self.axis)

        elif self.group_keys:

            values = reset_identity(values)
            if self.as_index:

                # possible MI return case
                group_keys = keys
                group_levels = self.grouper.levels
                group_names = self.grouper.names

                result = concat(
                    values,
                    axis=self.axis,
                    keys=group_keys,
                    levels=group_levels,
                    names=group_names,
                    sort=False,
                )
            else:

                # GH5610, returns a MI, with the first level being a
                # range index
                keys = list(range(len(values)))
                result = concat(values, axis=self.axis, keys=keys)
        else:
            values = reset_identity(values)
            result = concat(values, axis=self.axis)

        if isinstance(result, Series) and self._selection_name is not None:

            result.name = self._selection_name

        return result

    def _apply_filter(self, indices, dropna):
        if len(indices) == 0:
            indices = np.array([], dtype="int64")
        else:
            indices = np.sort(np.concatenate(indices))
        if dropna:
            filtered = self._selected_obj.take(indices, axis=self.axis)
        else:
            mask = np.empty(len(self._selected_obj.index), dtype=bool)
            mask.fill(False)
            mask[indices.astype(int)] = True
            # mask fails to broadcast when passed to where; broadcast manually.
            mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
            filtered = self._selected_obj.where(mask)  # Fill with NaNs.
        return filtered


class GroupBy(_GroupBy):
    """
    Class for grouping and aggregating relational data.

    See aggregate, transform, and apply functions on this object.

    It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:

    ::

        grouped = groupby(obj, ...)

    Parameters
    ----------
    obj : pandas object
    axis : int, default 0
    level : int, default None
        Level of MultiIndex
    groupings : list of Grouping objects
        Most users should ignore this
    exclusions : array-like, optional
        List of columns to exclude
    name : str
        Most users should ignore this

    Returns
    -------
    **Attributes**
    groups : dict
        {group name -> group labels}
    len(grouped) : int
        Number of groups

    Notes
    -----
    After grouping, see aggregate, apply, and transform functions. Here are
    some other brief notes about usage. When grouping by multiple groups, the
    result index will be a MultiIndex (hierarchical) by default.

    Iteration produces (key, group) tuples, i.e. chunking the data by group. So
    you can write code like:

    ::

        grouped = obj.groupby(keys, axis=axis)
        for key, group in grouped:
            # do something with the data

    Function calls on GroupBy, if not specially implemented, "dispatch" to the
    grouped data. So if you group a DataFrame and wish to invoke the std()
    method on each group, you can simply do:

    ::

        df.groupby(mapper).std()

    rather than

    ::

        df.groupby(mapper).aggregate(np.std)

    You can pass arguments to these "wrapped" functions, too.

    See the online documentation for full exposition on these topics and much
    more.
    """

    def _bool_agg(self, val_test, skipna):
        """
        Shared func to call any / all Cython GroupBy implementations.
        """

        def objs_to_bool(vals: np.ndarray) -> Tuple[np.ndarray, Type]:
            if is_object_dtype(vals):
                vals = np.array([bool(x) for x in vals])
            else:
                vals = vals.astype(np.bool)

            return vals.view(np.uint8), np.bool

        def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray:
            return result.astype(inference, copy=False)

        return self._get_cythonized_result(
            "group_any_all",
            aggregate=True,
            cython_dtype=np.dtype(np.uint8),
            needs_values=True,
            needs_mask=True,
            pre_processing=objs_to_bool,
            post_processing=result_to_bool,
            val_test=val_test,
            skipna=skipna,
        )

    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def any(self, skipna: bool = True):
        """
        Return True if any value in the group is truthy, else False.

        Parameters
        ----------
        skipna : bool, default True
            Flag to ignore nan values during truth testing.

        Returns
        -------
        bool
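
        Examples
        --------
        A minimal illustration (hypothetical data):

        >>> df = pd.DataFrame({"A": ["x", "x", "y"], "B": [0, 1, 0]})
        >>> df.groupby("A").any()
               B
        A
        x   True
        y  False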
1145 """
1146 return self._bool_agg("any", skipna)
1148 @Substitution(name="groupby")
1149 @Appender(_common_see_also)
1150 def all(self, skipna: bool = True):
1151 """
1152 Return True if all values in the group are truthful, else False.
1154 Parameters
1155 ----------
1156 skipna : bool, default True
1157 Flag to ignore nan values during truth testing.
1159 Returns
1160 -------
1161 bool
1162 """
1163 return self._bool_agg("all", skipna)

    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def count(self):
        """
        Compute count of group, excluding missing values.

        Returns
        -------
        Series or DataFrame
            Count of values within each group.
        """

        # defined here for API doc
        raise NotImplementedError

    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def mean(self, *args, **kwargs):
        """
        Compute mean of groups, excluding missing values.

        Returns
        -------
        pandas.Series or pandas.DataFrame
        %(see_also)s
        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
        ...                    'B': [np.nan, 2, 3, 4, 5],
        ...                    'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])

        Groupby one column and return the mean of the remaining columns in
        each group.

        >>> df.groupby('A').mean()
             B         C
        A
        1  3.0  1.333333
        2  4.0  1.500000

        Groupby two columns and return the mean of the remaining column.

        >>> df.groupby(['A', 'B']).mean()
               C
        A B
        1 2.0  2
          4.0  1
        2 3.0  1
          5.0  2

        Groupby one column and return the mean of only a particular column
        in the group.

        >>> df.groupby('A')['B'].mean()
        A
        1    3.0
        2    4.0
        Name: B, dtype: float64
        """
        nv.validate_groupby_func("mean", args, kwargs, ["numeric_only"])
        return self._cython_agg_general(
            "mean", alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs
        )

    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def median(self, **kwargs):
        """
        Compute median of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex

        Returns
        -------
        Series or DataFrame
            Median of values within each group.
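
        Examples
        --------
        A minimal illustration (hypothetical data):

        >>> df = pd.DataFrame({"A": ["x", "x", "y"], "B": [1.0, 3.0, 4.0]})
        >>> df.groupby("A").median()
             B
        A
        x  2.0
        y  4.0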
1241 """
1242 return self._cython_agg_general(
1243 "median",
1244 alt=lambda x, axis: Series(x).median(axis=axis, **kwargs),
1245 **kwargs,
1246 )

    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def std(self, ddof: int = 1, *args, **kwargs):
        """
        Compute standard deviation of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Parameters
        ----------
        ddof : int, default 1
            Degrees of freedom.

        Returns
        -------
        Series or DataFrame
            Standard deviation of values within each group.
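
        Examples
        --------
        A minimal illustration (hypothetical data):

        >>> df = pd.DataFrame({"A": ["x", "x", "y", "y"],
        ...                    "B": [1.0, 3.0, 5.0, 9.0]})
        >>> df.groupby("A").std()
                  B
        A
        x  1.414214
        y  2.828427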
1265 """
1267 # TODO: implement at Cython level?
1268 nv.validate_groupby_func("std", args, kwargs)
1269 return np.sqrt(self.var(ddof=ddof, **kwargs))

    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def var(self, ddof: int = 1, *args, **kwargs):
        """
        Compute variance of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Parameters
        ----------
        ddof : int, default 1
            Degrees of freedom.

        Returns
        -------
        Series or DataFrame
            Variance of values within each group.
        """
        nv.validate_groupby_func("var", args, kwargs)
        if ddof == 1:
            return self._cython_agg_general(
                "var", alt=lambda x, axis: Series(x).var(ddof=ddof, **kwargs), **kwargs
            )
        else:
            f = lambda x: x.var(ddof=ddof, **kwargs)
            with _group_selection_context(self):
                return self._python_agg_general(f)

    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def sem(self, ddof: int = 1):
        """
        Compute standard error of the mean of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Parameters
        ----------
        ddof : int, default 1
            Degrees of freedom.

        Returns
        -------
        Series or DataFrame
            Standard error of the mean of values within each group.
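
        Examples
        --------
        A minimal sketch (hypothetical data); per group, ``sem`` equals
        ``std / sqrt(count)``:

        >>> df = pd.DataFrame({"A": ["x", "x", "y", "y"],
        ...                    "B": [1.0, 3.0, 5.0, 9.0]})
        >>> df.groupby("A")["B"].sem()
        A
        x    1.0
        y    2.0
        Name: B, dtype: float64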
1316 """
1317 return self.std(ddof=ddof) / np.sqrt(self.count())

    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def size(self):
        """
        Compute group sizes.

        Returns
        -------
        Series
            Number of rows in each group.
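
        Examples
        --------
        A minimal illustration (hypothetical data):

        >>> df = pd.DataFrame({"A": ["x", "x", "y"]})
        >>> df.groupby("A").size()
        A
        x    2
        y    1
        dtype: int64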
1329 """
1330 result = self.grouper.size()
1332 if isinstance(self.obj, Series):
1333 result.name = self.obj.name
1334 return self._reindex_output(result, fill_value=0)

    @classmethod
    def _add_numeric_operations(cls):
        """
        Add numeric operations to the GroupBy generically.
        """

        def groupby_function(
            name: str,
            alias: str,
            npfunc,
            numeric_only: bool = True,
            min_count: int = -1,
        ):

            _local_template = """
            Compute %(f)s of group values.

            Returns
            -------
            Series or DataFrame
                Computed %(f)s of values within each group.
            """

            @Substitution(name="groupby", f=name)
            @Appender(_common_see_also)
            @Appender(_local_template)
            def f(self, **kwargs):
                if "numeric_only" not in kwargs:
                    kwargs["numeric_only"] = numeric_only
                if "min_count" not in kwargs:
                    kwargs["min_count"] = min_count

                self._set_group_selection()

                # try a cython aggregation if we can
                try:
                    return self._cython_agg_general(alias, alt=npfunc, **kwargs)
                except DataError:
                    pass
                except NotImplementedError as err:
                    if "function is not implemented for this dtype" in str(
                        err
                    ) or "category dtype not supported" in str(err):
                        # raised in _get_cython_function, in some cases can
                        # be trimmed by implementing cython funcs for more dtypes
                        pass
                    else:
                        raise

                # apply a non-cython aggregation
                result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
                return result

            set_function_name(f, name, cls)

            return f

        def first_compat(x, axis=0):
            def first(x):
                x = x.to_numpy()

                x = x[notna(x)]
                if len(x) == 0:
                    return np.nan
                return x[0]

            if isinstance(x, DataFrame):
                return x.apply(first, axis=axis)
            else:
                return first(x)

        def last_compat(x, axis=0):
            def last(x):
                x = x.to_numpy()

                x = x[notna(x)]
                if len(x) == 0:
                    return np.nan
                return x[-1]

            if isinstance(x, DataFrame):
                return x.apply(last, axis=axis)
            else:
                return last(x)

        cls.sum = groupby_function("sum", "add", np.sum, min_count=0)
        cls.prod = groupby_function("prod", "prod", np.prod, min_count=0)
        cls.min = groupby_function("min", "min", np.min, numeric_only=False)
        cls.max = groupby_function("max", "max", np.max, numeric_only=False)
        cls.first = groupby_function("first", "first", first_compat, numeric_only=False)
        cls.last = groupby_function("last", "last", last_compat, numeric_only=False)

    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def ohlc(self) -> DataFrame:
        """
        Compute open, high, low and close values of a group, excluding
        missing values.

        For multiple groupings, the result index will be a MultiIndex

        Returns
        -------
        DataFrame
            Open, high, low and close values within each group.
        """

        return self._apply_to_column_groupbys(lambda x: x._cython_agg_general("ohlc"))

    @Appender(DataFrame.describe.__doc__)
    def describe(self, **kwargs):
        with _group_selection_context(self):
            result = self.apply(lambda x: x.describe(**kwargs))
            if self.axis == 1:
                return result.T
            return result.unstack()

    def resample(self, rule, *args, **kwargs):
        """
        Provide resampling when using a TimeGrouper.

        Given a grouper, the function resamples it according to a
        frequency string.

        See the :ref:`frequency aliases <timeseries.offset_aliases>`
        documentation for more details.

        Parameters
        ----------
        rule : str or DateOffset
            The offset string or object representing target grouper conversion.
        *args, **kwargs
            Possible arguments are `how`, `fill_method`, `limit`, `kind` and
            `on`, and other arguments of `TimeGrouper`.

        Returns
        -------
        Grouper
            Return a new grouper with our resampler appended.

        See Also
        --------
        Grouper : Specify a frequency to resample with when
            grouping by a key.
        DatetimeIndex.resample : Frequency conversion and resampling of
            time series.

        Examples
        --------
        >>> idx = pd.date_range('1/1/2000', periods=4, freq='T')
        >>> df = pd.DataFrame(data=4 * [range(2)],
        ...                   index=idx,
        ...                   columns=['a', 'b'])
        >>> df.iloc[2, 0] = 5
        >>> df
                             a  b
        2000-01-01 00:00:00  0  1
        2000-01-01 00:01:00  0  1
        2000-01-01 00:02:00  5  1
        2000-01-01 00:03:00  0  1

        Downsample the DataFrame into 3 minute bins and sum the values of
        the timestamps falling into a bin.

        >>> df.groupby('a').resample('3T').sum()
                                 a  b
        a
        0   2000-01-01 00:00:00  0  2
            2000-01-01 00:03:00  0  1
        5   2000-01-01 00:00:00  5  1

        Upsample the series into 30 second bins.

        >>> df.groupby('a').resample('30S').sum()
                                 a  b
        a
        0   2000-01-01 00:00:00  0  1
            2000-01-01 00:00:30  0  0
            2000-01-01 00:01:00  0  1
            2000-01-01 00:01:30  0  0
            2000-01-01 00:02:00  0  0
            2000-01-01 00:02:30  0  0
            2000-01-01 00:03:00  0  1
        5   2000-01-01 00:02:00  5  1

        Resample by month. Values are assigned to the month of the period.

        >>> df.groupby('a').resample('M').sum()
                       a  b
        a
        0   2000-01-31  0  3
        5   2000-01-31  5  1

        Downsample the series into 3 minute bins as above, but close the right
        side of the bin interval.

        >>> df.groupby('a').resample('3T', closed='right').sum()
                                 a  b
        a
        0   1999-12-31 23:57:00  0  1
            2000-01-01 00:00:00  0  2
        5   2000-01-01 00:00:00  5  1

        Downsample the series into 3 minute bins and close the right side of
        the bin interval, but label each bin using the right edge instead of
        the left.

        >>> df.groupby('a').resample('3T', closed='right', label='right').sum()
                                 a  b
        a
        0   2000-01-01 00:00:00  0  1
            2000-01-01 00:03:00  0  2
        5   2000-01-01 00:03:00  5  1

        Add an offset of twenty seconds.

        >>> df.groupby('a').resample('3T', loffset='20s').sum()
                                 a  b
        a
        0   2000-01-01 00:00:20  0  2
            2000-01-01 00:03:20  0  1
        5   2000-01-01 00:00:20  5  1
        """
        from pandas.core.resample import get_resampler_for_grouping

        return get_resampler_for_grouping(self, rule, *args, **kwargs)

    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def rolling(self, *args, **kwargs):
        """
        Return a rolling grouper, providing rolling functionality per group.
        """
        from pandas.core.window import RollingGroupby

        return RollingGroupby(self, *args, **kwargs)

    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def expanding(self, *args, **kwargs):
        """
        Return an expanding grouper, providing expanding
        functionality per group.
        """
        from pandas.core.window import ExpandingGroupby

        return ExpandingGroupby(self, *args, **kwargs)

    def _fill(self, direction, limit=None):
        """
        Shared function for `pad` and `backfill` to call Cython method.

        Parameters
        ----------
        direction : {'ffill', 'bfill'}
            Direction passed to underlying Cython function. `bfill` will cause
            values to be filled backwards. `ffill` and any other values will
            default to a forward fill
        limit : int, default None
            Maximum number of consecutive values to fill. If `None`, this
            method will convert to -1 prior to passing to Cython

        Returns
        -------
        `Series` or `DataFrame` with filled values

        See Also
        --------
        pad
        backfill
        """
        # Need int value for Cython
        if limit is None:
            limit = -1

        return self._get_cythonized_result(
            "group_fillna_indexer",
            needs_mask=True,
            cython_dtype=np.dtype(np.int64),
            result_is_index=True,
            direction=direction,
            limit=limit,
        )

    @Substitution(name="groupby")
    def pad(self, limit=None):
        """
        Forward fill the values.

        Parameters
        ----------
        limit : int, optional
            Limit of how many values to fill.

        Returns
        -------
        Series or DataFrame
            Object with missing values filled.

        See Also
        --------
        Series.pad
        DataFrame.pad
        Series.fillna
        DataFrame.fillna
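
        Examples
        --------
        A minimal illustration (hypothetical data); filling does not cross
        group boundaries:

        >>> df = pd.DataFrame({"A": ["x", "x", "y"],
        ...                    "B": [1.0, np.nan, np.nan]})
        >>> df.groupby("A").ffill()
             B
        0  1.0
        1  1.0
        2  NaN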
1639 """
1640 return self._fill("ffill", limit=limit)
1642 ffill = pad
1644 @Substitution(name="groupby")
1645 def backfill(self, limit=None):
1646 """
1647 Backward fill the values.
1649 Parameters
1650 ----------
1651 limit : int, optional
1652 Limit of how many values to fill.
1654 Returns
1655 -------
1656 Series or DataFrame
1657 Object with missing values filled.
1659 See Also
1660 --------
1661 Series.backfill
1662 DataFrame.backfill
1663 Series.fillna
1664 DataFrame.fillna
1665 """
1666 return self._fill("bfill", limit=limit)
1668 bfill = backfill

    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFrame:
        """
        Take the nth row from each group if n is an int, or a subset of rows
        if n is a list of ints.

        If ``dropna``, will take the nth non-null row; ``dropna`` is either
        'all' or 'any', and is equivalent to calling dropna(how=dropna)
        before the groupby.

        Parameters
        ----------
        n : int or list of ints
            A single nth value for the row or a list of nth values.
        dropna : None or str, optional
            Apply the specified dropna operation before counting which row is
            the nth row. Needs to be None, 'any' or 'all'.

        Returns
        -------
        Series or DataFrame
            N-th value within each group.
        %(see_also)s
        Examples
        --------

        >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
        ...                    'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])
        >>> g = df.groupby('A')
        >>> g.nth(0)
             B
        A
        1  NaN
        2  3.0
        >>> g.nth(1)
             B
        A
        1  2.0
        2  5.0
        >>> g.nth(-1)
             B
        A
        1  4.0
        2  5.0
        >>> g.nth([0, 1])
             B
        A
        1  NaN
        1  2.0
        2  3.0
        2  5.0

        Specifying `dropna` allows the count to ignore ``NaN``

        >>> g.nth(0, dropna='any')
             B
        A
        1  2.0
        2  3.0

        NaNs denote that the group was exhausted when using dropna

        >>> g.nth(3, dropna='any')
            B
        A
        1 NaN
        2 NaN

        Specifying `as_index=False` in `groupby` keeps the original index.

        >>> df.groupby('A', as_index=False).nth(1)
           A    B
        1  1  2.0
        4  2  5.0
        """

        valid_containers = (set, list, tuple)
        if not isinstance(n, (valid_containers, int)):
            raise TypeError("n needs to be an int or a list/set/tuple of ints")

        if not dropna:

            if isinstance(n, int):
                nth_values = [n]
            elif isinstance(n, valid_containers):
                nth_values = list(set(n))

            nth_array = np.array(nth_values, dtype=np.intp)
            self._set_group_selection()
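
            # Non-negative nth values are matched against the forward
            # cumcount; negative values are matched against the reversed
            # cumcount counted from the end of each group.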
            mask_left = np.in1d(self._cumcount_array(), nth_array)
            mask_right = np.in1d(self._cumcount_array(ascending=False) + 1, -nth_array)
            mask = mask_left | mask_right

            ids, _, _ = self.grouper.group_info

            # Drop NA values in grouping
            mask = mask & (ids != -1)

            out = self._selected_obj[mask]
            if not self.as_index:
                return out

            result_index = self.grouper.result_index
            out.index = result_index[ids[mask]]

            if not self.observed and isinstance(result_index, CategoricalIndex):
                out = out.reindex(result_index)

            out = self._reindex_output(out)
            return out.sort_index() if self.sort else out

        # dropna is truthy
        if isinstance(n, valid_containers):
            raise ValueError("dropna option with a list of nth values is not supported")

        if dropna not in ["any", "all"]:
            # Note: when agg-ing picker doesn't raise this, just returns NaN
            raise ValueError(
                "For a DataFrame groupby, dropna must be "
                "either None, 'any' or 'all', "
                f"(was passed {dropna})."
            )

        # old behaviour, but with all and any support for DataFrames.
        # modified in GH 7559 to have better perf
        max_len = n if n >= 0 else -1 - n
        dropped = self.obj.dropna(how=dropna, axis=self.axis)

        # get a new grouper for our dropped obj
        if self.keys is None and self.level is None:

            # we don't have the grouper info available
            # (e.g. we have selected out
            # a column that is not in the current object)
            axis = self.grouper.axis
            grouper = axis[axis.isin(dropped.index)]

        else:

            # create a grouper with the original parameters, but on dropped
            # object
            from pandas.core.groupby.grouper import get_grouper

            grouper, _, _ = get_grouper(
                dropped,
                key=self.keys,
                axis=self.axis,
                level=self.level,
                sort=self.sort,
                mutated=self.mutated,
            )

        grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort)
        sizes, result = grb.size(), grb.nth(n)
        mask = (sizes < max_len).values

        # set the results which don't meet the criteria
        if len(result) and mask.any():
            result.loc[mask] = np.nan

        # reset/reindex to the original groups
        if len(self.obj) == len(dropped) or len(result) == len(
            self.grouper.result_index
        ):
            result.index = self.grouper.result_index
        else:
            result = result.reindex(self.grouper.result_index)

        return result

    def quantile(self, q=0.5, interpolation: str = "linear"):
        """
        Return group values at the given quantile, a la numpy.percentile.

        Parameters
        ----------
        q : float or array-like, default 0.5 (50% quantile)
            Value(s) between 0 and 1 providing the quantile(s) to compute.
        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
            Method to use when the desired quantile falls between two points.

        Returns
        -------
        Series or DataFrame
            Return type determined by caller of GroupBy object.

        See Also
        --------
        Series.quantile : Similar method for Series.
        DataFrame.quantile : Similar method for DataFrame.
        numpy.percentile : NumPy method to compute qth percentile.

        Examples
        --------
        >>> df = pd.DataFrame([
        ...     ['a', 1], ['a', 2], ['a', 3],
        ...     ['b', 1], ['b', 3], ['b', 5]
        ... ], columns=['key', 'val'])
        >>> df.groupby('key').quantile()
             val
        key
        a    2.0
        b    3.0
        """
        from pandas import concat

        def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]:
            if is_object_dtype(vals):
                raise TypeError(
                    "'quantile' cannot be performed against 'object' dtypes!"
                )

            inference = None
            if is_integer_dtype(vals):
                inference = np.int64
            elif is_datetime64_dtype(vals):
                inference = "datetime64[ns]"
                vals = vals.astype(np.float)

            return vals, inference

        def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray:
            if inference:
                # Check for edge case
                if not (
                    is_integer_dtype(inference)
                    and interpolation in {"linear", "midpoint"}
                ):
                    vals = vals.astype(inference)

            return vals

        if is_scalar(q):
            return self._get_cythonized_result(
                "group_quantile",
                aggregate=True,
                needs_values=True,
                needs_mask=True,
                cython_dtype=np.dtype(np.float64),
                pre_processing=pre_processor,
                post_processing=post_processor,
                q=q,
                interpolation=interpolation,
            )
        else:
            results = [
                self._get_cythonized_result(
                    "group_quantile",
                    aggregate=True,
                    needs_values=True,
                    needs_mask=True,
                    cython_dtype=np.dtype(np.float64),
                    pre_processing=pre_processor,
                    post_processing=post_processor,
                    q=qi,
                    interpolation=interpolation,
                )
                for qi in q
            ]
            result = concat(results, axis=0, keys=q)
            # fix levels to place quantiles on the inside
            # TODO(GH-10710): Ideally, we could write this as
            #  >>> result.stack(0).loc[pd.IndexSlice[:, ..., q], :]
            #  but this hits https://github.com/pandas-dev/pandas/issues/10710
            #  which doesn't reorder the list-like `q` on the inner level.
            order = list(range(1, result.index.nlevels)) + [0]

            # temporarily saves the index names
            index_names = np.array(result.index.names)

            # set index names to positions to avoid confusion
            result.index.names = np.arange(len(index_names))

            # place quantiles on the inside
            result = result.reorder_levels(order)

            # restore the index names in order
            result.index.names = index_names[order]

            # reorder rows to keep things sorted
            indices = np.arange(len(result)).reshape([len(q), self.ngroups]).T.flatten()
            return result.take(indices)

    @Substitution(name="groupby")
    def ngroup(self, ascending: bool = True):
        """
        Number each group from 0 to the number of groups - 1.

        This is the enumerative complement of cumcount. Note that the
        numbers given to the groups match the order in which the groups
        would be seen when iterating over the groupby object, not the
        order they are first observed.

        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from number of group - 1 to 0.

        Returns
        -------
        Series
            Unique numbers for each group.

        See Also
        --------
        .cumcount : Number the rows in each group.

        Examples
        --------

        >>> df = pd.DataFrame({"A": list("aaabba")})
        >>> df
           A
        0  a
        1  a
        2  a
        3  b
        4  b
        5  a
        >>> df.groupby('A').ngroup()
        0    0
        1    0
        2    0
        3    1
        4    1
        5    0
        dtype: int64
        >>> df.groupby('A').ngroup(ascending=False)
        0    1
        1    1
        2    1
        3    0
        4    0
        5    1
        dtype: int64
        >>> df.groupby(["A", [1,1,2,3,2,1]]).ngroup()
        0    0
        1    0
        2    1
        3    3
        4    2
        5    0
        dtype: int64
        """

        with _group_selection_context(self):
            index = self._selected_obj.index
            result = Series(self.grouper.group_info[0], index)
            if not ascending:
                result = self.ngroups - 1 - result
            return result

    @Substitution(name="groupby")
    def cumcount(self, ascending: bool = True):
        """
        Number each item in each group from 0 to the length of that group - 1.

        Essentially this is equivalent to

        >>> self.apply(lambda x: pd.Series(np.arange(len(x)), x.index))

        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from length of group - 1 to 0.

        Returns
        -------
        Series
            Sequence number of each element within each group.

        See Also
        --------
        .ngroup : Number the groups themselves.

        Examples
        --------

        >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
        ...                   columns=['A'])
        >>> df
           A
        0  a
        1  a
        2  a
        3  b
        4  b
        5  a
        >>> df.groupby('A').cumcount()
        0    0
        1    1
        2    2
        3    0
        4    1
        5    3
        dtype: int64
        >>> df.groupby('A').cumcount(ascending=False)
        0    3
        1    2
        2    1
        3    1
        4    0
        5    0
        dtype: int64
        """

        with _group_selection_context(self):
            index = self._selected_obj.index
            cumcounts = self._cumcount_array(ascending=ascending)
            return Series(cumcounts, index)

    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def rank(
        self,
        method: str = "average",
        ascending: bool = True,
        na_option: str = "keep",
        pct: bool = False,
        axis: int = 0,
    ):
        """
        Provide the rank of values within each group.

        Parameters
        ----------
        method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
            * average: average rank of group.
            * min: lowest rank in group.
            * max: highest rank in group.
            * first: ranks assigned in order they appear in the array.
            * dense: like 'min', but rank always increases by 1 between groups.
        ascending : bool, default True
            False for ranks by high (1) to low (N).
        na_option : {'keep', 'top', 'bottom'}, default 'keep'
            * keep: leave NA values where they are.
            * top: smallest rank if ascending.
            * bottom: smallest rank if descending.
        pct : bool, default False
            Compute percentage rank of data within each group.
        axis : int, default 0
            The axis of the object over which to compute the rank.

        Returns
        -------
        DataFrame with ranking of values within each group
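
        Examples
        --------
        A small sketch (hypothetical data); ranks are computed within each
        group separately:

        >>> df = pd.DataFrame({"A": ["x", "x", "x", "y"], "B": [1, 3, 2, 5]})
        >>> df.groupby("A")["B"].rank()
        0    1.0
        1    3.0
        2    2.0
        3    1.0
        Name: B, dtype: float64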
2118 """
2119 if na_option not in {"keep", "top", "bottom"}:
2120 msg = "na_option must be one of 'keep', 'top', or 'bottom'"
2121 raise ValueError(msg)
2122 return self._cython_transform(
2123 "rank",
2124 numeric_only=False,
2125 ties_method=method,
2126 ascending=ascending,
2127 na_option=na_option,
2128 pct=pct,
2129 axis=axis,
2130 )
2132 @Substitution(name="groupby")
2133 @Appender(_common_see_also)
2134 def cumprod(self, axis=0, *args, **kwargs):
2135 """
2136 Cumulative product for each group.
2138 Returns
2139 -------
2140 Series or DataFrame
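Examples
--------
The product accumulates within each group and resets across groups:

>>> df = pd.DataFrame({"g": ["a", "a", "b"], "v": [2, 3, 4]})
>>> df.groupby("g")["v"].cumprod()
0    2
1    6
2    4
Name: v, dtype: int64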
2141 """
2142 nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"])
2143 if axis != 0:
2144 return self.apply(lambda x: x.cumprod(axis=axis, **kwargs))
2146 return self._cython_transform("cumprod", **kwargs)
2148 @Substitution(name="groupby")
2149 @Appender(_common_see_also)
2150 def cumsum(self, axis=0, *args, **kwargs):
2151 """
2152 Cumulative sum for each group.
2154 Returns
2155 -------
2156 Series or DataFrame
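Examples
--------
The sum accumulates within each group and resets across groups:

>>> df = pd.DataFrame({"g": ["a", "a", "b"], "v": [1, 2, 3]})
>>> df.groupby("g")["v"].cumsum()
0    1
1    3
2    3
Name: v, dtype: int64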
2157 """
2158 nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"])
2159 if axis != 0:
2160 return self.apply(lambda x: x.cumsum(axis=axis, **kwargs))
2162 return self._cython_transform("cumsum", **kwargs)
2164 @Substitution(name="groupby")
2165 @Appender(_common_see_also)
2166 def cummin(self, axis=0, **kwargs):
2167 """
2168 Cumulative min for each group.
2170 Returns
2171 -------
2172 Series or DataFrame
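Examples
--------
>>> df = pd.DataFrame({"g": ["a", "a", "b"], "v": [3, 1, 2]})
>>> df.groupby("g")["v"].cummin()
0    3
1    1
2    2
Name: v, dtype: int64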
2173 """
2174 if axis != 0:
2175 return self.apply(lambda x: np.minimum.accumulate(x, axis))
2177 return self._cython_transform("cummin", numeric_only=False)
2179 @Substitution(name="groupby")
2180 @Appender(_common_see_also)
2181 def cummax(self, axis=0, **kwargs):
2182 """
2183 Cumulative max for each group.
2185 Returns
2186 -------
2187 Series or DataFrame
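Examples
--------
>>> df = pd.DataFrame({"g": ["a", "a", "b"], "v": [1, 3, 2]})
>>> df.groupby("g")["v"].cummax()
0    1
1    3
2    2
Name: v, dtype: int64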
2188 """
2189 if axis != 0:
2190 return self.apply(lambda x: np.maximum.accumulate(x, axis))
2192 return self._cython_transform("cummax", numeric_only=False)
2194 def _get_cythonized_result(
2195 self,
2196 how: str,
2197 cython_dtype: np.dtype,
2198 aggregate: bool = False,
2199 needs_values: bool = False,
2200 needs_mask: bool = False,
2201 needs_ngroups: bool = False,
2202 result_is_index: bool = False,
2203 pre_processing=None,
2204 post_processing=None,
2205 **kwargs,
2206 ):
2207 """
2208 Get result for Cythonized functions.
2210 Parameters
2211 ----------
2212 how : str
Cythonized function name to be called.
2213 cython_dtype : np.dtype
2214 Type of the array that will be modified by the Cython call.
2215 aggregate : bool, default False
2216 Whether the result should be aggregated to match the number of
2217 groups
2218 needs_values : bool, default False
2219 Whether the values should be a part of the Cython call
2220 signature
2221 needs_mask : bool, default False
2222 Whether boolean mask needs to be part of the Cython call
2223 signature
2224 needs_ngroups : bool, default False
2225 Whether number of groups is part of the Cython call signature
2226 result_is_index : bool, default False
2227 Whether the result of the Cython operation is an index of
2228 values to be retrieved, instead of the actual values themselves
2229 pre_processing : function, default None
2230 Function to be applied to `values` prior to passing to Cython.
2231 Function should return a tuple where the first element is the
2232 values to be passed to Cython and the second element is an optional
2233 type which the values should be converted to after being returned
2234 by the Cython operation. Raises if `needs_values` is False.
2235 post_processing : function, default None
2236 Function to be applied to result of Cython function. Should accept
2237 an array of values as the first argument and type inferences as its
2238 second argument, i.e. the signature should be
2239 (ndarray, Type).
2240 **kwargs : dict
2241 Extra arguments to be passed back to Cython funcs
2243 Returns
2244 -------
2245 `Series` or `DataFrame` with filled values
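Examples
--------
``shift`` (later in this module) drives this machinery roughly as
follows, asking the ``group_shift_indexer`` kernel for an int64
positional indexer that is then mapped back onto the values::

    self._get_cythonized_result(
        "group_shift_indexer",
        cython_dtype=np.dtype(np.int64),
        needs_ngroups=True,
        result_is_index=True,
        periods=periods,
    )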
2246 """
2247 if result_is_index and aggregate:
2248 raise ValueError("'result_is_index' and 'aggregate' cannot both be True!")
2249 if post_processing:
2250 if not callable(post_processing):
2251 raise ValueError("'post_processing' must be a callable!")
2252 if pre_processing:
2253 if not callable(pre_processing):
2254 raise ValueError("'pre_processing' must be a callable!")
2255 if not needs_values:
2256 raise ValueError(
2257 "Cannot use 'pre_processing' without specifying 'needs_values'!"
2258 )
2260 grouper = self.grouper
2262 labels, _, ngroups = grouper.group_info
2263 output: Dict[base.OutputKey, np.ndarray] = {}
2264 base_func = getattr(libgroupby, how)
2266 for idx, obj in enumerate(self._iterate_slices()):
2267 name = obj.name
2268 values = obj._data._values
2270 if aggregate:
2271 result_sz = ngroups
2272 else:
2273 result_sz = len(values)
2275 result = np.zeros(result_sz, dtype=cython_dtype)
2276 func = partial(base_func, result, labels)
2277 inferences = None
2279 if needs_values:
2280 vals = values
2281 if pre_processing:
2282 vals, inferences = pre_processing(vals)
2283 func = partial(func, vals)
2285 if needs_mask:
2286 mask = isna(values).view(np.uint8)
2287 func = partial(func, mask)
2289 if needs_ngroups:
2290 func = partial(func, ngroups)
2292 func(**kwargs) # Call func to modify indexer values in place
2294 if result_is_index:
2295 result = algorithms.take_nd(values, result)
2297 if post_processing:
2298 result = post_processing(result, inferences)
2300 key = base.OutputKey(label=name, position=idx)
2301 output[key] = result
2303 if aggregate:
2304 return self._wrap_aggregated_output(output)
2305 else:
2306 return self._wrap_transformed_output(output)
2308 @Substitution(name="groupby")
2309 @Appender(_common_see_also)
2310 def shift(self, periods=1, freq=None, axis=0, fill_value=None):
2311 """
2312 Shift each group by ``periods`` observations.
2314 Parameters
2315 ----------
2316 periods : int, default 1
2317 Number of periods to shift.
2318 freq : str, optional
2319 axis : int, default 0
2320 fill_value : scalar, optional
2322 .. versionadded:: 0.24.0
2324 Returns
2325 -------
2326 Series or DataFrame
2327 Object shifted within each group.
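Examples
--------
The first ``periods`` rows of each group become missing; integer
columns are upcast to float to hold the NaNs:

>>> df = pd.DataFrame({"g": ["a", "a", "b", "b"], "v": [1, 2, 3, 4]})
>>> df.groupby("g")["v"].shift(1)
0    NaN
1    1.0
2    NaN
3    3.0
Name: v, dtype: float64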
2328 """
2330 if freq is not None or axis != 0 or not isna(fill_value):
2331 return self.apply(lambda x: x.shift(periods, freq, axis, fill_value))
2333 return self._get_cythonized_result(
2334 "group_shift_indexer",
2335 cython_dtype=np.dtype(np.int64),
2336 needs_ngroups=True,
2337 result_is_index=True,
2338 periods=periods,
2339 )
2341 @Substitution(name="groupby")
2342 @Appender(_common_see_also)
2343 def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0):
2344 """
2345 Calculate the percentage change of each value relative to the previous entry in its group.
2347 Returns
2348 -------
2349 Series or DataFrame
2350 Percentage changes within each group.
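Examples
--------
The first entry of each group has no previous value to compare to:

>>> df = pd.DataFrame({"g": ["a", "a", "b", "b"], "v": [1.0, 2.0, 4.0, 6.0]})
>>> df.groupby("g")["v"].pct_change()
0    NaN
1    1.0
2    NaN
3    0.5
Name: v, dtype: float64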
2351 """
2352 if freq is not None or axis != 0:
2353 return self.apply(
2354 lambda x: x.pct_change(
2355 periods=periods,
2356 fill_method=fill_method,
2357 limit=limit,
2358 freq=freq,
2359 axis=axis,
2360 )
2361 )
2362 if fill_method is None: # GH30463
2363 fill_method = "pad"
2364 limit = 0
2365 filled = getattr(self, fill_method)(limit=limit)
2366 fill_grp = filled.groupby(self.grouper.codes)
2367 shifted = fill_grp.shift(periods=periods, freq=freq)
2368 return (filled / shifted) - 1
2370 @Substitution(name="groupby")
2371 @Substitution(see_also=_common_see_also)
2372 def head(self, n=5):
2373 """
2374 Return first n rows of each group.
2376 Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows
2377 from the original DataFrame with original index and order preserved
2378 (``as_index`` flag is ignored).
2380 Does not work for negative values of `n`.
2382 Returns
2383 -------
2384 Series or DataFrame
2385 %(see_also)s
2386 Examples
2387 --------
2389 >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]],
2390 ... columns=['A', 'B'])
2391 >>> df.groupby('A').head(1)
2392 A B
2393 0 1 2
2394 2 5 6
2395 >>> df.groupby('A').head(-1)
2396 Empty DataFrame
2397 Columns: [A, B]
2398 Index: []
2399 """
2400 self._reset_group_selection()
2401 mask = self._cumcount_array() < n
2402 return self._selected_obj[mask]
2404 @Substitution(name="groupby")
2405 @Substitution(see_also=_common_see_also)
2406 def tail(self, n=5):
2407 """
2408 Return last n rows of each group.
2410 Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows
2411 from the original DataFrame with original index and order preserved
2412 (``as_index`` flag is ignored).
2414 Does not work for negative values of `n`.
2416 Returns
2417 -------
2418 Series or DataFrame
2419 %(see_also)s
2420 Examples
2421 --------
2423 >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]],
2424 ... columns=['A', 'B'])
2425 >>> df.groupby('A').tail(1)
2426 A B
2427 1 a 2
2428 3 b 2
2429 >>> df.groupby('A').tail(-1)
2430 Empty DataFrame
2431 Columns: [A, B]
2432 Index: []
2433 """
2434 self._reset_group_selection()
2435 mask = self._cumcount_array(ascending=False) < n
2436 return self._selected_obj[mask]
2438 def _reindex_output(
2439 self, output: FrameOrSeries, fill_value: Scalar = np.NaN
2440 ) -> FrameOrSeries:
2441 """
2442 If we have categorical groupers, then we might want to make sure that
2443 we have a fully re-indexed output to the levels. This means expanding
2444 the output space to accommodate all values in the cartesian product of
2445 our groups, regardless of whether they were observed in the data or
2446 not. This will expand the output space if there are missing groups.
2448 The method returns early, without modifying the input, if the number of
2449 groupings is less than 2, ``self.observed`` is True, or none of the
2450 groupers are categorical.
2452 Parameters
2453 ----------
2454 output : Series or DataFrame
2455 Object resulting from grouping and applying an operation.
2456 fill_value : scalar, default np.NaN
2457 Value to use for unobserved categories if self.observed is False.
2459 Returns
2460 -------
2461 Series or DataFrame
2462 Object (potentially) re-indexed to include all possible groups.
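Examples
--------
The effect is easiest to see through the public API: with two
categorical groupers and the default ``observed=False``, unobserved
combinations are filled in (with 0 here, since ``count`` reindexes
with ``fill_value=0``):

>>> df = pd.DataFrame({
...     "a": pd.Categorical(["x"], categories=["x", "y"]),
...     "b": pd.Categorical(["p"], categories=["p", "q"]),
...     "v": [1],
... })
>>> df.groupby(["a", "b"])["v"].count()
a  b
x  p    1
   q    0
y  p    0
   q    0
Name: v, dtype: int64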
2463 """
2464 groupings = self.grouper.groupings
2465 if groupings is None:
2466 return output
2467 elif len(groupings) == 1:
2468 return output
2470 # if we only care about the observed values
2471 # we are done
2472 elif self.observed:
2473 return output
2475 # reindexing only applies to a Categorical grouper
2476 elif not any(
2477 isinstance(ping.grouper, (Categorical, CategoricalIndex))
2478 for ping in groupings
2479 ):
2480 return output
2482 levels_list = [ping.group_index for ping in groupings]
2483 index, _ = MultiIndex.from_product(
2484 levels_list, names=self.grouper.names
2485 ).sortlevel()
2487 if self.as_index:
2488 d = {
2489 self.obj._get_axis_name(self.axis): index,
2490 "copy": False,
2491 "fill_value": fill_value,
2492 }
2493 return output.reindex(**d)
2495 # GH 13204
2496 # Here, the categorical in-axis groupers, which need to be fully
2497 # expanded, are columns in `output`. An idea is to do:
2498 # output = output.set_index(self.grouper.names)
2499 # .reindex(index).reset_index()
2500 # but special care has to be taken because of possible not-in-axis
2501 # groupers.
2502 # So, we manually select and drop the in-axis grouper columns,
2503 # reindex `output`, and then reset the in-axis grouper columns.
2505 # Select in-axis groupers
2506 in_axis_grps = (
2507 (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis
2508 )
2509 g_nums, g_names = zip(*in_axis_grps)
2511 output = output.drop(labels=list(g_names), axis=1)
2513 # Set a temp index and reindex (possibly expanding)
2514 output = output.set_index(self.grouper.result_index).reindex(
2515 index, copy=False, fill_value=fill_value
2516 )
2518 # Reset in-axis grouper columns
2519 # (using level numbers `g_nums` because level names may not be unique)
2520 output = output.reset_index(level=g_nums)
2522 return output.reset_index(drop=True)
2525GroupBy._add_numeric_operations()
2528@Appender(GroupBy.__doc__)
2529def get_groupby(
2530 obj: NDFrame,
2531 by: Optional[_KeysArgType] = None,
2532 axis: int = 0,
2533 level=None,
2534 grouper: "Optional[ops.BaseGrouper]" = None,
2535 exclusions=None,
2536 selection=None,
2537 as_index: bool = True,
2538 sort: bool = True,
2539 group_keys: bool = True,
2540 squeeze: bool = False,
2541 observed: bool = False,
2542 mutated: bool = False,
2543) -> GroupBy:
2545 klass: Type[GroupBy]
2546 if isinstance(obj, Series):
2547 from pandas.core.groupby.generic import SeriesGroupBy
2549 klass = SeriesGroupBy
2550 elif isinstance(obj, DataFrame):
2551 from pandas.core.groupby.generic import DataFrameGroupBy
2553 klass = DataFrameGroupBy
2554 else:
2555 raise TypeError(f"invalid type: {obj}")
2557 return klass(
2558 obj=obj,
2559 keys=by,
2560 axis=axis,
2561 level=level,
2562 grouper=grouper,
2563 exclusions=exclusions,
2564 selection=selection,
2565 as_index=as_index,
2566 sort=sort,
2567 group_keys=group_keys,
2568 squeeze=squeeze,
2569 observed=observed,
2570 mutated=mutated,
2571 )
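# A minimal sketch of the dispatch above (``get_groupby`` is internal
# plumbing; the public entry points are ``DataFrame.groupby`` and
# ``Series.groupby``):
#
#     >>> import pandas as pd
#     >>> type(get_groupby(pd.Series([1, 1, 2]), by=[0, 0, 1])).__name__
#     'SeriesGroupBy'
#     >>> type(get_groupby(pd.DataFrame({"a": [1]}), by="a")).__name__
#     'DataFrameGroupBy'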