Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/core/reshape/reshape.py : 8%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from functools import partial
2import itertools
3from typing import List
5import numpy as np
7import pandas._libs.algos as libalgos
8import pandas._libs.reshape as libreshape
9from pandas._libs.sparse import IntIndex
11from pandas.core.dtypes.cast import maybe_promote
12from pandas.core.dtypes.common import (
13 ensure_platform_int,
14 is_bool_dtype,
15 is_extension_array_dtype,
16 is_integer,
17 is_integer_dtype,
18 is_list_like,
19 is_object_dtype,
20 needs_i8_conversion,
21)
22from pandas.core.dtypes.missing import notna
24import pandas.core.algorithms as algos
25from pandas.core.arrays import SparseArray
26from pandas.core.arrays.categorical import factorize_from_iterable
27from pandas.core.construction import extract_array
28from pandas.core.frame import DataFrame
29from pandas.core.indexes.api import Index, MultiIndex
30from pandas.core.series import Series
31from pandas.core.sorting import (
32 compress_group_index,
33 decons_obs_group_ids,
34 get_compressed_ids,
35 get_group_index,
36)
39class _Unstacker:
40 """
41 Helper class to unstack data / pivot with multi-level index
43 Parameters
44 ----------
45 values : ndarray
46 Values of DataFrame to "Unstack"
47 index : object
48 Pandas ``Index``
49 level : int or str, default last level
50 Level to "unstack". Accepts a name for the level.
51 value_columns : Index, optional
52 Pandas ``Index`` or ``MultiIndex`` object if unstacking a DataFrame
53 fill_value : scalar, optional
54 Default value to fill in missing values if subgroups do not have the
55 same set of labels. By default, missing values will be replaced with
56 the default fill value for that data type, NaN for float, NaT for
57 datetimelike, etc. For integer types, by default data will converted to
58 float and missing values will be set to NaN.
59 constructor : object
60 Pandas ``DataFrame`` or subclass used to create unstacked
61 response. If None, DataFrame will be used.
63 Examples
64 --------
65 >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
66 ... ('two', 'a'), ('two', 'b')])
67 >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index)
68 >>> s
69 one a 1
70 b 2
71 two a 3
72 b 4
73 dtype: int64
75 >>> s.unstack(level=-1)
76 a b
77 one 1 2
78 two 3 4
80 >>> s.unstack(level=0)
81 one two
82 a 1 3
83 b 2 4
85 Returns
86 -------
87 unstacked : DataFrame
88 """
90 def __init__(
91 self,
92 values: np.ndarray,
93 index,
94 level=-1,
95 value_columns=None,
96 fill_value=None,
97 constructor=None,
98 ):
100 if values.ndim == 1:
101 values = values[:, np.newaxis]
102 self.values = values
103 self.value_columns = value_columns
104 self.fill_value = fill_value
106 if constructor is None:
107 constructor = DataFrame
108 self.constructor = constructor
110 if value_columns is None and values.shape[1] != 1: # pragma: no cover
111 raise ValueError("must pass column labels for multi-column data")
113 self.index = index.remove_unused_levels()
115 self.level = self.index._get_level_number(level)
117 # when index includes `nan`, need to lift levels/strides by 1
118 self.lift = 1 if -1 in self.index.codes[self.level] else 0
120 self.new_index_levels = list(self.index.levels)
121 self.new_index_names = list(self.index.names)
123 self.removed_name = self.new_index_names.pop(self.level)
124 self.removed_level = self.new_index_levels.pop(self.level)
125 self.removed_level_full = index.levels[self.level]
127 # Bug fix GH 20601
128 # If the data frame is too big, the number of unique index combination
129 # will cause int32 overflow on windows environments.
130 # We want to check and raise an error before this happens
131 num_rows = np.max([index_level.size for index_level in self.new_index_levels])
132 num_columns = self.removed_level.size
134 # GH20601: This forces an overflow if the number of cells is too high.
135 num_cells = np.multiply(num_rows, num_columns, dtype=np.int32)
137 if num_rows > 0 and num_columns > 0 and num_cells <= 0:
138 raise ValueError("Unstacked DataFrame is too big, causing int32 overflow")
140 self._make_sorted_values_labels()
141 self._make_selectors()
143 def _make_sorted_values_labels(self):
144 v = self.level
146 codes = list(self.index.codes)
147 levs = list(self.index.levels)
148 to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
149 sizes = [len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]]
151 comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
152 ngroups = len(obs_ids)
154 indexer = libalgos.groupsort_indexer(comp_index, ngroups)[0]
155 indexer = ensure_platform_int(indexer)
157 self.sorted_values = algos.take_nd(self.values, indexer, axis=0)
158 self.sorted_labels = [l.take(indexer) for l in to_sort]
160 def _make_selectors(self):
161 new_levels = self.new_index_levels
163 # make the mask
164 remaining_labels = self.sorted_labels[:-1]
165 level_sizes = [len(x) for x in new_levels]
167 comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
168 ngroups = len(obs_ids)
170 comp_index = ensure_platform_int(comp_index)
171 stride = self.index.levshape[self.level] + self.lift
172 self.full_shape = ngroups, stride
174 selector = self.sorted_labels[-1] + stride * comp_index + self.lift
175 mask = np.zeros(np.prod(self.full_shape), dtype=bool)
176 mask.put(selector, True)
178 if mask.sum() < len(self.index):
179 raise ValueError("Index contains duplicate entries, cannot reshape")
181 self.group_index = comp_index
182 self.mask = mask
183 self.unique_groups = obs_ids
184 self.compressor = comp_index.searchsorted(np.arange(ngroups))
186 def get_result(self):
187 values, _ = self.get_new_values()
188 columns = self.get_new_columns()
189 index = self.get_new_index()
191 return self.constructor(values, index=index, columns=columns)
193 def get_new_values(self):
194 values = self.values
196 # place the values
197 length, width = self.full_shape
198 stride = values.shape[1]
199 result_width = width * stride
200 result_shape = (length, result_width)
201 mask = self.mask
202 mask_all = mask.all()
204 # we can simply reshape if we don't have a mask
205 if mask_all and len(values):
206 new_values = (
207 self.sorted_values.reshape(length, width, stride)
208 .swapaxes(1, 2)
209 .reshape(result_shape)
210 )
211 new_mask = np.ones(result_shape, dtype=bool)
212 return new_values, new_mask
214 # if our mask is all True, then we can use our existing dtype
215 if mask_all:
216 dtype = values.dtype
217 new_values = np.empty(result_shape, dtype=dtype)
218 else:
219 dtype, fill_value = maybe_promote(values.dtype, self.fill_value)
220 new_values = np.empty(result_shape, dtype=dtype)
221 new_values.fill(fill_value)
223 new_mask = np.zeros(result_shape, dtype=bool)
225 name = np.dtype(dtype).name
226 sorted_values = self.sorted_values
228 # we need to convert to a basic dtype
229 # and possibly coerce an input to our output dtype
230 # e.g. ints -> floats
231 if needs_i8_conversion(values):
232 sorted_values = sorted_values.view("i8")
233 new_values = new_values.view("i8")
234 elif is_bool_dtype(values):
235 sorted_values = sorted_values.astype("object")
236 new_values = new_values.astype("object")
237 else:
238 sorted_values = sorted_values.astype(name, copy=False)
240 # fill in our values & mask
241 libreshape.unstack(
242 sorted_values,
243 mask.view("u1"),
244 stride,
245 length,
246 width,
247 new_values,
248 new_mask.view("u1"),
249 )
251 # reconstruct dtype if needed
252 if needs_i8_conversion(values):
253 new_values = new_values.view(values.dtype)
255 return new_values, new_mask
257 def get_new_columns(self):
258 if self.value_columns is None:
259 if self.lift == 0:
260 return self.removed_level._shallow_copy(name=self.removed_name)
262 lev = self.removed_level.insert(0, item=self.removed_level._na_value)
263 return lev.rename(self.removed_name)
265 stride = len(self.removed_level) + self.lift
266 width = len(self.value_columns)
267 propagator = np.repeat(np.arange(width), stride)
268 if isinstance(self.value_columns, MultiIndex):
269 new_levels = self.value_columns.levels + (self.removed_level_full,)
270 new_names = self.value_columns.names + (self.removed_name,)
272 new_codes = [lab.take(propagator) for lab in self.value_columns.codes]
273 else:
274 new_levels = [self.value_columns, self.removed_level_full]
275 new_names = [self.value_columns.name, self.removed_name]
276 new_codes = [propagator]
278 # The two indices differ only if the unstacked level had unused items:
279 if len(self.removed_level_full) != len(self.removed_level):
280 # In this case, we remap the new codes to the original level:
281 repeater = self.removed_level_full.get_indexer(self.removed_level)
282 if self.lift:
283 repeater = np.insert(repeater, 0, -1)
284 else:
285 # Otherwise, we just use each level item exactly once:
286 repeater = np.arange(stride) - self.lift
288 # The entire level is then just a repetition of the single chunk:
289 new_codes.append(np.tile(repeater, width))
290 return MultiIndex(
291 levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
292 )
294 def get_new_index(self):
295 result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]]
297 # construct the new index
298 if len(self.new_index_levels) == 1:
299 level, level_codes = self.new_index_levels[0], result_codes[0]
300 if (level_codes == -1).any():
301 level = level.insert(len(level), level._na_value)
302 return level.take(level_codes).rename(self.new_index_names[0])
304 return MultiIndex(
305 levels=self.new_index_levels,
306 codes=result_codes,
307 names=self.new_index_names,
308 verify_integrity=False,
309 )
312def _unstack_multiple(data, clocs, fill_value=None):
313 if len(clocs) == 0:
314 return data
316 # NOTE: This doesn't deal with hierarchical columns yet
318 index = data.index
320 clocs = [index._get_level_number(i) for i in clocs]
322 rlocs = [i for i in range(index.nlevels) if i not in clocs]
324 clevels = [index.levels[i] for i in clocs]
325 ccodes = [index.codes[i] for i in clocs]
326 cnames = [index.names[i] for i in clocs]
327 rlevels = [index.levels[i] for i in rlocs]
328 rcodes = [index.codes[i] for i in rlocs]
329 rnames = [index.names[i] for i in rlocs]
331 shape = [len(x) for x in clevels]
332 group_index = get_group_index(ccodes, shape, sort=False, xnull=False)
334 comp_ids, obs_ids = compress_group_index(group_index, sort=False)
335 recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False)
337 if rlocs == []:
338 # Everything is in clocs, so the dummy df has a regular index
339 dummy_index = Index(obs_ids, name="__placeholder__")
340 else:
341 dummy_index = MultiIndex(
342 levels=rlevels + [obs_ids],
343 codes=rcodes + [comp_ids],
344 names=rnames + ["__placeholder__"],
345 verify_integrity=False,
346 )
348 if isinstance(data, Series):
349 dummy = data.copy()
350 dummy.index = dummy_index
352 unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
353 new_levels = clevels
354 new_names = cnames
355 new_codes = recons_codes
356 else:
357 if isinstance(data.columns, MultiIndex):
358 result = data
359 for i in range(len(clocs)):
360 val = clocs[i]
361 result = result.unstack(val, fill_value=fill_value)
362 clocs = [v if i > v else v - 1 for v in clocs]
364 return result
366 dummy = data.copy()
367 dummy.index = dummy_index
369 unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
370 if isinstance(unstacked, Series):
371 unstcols = unstacked.index
372 else:
373 unstcols = unstacked.columns
374 new_levels = [unstcols.levels[0]] + clevels
375 new_names = [data.columns.name] + cnames
377 new_codes = [unstcols.codes[0]]
378 for rec in recons_codes:
379 new_codes.append(rec.take(unstcols.codes[-1]))
381 new_columns = MultiIndex(
382 levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
383 )
385 if isinstance(unstacked, Series):
386 unstacked.index = new_columns
387 else:
388 unstacked.columns = new_columns
390 return unstacked
393def unstack(obj, level, fill_value=None):
394 if isinstance(level, (tuple, list)):
395 if len(level) != 1:
396 # _unstack_multiple only handles MultiIndexes,
397 # and isn't needed for a single level
398 return _unstack_multiple(obj, level, fill_value=fill_value)
399 else:
400 level = level[0]
402 # Prioritize integer interpretation (GH #21677):
403 if not is_integer(level) and not level == "__placeholder__":
404 level = obj.index._get_level_number(level)
406 if isinstance(obj, DataFrame):
407 if isinstance(obj.index, MultiIndex):
408 return _unstack_frame(obj, level, fill_value=fill_value)
409 else:
410 return obj.T.stack(dropna=False)
411 else:
412 if is_extension_array_dtype(obj.dtype):
413 return _unstack_extension_series(obj, level, fill_value)
414 unstacker = _Unstacker(
415 obj.values,
416 obj.index,
417 level=level,
418 fill_value=fill_value,
419 constructor=obj._constructor_expanddim,
420 )
421 return unstacker.get_result()
424def _unstack_frame(obj, level, fill_value=None):
425 if obj._is_mixed_type:
426 unstacker = partial(
427 _Unstacker, index=obj.index, level=level, fill_value=fill_value
428 )
429 blocks = obj._data.unstack(unstacker, fill_value=fill_value)
430 return obj._constructor(blocks)
431 else:
432 unstacker = _Unstacker(
433 obj.values,
434 obj.index,
435 level=level,
436 value_columns=obj.columns,
437 fill_value=fill_value,
438 constructor=obj._constructor,
439 )
440 return unstacker.get_result()
443def _unstack_extension_series(series, level, fill_value):
444 """
445 Unstack an ExtensionArray-backed Series.
447 The ExtensionDtype is preserved.
449 Parameters
450 ----------
451 series : Series
452 A Series with an ExtensionArray for values
453 level : Any
454 The level name or number.
455 fill_value : Any
456 The user-level (not physical storage) fill value to use for
457 missing values introduced by the reshape. Passed to
458 ``series.values.take``.
460 Returns
461 -------
462 DataFrame
463 Each column of the DataFrame will have the same dtype as
464 the input Series.
465 """
466 # Implementation note: the basic idea is to
467 # 1. Do a regular unstack on a dummy array of integers
468 # 2. Followup with a columnwise take.
469 # We use the dummy take to discover newly-created missing values
470 # introduced by the reshape.
471 from pandas.core.reshape.concat import concat
473 dummy_arr = np.arange(len(series))
474 # fill_value=-1, since we will do a series.values.take later
475 result = _Unstacker(
476 dummy_arr, series.index, level=level, fill_value=-1
477 ).get_result()
479 out = []
480 values = extract_array(series, extract_numpy=False)
482 for col, indices in result.items():
483 out.append(
484 Series(
485 values.take(indices.values, allow_fill=True, fill_value=fill_value),
486 name=col,
487 index=result.index,
488 )
489 )
490 return concat(out, axis="columns", copy=False, keys=result.columns)
493def stack(frame, level=-1, dropna=True):
494 """
495 Convert DataFrame to Series with multi-level Index. Columns become the
496 second level of the resulting hierarchical index
498 Returns
499 -------
500 stacked : Series
501 """
503 def factorize(index):
504 if index.is_unique:
505 return index, np.arange(len(index))
506 codes, categories = factorize_from_iterable(index)
507 return categories, codes
509 N, K = frame.shape
511 # Will also convert negative level numbers and check if out of bounds.
512 level_num = frame.columns._get_level_number(level)
514 if isinstance(frame.columns, MultiIndex):
515 return _stack_multi_columns(frame, level_num=level_num, dropna=dropna)
516 elif isinstance(frame.index, MultiIndex):
517 new_levels = list(frame.index.levels)
518 new_codes = [lab.repeat(K) for lab in frame.index.codes]
520 clev, clab = factorize(frame.columns)
521 new_levels.append(clev)
522 new_codes.append(np.tile(clab, N).ravel())
524 new_names = list(frame.index.names)
525 new_names.append(frame.columns.name)
526 new_index = MultiIndex(
527 levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
528 )
529 else:
530 levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns)))
531 codes = ilab.repeat(K), np.tile(clab, N).ravel()
532 new_index = MultiIndex(
533 levels=levels,
534 codes=codes,
535 names=[frame.index.name, frame.columns.name],
536 verify_integrity=False,
537 )
539 if frame._is_homogeneous_type:
540 # For homogeneous EAs, frame.values will coerce to object. So
541 # we concatenate instead.
542 dtypes = list(frame.dtypes.values)
543 dtype = dtypes[0]
545 if is_extension_array_dtype(dtype):
546 arr = dtype.construct_array_type()
547 new_values = arr._concat_same_type(
548 [col._values for _, col in frame.items()]
549 )
550 new_values = _reorder_for_extension_array_stack(new_values, N, K)
551 else:
552 # homogeneous, non-EA
553 new_values = frame.values.ravel()
555 else:
556 # non-homogeneous
557 new_values = frame.values.ravel()
559 if dropna:
560 mask = notna(new_values)
561 new_values = new_values[mask]
562 new_index = new_index[mask]
564 return frame._constructor_sliced(new_values, index=new_index)
567def stack_multiple(frame, level, dropna=True):
568 # If all passed levels match up to column names, no
569 # ambiguity about what to do
570 if all(lev in frame.columns.names for lev in level):
571 result = frame
572 for lev in level:
573 result = stack(result, lev, dropna=dropna)
575 # Otherwise, level numbers may change as each successive level is stacked
576 elif all(isinstance(lev, int) for lev in level):
577 # As each stack is done, the level numbers decrease, so we need
578 # to account for that when level is a sequence of ints
579 result = frame
580 # _get_level_number() checks level numbers are in range and converts
581 # negative numbers to positive
582 level = [frame.columns._get_level_number(lev) for lev in level]
584 # Can't iterate directly through level as we might need to change
585 # values as we go
586 for index in range(len(level)):
587 lev = level[index]
588 result = stack(result, lev, dropna=dropna)
589 # Decrement all level numbers greater than current, as these
590 # have now shifted down by one
591 updated_level = []
592 for other in level:
593 if other > lev:
594 updated_level.append(other - 1)
595 else:
596 updated_level.append(other)
597 level = updated_level
599 else:
600 raise ValueError(
601 "level should contain all level names or all level "
602 "numbers, not a mixture of the two."
603 )
605 return result
608def _stack_multi_columns(frame, level_num=-1, dropna=True):
609 def _convert_level_number(level_num, columns):
610 """
611 Logic for converting the level number to something we can safely pass
612 to swaplevel:
614 We generally want to convert the level number into a level name, except
615 when columns do not have names, in which case we must leave as a level
616 number
617 """
618 if level_num in columns.names:
619 return columns.names[level_num]
620 else:
621 if columns.names[level_num] is None:
622 return level_num
623 else:
624 return columns.names[level_num]
626 this = frame.copy()
628 # this makes life much simpler
629 if level_num != frame.columns.nlevels - 1:
630 # roll levels to put selected level at end
631 roll_columns = this.columns
632 for i in range(level_num, frame.columns.nlevels - 1):
633 # Need to check if the ints conflict with level names
634 lev1 = _convert_level_number(i, roll_columns)
635 lev2 = _convert_level_number(i + 1, roll_columns)
636 roll_columns = roll_columns.swaplevel(lev1, lev2)
637 this.columns = roll_columns
639 if not this.columns.is_lexsorted():
640 # Workaround the edge case where 0 is one of the column names,
641 # which interferes with trying to sort based on the first
642 # level
643 level_to_sort = _convert_level_number(0, this.columns)
644 this = this.sort_index(level=level_to_sort, axis=1)
646 # tuple list excluding level for grouping columns
647 if len(frame.columns.levels) > 2:
648 tuples = list(
649 zip(
650 *[
651 lev.take(level_codes)
652 for lev, level_codes in zip(
653 this.columns.levels[:-1], this.columns.codes[:-1]
654 )
655 ]
656 )
657 )
658 unique_groups = [key for key, _ in itertools.groupby(tuples)]
659 new_names = this.columns.names[:-1]
660 new_columns = MultiIndex.from_tuples(unique_groups, names=new_names)
661 else:
662 new_columns = this.columns.levels[0]._shallow_copy(name=this.columns.names[0])
663 unique_groups = new_columns
665 # time to ravel the values
666 new_data = {}
667 level_vals = this.columns.levels[-1]
668 level_codes = sorted(set(this.columns.codes[-1]))
669 level_vals_used = level_vals[level_codes]
670 levsize = len(level_codes)
671 drop_cols = []
672 for key in unique_groups:
673 try:
674 loc = this.columns.get_loc(key)
675 except KeyError:
676 drop_cols.append(key)
677 continue
679 # can make more efficient?
680 # we almost always return a slice
681 # but if unsorted can get a boolean
682 # indexer
683 if not isinstance(loc, slice):
684 slice_len = len(loc)
685 else:
686 slice_len = loc.stop - loc.start
688 if slice_len != levsize:
689 chunk = this.loc[:, this.columns[loc]]
690 chunk.columns = level_vals.take(chunk.columns.codes[-1])
691 value_slice = chunk.reindex(columns=level_vals_used).values
692 else:
693 if frame._is_homogeneous_type and is_extension_array_dtype(
694 frame.dtypes.iloc[0]
695 ):
696 dtype = this[this.columns[loc]].dtypes.iloc[0]
697 subset = this[this.columns[loc]]
699 value_slice = dtype.construct_array_type()._concat_same_type(
700 [x._values for _, x in subset.items()]
701 )
702 N, K = this.shape
703 idx = np.arange(N * K).reshape(K, N).T.ravel()
704 value_slice = value_slice.take(idx)
706 elif frame._is_mixed_type:
707 value_slice = this[this.columns[loc]].values
708 else:
709 value_slice = this.values[:, loc]
711 if value_slice.ndim > 1:
712 # i.e. not extension
713 value_slice = value_slice.ravel()
715 new_data[key] = value_slice
717 if len(drop_cols) > 0:
718 new_columns = new_columns.difference(drop_cols)
720 N = len(this)
722 if isinstance(this.index, MultiIndex):
723 new_levels = list(this.index.levels)
724 new_names = list(this.index.names)
725 new_codes = [lab.repeat(levsize) for lab in this.index.codes]
726 else:
727 old_codes, old_levels = factorize_from_iterable(this.index)
728 new_levels = [old_levels]
729 new_codes = [old_codes.repeat(levsize)]
730 new_names = [this.index.name] # something better?
732 new_levels.append(level_vals)
733 new_codes.append(np.tile(level_codes, N))
734 new_names.append(frame.columns.names[level_num])
736 new_index = MultiIndex(
737 levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
738 )
740 result = frame._constructor(new_data, index=new_index, columns=new_columns)
742 # more efficient way to go about this? can do the whole masking biz but
743 # will only save a small amount of time...
744 if dropna:
745 result = result.dropna(axis=0, how="all")
747 return result
750def get_dummies(
751 data,
752 prefix=None,
753 prefix_sep="_",
754 dummy_na=False,
755 columns=None,
756 sparse=False,
757 drop_first=False,
758 dtype=None,
759) -> "DataFrame":
760 """
761 Convert categorical variable into dummy/indicator variables.
763 Parameters
764 ----------
765 data : array-like, Series, or DataFrame
766 Data of which to get dummy indicators.
767 prefix : str, list of str, or dict of str, default None
768 String to append DataFrame column names.
769 Pass a list with length equal to the number of columns
770 when calling get_dummies on a DataFrame. Alternatively, `prefix`
771 can be a dictionary mapping column names to prefixes.
772 prefix_sep : str, default '_'
773 If appending prefix, separator/delimiter to use. Or pass a
774 list or dictionary as with `prefix`.
775 dummy_na : bool, default False
776 Add a column to indicate NaNs, if False NaNs are ignored.
777 columns : list-like, default None
778 Column names in the DataFrame to be encoded.
779 If `columns` is None then all the columns with
780 `object` or `category` dtype will be converted.
781 sparse : bool, default False
782 Whether the dummy-encoded columns should be backed by
783 a :class:`SparseArray` (True) or a regular NumPy array (False).
784 drop_first : bool, default False
785 Whether to get k-1 dummies out of k categorical levels by removing the
786 first level.
787 dtype : dtype, default np.uint8
788 Data type for new columns. Only a single dtype is allowed.
790 .. versionadded:: 0.23.0
792 Returns
793 -------
794 DataFrame
795 Dummy-coded data.
797 See Also
798 --------
799 Series.str.get_dummies : Convert Series to dummy codes.
801 Examples
802 --------
803 >>> s = pd.Series(list('abca'))
805 >>> pd.get_dummies(s)
806 a b c
807 0 1 0 0
808 1 0 1 0
809 2 0 0 1
810 3 1 0 0
812 >>> s1 = ['a', 'b', np.nan]
814 >>> pd.get_dummies(s1)
815 a b
816 0 1 0
817 1 0 1
818 2 0 0
820 >>> pd.get_dummies(s1, dummy_na=True)
821 a b NaN
822 0 1 0 0
823 1 0 1 0
824 2 0 0 1
826 >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
827 ... 'C': [1, 2, 3]})
829 >>> pd.get_dummies(df, prefix=['col1', 'col2'])
830 C col1_a col1_b col2_a col2_b col2_c
831 0 1 1 0 0 1 0
832 1 2 0 1 1 0 0
833 2 3 1 0 0 0 1
835 >>> pd.get_dummies(pd.Series(list('abcaa')))
836 a b c
837 0 1 0 0
838 1 0 1 0
839 2 0 0 1
840 3 1 0 0
841 4 1 0 0
843 >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
844 b c
845 0 0 0
846 1 1 0
847 2 0 1
848 3 0 0
849 4 0 0
851 >>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
852 a b c
853 0 1.0 0.0 0.0
854 1 0.0 1.0 0.0
855 2 0.0 0.0 1.0
856 """
857 from pandas.core.reshape.concat import concat
859 dtypes_to_encode = ["object", "category"]
861 if isinstance(data, DataFrame):
862 # determine columns being encoded
863 if columns is None:
864 data_to_encode = data.select_dtypes(include=dtypes_to_encode)
865 elif not is_list_like(columns):
866 raise TypeError("Input must be a list-like for parameter `columns`")
867 else:
868 data_to_encode = data[columns]
870 # validate prefixes and separator to avoid silently dropping cols
871 def check_len(item, name):
872 len_msg = (
873 "Length of '{name}' ({len_item}) did not match the "
874 "length of the columns being encoded ({len_enc})."
875 )
877 if is_list_like(item):
878 if not len(item) == data_to_encode.shape[1]:
879 len_msg = len_msg.format(
880 name=name, len_item=len(item), len_enc=data_to_encode.shape[1]
881 )
882 raise ValueError(len_msg)
884 check_len(prefix, "prefix")
885 check_len(prefix_sep, "prefix_sep")
887 if isinstance(prefix, str):
888 prefix = itertools.cycle([prefix])
889 if isinstance(prefix, dict):
890 prefix = [prefix[col] for col in data_to_encode.columns]
892 if prefix is None:
893 prefix = data_to_encode.columns
895 # validate separators
896 if isinstance(prefix_sep, str):
897 prefix_sep = itertools.cycle([prefix_sep])
898 elif isinstance(prefix_sep, dict):
899 prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]
901 if data_to_encode.shape == data.shape:
902 # Encoding the entire df, do not prepend any dropped columns
903 with_dummies: List[DataFrame] = []
904 elif columns is not None:
905 # Encoding only cols specified in columns. Get all cols not in
906 # columns to prepend to result.
907 with_dummies = [data.drop(columns, axis=1)]
908 else:
909 # Encoding only object and category dtype columns. Get remaining
910 # columns to prepend to result.
911 with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]
913 for (col, pre, sep) in zip(data_to_encode.items(), prefix, prefix_sep):
914 # col is (column_name, column), use just column data here
915 dummy = _get_dummies_1d(
916 col[1],
917 prefix=pre,
918 prefix_sep=sep,
919 dummy_na=dummy_na,
920 sparse=sparse,
921 drop_first=drop_first,
922 dtype=dtype,
923 )
924 with_dummies.append(dummy)
925 result = concat(with_dummies, axis=1)
926 else:
927 result = _get_dummies_1d(
928 data,
929 prefix,
930 prefix_sep,
931 dummy_na,
932 sparse=sparse,
933 drop_first=drop_first,
934 dtype=dtype,
935 )
936 return result
939def _get_dummies_1d(
940 data,
941 prefix,
942 prefix_sep="_",
943 dummy_na=False,
944 sparse=False,
945 drop_first=False,
946 dtype=None,
947):
948 from pandas.core.reshape.concat import concat
950 # Series avoids inconsistent NaN handling
951 codes, levels = factorize_from_iterable(Series(data))
953 if dtype is None:
954 dtype = np.uint8
955 dtype = np.dtype(dtype)
957 if is_object_dtype(dtype):
958 raise ValueError("dtype=object is not a valid dtype for get_dummies")
960 def get_empty_frame(data) -> DataFrame:
961 if isinstance(data, Series):
962 index = data.index
963 else:
964 index = np.arange(len(data))
965 return DataFrame(index=index)
967 # if all NaN
968 if not dummy_na and len(levels) == 0:
969 return get_empty_frame(data)
971 codes = codes.copy()
972 if dummy_na:
973 codes[codes == -1] = len(levels)
974 levels = np.append(levels, np.nan)
976 # if dummy_na, we just fake a nan level. drop_first will drop it again
977 if drop_first and len(levels) == 1:
978 return get_empty_frame(data)
980 number_of_cols = len(levels)
982 if prefix is None:
983 dummy_cols = levels
984 else:
986 # PY2 embedded unicode, gh-22084
987 def _make_col_name(prefix, prefix_sep, level) -> str:
988 fstr = "{prefix}{prefix_sep}{level}"
989 return fstr.format(prefix=prefix, prefix_sep=prefix_sep, level=level)
991 dummy_cols = [_make_col_name(prefix, prefix_sep, level) for level in levels]
993 if isinstance(data, Series):
994 index = data.index
995 else:
996 index = None
998 if sparse:
1000 if is_integer_dtype(dtype):
1001 fill_value = 0
1002 elif dtype == bool:
1003 fill_value = False
1004 else:
1005 fill_value = 0.0
1007 sparse_series = []
1008 N = len(data)
1009 sp_indices = [[] for _ in range(len(dummy_cols))]
1010 mask = codes != -1
1011 codes = codes[mask]
1012 n_idx = np.arange(N)[mask]
1014 for ndx, code in zip(n_idx, codes):
1015 sp_indices[code].append(ndx)
1017 if drop_first:
1018 # remove first categorical level to avoid perfect collinearity
1019 # GH12042
1020 sp_indices = sp_indices[1:]
1021 dummy_cols = dummy_cols[1:]
1022 for col, ixs in zip(dummy_cols, sp_indices):
1023 sarr = SparseArray(
1024 np.ones(len(ixs), dtype=dtype),
1025 sparse_index=IntIndex(N, ixs),
1026 fill_value=fill_value,
1027 dtype=dtype,
1028 )
1029 sparse_series.append(Series(data=sarr, index=index, name=col))
1031 out = concat(sparse_series, axis=1, copy=False)
1032 return out
1034 else:
1035 dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0)
1037 if not dummy_na:
1038 # reset NaN GH4446
1039 dummy_mat[codes == -1] = 0
1041 if drop_first:
1042 # remove first GH12042
1043 dummy_mat = dummy_mat[:, 1:]
1044 dummy_cols = dummy_cols[1:]
1045 return DataFrame(dummy_mat, index=index, columns=dummy_cols)
1048def _reorder_for_extension_array_stack(arr, n_rows: int, n_columns: int):
1049 """
1050 Re-orders the values when stacking multiple extension-arrays.
1052 The indirect stacking method used for EAs requires a followup
1053 take to get the order correct.
1055 Parameters
1056 ----------
1057 arr : ExtensionArray
1058 n_rows, n_columns : int
1059 The number of rows and columns in the original DataFrame.
1061 Returns
1062 -------
1063 taken : ExtensionArray
1064 The original `arr` with elements re-ordered appropriately
1066 Examples
1067 --------
1068 >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
1069 >>> _reorder_for_extension_array_stack(arr, 2, 3)
1070 array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='<U1')
1072 >>> _reorder_for_extension_array_stack(arr, 3, 2)
1073 array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='<U1')
1074 """
1075 # final take to get the order correct.
1076 # idx is an indexer like
1077 # [c0r0, c1r0, c2r0, ...,
1078 # c0r1, c1r1, c2r1, ...]
1079 idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel()
1080 return arr.take(idx)