Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1from functools import partial 

2import itertools 

3from typing import List 

4 

5import numpy as np 

6 

7import pandas._libs.algos as libalgos 

8import pandas._libs.reshape as libreshape 

9from pandas._libs.sparse import IntIndex 

10 

11from pandas.core.dtypes.cast import maybe_promote 

12from pandas.core.dtypes.common import ( 

13 ensure_platform_int, 

14 is_bool_dtype, 

15 is_extension_array_dtype, 

16 is_integer, 

17 is_integer_dtype, 

18 is_list_like, 

19 is_object_dtype, 

20 needs_i8_conversion, 

21) 

22from pandas.core.dtypes.missing import notna 

23 

24import pandas.core.algorithms as algos 

25from pandas.core.arrays import SparseArray 

26from pandas.core.arrays.categorical import factorize_from_iterable 

27from pandas.core.construction import extract_array 

28from pandas.core.frame import DataFrame 

29from pandas.core.indexes.api import Index, MultiIndex 

30from pandas.core.series import Series 

31from pandas.core.sorting import ( 

32 compress_group_index, 

33 decons_obs_group_ids, 

34 get_compressed_ids, 

35 get_group_index, 

36) 

37 

38 

39class _Unstacker: 

40 """ 

41 Helper class to unstack data / pivot with multi-level index 

42 

43 Parameters 

44 ---------- 

45 values : ndarray 

46 Values of DataFrame to "Unstack" 

47 index : object 

48 Pandas ``Index`` 

49 level : int or str, default last level 

50 Level to "unstack". Accepts a name for the level. 

51 value_columns : Index, optional 

52 Pandas ``Index`` or ``MultiIndex`` object if unstacking a DataFrame 

53 fill_value : scalar, optional 

54 Default value to fill in missing values if subgroups do not have the 

55 same set of labels. By default, missing values will be replaced with 

56 the default fill value for that data type, NaN for float, NaT for 

57 datetimelike, etc. For integer types, by default data will converted to 

58 float and missing values will be set to NaN. 

59 constructor : object 

60 Pandas ``DataFrame`` or subclass used to create unstacked 

61 response. If None, DataFrame will be used. 

62 

63 Examples 

64 -------- 

65 >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), 

66 ... ('two', 'a'), ('two', 'b')]) 

67 >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index) 

68 >>> s 

69 one a 1 

70 b 2 

71 two a 3 

72 b 4 

73 dtype: int64 

74 

75 >>> s.unstack(level=-1) 

76 a b 

77 one 1 2 

78 two 3 4 

79 

80 >>> s.unstack(level=0) 

81 one two 

82 a 1 3 

83 b 2 4 

84 

85 Returns 

86 ------- 

87 unstacked : DataFrame 

88 """ 

89 

90 def __init__( 

91 self, 

92 values: np.ndarray, 

93 index, 

94 level=-1, 

95 value_columns=None, 

96 fill_value=None, 

97 constructor=None, 

98 ): 

99 

100 if values.ndim == 1: 

101 values = values[:, np.newaxis] 

102 self.values = values 

103 self.value_columns = value_columns 

104 self.fill_value = fill_value 

105 

106 if constructor is None: 

107 constructor = DataFrame 

108 self.constructor = constructor 

109 

110 if value_columns is None and values.shape[1] != 1: # pragma: no cover 

111 raise ValueError("must pass column labels for multi-column data") 

112 

113 self.index = index.remove_unused_levels() 

114 

115 self.level = self.index._get_level_number(level) 

116 

117 # when index includes `nan`, need to lift levels/strides by 1 

118 self.lift = 1 if -1 in self.index.codes[self.level] else 0 

119 

120 self.new_index_levels = list(self.index.levels) 

121 self.new_index_names = list(self.index.names) 

122 

123 self.removed_name = self.new_index_names.pop(self.level) 

124 self.removed_level = self.new_index_levels.pop(self.level) 

125 self.removed_level_full = index.levels[self.level] 

126 

127 # Bug fix GH 20601 

128 # If the data frame is too big, the number of unique index combination 

129 # will cause int32 overflow on windows environments. 

130 # We want to check and raise an error before this happens 

131 num_rows = np.max([index_level.size for index_level in self.new_index_levels]) 

132 num_columns = self.removed_level.size 

133 

134 # GH20601: This forces an overflow if the number of cells is too high. 

135 num_cells = np.multiply(num_rows, num_columns, dtype=np.int32) 

136 

137 if num_rows > 0 and num_columns > 0 and num_cells <= 0: 

138 raise ValueError("Unstacked DataFrame is too big, causing int32 overflow") 

139 

140 self._make_sorted_values_labels() 

141 self._make_selectors() 

142 

143 def _make_sorted_values_labels(self): 

144 v = self.level 

145 

146 codes = list(self.index.codes) 

147 levs = list(self.index.levels) 

148 to_sort = codes[:v] + codes[v + 1 :] + [codes[v]] 

149 sizes = [len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]] 

150 

151 comp_index, obs_ids = get_compressed_ids(to_sort, sizes) 

152 ngroups = len(obs_ids) 

153 

154 indexer = libalgos.groupsort_indexer(comp_index, ngroups)[0] 

155 indexer = ensure_platform_int(indexer) 

156 

157 self.sorted_values = algos.take_nd(self.values, indexer, axis=0) 

158 self.sorted_labels = [l.take(indexer) for l in to_sort] 

159 

160 def _make_selectors(self): 

161 new_levels = self.new_index_levels 

162 

163 # make the mask 

164 remaining_labels = self.sorted_labels[:-1] 

165 level_sizes = [len(x) for x in new_levels] 

166 

167 comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes) 

168 ngroups = len(obs_ids) 

169 

170 comp_index = ensure_platform_int(comp_index) 

171 stride = self.index.levshape[self.level] + self.lift 

172 self.full_shape = ngroups, stride 

173 

174 selector = self.sorted_labels[-1] + stride * comp_index + self.lift 

175 mask = np.zeros(np.prod(self.full_shape), dtype=bool) 

176 mask.put(selector, True) 

177 

178 if mask.sum() < len(self.index): 

179 raise ValueError("Index contains duplicate entries, cannot reshape") 

180 

181 self.group_index = comp_index 

182 self.mask = mask 

183 self.unique_groups = obs_ids 

184 self.compressor = comp_index.searchsorted(np.arange(ngroups)) 

185 

186 def get_result(self): 

187 values, _ = self.get_new_values() 

188 columns = self.get_new_columns() 

189 index = self.get_new_index() 

190 

191 return self.constructor(values, index=index, columns=columns) 

192 

193 def get_new_values(self): 

194 values = self.values 

195 

196 # place the values 

197 length, width = self.full_shape 

198 stride = values.shape[1] 

199 result_width = width * stride 

200 result_shape = (length, result_width) 

201 mask = self.mask 

202 mask_all = mask.all() 

203 

204 # we can simply reshape if we don't have a mask 

205 if mask_all and len(values): 

206 new_values = ( 

207 self.sorted_values.reshape(length, width, stride) 

208 .swapaxes(1, 2) 

209 .reshape(result_shape) 

210 ) 

211 new_mask = np.ones(result_shape, dtype=bool) 

212 return new_values, new_mask 

213 

214 # if our mask is all True, then we can use our existing dtype 

215 if mask_all: 

216 dtype = values.dtype 

217 new_values = np.empty(result_shape, dtype=dtype) 

218 else: 

219 dtype, fill_value = maybe_promote(values.dtype, self.fill_value) 

220 new_values = np.empty(result_shape, dtype=dtype) 

221 new_values.fill(fill_value) 

222 

223 new_mask = np.zeros(result_shape, dtype=bool) 

224 

225 name = np.dtype(dtype).name 

226 sorted_values = self.sorted_values 

227 

228 # we need to convert to a basic dtype 

229 # and possibly coerce an input to our output dtype 

230 # e.g. ints -> floats 

231 if needs_i8_conversion(values): 

232 sorted_values = sorted_values.view("i8") 

233 new_values = new_values.view("i8") 

234 elif is_bool_dtype(values): 

235 sorted_values = sorted_values.astype("object") 

236 new_values = new_values.astype("object") 

237 else: 

238 sorted_values = sorted_values.astype(name, copy=False) 

239 

240 # fill in our values & mask 

241 libreshape.unstack( 

242 sorted_values, 

243 mask.view("u1"), 

244 stride, 

245 length, 

246 width, 

247 new_values, 

248 new_mask.view("u1"), 

249 ) 

250 

251 # reconstruct dtype if needed 

252 if needs_i8_conversion(values): 

253 new_values = new_values.view(values.dtype) 

254 

255 return new_values, new_mask 

256 

257 def get_new_columns(self): 

258 if self.value_columns is None: 

259 if self.lift == 0: 

260 return self.removed_level._shallow_copy(name=self.removed_name) 

261 

262 lev = self.removed_level.insert(0, item=self.removed_level._na_value) 

263 return lev.rename(self.removed_name) 

264 

265 stride = len(self.removed_level) + self.lift 

266 width = len(self.value_columns) 

267 propagator = np.repeat(np.arange(width), stride) 

268 if isinstance(self.value_columns, MultiIndex): 

269 new_levels = self.value_columns.levels + (self.removed_level_full,) 

270 new_names = self.value_columns.names + (self.removed_name,) 

271 

272 new_codes = [lab.take(propagator) for lab in self.value_columns.codes] 

273 else: 

274 new_levels = [self.value_columns, self.removed_level_full] 

275 new_names = [self.value_columns.name, self.removed_name] 

276 new_codes = [propagator] 

277 

278 # The two indices differ only if the unstacked level had unused items: 

279 if len(self.removed_level_full) != len(self.removed_level): 

280 # In this case, we remap the new codes to the original level: 

281 repeater = self.removed_level_full.get_indexer(self.removed_level) 

282 if self.lift: 

283 repeater = np.insert(repeater, 0, -1) 

284 else: 

285 # Otherwise, we just use each level item exactly once: 

286 repeater = np.arange(stride) - self.lift 

287 

288 # The entire level is then just a repetition of the single chunk: 

289 new_codes.append(np.tile(repeater, width)) 

290 return MultiIndex( 

291 levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False 

292 ) 

293 

294 def get_new_index(self): 

295 result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]] 

296 

297 # construct the new index 

298 if len(self.new_index_levels) == 1: 

299 level, level_codes = self.new_index_levels[0], result_codes[0] 

300 if (level_codes == -1).any(): 

301 level = level.insert(len(level), level._na_value) 

302 return level.take(level_codes).rename(self.new_index_names[0]) 

303 

304 return MultiIndex( 

305 levels=self.new_index_levels, 

306 codes=result_codes, 

307 names=self.new_index_names, 

308 verify_integrity=False, 

309 ) 

310 

311 

312def _unstack_multiple(data, clocs, fill_value=None): 

313 if len(clocs) == 0: 

314 return data 

315 

316 # NOTE: This doesn't deal with hierarchical columns yet 

317 

318 index = data.index 

319 

320 clocs = [index._get_level_number(i) for i in clocs] 

321 

322 rlocs = [i for i in range(index.nlevels) if i not in clocs] 

323 

324 clevels = [index.levels[i] for i in clocs] 

325 ccodes = [index.codes[i] for i in clocs] 

326 cnames = [index.names[i] for i in clocs] 

327 rlevels = [index.levels[i] for i in rlocs] 

328 rcodes = [index.codes[i] for i in rlocs] 

329 rnames = [index.names[i] for i in rlocs] 

330 

331 shape = [len(x) for x in clevels] 

332 group_index = get_group_index(ccodes, shape, sort=False, xnull=False) 

333 

334 comp_ids, obs_ids = compress_group_index(group_index, sort=False) 

335 recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False) 

336 

337 if rlocs == []: 

338 # Everything is in clocs, so the dummy df has a regular index 

339 dummy_index = Index(obs_ids, name="__placeholder__") 

340 else: 

341 dummy_index = MultiIndex( 

342 levels=rlevels + [obs_ids], 

343 codes=rcodes + [comp_ids], 

344 names=rnames + ["__placeholder__"], 

345 verify_integrity=False, 

346 ) 

347 

348 if isinstance(data, Series): 

349 dummy = data.copy() 

350 dummy.index = dummy_index 

351 

352 unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) 

353 new_levels = clevels 

354 new_names = cnames 

355 new_codes = recons_codes 

356 else: 

357 if isinstance(data.columns, MultiIndex): 

358 result = data 

359 for i in range(len(clocs)): 

360 val = clocs[i] 

361 result = result.unstack(val, fill_value=fill_value) 

362 clocs = [v if i > v else v - 1 for v in clocs] 

363 

364 return result 

365 

366 dummy = data.copy() 

367 dummy.index = dummy_index 

368 

369 unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) 

370 if isinstance(unstacked, Series): 

371 unstcols = unstacked.index 

372 else: 

373 unstcols = unstacked.columns 

374 new_levels = [unstcols.levels[0]] + clevels 

375 new_names = [data.columns.name] + cnames 

376 

377 new_codes = [unstcols.codes[0]] 

378 for rec in recons_codes: 

379 new_codes.append(rec.take(unstcols.codes[-1])) 

380 

381 new_columns = MultiIndex( 

382 levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False 

383 ) 

384 

385 if isinstance(unstacked, Series): 

386 unstacked.index = new_columns 

387 else: 

388 unstacked.columns = new_columns 

389 

390 return unstacked 

391 

392 

393def unstack(obj, level, fill_value=None): 

394 if isinstance(level, (tuple, list)): 

395 if len(level) != 1: 

396 # _unstack_multiple only handles MultiIndexes, 

397 # and isn't needed for a single level 

398 return _unstack_multiple(obj, level, fill_value=fill_value) 

399 else: 

400 level = level[0] 

401 

402 # Prioritize integer interpretation (GH #21677): 

403 if not is_integer(level) and not level == "__placeholder__": 

404 level = obj.index._get_level_number(level) 

405 

406 if isinstance(obj, DataFrame): 

407 if isinstance(obj.index, MultiIndex): 

408 return _unstack_frame(obj, level, fill_value=fill_value) 

409 else: 

410 return obj.T.stack(dropna=False) 

411 else: 

412 if is_extension_array_dtype(obj.dtype): 

413 return _unstack_extension_series(obj, level, fill_value) 

414 unstacker = _Unstacker( 

415 obj.values, 

416 obj.index, 

417 level=level, 

418 fill_value=fill_value, 

419 constructor=obj._constructor_expanddim, 

420 ) 

421 return unstacker.get_result() 

422 

423 

424def _unstack_frame(obj, level, fill_value=None): 

425 if obj._is_mixed_type: 

426 unstacker = partial( 

427 _Unstacker, index=obj.index, level=level, fill_value=fill_value 

428 ) 

429 blocks = obj._data.unstack(unstacker, fill_value=fill_value) 

430 return obj._constructor(blocks) 

431 else: 

432 unstacker = _Unstacker( 

433 obj.values, 

434 obj.index, 

435 level=level, 

436 value_columns=obj.columns, 

437 fill_value=fill_value, 

438 constructor=obj._constructor, 

439 ) 

440 return unstacker.get_result() 

441 

442 

443def _unstack_extension_series(series, level, fill_value): 

444 """ 

445 Unstack an ExtensionArray-backed Series. 

446 

447 The ExtensionDtype is preserved. 

448 

449 Parameters 

450 ---------- 

451 series : Series 

452 A Series with an ExtensionArray for values 

453 level : Any 

454 The level name or number. 

455 fill_value : Any 

456 The user-level (not physical storage) fill value to use for 

457 missing values introduced by the reshape. Passed to 

458 ``series.values.take``. 

459 

460 Returns 

461 ------- 

462 DataFrame 

463 Each column of the DataFrame will have the same dtype as 

464 the input Series. 

465 """ 

466 # Implementation note: the basic idea is to 

467 # 1. Do a regular unstack on a dummy array of integers 

468 # 2. Followup with a columnwise take. 

469 # We use the dummy take to discover newly-created missing values 

470 # introduced by the reshape. 

471 from pandas.core.reshape.concat import concat 

472 

473 dummy_arr = np.arange(len(series)) 

474 # fill_value=-1, since we will do a series.values.take later 

475 result = _Unstacker( 

476 dummy_arr, series.index, level=level, fill_value=-1 

477 ).get_result() 

478 

479 out = [] 

480 values = extract_array(series, extract_numpy=False) 

481 

482 for col, indices in result.items(): 

483 out.append( 

484 Series( 

485 values.take(indices.values, allow_fill=True, fill_value=fill_value), 

486 name=col, 

487 index=result.index, 

488 ) 

489 ) 

490 return concat(out, axis="columns", copy=False, keys=result.columns) 

491 

492 

493def stack(frame, level=-1, dropna=True): 

494 """ 

495 Convert DataFrame to Series with multi-level Index. Columns become the 

496 second level of the resulting hierarchical index 

497 

498 Returns 

499 ------- 

500 stacked : Series 

501 """ 

502 

503 def factorize(index): 

504 if index.is_unique: 

505 return index, np.arange(len(index)) 

506 codes, categories = factorize_from_iterable(index) 

507 return categories, codes 

508 

509 N, K = frame.shape 

510 

511 # Will also convert negative level numbers and check if out of bounds. 

512 level_num = frame.columns._get_level_number(level) 

513 

514 if isinstance(frame.columns, MultiIndex): 

515 return _stack_multi_columns(frame, level_num=level_num, dropna=dropna) 

516 elif isinstance(frame.index, MultiIndex): 

517 new_levels = list(frame.index.levels) 

518 new_codes = [lab.repeat(K) for lab in frame.index.codes] 

519 

520 clev, clab = factorize(frame.columns) 

521 new_levels.append(clev) 

522 new_codes.append(np.tile(clab, N).ravel()) 

523 

524 new_names = list(frame.index.names) 

525 new_names.append(frame.columns.name) 

526 new_index = MultiIndex( 

527 levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False 

528 ) 

529 else: 

530 levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns))) 

531 codes = ilab.repeat(K), np.tile(clab, N).ravel() 

532 new_index = MultiIndex( 

533 levels=levels, 

534 codes=codes, 

535 names=[frame.index.name, frame.columns.name], 

536 verify_integrity=False, 

537 ) 

538 

539 if frame._is_homogeneous_type: 

540 # For homogeneous EAs, frame.values will coerce to object. So 

541 # we concatenate instead. 

542 dtypes = list(frame.dtypes.values) 

543 dtype = dtypes[0] 

544 

545 if is_extension_array_dtype(dtype): 

546 arr = dtype.construct_array_type() 

547 new_values = arr._concat_same_type( 

548 [col._values for _, col in frame.items()] 

549 ) 

550 new_values = _reorder_for_extension_array_stack(new_values, N, K) 

551 else: 

552 # homogeneous, non-EA 

553 new_values = frame.values.ravel() 

554 

555 else: 

556 # non-homogeneous 

557 new_values = frame.values.ravel() 

558 

559 if dropna: 

560 mask = notna(new_values) 

561 new_values = new_values[mask] 

562 new_index = new_index[mask] 

563 

564 return frame._constructor_sliced(new_values, index=new_index) 

565 

566 

567def stack_multiple(frame, level, dropna=True): 

568 # If all passed levels match up to column names, no 

569 # ambiguity about what to do 

570 if all(lev in frame.columns.names for lev in level): 

571 result = frame 

572 for lev in level: 

573 result = stack(result, lev, dropna=dropna) 

574 

575 # Otherwise, level numbers may change as each successive level is stacked 

576 elif all(isinstance(lev, int) for lev in level): 

577 # As each stack is done, the level numbers decrease, so we need 

578 # to account for that when level is a sequence of ints 

579 result = frame 

580 # _get_level_number() checks level numbers are in range and converts 

581 # negative numbers to positive 

582 level = [frame.columns._get_level_number(lev) for lev in level] 

583 

584 # Can't iterate directly through level as we might need to change 

585 # values as we go 

586 for index in range(len(level)): 

587 lev = level[index] 

588 result = stack(result, lev, dropna=dropna) 

589 # Decrement all level numbers greater than current, as these 

590 # have now shifted down by one 

591 updated_level = [] 

592 for other in level: 

593 if other > lev: 

594 updated_level.append(other - 1) 

595 else: 

596 updated_level.append(other) 

597 level = updated_level 

598 

599 else: 

600 raise ValueError( 

601 "level should contain all level names or all level " 

602 "numbers, not a mixture of the two." 

603 ) 

604 

605 return result 

606 

607 

608def _stack_multi_columns(frame, level_num=-1, dropna=True): 

609 def _convert_level_number(level_num, columns): 

610 """ 

611 Logic for converting the level number to something we can safely pass 

612 to swaplevel: 

613 

614 We generally want to convert the level number into a level name, except 

615 when columns do not have names, in which case we must leave as a level 

616 number 

617 """ 

618 if level_num in columns.names: 

619 return columns.names[level_num] 

620 else: 

621 if columns.names[level_num] is None: 

622 return level_num 

623 else: 

624 return columns.names[level_num] 

625 

626 this = frame.copy() 

627 

628 # this makes life much simpler 

629 if level_num != frame.columns.nlevels - 1: 

630 # roll levels to put selected level at end 

631 roll_columns = this.columns 

632 for i in range(level_num, frame.columns.nlevels - 1): 

633 # Need to check if the ints conflict with level names 

634 lev1 = _convert_level_number(i, roll_columns) 

635 lev2 = _convert_level_number(i + 1, roll_columns) 

636 roll_columns = roll_columns.swaplevel(lev1, lev2) 

637 this.columns = roll_columns 

638 

639 if not this.columns.is_lexsorted(): 

640 # Workaround the edge case where 0 is one of the column names, 

641 # which interferes with trying to sort based on the first 

642 # level 

643 level_to_sort = _convert_level_number(0, this.columns) 

644 this = this.sort_index(level=level_to_sort, axis=1) 

645 

646 # tuple list excluding level for grouping columns 

647 if len(frame.columns.levels) > 2: 

648 tuples = list( 

649 zip( 

650 *[ 

651 lev.take(level_codes) 

652 for lev, level_codes in zip( 

653 this.columns.levels[:-1], this.columns.codes[:-1] 

654 ) 

655 ] 

656 ) 

657 ) 

658 unique_groups = [key for key, _ in itertools.groupby(tuples)] 

659 new_names = this.columns.names[:-1] 

660 new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) 

661 else: 

662 new_columns = this.columns.levels[0]._shallow_copy(name=this.columns.names[0]) 

663 unique_groups = new_columns 

664 

665 # time to ravel the values 

666 new_data = {} 

667 level_vals = this.columns.levels[-1] 

668 level_codes = sorted(set(this.columns.codes[-1])) 

669 level_vals_used = level_vals[level_codes] 

670 levsize = len(level_codes) 

671 drop_cols = [] 

672 for key in unique_groups: 

673 try: 

674 loc = this.columns.get_loc(key) 

675 except KeyError: 

676 drop_cols.append(key) 

677 continue 

678 

679 # can make more efficient? 

680 # we almost always return a slice 

681 # but if unsorted can get a boolean 

682 # indexer 

683 if not isinstance(loc, slice): 

684 slice_len = len(loc) 

685 else: 

686 slice_len = loc.stop - loc.start 

687 

688 if slice_len != levsize: 

689 chunk = this.loc[:, this.columns[loc]] 

690 chunk.columns = level_vals.take(chunk.columns.codes[-1]) 

691 value_slice = chunk.reindex(columns=level_vals_used).values 

692 else: 

693 if frame._is_homogeneous_type and is_extension_array_dtype( 

694 frame.dtypes.iloc[0] 

695 ): 

696 dtype = this[this.columns[loc]].dtypes.iloc[0] 

697 subset = this[this.columns[loc]] 

698 

699 value_slice = dtype.construct_array_type()._concat_same_type( 

700 [x._values for _, x in subset.items()] 

701 ) 

702 N, K = this.shape 

703 idx = np.arange(N * K).reshape(K, N).T.ravel() 

704 value_slice = value_slice.take(idx) 

705 

706 elif frame._is_mixed_type: 

707 value_slice = this[this.columns[loc]].values 

708 else: 

709 value_slice = this.values[:, loc] 

710 

711 if value_slice.ndim > 1: 

712 # i.e. not extension 

713 value_slice = value_slice.ravel() 

714 

715 new_data[key] = value_slice 

716 

717 if len(drop_cols) > 0: 

718 new_columns = new_columns.difference(drop_cols) 

719 

720 N = len(this) 

721 

722 if isinstance(this.index, MultiIndex): 

723 new_levels = list(this.index.levels) 

724 new_names = list(this.index.names) 

725 new_codes = [lab.repeat(levsize) for lab in this.index.codes] 

726 else: 

727 old_codes, old_levels = factorize_from_iterable(this.index) 

728 new_levels = [old_levels] 

729 new_codes = [old_codes.repeat(levsize)] 

730 new_names = [this.index.name] # something better? 

731 

732 new_levels.append(level_vals) 

733 new_codes.append(np.tile(level_codes, N)) 

734 new_names.append(frame.columns.names[level_num]) 

735 

736 new_index = MultiIndex( 

737 levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False 

738 ) 

739 

740 result = frame._constructor(new_data, index=new_index, columns=new_columns) 

741 

742 # more efficient way to go about this? can do the whole masking biz but 

743 # will only save a small amount of time... 

744 if dropna: 

745 result = result.dropna(axis=0, how="all") 

746 

747 return result 

748 

749 

750def get_dummies( 

751 data, 

752 prefix=None, 

753 prefix_sep="_", 

754 dummy_na=False, 

755 columns=None, 

756 sparse=False, 

757 drop_first=False, 

758 dtype=None, 

759) -> "DataFrame": 

760 """ 

761 Convert categorical variable into dummy/indicator variables. 

762 

763 Parameters 

764 ---------- 

765 data : array-like, Series, or DataFrame 

766 Data of which to get dummy indicators. 

767 prefix : str, list of str, or dict of str, default None 

768 String to append DataFrame column names. 

769 Pass a list with length equal to the number of columns 

770 when calling get_dummies on a DataFrame. Alternatively, `prefix` 

771 can be a dictionary mapping column names to prefixes. 

772 prefix_sep : str, default '_' 

773 If appending prefix, separator/delimiter to use. Or pass a 

774 list or dictionary as with `prefix`. 

775 dummy_na : bool, default False 

776 Add a column to indicate NaNs, if False NaNs are ignored. 

777 columns : list-like, default None 

778 Column names in the DataFrame to be encoded. 

779 If `columns` is None then all the columns with 

780 `object` or `category` dtype will be converted. 

781 sparse : bool, default False 

782 Whether the dummy-encoded columns should be backed by 

783 a :class:`SparseArray` (True) or a regular NumPy array (False). 

784 drop_first : bool, default False 

785 Whether to get k-1 dummies out of k categorical levels by removing the 

786 first level. 

787 dtype : dtype, default np.uint8 

788 Data type for new columns. Only a single dtype is allowed. 

789 

790 .. versionadded:: 0.23.0 

791 

792 Returns 

793 ------- 

794 DataFrame 

795 Dummy-coded data. 

796 

797 See Also 

798 -------- 

799 Series.str.get_dummies : Convert Series to dummy codes. 

800 

801 Examples 

802 -------- 

803 >>> s = pd.Series(list('abca')) 

804 

805 >>> pd.get_dummies(s) 

806 a b c 

807 0 1 0 0 

808 1 0 1 0 

809 2 0 0 1 

810 3 1 0 0 

811 

812 >>> s1 = ['a', 'b', np.nan] 

813 

814 >>> pd.get_dummies(s1) 

815 a b 

816 0 1 0 

817 1 0 1 

818 2 0 0 

819 

820 >>> pd.get_dummies(s1, dummy_na=True) 

821 a b NaN 

822 0 1 0 0 

823 1 0 1 0 

824 2 0 0 1 

825 

826 >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], 

827 ... 'C': [1, 2, 3]}) 

828 

829 >>> pd.get_dummies(df, prefix=['col1', 'col2']) 

830 C col1_a col1_b col2_a col2_b col2_c 

831 0 1 1 0 0 1 0 

832 1 2 0 1 1 0 0 

833 2 3 1 0 0 0 1 

834 

835 >>> pd.get_dummies(pd.Series(list('abcaa'))) 

836 a b c 

837 0 1 0 0 

838 1 0 1 0 

839 2 0 0 1 

840 3 1 0 0 

841 4 1 0 0 

842 

843 >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) 

844 b c 

845 0 0 0 

846 1 1 0 

847 2 0 1 

848 3 0 0 

849 4 0 0 

850 

851 >>> pd.get_dummies(pd.Series(list('abc')), dtype=float) 

852 a b c 

853 0 1.0 0.0 0.0 

854 1 0.0 1.0 0.0 

855 2 0.0 0.0 1.0 

856 """ 

857 from pandas.core.reshape.concat import concat 

858 

859 dtypes_to_encode = ["object", "category"] 

860 

861 if isinstance(data, DataFrame): 

862 # determine columns being encoded 

863 if columns is None: 

864 data_to_encode = data.select_dtypes(include=dtypes_to_encode) 

865 elif not is_list_like(columns): 

866 raise TypeError("Input must be a list-like for parameter `columns`") 

867 else: 

868 data_to_encode = data[columns] 

869 

870 # validate prefixes and separator to avoid silently dropping cols 

871 def check_len(item, name): 

872 len_msg = ( 

873 "Length of '{name}' ({len_item}) did not match the " 

874 "length of the columns being encoded ({len_enc})." 

875 ) 

876 

877 if is_list_like(item): 

878 if not len(item) == data_to_encode.shape[1]: 

879 len_msg = len_msg.format( 

880 name=name, len_item=len(item), len_enc=data_to_encode.shape[1] 

881 ) 

882 raise ValueError(len_msg) 

883 

884 check_len(prefix, "prefix") 

885 check_len(prefix_sep, "prefix_sep") 

886 

887 if isinstance(prefix, str): 

888 prefix = itertools.cycle([prefix]) 

889 if isinstance(prefix, dict): 

890 prefix = [prefix[col] for col in data_to_encode.columns] 

891 

892 if prefix is None: 

893 prefix = data_to_encode.columns 

894 

895 # validate separators 

896 if isinstance(prefix_sep, str): 

897 prefix_sep = itertools.cycle([prefix_sep]) 

898 elif isinstance(prefix_sep, dict): 

899 prefix_sep = [prefix_sep[col] for col in data_to_encode.columns] 

900 

901 if data_to_encode.shape == data.shape: 

902 # Encoding the entire df, do not prepend any dropped columns 

903 with_dummies: List[DataFrame] = [] 

904 elif columns is not None: 

905 # Encoding only cols specified in columns. Get all cols not in 

906 # columns to prepend to result. 

907 with_dummies = [data.drop(columns, axis=1)] 

908 else: 

909 # Encoding only object and category dtype columns. Get remaining 

910 # columns to prepend to result. 

911 with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)] 

912 

913 for (col, pre, sep) in zip(data_to_encode.items(), prefix, prefix_sep): 

914 # col is (column_name, column), use just column data here 

915 dummy = _get_dummies_1d( 

916 col[1], 

917 prefix=pre, 

918 prefix_sep=sep, 

919 dummy_na=dummy_na, 

920 sparse=sparse, 

921 drop_first=drop_first, 

922 dtype=dtype, 

923 ) 

924 with_dummies.append(dummy) 

925 result = concat(with_dummies, axis=1) 

926 else: 

927 result = _get_dummies_1d( 

928 data, 

929 prefix, 

930 prefix_sep, 

931 dummy_na, 

932 sparse=sparse, 

933 drop_first=drop_first, 

934 dtype=dtype, 

935 ) 

936 return result 

937 

938 

939def _get_dummies_1d( 

940 data, 

941 prefix, 

942 prefix_sep="_", 

943 dummy_na=False, 

944 sparse=False, 

945 drop_first=False, 

946 dtype=None, 

947): 

948 from pandas.core.reshape.concat import concat 

949 

950 # Series avoids inconsistent NaN handling 

951 codes, levels = factorize_from_iterable(Series(data)) 

952 

953 if dtype is None: 

954 dtype = np.uint8 

955 dtype = np.dtype(dtype) 

956 

957 if is_object_dtype(dtype): 

958 raise ValueError("dtype=object is not a valid dtype for get_dummies") 

959 

960 def get_empty_frame(data) -> DataFrame: 

961 if isinstance(data, Series): 

962 index = data.index 

963 else: 

964 index = np.arange(len(data)) 

965 return DataFrame(index=index) 

966 

967 # if all NaN 

968 if not dummy_na and len(levels) == 0: 

969 return get_empty_frame(data) 

970 

971 codes = codes.copy() 

972 if dummy_na: 

973 codes[codes == -1] = len(levels) 

974 levels = np.append(levels, np.nan) 

975 

976 # if dummy_na, we just fake a nan level. drop_first will drop it again 

977 if drop_first and len(levels) == 1: 

978 return get_empty_frame(data) 

979 

980 number_of_cols = len(levels) 

981 

982 if prefix is None: 

983 dummy_cols = levels 

984 else: 

985 

986 # PY2 embedded unicode, gh-22084 

987 def _make_col_name(prefix, prefix_sep, level) -> str: 

988 fstr = "{prefix}{prefix_sep}{level}" 

989 return fstr.format(prefix=prefix, prefix_sep=prefix_sep, level=level) 

990 

991 dummy_cols = [_make_col_name(prefix, prefix_sep, level) for level in levels] 

992 

993 if isinstance(data, Series): 

994 index = data.index 

995 else: 

996 index = None 

997 

998 if sparse: 

999 

1000 if is_integer_dtype(dtype): 

1001 fill_value = 0 

1002 elif dtype == bool: 

1003 fill_value = False 

1004 else: 

1005 fill_value = 0.0 

1006 

1007 sparse_series = [] 

1008 N = len(data) 

1009 sp_indices = [[] for _ in range(len(dummy_cols))] 

1010 mask = codes != -1 

1011 codes = codes[mask] 

1012 n_idx = np.arange(N)[mask] 

1013 

1014 for ndx, code in zip(n_idx, codes): 

1015 sp_indices[code].append(ndx) 

1016 

1017 if drop_first: 

1018 # remove first categorical level to avoid perfect collinearity 

1019 # GH12042 

1020 sp_indices = sp_indices[1:] 

1021 dummy_cols = dummy_cols[1:] 

1022 for col, ixs in zip(dummy_cols, sp_indices): 

1023 sarr = SparseArray( 

1024 np.ones(len(ixs), dtype=dtype), 

1025 sparse_index=IntIndex(N, ixs), 

1026 fill_value=fill_value, 

1027 dtype=dtype, 

1028 ) 

1029 sparse_series.append(Series(data=sarr, index=index, name=col)) 

1030 

1031 out = concat(sparse_series, axis=1, copy=False) 

1032 return out 

1033 

1034 else: 

1035 dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0) 

1036 

1037 if not dummy_na: 

1038 # reset NaN GH4446 

1039 dummy_mat[codes == -1] = 0 

1040 

1041 if drop_first: 

1042 # remove first GH12042 

1043 dummy_mat = dummy_mat[:, 1:] 

1044 dummy_cols = dummy_cols[1:] 

1045 return DataFrame(dummy_mat, index=index, columns=dummy_cols) 

1046 

1047 

1048def _reorder_for_extension_array_stack(arr, n_rows: int, n_columns: int): 

1049 """ 

1050 Re-orders the values when stacking multiple extension-arrays. 

1051 

1052 The indirect stacking method used for EAs requires a followup 

1053 take to get the order correct. 

1054 

1055 Parameters 

1056 ---------- 

1057 arr : ExtensionArray 

1058 n_rows, n_columns : int 

1059 The number of rows and columns in the original DataFrame. 

1060 

1061 Returns 

1062 ------- 

1063 taken : ExtensionArray 

1064 The original `arr` with elements re-ordered appropriately 

1065 

1066 Examples 

1067 -------- 

1068 >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f']) 

1069 >>> _reorder_for_extension_array_stack(arr, 2, 3) 

1070 array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='<U1') 

1071 

1072 >>> _reorder_for_extension_array_stack(arr, 3, 2) 

1073 array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='<U1') 

1074 """ 

1075 # final take to get the order correct. 

1076 # idx is an indexer like 

1077 # [c0r0, c1r0, c2r0, ..., 

1078 # c0r1, c1r1, c2r1, ...] 

1079 idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel() 

1080 return arr.take(idx)