Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/core/reshape/concat.py : 10%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2concat routines
3"""
5from typing import Hashable, Iterable, List, Mapping, Optional, Union, overload
7import numpy as np
9from pandas._typing import FrameOrSeriesUnion
11from pandas import DataFrame, Index, MultiIndex, Series
12from pandas.core.arrays.categorical import (
13 factorize_from_iterable,
14 factorize_from_iterables,
15)
16import pandas.core.common as com
17from pandas.core.generic import NDFrame
18from pandas.core.indexes.api import (
19 all_indexes_same,
20 ensure_index,
21 get_consensus_names,
22 get_objs_combined_axis,
23)
24import pandas.core.indexes.base as ibase
25from pandas.core.internals import concatenate_block_managers
27# ---------------------------------------------------------------------
28# Concatenate DataFrame objects
@overload
def concat(
    objs: Union[Iterable["DataFrame"], Mapping[Optional[Hashable], "DataFrame"]],
    axis=0,
    join: str = "outer",
    ignore_index: bool = False,
    keys=None,
    levels=None,
    names=None,
    verify_integrity: bool = False,
    sort: bool = False,
    copy: bool = True,
) -> "DataFrame":
    # Typing-only overload: when every input object is a DataFrame, the
    # result is statically known to be a DataFrame.
    ...
@overload
def concat(
    objs: Union[
        Iterable[FrameOrSeriesUnion], Mapping[Optional[Hashable], FrameOrSeriesUnion]
    ],
    axis=0,
    join: str = "outer",
    ignore_index: bool = False,
    keys=None,
    levels=None,
    names=None,
    verify_integrity: bool = False,
    sort: bool = False,
    copy: bool = True,
) -> FrameOrSeriesUnion:
    # Typing-only overload: a mix of Series and DataFrame inputs may produce
    # either a Series or a DataFrame, depending on the inputs and `axis`.
    ...
def concat(
    objs: Union[
        Iterable[FrameOrSeriesUnion], Mapping[Optional[Hashable], FrameOrSeriesUnion]
    ],
    axis=0,
    join="outer",
    ignore_index: bool = False,
    keys=None,
    levels=None,
    names=None,
    verify_integrity: bool = False,
    sort: bool = False,
    copy: bool = True,
) -> FrameOrSeriesUnion:
    """
    Concatenate pandas objects along a particular axis with optional set logic
    along the other axes.

    Can also add a layer of hierarchical indexing on the concatenation axis,
    which may be useful if the labels are the same (or overlapping) on
    the passed axis number.

    Parameters
    ----------
    objs : a sequence or mapping of Series or DataFrame objects
        If a dict is passed, the sorted keys will be used as the `keys`
        argument, unless it is passed, in which case the values will be
        selected. Any None objects will be dropped silently unless
        they are all None, in which case a ValueError will be raised.
    axis : {0/'index', 1/'columns'}, default 0
        The axis to concatenate along.
    join : {'inner', 'outer'}, default 'outer'
        How to handle indexes on the other axis (or axes).
    ignore_index : bool, default False
        If True, do not use the index values along the concatenation axis.
        The resulting axis will be labeled 0, ..., n - 1. Note the index
        values on the other axes are still respected in the join.
    keys : sequence, default None
        If multiple levels passed, should contain tuples. Construct
        hierarchical index using the passed keys as the outermost level.
    levels : list of sequences, default None
        Specific levels (unique values) to use for constructing a
        MultiIndex. Otherwise they will be inferred from the keys.
    names : list, default None
        Names for the levels in the resulting hierarchical index.
    verify_integrity : bool, default False
        Check whether the new concatenated axis contains duplicates. This can
        be very expensive relative to the actual data concatenation.
    sort : bool, default False
        Sort non-concatenation axis if it is not already aligned when `join`
        is 'outer'. This has no effect when ``join='inner'``, which already
        preserves the order of the non-concatenation axis.

        .. versionadded:: 0.23.0
        .. versionchanged:: 1.0.0
           Changed to not sort by default.
    copy : bool, default True
        If False, do not copy data unnecessarily.

    Returns
    -------
    object, type of objs
        When concatenating all ``Series`` along the index (axis=0), a
        ``Series`` is returned. When ``objs`` contains at least one
        ``DataFrame``, a ``DataFrame`` is returned. When concatenating along
        the columns (axis=1), a ``DataFrame`` is returned.

    See Also
    --------
    Series.append : Concatenate Series.
    DataFrame.append : Concatenate DataFrames.
    DataFrame.join : Join DataFrames using indexes.
    DataFrame.merge : Merge DataFrames by indexes or columns.

    Examples
    --------
    >>> s1 = pd.Series(['a', 'b'])
    >>> s2 = pd.Series(['c', 'd'])
    >>> pd.concat([s1, s2], ignore_index=True)
    0    a
    1    b
    2    c
    3    d
    dtype: object

    >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
    ...                    columns=['letter', 'number'])
    >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
    ...                    columns=['letter', 'number'])
    >>> pd.concat([df1, df2])
      letter  number
    0      a       1
    1      b       2
    0      c       3
    1      d       4
    """
    # All of the real work happens in _Concatenator; this function is the
    # user-facing shim that forwards every keyword through unchanged.
    return _Concatenator(
        objs,
        axis=axis,
        join=join,
        keys=keys,
        levels=levels,
        names=names,
        ignore_index=ignore_index,
        verify_integrity=verify_integrity,
        sort=sort,
        copy=copy,
    ).get_result()
class _Concatenator:
    """
    Orchestrates a concatenation operation for BlockManagers.

    All option validation and input normalization happens in ``__init__``;
    ``get_result`` then assembles the final Series/DataFrame from the
    precomputed ``new_axes``.
    """

    def __init__(
        self,
        objs,
        axis=0,
        join: str = "outer",
        keys=None,
        levels=None,
        names=None,
        ignore_index: bool = False,
        verify_integrity: bool = False,
        copy: bool = True,
        sort=False,
    ):
        # A single pandas object (or a plain string) is a common user error;
        # reject it explicitly rather than iterating over it.
        if isinstance(objs, (NDFrame, str)):
            raise TypeError(
                "first argument must be an iterable of pandas "
                "objects, you passed an object of type "
                '"{name}"'.format(name=type(objs).__name__)
            )

        # join='outer' -> union of the other axes; join='inner' -> intersection.
        if join == "outer":
            self.intersect = False
        elif join == "inner":
            self.intersect = True
        else:  # pragma: no cover
            raise ValueError(
                "Only can inner (intersect) or outer (union) join the other axis"
            )

        # A mapping supplies both the objects and (unless overridden) the keys.
        if isinstance(objs, dict):
            if keys is None:
                keys = list(objs.keys())
            objs = [objs[k] for k in keys]
        else:
            objs = list(objs)

        if len(objs) == 0:
            raise ValueError("No objects to concatenate")

        # Drop None entries; when keys were passed, drop the matching keys too.
        if keys is None:
            objs = list(com.not_none(*objs))
        else:
            # #1649
            clean_keys = []
            clean_objs = []
            for k, v in zip(keys, objs):
                if v is None:
                    continue
                clean_keys.append(k)
                clean_objs.append(v)
            objs = clean_objs
            name = getattr(keys, "name", None)
            keys = Index(clean_keys, name=name)

        if len(objs) == 0:
            raise ValueError("All objects passed were None")

        # consolidate data & figure out what our result ndim is going to be
        ndims = set()
        for obj in objs:
            if not isinstance(obj, (Series, DataFrame)):
                msg = (
                    "cannot concatenate object of type '{typ}'; "
                    "only Series and DataFrame objs are valid".format(typ=type(obj))
                )
                raise TypeError(msg)

            # consolidate
            obj._consolidate(inplace=True)
            ndims.add(obj.ndim)

        # get the sample
        # want the highest ndim that we have, and must be non-empty
        # unless all objs are empty
        sample = None
        if len(ndims) > 1:
            max_ndim = max(ndims)
            for obj in objs:
                if obj.ndim == max_ndim and np.sum(obj.shape):
                    sample = obj
                    break

        else:
            # filter out the empties if we have not multi-index possibilities
            # note to keep empty Series as it affect to result columns / name
            non_empties = [
                obj for obj in objs if sum(obj.shape) > 0 or isinstance(obj, Series)
            ]

            # Only drop empties when no hierarchical-index options are in
            # play (keys/names/levels) and we are doing an outer join.
            if len(non_empties) and (
                keys is None and names is None and levels is None and not self.intersect
            ):
                objs = non_empties
                sample = objs[0]

        if sample is None:
            sample = objs[0]
        self.objs = objs

        # Standardize axis parameter to int
        if isinstance(sample, Series):
            axis = DataFrame._get_axis_number(axis)
        else:
            axis = sample._get_axis_number(axis)

        # Need to flip BlockManager axis in the DataFrame special case
        self._is_frame = isinstance(sample, DataFrame)
        if self._is_frame:
            axis = 1 if axis == 0 else 0

        self._is_series = isinstance(sample, Series)
        if not 0 <= axis <= sample.ndim:
            raise AssertionError(
                "axis must be between 0 and {ndim}, input was "
                "{axis}".format(ndim=sample.ndim, axis=axis)
            )

        # if we have mixed ndims, then convert to highest ndim
        # creating column numbers as needed
        if len(ndims) > 1:
            current_column = 0
            max_ndim = sample.ndim
            self.objs, objs = [], self.objs
            for obj in objs:

                ndim = obj.ndim
                if ndim == max_ndim:
                    pass

                elif ndim != max_ndim - 1:
                    raise ValueError(
                        "cannot concatenate unaligned mixed "
                        "dimensional NDFrame objects"
                    )

                else:
                    # Promote a Series to a 1-column DataFrame; unnamed
                    # Series get sequential integer column labels.
                    name = getattr(obj, "name", None)
                    if ignore_index or name is None:
                        name = current_column
                        current_column += 1

                    # doing a row-wise concatenation so need everything
                    # to line up
                    if self._is_frame and axis == 1:
                        name = 0
                    obj = sample._constructor({name: obj})

                self.objs.append(obj)

        # note: this is the BlockManager axis (since DataFrame is transposed)
        self.axis = axis
        self.keys = keys
        self.names = names or getattr(keys, "names", None)
        self.levels = levels
        self.sort = sort

        self.ignore_index = ignore_index
        self.verify_integrity = verify_integrity
        self.copy = copy

        # Compute the result axes eagerly; get_result only assembles data.
        self.new_axes = self._get_new_axes()

    def get_result(self):
        """Assemble and return the concatenated Series or DataFrame."""

        # series only
        if self._is_series:

            # stack blocks
            if self.axis == 0:
                # Result keeps a name only if all inputs agree on it.
                name = com.consensus_name_attr(self.objs)

                mgr = self.objs[0]._data.concat(
                    [x._data for x in self.objs], self.new_axes
                )
                cons = self.objs[0]._constructor
                return cons(mgr, name=name).__finalize__(self, method="concat")

            # combine as columns in a frame
            else:
                data = dict(zip(range(len(self.objs)), self.objs))
                cons = DataFrame

                index, columns = self.new_axes
                df = cons(data, index=index)
                df.columns = columns
                return df.__finalize__(self, method="concat")

        # combine block managers
        else:
            mgrs_indexers = []
            for obj in self.objs:
                mgr = obj._data
                indexers = {}
                for ax, new_labels in enumerate(self.new_axes):
                    if ax == self.axis:
                        # Suppress reindexing on concat axis
                        continue

                    obj_labels = mgr.axes[ax]
                    if not new_labels.equals(obj_labels):
                        # reindex returns (new_index, indexer); only the
                        # indexer is needed here.
                        indexers[ax] = obj_labels.reindex(new_labels)[1]

                mgrs_indexers.append((obj._data, indexers))

            new_data = concatenate_block_managers(
                mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=self.copy
            )
            if not self.copy:
                new_data._consolidate_inplace()

            cons = self.objs[0]._constructor
            return cons._from_axes(new_data, self.new_axes).__finalize__(
                self, method="concat"
            )

    def _get_result_dim(self) -> int:
        # Series combined column-wise produce a 2-D result; otherwise the
        # result has the same ndim as the inputs.
        if self._is_series and self.axis == 1:
            return 2
        else:
            return self.objs[0].ndim

    def _get_new_axes(self) -> List[Index]:
        """Return the result's axes: concat axis plus combined other axes."""
        ndim = self._get_result_dim()
        return [
            self._get_concat_axis() if i == self.axis else self._get_comb_axis(i)
            for i in range(ndim)
        ]

    def _get_comb_axis(self, i: int) -> Index:
        # Union (or intersection, for join='inner') of the non-concat axis.
        data_axis = self.objs[0]._get_block_manager_axis(i)
        return get_objs_combined_axis(
            self.objs, axis=data_axis, intersect=self.intersect, sort=self.sort
        )

    def _get_concat_axis(self) -> Index:
        """
        Return index to be used along concatenation axis.
        """
        if self._is_series:
            if self.axis == 0:
                indexes = [x.index for x in self.objs]
            elif self.ignore_index:
                idx = ibase.default_index(len(self.objs))
                return idx
            elif self.keys is None:
                # Column labels for Series combined along axis=1: use each
                # Series' name where present, sequential integers otherwise.
                names: List[Optional[Hashable]] = [None] * len(self.objs)
                num = 0
                has_names = False
                for i, x in enumerate(self.objs):
                    if not isinstance(x, Series):
                        raise TypeError(
                            f"Cannot concatenate type 'Series' with "
                            f"object of type '{type(x).__name__}'"
                        )
                    if x.name is not None:
                        names[i] = x.name
                        has_names = True
                    else:
                        names[i] = num
                        num += 1
                if has_names:
                    return Index(names)
                else:
                    return ibase.default_index(len(self.objs))
            else:
                return ensure_index(self.keys).set_names(self.names)
        else:
            indexes = [x._data.axes[self.axis] for x in self.objs]

        if self.ignore_index:
            idx = ibase.default_index(sum(len(i) for i in indexes))
            return idx

        if self.keys is None:
            concat_axis = _concat_indexes(indexes)
        else:
            concat_axis = _make_concat_multiindex(
                indexes, self.keys, self.levels, self.names
            )

        self._maybe_check_integrity(concat_axis)

        return concat_axis

    def _maybe_check_integrity(self, concat_index: Index):
        # Enforce verify_integrity=True: the concat axis must be duplicate-free.
        if self.verify_integrity:
            if not concat_index.is_unique:
                overlap = concat_index[concat_index.duplicated()].unique()
                raise ValueError(
                    "Indexes have overlapping values: "
                    "{overlap!s}".format(overlap=overlap)
                )
586def _concat_indexes(indexes) -> Index:
587 return indexes[0].append(indexes[1:])
def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiIndex:
    """
    Build the MultiIndex for the concatenation axis when ``keys`` was passed:
    the keys become the outermost level(s), with the original index values of
    each object nested beneath them.
    """

    # Multiple key levels: either tuple keys with inferred levels, or
    # explicitly passed multi-level ``levels``.
    if (levels is None and isinstance(keys[0], tuple)) or (
        levels is not None and len(levels) > 1
    ):
        zipped = list(zip(*keys))
        if names is None:
            names = [None] * len(zipped)

        if levels is None:
            _, levels = factorize_from_iterables(zipped)
        else:
            levels = [ensure_index(x) for x in levels]
    else:
        # Single key level.
        zipped = [keys]
        if names is None:
            names = [None]

        if levels is None:
            levels = [ensure_index(keys)]
        else:
            levels = [ensure_index(x) for x in levels]

    if not all_indexes_same(indexes):
        codes_list = []

        # things are potentially different sizes, so compute the exact codes
        # for each level and pass those to MultiIndex.from_arrays

        for hlevel, level in zip(zipped, levels):
            to_concat = []
            for key, index in zip(hlevel, indexes):
                try:
                    i = level.get_loc(key)
                except KeyError:
                    raise ValueError(
                        "Key {key!s} not in level {level!s}".format(
                            key=key, level=level
                        )
                    )

                # Each object's rows all share its key's code.
                to_concat.append(np.repeat(i, len(index)))
            codes_list.append(np.concatenate(to_concat))

        concat_index = _concat_indexes(indexes)

        # these go at the end
        if isinstance(concat_index, MultiIndex):
            levels.extend(concat_index.levels)
            codes_list.extend(concat_index.codes)
        else:
            codes, categories = factorize_from_iterable(concat_index)
            levels.append(categories)
            codes_list.append(codes)

        if len(names) == len(levels):
            names = list(names)
        else:
            # make sure that all of the passed indices have the same nlevels
            if not len({idx.nlevels for idx in indexes}) == 1:
                raise AssertionError(
                    "Cannot concat indices that do "
                    "not have the same number of levels"
                )

            # also copies
            names = names + get_consensus_names(indexes)

        return MultiIndex(
            levels=levels, codes=codes_list, names=names, verify_integrity=False
        )

    # Fast path: all input indexes are identical, so codes are simple
    # repeats/tiles of a single index's codes.
    new_index = indexes[0]
    n = len(new_index)
    kpieces = len(indexes)

    # also copies
    new_names = list(names)
    new_levels = list(levels)

    # construct codes
    new_codes = []

    # do something a bit more speedy

    for hlevel, level in zip(zipped, levels):
        hlevel = ensure_index(hlevel)
        mapped = level.get_indexer(hlevel)

        # get_indexer marks unmatched keys with -1.
        mask = mapped == -1
        if mask.any():
            raise ValueError(
                "Values not found in passed level: {hlevel!s}".format(
                    hlevel=hlevel[mask]
                )
            )

        new_codes.append(np.repeat(mapped, n))

    if isinstance(new_index, MultiIndex):
        new_levels.extend(new_index.levels)
        new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes])
    else:
        new_levels.append(new_index)
        new_codes.append(np.tile(np.arange(n), kpieces))

    if len(new_names) < len(new_levels):
        new_names.extend(new_index.names)

    return MultiIndex(
        levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
    )