Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/core/reshape/melt.py : 16%

import re
from typing import List

import numpy as np

from pandas.util._decorators import Appender, deprecate_kwarg

from pandas.core.dtypes.common import is_extension_array_dtype, is_list_like
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.generic import ABCMultiIndex
from pandas.core.dtypes.missing import notna

from pandas.core.arrays import Categorical
import pandas.core.common as com
from pandas.core.frame import DataFrame, _shared_docs
from pandas.core.indexes.base import Index
from pandas.core.reshape.concat import concat
from pandas.core.tools.numeric import to_numeric


@Appender(
    _shared_docs["melt"]
    % dict(caller="pd.melt(df, ", versionadded="", other="DataFrame.melt")
)
def melt(
    frame: DataFrame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
) -> DataFrame:
    # TODO: what about the existing index?
    # If multiindex, gather names of columns on all level for checking presence
    # of `id_vars` and `value_vars`
    if isinstance(frame.columns, ABCMultiIndex):
        cols = [x for c in frame.columns for x in c]
    else:
        cols = list(frame.columns)

    if id_vars is not None:
        if not is_list_like(id_vars):
            id_vars = [id_vars]
        elif isinstance(frame.columns, ABCMultiIndex) and not isinstance(id_vars, list):
            raise ValueError(
                "id_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            # Check that `id_vars` are in frame
            id_vars = list(id_vars)
            missing = Index(com.flatten(id_vars)).difference(cols)
            if not missing.empty:
                raise KeyError(
                    "The following 'id_vars' are not present "
                    "in the DataFrame: {missing}"
                    "".format(missing=list(missing))
                )
    else:
        id_vars = []

    if value_vars is not None:
        if not is_list_like(value_vars):
            value_vars = [value_vars]
        elif isinstance(frame.columns, ABCMultiIndex) and not isinstance(
            value_vars, list
        ):
            raise ValueError(
                "value_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            value_vars = list(value_vars)
            # Check that `value_vars` are in frame
            missing = Index(com.flatten(value_vars)).difference(cols)
            if not missing.empty:
                raise KeyError(
                    "The following 'value_vars' are not present in "
                    "the DataFrame: {missing}"
                    "".format(missing=list(missing))
                )
        frame = frame.loc[:, id_vars + value_vars]
    else:
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    if var_name is None:
        if isinstance(frame.columns, ABCMultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                var_name = [
                    "variable_{i}".format(i=i) for i in range(len(frame.columns.names))
                ]
        else:
            var_name = [
                frame.columns.name if frame.columns.name is not None else "variable"
            ]
    if isinstance(var_name, str):
        var_name = [var_name]

    N, K = frame.shape
    K -= len(id_vars)

    mdata = {}
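    # Layout note: each id column is repeated K times below (once per value
    # column), matching the column-major ravel("F") of the K value columns,
    # so the element at row r of value column k lands at position k * N + r.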
    for col in id_vars:
        id_data = frame.pop(col)
        if is_extension_array_dtype(id_data):
            id_data = concat([id_data] * K, ignore_index=True)
        else:
            id_data = np.tile(id_data.values, K)
        mdata[col] = id_data

    mcolumns = id_vars + var_name + [value_name]

    mdata[value_name] = frame.values.ravel("F")
    for i, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index
        mdata[col] = np.asanyarray(frame.columns._get_level_values(i)).repeat(N)

    return frame._constructor(mdata, columns=mcolumns)
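
# Usage sketch (illustrative only; ``df`` below is a hypothetical frame, not
# part of the library):
#
#   >>> df = pd.DataFrame({"id": [1, 2], "a": [3, 4], "b": [5, 6]})
#   >>> pd.melt(df, id_vars="id", value_vars=["a", "b"])
#      id variable  value
#   0   1        a      3
#   1   2        a      4
#   2   1        b      5
#   3   2        b      6
#
# The value columns are stacked in column-major order (ravel("F")), which is
# why all of "a" appears before all of "b".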


@deprecate_kwarg(old_arg_name="label", new_arg_name=None)
def lreshape(data: DataFrame, groups, dropna: bool = True, label=None) -> DataFrame:
    """
    Reshape wide-format data to long. Generalized inverse of DataFrame.pivot.

    Parameters
    ----------
    data : DataFrame
    groups : dict
        {new_name : list_of_columns}
    dropna : boolean, default True

    Returns
    -------
    reshaped : DataFrame

    Examples
    --------
    >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
    ...                      'team': ['Red Sox', 'Yankees'],
    ...                      'year1': [2007, 2007], 'year2': [2008, 2008]})
    >>> data
       hr1  hr2     team  year1  year2
    0  514  545  Red Sox   2007   2008
    1  573  526  Yankees   2007   2008

    >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
          team  year   hr
    0  Red Sox  2007  514
    1  Yankees  2007  573
    2  Red Sox  2008  545
    3  Yankees  2008  526
    """
    if isinstance(groups, dict):
        keys = list(groups.keys())
        values = list(groups.values())
    else:
        keys, values = zip(*groups)

    all_cols = list(set.union(*[set(x) for x in values]))
    id_cols = list(data.columns.difference(all_cols))

    K = len(values[0])

    for seq in values:
        if len(seq) != K:
            raise ValueError("All column lists must be same length")

    mdata = {}
    pivot_cols = []

    for target, names in zip(keys, values):
        to_concat = [data[col].values for col in names]

        mdata[target] = concat_compat(to_concat)
        pivot_cols.append(target)

    for col in id_cols:
        mdata[col] = np.tile(data[col].values, K)

    if dropna:
        mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
        for c in pivot_cols:
            mask &= notna(mdata[c])
        if not mask.all():
            mdata = {k: v[mask] for k, v in mdata.items()}

    return data._constructor(mdata, columns=id_cols + pivot_cols)
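
# Note: per the zip(*groups) branch above, ``groups`` may also be passed as an
# iterable of (new_name, list_of_columns) pairs instead of a dict; a
# hypothetical equivalent of the docstring example would be
# lreshape(data, [("year", ["year1", "year2"]), ("hr", ["hr1", "hr2"])]).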


def wide_to_long(
    df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"
) -> DataFrame:
    r"""
    Wide panel to long format. Less flexible but more user-friendly than melt.

    With stubnames ['A', 'B'], this function expects to find one or more
    groups of columns with the format
    A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,...
    You specify what you want to call this suffix in the resulting long format
    with `j` (for example `j='year'`).

    Each row of these wide variables is assumed to be uniquely identified by
    `i` (which can be a single column name or a list of column names).

    All remaining variables in the data frame are left intact.

    Parameters
    ----------
    df : DataFrame
        The wide-format DataFrame.
    stubnames : str or list-like
        The stub name(s). The wide format variables are assumed to
        start with the stub names.
    i : str or list-like
        Column(s) to use as id variable(s).
    j : str
        The name of the sub-observation variable. What you wish to name your
        suffix in the long format.
    sep : str, default ""
        A character indicating the separation of the variable names
        in the wide format, to be stripped from the names in the long format.
        For example, if your column names are A-suffix1, A-suffix2, you
        can strip the hyphen by specifying `sep='-'`.
    suffix : str, default '\\d+'
        A regular expression capturing the wanted suffixes. '\\d+' captures
        numeric suffixes. Suffixes with no numbers could be specified with the
        negated character class '\\D+'. You can also further disambiguate
        suffixes, for example, if your wide variables are of the form
        A-one, B-two,.., and you have an unrelated column A-rating, you can
        ignore the last one by specifying `suffix='(one|two)'`.

        .. versionchanged:: 0.23.0
            When all suffixes are numeric, they are cast to int64/float64.

    Returns
    -------
    DataFrame
        A DataFrame that contains each stub name as a variable, with new index
        (i, j).

    Notes
    -----
    All extra variables are left untouched. This simply uses
    `pandas.melt` under the hood, but is hard-coded to "do the right thing"
    in a typical case.

    Examples
    --------
    >>> np.random.seed(123)
    >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
    ...                    "A1980" : {0 : "d", 1 : "e", 2 : "f"},
    ...                    "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
    ...                    "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
    ...                    "X"     : dict(zip(range(3), np.random.randn(3)))
    ...                   })
    >>> df["id"] = df.index
    >>> df
      A1970 A1980  B1970  B1980         X  id
    0     a     d    2.5    3.2 -1.085631   0
    1     b     e    1.2    1.3  0.997345   1
    2     c     f    0.7    0.1  0.282978   2
    >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
    ... # doctest: +NORMALIZE_WHITESPACE
                    X  A    B
    id year
    0  1970 -1.085631  a  2.5
    1  1970  0.997345  b  1.2
    2  1970  0.282978  c  0.7
    0  1980 -1.085631  d  3.2
    1  1980  0.997345  e  1.3
    2  1980  0.282978  f  0.1

    With multiple id columns

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> df
       famid  birth  ht1  ht2
    0      1      1  2.8  3.4
    1      1      2  2.9  3.8
    2      1      3  2.2  2.9
    3      2      1  2.0  3.2
    4      2      2  1.8  2.8
    5      2      3  1.9  2.4
    6      3      1  2.2  3.3
    7      3      2  2.3  3.4
    8      3      3  2.1  2.9
    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
    >>> l
    ... # doctest: +NORMALIZE_WHITESPACE
                      ht
    famid birth age
    1     1     1    2.8
                2    3.4
          2     1    2.9
                2    3.8
          3     1    2.2
                2    2.9
    2     1     1    2.0
                2    3.2
          2     1    1.8
                2    2.8
          3     1    1.9
                2    2.4
    3     1     1    2.2
                2    3.3
          2     1    2.3
                2    3.4
          3     1    2.1
                2    2.9

    Going from long back to wide just takes some creative use of `unstack`

    >>> w = l.unstack()
    >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format)
    >>> w.reset_index()
       famid  birth  ht1  ht2
    0      1      1  2.8  3.4
    1      1      2  2.9  3.8
    2      1      3  2.2  2.9
    3      2      1  2.0  3.2
    4      2      2  1.8  2.8
    5      2      3  1.9  2.4
    6      3      1  2.2  3.3
    7      3      2  2.3  3.4
    8      3      3  2.1  2.9

    Less wieldy column names are also handled

    >>> np.random.seed(0)
    >>> df = pd.DataFrame({'A(weekly)-2010': np.random.rand(3),
    ...                    'A(weekly)-2011': np.random.rand(3),
    ...                    'B(weekly)-2010': np.random.rand(3),
    ...                    'B(weekly)-2011': np.random.rand(3),
    ...                    'X' : np.random.randint(3, size=3)})
    >>> df['id'] = df.index
    >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
       A(weekly)-2010  A(weekly)-2011  B(weekly)-2010  B(weekly)-2011  X  id
    0        0.548814        0.544883        0.437587        0.383442  0   0
    1        0.715189        0.423655        0.891773        0.791725  1   1
    2        0.602763        0.645894        0.963663        0.528895  1   2

    >>> pd.wide_to_long(df, ['A(weekly)', 'B(weekly)'], i='id',
    ...                 j='year', sep='-')
    ... # doctest: +NORMALIZE_WHITESPACE
             X  A(weekly)  B(weekly)
    id year
    0  2010  0   0.548814   0.437587
    1  2010  1   0.715189   0.891773
    2  2010  1   0.602763   0.963663
    0  2011  0   0.544883   0.383442
    1  2011  1   0.423655   0.791725
    2  2011  1   0.645894   0.528895

    If we have many columns, we could also use a regex to find our
    stubnames and pass that list on to wide_to_long

    >>> stubnames = sorted(
    ...     set([match[0] for match in df.columns.str.findall(
    ...         r'[A-B]\(.*\)').values if match != []])
    ... )
    >>> list(stubnames)
    ['A(weekly)', 'B(weekly)']

    All of the above examples have integers as suffixes. It is possible to
    have non-integers as suffixes.

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> df
       famid  birth  ht_one  ht_two
    0      1      1     2.8     3.4
    1      1      2     2.9     3.8
    2      1      3     2.2     2.9
    3      2      1     2.0     3.2
    4      2      2     1.8     2.8
    5      2      3     1.9     2.4
    6      3      1     2.2     3.3
    7      3      2     2.3     3.4
    8      3      3     2.1     2.9

    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
    ...                     sep='_', suffix='\w+')
    >>> l
    ... # doctest: +NORMALIZE_WHITESPACE
                      ht
    famid birth age
    1     1     one   2.8
                two   3.4
          2     one   2.9
                two   3.8
          3     one   2.2
                two   2.9
    2     1     one   2.0
                two   3.2
          2     one   1.8
                two   2.8
          3     one   1.9
                two   2.4
    3     1     one   2.2
                two   3.3
          2     one   2.3
                two   3.4
          3     one   2.1
                two   2.9
    """

    def get_var_names(df, stub: str, sep: str, suffix: str) -> List[str]:
        regex = r"^{stub}{sep}{suffix}$".format(
            stub=re.escape(stub), sep=re.escape(sep), suffix=suffix
        )
        pattern = re.compile(regex)
        return [col for col in df.columns if pattern.match(col)]
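
    # For example, with the docstring's stub "ht", sep "_" and suffix r"\w+",
    # the pattern above becomes r"^ht_\w+$", which selects "ht_one" and
    # "ht_two" but skips an unrelated column such as a hypothetical "height".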

    def melt_stub(df, stub: str, i, j, value_vars, sep: str):
        newdf = melt(
            df,
            id_vars=i,
            value_vars=value_vars,
            value_name=stub.rstrip(sep),
            var_name=j,
        )
        newdf[j] = Categorical(newdf[j])
        newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")

        # GH17627 Cast numerics suffixes to int/float
        newdf[j] = to_numeric(newdf[j], errors="ignore")

        return newdf.set_index(i + [j])
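
    # Each melt_stub call yields a frame indexed by i + [j] with a single
    # value column named after the stub (e.g. "ht" in the docstring example);
    # the per-stub frames are then joined on that shared index below.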

    if not is_list_like(stubnames):
        stubnames = [stubnames]
    else:
        stubnames = list(stubnames)

    if any(col in stubnames for col in df.columns):
        raise ValueError("stubname can't be identical to a column name")

    if not is_list_like(i):
        i = [i]
    else:
        i = list(i)

    if df[i].duplicated().any():
        raise ValueError("the id variables need to uniquely identify each row")

    value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames]

    value_vars_flattened = [e for sublist in value_vars for e in sublist]
    id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))

    _melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)]
    melted = _melted[0].join(_melted[1:], how="outer")

    if len(i) == 1:
        new = df[id_vars].set_index(i).join(melted)
        return new

    new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])

    return new