Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/io/json/_normalize.py : 12%

# ---------------------------------------------------------------------
# JSON normalization routines

from collections import defaultdict
import copy
from typing import Any, DefaultDict, Dict, Iterable, List, Optional, Union

import numpy as np

from pandas._libs.writers import convert_json_to_lines
from pandas._typing import Scalar
from pandas.util._decorators import deprecate

import pandas as pd
from pandas import DataFrame


def convert_to_line_delimits(s):
    """
    Helper function that converts JSON lists to line-delimited JSON.
    """
    # Determine whether we have a JSON list to turn into lines; otherwise just
    # return the JSON object unchanged, since only lists can be converted.
    if not (s[0] == "[" and s[-1] == "]"):
        return s
    s = s[1:-1]

    return convert_json_to_lines(s)
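
# Illustrative usage (a sketch, not part of the original module; the output is
# approximate, since exact whitespace handling is delegated to
# convert_json_to_lines):
#
#   >>> convert_to_line_delimits('[{"a": 1},{"b": 2}]')
#   '{"a": 1}\n{"b": 2}\n'
#   >>> convert_to_line_delimits('{"a": 1}')  # not a JSON list: returned as-is
#   '{"a": 1}'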


def nested_to_record(
    ds,
    prefix: str = "",
    sep: str = ".",
    level: int = 0,
    max_level: Optional[int] = None,
):
    """
    A simplified json_normalize

    Converts a nested dict into a flat dict ("record"); unlike json_normalize,
    it does not attempt to extract a subset of the data.

    Parameters
    ----------
    ds : dict or list of dicts
    prefix : str, optional, default: ""
        The prefix prepended to flattened key names.
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar
    level : int, optional, default: 0
        The current recursion level (0 at the top level).
    max_level : int, optional, default: None
        The max depth to normalize.

        .. versionadded:: 0.25.0

    Returns
    -------
    d - dict or list of dicts, matching `ds`

    Examples
    --------
    In [52]: nested_to_record(dict(flat1=1, dict1=dict(c=1, d=2),
                                   nested=dict(e=dict(c=1, d=2), d=2)))
    Out[52]:
    {'dict1.c': 1,
     'dict1.d': 2,
     'flat1': 1,
     'nested.d': 2,
     'nested.e.c': 1,
     'nested.e.d': 2}
    """
    singleton = False
    if isinstance(ds, dict):
        ds = [ds]
        singleton = True
    new_ds = []
    for d in ds:
        new_d = copy.deepcopy(d)
        for k, v in d.items():
            # each key gets renamed with prefix
            if not isinstance(k, str):
                k = str(k)
            if level == 0:
                newkey = k
            else:
                newkey = prefix + sep + k

            # flatten if type is dict and
            # current dict level < maximum level provided and
            # only dicts get recurse-flattened
            # only at level > 1 do we rename the rest of the keys
            if not isinstance(v, dict) or (
                max_level is not None and level >= max_level
            ):
                if level != 0:  # so we skip copying for top level, common case
                    v = new_d.pop(k)
                    new_d[newkey] = v
                continue
            else:
                v = new_d.pop(k)
                new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level))
        new_ds.append(new_d)

    if singleton:
        return new_ds[0]
    return new_ds
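
# Illustrative usage of nested_to_record (a sketch, not part of the original
# module), showing a custom separator and the effect of ``max_level``: dicts
# nested deeper than ``max_level`` are kept as values instead of being
# flattened further.
#
#   >>> nested_to_record({"a": {"b": {"c": 1}}, "flat": 0}, sep="_")
#   {'flat': 0, 'a_b_c': 1}
#   >>> nested_to_record({"a": {"b": {"c": 1}}, "flat": 0}, max_level=1)
#   {'flat': 0, 'a.b': {'c': 1}}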


def _json_normalize(
    data: Union[Dict, List[Dict]],
    record_path: Optional[Union[str, List]] = None,
    meta: Optional[Union[str, List[Union[str, List[str]]]]] = None,
    meta_prefix: Optional[str] = None,
    record_prefix: Optional[str] = None,
    errors: Optional[str] = "raise",
    sep: str = ".",
    max_level: Optional[int] = None,
) -> "DataFrame":
    """
    Normalize semi-structured JSON data into a flat table.

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects.
    record_path : str or list of str, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records.
    meta : list of paths (str or list of str), default None
        Fields to use as metadata for each record in resulting table.
    meta_prefix : str, default None
        If not None, prefix the metadata columns with the dotted path, e.g.
        foo.bar.field if meta is ['foo', 'bar'].
    record_prefix : str, default None
        If not None, prefix the record columns with the dotted path, e.g.
        foo.bar.field if the path to records is ['foo', 'bar'].
    errors : {'raise', 'ignore'}, default 'raise'
        Configures error handling.

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present.
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present.
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
    max_level : int, default None
        Max number of levels (depth of dict) to normalize.
        If None, normalizes all levels.

        .. versionadded:: 0.25.0

    Returns
    -------
    frame : DataFrame
        Normalized data as a flat table.

    Examples
    --------
    >>> from pandas.io.json import json_normalize
    >>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
    ...         {'name': {'given': 'Mose', 'family': 'Regner'}},
    ...         {'id': 2, 'name': 'Faye Raker'}]
    >>> json_normalize(data)
        id        name name.family name.first name.given name.last
    0  1.0         NaN         NaN     Coleen        NaN      Volk
    1  NaN         NaN      Regner        NaN       Mose       NaN
    2  2.0  Faye Raker         NaN        NaN        NaN       NaN

    >>> data = [{'id': 1,
    ...          'name': "Cole Volk",
    ...          'fitness': {'height': 130, 'weight': 60}},
    ...         {'name': "Mose Reg",
    ...          'fitness': {'height': 130, 'weight': 60}},
    ...         {'id': 2, 'name': 'Faye Raker',
    ...          'fitness': {'height': 130, 'weight': 60}}]
    >>> json_normalize(data, max_level=0)
                             fitness   id        name
    0  {'height': 130, 'weight': 60}  1.0   Cole Volk
    1  {'height': 130, 'weight': 60}  NaN    Mose Reg
    2  {'height': 130, 'weight': 60}  2.0  Faye Raker

    Normalizes nested data up to level 1.

    >>> data = [{'id': 1,
    ...          'name': "Cole Volk",
    ...          'fitness': {'height': 130, 'weight': 60}},
    ...         {'name': "Mose Reg",
    ...          'fitness': {'height': 130, 'weight': 60}},
    ...         {'id': 2, 'name': 'Faye Raker',
    ...          'fitness': {'height': 130, 'weight': 60}}]
    >>> json_normalize(data, max_level=1)
       fitness.height  fitness.weight   id        name
    0             130              60  1.0   Cole Volk
    1             130              60  NaN    Mose Reg
    2             130              60  2.0  Faye Raker

    >>> data = [{'state': 'Florida',
    ...          'shortname': 'FL',
    ...          'info': {'governor': 'Rick Scott'},
    ...          'counties': [{'name': 'Dade', 'population': 12345},
    ...                       {'name': 'Broward', 'population': 40000},
    ...                       {'name': 'Palm Beach', 'population': 60000}]},
    ...         {'state': 'Ohio',
    ...          'shortname': 'OH',
    ...          'info': {'governor': 'John Kasich'},
    ...          'counties': [{'name': 'Summit', 'population': 1234},
    ...                       {'name': 'Cuyahoga', 'population': 1337}]}]
    >>> result = json_normalize(data, 'counties', ['state', 'shortname',
    ...                                            ['info', 'governor']])
    >>> result
             name  population    state shortname info.governor
    0        Dade       12345  Florida        FL    Rick Scott
    1     Broward       40000  Florida        FL    Rick Scott
    2  Palm Beach       60000  Florida        FL    Rick Scott
    3      Summit        1234     Ohio        OH   John Kasich
    4    Cuyahoga        1337     Ohio        OH   John Kasich

    >>> data = {'A': [1, 2]}
    >>> json_normalize(data, 'A', record_prefix='Prefix.')
       Prefix.0
    0         1
    1         2

    Returns normalized data with columns prefixed with the given string.
    """

    def _pull_field(
        js: Dict[str, Any], spec: Union[List, str]
    ) -> Union[Scalar, Iterable]:
        """Internal function to pull field"""
        result = js  # type: ignore
        if isinstance(spec, list):
            for field in spec:
                result = result[field]
        else:
            result = result[spec]
        return result

    def _pull_records(js: Dict[str, Any], spec: Union[List, str]) -> Iterable:
        """
        Internal function to pull the field for records; similar to
        _pull_field, but it is required to return an Iterable and raises
        an error for non-iterable values.
        """
        result = _pull_field(js, spec)

        # GH 31507 GH 30145, if result is not Iterable, raise TypeError if not
        # null, otherwise return an empty list
        if not isinstance(result, Iterable):
            if pd.isnull(result):
                result = []  # type: ignore
            else:
                raise TypeError(
                    f"{js} has non iterable value {result} for path {spec}. "
                    "Must be iterable or null."
                )

        return result
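
    # Illustrative behaviour of the helper above (a sketch, not part of the
    # original source): _pull_records returns the value at the given path,
    # maps nulls to an empty list, and rejects non-iterable scalars.
    #
    #   _pull_records({"a": [1, 2]}, "a")  -> [1, 2]
    #   _pull_records({"a": None}, "a")    -> []
    #   _pull_records({"a": 1}, "a")       -> TypeError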

    if isinstance(data, list) and not data:
        return DataFrame()

    # A bit of a hackjob
    if isinstance(data, dict):
        data = [data]

    if record_path is None:
        if any([isinstance(x, dict) for x in y.values()] for y in data):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #   {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
            #
            # TODO: handle record values which are lists, at least error
            #       reasonably
            data = nested_to_record(data, sep=sep, max_level=max_level)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    _meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records: List = []
    lengths = []

    meta_vals: DefaultDict = defaultdict(list)
    meta_keys = [sep.join(val) for val in _meta]

    def _recursive_extract(data, path, seen_meta, level=0):
        if isinstance(data, dict):
            data = [data]
        if len(path) > 1:
            for obj in data:
                for val, key in zip(_meta, meta_keys):
                    if level + 1 == len(val):
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
        else:
            for obj in data:
                recs = _pull_records(obj, path[0])
                recs = [
                    nested_to_record(r, sep=sep, max_level=max_level)
                    if isinstance(r, dict)
                    else r
                    for r in recs
                ]

                # For repeating the metadata later
                lengths.append(len(recs))
                for val, key in zip(_meta, meta_keys):
                    if level + 1 > len(val):
                        meta_val = seen_meta[key]
                    else:
                        try:
                            meta_val = _pull_field(obj, val[level:])
                        except KeyError as e:
                            if errors == "ignore":
                                meta_val = np.nan
                            else:
                                raise KeyError(
                                    "Try running with "
                                    "errors='ignore' as key "
                                    f"{e} is not always present"
                                )
                    meta_vals[key].append(meta_val)
                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        result = result.rename(columns=lambda x: f"{record_prefix}{x}")

    # Data types, a problem
    for k, v in meta_vals.items():
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError(
                f"Conflicting metadata name {k}, need distinguishing prefix "
            )
        result[k] = np.array(v, dtype=object).repeat(lengths)

    return result
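
# Illustrative sketch (not part of the original module) of how ``meta`` and
# ``errors`` interact in _json_normalize: the second record below has no 'id',
# so with errors='ignore' the missing meta key becomes NaN instead of raising
# KeyError, and each meta value is repeated once per extracted record
# (output shown approximately).
#
#   >>> data = [{"id": 1, "rec": [{"x": 1}, {"x": 2}]},
#   ...         {"rec": [{"x": 3}]}]
#   >>> _json_normalize(data, record_path="rec", meta=["id"], errors="ignore")
#      x   id
#   0  1    1
#   1  2    1
#   2  3  NaN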


json_normalize = deprecate(
    "pandas.io.json.json_normalize", _json_normalize, "1.0.0", "pandas.json_normalize"
)
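
# Illustrative usage of the deprecation shim above (a sketch; pd.json_normalize
# is the documented replacement as of pandas 1.0, and calling the old name
# emits a FutureWarning via the ``deprecate`` wrapper):
#
#   >>> import pandas as pd
#   >>> from pandas.io.json import json_normalize
#   >>> df_old = json_normalize([{"a": {"b": 1}}])     # FutureWarning
#   >>> df_new = pd.json_normalize([{"a": {"b": 1}}])  # preferred spelling
#   >>> df_old.equals(df_new)
#   True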