
# ---------------------------------------------------------------------
# JSON normalization routines

from collections import defaultdict
import copy
from typing import Any, DefaultDict, Dict, Iterable, List, Optional, Union

import numpy as np

from pandas._libs.writers import convert_json_to_lines
from pandas._typing import Scalar
from pandas.util._decorators import deprecate

import pandas as pd
from pandas import DataFrame


def convert_to_line_delimits(s):
    """
    Helper function that converts JSON lists to line delimited JSON.
    """

    # Determine if we have a JSON list to turn into lines; otherwise just
    # return the JSON object as-is. Only lists can be converted.
    if not (s[0] == "[" and s[-1] == "]"):
        return s
    s = s[1:-1]

    return convert_json_to_lines(s)
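
# Hedged usage sketch (not part of the original module): the exact output
# formatting is delegated to the C helper ``convert_json_to_lines``, so the
# values below are illustrative assumptions rather than asserted behaviour.
#
#   >>> convert_to_line_delimits('[{"a": 1},{"a": 2}]')  # doctest: +SKIP
#   '{"a": 1}\n{"a": 2}'
#   >>> convert_to_line_delimits('{"a": 1}')  # not a list -> returned unchanged
#   '{"a": 1}'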


def nested_to_record(
    ds,
    prefix: str = "",
    sep: str = ".",
    level: int = 0,
    max_level: Optional[int] = None,
):
    """
    A simplified json_normalize.

    Converts a nested dict into a flat dict ("record"); unlike json_normalize,
    it does not attempt to extract a subset of the data.

    Parameters
    ----------
    ds : dict or list of dicts
    prefix : str, optional, default: ""
        The prefix prepended to flattened child keys.
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar
    level : int, optional, default: 0
        The current level of nesting; used internally during recursion.
    max_level : int, optional, default: None
        The max depth to normalize.

        .. versionadded:: 0.25.0

    Returns
    -------
    d - dict or list of dicts, matching `ds`

    Examples
    --------
    In [52]: nested_to_record(dict(flat1=1, dict1=dict(c=1, d=2),
                                   nested=dict(e=dict(c=1, d=2), d=2)))
    Out[52]:
    {'dict1.c': 1,
     'dict1.d': 2,
     'flat1': 1,
     'nested.d': 2,
     'nested.e.c': 1,
     'nested.e.d': 2}
    """

    singleton = False
    if isinstance(ds, dict):
        ds = [ds]
        singleton = True
    new_ds = []
    for d in ds:
        new_d = copy.deepcopy(d)
        for k, v in d.items():
            # each key gets renamed with prefix
            if not isinstance(k, str):
                k = str(k)
            if level == 0:
                newkey = k
            else:
                newkey = prefix + sep + k

            # flatten if type is dict and
            # current dict level < maximum level provided and
            # only dicts get recurse-flattened
            # only at level>1 do we rename the rest of the keys
            if not isinstance(v, dict) or (
                max_level is not None and level >= max_level
            ):
                if level != 0:  # so we skip copying for top level, common case
                    v = new_d.pop(k)
                    new_d[newkey] = v
                continue
            else:
                v = new_d.pop(k)
                new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level))
        new_ds.append(new_d)

    if singleton:
        return new_ds[0]
    return new_ds
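
# Hedged usage sketch (mirroring the docstring example above; the max_level
# variant is an assumption about the truncation behaviour, not taken from the
# original source):
#
#   >>> nested_to_record({"flat1": 1, "dict1": {"c": 1, "d": 2}})  # doctest: +SKIP
#   {'flat1': 1, 'dict1.c': 1, 'dict1.d': 2}
#   >>> nested_to_record({"a": {"b": {"c": 1}}}, max_level=1)  # doctest: +SKIP
#   {'a.b': {'c': 1}}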


def _json_normalize(
    data: Union[Dict, List[Dict]],
    record_path: Optional[Union[str, List]] = None,
    meta: Optional[Union[str, List[Union[str, List[str]]]]] = None,
    meta_prefix: Optional[str] = None,
    record_prefix: Optional[str] = None,
    errors: Optional[str] = "raise",
    sep: str = ".",
    max_level: Optional[int] = None,
) -> "DataFrame":
    """
    Normalize semi-structured JSON data into a flat table.

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects.
    record_path : str or list of str, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records.
    meta : list of paths (str or list of str), default None
        Fields to use as metadata for each record in resulting table.
    meta_prefix : str, default None
        If given, this string is prepended to the metadata column names,
        which are the sep-joined meta paths, e.g. foo.bar if meta is
        ['foo', 'bar'].
    record_prefix : str, default None
        If given, this string is prepended to every record column name, e.g.
        foo.bar.field if the path to records is ['foo', 'bar'].
    errors : {'raise', 'ignore'}, default 'raise'
        Configures error handling.

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present.
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present.
    sep : str, default '.'
        Nested records will generate names separated by sep.
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
    max_level : int, default None
        Max number of levels (depth of dict) to normalize.
        If None, normalizes all levels.

        .. versionadded:: 0.25.0

    Returns
    -------
    frame : DataFrame
        Normalized semi-structured JSON data as a flat table.

    Examples
    --------
    >>> from pandas.io.json import json_normalize
    >>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
    ...         {'name': {'given': 'Mose', 'family': 'Regner'}},
    ...         {'id': 2, 'name': 'Faye Raker'}]
    >>> json_normalize(data)
        id        name name.family name.first name.given name.last
    0  1.0         NaN         NaN     Coleen        NaN      Volk
    1  NaN         NaN      Regner        NaN       Mose       NaN
    2  2.0  Faye Raker         NaN        NaN        NaN       NaN

    >>> data = [{'id': 1,
    ...          'name': "Cole Volk",
    ...          'fitness': {'height': 130, 'weight': 60}},
    ...         {'name': "Mose Reg",
    ...          'fitness': {'height': 130, 'weight': 60}},
    ...         {'id': 2, 'name': 'Faye Raker',
    ...          'fitness': {'height': 130, 'weight': 60}}]
    >>> json_normalize(data, max_level=0)
                             fitness   id        name
    0  {'height': 130, 'weight': 60}  1.0   Cole Volk
    1  {'height': 130, 'weight': 60}  NaN    Mose Reg
    2  {'height': 130, 'weight': 60}  2.0  Faye Raker

    Normalizes nested data up to level 1.

    >>> data = [{'id': 1,
    ...          'name': "Cole Volk",
    ...          'fitness': {'height': 130, 'weight': 60}},
    ...         {'name': "Mose Reg",
    ...          'fitness': {'height': 130, 'weight': 60}},
    ...         {'id': 2, 'name': 'Faye Raker',
    ...          'fitness': {'height': 130, 'weight': 60}}]
    >>> json_normalize(data, max_level=1)
       fitness.height  fitness.weight   id        name
    0             130              60  1.0   Cole Volk
    1             130              60  NaN    Mose Reg
    2             130              60  2.0  Faye Raker

    >>> data = [{'state': 'Florida',
    ...          'shortname': 'FL',
    ...          'info': {'governor': 'Rick Scott'},
    ...          'counties': [{'name': 'Dade', 'population': 12345},
    ...                       {'name': 'Broward', 'population': 40000},
    ...                       {'name': 'Palm Beach', 'population': 60000}]},
    ...         {'state': 'Ohio',
    ...          'shortname': 'OH',
    ...          'info': {'governor': 'John Kasich'},
    ...          'counties': [{'name': 'Summit', 'population': 1234},
    ...                       {'name': 'Cuyahoga', 'population': 1337}]}]
    >>> result = json_normalize(data, 'counties', ['state', 'shortname',
    ...                                            ['info', 'governor']])
    >>> result
             name  population    state shortname info.governor
    0        Dade       12345  Florida        FL    Rick Scott
    1     Broward       40000  Florida        FL    Rick Scott
    2  Palm Beach       60000  Florida        FL    Rick Scott
    3      Summit        1234     Ohio        OH   John Kasich
    4    Cuyahoga        1337     Ohio        OH   John Kasich

    >>> data = {'A': [1, 2]}
    >>> json_normalize(data, 'A', record_prefix='Prefix.')
       Prefix.0
    0         1
    1         2

    Returns normalized data with columns prefixed with the given string.
    """

    def _pull_field(
        js: Dict[str, Any], spec: Union[List, str]
    ) -> Union[Scalar, Iterable]:
        """Internal function to pull field"""
        result = js  # type: ignore
        if isinstance(spec, list):
            for field in spec:
                result = result[field]
        else:
            result = result[spec]
        return result
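
    # Hedged illustration (assumed inputs, not from the original source):
    # _pull_field({"a": {"b": 1}}, ["a", "b"]) walks the list spec and
    # returns 1, while _pull_field({"a": {"b": 1}}, "a") returns {"b": 1}.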

    def _pull_records(js: Dict[str, Any], spec: Union[List, str]) -> Iterable:
        """
        Internal function to pull field for records; similar to
        _pull_field, but it must return an Iterable and will raise a
        TypeError for a non-iterable, non-null value.
        """
        result = _pull_field(js, spec)

        # GH 31507 GH 30145, if result is not Iterable, raise TypeError if not
        # null, otherwise return an empty list
        if not isinstance(result, Iterable):
            if pd.isnull(result):
                result = []  # type: ignore
            else:
                raise TypeError(
                    f"{js} has non iterable value {result} for path {spec}. "
                    "Must be iterable or null."
                )
        return result
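
    # Hedged illustration (assumed inputs): _pull_records({"A": [1, 2]}, "A")
    # returns [1, 2]; _pull_records({"A": None}, "A") returns [] because the
    # value is null; _pull_records({"A": 3}, "A") raises TypeError since 3 is
    # neither iterable nor null.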

    if isinstance(data, list) and not data:
        return DataFrame()

    # A bit of a hackjob
    if isinstance(data, dict):
        data = [data]

    if record_path is None:
        if any([isinstance(x, dict) for x in y.values()] for y in data):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
            #
            # TODO: handle record values which are lists, at least error
            #       reasonably
            data = nested_to_record(data, sep=sep, max_level=max_level)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    _meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records: List = []
    lengths = []

    meta_vals: DefaultDict = defaultdict(list)
    meta_keys = [sep.join(val) for val in _meta]
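
    # Hedged illustration using the docstring's Florida/Ohio example (an
    # assumed intermediate state, not asserted by the source): for
    # meta=['state', 'shortname', ['info', 'governor']], _meta becomes
    # [['state'], ['shortname'], ['info', 'governor']] and meta_keys becomes
    # ['state', 'shortname', 'info.governor'], the eventual metadata column
    # names.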

    def _recursive_extract(data, path, seen_meta, level=0):
        if isinstance(data, dict):
            data = [data]
        if len(path) > 1:
            for obj in data:
                for val, key in zip(_meta, meta_keys):
                    if level + 1 == len(val):
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
        else:
            for obj in data:
                recs = _pull_records(obj, path[0])
                recs = [
                    nested_to_record(r, sep=sep, max_level=max_level)
                    if isinstance(r, dict)
                    else r
                    for r in recs
                ]

                # For repeating the metadata later
                lengths.append(len(recs))
                for val, key in zip(_meta, meta_keys):
                    if level + 1 > len(val):
                        meta_val = seen_meta[key]
                    else:
                        try:
                            meta_val = _pull_field(obj, val[level:])
                        except KeyError as e:
                            if errors == "ignore":
                                meta_val = np.nan
                            else:
                                raise KeyError(
                                    "Try running with "
                                    "errors='ignore' as key "
                                    f"{e} is not always present"
                                )
                    meta_vals[key].append(meta_val)
                records.extend(recs)
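
    # Hedged walk-through (assumed, based on the docstring example above): for
    # record_path='counties' the path has length 1, so each state dict falls
    # into the else branch, its 'counties' list is flattened into records, and
    # lengths tracks how many county rows each state's metadata ('state',
    # 'shortname', 'info.governor') must later be repeated for.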

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        result = result.rename(columns=lambda x: f"{record_prefix}{x}")

    # Data types, a problem
    for k, v in meta_vals.items():
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError(
                f"Conflicting metadata name {k}, need distinguishing prefix "
            )
        result[k] = np.array(v, dtype=object).repeat(lengths)
    return result

json_normalize = deprecate(
    "pandas.io.json.json_normalize", _json_normalize, "1.0.0", "pandas.json_normalize"
)
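
# Note: json_normalize is exposed here only as a deprecated alias; since
# pandas 1.0 the public entry point is pandas.json_normalize. Hedged usage
# sketch (illustrative output):
#
#   >>> import pandas as pd
#   >>> pd.json_normalize([{"a": {"b": 1}}])  # doctest: +SKIP
#      a.b
#   0    1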