from collections import abc
import functools
from io import StringIO
from itertools import islice
import os
from typing import Any, Callable, Optional, Type

import numpy as np

import pandas._libs.json as json
from pandas._libs.tslibs import iNaT
from pandas._typing import JSONSerializable
from pandas.errors import AbstractMethodError
from pandas.util._decorators import deprecate_kwarg

from pandas.core.dtypes.common import ensure_str, is_period_dtype

from pandas import DataFrame, MultiIndex, Series, isna, to_datetime
from pandas.core.construction import create_series_with_explicit_dtype
from pandas.core.reshape.concat import concat

from pandas.io.common import (
    get_filepath_or_buffer,
    get_handle,
    infer_compression,
    stringify_path,
)
from pandas.io.json._normalize import convert_to_line_delimits
from pandas.io.json._table_schema import build_table_schema, parse_table_schema
from pandas.io.parsers import _validate_integer

loads = json.loads
dumps = json.dumps

TABLE_SCHEMA_VERSION = "0.20.0"
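# (The value above is also what appears as the ``pandas_version`` field of the
# schema produced for ``orient="table"``; see the Examples section of
# ``read_json`` below.)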


# interface to/from JSON
def to_json(
    path_or_buf,
    obj,
    orient: Optional[str] = None,
    date_format: str = "epoch",
    double_precision: int = 10,
    force_ascii: bool = True,
    date_unit: str = "ms",
    default_handler: Optional[Callable[[Any], JSONSerializable]] = None,
    lines: bool = False,
    compression: Optional[str] = "infer",
    index: bool = True,
    indent: int = 0,
):
    if not index and orient not in ["split", "table"]:
        raise ValueError(
            "'index=False' is only valid when 'orient' is 'split' or 'table'"
        )

    path_or_buf = stringify_path(path_or_buf)
    if lines and orient != "records":
        raise ValueError("'lines' keyword is only valid when 'orient' is 'records'")

    if orient == "table" and isinstance(obj, Series):
        obj = obj.to_frame(name=obj.name or "values")

    writer: Type["Writer"]
    if orient == "table" and isinstance(obj, DataFrame):
        writer = JSONTableWriter
    elif isinstance(obj, Series):
        writer = SeriesWriter
    elif isinstance(obj, DataFrame):
        writer = FrameWriter
    else:
        raise NotImplementedError("'obj' should be a Series or a DataFrame")

    s = writer(
        obj,
        orient=orient,
        date_format=date_format,
        double_precision=double_precision,
        ensure_ascii=force_ascii,
        date_unit=date_unit,
        default_handler=default_handler,
        index=index,
        indent=indent,
    ).write()

    if lines:
        s = convert_to_line_delimits(s)

    if isinstance(path_or_buf, str):
        fh, handles = get_handle(path_or_buf, "w", compression=compression)
        try:
            fh.write(s)
        finally:
            fh.close()
    elif path_or_buf is None:
        return s
    else:
        path_or_buf.write(s)
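

# A minimal usage sketch (illustrative only): passing ``path_or_buf=None``
# returns the serialized string, here converted to line-delimited JSON.
#
#   >>> to_json(None, DataFrame({"a": [1, 2]}), orient="records", lines=True)
#   '{"a":1}\n{"a":2}'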


class Writer:
    def __init__(
        self,
        obj,
        orient: Optional[str],
        date_format: str,
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        index: bool,
        default_handler: Optional[Callable[[Any], JSONSerializable]] = None,
        indent: int = 0,
    ):
        self.obj = obj

        if orient is None:
            orient = self._default_orient  # type: ignore

        self.orient = orient
        self.date_format = date_format
        self.double_precision = double_precision
        self.ensure_ascii = ensure_ascii
        self.date_unit = date_unit
        self.default_handler = default_handler
        self.index = index
        self.indent = indent

        self.is_copy = None
        self._format_axes()

    def _format_axes(self):
        raise AbstractMethodError(self)

    def write(self):
        return self._write(
            self.obj,
            self.orient,
            self.double_precision,
            self.ensure_ascii,
            self.date_unit,
            self.date_format == "iso",
            self.default_handler,
            self.indent,
        )

    def _write(
        self,
        obj,
        orient: Optional[str],
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        iso_dates: bool,
        default_handler: Optional[Callable[[Any], JSONSerializable]],
        indent: int,
    ):
        return dumps(
            obj,
            orient=orient,
            double_precision=double_precision,
            ensure_ascii=ensure_ascii,
            date_unit=date_unit,
            iso_dates=iso_dates,
            default_handler=default_handler,
            indent=indent,
        )
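

# Each Writer subclass below supplies a ``_default_orient`` and may preprocess
# ``obj`` in ``_write`` before delegating to the shared ``dumps`` call above.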


class SeriesWriter(Writer):
    _default_orient = "index"

    def _format_axes(self):
        if not self.obj.index.is_unique and self.orient == "index":
            raise ValueError(f"Series index must be unique for orient='{self.orient}'")

    def _write(
        self,
        obj,
        orient: Optional[str],
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        iso_dates: bool,
        default_handler: Optional[Callable[[Any], JSONSerializable]],
        indent: int,
    ):
        if not self.index and orient == "split":
            obj = {"name": obj.name, "data": obj.values}
        return super()._write(
            obj,
            orient,
            double_precision,
            ensure_ascii,
            date_unit,
            iso_dates,
            default_handler,
            indent,
        )


class FrameWriter(Writer):
    _default_orient = "columns"

    def _format_axes(self):
        """
        Validate that the axes are unique for the requested orient.
        """
        if not self.obj.index.is_unique and self.orient in ("index", "columns"):
            raise ValueError(
                f"DataFrame index must be unique for orient='{self.orient}'."
            )
        if not self.obj.columns.is_unique and self.orient in (
            "index",
            "columns",
            "records",
        ):
            raise ValueError(
                f"DataFrame columns must be unique for orient='{self.orient}'."
            )

    def _write(
        self,
        obj,
        orient: Optional[str],
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        iso_dates: bool,
        default_handler: Optional[Callable[[Any], JSONSerializable]],
        indent: int,
    ):
        if not self.index and orient == "split":
            obj = obj.to_dict(orient="split")
            del obj["index"]
        return super()._write(
            obj,
            orient,
            double_precision,
            ensure_ascii,
            date_unit,
            iso_dates,
            default_handler,
            indent,
        )
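

# With ``index=False`` and ``orient="split"``, FrameWriter._write first
# flattens the frame to a dict and drops its "index" entry, e.g.
# (illustrative): DataFrame({"a": [1]}).to_dict(orient="split") gives
# {"index": [0], "columns": ["a"], "data": [[1]]}.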


class JSONTableWriter(FrameWriter):
    _default_orient = "records"

    def __init__(
        self,
        obj,
        orient: Optional[str],
        date_format: str,
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        index: bool,
        default_handler: Optional[Callable[[Any], JSONSerializable]] = None,
        indent: int = 0,
    ):
        """
        Adds a `schema` attribute with the Table Schema, resets
        the index (can't do in caller, because the schema inference needs
        to know what the index is), forces orient to records, and forces
        date_format to 'iso'.
        """
        super().__init__(
            obj,
            orient,
            date_format,
            double_precision,
            ensure_ascii,
            date_unit,
            index,
            default_handler=default_handler,
            indent=indent,
        )

        if date_format != "iso":
            msg = (
                "Trying to write with `orient='table'` and "
                f"`date_format='{date_format}'`. Table Schema requires dates "
                "to be formatted with `date_format='iso'`"
            )
            raise ValueError(msg)

        self.schema = build_table_schema(obj, index=self.index)

        # NotImplemented on a column MultiIndex
        if obj.ndim == 2 and isinstance(obj.columns, MultiIndex):
            raise NotImplementedError("orient='table' is not supported for MultiIndex")

        # TODO: Do this timedelta properly in objToJSON.c. See GH #15137
        if (
            (obj.ndim == 1)
            and (obj.name in set(obj.index.names))
            or len(obj.columns & obj.index.names)
        ):
            msg = "Overlapping names between the index and columns"
            raise ValueError(msg)

        obj = obj.copy()
        timedeltas = obj.select_dtypes(include=["timedelta"]).columns
        if len(timedeltas):
            obj[timedeltas] = obj[timedeltas].applymap(lambda x: x.isoformat())
        # Convert PeriodIndex to datetimes before serializing
        if is_period_dtype(obj.index):
            obj.index = obj.index.to_timestamp()

        # exclude index from obj if index=False
        if not self.index:
            self.obj = obj.reset_index(drop=True)
        else:
            self.obj = obj.reset_index(drop=False)
        self.date_format = "iso"
        self.orient = "records"
        self.index = index

    def _write(
        self,
        obj,
        orient,
        double_precision,
        ensure_ascii,
        date_unit,
        iso_dates,
        default_handler,
        indent,
    ):
        table_obj = {"schema": self.schema, "data": obj}
        serialized = super()._write(
            table_obj,
            orient,
            double_precision,
            ensure_ascii,
            date_unit,
            iso_dates,
            default_handler,
            indent,
        )

        return serialized
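

# ``JSONTableWriter._write`` wraps the records in a Table Schema envelope,
# roughly (illustrative):
#   {"schema": {"fields": [...], "primaryKey": ..., "pandas_version": ...},
#    "data": [{...}, ...]}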


@deprecate_kwarg(old_arg_name="numpy", new_arg_name=None)
def read_json(
    path_or_buf=None,
    orient=None,
    typ="frame",
    dtype=None,
    convert_axes=None,
    convert_dates=True,
    keep_default_dates=True,
    numpy=False,
    precise_float=False,
    date_unit=None,
    encoding=None,
    lines=False,
    chunksize=None,
    compression="infer",
):
    """
    Convert a JSON string to a pandas object.

    Parameters
    ----------
    path_or_buf : a valid JSON str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.json``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function)
        or ``StringIO``.
    orient : str
        Indication of expected JSON string format.
        Compatible JSON strings can be produced by ``to_json()`` with a
        corresponding orient value.
        The set of possible orients is:

        - ``'split'`` : dict like
          ``{index -> [index], columns -> [columns], data -> [values]}``
        - ``'records'`` : list like
          ``[{column -> value}, ... , {column -> value}]``
        - ``'index'`` : dict like ``{index -> {column -> value}}``
        - ``'columns'`` : dict like ``{column -> {index -> value}}``
        - ``'values'`` : just the values array

        The allowed and default values depend on the value
        of the `typ` parameter.

        * when ``typ == 'series'``,

          - allowed orients are ``{'split','records','index'}``
          - default is ``'index'``
          - The Series index must be unique for orient ``'index'``.

        * when ``typ == 'frame'``,

          - allowed orients are ``{'split','records','index',
            'columns','values', 'table'}``
          - default is ``'columns'``
          - The DataFrame index must be unique for orients ``'index'`` and
            ``'columns'``.
          - The DataFrame columns must be unique for orients ``'index'``,
            ``'columns'``, and ``'records'``.

        .. versionadded:: 0.23.0
           'table' as an allowed value for the ``orient`` argument

    typ : {'frame', 'series'}, default 'frame'
        The type of object to recover.

    dtype : bool or dict, default None
        If True, infer dtypes; if a dict of column to dtype, then use those;
        if False, then don't infer dtypes at all. Applies only to the data.

        For all ``orient`` values except ``'table'``, default is True.

        .. versionchanged:: 0.25.0

           Not applicable for ``orient='table'``.

    convert_axes : bool, default None
        Try to convert the axes to the proper dtypes.

        For all ``orient`` values except ``'table'``, default is True.

        .. versionchanged:: 0.25.0

           Not applicable for ``orient='table'``.

    convert_dates : bool or list of str, default True
        If True, then try to parse datelike columns; if a list of str, the
        names of the columns to parse for dates. A column label is datelike if

        * it ends with ``'_at'``,

        * it ends with ``'_time'``,

        * it begins with ``'timestamp'``,

        * it is ``'modified'``,

        * it is ``'date'``, or

        * it is ``'datetime'``.

    keep_default_dates : bool, default True
        If parsing dates, then parse the default datelike columns.

    numpy : bool, default False
        Direct decoding to numpy arrays. Supports numeric data only, although
        non-numeric column and index labels are supported. Note also that the
        JSON ordering MUST be the same for each term if numpy=True.

        .. deprecated:: 1.0.0

    precise_float : bool, default False
        Set to enable usage of the higher precision (strtod) function when
        decoding strings to double values. The default (False) is to use the
        fast but less precise builtin functionality.

    date_unit : str, default None
        The timestamp unit to detect if converting dates. The default
        behaviour is to try to detect the correct precision, but if this is
        not desired then pass one of 's', 'ms', 'us' or 'ns' to force parsing
        only seconds, milliseconds, microseconds or nanoseconds respectively.

    encoding : str, default 'utf-8'
        The encoding to use to decode py3 bytes.

    lines : bool, default False
        Read the file as a json object per line.

    chunksize : int, optional
        Return JsonReader object for iteration.
        See the `line-delimited json docs
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#line-delimited-json>`_
        for more information on ``chunksize``.
        This can only be passed if `lines=True`.
        If this is None, the file will be read into memory all at once.

        .. versionadded:: 0.21.0

    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, zip or xz if path_or_buf is a string ending in
        '.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression
        otherwise. If using 'zip', the ZIP file must contain only one data
        file to be read in. Set to None for no decompression.

        .. versionadded:: 0.21.0

    Returns
    -------
    Series or DataFrame
        The type returned depends on the value of `typ`.

    See Also
    --------
    DataFrame.to_json : Convert a DataFrame to a JSON string.
    Series.to_json : Convert a Series to a JSON string.

    Notes
    -----
    Specific to ``orient='table'``, if a :class:`DataFrame` with a literal
    :class:`Index` name of `index` gets written with :func:`to_json`, the
    subsequent read operation will incorrectly set the :class:`Index` name to
    ``None``. This is because `index` is also used by :func:`DataFrame.to_json`
    to denote a missing :class:`Index` name, and the subsequent
    :func:`read_json` operation cannot distinguish between the two. The same
    limitation is encountered with a :class:`MultiIndex` and any names
    beginning with ``'level_'``.

    Examples
    --------
    >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
    ...                   index=['row 1', 'row 2'],
    ...                   columns=['col 1', 'col 2'])

    Encoding/decoding a DataFrame using ``'split'`` formatted JSON:

    >>> df.to_json(orient='split')
    '{"columns":["col 1","col 2"],
      "index":["row 1","row 2"],
      "data":[["a","b"],["c","d"]]}'
    >>> pd.read_json(_, orient='split')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a DataFrame using ``'index'`` formatted JSON:

    >>> df.to_json(orient='index')
    '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
    >>> pd.read_json(_, orient='index')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a DataFrame using ``'records'`` formatted JSON.
    Note that index labels are not preserved with this encoding.

    >>> df.to_json(orient='records')
    '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
    >>> pd.read_json(_, orient='records')
      col 1 col 2
    0     a     b
    1     c     d

    Encoding with Table Schema:

    >>> df.to_json(orient='table')
    '{"schema": {"fields": [{"name": "index", "type": "string"},
                            {"name": "col 1", "type": "string"},
                            {"name": "col 2", "type": "string"}],
                 "primaryKey": "index",
                 "pandas_version": "0.20.0"},
      "data": [{"index": "row 1", "col 1": "a", "col 2": "b"},
               {"index": "row 2", "col 1": "c", "col 2": "d"}]}'
    """
    if orient == "table" and dtype:
        raise ValueError("cannot pass both dtype and orient='table'")
    if orient == "table" and convert_axes:
        raise ValueError("cannot pass both convert_axes and orient='table'")

    if dtype is None and orient != "table":
        dtype = True
    if convert_axes is None and orient != "table":
        convert_axes = True
    if encoding is None:
        encoding = "utf-8"

    compression = infer_compression(path_or_buf, compression)
    filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
        path_or_buf, encoding=encoding, compression=compression
    )

    json_reader = JsonReader(
        filepath_or_buffer,
        orient=orient,
        typ=typ,
        dtype=dtype,
        convert_axes=convert_axes,
        convert_dates=convert_dates,
        keep_default_dates=keep_default_dates,
        numpy=numpy,
        precise_float=precise_float,
        date_unit=date_unit,
        encoding=encoding,
        lines=lines,
        chunksize=chunksize,
        compression=compression,
    )

    if chunksize:
        return json_reader

    result = json_reader.read()
    if should_close:
        filepath_or_buffer.close()

    return result
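

# A minimal usage sketch (illustrative only): streaming a line-delimited file
# in fixed-size chunks instead of materializing it all at once. "data.json"
# and ``process`` are hypothetical.
#
#   >>> reader = read_json("data.json", lines=True, chunksize=1000)
#   >>> for chunk in reader:   # each chunk is a DataFrame of <= 1000 rows
#   ...     process(chunk)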


class JsonReader(abc.Iterator):
    """
    JsonReader provides an interface for reading in a JSON file.

    If initialized with ``lines=True`` and ``chunksize``, can be iterated over
    ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the
    whole document.
    """

    def __init__(
        self,
        filepath_or_buffer,
        orient,
        typ,
        dtype,
        convert_axes,
        convert_dates,
        keep_default_dates,
        numpy,
        precise_float,
        date_unit,
        encoding,
        lines,
        chunksize,
        compression,
    ):
        self.path_or_buf = filepath_or_buffer
        self.orient = orient
        self.typ = typ
        self.dtype = dtype
        self.convert_axes = convert_axes
        self.convert_dates = convert_dates
        self.keep_default_dates = keep_default_dates
        self.numpy = numpy
        self.precise_float = precise_float
        self.date_unit = date_unit
        self.encoding = encoding
        self.compression = compression
        self.lines = lines
        self.chunksize = chunksize
        self.nrows_seen = 0
        self.should_close = False

        if self.chunksize is not None:
            self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
            if not self.lines:
                raise ValueError("chunksize can only be passed if lines=True")

        data = self._get_data_from_filepath(filepath_or_buffer)
        self.data = self._preprocess_data(data)

    def _preprocess_data(self, data):
        """
        At this point, the data either has a `read` attribute (e.g. a file
        object or a StringIO) or is a string that is a JSON document.

        If self.chunksize, we prepare the data for the `__next__` method.
        Otherwise, we read it into memory for the `read` method.
        """
        if hasattr(data, "read") and not self.chunksize:
            data = data.read()
        if not hasattr(data, "read") and self.chunksize:
            data = StringIO(data)

        return data

    def _get_data_from_filepath(self, filepath_or_buffer):
        """
        The function read_json accepts three input types:
            1. filepath (string-like)
            2. file-like object (e.g. open file object, StringIO)
            3. JSON string

        This method turns (1) into (2) to simplify the rest of the processing.
        It returns input types (2) and (3) unchanged.
        """
        data = filepath_or_buffer

        exists = False
        if isinstance(data, str):
            try:
                exists = os.path.exists(filepath_or_buffer)
            # gh-5874: if the filepath is too long, this will raise here
            except (TypeError, ValueError):
                pass

        if exists or self.compression is not None:
            data, _ = get_handle(
                filepath_or_buffer,
                "r",
                encoding=self.encoding,
                compression=self.compression,
            )
            self.should_close = True
            self.open_stream = data

        return data

    def _combine_lines(self, lines) -> str:
        """
        Combines a list of JSON objects into one JSON array string.
        """
        lines = filter(None, map(lambda x: x.strip(), lines))
        return "[" + ",".join(lines) + "]"

    def read(self):
        """
        Read the whole JSON input into a pandas object.
        """
        if self.lines and self.chunksize:
            obj = concat(self)
        elif self.lines:
            data = ensure_str(self.data)
            obj = self._get_object_parser(self._combine_lines(data.split("\n")))
        else:
            obj = self._get_object_parser(self.data)
        self.close()
        return obj

    def _get_object_parser(self, json):
        """
        Parses a json document into a pandas object.
        """
        typ = self.typ
        dtype = self.dtype
        kwargs = {
            "orient": self.orient,
            "dtype": self.dtype,
            "convert_axes": self.convert_axes,
            "convert_dates": self.convert_dates,
            "keep_default_dates": self.keep_default_dates,
            "numpy": self.numpy,
            "precise_float": self.precise_float,
            "date_unit": self.date_unit,
        }
        obj = None
        if typ == "frame":
            obj = FrameParser(json, **kwargs).parse()

        if typ == "series" or obj is None:
            if not isinstance(dtype, bool):
                kwargs["dtype"] = dtype
            obj = SeriesParser(json, **kwargs).parse()

        return obj

    def close(self):
        """
        If we opened a stream earlier, in _get_data_from_filepath, we should
        close it.

        If an open stream or file was passed, we leave it open.
        """
        if self.should_close:
            try:
                self.open_stream.close()
            except (IOError, AttributeError):
                pass

    def __next__(self):
        lines = list(islice(self.data, self.chunksize))
        if lines:
            lines_json = self._combine_lines(lines)
            obj = self._get_object_parser(lines_json)

            # Make sure that the returned objects have the right index.
            obj.index = range(self.nrows_seen, self.nrows_seen + len(obj))
            self.nrows_seen += len(obj)

            return obj

        self.close()
        raise StopIteration


class Parser:

    _STAMP_UNITS = ("s", "ms", "us", "ns")
    _MIN_STAMPS = {
        "s": 31536000,
        "ms": 31536000000,
        "us": 31536000000000,
        "ns": 31536000000000000,
    }
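    # 31536000 == 365 * 24 * 3600: values at or below one year's worth of the
    # relevant unit are treated as too small to be timestamps by
    # ``_try_convert_to_date`` below.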

    def __init__(
        self,
        json,
        orient,
        dtype=None,
        convert_axes=True,
        convert_dates=True,
        keep_default_dates=False,
        numpy=False,
        precise_float=False,
        date_unit=None,
    ):
        self.json = json

        if orient is None:
            orient = self._default_orient
        self.orient = orient

        self.dtype = dtype

        if orient == "split":
            numpy = False

        if date_unit is not None:
            date_unit = date_unit.lower()
            if date_unit not in self._STAMP_UNITS:
                raise ValueError(f"date_unit must be one of {self._STAMP_UNITS}")
            self.min_stamp = self._MIN_STAMPS[date_unit]
        else:
            self.min_stamp = self._MIN_STAMPS["s"]

        self.numpy = numpy
        self.precise_float = precise_float
        self.convert_axes = convert_axes
        self.convert_dates = convert_dates
        self.date_unit = date_unit
        self.keep_default_dates = keep_default_dates
        self.obj = None

    def check_keys_split(self, decoded):
        """
        Checks that dict has only the appropriate keys for orient='split'.
        """
        bad_keys = set(decoded.keys()).difference(set(self._split_keys))
        if bad_keys:
            bad_keys = ", ".join(bad_keys)
            raise ValueError(f"JSON data had unexpected key(s): {bad_keys}")

    def parse(self):
        # try numpy
        numpy = self.numpy
        if numpy:
            self._parse_numpy()
        else:
            self._parse_no_numpy()

        if self.obj is None:
            return None
        if self.convert_axes:
            self._convert_axes()
        self._try_convert_types()
        return self.obj

    def _convert_axes(self):
        """
        Try to convert the axes to the proper dtypes.
        """
        for axis in self.obj._AXIS_NUMBERS.keys():
            new_axis, result = self._try_convert_data(
                axis, self.obj._get_axis(axis), use_dtypes=False, convert_dates=True
            )
            if result:
                setattr(self.obj, axis, new_axis)

    def _try_convert_types(self):
        raise AbstractMethodError(self)

    def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True):
        """
        Try to parse an ndarray-like into a column by inferring its dtype.
        """
        # don't try to coerce unless a conversion is forced
        if use_dtypes:
            if not self.dtype:
                return data, False
            elif self.dtype is True:
                pass
            else:
                # dtype to force
                dtype = (
                    self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype
                )
                if dtype is not None:
                    try:
                        dtype = np.dtype(dtype)
                        return data.astype(dtype), True
                    except (TypeError, ValueError):
                        return data, False

        if convert_dates:
            new_data, result = self._try_convert_to_date(data)
            if result:
                return new_data, True

        result = False

        if data.dtype == "object":
            # try float
            try:
                data = data.astype("float64")
                result = True
            except (TypeError, ValueError):
                pass

        if data.dtype.kind == "f":
            if data.dtype != "float64":
                # coerce floats to 64
                try:
                    data = data.astype("float64")
                    result = True
                except (TypeError, ValueError):
                    pass

        # don't coerce 0-len data
        if len(data) and (data.dtype == "float" or data.dtype == "object"):
            # coerce ints if we can
            try:
                new_data = data.astype("int64")
                if (new_data == data).all():
                    data = new_data
                    result = True
            except (TypeError, ValueError):
                pass

        # coerce ints to 64
        if data.dtype == "int":
            try:
                data = data.astype("int64")
                result = True
            except (TypeError, ValueError):
                pass

        return data, result

    def _try_convert_to_date(self, data):
        """
        Try to parse an ndarray-like into a date column.

        Try to coerce objects in epoch/iso formats and integers/floats in
        epoch formats. Return the data and a boolean indicating whether
        parsing succeeded.
        """
        # no conversion on empty
        if not len(data):
            return data, False

        new_data = data
        if new_data.dtype == "object":
            try:
                new_data = data.astype("int64")
            except (TypeError, ValueError, OverflowError):
                pass

        # ignore numbers that are out of range
        if issubclass(new_data.dtype.type, np.number):
            in_range = (
                isna(new_data.values)
                | (new_data > self.min_stamp)
                | (new_data.values == iNaT)
            )
            if not in_range.all():
                return data, False

        date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
        for date_unit in date_units:
            try:
                new_data = to_datetime(new_data, errors="raise", unit=date_unit)
            except (ValueError, OverflowError):
                continue
            return new_data, True
        return data, False
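
    # E.g. (illustrative): a column of epoch seconds such as [1546300800,
    # 1546387200] passes the range check and parses on the unit="s" attempt,
    # yielding 2019-01-01 and 2019-01-02.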

    def _try_convert_dates(self):
        raise AbstractMethodError(self)


class SeriesParser(Parser):
    _default_orient = "index"
    _split_keys = ("name", "index", "data")

    def _parse_no_numpy(self):
        data = loads(self.json, precise_float=self.precise_float)

        if self.orient == "split":
            decoded = {str(k): v for k, v in data.items()}
            self.check_keys_split(decoded)
            self.obj = create_series_with_explicit_dtype(**decoded)
        else:
            self.obj = create_series_with_explicit_dtype(data, dtype_if_empty=object)

    def _parse_numpy(self):
        load_kwargs = {
            "dtype": None,
            "numpy": True,
            "precise_float": self.precise_float,
        }
        if self.orient in ["columns", "index"]:
            load_kwargs["labelled"] = True
        loads_ = functools.partial(loads, **load_kwargs)
        data = loads_(self.json)

        if self.orient == "split":
            decoded = {str(k): v for k, v in data.items()}
            self.check_keys_split(decoded)
            self.obj = create_series_with_explicit_dtype(**decoded)
        elif self.orient in ["columns", "index"]:
            self.obj = create_series_with_explicit_dtype(*data, dtype_if_empty=object)
        else:
            self.obj = create_series_with_explicit_dtype(data, dtype_if_empty=object)

    def _try_convert_types(self):
        if self.obj is None:
            return
        obj, result = self._try_convert_data(
            "data", self.obj, convert_dates=self.convert_dates
        )
        if result:
            self.obj = obj

1039 

1040class FrameParser(Parser): 

1041 _default_orient = "columns" 

1042 _split_keys = ("columns", "index", "data") 

1043 

1044 def _parse_numpy(self): 

1045 

1046 json = self.json 

1047 orient = self.orient 

1048 

1049 if orient == "columns": 

1050 args = loads( 

1051 json, 

1052 dtype=None, 

1053 numpy=True, 

1054 labelled=True, 

1055 precise_float=self.precise_float, 

1056 ) 

1057 if len(args): 

1058 args = (args[0].T, args[2], args[1]) 

1059 self.obj = DataFrame(*args) 

1060 elif orient == "split": 

1061 decoded = loads( 

1062 json, dtype=None, numpy=True, precise_float=self.precise_float 

1063 ) 

1064 decoded = {str(k): v for k, v in decoded.items()} 

1065 self.check_keys_split(decoded) 

1066 self.obj = DataFrame(**decoded) 

1067 elif orient == "values": 

1068 self.obj = DataFrame( 

1069 loads(json, dtype=None, numpy=True, precise_float=self.precise_float) 

1070 ) 

1071 else: 

1072 self.obj = DataFrame( 

1073 *loads( 

1074 json, 

1075 dtype=None, 

1076 numpy=True, 

1077 labelled=True, 

1078 precise_float=self.precise_float, 

1079 ) 

1080 ) 

1081 

1082 def _parse_no_numpy(self): 

1083 

1084 json = self.json 

1085 orient = self.orient 

1086 

1087 if orient == "columns": 

1088 self.obj = DataFrame( 

1089 loads(json, precise_float=self.precise_float), dtype=None 

1090 ) 

1091 elif orient == "split": 

1092 decoded = { 

1093 str(k): v 

1094 for k, v in loads(json, precise_float=self.precise_float).items() 

1095 } 

1096 self.check_keys_split(decoded) 

1097 self.obj = DataFrame(dtype=None, **decoded) 

1098 elif orient == "index": 

1099 self.obj = DataFrame.from_dict( 

1100 loads(json, precise_float=self.precise_float), 

1101 dtype=None, 

1102 orient="index", 

1103 ) 

1104 elif orient == "table": 

1105 self.obj = parse_table_schema(json, precise_float=self.precise_float) 

1106 else: 

1107 self.obj = DataFrame( 

1108 loads(json, precise_float=self.precise_float), dtype=None 

1109 ) 

    def _process_converter(self, f, filt=None):
        """
        Take a conversion function and possibly recreate the frame.
        """
        if filt is None:
            filt = lambda col, c: True

        needs_new_obj = False
        new_obj = dict()
        for i, (col, c) in enumerate(self.obj.items()):
            if filt(col, c):
                new_data, result = f(col, c)
                if result:
                    c = new_data
                    needs_new_obj = True
            new_obj[i] = c

        if needs_new_obj:
            # possibly handle dup columns
            new_obj = DataFrame(new_obj, index=self.obj.index)
            new_obj.columns = self.obj.columns
            self.obj = new_obj
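
    # Note: ``f`` returns ``(new_values, converted)`` per column; using integer
    # keys in ``new_obj`` rather than labels is what lets duplicate column
    # names survive the rebuild above.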

    def _try_convert_types(self):
        if self.obj is None:
            return
        if self.convert_dates:
            self._try_convert_dates()

        self._process_converter(
            lambda col, c: self._try_convert_data(col, c, convert_dates=False)
        )

1146 def _try_convert_dates(self): 

1147 if self.obj is None: 

1148 return 

1149 

1150 # our columns to parse 

1151 convert_dates = self.convert_dates 

1152 if convert_dates is True: 

1153 convert_dates = [] 

1154 convert_dates = set(convert_dates) 

1155 

1156 def is_ok(col) -> bool: 

1157 """ 

1158 Return if this col is ok to try for a date parse. 

1159 """ 

1160 if not isinstance(col, str): 

1161 return False 

1162 

1163 col_lower = col.lower() 

1164 if ( 

1165 col_lower.endswith("_at") 

1166 or col_lower.endswith("_time") 

1167 or col_lower == "modified" 

1168 or col_lower == "date" 

1169 or col_lower == "datetime" 

1170 or col_lower.startswith("timestamp") 

1171 ): 

1172 return True 

1173 return False 

1174 

1175 self._process_converter( 

1176 lambda col, c: self._try_convert_to_date(c), 

1177 lambda col, c: ( 

1178 (self.keep_default_dates and is_ok(col)) or col in convert_dates 

1179 ), 

1180 )