Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/io/json/_json.py : 18%

from collections import abc
import functools
from io import StringIO
from itertools import islice
import os
from typing import Any, Callable, Optional, Type

import numpy as np

import pandas._libs.json as json
from pandas._libs.tslibs import iNaT
from pandas._typing import JSONSerializable
from pandas.errors import AbstractMethodError
from pandas.util._decorators import deprecate_kwarg

from pandas.core.dtypes.common import ensure_str, is_period_dtype

from pandas import DataFrame, MultiIndex, Series, isna, to_datetime
from pandas.core.construction import create_series_with_explicit_dtype
from pandas.core.reshape.concat import concat

from pandas.io.common import (
    get_filepath_or_buffer,
    get_handle,
    infer_compression,
    stringify_path,
)
from pandas.io.json._normalize import convert_to_line_delimits
from pandas.io.json._table_schema import build_table_schema, parse_table_schema
from pandas.io.parsers import _validate_integer

loads = json.loads
dumps = json.dumps

TABLE_SCHEMA_VERSION = "0.20.0"

# interface to/from
def to_json(
    path_or_buf,
    obj,
    orient: Optional[str] = None,
    date_format: str = "epoch",
    double_precision: int = 10,
    force_ascii: bool = True,
    date_unit: str = "ms",
    default_handler: Optional[Callable[[Any], JSONSerializable]] = None,
    lines: bool = False,
    compression: Optional[str] = "infer",
    index: bool = True,
    indent: int = 0,
):

    if not index and orient not in ["split", "table"]:
        raise ValueError(
            "'index=False' is only valid when 'orient' is 'split' or 'table'"
        )

    path_or_buf = stringify_path(path_or_buf)
    if lines and orient != "records":
        raise ValueError("'lines' keyword only valid when 'orient' is records")

    if orient == "table" and isinstance(obj, Series):
        obj = obj.to_frame(name=obj.name or "values")

    writer: Type["Writer"]
    if orient == "table" and isinstance(obj, DataFrame):
        writer = JSONTableWriter
    elif isinstance(obj, Series):
        writer = SeriesWriter
    elif isinstance(obj, DataFrame):
        writer = FrameWriter
    else:
        raise NotImplementedError("'obj' should be a Series or a DataFrame")

    s = writer(
        obj,
        orient=orient,
        date_format=date_format,
        double_precision=double_precision,
        ensure_ascii=force_ascii,
        date_unit=date_unit,
        default_handler=default_handler,
        index=index,
        indent=indent,
    ).write()

    if lines:
        s = convert_to_line_delimits(s)

    if isinstance(path_or_buf, str):
        fh, handles = get_handle(path_or_buf, "w", compression=compression)
        try:
            fh.write(s)
        finally:
            fh.close()
    elif path_or_buf is None:
        return s
    else:
        path_or_buf.write(s)
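
# Illustrative usage sketch of the writer dispatch above (a minimal example,
# not an exhaustive spec; exact output strings can differ between pandas
# versions):
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"a": [1, 2]})
#   >>> df.to_json(orient="records")               # handled by FrameWriter
#   '[{"a":1},{"a":2}]'
#   >>> df.to_json(orient="records", lines=True)   # post-processed into NDJSON
#   '{"a":1}\n{"a":2}'
#   >>> df["a"].to_json(orient="split")            # handled by SeriesWriter
#   '{"name":"a","index":[0,1],"data":[1,2]}'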

class Writer:
    def __init__(
        self,
        obj,
        orient: Optional[str],
        date_format: str,
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        index: bool,
        default_handler: Optional[Callable[[Any], JSONSerializable]] = None,
        indent: int = 0,
    ):
        self.obj = obj

        if orient is None:
            orient = self._default_orient  # type: ignore

        self.orient = orient
        self.date_format = date_format
        self.double_precision = double_precision
        self.ensure_ascii = ensure_ascii
        self.date_unit = date_unit
        self.default_handler = default_handler
        self.index = index
        self.indent = indent

        self.is_copy = None
        self._format_axes()

    def _format_axes(self):
        raise AbstractMethodError(self)

    def write(self):
        return self._write(
            self.obj,
            self.orient,
            self.double_precision,
            self.ensure_ascii,
            self.date_unit,
            self.date_format == "iso",
            self.default_handler,
            self.indent,
        )

    def _write(
        self,
        obj,
        orient: Optional[str],
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        iso_dates: bool,
        default_handler: Optional[Callable[[Any], JSONSerializable]],
        indent: int,
    ):
        return dumps(
            obj,
            orient=orient,
            double_precision=double_precision,
            ensure_ascii=ensure_ascii,
            date_unit=date_unit,
            iso_dates=iso_dates,
            default_handler=default_handler,
            indent=indent,
        )

class SeriesWriter(Writer):
    _default_orient = "index"

    def _format_axes(self):
        if not self.obj.index.is_unique and self.orient == "index":
            raise ValueError(f"Series index must be unique for orient='{self.orient}'")

    def _write(
        self,
        obj,
        orient: Optional[str],
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        iso_dates: bool,
        default_handler: Optional[Callable[[Any], JSONSerializable]],
        indent: int,
    ):
        if not self.index and orient == "split":
            obj = {"name": obj.name, "data": obj.values}
        return super()._write(
            obj,
            orient,
            double_precision,
            ensure_ascii,
            date_unit,
            iso_dates,
            default_handler,
            indent,
        )

class FrameWriter(Writer):
    _default_orient = "columns"

    def _format_axes(self):
        """
        Raise if the index or columns are not unique for this orient.
        """
        if not self.obj.index.is_unique and self.orient in ("index", "columns"):
            raise ValueError(
                f"DataFrame index must be unique for orient='{self.orient}'."
            )
        if not self.obj.columns.is_unique and self.orient in (
            "index",
            "columns",
            "records",
        ):
            raise ValueError(
                f"DataFrame columns must be unique for orient='{self.orient}'."
            )

    def _write(
        self,
        obj,
        orient: Optional[str],
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        iso_dates: bool,
        default_handler: Optional[Callable[[Any], JSONSerializable]],
        indent: int,
    ):
        if not self.index and orient == "split":
            obj = obj.to_dict(orient="split")
            del obj["index"]
        return super()._write(
            obj,
            orient,
            double_precision,
            ensure_ascii,
            date_unit,
            iso_dates,
            default_handler,
            indent,
        )
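
# Sketch of the index=False handling in FrameWriter._write above: the frame is
# first converted to a 'split' dict and the "index" entry is dropped before
# serialization (illustrative only; exact output string may vary):
#
#   >>> pd.DataFrame({"a": [1, 2]}).to_json(orient="split", index=False)
#   '{"columns":["a"],"data":[[1],[2]]}'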

class JSONTableWriter(FrameWriter):
    _default_orient = "records"

    def __init__(
        self,
        obj,
        orient: Optional[str],
        date_format: str,
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        index: bool,
        default_handler: Optional[Callable[[Any], JSONSerializable]] = None,
        indent: int = 0,
    ):
        """
        Adds a `schema` attribute with the Table Schema, resets
        the index (can't do in caller, because the schema inference needs
        to know what the index is), forces orient to 'records', and forces
        date_format to 'iso'.
        """
        super().__init__(
            obj,
            orient,
            date_format,
            double_precision,
            ensure_ascii,
            date_unit,
            index,
            default_handler=default_handler,
            indent=indent,
        )

        if date_format != "iso":
            msg = (
                "Trying to write with `orient='table'` and "
                f"`date_format='{date_format}'`. Table Schema requires dates "
                "to be formatted with `date_format='iso'`"
            )
            raise ValueError(msg)

        self.schema = build_table_schema(obj, index=self.index)

        # NotImplemented on a column MultiIndex
        if obj.ndim == 2 and isinstance(obj.columns, MultiIndex):
            raise NotImplementedError("orient='table' is not supported for MultiIndex")

        # TODO: Do this timedelta properly in objToJSON.c See GH #15137
        if (
            (obj.ndim == 1)
            and (obj.name in set(obj.index.names))
            or len(obj.columns & obj.index.names)
        ):
            msg = "Overlapping names between the index and columns"
            raise ValueError(msg)

        obj = obj.copy()
        timedeltas = obj.select_dtypes(include=["timedelta"]).columns
        if len(timedeltas):
            obj[timedeltas] = obj[timedeltas].applymap(lambda x: x.isoformat())
        # Convert PeriodIndex to datetimes before serializing
        if is_period_dtype(obj.index):
            obj.index = obj.index.to_timestamp()

        # exclude index from obj if index=False
        if not self.index:
            self.obj = obj.reset_index(drop=True)
        else:
            self.obj = obj.reset_index(drop=False)
        self.date_format = "iso"
        self.orient = "records"
        self.index = index

    def _write(
        self,
        obj,
        orient,
        double_precision,
        ensure_ascii,
        date_unit,
        iso_dates,
        default_handler,
        indent,
    ):
        table_obj = {"schema": self.schema, "data": obj}
        serialized = super()._write(
            table_obj,
            orient,
            double_precision,
            ensure_ascii,
            date_unit,
            iso_dates,
            default_handler,
            indent,
        )

        return serialized
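
# Practical consequences of the checks in JSONTableWriter.__init__ above
# (illustrative sketch; the error text is the message constructed there):
#
#   >>> df = pd.DataFrame({"a": [1]})
#   >>> df.to_json(orient="table", date_format="epoch")
#   ValueError: Trying to write with `orient='table'` and `date_format='epoch'`. ...
#
# A DataFrame with MultiIndex columns is likewise rejected here with
# NotImplementedError, and overlapping index/column names raise ValueError.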

@deprecate_kwarg(old_arg_name="numpy", new_arg_name=None)
def read_json(
    path_or_buf=None,
    orient=None,
    typ="frame",
    dtype=None,
    convert_axes=None,
    convert_dates=True,
    keep_default_dates=True,
    numpy=False,
    precise_float=False,
    date_unit=None,
    encoding=None,
    lines=False,
    chunksize=None,
    compression="infer",
):
366 """
367 Convert a JSON string to pandas object.
369 Parameters
370 ----------
371 path_or_buf : a valid JSON str, path object or file-like object
372 Any valid string path is acceptable. The string could be a URL. Valid
373 URL schemes include http, ftp, s3, and file. For file URLs, a host is
374 expected. A local file could be:
375 ``file://localhost/path/to/table.json``.
377 If you want to pass in a path object, pandas accepts any
378 ``os.PathLike``.
380 By file-like object, we refer to objects with a ``read()`` method,
381 such as a file handler (e.g. via builtin ``open`` function)
382 or ``StringIO``.
383 orient : str
384 Indication of expected JSON string format.
385 Compatible JSON strings can be produced by ``to_json()`` with a
386 corresponding orient value.
387 The set of possible orients is:
389 - ``'split'`` : dict like
390 ``{index -> [index], columns -> [columns], data -> [values]}``
391 - ``'records'`` : list like
392 ``[{column -> value}, ... , {column -> value}]``
393 - ``'index'`` : dict like ``{index -> {column -> value}}``
394 - ``'columns'`` : dict like ``{column -> {index -> value}}``
395 - ``'values'`` : just the values array
397 The allowed and default values depend on the value
398 of the `typ` parameter.
400 * when ``typ == 'series'``,
402 - allowed orients are ``{'split','records','index'}``
403 - default is ``'index'``
404 - The Series index must be unique for orient ``'index'``.
406 * when ``typ == 'frame'``,
408 - allowed orients are ``{'split','records','index',
409 'columns','values', 'table'}``
410 - default is ``'columns'``
411 - The DataFrame index must be unique for orients ``'index'`` and
412 ``'columns'``.
413 - The DataFrame columns must be unique for orients ``'index'``,
414 ``'columns'``, and ``'records'``.
416 .. versionadded:: 0.23.0
417 'table' as an allowed value for the ``orient`` argument
419 typ : {'frame', 'series'}, default 'frame'
420 The type of object to recover.
422 dtype : bool or dict, default None
423 If True, infer dtypes; if a dict of column to dtype, then use those;
424 if False, then don't infer dtypes at all, applies only to the data.
426 For all ``orient`` values except ``'table'``, default is True.
428 .. versionchanged:: 0.25.0
430 Not applicable for ``orient='table'``.
432 convert_axes : bool, default None
433 Try to convert the axes to the proper dtypes.
435 For all ``orient`` values except ``'table'``, default is True.
437 .. versionchanged:: 0.25.0
439 Not applicable for ``orient='table'``.
441 convert_dates : bool or list of str, default True
442 List of columns to parse for dates. If True, then try to parse
443 datelike columns. A column label is datelike if
445 * it ends with ``'_at'``,
447 * it ends with ``'_time'``,
449 * it begins with ``'timestamp'``,
451 * it is ``'modified'``, or
453 * it is ``'date'``.
455 keep_default_dates : bool, default True
456 If parsing dates, then parse the default datelike columns.
458 numpy : bool, default False
459 Direct decoding to numpy arrays. Supports numeric data only, but
460 non-numeric column and index labels are supported. Note also that the
461 JSON ordering MUST be the same for each term if numpy=True.
463 .. deprecated:: 1.0.0
465 precise_float : bool, default False
466 Set to enable usage of higher precision (strtod) function when
467 decoding string to double values. Default (False) is to use fast but
468 less precise builtin functionality.
470 date_unit : str, default None
471 The timestamp unit to detect if converting dates. The default behaviour
472 is to try and detect the correct precision, but if this is not desired
473 then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
474 milliseconds, microseconds or nanoseconds respectively.
476 encoding : str, default is 'utf-8'
477 The encoding to use to decode py3 bytes.
479 lines : bool, default False
480 Read the file as a json object per line.
482 chunksize : int, optional
483 Return JsonReader object for iteration.
484 See the `line-delimited json docs
485 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#line-delimited-json>`_
486 for more information on ``chunksize``.
487 This can only be passed if `lines=True`.
488 If this is None, the file will be read into memory all at once.
490 .. versionadded:: 0.21.0
492 compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
493 For on-the-fly decompression of on-disk data. If 'infer', then use
494 gzip, bz2, zip or xz if path_or_buf is a string ending in
495 '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression
496 otherwise. If using 'zip', the ZIP file must contain only one data
497 file to be read in. Set to None for no decompression.
499 .. versionadded:: 0.21.0
501 Returns
502 -------
503 Series or DataFrame
504 The type returned depends on the value of `typ`.
506 See Also
507 --------
508 DataFrame.to_json : Convert a DataFrame to a JSON string.
509 Series.to_json : Convert a Series to a JSON string.
511 Notes
512 -----
513 Specific to ``orient='table'``, if a :class:`DataFrame` with a literal
514 :class:`Index` name of `index` gets written with :func:`to_json`, the
515 subsequent read operation will incorrectly set the :class:`Index` name to
516 ``None``. This is because `index` is also used by :func:`DataFrame.to_json`
517 to denote a missing :class:`Index` name, and the subsequent
518 :func:`read_json` operation cannot distinguish between the two. The same
519 limitation is encountered with a :class:`MultiIndex` and any names
520 beginning with ``'level_'``.
522 Examples
523 --------
525 >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
526 ... index=['row 1', 'row 2'],
527 ... columns=['col 1', 'col 2'])
529 Encoding/decoding a Dataframe using ``'split'`` formatted JSON:
531 >>> df.to_json(orient='split')
532 '{"columns":["col 1","col 2"],
533 "index":["row 1","row 2"],
534 "data":[["a","b"],["c","d"]]}'
535 >>> pd.read_json(_, orient='split')
536 col 1 col 2
537 row 1 a b
538 row 2 c d
540 Encoding/decoding a Dataframe using ``'index'`` formatted JSON:
542 >>> df.to_json(orient='index')
543 '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
544 >>> pd.read_json(_, orient='index')
545 col 1 col 2
546 row 1 a b
547 row 2 c d
549 Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
550 Note that index labels are not preserved with this encoding.
552 >>> df.to_json(orient='records')
553 '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
554 >>> pd.read_json(_, orient='records')
555 col 1 col 2
556 0 a b
557 1 c d
559 Encoding with Table Schema
561 >>> df.to_json(orient='table')
562 '{"schema": {"fields": [{"name": "index", "type": "string"},
563 {"name": "col 1", "type": "string"},
564 {"name": "col 2", "type": "string"}],
565 "primaryKey": "index",
566 "pandas_version": "0.20.0"},
567 "data": [{"index": "row 1", "col 1": "a", "col 2": "b"},
568 {"index": "row 2", "col 1": "c", "col 2": "d"}]}'
569 """

    if orient == "table" and dtype:
        raise ValueError("cannot pass both dtype and orient='table'")
    if orient == "table" and convert_axes:
        raise ValueError("cannot pass both convert_axes and orient='table'")

    if dtype is None and orient != "table":
        dtype = True
    if convert_axes is None and orient != "table":
        convert_axes = True
    if encoding is None:
        encoding = "utf-8"

    compression = infer_compression(path_or_buf, compression)
    filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
        path_or_buf, encoding=encoding, compression=compression
    )

    json_reader = JsonReader(
        filepath_or_buffer,
        orient=orient,
        typ=typ,
        dtype=dtype,
        convert_axes=convert_axes,
        convert_dates=convert_dates,
        keep_default_dates=keep_default_dates,
        numpy=numpy,
        precise_float=precise_float,
        date_unit=date_unit,
        encoding=encoding,
        lines=lines,
        chunksize=chunksize,
        compression=compression,
    )

    if chunksize:
        return json_reader

    result = json_reader.read()
    if should_close:
        filepath_or_buffer.close()

    return result
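
# Usage sketch for the lines/chunksize plumbing above (illustrative; assumes a
# hypothetical local file "data.json" with one JSON record per line, and
# ``process`` is a placeholder for user code):
#
#   >>> reader = pd.read_json("data.json", lines=True, chunksize=1000)
#   >>> for chunk in reader:       # each chunk is a DataFrame of <= 1000 rows
#   ...     process(chunk)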

class JsonReader(abc.Iterator):
    """
    JsonReader provides an interface for reading in a JSON file.

    If initialized with ``lines=True`` and ``chunksize``, can be iterated over
    ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the
    whole document.
    """

    def __init__(
        self,
        filepath_or_buffer,
        orient,
        typ,
        dtype,
        convert_axes,
        convert_dates,
        keep_default_dates,
        numpy,
        precise_float,
        date_unit,
        encoding,
        lines,
        chunksize,
        compression,
    ):

        self.path_or_buf = filepath_or_buffer
        self.orient = orient
        self.typ = typ
        self.dtype = dtype
        self.convert_axes = convert_axes
        self.convert_dates = convert_dates
        self.keep_default_dates = keep_default_dates
        self.numpy = numpy
        self.precise_float = precise_float
        self.date_unit = date_unit
        self.encoding = encoding
        self.compression = compression
        self.lines = lines
        self.chunksize = chunksize
        self.nrows_seen = 0
        self.should_close = False

        if self.chunksize is not None:
            self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
            if not self.lines:
                raise ValueError("chunksize can only be passed if lines=True")

        data = self._get_data_from_filepath(filepath_or_buffer)
        self.data = self._preprocess_data(data)

    def _preprocess_data(self, data):
        """
        At this point, the data either has a `read` attribute (e.g. a file
        object or a StringIO) or is a string that is a JSON document.

        If self.chunksize, we prepare the data for the `__next__` method.
        Otherwise, we read it into memory for the `read` method.
        """
        if hasattr(data, "read") and not self.chunksize:
            data = data.read()
        if not hasattr(data, "read") and self.chunksize:
            data = StringIO(data)

        return data

    def _get_data_from_filepath(self, filepath_or_buffer):
        """
        The function read_json accepts three input types:
            1. filepath (string-like)
            2. file-like object (e.g. open file object, StringIO)
            3. JSON string

        This method turns (1) into (2) to simplify the rest of the processing.
        It returns input types (2) and (3) unchanged.
        """
        data = filepath_or_buffer

        exists = False
        if isinstance(data, str):
            try:
                exists = os.path.exists(filepath_or_buffer)
            # gh-5874: if the filepath is too long will raise here
            except (TypeError, ValueError):
                pass

        if exists or self.compression is not None:
            data, _ = get_handle(
                filepath_or_buffer,
                "r",
                encoding=self.encoding,
                compression=self.compression,
            )
            self.should_close = True
            self.open_stream = data

        return data

    def _combine_lines(self, lines) -> str:
        """
        Combines a list of JSON objects into one JSON object.
        """
        lines = filter(None, map(lambda x: x.strip(), lines))
        return "[" + ",".join(lines) + "]"

    def read(self):
        """
        Read the whole JSON input into a pandas object.
        """
        if self.lines and self.chunksize:
            obj = concat(self)
        elif self.lines:
            data = ensure_str(self.data)
            obj = self._get_object_parser(self._combine_lines(data.split("\n")))
        else:
            obj = self._get_object_parser(self.data)
        self.close()
        return obj

    def _get_object_parser(self, json):
        """
        Parses a json document into a pandas object.
        """
        typ = self.typ
        dtype = self.dtype
        kwargs = {
            "orient": self.orient,
            "dtype": self.dtype,
            "convert_axes": self.convert_axes,
            "convert_dates": self.convert_dates,
            "keep_default_dates": self.keep_default_dates,
            "numpy": self.numpy,
            "precise_float": self.precise_float,
            "date_unit": self.date_unit,
        }
        obj = None
        if typ == "frame":
            obj = FrameParser(json, **kwargs).parse()

        if typ == "series" or obj is None:
            if not isinstance(dtype, bool):
                kwargs["dtype"] = dtype
            obj = SeriesParser(json, **kwargs).parse()

        return obj

    def close(self):
        """
        If we opened a stream earlier, in _get_data_from_filepath, we should
        close it.

        If an open stream or file was passed, we leave it open.
        """
        if self.should_close:
            try:
                self.open_stream.close()
            except (IOError, AttributeError):
                pass

    def __next__(self):
        lines = list(islice(self.data, self.chunksize))
        if lines:
            lines_json = self._combine_lines(lines)
            obj = self._get_object_parser(lines_json)

            # Make sure that the returned objects have the right index.
            obj.index = range(self.nrows_seen, self.nrows_seen + len(obj))
            self.nrows_seen += len(obj)

            return obj

        self.close()
        raise StopIteration
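
# How the chunked path assembles parseable JSON (illustrative): each batch of
# NDJSON lines is wrapped into a single JSON array by _combine_lines before it
# is handed to the parser.
#
#   >>> lines = ['{"a": 1}\n', '{"a": 2}\n', '']
#   >>> "[" + ",".join(filter(None, (x.strip() for x in lines))) + "]"
#   '[{"a": 1},{"a": 2}]'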

class Parser:

    _STAMP_UNITS = ("s", "ms", "us", "ns")
    _MIN_STAMPS = {
        "s": 31536000,
        "ms": 31536000000,
        "us": 31536000000000,
        "ns": 31536000000000000,
    }

    def __init__(
        self,
        json,
        orient,
        dtype=None,
        convert_axes=True,
        convert_dates=True,
        keep_default_dates=False,
        numpy=False,
        precise_float=False,
        date_unit=None,
    ):
        self.json = json

        if orient is None:
            orient = self._default_orient
        self.orient = orient

        self.dtype = dtype

        if orient == "split":
            numpy = False

        if date_unit is not None:
            date_unit = date_unit.lower()
            if date_unit not in self._STAMP_UNITS:
                raise ValueError(f"date_unit must be one of {self._STAMP_UNITS}")
            self.min_stamp = self._MIN_STAMPS[date_unit]
        else:
            self.min_stamp = self._MIN_STAMPS["s"]

        self.numpy = numpy
        self.precise_float = precise_float
        self.convert_axes = convert_axes
        self.convert_dates = convert_dates
        self.date_unit = date_unit
        self.keep_default_dates = keep_default_dates
        self.obj = None

    def check_keys_split(self, decoded):
        """
        Checks that dict has only the appropriate keys for orient='split'.
        """
        bad_keys = set(decoded.keys()).difference(set(self._split_keys))
        if bad_keys:
            bad_keys = ", ".join(bad_keys)
            raise ValueError(f"JSON data had unexpected key(s): {bad_keys}")

    def parse(self):

        # try numpy
        numpy = self.numpy
        if numpy:
            self._parse_numpy()

        else:
            self._parse_no_numpy()

        if self.obj is None:
            return None
        if self.convert_axes:
            self._convert_axes()
        self._try_convert_types()
        return self.obj

    def _convert_axes(self):
        """
        Try to convert axes.
        """
        for axis in self.obj._AXIS_NUMBERS.keys():
            new_axis, result = self._try_convert_data(
                axis, self.obj._get_axis(axis), use_dtypes=False, convert_dates=True
            )
            if result:
                setattr(self.obj, axis, new_axis)

    def _try_convert_types(self):
        raise AbstractMethodError(self)

    def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True):
        """
        Try to parse a ndarray like into a column by inferring dtype.
        """

        # don't try to coerce, unless a force conversion
        if use_dtypes:
            if not self.dtype:
                return data, False
            elif self.dtype is True:
                pass
            else:
                # dtype to force
                dtype = (
                    self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype
                )
                if dtype is not None:
                    try:
                        dtype = np.dtype(dtype)
                        return data.astype(dtype), True
                    except (TypeError, ValueError):
                        return data, False

        if convert_dates:
            new_data, result = self._try_convert_to_date(data)
            if result:
                return new_data, True

        result = False

        if data.dtype == "object":

            # try float
            try:
                data = data.astype("float64")
                result = True
            except (TypeError, ValueError):
                pass

        if data.dtype.kind == "f":

            if data.dtype != "float64":

                # coerce floats to 64
                try:
                    data = data.astype("float64")
                    result = True
                except (TypeError, ValueError):
                    pass

        # don't coerce 0-len data
        if len(data) and (data.dtype == "float" or data.dtype == "object"):

            # coerce ints if we can
            try:
                new_data = data.astype("int64")
                if (new_data == data).all():
                    data = new_data
                    result = True
            except (TypeError, ValueError):
                pass

        # coerce ints to 64
        if data.dtype == "int":

            # coerce smaller int dtypes to int64
            try:
                data = data.astype("int64")
                result = True
            except (TypeError, ValueError):
                pass

        return data, result

    def _try_convert_to_date(self, data):
        """
        Try to parse a ndarray like into a date column.

        Try to coerce object in epoch/iso formats and integer/float in epoch
        formats. Return a boolean if parsing was successful.
        """

        # no conversion on empty
        if not len(data):
            return data, False

        new_data = data
        if new_data.dtype == "object":
            try:
                new_data = data.astype("int64")
            except (TypeError, ValueError, OverflowError):
                pass

        # ignore numbers that are out of range
        if issubclass(new_data.dtype.type, np.number):
            in_range = (
                isna(new_data.values)
                | (new_data > self.min_stamp)
                | (new_data.values == iNaT)
            )
            if not in_range.all():
                return data, False

        date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
        for date_unit in date_units:
            try:
                new_data = to_datetime(new_data, errors="raise", unit=date_unit)
            except (ValueError, OverflowError):
                continue
            return new_data, True
        return data, False

    def _try_convert_dates(self):
        raise AbstractMethodError(self)
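
# The epoch heuristic above in numbers (illustrative): each _MIN_STAMPS entry
# is one year past the epoch expressed in that unit, so values at or below it
# are treated as plain numbers rather than timestamps.
#
#   >>> pd.to_datetime(31536000, unit="s")
#   Timestamp('1971-01-01 00:00:00')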

class SeriesParser(Parser):
    _default_orient = "index"
    _split_keys = ("name", "index", "data")

    def _parse_no_numpy(self):
        data = loads(self.json, precise_float=self.precise_float)

        if self.orient == "split":
            decoded = {str(k): v for k, v in data.items()}
            self.check_keys_split(decoded)
            self.obj = create_series_with_explicit_dtype(**decoded)
        else:
            self.obj = create_series_with_explicit_dtype(data, dtype_if_empty=object)

    def _parse_numpy(self):
        load_kwargs = {
            "dtype": None,
            "numpy": True,
            "precise_float": self.precise_float,
        }
        if self.orient in ["columns", "index"]:
            load_kwargs["labelled"] = True
        loads_ = functools.partial(loads, **load_kwargs)
        data = loads_(self.json)

        if self.orient == "split":
            decoded = {str(k): v for k, v in data.items()}
            self.check_keys_split(decoded)
            self.obj = create_series_with_explicit_dtype(**decoded)
        elif self.orient in ["columns", "index"]:
            self.obj = create_series_with_explicit_dtype(*data, dtype_if_empty=object)
        else:
            self.obj = create_series_with_explicit_dtype(data, dtype_if_empty=object)

    def _try_convert_types(self):
        if self.obj is None:
            return
        obj, result = self._try_convert_data(
            "data", self.obj, convert_dates=self.convert_dates
        )
        if result:
            self.obj = obj
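
# Illustrative round trip through SeriesParser (a minimal sketch; repr details
# may differ between pandas versions):
#
#   >>> pd.read_json('{"name":"s","index":[0,1],"data":[1,2]}',
#   ...              typ="series", orient="split")
#   0    1
#   1    2
#   Name: s, dtype: int64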

class FrameParser(Parser):
    _default_orient = "columns"
    _split_keys = ("columns", "index", "data")

    def _parse_numpy(self):

        json = self.json
        orient = self.orient

        if orient == "columns":
            args = loads(
                json,
                dtype=None,
                numpy=True,
                labelled=True,
                precise_float=self.precise_float,
            )
            if len(args):
                args = (args[0].T, args[2], args[1])
            self.obj = DataFrame(*args)
        elif orient == "split":
            decoded = loads(
                json, dtype=None, numpy=True, precise_float=self.precise_float
            )
            decoded = {str(k): v for k, v in decoded.items()}
            self.check_keys_split(decoded)
            self.obj = DataFrame(**decoded)
        elif orient == "values":
            self.obj = DataFrame(
                loads(json, dtype=None, numpy=True, precise_float=self.precise_float)
            )
        else:
            self.obj = DataFrame(
                *loads(
                    json,
                    dtype=None,
                    numpy=True,
                    labelled=True,
                    precise_float=self.precise_float,
                )
            )

    def _parse_no_numpy(self):

        json = self.json
        orient = self.orient

        if orient == "columns":
            self.obj = DataFrame(
                loads(json, precise_float=self.precise_float), dtype=None
            )
        elif orient == "split":
            decoded = {
                str(k): v
                for k, v in loads(json, precise_float=self.precise_float).items()
            }
            self.check_keys_split(decoded)
            self.obj = DataFrame(dtype=None, **decoded)
        elif orient == "index":
            self.obj = DataFrame.from_dict(
                loads(json, precise_float=self.precise_float),
                dtype=None,
                orient="index",
            )
        elif orient == "table":
            self.obj = parse_table_schema(json, precise_float=self.precise_float)
        else:
            self.obj = DataFrame(
                loads(json, precise_float=self.precise_float), dtype=None
            )

    def _process_converter(self, f, filt=None):
        """
        Take a conversion function and possibly recreate the frame.
        """

        if filt is None:
            filt = lambda col, c: True

        needs_new_obj = False
        new_obj = dict()
        for i, (col, c) in enumerate(self.obj.items()):
            if filt(col, c):
                new_data, result = f(col, c)
                if result:
                    c = new_data
                    needs_new_obj = True
            new_obj[i] = c

        if needs_new_obj:

            # possibly handle dup columns
            new_obj = DataFrame(new_obj, index=self.obj.index)
            new_obj.columns = self.obj.columns
            self.obj = new_obj

    def _try_convert_types(self):
        if self.obj is None:
            return
        if self.convert_dates:
            self._try_convert_dates()

        self._process_converter(
            lambda col, c: self._try_convert_data(col, c, convert_dates=False)
        )

    def _try_convert_dates(self):
        if self.obj is None:
            return

        # our columns to parse
        convert_dates = self.convert_dates
        if convert_dates is True:
            convert_dates = []
        convert_dates = set(convert_dates)

        def is_ok(col) -> bool:
            """
            Return if this col is ok to try for a date parse.
            """
            if not isinstance(col, str):
                return False

            col_lower = col.lower()
            if (
                col_lower.endswith("_at")
                or col_lower.endswith("_time")
                or col_lower == "modified"
                or col_lower == "date"
                or col_lower == "datetime"
                or col_lower.startswith("timestamp")
            ):
                return True
            return False

        self._process_converter(
            lambda col, c: self._try_convert_to_date(c),
            lambda col, c: (
                (self.keep_default_dates and is_ok(col)) or col in convert_dates
            ),
        )
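
# Which column names the is_ok heuristic above treats as datelike when
# keep_default_dates is set (illustrative; a column such as "value" is only
# parsed if listed explicitly in convert_dates; repr spacing may differ):
#
#   >>> pd.read_json('[{"created_at": 1577836800000, "value": 1}]').dtypes
#   created_at    datetime64[ns]
#   value                  int64
#   dtype: object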