Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/io/parsers.py : 9%

1"""
2Module contains tools for processing files into DataFrames or other objects
3"""
5from collections import abc, defaultdict
6import csv
7import datetime
8from io import StringIO, TextIOWrapper
9import re
10import sys
11from textwrap import fill
12from typing import Any, Dict, Set
13import warnings
15import numpy as np
17import pandas._libs.lib as lib
18import pandas._libs.ops as libops
19import pandas._libs.parsers as parsers
20from pandas._libs.parsers import STR_NA_VALUES
21from pandas._libs.tslibs import parsing
22from pandas._typing import FilePathOrBuffer
23from pandas.errors import (
24 AbstractMethodError,
25 EmptyDataError,
26 ParserError,
27 ParserWarning,
28)
29from pandas.util._decorators import Appender
31from pandas.core.dtypes.cast import astype_nansafe
32from pandas.core.dtypes.common import (
33 ensure_object,
34 ensure_str,
35 is_bool_dtype,
36 is_categorical_dtype,
37 is_dtype_equal,
38 is_extension_array_dtype,
39 is_file_like,
40 is_float,
41 is_integer,
42 is_integer_dtype,
43 is_list_like,
44 is_object_dtype,
45 is_scalar,
46 is_string_dtype,
47 pandas_dtype,
48)
49from pandas.core.dtypes.dtypes import CategoricalDtype
50from pandas.core.dtypes.missing import isna
52from pandas.core import algorithms
53from pandas.core.arrays import Categorical
54from pandas.core.frame import DataFrame
55from pandas.core.indexes.api import (
56 Index,
57 MultiIndex,
58 RangeIndex,
59 ensure_index_from_sequences,
60)
61from pandas.core.series import Series
62from pandas.core.tools import datetimes as tools
64from pandas.io.common import (
65 get_filepath_or_buffer,
66 get_handle,
67 infer_compression,
68 validate_header_arg,
69)
70from pandas.io.date_converters import generic_parser
72# BOM character (byte order mark)
73# This exists at the beginning of a file to indicate endianness
74# of a file (stream). Unfortunately, this marker screws up parsing,
75# so we need to remove it if we see it.
76_BOM = "\ufeff"
78_doc_read_csv_and_table = (
79 r"""
80{summary}
82Also supports optionally iterating or breaking of the file
83into chunks.
85Additional help can be found in the online docs for
86`IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
88Parameters
89----------
90filepath_or_buffer : str, path object or file-like object
91 Any valid string path is acceptable. The string could be a URL. Valid
92 URL schemes include http, ftp, s3, and file. For file URLs, a host is
93 expected. A local file could be: file://localhost/path/to/table.csv.
95 If you want to pass in a path object, pandas accepts any ``os.PathLike``.
97 By file-like object, we refer to objects with a ``read()`` method, such as
98 a file handler (e.g. via builtin ``open`` function) or ``StringIO``.
99sep : str, default {_default_sep}
100 Delimiter to use. If sep is None, the C engine cannot automatically detect
101 the separator, but the Python parsing engine can, meaning the latter will
102 be used and automatically detect the separator by Python's builtin sniffer
103 tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
104 different from ``'\s+'`` will be interpreted as regular expressions and
105 will also force the use of the Python parsing engine. Note that regex
106 delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
107delimiter : str, default ``None``
108 Alias for sep.
109header : int, list of int, default 'infer'
110 Row number(s) to use as the column names, and the start of the
111 data. Default behavior is to infer the column names: if no names
112 are passed the behavior is identical to ``header=0`` and column
113 names are inferred from the first line of the file, if column
114 names are passed explicitly then the behavior is identical to
115 ``header=None``. Explicitly pass ``header=0`` to be able to
116 replace existing names. The header can be a list of integers that
117 specify row locations for a multi-index on the columns
118 e.g. [0,1,3]. Intervening rows that are not specified will be
119 skipped (e.g. 2 in this example is skipped). Note that this
120 parameter ignores commented lines and empty lines if
121 ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
122 data rather than the first line of the file.
123names : array-like, optional
124 List of column names to use. If the file contains a header row,
125 then you should explicitly pass ``header=0`` to override the column names.
126 Duplicates in this list are not allowed.
127index_col : int, str, sequence of int / str, or False, default ``None``
128 Column(s) to use as the row labels of the ``DataFrame``, either given as
129 string name or column index. If a sequence of int / str is given, a
130 MultiIndex is used.
132 Note: ``index_col=False`` can be used to force pandas to *not* use the first
133 column as the index, e.g. when you have a malformed file with delimiters at
134 the end of each line.
135usecols : list-like or callable, optional
136 Return a subset of the columns. If list-like, all elements must either
137 be positional (i.e. integer indices into the document columns) or strings
138 that correspond to column names provided either by the user in `names` or
139 inferred from the document header row(s). For example, a valid list-like
140 `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
141 Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
142 To instantiate a DataFrame from ``data`` with element order preserved use
143 ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
144 in ``['foo', 'bar']`` order or
145 ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
146 for ``['bar', 'foo']`` order.
148 If callable, the callable function will be evaluated against the column
149 names, returning names where the callable function evaluates to True. An
150 example of a valid callable argument would be ``lambda x: x.upper() in
151 ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
152 parsing time and lower memory usage.
153squeeze : bool, default False
154 If the parsed data only contains one column then return a Series.
155prefix : str, optional
156 Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
157mangle_dupe_cols : bool, default True
158 Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
159 'X'...'X'. Passing in False will cause data to be overwritten if there
160 are duplicate names in the columns.
161dtype : Type name or dict of column -> type, optional
162 Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
163 'c': 'Int64'}}
164 Use `str` or `object` together with suitable `na_values` settings
165 to preserve and not interpret dtype.
166 If converters are specified, they will be applied INSTEAD
167 of dtype conversion.
168engine : {{'c', 'python'}}, optional
169 Parser engine to use. The C engine is faster while the python engine is
170 currently more feature-complete.
171converters : dict, optional
172 Dict of functions for converting values in certain columns. Keys can either
173 be integers or column labels.
174true_values : list, optional
175 Values to consider as True.
176false_values : list, optional
177 Values to consider as False.
178skipinitialspace : bool, default False
179 Skip spaces after delimiter.
180skiprows : list-like, int or callable, optional
181 Line numbers to skip (0-indexed) or number of lines to skip (int)
182 at the start of the file.
184 If callable, the callable function will be evaluated against the row
185 indices, returning True if the row should be skipped and False otherwise.
186 An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
187skipfooter : int, default 0
188 Number of lines at bottom of file to skip (Unsupported with engine='c').
189nrows : int, optional
190 Number of rows of file to read. Useful for reading pieces of large files.
191na_values : scalar, str, list-like, or dict, optional
192 Additional strings to recognize as NA/NaN. If dict passed, specific
193 per-column NA values. By default the following values are interpreted as
194 NaN: '"""
195 + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ")
196 + """'.
197keep_default_na : bool, default True
198 Whether or not to include the default NaN values when parsing the data.
199 Depending on whether `na_values` is passed in, the behavior is as follows:
201 * If `keep_default_na` is True, and `na_values` are specified, `na_values`
202 is appended to the default NaN values used for parsing.
203 * If `keep_default_na` is True, and `na_values` are not specified, only
204 the default NaN values are used for parsing.
205 * If `keep_default_na` is False, and `na_values` are specified, only
206 the NaN values specified in `na_values` are used for parsing.
207 * If `keep_default_na` is False, and `na_values` are not specified, no
208 strings will be parsed as NaN.
210 Note that if `na_filter` is passed in as False, the `keep_default_na` and
211 `na_values` parameters will be ignored.
212na_filter : bool, default True
213 Detect missing value markers (empty strings and the value of na_values). In
214 data without any NAs, passing na_filter=False can improve the performance
215 of reading a large file.
216verbose : bool, default False
217 Indicate number of NA values placed in non-numeric columns.
218skip_blank_lines : bool, default True
219 If True, skip over blank lines rather than interpreting as NaN values.
220parse_dates : bool or list of int or names or list of lists or dict, \
221default False
222 The behavior is as follows:
224 * boolean. If True -> try parsing the index.
225 * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
226 each as a separate date column.
227 * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
228 a single date column.
229 * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
230 result 'foo'
232 If a column or index cannot be represented as an array of datetimes,
233 say because of an unparseable value or a mixture of timezones, the column
234 or index will be returned unaltered as an object data type. For
235 non-standard datetime parsing, use ``pd.to_datetime`` after
236 ``pd.read_csv``. To parse an index or column with a mixture of timezones,
237 specify ``date_parser`` to be a partially-applied
238 :func:`pandas.to_datetime` with ``utc=True``. See
239 :ref:`io.csv.mixed_timezones` for more.
241 Note: A fast-path exists for iso8601-formatted dates.
242infer_datetime_format : bool, default False
243 If True and `parse_dates` is enabled, pandas will attempt to infer the
244 format of the datetime strings in the columns, and if it can be inferred,
245 switch to a faster method of parsing them. In some cases this can increase
246 the parsing speed by 5-10x.
247keep_date_col : bool, default False
248 If True and `parse_dates` specifies combining multiple columns then
249 keep the original columns.
250date_parser : function, optional
251 Function to use for converting a sequence of string columns to an array of
252 datetime instances. The default uses ``dateutil.parser.parser`` to do the
253 conversion. Pandas will try to call `date_parser` in three different ways,
254 advancing to the next if an exception occurs: 1) Pass one or more arrays
255 (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
256 string values from the columns defined by `parse_dates` into a single array
257 and pass that; and 3) call `date_parser` once for each row using one or
258 more strings (corresponding to the columns defined by `parse_dates`) as
259 arguments.
260dayfirst : bool, default False
261 DD/MM format dates, international and European format.
262cache_dates : bool, default True
263 If True, use a cache of unique, converted dates to apply the datetime
264 conversion. May produce significant speed-up when parsing duplicate
265 date strings, especially ones with timezone offsets.
267 .. versionadded:: 0.25.0
268iterator : bool, default False
269 Return TextFileReader object for iteration or getting chunks with
270 ``get_chunk()``.
271chunksize : int, optional
272 Return TextFileReader object for iteration.
273 See the `IO Tools docs
274 <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
275 for more information on ``iterator`` and ``chunksize``.
276compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
277 For on-the-fly decompression of on-disk data. If 'infer' and
278 `filepath_or_buffer` is path-like, then detect compression from the
279 following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
280 decompression). If using 'zip', the ZIP file must contain only one data
281 file to be read in. Set to None for no decompression.
282thousands : str, optional
283 Thousands separator.
284decimal : str, default '.'
285 Character to recognize as decimal point (e.g. use ',' for European data).
286lineterminator : str (length 1), optional
287 Character to break file into lines. Only valid with C parser.
288quotechar : str (length 1), optional
289 The character used to denote the start and end of a quoted item. Quoted
290 items can include the delimiter and it will be ignored.
291quoting : int or csv.QUOTE_* instance, default 0
292 Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
293 QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
294doublequote : bool, default ``True``
295 When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
296 whether or not to interpret two consecutive quotechar elements INSIDE a
297 field as a single ``quotechar`` element.
298escapechar : str (length 1), optional
299 One-character string used to escape other characters.
300comment : str, optional
301 Indicates remainder of line should not be parsed. If found at the beginning
302 of a line, the line will be ignored altogether. This parameter must be a
303 single character. Like empty lines (as long as ``skip_blank_lines=True``),
304 fully commented lines are ignored by the parameter `header` but not by
305 `skiprows`. For example, if ``comment='#'``, parsing
306 ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
307 treated as the header.
308encoding : str, optional
309 Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
310 standard encodings
311 <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
312dialect : str or csv.Dialect, optional
313 If provided, this parameter will override values (default or not) for the
314 following parameters: `delimiter`, `doublequote`, `escapechar`,
315 `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
316 override values, a ParserWarning will be issued. See csv.Dialect
317 documentation for more details.
318error_bad_lines : bool, default True
319 Lines with too many fields (e.g. a csv line with too many commas) will by
320 default cause an exception to be raised, and no DataFrame will be returned.
321 If False, then these "bad lines" will be dropped from the DataFrame that is
322 returned.
323warn_bad_lines : bool, default True
324 If error_bad_lines is False, and warn_bad_lines is True, a warning for each
325 "bad line" will be output.
326delim_whitespace : bool, default False
327 Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
328 used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
329 is set to True, nothing should be passed in for the ``delimiter``
330 parameter.
331low_memory : bool, default True
332 Internally process the file in chunks, resulting in lower memory use
333 while parsing, but possibly mixed type inference. To ensure no mixed
334 types either set False, or specify the type with the `dtype` parameter.
335 Note that the entire file is read into a single DataFrame regardless,
336 use the `chunksize` or `iterator` parameter to return the data in chunks.
337 (Only valid with C parser).
338memory_map : bool, default False
339 If a filepath is provided for `filepath_or_buffer`, map the file object
340 directly onto memory and access the data directly from there. Using this
341 option can improve performance because there is no longer any I/O overhead.
342float_precision : str, optional
343 Specifies which converter the C engine should use for floating-point
344 values. The options are `None` for the ordinary converter,
345 `high` for the high-precision converter, and `round_trip` for the
346 round-trip converter.
348Returns
349-------
350DataFrame or TextParser
351 A comma-separated values (csv) file is returned as a two-dimensional
352 data structure with labeled axes.
354See Also
355--------
356to_csv : Write DataFrame to a comma-separated values (csv) file.
357read_csv : Read a comma-separated values (csv) file into DataFrame.
358read_fwf : Read a table of fixed-width formatted lines into DataFrame.
360Examples
361--------
362>>> pd.{func_name}('data.csv') # doctest: +SKIP
363"""
364)
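# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: a minimal example of
# several parameters documented above (sep defaults, usecols, dtype,
# na_values, parse_dates). The data and column names are invented for the
# illustration; only public pandas API is used.
import io

import pandas as pd

_demo_csv = io.StringIO(
    "id,name,score,joined\n"
    "1,alice,3.5,2019-01-02\n"
    "2,bob,missing,2019-02-03\n"
)
_demo_df = pd.read_csv(
    _demo_csv,
    usecols=["id", "score", "joined"],  # element order is ignored (see above)
    dtype={"id": "Int64"},              # per-column dtype
    na_values=["missing"],              # extra NA marker on top of the defaults
    parse_dates=["joined"],             # parse this column to datetime64
)
# _demo_df has three columns; "score" holds NaN where the file said "missing".
# ---------------------------------------------------------------------------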
367def _validate_integer(name, val, min_val=0):
368 """
369 Checks whether the 'name' parameter for parsing is either
370 an integer OR float that can SAFELY be cast to an integer
371 without losing accuracy. Raises a ValueError if that is
372 not the case.
374 Parameters
375 ----------
376 name : string
377 Parameter name (used for error reporting)
378 val : int or float
379 The value to check
380 min_val : int
381 Minimum allowed value (val < min_val will result in a ValueError)
382 """
383 msg = f"'{name:s}' must be an integer >={min_val:d}"
385 if val is not None:
386 if is_float(val):
387 if int(val) != val:
388 raise ValueError(msg)
389 val = int(val)
390 elif not (is_integer(val) and val >= min_val):
391 raise ValueError(msg)
393 return val
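# Illustrative sketch, not part of the original module: the user-visible
# effect of the check above, exercised through the public ``nrows`` argument
# (validated via _validate_integer in TextFileReader.read further below).
import io

import pandas as pd

_buf = "a,b\n1,2\n3,4\n5,6\n"
pd.read_csv(io.StringIO(_buf), nrows=2.0)      # float safely castable to int -> 2 rows
try:
    pd.read_csv(io.StringIO(_buf), nrows=2.5)  # would lose accuracy -> ValueError
except ValueError:
    pass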
396def _validate_names(names):
397 """
398 Raise ValueError if the `names` parameter contains duplicates.
400 Parameters
401 ----------
402 names : array-like or None
403 An array containing a list of the names used for the output DataFrame.
405 Raises
406 ------
407 ValueError
408 If names are not unique.
409 """
411 if names is not None:
412 if len(names) != len(set(names)):
413 raise ValueError("Duplicate names are not allowed.")
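# Illustrative sketch, not part of the original module: duplicate entries in
# ``names`` are rejected before any parsing starts, per _validate_names above.
import io

import pandas as pd

try:
    pd.read_csv(io.StringIO("1,2\n3,4\n"), names=["a", "a"])
except ValueError as err:
    assert "Duplicate names" in str(err)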
416def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
417 """Generic reader of line files."""
418 encoding = kwds.get("encoding", None)
419 if encoding is not None:
420 encoding = re.sub("_", "-", encoding).lower()
421 kwds["encoding"] = encoding
423 compression = kwds.get("compression", "infer")
424 compression = infer_compression(filepath_or_buffer, compression)
426 # TODO: get_filepath_or_buffer could return
427 # Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile]
428 # though mypy handling of conditional imports is difficult.
429 # See https://github.com/python/mypy/issues/1297
430 fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
431 filepath_or_buffer, encoding, compression
432 )
433 kwds["compression"] = compression
435 if kwds.get("date_parser", None) is not None:
436 if isinstance(kwds["parse_dates"], bool):
437 kwds["parse_dates"] = True
439 # Extract some of the arguments (pass chunksize on).
440 iterator = kwds.get("iterator", False)
441 chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1)
442 nrows = kwds.get("nrows", None)
444 # Check for duplicates in names.
445 _validate_names(kwds.get("names", None))
447 # Create the parser.
448 parser = TextFileReader(fp_or_buf, **kwds)
450 if chunksize or iterator:
451 return parser
453 try:
454 data = parser.read(nrows)
455 finally:
456 parser.close()
458 if should_close:
459 try:
460 fp_or_buf.close()
461 except ValueError:
462 pass
464 return data
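# Illustrative sketch, not part of the original module: the eager vs. lazy
# return value implemented by _read above -- a plain call reads everything and
# returns a DataFrame, while ``chunksize``/``iterator`` hand back the parser.
import io

import pandas as pd

_text = "a,b\n1,2\n3,4\n5,6\n"
_eager = pd.read_csv(io.StringIO(_text))              # DataFrame with 3 rows
_lazy = pd.read_csv(io.StringIO(_text), chunksize=2)  # TextFileReader, nothing read yet
_pieces = [chunk for chunk in _lazy]                  # two chunks: 2 rows, then 1 row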
467_parser_defaults = {
468 "delimiter": None,
469 "escapechar": None,
470 "quotechar": '"',
471 "quoting": csv.QUOTE_MINIMAL,
472 "doublequote": True,
473 "skipinitialspace": False,
474 "lineterminator": None,
475 "header": "infer",
476 "index_col": None,
477 "names": None,
478 "prefix": None,
479 "skiprows": None,
480 "skipfooter": 0,
481 "nrows": None,
482 "na_values": None,
483 "keep_default_na": True,
484 "true_values": None,
485 "false_values": None,
486 "converters": None,
487 "dtype": None,
488 "cache_dates": True,
489 "thousands": None,
490 "comment": None,
491 "decimal": ".",
492 # 'engine': 'c',
493 "parse_dates": False,
494 "keep_date_col": False,
495 "dayfirst": False,
496 "date_parser": None,
497 "usecols": None,
498 # 'iterator': False,
499 "chunksize": None,
500 "verbose": False,
501 "encoding": None,
502 "squeeze": False,
503 "compression": None,
504 "mangle_dupe_cols": True,
505 "infer_datetime_format": False,
506 "skip_blank_lines": True,
507}
510_c_parser_defaults = {
511 "delim_whitespace": False,
512 "na_filter": True,
513 "low_memory": True,
514 "memory_map": False,
515 "error_bad_lines": True,
516 "warn_bad_lines": True,
517 "float_precision": None,
518}
520_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}
522_c_unsupported = {"skipfooter"}
523_python_unsupported = {"low_memory", "float_precision"}
525_deprecated_defaults: Dict[str, Any] = {}
526_deprecated_args: Set[str] = set()
529def _make_parser_function(name, default_sep=","):
530 def parser_f(
531 filepath_or_buffer: FilePathOrBuffer,
532 sep=default_sep,
533 delimiter=None,
534 # Column and Index Locations and Names
535 header="infer",
536 names=None,
537 index_col=None,
538 usecols=None,
539 squeeze=False,
540 prefix=None,
541 mangle_dupe_cols=True,
542 # General Parsing Configuration
543 dtype=None,
544 engine=None,
545 converters=None,
546 true_values=None,
547 false_values=None,
548 skipinitialspace=False,
549 skiprows=None,
550 skipfooter=0,
551 nrows=None,
552 # NA and Missing Data Handling
553 na_values=None,
554 keep_default_na=True,
555 na_filter=True,
556 verbose=False,
557 skip_blank_lines=True,
558 # Datetime Handling
559 parse_dates=False,
560 infer_datetime_format=False,
561 keep_date_col=False,
562 date_parser=None,
563 dayfirst=False,
564 cache_dates=True,
565 # Iteration
566 iterator=False,
567 chunksize=None,
568 # Quoting, Compression, and File Format
569 compression="infer",
570 thousands=None,
571 decimal: str = ".",
572 lineterminator=None,
573 quotechar='"',
574 quoting=csv.QUOTE_MINIMAL,
575 doublequote=True,
576 escapechar=None,
577 comment=None,
578 encoding=None,
579 dialect=None,
580 # Error Handling
581 error_bad_lines=True,
582 warn_bad_lines=True,
583 # Internal
584 delim_whitespace=False,
585 low_memory=_c_parser_defaults["low_memory"],
586 memory_map=False,
587 float_precision=None,
588 ):
590 # gh-23761
591 #
592 # When a dialect is passed, it overrides any of the overlapping
593 # parameters passed in directly. We don't want to warn if the
594 # default parameters were passed in (since it probably means
595 # that the user didn't pass them in explicitly in the first place).
596 #
597 # "delimiter" is the annoying corner case because we alias it to
598 # "sep" before doing comparison to the dialect values later on.
599 # Thus, we need a flag to indicate that we need to "override"
600 # the comparison to dialect values by checking if default values
601 # for BOTH "delimiter" and "sep" were provided.
602 if dialect is not None:
603 sep_override = delimiter is None and sep == default_sep
604 kwds = dict(sep_override=sep_override)
605 else:
606 kwds = dict()
608 # Alias sep -> delimiter.
609 if delimiter is None:
610 delimiter = sep
612 if delim_whitespace and delimiter != default_sep:
613 raise ValueError(
614 "Specified a delimiter with both sep and "
615 "delim_whitespace=True; you can only "
616 "specify one."
617 )
619 if engine is not None:
620 engine_specified = True
621 else:
622 engine = "c"
623 engine_specified = False
625 kwds.update(
626 delimiter=delimiter,
627 engine=engine,
628 dialect=dialect,
629 compression=compression,
630 engine_specified=engine_specified,
631 doublequote=doublequote,
632 escapechar=escapechar,
633 quotechar=quotechar,
634 quoting=quoting,
635 skipinitialspace=skipinitialspace,
636 lineterminator=lineterminator,
637 header=header,
638 index_col=index_col,
639 names=names,
640 prefix=prefix,
641 skiprows=skiprows,
642 skipfooter=skipfooter,
643 na_values=na_values,
644 true_values=true_values,
645 false_values=false_values,
646 keep_default_na=keep_default_na,
647 thousands=thousands,
648 comment=comment,
649 decimal=decimal,
650 parse_dates=parse_dates,
651 keep_date_col=keep_date_col,
652 dayfirst=dayfirst,
653 date_parser=date_parser,
654 cache_dates=cache_dates,
655 nrows=nrows,
656 iterator=iterator,
657 chunksize=chunksize,
658 converters=converters,
659 dtype=dtype,
660 usecols=usecols,
661 verbose=verbose,
662 encoding=encoding,
663 squeeze=squeeze,
664 memory_map=memory_map,
665 float_precision=float_precision,
666 na_filter=na_filter,
667 delim_whitespace=delim_whitespace,
668 warn_bad_lines=warn_bad_lines,
669 error_bad_lines=error_bad_lines,
670 low_memory=low_memory,
671 mangle_dupe_cols=mangle_dupe_cols,
672 infer_datetime_format=infer_datetime_format,
673 skip_blank_lines=skip_blank_lines,
674 )
676 return _read(filepath_or_buffer, kwds)
678 parser_f.__name__ = name
680 return parser_f
683read_csv = _make_parser_function("read_csv", default_sep=",")
684read_csv = Appender(
685 _doc_read_csv_and_table.format(
686 func_name="read_csv",
687 summary="Read a comma-separated values (csv) file into DataFrame.",
688 _default_sep="','",
689 )
690)(read_csv)
692read_table = _make_parser_function("read_table", default_sep="\t")
693read_table = Appender(
694 _doc_read_csv_and_table.format(
695 func_name="read_table",
696 summary="Read general delimited file into DataFrame.",
697 _default_sep=r"'\\t' (tab-stop)",
698 )
699)(read_table)
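# Illustrative sketch, not part of the original module: read_csv and
# read_table are produced by the same factory above and differ only in the
# default ``sep``.
import io

import pandas as pd

_tsv = "a\tb\n1\t2\n"
_via_table = pd.read_table(io.StringIO(_tsv))        # default sep='\t'
_via_csv = pd.read_csv(io.StringIO(_tsv), sep="\t")  # equivalent call
assert _via_table.equals(_via_csv)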
702def read_fwf(
703 filepath_or_buffer: FilePathOrBuffer,
704 colspecs="infer",
705 widths=None,
706 infer_nrows=100,
707 **kwds,
708):
710 r"""
711 Read a table of fixed-width formatted lines into DataFrame.
713 Also supports optionally iterating or breaking of the file
714 into chunks.
716 Additional help can be found in the `online docs for IO Tools
717 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
719 Parameters
720 ----------
721 filepath_or_buffer : str, path object or file-like object
722 Any valid string path is acceptable. The string could be a URL. Valid
723 URL schemes include http, ftp, s3, and file. For file URLs, a host is
724 expected. A local file could be:
725 ``file://localhost/path/to/table.csv``.
727 If you want to pass in a path object, pandas accepts any
728 ``os.PathLike``.
730 By file-like object, we refer to objects with a ``read()`` method,
731 such as a file handler (e.g. via builtin ``open`` function)
732 or ``StringIO``.
733 colspecs : list of tuple (int, int) or 'infer', optional
734 A list of tuples giving the extents of the fixed-width
735 fields of each line as half-open intervals (i.e., [from, to[ ).
736 String value 'infer' can be used to instruct the parser to try
737 detecting the column specifications from the first 100 rows of
738 the data which are not being skipped via skiprows (default='infer').
739 widths : list of int, optional
740 A list of field widths which can be used instead of 'colspecs' if
741 the intervals are contiguous.
742 infer_nrows : int, default 100
743 The number of rows to consider when letting the parser determine the
744 `colspecs`.
746 .. versionadded:: 0.24.0
747 **kwds : optional
748 Optional keyword arguments can be passed to ``TextFileReader``.
750 Returns
751 -------
752 DataFrame or TextParser
753 A comma-separated values (csv) file is returned as a two-dimensional
754 data structure with labeled axes.
756 See Also
757 --------
758 to_csv : Write DataFrame to a comma-separated values (csv) file.
759 read_csv : Read a comma-separated values (csv) file into DataFrame.
761 Examples
762 --------
763 >>> pd.read_fwf('data.csv') # doctest: +SKIP
764 """
766 # Check input arguments.
767 if colspecs is None and widths is None:
768 raise ValueError("Must specify either colspecs or widths")
769 elif colspecs not in (None, "infer") and widths is not None:
770 raise ValueError("You must specify only one of 'widths' and 'colspecs'")
772 # Compute 'colspecs' from 'widths', if specified.
773 if widths is not None:
774 colspecs, col = [], 0
775 for w in widths:
776 colspecs.append((col, col + w))
777 col += w
779 kwds["colspecs"] = colspecs
780 kwds["infer_nrows"] = infer_nrows
781 kwds["engine"] = "python-fwf"
782 return _read(filepath_or_buffer, kwds)
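# Illustrative sketch, not part of the original module: reading a small
# fixed-width block, once with explicit ``widths`` (converted to colspecs
# above) and once with the equivalent half-open ``colspecs``.
import io

import pandas as pd

_fwf = "id   name   score\n1    alice  3.50\n22   bob    4.25\n"
_by_widths = pd.read_fwf(io.StringIO(_fwf), widths=[5, 7, 5])
_by_spec = pd.read_fwf(io.StringIO(_fwf), colspecs=[(0, 5), (5, 12), (12, 17)])
assert _by_widths.equals(_by_spec)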
785class TextFileReader(abc.Iterator):
786 """
788 Passed dialect overrides any of the related parser options
790 """
792 def __init__(self, f, engine=None, **kwds):
794 self.f = f
796 if engine is not None:
797 engine_specified = True
798 else:
799 engine = "python"
800 engine_specified = False
802 self._engine_specified = kwds.get("engine_specified", engine_specified)
804 if kwds.get("dialect") is not None:
805 dialect = kwds["dialect"]
806 if dialect in csv.list_dialects():
807 dialect = csv.get_dialect(dialect)
809 # Any valid dialect should have these attributes.
810 # If any are missing, we will raise automatically.
811 for param in (
812 "delimiter",
813 "doublequote",
814 "escapechar",
815 "skipinitialspace",
816 "quotechar",
817 "quoting",
818 ):
819 try:
820 dialect_val = getattr(dialect, param)
821 except AttributeError:
822 raise ValueError(f"Invalid dialect {kwds['dialect']} provided")
823 parser_default = _parser_defaults[param]
824 provided = kwds.get(param, parser_default)
826 # Messages for conflicting values between the dialect
827 # instance and the actual parameters provided.
828 conflict_msgs = []
830 # Don't warn if the default parameter was passed in,
831 # even if it conflicts with the dialect (gh-23761).
832 if provided != parser_default and provided != dialect_val:
833 msg = (
834 f"Conflicting values for '{param}': '{provided}' was "
835 f"provided, but the dialect specifies '{dialect_val}'. "
836 "Using the dialect-specified value."
837 )
839 # Annoying corner case for not warning about
840 # conflicts between dialect and delimiter parameter.
841 # Refer to the outer "_read_" function for more info.
842 if not (param == "delimiter" and kwds.pop("sep_override", False)):
843 conflict_msgs.append(msg)
845 if conflict_msgs:
846 warnings.warn(
847 "\n\n".join(conflict_msgs), ParserWarning, stacklevel=2
848 )
849 kwds[param] = dialect_val
851 if kwds.get("skipfooter"):
852 if kwds.get("iterator") or kwds.get("chunksize"):
853 raise ValueError("'skipfooter' not supported for 'iteration'")
854 if kwds.get("nrows"):
855 raise ValueError("'skipfooter' not supported with 'nrows'")
857 if kwds.get("header", "infer") == "infer":
858 kwds["header"] = 0 if kwds.get("names") is None else None
860 self.orig_options = kwds
862 # miscellanea
863 self.engine = engine
864 self._engine = None
865 self._currow = 0
867 options = self._get_options_with_defaults(engine)
869 self.chunksize = options.pop("chunksize", None)
870 self.nrows = options.pop("nrows", None)
871 self.squeeze = options.pop("squeeze", False)
873 # might mutate self.engine
874 self.engine = self._check_file_or_buffer(f, engine)
875 self.options, self.engine = self._clean_options(options, engine)
877 if "has_index_names" in kwds:
878 self.options["has_index_names"] = kwds["has_index_names"]
880 self._make_engine(self.engine)
882 def close(self):
883 self._engine.close()
885 def _get_options_with_defaults(self, engine):
886 kwds = self.orig_options
888 options = {}
890 for argname, default in _parser_defaults.items():
891 value = kwds.get(argname, default)
893 # see gh-12935
894 if argname == "mangle_dupe_cols" and not value:
895 raise ValueError("Setting mangle_dupe_cols=False is not supported yet")
896 else:
897 options[argname] = value
899 for argname, default in _c_parser_defaults.items():
900 if argname in kwds:
901 value = kwds[argname]
903 if engine != "c" and value != default:
904 if "python" in engine and argname not in _python_unsupported:
905 pass
906 elif value == _deprecated_defaults.get(argname, default):
907 pass
908 else:
909 raise ValueError(
910 f"The {repr(argname)} option is not supported with the"
911 f" {repr(engine)} engine"
912 )
913 else:
914 value = _deprecated_defaults.get(argname, default)
915 options[argname] = value
917 if engine == "python-fwf":
918 for argname, default in _fwf_defaults.items():
919 options[argname] = kwds.get(argname, default)
921 return options
923 def _check_file_or_buffer(self, f, engine):
924 # see gh-16530
925 if is_file_like(f):
926 next_attr = "__next__"
928 # The C engine doesn't need the file-like to have the "next" or
929 # "__next__" attribute. However, the Python engine explicitly calls
930 # "next(...)" when iterating through such an object, meaning it
931 # needs to have that attribute ("next" for Python 2.x, "__next__"
932 # for Python 3.x)
933 if engine != "c" and not hasattr(f, next_attr):
934 msg = "The 'python' engine cannot iterate through this file buffer."
935 raise ValueError(msg)
937 return engine
939 def _clean_options(self, options, engine):
940 result = options.copy()
942 engine_specified = self._engine_specified
943 fallback_reason = None
945 sep = options["delimiter"]
946 delim_whitespace = options["delim_whitespace"]
948 # C engine not supported yet
949 if engine == "c":
950 if options["skipfooter"] > 0:
951 fallback_reason = "the 'c' engine does not support skipfooter"
952 engine = "python"
954 encoding = sys.getfilesystemencoding() or "utf-8"
955 if sep is None and not delim_whitespace:
956 if engine == "c":
957 fallback_reason = (
958 "the 'c' engine does not support "
959 "sep=None with delim_whitespace=False"
960 )
961 engine = "python"
962 elif sep is not None and len(sep) > 1:
963 if engine == "c" and sep == r"\s+":
964 result["delim_whitespace"] = True
965 del result["delimiter"]
966 elif engine not in ("python", "python-fwf"):
967 # wait until regex engine integrated
968 fallback_reason = (
969 "the 'c' engine does not support "
970 "regex separators (separators > 1 char and "
971 r"different from '\s+' are "
972 "interpreted as regex)"
973 )
974 engine = "python"
975 elif delim_whitespace:
976 if "python" in engine:
977 result["delimiter"] = r"\s+"
978 elif sep is not None:
979 encodeable = True
980 try:
981 if len(sep.encode(encoding)) > 1:
982 encodeable = False
983 except UnicodeDecodeError:
984 encodeable = False
985 if not encodeable and engine not in ("python", "python-fwf"):
986 fallback_reason = (
987 f"the separator encoded in {encoding} "
988 "is > 1 char long, and the 'c' engine "
989 "does not support such separators"
990 )
991 engine = "python"
993 quotechar = options["quotechar"]
994 if quotechar is not None and isinstance(quotechar, (str, bytes)):
995 if (
996 len(quotechar) == 1
997 and ord(quotechar) > 127
998 and engine not in ("python", "python-fwf")
999 ):
1000 fallback_reason = (
1001 "ord(quotechar) > 127, meaning the "
1002 "quotechar is larger than one byte, "
1003 "and the 'c' engine does not support "
1004 "such quotechars"
1005 )
1006 engine = "python"
1008 if fallback_reason and engine_specified:
1009 raise ValueError(fallback_reason)
1011 if engine == "c":
1012 for arg in _c_unsupported:
1013 del result[arg]
1015 if "python" in engine:
1016 for arg in _python_unsupported:
1017 if fallback_reason and result[arg] != _c_parser_defaults[arg]:
1018 raise ValueError(
1019 "Falling back to the 'python' engine because "
1020 f"{fallback_reason}, but this causes {repr(arg)} to be "
1021 "ignored as it is not supported by the 'python' engine."
1022 )
1023 del result[arg]
1025 if fallback_reason:
1026 warnings.warn(
1027 (
1028 "Falling back to the 'python' engine because "
1029 f"{fallback_reason}; you can avoid this warning by specifying "
1030 "engine='python'."
1031 ),
1032 ParserWarning,
1033 stacklevel=5,
1034 )
1036 index_col = options["index_col"]
1037 names = options["names"]
1038 converters = options["converters"]
1039 na_values = options["na_values"]
1040 skiprows = options["skiprows"]
1042 validate_header_arg(options["header"])
1044 depr_warning = ""
1046 for arg in _deprecated_args:
1047 parser_default = _c_parser_defaults[arg]
1048 depr_default = _deprecated_defaults[arg]
1050 msg = (
1051 f"The {repr(arg)} argument has been deprecated and will be "
1052 "removed in a future version."
1053 )
1055 if result.get(arg, depr_default) != depr_default:
1056 depr_warning += msg + "\n\n"
1057 else:
1058 result[arg] = parser_default
1060 if depr_warning != "":
1061 warnings.warn(depr_warning, FutureWarning, stacklevel=2)
1063 if index_col is True:
1064 raise ValueError("The value of index_col couldn't be 'True'")
1065 if _is_index_col(index_col):
1066 if not isinstance(index_col, (list, tuple, np.ndarray)):
1067 index_col = [index_col]
1068 result["index_col"] = index_col
1070 names = list(names) if names is not None else names
1072 # type conversion-related
1073 if converters is not None:
1074 if not isinstance(converters, dict):
1075 raise TypeError(
1076 "Type converters must be a dict or subclass, "
1077 f"input was a {type(converters).__name__}"
1078 )
1079 else:
1080 converters = {}
1082 # Converting values to NA
1083 keep_default_na = options["keep_default_na"]
1084 na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)
1086 # handle skiprows; this is internally handled by the
1087 # c-engine, so only need for python parsers
1088 if engine != "c":
1089 if is_integer(skiprows):
1090 skiprows = list(range(skiprows))
1091 if skiprows is None:
1092 skiprows = set()
1093 elif not callable(skiprows):
1094 skiprows = set(skiprows)
1096 # put stuff back
1097 result["names"] = names
1098 result["converters"] = converters
1099 result["na_values"] = na_values
1100 result["na_fvalues"] = na_fvalues
1101 result["skiprows"] = skiprows
1103 return result, engine
1105 def __next__(self):
1106 try:
1107 return self.get_chunk()
1108 except StopIteration:
1109 self.close()
1110 raise
1112 def _make_engine(self, engine="c"):
1113 if engine == "c":
1114 self._engine = CParserWrapper(self.f, **self.options)
1115 else:
1116 if engine == "python":
1117 klass = PythonParser
1118 elif engine == "python-fwf":
1119 klass = FixedWidthFieldParser
1120 else:
1121 raise ValueError(
1122 f"Unknown engine: {engine} (valid options are "
1123 '"c", "python", or '
1124 '"python-fwf")'
1125 )
1126 self._engine = klass(self.f, **self.options)
1128 def _failover_to_python(self):
1129 raise AbstractMethodError(self)
1131 def read(self, nrows=None):
1132 nrows = _validate_integer("nrows", nrows)
1133 ret = self._engine.read(nrows)
1135 # May alter columns / col_dict
1136 index, columns, col_dict = self._create_index(ret)
1138 if index is None:
1139 if col_dict:
1140 # Any column is actually fine:
1141 new_rows = len(next(iter(col_dict.values())))
1142 index = RangeIndex(self._currow, self._currow + new_rows)
1143 else:
1144 new_rows = 0
1145 else:
1146 new_rows = len(index)
1148 df = DataFrame(col_dict, columns=columns, index=index)
1150 self._currow += new_rows
1152 if self.squeeze and len(df.columns) == 1:
1153 return df[df.columns[0]].copy()
1154 return df
1156 def _create_index(self, ret):
1157 index, columns, col_dict = ret
1158 return index, columns, col_dict
1160 def get_chunk(self, size=None):
1161 if size is None:
1162 size = self.chunksize
1163 if self.nrows is not None:
1164 if self._currow >= self.nrows:
1165 raise StopIteration
1166 size = min(size, self.nrows - self._currow)
1167 return self.read(nrows=size)
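# Illustrative sketch, not part of the original module: driving TextFileReader
# explicitly via ``iterator=True`` and ``get_chunk`` as implemented above.
import io

import pandas as pd

_reader = pd.read_csv(io.StringIO("a,b\n1,2\n3,4\n5,6\n"), iterator=True)
_first = _reader.get_chunk(2)  # first two rows
_rest = _reader.read()         # remaining single row
_reader.close()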
1170def _is_index_col(col):
1171 return col is not None and col is not False
1174def _is_potential_multi_index(columns):
1175 """
1176 Check whether or not the `columns` parameter
1177 could be converted into a MultiIndex.
1179 Parameters
1180 ----------
1181 columns : array-like
1182 Object which may or may not be convertible into a MultiIndex
1184 Returns
1185 -------
1186 boolean : Whether or not columns could become a MultiIndex
1187 """
1188 return (
1189 len(columns)
1190 and not isinstance(columns, MultiIndex)
1191 and all(isinstance(c, tuple) for c in columns)
1192 )
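# Illustrative sketch, not part of the original module: tuple column labels,
# as produced by a multi-row header, are the case this predicate detects
# before the parser promotes them to a MultiIndex.
import io

import pandas as pd

_two_level = io.StringIO("a,b,c\nx,y,z\n1,2,3\n")
_mi_df = pd.read_csv(_two_level, header=[0, 1])
assert isinstance(_mi_df.columns, pd.MultiIndex)
assert _mi_df.columns[0] == ("a", "x")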
1195def _evaluate_usecols(usecols, names):
1196 """
1197 Check whether or not the 'usecols' parameter
1198 is a callable. If so, enumerates the 'names'
1199 parameter and returns a set of indices for
1200 each entry in 'names' that evaluates to True.
1201 If not a callable, returns 'usecols'.
1202 """
1203 if callable(usecols):
1204 return {i for i, name in enumerate(names) if usecols(name)}
1205 return usecols
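# Illustrative sketch, not part of the original module: when ``usecols`` is a
# callable it is evaluated against each header name, as described above; only
# columns for which it returns True are kept.
import io

import pandas as pd

_subset = pd.read_csv(
    io.StringIO("AAA,BBB,CCC\n1,2,3\n"),
    usecols=lambda name: name in ("AAA", "CCC"),
)
assert list(_subset.columns) == ["AAA", "CCC"]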
1208def _validate_usecols_names(usecols, names):
1209 """
1210 Validates that all usecols are present in a given
1211 list of names. If not, raise a ValueError that
1212 shows what usecols are missing.
1214 Parameters
1215 ----------
1216 usecols : iterable of usecols
1217 The columns to validate are present in names.
1218 names : iterable of names
1219 The column names to check against.
1221 Returns
1222 -------
1223 usecols : iterable of usecols
1224 The `usecols` parameter if the validation succeeds.
1226 Raises
1227 ------
1228 ValueError : Columns were missing. Error message will list them.
1229 """
1230 missing = [c for c in usecols if c not in names]
1231 if len(missing) > 0:
1232 raise ValueError(
1233 "Usecols do not match columns, "
1234 f"columns expected but not found: {missing}"
1235 )
1237 return usecols
1240def _validate_skipfooter_arg(skipfooter):
1241 """
1242 Validate the 'skipfooter' parameter.
1244 Checks whether 'skipfooter' is a non-negative integer.
1245 Raises a ValueError if that is not the case.
1247 Parameters
1248 ----------
1249 skipfooter : non-negative integer
1250 The number of rows to skip at the end of the file.
1252 Returns
1253 -------
1254 validated_skipfooter : non-negative integer
1255 The original input if the validation succeeds.
1257 Raises
1258 ------
1259 ValueError : 'skipfooter' was not a non-negative integer.
1260 """
1262 if not is_integer(skipfooter):
1263 raise ValueError("skipfooter must be an integer")
1265 if skipfooter < 0:
1266 raise ValueError("skipfooter cannot be negative")
1268 return skipfooter
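# Illustrative sketch, not part of the original module: ``skipfooter`` is only
# supported by the Python engine, so it is passed together with
# ``engine="python"`` to avoid the automatic fallback warning.
import io

import pandas as pd

_with_footer = io.StringIO("a,b\n1,2\n3,4\ntotal,6\n")
_trimmed = pd.read_csv(_with_footer, skipfooter=1, engine="python")
assert len(_trimmed) == 2  # the trailing "total" row was skipped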
1271def _validate_usecols_arg(usecols):
1272 """
1273 Validate the 'usecols' parameter.
1275 Checks whether or not the 'usecols' parameter contains all integers
1276 (column selection by index), strings (column by name) or is a callable.
1277 Raises a ValueError if that is not the case.
1279 Parameters
1280 ----------
1281 usecols : list-like, callable, or None
1282 List of columns to use when parsing or a callable that can be used
1283 to filter a list of table columns.
1285 Returns
1286 -------
1287 usecols_tuple : tuple
1288 A tuple of (verified_usecols, usecols_dtype).
1290 'verified_usecols' is either a set if an array-like is passed in or
1291 'usecols' if a callable or None is passed in.
1293 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like
1294 is passed in or None if a callable or None is passed in.
1295 """
1296 msg = (
1297 "'usecols' must either be list-like of all strings, all unicode, "
1298 "all integers or a callable."
1299 )
1300 if usecols is not None:
1301 if callable(usecols):
1302 return usecols, None
1304 if not is_list_like(usecols):
1305 # see gh-20529
1306 #
1307 # Ensure it is iterable container but not string.
1308 raise ValueError(msg)
1310 usecols_dtype = lib.infer_dtype(usecols, skipna=False)
1312 if usecols_dtype not in ("empty", "integer", "string", "unicode"):
1313 raise ValueError(msg)
1315 usecols = set(usecols)
1317 return usecols, usecols_dtype
1318 return usecols, None
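# Illustrative sketch, not part of the original module: ``usecols`` must be
# all integers, all strings, or a callable -- mixing positions and names is
# rejected by the validation above.
import io

import pandas as pd

_src = "a,b,c\n1,2,3\n"
pd.read_csv(io.StringIO(_src), usecols=[0, 2])        # all positional -> ok
pd.read_csv(io.StringIO(_src), usecols=["a", "c"])    # all names -> ok
try:
    pd.read_csv(io.StringIO(_src), usecols=[0, "c"])  # mixed -> ValueError
except ValueError:
    pass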
1321def _validate_parse_dates_arg(parse_dates):
1322 """
1323 Check whether or not the 'parse_dates' parameter
1324 is a non-boolean scalar. Raises a ValueError if
1325 that is the case.
1326 """
1327 msg = (
1328 "Only booleans, lists, and "
1329 "dictionaries are accepted "
1330 "for the 'parse_dates' parameter"
1331 )
1333 if parse_dates is not None:
1334 if is_scalar(parse_dates):
1335 if not lib.is_bool(parse_dates):
1336 raise TypeError(msg)
1338 elif not isinstance(parse_dates, (list, dict)):
1339 raise TypeError(msg)
1341 return parse_dates
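# Illustrative sketch, not part of the original module: the accepted shapes of
# ``parse_dates`` -- a list of column names, a list of lists (columns combined
# into one datetime column), and a dict naming the combined column. A bare
# non-boolean scalar such as "date" is rejected by the check above.
import io

import pandas as pd

_dates = "date,y,m,d,value\n2019-01-02,2019,1,2,10\n"
pd.read_csv(io.StringIO(_dates), parse_dates=["date"])                  # parse one column
pd.read_csv(io.StringIO(_dates), parse_dates=[["y", "m", "d"]])         # combine three columns
pd.read_csv(io.StringIO(_dates), parse_dates={"ymd": ["y", "m", "d"]})  # combined, named 'ymd'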
1344class ParserBase:
1345 def __init__(self, kwds):
1346 self.names = kwds.get("names")
1347 self.orig_names = None
1348 self.prefix = kwds.pop("prefix", None)
1350 self.index_col = kwds.get("index_col", None)
1351 self.unnamed_cols = set()
1352 self.index_names = None
1353 self.col_names = None
1355 self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
1356 self.date_parser = kwds.pop("date_parser", None)
1357 self.dayfirst = kwds.pop("dayfirst", False)
1358 self.keep_date_col = kwds.pop("keep_date_col", False)
1360 self.na_values = kwds.get("na_values")
1361 self.na_fvalues = kwds.get("na_fvalues")
1362 self.na_filter = kwds.get("na_filter", False)
1363 self.keep_default_na = kwds.get("keep_default_na", True)
1365 self.true_values = kwds.get("true_values")
1366 self.false_values = kwds.get("false_values")
1367 self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True)
1368 self.infer_datetime_format = kwds.pop("infer_datetime_format", False)
1369 self.cache_dates = kwds.pop("cache_dates", True)
1371 self._date_conv = _make_date_converter(
1372 date_parser=self.date_parser,
1373 dayfirst=self.dayfirst,
1374 infer_datetime_format=self.infer_datetime_format,
1375 cache_dates=self.cache_dates,
1376 )
1378 # validate header options for mi
1379 self.header = kwds.get("header")
1380 if isinstance(self.header, (list, tuple, np.ndarray)):
1381 if not all(map(is_integer, self.header)):
1382 raise ValueError("header must be integer or list of integers")
1383 if any(i < 0 for i in self.header):
1384 raise ValueError(
1385 "cannot specify multi-index header with negative integers"
1386 )
1387 if kwds.get("usecols"):
1388 raise ValueError(
1389 "cannot specify usecols when specifying a multi-index header"
1390 )
1391 if kwds.get("names"):
1392 raise ValueError(
1393 "cannot specify names when specifying a multi-index header"
1394 )
1396 # validate index_col that only contains integers
1397 if self.index_col is not None:
1398 is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray))
1399 if not (
1400 is_sequence
1401 and all(map(is_integer, self.index_col))
1402 or is_integer(self.index_col)
1403 ):
1404 raise ValueError(
1405 "index_col must only contain row numbers "
1406 "when specifying a multi-index header"
1407 )
1409 # GH 16338
1410 elif self.header is not None and not is_integer(self.header):
1411 raise ValueError("header must be integer or list of integers")
1413 # GH 27779
1414 elif self.header is not None and self.header < 0:
1415 raise ValueError(
1416 "Passing negative integer to header is invalid. "
1417 "For no header, use header=None instead"
1418 )
1420 self._name_processed = False
1422 self._first_chunk = True
1424 # GH 13932
1425 # keep references to file handles opened by the parser itself
1426 self.handles = []
1428 def close(self):
1429 for f in self.handles:
1430 f.close()
1432 @property
1433 def _has_complex_date_col(self):
1434 return isinstance(self.parse_dates, dict) or (
1435 isinstance(self.parse_dates, list)
1436 and len(self.parse_dates) > 0
1437 and isinstance(self.parse_dates[0], list)
1438 )
1440 def _should_parse_dates(self, i):
1441 if isinstance(self.parse_dates, bool):
1442 return self.parse_dates
1443 else:
1444 if self.index_names is not None:
1445 name = self.index_names[i]
1446 else:
1447 name = None
1448 j = self.index_col[i]
1450 if is_scalar(self.parse_dates):
1451 return (j == self.parse_dates) or (
1452 name is not None and name == self.parse_dates
1453 )
1454 else:
1455 return (j in self.parse_dates) or (
1456 name is not None and name in self.parse_dates
1457 )
1459 def _extract_multi_indexer_columns(
1460 self, header, index_names, col_names, passed_names=False
1461 ):
1462 """ extract and return the names, index_names, col_names
1463 header is a list-of-lists returned from the parsers """
1464 if len(header) < 2:
1465 return header[0], index_names, col_names, passed_names
1467 # the names are the tuples of the header that are not the index cols
1468 # 0 is the name of the index, assuming index_col is a list of column
1469 # numbers
1470 ic = self.index_col
1471 if ic is None:
1472 ic = []
1474 if not isinstance(ic, (list, tuple, np.ndarray)):
1475 ic = [ic]
1476 sic = set(ic)
1478 # clean the index_names
1479 index_names = header.pop(-1)
1480 index_names, names, index_col = _clean_index_names(
1481 index_names, self.index_col, self.unnamed_cols
1482 )
1484 # extract the columns
1485 field_count = len(header[0])
1487 def extract(r):
1488 return tuple(r[i] for i in range(field_count) if i not in sic)
1490 columns = list(zip(*(extract(r) for r in header)))
1491 names = ic + columns
1493 # If we find unnamed columns all in a single
1494 # level, then our header was too long.
1495 for n in range(len(columns[0])):
1496 if all(ensure_str(col[n]) in self.unnamed_cols for col in columns):
1497 raise ParserError(
1498 "Passed header=[{header}] are too many rows for this "
1499 "multi_index of columns".format(
1500 header=",".join(str(x) for x in self.header)
1501 )
1502 )
1504 # Clean the column names (if we have an index_col).
1505 if len(ic):
1506 col_names = [
1507 r[0] if (len(r[0]) and r[0] not in self.unnamed_cols) else None
1508 for r in header
1509 ]
1510 else:
1511 col_names = [None] * len(header)
1513 passed_names = True
1515 return names, index_names, col_names, passed_names
1517 def _maybe_dedup_names(self, names):
1518 # see gh-7160 and gh-9424: this helps to provide
1519 # immediate alleviation of the duplicate names
1520 # issue and appears to be satisfactory to users,
1521 # but ultimately, not needing to butcher the names
1522 # would be nice!
1523 if self.mangle_dupe_cols:
1524 names = list(names) # so we can index
1525 counts = defaultdict(int)
1526 is_potential_mi = _is_potential_multi_index(names)
1528 for i, col in enumerate(names):
1529 cur_count = counts[col]
1531 while cur_count > 0:
1532 counts[col] = cur_count + 1
1534 if is_potential_mi:
1535 col = col[:-1] + (f"{col[-1]}.{cur_count}",)
1536 else:
1537 col = f"{col}.{cur_count}"
1538 cur_count = counts[col]
1540 names[i] = col
1541 counts[col] = cur_count + 1
1543 return names
1545 def _maybe_make_multi_index_columns(self, columns, col_names=None):
1546 # possibly create a column mi here
1547 if _is_potential_multi_index(columns):
1548 columns = MultiIndex.from_tuples(columns, names=col_names)
1549 return columns
1551 def _make_index(self, data, alldata, columns, indexnamerow=False):
1552 if not _is_index_col(self.index_col) or not self.index_col:
1553 index = None
1555 elif not self._has_complex_date_col:
1556 index = self._get_simple_index(alldata, columns)
1557 index = self._agg_index(index)
1558 elif self._has_complex_date_col:
1559 if not self._name_processed:
1560 (self.index_names, _, self.index_col) = _clean_index_names(
1561 list(columns), self.index_col, self.unnamed_cols
1562 )
1563 self._name_processed = True
1564 index = self._get_complex_date_index(data, columns)
1565 index = self._agg_index(index, try_parse_dates=False)
1567 # add names for the index
1568 if indexnamerow:
1569 coffset = len(indexnamerow) - len(columns)
1570 index = index.set_names(indexnamerow[:coffset])
1572 # maybe create a mi on the columns
1573 columns = self._maybe_make_multi_index_columns(columns, self.col_names)
1575 return index, columns
1577 _implicit_index = False
1579 def _get_simple_index(self, data, columns):
1580 def ix(col):
1581 if not isinstance(col, str):
1582 return col
1583 raise ValueError(f"Index {col} invalid")
1585 to_remove = []
1586 index = []
1587 for idx in self.index_col:
1588 i = ix(idx)
1589 to_remove.append(i)
1590 index.append(data[i])
1592 # remove index items from content and columns, don't pop in
1593 # loop
1594 for i in sorted(to_remove, reverse=True):
1595 data.pop(i)
1596 if not self._implicit_index:
1597 columns.pop(i)
1599 return index
1601 def _get_complex_date_index(self, data, col_names):
1602 def _get_name(icol):
1603 if isinstance(icol, str):
1604 return icol
1606 if col_names is None:
1607 raise ValueError(f"Must supply column order to use {icol!s} as index")
1609 for i, c in enumerate(col_names):
1610 if i == icol:
1611 return c
1613 to_remove = []
1614 index = []
1615 for idx in self.index_col:
1616 name = _get_name(idx)
1617 to_remove.append(name)
1618 index.append(data[name])
1620 # remove index items from content and columns, don't pop in
1621 # loop
1622 for c in sorted(to_remove, reverse=True):
1623 data.pop(c)
1624 col_names.remove(c)
1626 return index
1628 def _agg_index(self, index, try_parse_dates=True):
1629 arrays = []
1631 for i, arr in enumerate(index):
1633 if try_parse_dates and self._should_parse_dates(i):
1634 arr = self._date_conv(arr)
1636 if self.na_filter:
1637 col_na_values = self.na_values
1638 col_na_fvalues = self.na_fvalues
1639 else:
1640 col_na_values = set()
1641 col_na_fvalues = set()
1643 if isinstance(self.na_values, dict):
1644 col_name = self.index_names[i]
1645 if col_name is not None:
1646 col_na_values, col_na_fvalues = _get_na_values(
1647 col_name, self.na_values, self.na_fvalues, self.keep_default_na
1648 )
1650 arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
1651 arrays.append(arr)
1653 names = self.index_names
1654 index = ensure_index_from_sequences(arrays, names)
1656 return index
1658 def _convert_to_ndarrays(
1659 self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None
1660 ):
1661 result = {}
1662 for c, values in dct.items():
1663 conv_f = None if converters is None else converters.get(c, None)
1664 if isinstance(dtypes, dict):
1665 cast_type = dtypes.get(c, None)
1666 else:
1667 # single dtype or None
1668 cast_type = dtypes
1670 if self.na_filter:
1671 col_na_values, col_na_fvalues = _get_na_values(
1672 c, na_values, na_fvalues, self.keep_default_na
1673 )
1674 else:
1675 col_na_values, col_na_fvalues = set(), set()
1677 if conv_f is not None:
1678 # conv_f applied to data before inference
1679 if cast_type is not None:
1680 warnings.warn(
1681 (
1682 "Both a converter and dtype were specified "
1683 f"for column {c} - only the converter will "
1684 "be used"
1685 ),
1686 ParserWarning,
1687 stacklevel=7,
1688 )
1690 try:
1691 values = lib.map_infer(values, conv_f)
1692 except ValueError:
1693 mask = algorithms.isin(values, list(na_values)).view(np.uint8)
1694 values = lib.map_infer_mask(values, conv_f, mask)
1696 cvals, na_count = self._infer_types(
1697 values, set(col_na_values) | col_na_fvalues, try_num_bool=False
1698 )
1699 else:
1700 is_str_or_ea_dtype = is_string_dtype(
1701 cast_type
1702 ) or is_extension_array_dtype(cast_type)
1703 # skip inference if specified dtype is object
1704 # or casting to an EA
1705 try_num_bool = not (cast_type and is_str_or_ea_dtype)
1707 # general type inference and conversion
1708 cvals, na_count = self._infer_types(
1709 values, set(col_na_values) | col_na_fvalues, try_num_bool
1710 )
1712 # type specified in dtype param or cast_type is an EA
1713 if cast_type and (
1714 not is_dtype_equal(cvals, cast_type)
1715 or is_extension_array_dtype(cast_type)
1716 ):
1717 try:
1718 if (
1719 is_bool_dtype(cast_type)
1720 and not is_categorical_dtype(cast_type)
1721 and na_count > 0
1722 ):
1723 raise ValueError(f"Bool column has NA values in column {c}")
1724 except (AttributeError, TypeError):
1725 # invalid input to is_bool_dtype
1726 pass
1727 cvals = self._cast_types(cvals, cast_type, c)
1729 result[c] = cvals
1730 if verbose and na_count:
1731 print(f"Filled {na_count} NA values in column {c!s}")
1732 return result
1734 def _infer_types(self, values, na_values, try_num_bool=True):
1735 """
1736 Infer types of values, possibly casting
1738 Parameters
1739 ----------
1740 values : ndarray
1741 na_values : set
1742 try_num_bool : bool, default try
1743 try to cast values to numeric (first preference) or boolean
1745 Returns
1746 -------
1747 converted : ndarray
1748 na_count : int
1749 """
1750 na_count = 0
1751 if issubclass(values.dtype.type, (np.number, np.bool_)):
1752 mask = algorithms.isin(values, list(na_values))
1753 na_count = mask.sum()
1754 if na_count > 0:
1755 if is_integer_dtype(values):
1756 values = values.astype(np.float64)
1757 np.putmask(values, mask, np.nan)
1758 return values, na_count
1760 if try_num_bool and is_object_dtype(values.dtype):
1761 # exclude e.g DatetimeIndex here
1762 try:
1763 result = lib.maybe_convert_numeric(values, na_values, False)
1764 except (ValueError, TypeError):
1765 # e.g. encountering datetime string gets ValueError
1766 # TypeError can be raised in floatify
1767 result = values
1768 na_count = parsers.sanitize_objects(result, na_values, False)
1769 else:
1770 na_count = isna(result).sum()
1771 else:
1772 result = values
1773 if values.dtype == np.object_:
1774 na_count = parsers.sanitize_objects(values, na_values, False)
1776 if result.dtype == np.object_ and try_num_bool:
1777 result = libops.maybe_convert_bool(
1778 np.asarray(values),
1779 true_values=self.true_values,
1780 false_values=self.false_values,
1781 )
1783 return result, na_count
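# --- Editor's illustrative sketch (not part of the pandas source) ---
# The boolean-conversion branch above is what makes true_values/false_values
# work for columns that cannot be parsed as numbers. Example data is made up.
import pandas as pd
from io import StringIO

csv_data = "flag\nyes\nno\nyes\n"
df = pd.read_csv(StringIO(csv_data), true_values=["yes"], false_values=["no"])
print(df["flag"].dtype)  # bool
# --- end of sketch ---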
1785 def _cast_types(self, values, cast_type, column):
1786 """
1787 Cast values to specified type
1789 Parameters
1790 ----------
1791 values : ndarray
1792 cast_type : string or np.dtype
1793 dtype to cast values to
1794 column : string
1795 column name - used only for error reporting
1797 Returns
1798 -------
1799 converted : ndarray
1800 """
1802 if is_categorical_dtype(cast_type):
1803 known_cats = (
1804 isinstance(cast_type, CategoricalDtype)
1805 and cast_type.categories is not None
1806 )
1808 if not is_object_dtype(values) and not known_cats:
1809 # XXX this is for consistency with
1810 # c-parser which parses all categories
1811 # as strings
1812 values = astype_nansafe(values, str)
1814 cats = Index(values).unique().dropna()
1815 values = Categorical._from_inferred_categories(
1816 cats, cats.get_indexer(values), cast_type, true_values=self.true_values
1817 )
1819 # use the EA's implementation of casting
1820 elif is_extension_array_dtype(cast_type):
1821 # ensure cast_type is an actual dtype and not a string
1822 cast_type = pandas_dtype(cast_type)
1823 array_type = cast_type.construct_array_type()
1824 try:
1825 return array_type._from_sequence_of_strings(values, dtype=cast_type)
1826 except NotImplementedError:
1827 raise NotImplementedError(
1828 f"Extension Array: {array_type} must implement "
1829 "_from_sequence_of_strings in order "
1830 "to be used in parser methods"
1831 )
1833 else:
1834 try:
1835 values = astype_nansafe(values, cast_type, copy=True, skipna=True)
1836 except ValueError:
1837 raise ValueError(
1838 f"Unable to convert column {column} to type {cast_type}"
1839 )
1840 return values
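# --- Editor's illustrative sketch (not part of the pandas source) ---
# _cast_types above backs dtype="category" for the python engine: values are
# parsed as strings, de-duplicated, and wrapped in a Categorical. Data is made up.
import pandas as pd
from io import StringIO

csv_data = "color\nred\nblue\nred\n"
df = pd.read_csv(StringIO(csv_data), dtype={"color": "category"}, engine="python")
print(df["color"].dtype)     # category
print(df["color"].tolist())  # ['red', 'blue', 'red']
# --- end of sketch ---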
1842 def _do_date_conversions(self, names, data):
1843 # returns data, columns
1845 if self.parse_dates is not None:
1846 data, names = _process_date_conversion(
1847 data,
1848 self._date_conv,
1849 self.parse_dates,
1850 self.index_col,
1851 self.index_names,
1852 names,
1853 keep_date_col=self.keep_date_col,
1854 )
1856 return names, data
1859class CParserWrapper(ParserBase):
1860 """
1862 """
1864 def __init__(self, src, **kwds):
1865 self.kwds = kwds
1866 kwds = kwds.copy()
1868 ParserBase.__init__(self, kwds)
1870 encoding = kwds.get("encoding")
1872 if kwds.get("compression") is None and encoding:
1873 if isinstance(src, str):
1874 src = open(src, "rb")
1875 self.handles.append(src)
1877 # Handle the file object with universal line mode enabled.
1878 # We will handle the newline character ourselves later on.
1879 if hasattr(src, "read") and not hasattr(src, "encoding"):
1880 src = TextIOWrapper(src, encoding=encoding, newline="")
1882 kwds["encoding"] = "utf-8"
1884 # #2442
1885 kwds["allow_leading_cols"] = self.index_col is not False
1887 # GH20529, validate usecol arg before TextReader
1888 self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"])
1889 kwds["usecols"] = self.usecols
1891 self._reader = parsers.TextReader(src, **kwds)
1892 self.unnamed_cols = self._reader.unnamed_cols
1894 passed_names = self.names is None
1896 if self._reader.header is None:
1897 self.names = None
1898 else:
1899 if len(self._reader.header) > 1:
1900 # we have a multi index in the columns
1901 (
1902 self.names,
1903 self.index_names,
1904 self.col_names,
1905 passed_names,
1906 ) = self._extract_multi_indexer_columns(
1907 self._reader.header, self.index_names, self.col_names, passed_names
1908 )
1909 else:
1910 self.names = list(self._reader.header[0])
1912 if self.names is None:
1913 if self.prefix:
1914 self.names = [
1915 f"{self.prefix}{i}" for i in range(self._reader.table_width)
1916 ]
1917 else:
1918 self.names = list(range(self._reader.table_width))
1920 # gh-9755
1921 #
1922 # need to set orig_names here first
1923 # so that proper indexing can be done
1924 # with _set_noconvert_columns
1925 #
1926 # once names has been filtered, we will
1927 # then set orig_names again to names
1928 self.orig_names = self.names[:]
1930 if self.usecols:
1931 usecols = _evaluate_usecols(self.usecols, self.orig_names)
1933 # GH 14671
1934 if self.usecols_dtype == "string" and not set(usecols).issubset(
1935 self.orig_names
1936 ):
1937 _validate_usecols_names(usecols, self.orig_names)
1939 if len(self.names) > len(usecols):
1940 self.names = [
1941 n
1942 for i, n in enumerate(self.names)
1943 if (i in usecols or n in usecols)
1944 ]
1946 if len(self.names) < len(usecols):
1947 _validate_usecols_names(usecols, self.names)
1949 self._set_noconvert_columns()
1951 self.orig_names = self.names
1953 if not self._has_complex_date_col:
1954 if self._reader.leading_cols == 0 and _is_index_col(self.index_col):
1956 self._name_processed = True
1957 (index_names, self.names, self.index_col) = _clean_index_names(
1958 self.names, self.index_col, self.unnamed_cols
1959 )
1961 if self.index_names is None:
1962 self.index_names = index_names
1964 if self._reader.header is None and not passed_names:
1965 self.index_names = [None] * len(self.index_names)
1967 self._implicit_index = self._reader.leading_cols > 0
1969 def close(self):
1970 for f in self.handles:
1971 f.close()
1973 # close additional handles opened by C parser (for compression)
1974 try:
1975 self._reader.close()
1976 except ValueError:
1977 pass
1979 def _set_noconvert_columns(self):
1980 """
1981 Set the columns that should not undergo dtype conversions.
1983 Currently, any column that is involved with date parsing will not
1984 undergo such conversions.
1985 """
1986 names = self.orig_names
1987 if self.usecols_dtype == "integer":
1988 # A set of integers will be converted to a list in
1989 # the correct order every single time.
1990 usecols = list(self.usecols)
1991 usecols.sort()
1992 elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
1993 # The names attribute should have the correct columns
1994 # in the proper order for indexing with parse_dates.
1995 usecols = self.names[:]
1996 else:
1997 # Usecols is empty.
1998 usecols = None
2000 def _set(x):
2001 if usecols is not None and is_integer(x):
2002 x = usecols[x]
2004 if not is_integer(x):
2005 x = names.index(x)
2007 self._reader.set_noconvert(x)
2009 if isinstance(self.parse_dates, list):
2010 for val in self.parse_dates:
2011 if isinstance(val, list):
2012 for k in val:
2013 _set(k)
2014 else:
2015 _set(val)
2017 elif isinstance(self.parse_dates, dict):
2018 for val in self.parse_dates.values():
2019 if isinstance(val, list):
2020 for k in val:
2021 _set(k)
2022 else:
2023 _set(val)
2025 elif self.parse_dates:
2026 if isinstance(self.index_col, list):
2027 for k in self.index_col:
2028 _set(k)
2029 elif self.index_col is not None:
2030 _set(self.index_col)
2032 def set_error_bad_lines(self, status):
2033 self._reader.set_error_bad_lines(int(status))
2035 def read(self, nrows=None):
2036 try:
2037 data = self._reader.read(nrows)
2038 except StopIteration:
2039 if self._first_chunk:
2040 self._first_chunk = False
2041 names = self._maybe_dedup_names(self.orig_names)
2042 index, columns, col_dict = _get_empty_meta(
2043 names,
2044 self.index_col,
2045 self.index_names,
2046 dtype=self.kwds.get("dtype"),
2047 )
2048 columns = self._maybe_make_multi_index_columns(columns, self.col_names)
2050 if self.usecols is not None:
2051 columns = self._filter_usecols(columns)
2053 col_dict = dict(
2054 filter(lambda item: item[0] in columns, col_dict.items())
2055 )
2057 return index, columns, col_dict
2059 else:
2060 raise
2062 # Done with first read, next time raise StopIteration
2063 self._first_chunk = False
2065 names = self.names
2067 if self._reader.leading_cols:
2068 if self._has_complex_date_col:
2069 raise NotImplementedError("file structure not yet supported")
2071 # implicit index, no index names
2072 arrays = []
2074 for i in range(self._reader.leading_cols):
2075 if self.index_col is None:
2076 values = data.pop(i)
2077 else:
2078 values = data.pop(self.index_col[i])
2080 values = self._maybe_parse_dates(values, i, try_parse_dates=True)
2081 arrays.append(values)
2083 index = ensure_index_from_sequences(arrays)
2085 if self.usecols is not None:
2086 names = self._filter_usecols(names)
2088 names = self._maybe_dedup_names(names)
2090 # rename dict keys
2091 data = sorted(data.items())
2092 data = {k: v for k, (i, v) in zip(names, data)}
2094 names, data = self._do_date_conversions(names, data)
2096 else:
2097 # rename dict keys
2098 data = sorted(data.items())
2100 # ugh, mutation
2101 names = list(self.orig_names)
2102 names = self._maybe_dedup_names(names)
2104 if self.usecols is not None:
2105 names = self._filter_usecols(names)
2107 # columns as list
2108 alldata = [x[1] for x in data]
2110 data = {k: v for k, (i, v) in zip(names, data)}
2112 names, data = self._do_date_conversions(names, data)
2113 index, names = self._make_index(data, alldata, names)
2115 # maybe create a mi on the columns
2116 names = self._maybe_make_multi_index_columns(names, self.col_names)
2118 return index, names, data
2120 def _filter_usecols(self, names):
2121 # hackish
2122 usecols = _evaluate_usecols(self.usecols, names)
2123 if usecols is not None and len(names) != len(usecols):
2124 names = [
2125 name for i, name in enumerate(names) if i in usecols or name in usecols
2126 ]
2127 return names
2129 def _get_index_names(self):
2130 names = list(self._reader.header[0])
2131 idx_names = None
2133 if self._reader.leading_cols == 0 and self.index_col is not None:
2134 (idx_names, names, self.index_col) = _clean_index_names(
2135 names, self.index_col, self.unnamed_cols
2136 )
2138 return names, idx_names
2140 def _maybe_parse_dates(self, values, index, try_parse_dates=True):
2141 if try_parse_dates and self._should_parse_dates(index):
2142 values = self._date_conv(values)
2143 return values
2146def TextParser(*args, **kwds):
2147 """
2148 Converts lists of lists/tuples into DataFrames with proper type inference
2149 and optional (e.g. string to datetime) conversion. Also enables iterating
2150 lazily over chunks of large files.
2152 Parameters
2153 ----------
2154 data : file-like object or list
2155 delimiter : separator character to use
2156 dialect : str or csv.Dialect instance, optional
2157 Ignored if delimiter is longer than 1 character
2158 names : sequence, optional
2159 header : int, default 0
2160 Row to use to parse column labels. Defaults to the first row. Prior
2161 rows will be discarded
2162 index_col : int or list, optional
2163 Column or columns to use as the (possibly hierarchical) index
2164 has_index_names : bool, default False
2165 True if the cols defined in index_col have an index name and are
2166 not in the header.
2167 na_values : scalar, str, list-like, or dict, optional
2168 Additional strings to recognize as NA/NaN.
2169 keep_default_na : bool, default True
2170 thousands : str, optional
2171 Thousands separator
2172 comment : str, optional
2173 Comment out remainder of line
2174 parse_dates : bool, default False
2175 keep_date_col : bool, default False
2176 date_parser : function, optional
2177 skiprows : list of integers
2178 Row numbers to skip
2179 skipfooter : int
2180 Number of lines at the bottom of the file to skip
2181 converters : dict, optional
2182 Dict of functions for converting values in certain columns. Keys can
2183 either be integers or column labels, values are functions that take one
2184 input argument, the cell (not column) content, and return the
2185 transformed content.
2186 encoding : str, optional
2187 Encoding to use for UTF when reading/writing (ex. 'utf-8')
2188 squeeze : bool, default False
2189 Returns a Series if only one column.
2190 infer_datetime_format : bool, default False
2191 If True and `parse_dates` is True for a column, try to infer the
2192 datetime format based on the first datetime string. If the format
2193 can be inferred, there often will be a large parsing speed-up.
2194 float_precision : str, optional
2195 Specifies which converter the C engine should use for floating-point
2196 values. The options are None for the ordinary converter,
2197 'high' for the high-precision converter, and 'round_trip' for the
2198 round-trip converter.
2199 """
2200 kwds["engine"] = "python"
2201 return TextFileReader(*args, **kwds)
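# --- Editor's illustrative sketch (not part of the pandas source) ---
# A minimal use of TextParser on an in-memory list of rows, as read_html and
# read_excel do internally; header=0 takes the first row as column labels.
from pandas.io.parsers import TextParser

rows = [["a", "b"], ["1", "2"], ["3", "4"]]
df = TextParser(rows, header=0).read()
print(df.dtypes)  # both columns inferred as int64
# --- end of sketch ---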
2204def count_empty_vals(vals):
2205 return sum(1 for v in vals if v == "" or v is None)
2208class PythonParser(ParserBase):
2209 def __init__(self, f, **kwds):
2210 """
2211 Workhorse function for processing a nested list into a DataFrame
2212 """
2213 ParserBase.__init__(self, kwds)
2215 self.data = None
2216 self.buf = []
2217 self.pos = 0
2218 self.line_pos = 0
2220 self.encoding = kwds["encoding"]
2221 self.compression = kwds["compression"]
2222 self.memory_map = kwds["memory_map"]
2223 self.skiprows = kwds["skiprows"]
2225 if callable(self.skiprows):
2226 self.skipfunc = self.skiprows
2227 else:
2228 self.skipfunc = lambda x: x in self.skiprows
2230 self.skipfooter = _validate_skipfooter_arg(kwds["skipfooter"])
2231 self.delimiter = kwds["delimiter"]
2233 self.quotechar = kwds["quotechar"]
2234 if isinstance(self.quotechar, str):
2235 self.quotechar = str(self.quotechar)
2237 self.escapechar = kwds["escapechar"]
2238 self.doublequote = kwds["doublequote"]
2239 self.skipinitialspace = kwds["skipinitialspace"]
2240 self.lineterminator = kwds["lineterminator"]
2241 self.quoting = kwds["quoting"]
2242 self.usecols, _ = _validate_usecols_arg(kwds["usecols"])
2243 self.skip_blank_lines = kwds["skip_blank_lines"]
2245 self.warn_bad_lines = kwds["warn_bad_lines"]
2246 self.error_bad_lines = kwds["error_bad_lines"]
2248 self.names_passed = kwds["names"] or None
2250 self.has_index_names = False
2251 if "has_index_names" in kwds:
2252 self.has_index_names = kwds["has_index_names"]
2254 self.verbose = kwds["verbose"]
2255 self.converters = kwds["converters"]
2257 self.dtype = kwds["dtype"]
2258 self.thousands = kwds["thousands"]
2259 self.decimal = kwds["decimal"]
2261 self.comment = kwds["comment"]
2262 self._comment_lines = []
2264 f, handles = get_handle(
2265 f,
2266 "r",
2267 encoding=self.encoding,
2268 compression=self.compression,
2269 memory_map=self.memory_map,
2270 )
2271 self.handles.extend(handles)
2273 # Set self.data to something that can read lines.
2274 if hasattr(f, "readline"):
2275 self._make_reader(f)
2276 else:
2277 self.data = f
2279 # Get columns in two steps: infer from data, then
2280 # infer column indices from self.usecols if it is specified.
2281 self._col_indices = None
2282 (
2283 self.columns,
2284 self.num_original_columns,
2285 self.unnamed_cols,
2286 ) = self._infer_columns()
2288 # Now self.columns has the set of columns that we will process.
2289 # The original set is stored in self.original_columns.
2290 if len(self.columns) > 1:
2291 # we are processing a multi index column
2292 (
2293 self.columns,
2294 self.index_names,
2295 self.col_names,
2296 _,
2297 ) = self._extract_multi_indexer_columns(
2298 self.columns, self.index_names, self.col_names
2299 )
2300 # Update list of original names to include all indices.
2301 self.num_original_columns = len(self.columns)
2302 else:
2303 self.columns = self.columns[0]
2305 # get popped off for index
2306 self.orig_names = list(self.columns)
2308 # needs to be cleaned/refactored
2309 # multiple date column thing turning into a real spaghetti factory
2311 if not self._has_complex_date_col:
2312 (index_names, self.orig_names, self.columns) = self._get_index_name(
2313 self.columns
2314 )
2315 self._name_processed = True
2316 if self.index_names is None:
2317 self.index_names = index_names
2319 if self.parse_dates:
2320 self._no_thousands_columns = self._set_no_thousands_columns()
2321 else:
2322 self._no_thousands_columns = None
2324 if len(self.decimal) != 1:
2325 raise ValueError("Only length-1 decimal markers supported")
2327 if self.thousands is None:
2328 self.nonnum = re.compile(fr"[^-^0-9^{self.decimal}]+")
2329 else:
2330 self.nonnum = re.compile(fr"[^-^0-9^{self.thousands}^{self.decimal}]+")
2332 def _set_no_thousands_columns(self):
2333 # Create a set of column ids that are not to be stripped of thousands
2334 # operators.
2335 noconvert_columns = set()
2337 def _set(x):
2338 if is_integer(x):
2339 noconvert_columns.add(x)
2340 else:
2341 noconvert_columns.add(self.columns.index(x))
2343 if isinstance(self.parse_dates, list):
2344 for val in self.parse_dates:
2345 if isinstance(val, list):
2346 for k in val:
2347 _set(k)
2348 else:
2349 _set(val)
2351 elif isinstance(self.parse_dates, dict):
2352 for val in self.parse_dates.values():
2353 if isinstance(val, list):
2354 for k in val:
2355 _set(k)
2356 else:
2357 _set(val)
2359 elif self.parse_dates:
2360 if isinstance(self.index_col, list):
2361 for k in self.index_col:
2362 _set(k)
2363 elif self.index_col is not None:
2364 _set(self.index_col)
2366 return noconvert_columns
2368 def _make_reader(self, f):
2369 sep = self.delimiter
2371 if sep is None or len(sep) == 1:
2372 if self.lineterminator:
2373 raise ValueError(
2374 "Custom line terminators not supported in python parser (yet)"
2375 )
2377 class MyDialect(csv.Dialect):
2378 delimiter = self.delimiter
2379 quotechar = self.quotechar
2380 escapechar = self.escapechar
2381 doublequote = self.doublequote
2382 skipinitialspace = self.skipinitialspace
2383 quoting = self.quoting
2384 lineterminator = "\n"
2386 dia = MyDialect
2388 sniff_sep = True
2390 if sep is not None:
2391 sniff_sep = False
2392 dia.delimiter = sep
2393 # attempt to sniff the delimiter
2394 if sniff_sep:
2395 line = f.readline()
2396 while self.skipfunc(self.pos):
2397 self.pos += 1
2398 line = f.readline()
2400 line = self._check_comments([line])[0]
2402 self.pos += 1
2403 self.line_pos += 1
2404 sniffed = csv.Sniffer().sniff(line)
2405 dia.delimiter = sniffed.delimiter
2407 # Note: self.encoding is irrelevant here
2408 line_rdr = csv.reader(StringIO(line), dialect=dia)
2409 self.buf.extend(list(line_rdr))
2411 # Note: self.encoding is irrelevant here
2412 reader = csv.reader(f, dialect=dia, strict=True)
2414 else:
2416 def _read():
2417 line = f.readline()
2418 pat = re.compile(sep)
2420 yield pat.split(line.strip())
2422 for line in f:
2423 yield pat.split(line.strip())
2425 reader = _read()
2427 self.data = reader
2429 def read(self, rows=None):
2430 try:
2431 content = self._get_lines(rows)
2432 except StopIteration:
2433 if self._first_chunk:
2434 content = []
2435 else:
2436 raise
2438 # done with first read, next time raise StopIteration
2439 self._first_chunk = False
2441 columns = list(self.orig_names)
2442 if not len(content): # pragma: no cover
2443 # DataFrame with the right metadata, even though it's length 0
2444 names = self._maybe_dedup_names(self.orig_names)
2445 index, columns, col_dict = _get_empty_meta(
2446 names, self.index_col, self.index_names, self.dtype
2447 )
2448 columns = self._maybe_make_multi_index_columns(columns, self.col_names)
2449 return index, columns, col_dict
2451 # handle new style for names in index
2452 count_empty_content_vals = count_empty_vals(content[0])
2453 indexnamerow = None
2454 if self.has_index_names and count_empty_content_vals == len(columns):
2455 indexnamerow = content[0]
2456 content = content[1:]
2458 alldata = self._rows_to_cols(content)
2459 data = self._exclude_implicit_index(alldata)
2461 columns = self._maybe_dedup_names(self.columns)
2462 columns, data = self._do_date_conversions(columns, data)
2464 data = self._convert_data(data)
2465 index, columns = self._make_index(data, alldata, columns, indexnamerow)
2467 return index, columns, data
2469 def _exclude_implicit_index(self, alldata):
2470 names = self._maybe_dedup_names(self.orig_names)
2472 if self._implicit_index:
2473 excl_indices = self.index_col
2475 data = {}
2476 offset = 0
2477 for i, col in enumerate(names):
2478 while i + offset in excl_indices:
2479 offset += 1
2480 data[col] = alldata[i + offset]
2481 else:
2482 data = {k: v for k, v in zip(names, alldata)}
2484 return data
2486 # legacy
2487 def get_chunk(self, size=None):
2488 if size is None:
2489 size = self.chunksize
2490 return self.read(rows=size)
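# --- Editor's illustrative sketch (not part of the pandas source) ---
# get_chunk above is the python-engine half of chunked reading; through the
# public API, chunksize yields successive DataFrames. Example data is made up.
import pandas as pd
from io import StringIO

csv_data = "a,b\n1,2\n3,4\n5,6\n"
reader = pd.read_csv(StringIO(csv_data), engine="python", chunksize=2)
for chunk in reader:
    print(len(chunk))  # 2, then 1
# --- end of sketch ---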
2492 def _convert_data(self, data):
2493 # apply converters
2494 def _clean_mapping(mapping):
2495 "converts col numbers to names"
2496 clean = {}
2497 for col, v in mapping.items():
2498 if isinstance(col, int) and col not in self.orig_names:
2499 col = self.orig_names[col]
2500 clean[col] = v
2501 return clean
2503 clean_conv = _clean_mapping(self.converters)
2504 if not isinstance(self.dtype, dict):
2505 # handles single dtype applied to all columns
2506 clean_dtypes = self.dtype
2507 else:
2508 clean_dtypes = _clean_mapping(self.dtype)
2510 # Apply NA values.
2511 clean_na_values = {}
2512 clean_na_fvalues = {}
2514 if isinstance(self.na_values, dict):
2515 for col in self.na_values:
2516 na_value = self.na_values[col]
2517 na_fvalue = self.na_fvalues[col]
2519 if isinstance(col, int) and col not in self.orig_names:
2520 col = self.orig_names[col]
2522 clean_na_values[col] = na_value
2523 clean_na_fvalues[col] = na_fvalue
2524 else:
2525 clean_na_values = self.na_values
2526 clean_na_fvalues = self.na_fvalues
2528 return self._convert_to_ndarrays(
2529 data,
2530 clean_na_values,
2531 clean_na_fvalues,
2532 self.verbose,
2533 clean_conv,
2534 clean_dtypes,
2535 )
2537 def _infer_columns(self):
2538 names = self.names
2539 num_original_columns = 0
2540 clear_buffer = True
2541 unnamed_cols = set()
2543 if self.header is not None:
2544 header = self.header
2546 if isinstance(header, (list, tuple, np.ndarray)):
2547 have_mi_columns = len(header) > 1
2548 # we have a mi columns, so read an extra line
2549 if have_mi_columns:
2550 header = list(header) + [header[-1] + 1]
2551 else:
2552 have_mi_columns = False
2553 header = [header]
2555 columns = []
2556 for level, hr in enumerate(header):
2557 try:
2558 line = self._buffered_line()
2560 while self.line_pos <= hr:
2561 line = self._next_line()
2563 except StopIteration:
2564 if self.line_pos < hr:
2565 raise ValueError(
2566 f"Passed header={hr} but only {self.line_pos + 1} lines in "
2567 "file"
2568 )
2570 # We have an empty file, so check
2571 # if columns are provided. That will
2572 # serve as the 'line' for parsing
2573 if have_mi_columns and hr > 0:
2574 if clear_buffer:
2575 self._clear_buffer()
2576 columns.append([None] * len(columns[-1]))
2577 return columns, num_original_columns, unnamed_cols
2579 if not self.names:
2580 raise EmptyDataError("No columns to parse from file")
2582 line = self.names[:]
2584 this_columns = []
2585 this_unnamed_cols = []
2587 for i, c in enumerate(line):
2588 if c == "":
2589 if have_mi_columns:
2590 col_name = f"Unnamed: {i}_level_{level}"
2591 else:
2592 col_name = f"Unnamed: {i}"
2594 this_unnamed_cols.append(i)
2595 this_columns.append(col_name)
2596 else:
2597 this_columns.append(c)
2599 if not have_mi_columns and self.mangle_dupe_cols:
2600 counts = defaultdict(int)
2602 for i, col in enumerate(this_columns):
2603 cur_count = counts[col]
2605 while cur_count > 0:
2606 counts[col] = cur_count + 1
2607 col = f"{col}.{cur_count}"
2608 cur_count = counts[col]
2610 this_columns[i] = col
2611 counts[col] = cur_count + 1
2612 elif have_mi_columns:
2614 # if we have grabbed an extra line, but it's not in our
2615 # format, save it in the buffer and create a blank extra
2616 # line for the rest of the parsing code
2617 if hr == header[-1]:
2618 lc = len(this_columns)
2619 ic = len(self.index_col) if self.index_col is not None else 0
2620 unnamed_count = len(this_unnamed_cols)
2622 if lc != unnamed_count and lc - ic > unnamed_count:
2623 clear_buffer = False
2624 this_columns = [None] * lc
2625 self.buf = [self.buf[-1]]
2627 columns.append(this_columns)
2628 unnamed_cols.update({this_columns[i] for i in this_unnamed_cols})
2630 if len(columns) == 1:
2631 num_original_columns = len(this_columns)
2633 if clear_buffer:
2634 self._clear_buffer()
2636 if names is not None:
2637 if (self.usecols is not None and len(names) != len(self.usecols)) or (
2638 self.usecols is None and len(names) != len(columns[0])
2639 ):
2640 raise ValueError(
2641 "Number of passed names did not match "
2642 "number of header fields in the file"
2643 )
2644 if len(columns) > 1:
2645 raise TypeError("Cannot pass names with multi-index columns")
2647 if self.usecols is not None:
2648 # Set _use_cols. We don't store columns because they are
2649 # overwritten.
2650 self._handle_usecols(columns, names)
2651 else:
2652 self._col_indices = None
2653 num_original_columns = len(names)
2654 columns = [names]
2655 else:
2656 columns = self._handle_usecols(columns, columns[0])
2657 else:
2658 try:
2659 line = self._buffered_line()
2661 except StopIteration:
2662 if not names:
2663 raise EmptyDataError("No columns to parse from file")
2665 line = names[:]
2667 ncols = len(line)
2668 num_original_columns = ncols
2670 if not names:
2671 if self.prefix:
2672 columns = [[f"{self.prefix}{i}" for i in range(ncols)]]
2673 else:
2674 columns = [list(range(ncols))]
2675 columns = self._handle_usecols(columns, columns[0])
2676 else:
2677 if self.usecols is None or len(names) >= num_original_columns:
2678 columns = self._handle_usecols([names], names)
2679 num_original_columns = len(names)
2680 else:
2681 if not callable(self.usecols) and len(names) != len(self.usecols):
2682 raise ValueError(
2683 "Number of passed names did not match number of "
2684 "header fields in the file"
2685 )
2686 # Ignore output but set used columns.
2687 self._handle_usecols([names], names)
2688 columns = [names]
2689 num_original_columns = ncols
2691 return columns, num_original_columns, unnamed_cols
2693 def _handle_usecols(self, columns, usecols_key):
2694 """
2695 Sets self._col_indices
2697 usecols_key is used if there are string usecols.
2698 """
2699 if self.usecols is not None:
2700 if callable(self.usecols):
2701 col_indices = _evaluate_usecols(self.usecols, usecols_key)
2702 elif any(isinstance(u, str) for u in self.usecols):
2703 if len(columns) > 1:
2704 raise ValueError(
2705 "If using multiple headers, usecols must be integers."
2706 )
2707 col_indices = []
2709 for col in self.usecols:
2710 if isinstance(col, str):
2711 try:
2712 col_indices.append(usecols_key.index(col))
2713 except ValueError:
2714 _validate_usecols_names(self.usecols, usecols_key)
2715 else:
2716 col_indices.append(col)
2717 else:
2718 col_indices = self.usecols
2720 columns = [
2721 [n for i, n in enumerate(column) if i in col_indices]
2722 for column in columns
2723 ]
2724 self._col_indices = col_indices
2725 return columns
2727 def _buffered_line(self):
2728 """
2729 Return a line from buffer, filling buffer if required.
2730 """
2731 if len(self.buf) > 0:
2732 return self.buf[0]
2733 else:
2734 return self._next_line()
2736 def _check_for_bom(self, first_row):
2737 """
2738 Checks whether the file begins with the BOM character.
2739 If it does, remove it. In addition, if there is quoting
2740 in the field subsequent to the BOM, remove it as well
2741 because it technically takes place at the beginning of
2742 the name, not the middle of it.
2743 """
2744 # first_row will be a list, so we need to check
2745 # that that list is not empty before proceeding.
2746 if not first_row:
2747 return first_row
2749 # The first element of this row is the one that could have the
2750 # BOM that we want to remove. Check that the first element is a
2751 # string before proceeding.
2752 if not isinstance(first_row[0], str):
2753 return first_row
2755 # Check that the string is not empty, as that would
2756 # obviously not have a BOM at the start of it.
2757 if not first_row[0]:
2758 return first_row
2760 # Since the string is non-empty, check that it does
2761 # in fact begin with a BOM.
2762 first_elt = first_row[0][0]
2763 if first_elt != _BOM:
2764 return first_row
2766 first_row_bom = first_row[0]
2768 if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar:
2769 start = 2
2770 quote = first_row_bom[1]
2771 end = first_row_bom[2:].index(quote) + 2
2773 # Extract the data between the quotation marks
2774 new_row = first_row_bom[start:end]
2776 # Extract any remaining data after the second
2777 # quotation mark.
2778 if len(first_row_bom) > end + 1:
2779 new_row += first_row_bom[end + 1 :]
2780 return [new_row] + first_row[1:]
2782 elif len(first_row_bom) > 1:
2783 return [first_row_bom[1:]]
2784 else:
2785 # First row is just the BOM, so we
2786 # return an empty string.
2787 return [""]
2789 def _is_line_empty(self, line):
2790 """
2791 Check if a line is empty or not.
2793 Parameters
2794 ----------
2795 line : str, array-like
2796 The line of data to check.
2798 Returns
2799 -------
2800 boolean : Whether or not the line is empty.
2801 """
2802 return not line or all(not x for x in line)
2804 def _next_line(self):
2805 if isinstance(self.data, list):
2806 while self.skipfunc(self.pos):
2807 self.pos += 1
2809 while True:
2810 try:
2811 line = self._check_comments([self.data[self.pos]])[0]
2812 self.pos += 1
2813 # either uncommented or blank to begin with
2814 if not self.skip_blank_lines and (
2815 self._is_line_empty(self.data[self.pos - 1]) or line
2816 ):
2817 break
2818 elif self.skip_blank_lines:
2819 ret = self._remove_empty_lines([line])
2820 if ret:
2821 line = ret[0]
2822 break
2823 except IndexError:
2824 raise StopIteration
2825 else:
2826 while self.skipfunc(self.pos):
2827 self.pos += 1
2828 next(self.data)
2830 while True:
2831 orig_line = self._next_iter_line(row_num=self.pos + 1)
2832 self.pos += 1
2834 if orig_line is not None:
2835 line = self._check_comments([orig_line])[0]
2837 if self.skip_blank_lines:
2838 ret = self._remove_empty_lines([line])
2840 if ret:
2841 line = ret[0]
2842 break
2843 elif self._is_line_empty(orig_line) or line:
2844 break
2846 # This was the first line of the file,
2847 # which could contain the BOM at the
2848 # beginning of it.
2849 if self.pos == 1:
2850 line = self._check_for_bom(line)
2852 self.line_pos += 1
2853 self.buf.append(line)
2854 return line
2856 def _alert_malformed(self, msg, row_num):
2857 """
2858 Alert a user about a malformed row.
2860 If `self.error_bad_lines` is True, the alert will be `ParserError`.
2861 If `self.warn_bad_lines` is True, the alert will be printed out.
2863 Parameters
2864 ----------
2865 msg : The error message to display.
2866 row_num : The row number where the parsing error occurred.
2867 Because this row number is displayed, we 1-index,
2868 even though we 0-index internally.
2869 """
2871 if self.error_bad_lines:
2872 raise ParserError(msg)
2873 elif self.warn_bad_lines:
2874 base = f"Skipping line {row_num}: "
2875 sys.stderr.write(base + msg + "\n")
2877 def _next_iter_line(self, row_num):
2878 """
2879 Wrapper around iterating through `self.data` (CSV source).
2881 When a CSV error is raised, we check for specific
2882 error messages that allow us to customize the
2883 error message displayed to the user.
2885 Parameters
2886 ----------
2887 row_num : The row number of the line being parsed.
2888 """
2890 try:
2891 return next(self.data)
2892 except csv.Error as e:
2893 if self.warn_bad_lines or self.error_bad_lines:
2894 msg = str(e)
2896 if "NULL byte" in msg or "line contains NUL" in msg:
2897 msg = (
2898 "NULL byte detected. This byte "
2899 "cannot be processed in Python's "
2900 "native csv library at the moment, "
2901 "so please pass in engine='c' instead"
2902 )
2904 if self.skipfooter > 0:
2905 reason = (
2906 "Error could possibly be due to "
2907 "parsing errors in the skipped footer rows "
2908 "(the skipfooter keyword is only applied "
2909 "after Python's csv library has parsed "
2910 "all rows)."
2911 )
2912 msg += ". " + reason
2914 self._alert_malformed(msg, row_num)
2915 return None
2917 def _check_comments(self, lines):
2918 if self.comment is None:
2919 return lines
2920 ret = []
2921 for l in lines:
2922 rl = []
2923 for x in l:
2924 if not isinstance(x, str) or self.comment not in x:
2925 rl.append(x)
2926 else:
2927 x = x[: x.find(self.comment)]
2928 if len(x) > 0:
2929 rl.append(x)
2930 break
2931 ret.append(rl)
2932 return ret
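# --- Editor's illustrative sketch (not part of the pandas source) ---
# _check_comments above truncates each field at the comment character, which is
# how the `comment` keyword behaves with the python engine. Data is made up.
import pandas as pd
from io import StringIO

csv_data = "a,b\n1,2#trailing note\n#a whole-line comment\n3,4\n"
df = pd.read_csv(StringIO(csv_data), comment="#", engine="python")
print(len(df))  # 2; the note is stripped and the comment line is dropped
# --- end of sketch ---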
2934 def _remove_empty_lines(self, lines):
2935 """
2936 Iterate through the lines and remove any that are
2937 either empty or contain only a single whitespace value
2939 Parameters
2940 ----------
2941 lines : array-like
2942 The array of lines that we are to filter.
2944 Returns
2945 -------
2946 filtered_lines : array-like
2947 The same array of lines with the "empty" ones removed.
2948 """
2950 ret = []
2951 for l in lines:
2952 # Remove empty lines and lines with only one whitespace value
2953 if (
2954 len(l) > 1
2955 or len(l) == 1
2956 and (not isinstance(l[0], str) or l[0].strip())
2957 ):
2958 ret.append(l)
2959 return ret
2961 def _check_thousands(self, lines):
2962 if self.thousands is None:
2963 return lines
2965 return self._search_replace_num_columns(
2966 lines=lines, search=self.thousands, replace=""
2967 )
2969 def _search_replace_num_columns(self, lines, search, replace):
2970 ret = []
2971 for l in lines:
2972 rl = []
2973 for i, x in enumerate(l):
2974 if (
2975 not isinstance(x, str)
2976 or search not in x
2977 or (self._no_thousands_columns and i in self._no_thousands_columns)
2978 or self.nonnum.search(x.strip())
2979 ):
2980 rl.append(x)
2981 else:
2982 rl.append(x.replace(search, replace))
2983 ret.append(rl)
2984 return ret
2986 def _check_decimal(self, lines):
2987 if self.decimal == _parser_defaults["decimal"]:
2988 return lines
2990 return self._search_replace_num_columns(
2991 lines=lines, search=self.decimal, replace="."
2992 )
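# --- Editor's illustrative sketch (not part of the pandas source) ---
# _check_thousands/_check_decimal above implement the `thousands` and `decimal`
# keywords, e.g. for European-style numbers. Example data is made up.
import pandas as pd
from io import StringIO

csv_data = "price\n1.234,56\n7.890,12\n"
df = pd.read_csv(StringIO(csv_data), thousands=".", decimal=",", engine="python")
print(df["price"].tolist())  # [1234.56, 7890.12]
# --- end of sketch ---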
2994 def _clear_buffer(self):
2995 self.buf = []
2997 _implicit_index = False
2999 def _get_index_name(self, columns):
3000 """
3001 Try several cases to determine the index name(s):
3003 0) There are headers on row 0 and row 1 and their
3004 total summed lengths equals the length of the next line.
3005 Treat row 0 as columns and row 1 as indices
3006 1) Look for implicit index: there are more columns
3007 on row 1 than row 0. If this is true, assume that row
3008 1 lists index columns and row 0 lists normal columns.
3009 2) Get index from the columns if it was listed.
3010 """
3011 orig_names = list(columns)
3012 columns = list(columns)
3014 try:
3015 line = self._next_line()
3016 except StopIteration:
3017 line = None
3019 try:
3020 next_line = self._next_line()
3021 except StopIteration:
3022 next_line = None
3024 # implicitly index_col=0 b/c 1 fewer column names
3025 implicit_first_cols = 0
3026 if line is not None:
3027 # leave it 0, #2442
3028 # Case 1
3029 if self.index_col is not False:
3030 implicit_first_cols = len(line) - self.num_original_columns
3032 # Case 0
3033 if next_line is not None:
3034 if len(next_line) == len(line) + self.num_original_columns:
3035 # column and index names on diff rows
3036 self.index_col = list(range(len(line)))
3037 self.buf = self.buf[1:]
3039 for c in reversed(line):
3040 columns.insert(0, c)
3042 # Update list of original names to include all indices.
3043 orig_names = list(columns)
3044 self.num_original_columns = len(columns)
3045 return line, orig_names, columns
3047 if implicit_first_cols > 0:
3048 # Case 1
3049 self._implicit_index = True
3050 if self.index_col is None:
3051 self.index_col = list(range(implicit_first_cols))
3053 index_name = None
3055 else:
3056 # Case 2
3057 (index_name, columns_, self.index_col) = _clean_index_names(
3058 columns, self.index_col, self.unnamed_cols
3059 )
3061 return index_name, orig_names, columns
3063 def _rows_to_cols(self, content):
3064 col_len = self.num_original_columns
3066 if self._implicit_index:
3067 col_len += len(self.index_col)
3069 max_len = max(len(row) for row in content)
3071 # Check that there are no rows with too many
3072 # elements in their row (rows with too few
3073 # elements are padded with NaN).
3074 if max_len > col_len and self.index_col is not False and self.usecols is None:
3076 footers = self.skipfooter if self.skipfooter else 0
3077 bad_lines = []
3079 iter_content = enumerate(content)
3080 content_len = len(content)
3081 content = []
3083 for (i, l) in iter_content:
3084 actual_len = len(l)
3086 if actual_len > col_len:
3087 if self.error_bad_lines or self.warn_bad_lines:
3088 row_num = self.pos - (content_len - i + footers)
3089 bad_lines.append((row_num, actual_len))
3091 if self.error_bad_lines:
3092 break
3093 else:
3094 content.append(l)
3096 for row_num, actual_len in bad_lines:
3097 msg = (
3098 f"Expected {col_len} fields in line {row_num + 1}, saw "
3099 f"{actual_len}"
3100 )
3101 if (
3102 self.delimiter
3103 and len(self.delimiter) > 1
3104 and self.quoting != csv.QUOTE_NONE
3105 ):
3106 # see gh-13374
3107 reason = (
3108 "Error could possibly be due to quotes being "
3109 "ignored when a multi-char delimiter is used."
3110 )
3111 msg += ". " + reason
3113 self._alert_malformed(msg, row_num + 1)
3115 # see gh-13320
3116 zipped_content = list(lib.to_object_array(content, min_width=col_len).T)
3118 if self.usecols:
3119 if self._implicit_index:
3120 zipped_content = [
3121 a
3122 for i, a in enumerate(zipped_content)
3123 if (
3124 i < len(self.index_col)
3125 or i - len(self.index_col) in self._col_indices
3126 )
3127 ]
3128 else:
3129 zipped_content = [
3130 a for i, a in enumerate(zipped_content) if i in self._col_indices
3131 ]
3132 return zipped_content
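# --- Editor's illustrative sketch (not part of the pandas source) ---
# The field-count check in _rows_to_cols above is what rejects rows with too
# many fields; with error_bad_lines=False such rows are skipped with a warning.
import pandas as pd
from io import StringIO

csv_data = "a,b\n1,2\n3,4,5\n6,7\n"
df = pd.read_csv(
    StringIO(csv_data), engine="python", error_bad_lines=False, warn_bad_lines=True
)
print(len(df))  # 2; the three-field row is reported on stderr and dropped
# --- end of sketch ---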
3134 def _get_lines(self, rows=None):
3135 lines = self.buf
3136 new_rows = None
3138 # already fetched some number
3139 if rows is not None:
3140 # we already have the lines in the buffer
3141 if len(self.buf) >= rows:
3142 new_rows, self.buf = self.buf[:rows], self.buf[rows:]
3144 # need some lines
3145 else:
3146 rows -= len(self.buf)
3148 if new_rows is None:
3149 if isinstance(self.data, list):
3150 if self.pos > len(self.data):
3151 raise StopIteration
3152 if rows is None:
3153 new_rows = self.data[self.pos :]
3154 new_pos = len(self.data)
3155 else:
3156 new_rows = self.data[self.pos : self.pos + rows]
3157 new_pos = self.pos + rows
3159 # Check for stop rows. n.b.: self.skiprows is a set.
3160 if self.skiprows:
3161 new_rows = [
3162 row
3163 for i, row in enumerate(new_rows)
3164 if not self.skipfunc(i + self.pos)
3165 ]
3167 lines.extend(new_rows)
3168 self.pos = new_pos
3170 else:
3171 new_rows = []
3172 try:
3173 if rows is not None:
3174 for _ in range(rows):
3175 new_rows.append(next(self.data))
3176 lines.extend(new_rows)
3177 else:
3178 rows = 0
3180 while True:
3181 new_row = self._next_iter_line(row_num=self.pos + rows + 1)
3182 rows += 1
3184 if new_row is not None:
3185 new_rows.append(new_row)
3187 except StopIteration:
3188 if self.skiprows:
3189 new_rows = [
3190 row
3191 for i, row in enumerate(new_rows)
3192 if not self.skipfunc(i + self.pos)
3193 ]
3194 lines.extend(new_rows)
3195 if len(lines) == 0:
3196 raise
3197 self.pos += len(new_rows)
3199 self.buf = []
3200 else:
3201 lines = new_rows
3203 if self.skipfooter:
3204 lines = lines[: -self.skipfooter]
3206 lines = self._check_comments(lines)
3207 if self.skip_blank_lines:
3208 lines = self._remove_empty_lines(lines)
3209 lines = self._check_thousands(lines)
3210 return self._check_decimal(lines)
3213def _make_date_converter(
3214 date_parser=None, dayfirst=False, infer_datetime_format=False, cache_dates=True
3215):
3216 def converter(*date_cols):
3217 if date_parser is None:
3218 strs = parsing._concat_date_cols(date_cols)
3220 try:
3221 return tools.to_datetime(
3222 ensure_object(strs),
3223 utc=None,
3224 dayfirst=dayfirst,
3225 errors="ignore",
3226 infer_datetime_format=infer_datetime_format,
3227 cache=cache_dates,
3228 ).to_numpy()
3230 except ValueError:
3231 return tools.to_datetime(
3232 parsing.try_parse_dates(strs, dayfirst=dayfirst), cache=cache_dates
3233 )
3234 else:
3235 try:
3236 result = tools.to_datetime(
3237 date_parser(*date_cols), errors="ignore", cache=cache_dates
3238 )
3239 if isinstance(result, datetime.datetime):
3240 raise Exception("scalar parser")
3241 return result
3242 except Exception:
3243 try:
3244 return tools.to_datetime(
3245 parsing.try_parse_dates(
3246 parsing._concat_date_cols(date_cols),
3247 parser=date_parser,
3248 dayfirst=dayfirst,
3249 ),
3250 errors="ignore",
3251 )
3252 except Exception:
3253 return generic_parser(date_parser, *date_cols)
3255 return converter
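# --- Editor's illustrative sketch (not part of the pandas source) ---
# The converter built above is what parse_dates ultimately calls; by default it
# falls back on pandas' own datetime inference. Example data is made up.
import pandas as pd
from io import StringIO

csv_data = "when,value\n2020-01-01,1\n2020-01-02,2\n"
df = pd.read_csv(StringIO(csv_data), parse_dates=["when"])
print(df["when"].dtype)  # datetime64[ns]
# --- end of sketch ---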
3258def _process_date_conversion(
3259 data_dict,
3260 converter,
3261 parse_spec,
3262 index_col,
3263 index_names,
3264 columns,
3265 keep_date_col=False,
3266):
3267 def _isindex(colspec):
3268 return (isinstance(index_col, list) and colspec in index_col) or (
3269 isinstance(index_names, list) and colspec in index_names
3270 )
3272 new_cols = []
3273 new_data = {}
3275 orig_names = columns
3276 columns = list(columns)
3278 date_cols = set()
3280 if parse_spec is None or isinstance(parse_spec, bool):
3281 return data_dict, columns
3283 if isinstance(parse_spec, list):
3284 # list of column lists
3285 for colspec in parse_spec:
3286 if is_scalar(colspec):
3287 if isinstance(colspec, int) and colspec not in data_dict:
3288 colspec = orig_names[colspec]
3289 if _isindex(colspec):
3290 continue
3291 data_dict[colspec] = converter(data_dict[colspec])
3292 else:
3293 new_name, col, old_names = _try_convert_dates(
3294 converter, colspec, data_dict, orig_names
3295 )
3296 if new_name in data_dict:
3297 raise ValueError(f"New date column already in dict {new_name}")
3298 new_data[new_name] = col
3299 new_cols.append(new_name)
3300 date_cols.update(old_names)
3302 elif isinstance(parse_spec, dict):
3303 # dict of new name to column list
3304 for new_name, colspec in parse_spec.items():
3305 if new_name in data_dict:
3306 raise ValueError(f"Date column {new_name} already in dict")
3308 _, col, old_names = _try_convert_dates(
3309 converter, colspec, data_dict, orig_names
3310 )
3312 new_data[new_name] = col
3313 new_cols.append(new_name)
3314 date_cols.update(old_names)
3316 data_dict.update(new_data)
3317 new_cols.extend(columns)
3319 if not keep_date_col:
3320 for c in list(date_cols):
3321 data_dict.pop(c)
3322 new_cols.remove(c)
3324 return data_dict, new_cols
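# --- Editor's illustrative sketch (not part of the pandas source) ---
# _process_date_conversion above also supports combining several source columns
# into one datetime column via a dict spec; the originals are dropped unless
# keep_date_col=True. Example data is made up.
import pandas as pd
from io import StringIO

csv_data = "date,time,value\n2020-01-01,12:30,1\n2020-01-02,08:15,2\n"
df = pd.read_csv(StringIO(csv_data), parse_dates={"ts": ["date", "time"]})
print(df.columns.tolist())  # ['ts', 'value']
# --- end of sketch ---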
3327def _try_convert_dates(parser, colspec, data_dict, columns):
3328 colset = set(columns)
3329 colnames = []
3331 for c in colspec:
3332 if c in colset:
3333 colnames.append(c)
3334 elif isinstance(c, int) and c not in columns:
3335 colnames.append(columns[c])
3336 else:
3337 colnames.append(c)
3339 new_name = "_".join(str(x) for x in colnames)
3340 to_parse = [data_dict[c] for c in colnames if c in data_dict]
3342 new_col = parser(*to_parse)
3343 return new_name, new_col, colnames
3346def _clean_na_values(na_values, keep_default_na=True):
3348 if na_values is None:
3349 if keep_default_na:
3350 na_values = STR_NA_VALUES
3351 else:
3352 na_values = set()
3353 na_fvalues = set()
3354 elif isinstance(na_values, dict):
3355 old_na_values = na_values.copy()
3356 na_values = {} # Prevent aliasing.
3358 # Convert the values in the na_values dictionary
3359 # into array-likes for further use. This is also
3360 # where we append the default NaN values, provided
3361 # that `keep_default_na=True`.
3362 for k, v in old_na_values.items():
3363 if not is_list_like(v):
3364 v = [v]
3366 if keep_default_na:
3367 v = set(v) | STR_NA_VALUES
3369 na_values[k] = v
3370 na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()}
3371 else:
3372 if not is_list_like(na_values):
3373 na_values = [na_values]
3374 na_values = _stringify_na_values(na_values)
3375 if keep_default_na:
3376 na_values = na_values | STR_NA_VALUES
3378 na_fvalues = _floatify_na_values(na_values)
3380 return na_values, na_fvalues
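# --- Editor's illustrative sketch (not part of the pandas source) ---
# _clean_na_values above normalises na_values/keep_default_na; a per-column
# dict adds to the default NA strings unless keep_default_na=False. Example
# data is made up.
import pandas as pd
from io import StringIO

csv_data = "a,b\nn/a,1\nNA,2\n"
df = pd.read_csv(StringIO(csv_data), na_values={"a": ["n/a"]})
print(df["a"].isna().tolist())  # [True, True]; 'NA' is still a default NA
# --- end of sketch ---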
3383def _clean_index_names(columns, index_col, unnamed_cols):
3384 if not _is_index_col(index_col):
3385 return None, columns, index_col
3387 columns = list(columns)
3389 cp_cols = list(columns)
3390 index_names = []
3392 # don't mutate
3393 index_col = list(index_col)
3395 for i, c in enumerate(index_col):
3396 if isinstance(c, str):
3397 index_names.append(c)
3398 for j, name in enumerate(cp_cols):
3399 if name == c:
3400 index_col[i] = j
3401 columns.remove(name)
3402 break
3403 else:
3404 name = cp_cols[c]
3405 columns.remove(name)
3406 index_names.append(name)
3408 # Only clean index names that were placeholders.
3409 for i, name in enumerate(index_names):
3410 if isinstance(name, str) and name in unnamed_cols:
3411 index_names[i] = None
3413 return index_names, columns, index_col
3416def _get_empty_meta(columns, index_col, index_names, dtype=None):
3417 columns = list(columns)
3419 # Convert `dtype` to a defaultdict of some kind.
3420 # This will enable us to write `dtype[col_name]`
3421 # without worrying about KeyError issues later on.
3422 if not isinstance(dtype, dict):
3423 # if dtype == None, default will be np.object.
3424 default_dtype = dtype or np.object
3425 dtype = defaultdict(lambda: default_dtype)
3426 else:
3427 # Save a copy of the dictionary.
3428 _dtype = dtype.copy()
3429 dtype = defaultdict(lambda: np.object)
3431 # Convert column indexes to column names.
3432 for k, v in _dtype.items():
3433 col = columns[k] if is_integer(k) else k
3434 dtype[col] = v
3436 # Even though we have no data, the "index" of the empty DataFrame
3437 # could for example still be an empty MultiIndex. Thus, we need to
3438 # check whether we have any index columns specified, via either:
3439 #
3440 # 1) index_col (column indices)
3441 # 2) index_names (column names)
3442 #
3443 # Both must be non-null to ensure a successful construction. Otherwise,
3444 # we have to create a generic empty Index.
3445 if (index_col is None or index_col is False) or index_names is None:
3446 index = Index([])
3447 else:
3448 data = [Series([], dtype=dtype[name]) for name in index_names]
3449 index = ensure_index_from_sequences(data, names=index_names)
3450 index_col.sort()
3452 for i, n in enumerate(index_col):
3453 columns.pop(n - i)
3455 col_dict = {col_name: Series([], dtype=dtype[col_name]) for col_name in columns}
3457 return index, columns, col_dict
3460def _floatify_na_values(na_values):
3461 # create float versions of the na_values
3462 result = set()
3463 for v in na_values:
3464 try:
3465 v = float(v)
3466 if not np.isnan(v):
3467 result.add(v)
3468 except (TypeError, ValueError, OverflowError):
3469 pass
3470 return result
3473def _stringify_na_values(na_values):
3474 """ return a stringified and numeric for these values """
3475 result = []
3476 for x in na_values:
3477 result.append(str(x))
3478 result.append(x)
3479 try:
3480 v = float(x)
3482 # e.g. for a value like 999, also add '999.0' and '999'
3483 if v == int(v):
3484 v = int(v)
3485 result.append(f"{v}.0")
3486 result.append(str(v))
3488 result.append(v)
3489 except (TypeError, ValueError, OverflowError):
3490 pass
3491 try:
3492 result.append(int(x))
3493 except (TypeError, ValueError, OverflowError):
3494 pass
3495 return set(result)
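# --- Editor's illustrative sketch (not part of the pandas source) ---
# Because NA values are both stringified and floatified above, a numeric
# na_values entry matches '999', 999 and '999.0' alike. Example data is made up.
import pandas as pd
from io import StringIO

csv_data = "a\n999\n999.0\n1\n"
df = pd.read_csv(StringIO(csv_data), na_values=[999])
print(df["a"].isna().tolist())  # [True, True, False]
# --- end of sketch ---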
3498def _get_na_values(col, na_values, na_fvalues, keep_default_na):
3499 """
3500 Get the NaN values for a given column.
3502 Parameters
3503 ----------
3504 col : str
3505 The name of the column.
3506 na_values : array-like, dict
3507 The object listing the NaN values as strings.
3508 na_fvalues : array-like, dict
3509 The object listing the NaN values as floats.
3510 keep_default_na : bool
3511 If `na_values` is a dict, and the column is not mapped in the
3512 dictionary, whether to return the default NaN values or the empty set.
3514 Returns
3515 -------
3516 nan_tuple : A length-two tuple composed of
3518 1) na_values : the string NaN values for that column.
3519 2) na_fvalues : the float NaN values for that column.
3520 """
3522 if isinstance(na_values, dict):
3523 if col in na_values:
3524 return na_values[col], na_fvalues[col]
3525 else:
3526 if keep_default_na:
3527 return STR_NA_VALUES, set()
3529 return set(), set()
3530 else:
3531 return na_values, na_fvalues
3534def _get_col_names(colspec, columns):
3535 colset = set(columns)
3536 colnames = []
3537 for c in colspec:
3538 if c in colset:
3539 colnames.append(c)
3540 elif isinstance(c, int):
3541 colnames.append(columns[c])
3542 return colnames
3545class FixedWidthReader(abc.Iterator):
3546 """
3547 A reader of fixed-width lines.
3548 """
3550 def __init__(self, f, colspecs, delimiter, comment, skiprows=None, infer_nrows=100):
3551 self.f = f
3552 self.buffer = None
3553 self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t "
3554 self.comment = comment
3555 if colspecs == "infer":
3556 self.colspecs = self.detect_colspecs(
3557 infer_nrows=infer_nrows, skiprows=skiprows
3558 )
3559 else:
3560 self.colspecs = colspecs
3562 if not isinstance(self.colspecs, (tuple, list)):
3563 raise TypeError(
3564 "column specifications must be a list or tuple, "
3565 f"input was a {type(colspecs).__name__}"
3566 )
3568 for colspec in self.colspecs:
3569 if not (
3570 isinstance(colspec, (tuple, list))
3571 and len(colspec) == 2
3572 and isinstance(colspec[0], (int, np.integer, type(None)))
3573 and isinstance(colspec[1], (int, np.integer, type(None)))
3574 ):
3575 raise TypeError(
3576 "Each column specification must be "
3577 "2 element tuple or list of integers"
3578 )
3580 def get_rows(self, infer_nrows, skiprows=None):
3581 """
3582 Read rows from self.f, skipping as specified.
3584 We distinguish buffer_rows (the first <= infer_nrows
3585 lines) from the rows returned to detect_colspecs
3586 because it's simpler to leave the other locations
3587 with skiprows logic alone than to modify them to
3588 deal with the fact we skipped some rows here as
3589 well.
3591 Parameters
3592 ----------
3593 infer_nrows : int
3594 Number of rows to read from self.f, not counting
3595 rows that are skipped.
3596 skiprows : set, optional
3597 Indices of rows to skip.
3599 Returns
3600 -------
3601 detect_rows : list of str
3602 A list containing the rows to read.
3604 """
3605 if skiprows is None:
3606 skiprows = set()
3607 buffer_rows = []
3608 detect_rows = []
3609 for i, row in enumerate(self.f):
3610 if i not in skiprows:
3611 detect_rows.append(row)
3612 buffer_rows.append(row)
3613 if len(detect_rows) >= infer_nrows:
3614 break
3615 self.buffer = iter(buffer_rows)
3616 return detect_rows
3618 def detect_colspecs(self, infer_nrows=100, skiprows=None):
3619 # Regex escape the delimiters
3620 delimiters = "".join(r"\{}".format(x) for x in self.delimiter)
3621 pattern = re.compile("([^{}]+)".format(delimiters))
3622 rows = self.get_rows(infer_nrows, skiprows)
3623 if not rows:
3624 raise EmptyDataError("No rows from which to infer column width")
3625 max_len = max(map(len, rows))
3626 mask = np.zeros(max_len + 1, dtype=int)
3627 if self.comment is not None:
3628 rows = [row.partition(self.comment)[0] for row in rows]
3629 for row in rows:
3630 for m in pattern.finditer(row):
3631 mask[m.start() : m.end()] = 1
3632 shifted = np.roll(mask, 1)
3633 shifted[0] = 0
3634 edges = np.where((mask ^ shifted) == 1)[0]
3635 edge_pairs = list(zip(edges[::2], edges[1::2]))
3636 return edge_pairs
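# --- Editor's illustrative sketch (not part of the pandas source) ---
# detect_colspecs above powers colspecs="infer" in read_fwf: the first
# infer_nrows lines are scanned for runs of non-delimiter characters and the
# column edges are taken from their union. Example data is made up.
import pandas as pd
from io import StringIO

fwf_data = "name   value\nfoo       1\nbarbaz    2\n"
df = pd.read_fwf(StringIO(fwf_data), colspecs="infer", infer_nrows=10)
print(df.columns.tolist())  # ['name', 'value']
# --- end of sketch ---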
3638 def __next__(self):
3639 if self.buffer is not None:
3640 try:
3641 line = next(self.buffer)
3642 except StopIteration:
3643 self.buffer = None
3644 line = next(self.f)
3645 else:
3646 line = next(self.f)
3647 # Note: 'colspecs' is a sequence of half-open intervals.
3648 return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs]
3651class FixedWidthFieldParser(PythonParser):
3652 """
3653 Specialization that converts fixed-width fields into DataFrames.
3654 See PythonParser for details.
3655 """
3657 def __init__(self, f, **kwds):
3658 # Support iterators, convert to a list.
3659 self.colspecs = kwds.pop("colspecs")
3660 self.infer_nrows = kwds.pop("infer_nrows")
3661 PythonParser.__init__(self, f, **kwds)
3663 def _make_reader(self, f):
3664 self.data = FixedWidthReader(
3665 f,
3666 self.colspecs,
3667 self.delimiter,
3668 self.comment,
3669 self.skiprows,
3670 self.infer_nrows,
3671 )