Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2Module contains tools for processing files into DataFrames or other objects 

3""" 

4 

5from collections import abc, defaultdict 

6import csv 

7import datetime 

8from io import StringIO, TextIOWrapper 

9import re 

10import sys 

11from textwrap import fill 

12from typing import Any, Dict, Set 

13import warnings 

14 

15import numpy as np 

16 

17import pandas._libs.lib as lib 

18import pandas._libs.ops as libops 

19import pandas._libs.parsers as parsers 

20from pandas._libs.parsers import STR_NA_VALUES 

21from pandas._libs.tslibs import parsing 

22from pandas._typing import FilePathOrBuffer 

23from pandas.errors import ( 

24 AbstractMethodError, 

25 EmptyDataError, 

26 ParserError, 

27 ParserWarning, 

28) 

29from pandas.util._decorators import Appender 

30 

31from pandas.core.dtypes.cast import astype_nansafe 

32from pandas.core.dtypes.common import ( 

33 ensure_object, 

34 ensure_str, 

35 is_bool_dtype, 

36 is_categorical_dtype, 

37 is_dtype_equal, 

38 is_extension_array_dtype, 

39 is_file_like, 

40 is_float, 

41 is_integer, 

42 is_integer_dtype, 

43 is_list_like, 

44 is_object_dtype, 

45 is_scalar, 

46 is_string_dtype, 

47 pandas_dtype, 

48) 

49from pandas.core.dtypes.dtypes import CategoricalDtype 

50from pandas.core.dtypes.missing import isna 

51 

52from pandas.core import algorithms 

53from pandas.core.arrays import Categorical 

54from pandas.core.frame import DataFrame 

55from pandas.core.indexes.api import ( 

56 Index, 

57 MultiIndex, 

58 RangeIndex, 

59 ensure_index_from_sequences, 

60) 

61from pandas.core.series import Series 

62from pandas.core.tools import datetimes as tools 

63 

64from pandas.io.common import ( 

65 get_filepath_or_buffer, 

66 get_handle, 

67 infer_compression, 

68 validate_header_arg, 

69) 

70from pandas.io.date_converters import generic_parser 

71 

72# BOM character (byte order mark) 

73# This exists at the beginning of a file to indicate endianness 

74# of a file (stream). Unfortunately, this marker screws up parsing, 

75# so we need to remove it if we see it. 

76_BOM = "\ufeff" 

77 

78_doc_read_csv_and_table = ( 

79 r""" 

80{summary} 

81 

82Also supports optionally iterating or breaking of the file 

83into chunks. 

84 

85Additional help can be found in the online docs for 

86`IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_. 

87 

88Parameters 

89---------- 

90filepath_or_buffer : str, path object or file-like object 

91 Any valid string path is acceptable. The string could be a URL. Valid 

92 URL schemes include http, ftp, s3, and file. For file URLs, a host is 

93 expected. A local file could be: file://localhost/path/to/table.csv. 

94 

95 If you want to pass in a path object, pandas accepts any ``os.PathLike``. 

96 

97 By file-like object, we refer to objects with a ``read()`` method, such as 

98 a file handler (e.g. via builtin ``open`` function) or ``StringIO``. 

99sep : str, default {_default_sep} 

100 Delimiter to use. If sep is None, the C engine cannot automatically detect 

101 the separator, but the Python parsing engine can, meaning the latter will 

102 be used and automatically detect the separator by Python's builtin sniffer 

103 tool, ``csv.Sniffer``. In addition, separators longer than 1 character and 

104 different from ``'\s+'`` will be interpreted as regular expressions and 

105 will also force the use of the Python parsing engine. Note that regex 

106 delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``. 

107delimiter : str, default ``None`` 

108 Alias for sep. 

109header : int, list of int, default 'infer' 

110 Row number(s) to use as the column names, and the start of the 

111 data. Default behavior is to infer the column names: if no names 

112 are passed the behavior is identical to ``header=0`` and column 

113 names are inferred from the first line of the file, if column 

114 names are passed explicitly then the behavior is identical to 

115 ``header=None``. Explicitly pass ``header=0`` to be able to 

116 replace existing names. The header can be a list of integers that 

117 specify row locations for a multi-index on the columns 

118 e.g. [0,1,3]. Intervening rows that are not specified will be 

119 skipped (e.g. 2 in this example is skipped). Note that this 

120 parameter ignores commented lines and empty lines if 

121 ``skip_blank_lines=True``, so ``header=0`` denotes the first line of 

122 data rather than the first line of the file. 

123names : array-like, optional 

124 List of column names to use. If the file contains a header row, 

125 then you should explicitly pass ``header=0`` to override the column names. 

126 Duplicates in this list are not allowed. 

127index_col : int, str, sequence of int / str, or False, default ``None`` 

128 Column(s) to use as the row labels of the ``DataFrame``, either given as 

129 string name or column index. If a sequence of int / str is given, a 

130 MultiIndex is used. 

131 

132 Note: ``index_col=False`` can be used to force pandas to *not* use the first 

133 column as the index, e.g. when you have a malformed file with delimiters at 

134 the end of each line. 

135usecols : list-like or callable, optional 

136 Return a subset of the columns. If list-like, all elements must either 

137 be positional (i.e. integer indices into the document columns) or strings 

138 that correspond to column names provided either by the user in `names` or 

139 inferred from the document header row(s). For example, a valid list-like 

140 `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. 

141 Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. 

142 To instantiate a DataFrame from ``data`` with element order preserved use 

143 ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns 

144 in ``['foo', 'bar']`` order or 

145 ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` 

146 for ``['bar', 'foo']`` order. 

147 

148 If callable, the callable function will be evaluated against the column 

149 names, returning names where the callable function evaluates to True. An 

150 example of a valid callable argument would be ``lambda x: x.upper() in 

151 ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster 

152 parsing time and lower memory usage. 

153squeeze : bool, default False 

154 If the parsed data only contains one column then return a Series. 

155prefix : str, optional 

156 Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ... 

157mangle_dupe_cols : bool, default True 

158 Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than 

159 'X'...'X'. Passing in False will cause data to be overwritten if there 

160 are duplicate names in the columns. 

161dtype : Type name or dict of column -> type, optional 

162 Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32, 

163 'c': 'Int64'}} 

164 Use `str` or `object` together with suitable `na_values` settings 

165 to preserve and not interpret dtype. 

166 If converters are specified, they will be applied INSTEAD 

167 of dtype conversion. 

168engine : {{'c', 'python'}}, optional 

169 Parser engine to use. The C engine is faster while the python engine is 

170 currently more feature-complete. 

171converters : dict, optional 

172 Dict of functions for converting values in certain columns. Keys can either 

173 be integers or column labels. 

174true_values : list, optional 

175 Values to consider as True. 

176false_values : list, optional 

177 Values to consider as False. 

178skipinitialspace : bool, default False 

179 Skip spaces after delimiter. 

180skiprows : list-like, int or callable, optional 

181 Line numbers to skip (0-indexed) or number of lines to skip (int) 

182 at the start of the file. 

183 

184 If callable, the callable function will be evaluated against the row 

185 indices, returning True if the row should be skipped and False otherwise. 

186 An example of a valid callable argument would be ``lambda x: x in [0, 2]``. 

187skipfooter : int, default 0 

188 Number of lines at bottom of file to skip (Unsupported with engine='c'). 

189nrows : int, optional 

190 Number of rows of file to read. Useful for reading pieces of large files. 

191na_values : scalar, str, list-like, or dict, optional 

192 Additional strings to recognize as NA/NaN. If dict passed, specific 

193 per-column NA values. By default the following values are interpreted as 

194 NaN: '""" 

195 + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") 

196 + """'. 

197keep_default_na : bool, default True 

198 Whether or not to include the default NaN values when parsing the data. 

199 Depending on whether `na_values` is passed in, the behavior is as follows: 

200 

201 * If `keep_default_na` is True, and `na_values` are specified, `na_values` 

202 is appended to the default NaN values used for parsing. 

203 * If `keep_default_na` is True, and `na_values` are not specified, only 

204 the default NaN values are used for parsing. 

205 * If `keep_default_na` is False, and `na_values` are specified, only 

206 the NaN values specified `na_values` are used for parsing. 

207 * If `keep_default_na` is False, and `na_values` are not specified, no 

208 strings will be parsed as NaN. 

209 

210 Note that if `na_filter` is passed in as False, the `keep_default_na` and 

211 `na_values` parameters will be ignored. 

212na_filter : bool, default True 

213 Detect missing value markers (empty strings and the value of na_values). In 

214 data without any NAs, passing na_filter=False can improve the performance 

215 of reading a large file. 

216verbose : bool, default False 

217 Indicate number of NA values placed in non-numeric columns. 

218skip_blank_lines : bool, default True 

219 If True, skip over blank lines rather than interpreting as NaN values. 

220parse_dates : bool or list of int or names or list of lists or dict, \ 

221default False 

222 The behavior is as follows: 

223 

224 * boolean. If True -> try parsing the index. 

225 * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 

226 each as a separate date column. 

227 * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as 

228 a single date column. 

229 * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call 

230 result 'foo' 

231 

232 If a column or index cannot be represented as an array of datetimes, 

233 say because of an unparseable value or a mixture of timezones, the column 

234 or index will be returned unaltered as an object data type. For 

235 non-standard datetime parsing, use ``pd.to_datetime`` after 

236 ``pd.read_csv``. To parse an index or column with a mixture of timezones, 

237 specify ``date_parser`` to be a partially-applied 

238 :func:`pandas.to_datetime` with ``utc=True``. See 

239 :ref:`io.csv.mixed_timezones` for more. 

240 

241 Note: A fast-path exists for iso8601-formatted dates. 

242infer_datetime_format : bool, default False 

243 If True and `parse_dates` is enabled, pandas will attempt to infer the 

244 format of the datetime strings in the columns, and if it can be inferred, 

245 switch to a faster method of parsing them. In some cases this can increase 

246 the parsing speed by 5-10x. 

247keep_date_col : bool, default False 

248 If True and `parse_dates` specifies combining multiple columns then 

249 keep the original columns. 

250date_parser : function, optional 

251 Function to use for converting a sequence of string columns to an array of 

252 datetime instances. The default uses ``dateutil.parser.parser`` to do the 

253 conversion. Pandas will try to call `date_parser` in three different ways, 

254 advancing to the next if an exception occurs: 1) Pass one or more arrays 

255 (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the 

256 string values from the columns defined by `parse_dates` into a single array 

257 and pass that; and 3) call `date_parser` once for each row using one or 

258 more strings (corresponding to the columns defined by `parse_dates`) as 

259 arguments. 

260dayfirst : bool, default False 

261 DD/MM format dates, international and European format. 

262cache_dates : bool, default True 

263 If True, use a cache of unique, converted dates to apply the datetime 

264 conversion. May produce significant speed-up when parsing duplicate 

265 date strings, especially ones with timezone offsets. 

266 

267 .. versionadded:: 0.25.0 

268iterator : bool, default False 

269 Return TextFileReader object for iteration or getting chunks with 

270 ``get_chunk()``. 

271chunksize : int, optional 

272 Return TextFileReader object for iteration. 

273 See the `IO Tools docs 

274 <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_ 

275 for more information on ``iterator`` and ``chunksize``. 

276compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' 

277 For on-the-fly decompression of on-disk data. If 'infer' and 

278 `filepath_or_buffer` is path-like, then detect compression from the 

279 following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no 

280 decompression). If using 'zip', the ZIP file must contain only one data 

281 file to be read in. Set to None for no decompression. 

282thousands : str, optional 

283 Thousands separator. 

284decimal : str, default '.' 

285 Character to recognize as decimal point (e.g. use ',' for European data). 

286lineterminator : str (length 1), optional 

287 Character to break file into lines. Only valid with C parser. 

288quotechar : str (length 1), optional 

289 The character used to denote the start and end of a quoted item. Quoted 

290 items can include the delimiter and it will be ignored. 

291quoting : int or csv.QUOTE_* instance, default 0 

292 Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of 

293 QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). 

294doublequote : bool, default ``True`` 

295 When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate 

296 whether or not to interpret two consecutive quotechar elements INSIDE a 

297 field as a single ``quotechar`` element. 

298escapechar : str (length 1), optional 

299 One-character string used to escape other characters. 

300comment : str, optional 

301 Indicates remainder of line should not be parsed. If found at the beginning 

302 of a line, the line will be ignored altogether. This parameter must be a 

303 single character. Like empty lines (as long as ``skip_blank_lines=True``), 

304 fully commented lines are ignored by the parameter `header` but not by 

305 `skiprows`. For example, if ``comment='#'``, parsing 

306 ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being 

307 treated as the header. 

308encoding : str, optional 

309 Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python 

310 standard encodings 

311 <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ . 

312dialect : str or csv.Dialect, optional 

313 If provided, this parameter will override values (default or not) for the 

314 following parameters: `delimiter`, `doublequote`, `escapechar`, 

315 `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to 

316 override values, a ParserWarning will be issued. See csv.Dialect 

317 documentation for more details. 

318error_bad_lines : bool, default True 

319 Lines with too many fields (e.g. a csv line with too many commas) will by 

320 default cause an exception to be raised, and no DataFrame will be returned. 

321 If False, then these "bad lines" will dropped from the DataFrame that is 

322 returned. 

323warn_bad_lines : bool, default True 

324 If error_bad_lines is False, and warn_bad_lines is True, a warning for each 

325 "bad line" will be output. 

326delim_whitespace : bool, default False 

327 Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be 

328 used as the sep. Equivalent to setting ``sep='\\s+'``. If this option 

329 is set to True, nothing should be passed in for the ``delimiter`` 

330 parameter. 

331low_memory : bool, default True 

332 Internally process the file in chunks, resulting in lower memory use 

333 while parsing, but possibly mixed type inference. To ensure no mixed 

334 types either set False, or specify the type with the `dtype` parameter. 

335 Note that the entire file is read into a single DataFrame regardless, 

336 use the `chunksize` or `iterator` parameter to return the data in chunks. 

337 (Only valid with C parser). 

338memory_map : bool, default False 

339 If a filepath is provided for `filepath_or_buffer`, map the file object 

340 directly onto memory and access the data directly from there. Using this 

341 option can improve performance because there is no longer any I/O overhead. 

342float_precision : str, optional 

343 Specifies which converter the C engine should use for floating-point 

344 values. The options are `None` for the ordinary converter, 

345 `high` for the high-precision converter, and `round_trip` for the 

346 round-trip converter. 

347 

348Returns 

349------- 

350DataFrame or TextParser 

351 A comma-separated values (csv) file is returned as two-dimensional 

352 data structure with labeled axes. 

353 

354See Also 

355-------- 

356to_csv : Write DataFrame to a comma-separated values (csv) file. 

357read_csv : Read a comma-separated values (csv) file into DataFrame. 

358read_fwf : Read a table of fixed-width formatted lines into DataFrame. 

359 

360Examples 

361-------- 

362>>> pd.{func_name}('data.csv') # doctest: +SKIP 

363""" 

364) 

365 

366 

367def _validate_integer(name, val, min_val=0): 

368 """ 

369 Checks whether the 'name' parameter for parsing is either 

370 an integer OR float that can SAFELY be cast to an integer 

371 without losing accuracy. Raises a ValueError if that is 

372 not the case. 

373 

374 Parameters 

375 ---------- 

376 name : string 

377 Parameter name (used for error reporting) 

378 val : int or float 

379 The value to check 

380 min_val : int 

381 Minimum allowed value (val < min_val will result in a ValueError) 

382 """ 

383 msg = f"'{name:s}' must be an integer >={min_val:d}" 

384 

385 if val is not None: 

386 if is_float(val): 

387 if int(val) != val: 

388 raise ValueError(msg) 

389 val = int(val) 

390 elif not (is_integer(val) and val >= min_val): 

391 raise ValueError(msg) 

392 

393 return val 

394 

395 

396def _validate_names(names): 

397 """ 

398 Raise ValueError if the `names` parameter contains duplicates. 

399 

400 Parameters 

401 ---------- 

402 names : array-like or None 

403 An array containing a list of the names used for the output DataFrame. 

404 

405 Raises 

406 ------ 

407 ValueError 

408 If names are not unique. 

409 """ 

410 

411 if names is not None: 

412 if len(names) != len(set(names)): 

413 raise ValueError("Duplicate names are not allowed.") 

414 

415 

416def _read(filepath_or_buffer: FilePathOrBuffer, kwds): 

417 """Generic reader of line files.""" 

418 encoding = kwds.get("encoding", None) 

419 if encoding is not None: 

420 encoding = re.sub("_", "-", encoding).lower() 

421 kwds["encoding"] = encoding 

422 

423 compression = kwds.get("compression", "infer") 

424 compression = infer_compression(filepath_or_buffer, compression) 

425 

426 # TODO: get_filepath_or_buffer could return 

427 # Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile] 

428 # though mypy handling of conditional imports is difficult. 

429 # See https://github.com/python/mypy/issues/1297 

430 fp_or_buf, _, compression, should_close = get_filepath_or_buffer( 

431 filepath_or_buffer, encoding, compression 

432 ) 

433 kwds["compression"] = compression 

434 

435 if kwds.get("date_parser", None) is not None: 

436 if isinstance(kwds["parse_dates"], bool): 

437 kwds["parse_dates"] = True 

438 

439 # Extract some of the arguments (pass chunksize on). 

440 iterator = kwds.get("iterator", False) 

441 chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1) 

442 nrows = kwds.get("nrows", None) 

443 

444 # Check for duplicates in names. 

445 _validate_names(kwds.get("names", None)) 

446 

447 # Create the parser. 

448 parser = TextFileReader(fp_or_buf, **kwds) 

449 

450 if chunksize or iterator: 

451 return parser 

452 

453 try: 

454 data = parser.read(nrows) 

455 finally: 

456 parser.close() 

457 

458 if should_close: 

459 try: 

460 fp_or_buf.close() 

461 except ValueError: 

462 pass 

463 

464 return data 

465 

466 

467_parser_defaults = { 

468 "delimiter": None, 

469 "escapechar": None, 

470 "quotechar": '"', 

471 "quoting": csv.QUOTE_MINIMAL, 

472 "doublequote": True, 

473 "skipinitialspace": False, 

474 "lineterminator": None, 

475 "header": "infer", 

476 "index_col": None, 

477 "names": None, 

478 "prefix": None, 

479 "skiprows": None, 

480 "skipfooter": 0, 

481 "nrows": None, 

482 "na_values": None, 

483 "keep_default_na": True, 

484 "true_values": None, 

485 "false_values": None, 

486 "converters": None, 

487 "dtype": None, 

488 "cache_dates": True, 

489 "thousands": None, 

490 "comment": None, 

491 "decimal": ".", 

492 # 'engine': 'c', 

493 "parse_dates": False, 

494 "keep_date_col": False, 

495 "dayfirst": False, 

496 "date_parser": None, 

497 "usecols": None, 

498 # 'iterator': False, 

499 "chunksize": None, 

500 "verbose": False, 

501 "encoding": None, 

502 "squeeze": False, 

503 "compression": None, 

504 "mangle_dupe_cols": True, 

505 "infer_datetime_format": False, 

506 "skip_blank_lines": True, 

507} 

508 

509 

510_c_parser_defaults = { 

511 "delim_whitespace": False, 

512 "na_filter": True, 

513 "low_memory": True, 

514 "memory_map": False, 

515 "error_bad_lines": True, 

516 "warn_bad_lines": True, 

517 "float_precision": None, 

518} 

519 

520_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} 

521 

522_c_unsupported = {"skipfooter"} 

523_python_unsupported = {"low_memory", "float_precision"} 

524 

525_deprecated_defaults: Dict[str, Any] = {} 

526_deprecated_args: Set[str] = set() 

527 

528 

529def _make_parser_function(name, default_sep=","): 

530 def parser_f( 

531 filepath_or_buffer: FilePathOrBuffer, 

532 sep=default_sep, 

533 delimiter=None, 

534 # Column and Index Locations and Names 

535 header="infer", 

536 names=None, 

537 index_col=None, 

538 usecols=None, 

539 squeeze=False, 

540 prefix=None, 

541 mangle_dupe_cols=True, 

542 # General Parsing Configuration 

543 dtype=None, 

544 engine=None, 

545 converters=None, 

546 true_values=None, 

547 false_values=None, 

548 skipinitialspace=False, 

549 skiprows=None, 

550 skipfooter=0, 

551 nrows=None, 

552 # NA and Missing Data Handling 

553 na_values=None, 

554 keep_default_na=True, 

555 na_filter=True, 

556 verbose=False, 

557 skip_blank_lines=True, 

558 # Datetime Handling 

559 parse_dates=False, 

560 infer_datetime_format=False, 

561 keep_date_col=False, 

562 date_parser=None, 

563 dayfirst=False, 

564 cache_dates=True, 

565 # Iteration 

566 iterator=False, 

567 chunksize=None, 

568 # Quoting, Compression, and File Format 

569 compression="infer", 

570 thousands=None, 

571 decimal: str = ".", 

572 lineterminator=None, 

573 quotechar='"', 

574 quoting=csv.QUOTE_MINIMAL, 

575 doublequote=True, 

576 escapechar=None, 

577 comment=None, 

578 encoding=None, 

579 dialect=None, 

580 # Error Handling 

581 error_bad_lines=True, 

582 warn_bad_lines=True, 

583 # Internal 

584 delim_whitespace=False, 

585 low_memory=_c_parser_defaults["low_memory"], 

586 memory_map=False, 

587 float_precision=None, 

588 ): 

589 

590 # gh-23761 

591 # 

592 # When a dialect is passed, it overrides any of the overlapping 

593 # parameters passed in directly. We don't want to warn if the 

594 # default parameters were passed in (since it probably means 

595 # that the user didn't pass them in explicitly in the first place). 

596 # 

597 # "delimiter" is the annoying corner case because we alias it to 

598 # "sep" before doing comparison to the dialect values later on. 

599 # Thus, we need a flag to indicate that we need to "override" 

600 # the comparison to dialect values by checking if default values 

601 # for BOTH "delimiter" and "sep" were provided. 

602 if dialect is not None: 

603 sep_override = delimiter is None and sep == default_sep 

604 kwds = dict(sep_override=sep_override) 

605 else: 

606 kwds = dict() 

607 

608 # Alias sep -> delimiter. 

609 if delimiter is None: 

610 delimiter = sep 

611 

612 if delim_whitespace and delimiter != default_sep: 

613 raise ValueError( 

614 "Specified a delimiter with both sep and " 

615 "delim_whitespace=True; you can only " 

616 "specify one." 

617 ) 

618 

619 if engine is not None: 

620 engine_specified = True 

621 else: 

622 engine = "c" 

623 engine_specified = False 

624 

625 kwds.update( 

626 delimiter=delimiter, 

627 engine=engine, 

628 dialect=dialect, 

629 compression=compression, 

630 engine_specified=engine_specified, 

631 doublequote=doublequote, 

632 escapechar=escapechar, 

633 quotechar=quotechar, 

634 quoting=quoting, 

635 skipinitialspace=skipinitialspace, 

636 lineterminator=lineterminator, 

637 header=header, 

638 index_col=index_col, 

639 names=names, 

640 prefix=prefix, 

641 skiprows=skiprows, 

642 skipfooter=skipfooter, 

643 na_values=na_values, 

644 true_values=true_values, 

645 false_values=false_values, 

646 keep_default_na=keep_default_na, 

647 thousands=thousands, 

648 comment=comment, 

649 decimal=decimal, 

650 parse_dates=parse_dates, 

651 keep_date_col=keep_date_col, 

652 dayfirst=dayfirst, 

653 date_parser=date_parser, 

654 cache_dates=cache_dates, 

655 nrows=nrows, 

656 iterator=iterator, 

657 chunksize=chunksize, 

658 converters=converters, 

659 dtype=dtype, 

660 usecols=usecols, 

661 verbose=verbose, 

662 encoding=encoding, 

663 squeeze=squeeze, 

664 memory_map=memory_map, 

665 float_precision=float_precision, 

666 na_filter=na_filter, 

667 delim_whitespace=delim_whitespace, 

668 warn_bad_lines=warn_bad_lines, 

669 error_bad_lines=error_bad_lines, 

670 low_memory=low_memory, 

671 mangle_dupe_cols=mangle_dupe_cols, 

672 infer_datetime_format=infer_datetime_format, 

673 skip_blank_lines=skip_blank_lines, 

674 ) 

675 

676 return _read(filepath_or_buffer, kwds) 

677 

678 parser_f.__name__ = name 

679 

680 return parser_f 

681 

682 

683read_csv = _make_parser_function("read_csv", default_sep=",") 

684read_csv = Appender( 

685 _doc_read_csv_and_table.format( 

686 func_name="read_csv", 

687 summary="Read a comma-separated values (csv) file into DataFrame.", 

688 _default_sep="','", 

689 ) 

690)(read_csv) 

691 

692read_table = _make_parser_function("read_table", default_sep="\t") 

693read_table = Appender( 

694 _doc_read_csv_and_table.format( 

695 func_name="read_table", 

696 summary="Read general delimited file into DataFrame.", 

697 _default_sep=r"'\\t' (tab-stop)", 

698 ) 

699)(read_table) 

700 

701 

702def read_fwf( 

703 filepath_or_buffer: FilePathOrBuffer, 

704 colspecs="infer", 

705 widths=None, 

706 infer_nrows=100, 

707 **kwds, 

708): 

709 

710 r""" 

711 Read a table of fixed-width formatted lines into DataFrame. 

712 

713 Also supports optionally iterating or breaking of the file 

714 into chunks. 

715 

716 Additional help can be found in the `online docs for IO Tools 

717 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_. 

718 

719 Parameters 

720 ---------- 

721 filepath_or_buffer : str, path object or file-like object 

722 Any valid string path is acceptable. The string could be a URL. Valid 

723 URL schemes include http, ftp, s3, and file. For file URLs, a host is 

724 expected. A local file could be: 

725 ``file://localhost/path/to/table.csv``. 

726 

727 If you want to pass in a path object, pandas accepts any 

728 ``os.PathLike``. 

729 

730 By file-like object, we refer to objects with a ``read()`` method, 

731 such as a file handler (e.g. via builtin ``open`` function) 

732 or ``StringIO``. 

733 colspecs : list of tuple (int, int) or 'infer'. optional 

734 A list of tuples giving the extents of the fixed-width 

735 fields of each line as half-open intervals (i.e., [from, to[ ). 

736 String value 'infer' can be used to instruct the parser to try 

737 detecting the column specifications from the first 100 rows of 

738 the data which are not being skipped via skiprows (default='infer'). 

739 widths : list of int, optional 

740 A list of field widths which can be used instead of 'colspecs' if 

741 the intervals are contiguous. 

742 infer_nrows : int, default 100 

743 The number of rows to consider when letting the parser determine the 

744 `colspecs`. 

745 

746 .. versionadded:: 0.24.0 

747 **kwds : optional 

748 Optional keyword arguments can be passed to ``TextFileReader``. 

749 

750 Returns 

751 ------- 

752 DataFrame or TextParser 

753 A comma-separated values (csv) file is returned as two-dimensional 

754 data structure with labeled axes. 

755 

756 See Also 

757 -------- 

758 to_csv : Write DataFrame to a comma-separated values (csv) file. 

759 read_csv : Read a comma-separated values (csv) file into DataFrame. 

760 

761 Examples 

762 -------- 

763 >>> pd.read_fwf('data.csv') # doctest: +SKIP 

764 """ 

765 

766 # Check input arguments. 

767 if colspecs is None and widths is None: 

768 raise ValueError("Must specify either colspecs or widths") 

769 elif colspecs not in (None, "infer") and widths is not None: 

770 raise ValueError("You must specify only one of 'widths' and 'colspecs'") 

771 

772 # Compute 'colspecs' from 'widths', if specified. 

773 if widths is not None: 

774 colspecs, col = [], 0 

775 for w in widths: 

776 colspecs.append((col, col + w)) 

777 col += w 

778 

779 kwds["colspecs"] = colspecs 

780 kwds["infer_nrows"] = infer_nrows 

781 kwds["engine"] = "python-fwf" 

782 return _read(filepath_or_buffer, kwds) 

783 

784 

785class TextFileReader(abc.Iterator): 

786 """ 

787 

788 Passed dialect overrides any of the related parser options 

789 

790 """ 

791 

792 def __init__(self, f, engine=None, **kwds): 

793 

794 self.f = f 

795 

796 if engine is not None: 

797 engine_specified = True 

798 else: 

799 engine = "python" 

800 engine_specified = False 

801 

802 self._engine_specified = kwds.get("engine_specified", engine_specified) 

803 

804 if kwds.get("dialect") is not None: 

805 dialect = kwds["dialect"] 

806 if dialect in csv.list_dialects(): 

807 dialect = csv.get_dialect(dialect) 

808 

809 # Any valid dialect should have these attributes. 

810 # If any are missing, we will raise automatically. 

811 for param in ( 

812 "delimiter", 

813 "doublequote", 

814 "escapechar", 

815 "skipinitialspace", 

816 "quotechar", 

817 "quoting", 

818 ): 

819 try: 

820 dialect_val = getattr(dialect, param) 

821 except AttributeError: 

822 raise ValueError(f"Invalid dialect {kwds['dialect']} provided") 

823 parser_default = _parser_defaults[param] 

824 provided = kwds.get(param, parser_default) 

825 

826 # Messages for conflicting values between the dialect 

827 # instance and the actual parameters provided. 

828 conflict_msgs = [] 

829 

830 # Don't warn if the default parameter was passed in, 

831 # even if it conflicts with the dialect (gh-23761). 

832 if provided != parser_default and provided != dialect_val: 

833 msg = ( 

834 f"Conflicting values for '{param}': '{provided}' was " 

835 f"provided, but the dialect specifies '{dialect_val}'. " 

836 "Using the dialect-specified value." 

837 ) 

838 

839 # Annoying corner case for not warning about 

840 # conflicts between dialect and delimiter parameter. 

841 # Refer to the outer "_read_" function for more info. 

842 if not (param == "delimiter" and kwds.pop("sep_override", False)): 

843 conflict_msgs.append(msg) 

844 

845 if conflict_msgs: 

846 warnings.warn( 

847 "\n\n".join(conflict_msgs), ParserWarning, stacklevel=2 

848 ) 

849 kwds[param] = dialect_val 

850 

851 if kwds.get("skipfooter"): 

852 if kwds.get("iterator") or kwds.get("chunksize"): 

853 raise ValueError("'skipfooter' not supported for 'iteration'") 

854 if kwds.get("nrows"): 

855 raise ValueError("'skipfooter' not supported with 'nrows'") 

856 

857 if kwds.get("header", "infer") == "infer": 

858 kwds["header"] = 0 if kwds.get("names") is None else None 

859 

860 self.orig_options = kwds 

861 

862 # miscellanea 

863 self.engine = engine 

864 self._engine = None 

865 self._currow = 0 

866 

867 options = self._get_options_with_defaults(engine) 

868 

869 self.chunksize = options.pop("chunksize", None) 

870 self.nrows = options.pop("nrows", None) 

871 self.squeeze = options.pop("squeeze", False) 

872 

873 # might mutate self.engine 

874 self.engine = self._check_file_or_buffer(f, engine) 

875 self.options, self.engine = self._clean_options(options, engine) 

876 

877 if "has_index_names" in kwds: 

878 self.options["has_index_names"] = kwds["has_index_names"] 

879 

880 self._make_engine(self.engine) 

881 

882 def close(self): 

883 self._engine.close() 

884 

885 def _get_options_with_defaults(self, engine): 

886 kwds = self.orig_options 

887 

888 options = {} 

889 

890 for argname, default in _parser_defaults.items(): 

891 value = kwds.get(argname, default) 

892 

893 # see gh-12935 

894 if argname == "mangle_dupe_cols" and not value: 

895 raise ValueError("Setting mangle_dupe_cols=False is not supported yet") 

896 else: 

897 options[argname] = value 

898 

899 for argname, default in _c_parser_defaults.items(): 

900 if argname in kwds: 

901 value = kwds[argname] 

902 

903 if engine != "c" and value != default: 

904 if "python" in engine and argname not in _python_unsupported: 

905 pass 

906 elif value == _deprecated_defaults.get(argname, default): 

907 pass 

908 else: 

909 raise ValueError( 

910 f"The {repr(argname)} option is not supported with the" 

911 f" {repr(engine)} engine" 

912 ) 

913 else: 

914 value = _deprecated_defaults.get(argname, default) 

915 options[argname] = value 

916 

917 if engine == "python-fwf": 

918 for argname, default in _fwf_defaults.items(): 

919 options[argname] = kwds.get(argname, default) 

920 

921 return options 

922 

923 def _check_file_or_buffer(self, f, engine): 

924 # see gh-16530 

925 if is_file_like(f): 

926 next_attr = "__next__" 

927 

928 # The C engine doesn't need the file-like to have the "next" or 

929 # "__next__" attribute. However, the Python engine explicitly calls 

930 # "next(...)" when iterating through such an object, meaning it 

931 # needs to have that attribute ("next" for Python 2.x, "__next__" 

932 # for Python 3.x) 

933 if engine != "c" and not hasattr(f, next_attr): 

934 msg = "The 'python' engine cannot iterate through this file buffer." 

935 raise ValueError(msg) 

936 

937 return engine 

938 

939 def _clean_options(self, options, engine): 

940 result = options.copy() 

941 

942 engine_specified = self._engine_specified 

943 fallback_reason = None 

944 

945 sep = options["delimiter"] 

946 delim_whitespace = options["delim_whitespace"] 

947 

948 # C engine not supported yet 

949 if engine == "c": 

950 if options["skipfooter"] > 0: 

951 fallback_reason = "the 'c' engine does not support skipfooter" 

952 engine = "python" 

953 

954 encoding = sys.getfilesystemencoding() or "utf-8" 

955 if sep is None and not delim_whitespace: 

956 if engine == "c": 

957 fallback_reason = ( 

958 "the 'c' engine does not support " 

959 "sep=None with delim_whitespace=False" 

960 ) 

961 engine = "python" 

962 elif sep is not None and len(sep) > 1: 

963 if engine == "c" and sep == r"\s+": 

964 result["delim_whitespace"] = True 

965 del result["delimiter"] 

966 elif engine not in ("python", "python-fwf"): 

967 # wait until regex engine integrated 

968 fallback_reason = ( 

969 "the 'c' engine does not support " 

970 "regex separators (separators > 1 char and " 

971 r"different from '\s+' are " 

972 "interpreted as regex)" 

973 ) 

974 engine = "python" 

975 elif delim_whitespace: 

976 if "python" in engine: 

977 result["delimiter"] = r"\s+" 

978 elif sep is not None: 

979 encodeable = True 

980 try: 

981 if len(sep.encode(encoding)) > 1: 

982 encodeable = False 

983 except UnicodeDecodeError: 

984 encodeable = False 

985 if not encodeable and engine not in ("python", "python-fwf"): 

986 fallback_reason = ( 

987 f"the separator encoded in {encoding} " 

988 "is > 1 char long, and the 'c' engine " 

989 "does not support such separators" 

990 ) 

991 engine = "python" 

992 

993 quotechar = options["quotechar"] 

994 if quotechar is not None and isinstance(quotechar, (str, bytes)): 

995 if ( 

996 len(quotechar) == 1 

997 and ord(quotechar) > 127 

998 and engine not in ("python", "python-fwf") 

999 ): 

1000 fallback_reason = ( 

1001 "ord(quotechar) > 127, meaning the " 

1002 "quotechar is larger than one byte, " 

1003 "and the 'c' engine does not support " 

1004 "such quotechars" 

1005 ) 

1006 engine = "python" 

1007 

1008 if fallback_reason and engine_specified: 

1009 raise ValueError(fallback_reason) 

1010 

1011 if engine == "c": 

1012 for arg in _c_unsupported: 

1013 del result[arg] 

1014 

1015 if "python" in engine: 

1016 for arg in _python_unsupported: 

1017 if fallback_reason and result[arg] != _c_parser_defaults[arg]: 

1018 raise ValueError( 

1019 "Falling back to the 'python' engine because " 

1020 f"{fallback_reason}, but this causes {repr(arg)} to be " 

1021 "ignored as it is not supported by the 'python' engine." 

1022 ) 

1023 del result[arg] 

1024 

1025 if fallback_reason: 

1026 warnings.warn( 

1027 ( 

1028 "Falling back to the 'python' engine because " 

1029 f"{fallback_reason}; you can avoid this warning by specifying " 

1030 "engine='python'." 

1031 ), 

1032 ParserWarning, 

1033 stacklevel=5, 

1034 ) 

1035 

1036 index_col = options["index_col"] 

1037 names = options["names"] 

1038 converters = options["converters"] 

1039 na_values = options["na_values"] 

1040 skiprows = options["skiprows"] 

1041 

1042 validate_header_arg(options["header"]) 

1043 

1044 depr_warning = "" 

1045 

1046 for arg in _deprecated_args: 

1047 parser_default = _c_parser_defaults[arg] 

1048 depr_default = _deprecated_defaults[arg] 

1049 

1050 msg = ( 

1051 f"The {repr(arg)} argument has been deprecated and will be " 

1052 "removed in a future version." 

1053 ) 

1054 

1055 if result.get(arg, depr_default) != depr_default: 

1056 depr_warning += msg + "\n\n" 

1057 else: 

1058 result[arg] = parser_default 

1059 

1060 if depr_warning != "": 

1061 warnings.warn(depr_warning, FutureWarning, stacklevel=2) 

1062 

1063 if index_col is True: 

1064 raise ValueError("The value of index_col couldn't be 'True'") 

1065 if _is_index_col(index_col): 

1066 if not isinstance(index_col, (list, tuple, np.ndarray)): 

1067 index_col = [index_col] 

1068 result["index_col"] = index_col 

1069 

1070 names = list(names) if names is not None else names 

1071 

1072 # type conversion-related 

1073 if converters is not None: 

1074 if not isinstance(converters, dict): 

1075 raise TypeError( 

1076 "Type converters must be a dict or subclass, " 

1077 f"input was a {type(converters).__name__}" 

1078 ) 

1079 else: 

1080 converters = {} 

1081 

1082 # Converting values to NA 

1083 keep_default_na = options["keep_default_na"] 

1084 na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) 

1085 

1086 # handle skiprows; this is internally handled by the 

1087 # c-engine, so only need for python parsers 

1088 if engine != "c": 

1089 if is_integer(skiprows): 

1090 skiprows = list(range(skiprows)) 

1091 if skiprows is None: 

1092 skiprows = set() 

1093 elif not callable(skiprows): 

1094 skiprows = set(skiprows) 

1095 

1096 # put stuff back 

1097 result["names"] = names 

1098 result["converters"] = converters 

1099 result["na_values"] = na_values 

1100 result["na_fvalues"] = na_fvalues 

1101 result["skiprows"] = skiprows 

1102 

1103 return result, engine 

1104 

1105 def __next__(self): 

1106 try: 

1107 return self.get_chunk() 

1108 except StopIteration: 

1109 self.close() 

1110 raise 

1111 

1112 def _make_engine(self, engine="c"): 

1113 if engine == "c": 

1114 self._engine = CParserWrapper(self.f, **self.options) 

1115 else: 

1116 if engine == "python": 

1117 klass = PythonParser 

1118 elif engine == "python-fwf": 

1119 klass = FixedWidthFieldParser 

1120 else: 

1121 raise ValueError( 

1122 f"Unknown engine: {engine} (valid options are " 

1123 '"c", "python", or ' 

1124 '"python-fwf")' 

1125 ) 

1126 self._engine = klass(self.f, **self.options) 

1127 

1128 def _failover_to_python(self): 

1129 raise AbstractMethodError(self) 

1130 

1131 def read(self, nrows=None): 

1132 nrows = _validate_integer("nrows", nrows) 

1133 ret = self._engine.read(nrows) 

1134 

1135 # May alter columns / col_dict 

1136 index, columns, col_dict = self._create_index(ret) 

1137 

1138 if index is None: 

1139 if col_dict: 

1140 # Any column is actually fine: 

1141 new_rows = len(next(iter(col_dict.values()))) 

1142 index = RangeIndex(self._currow, self._currow + new_rows) 

1143 else: 

1144 new_rows = 0 

1145 else: 

1146 new_rows = len(index) 

1147 

1148 df = DataFrame(col_dict, columns=columns, index=index) 

1149 

1150 self._currow += new_rows 

1151 

1152 if self.squeeze and len(df.columns) == 1: 

1153 return df[df.columns[0]].copy() 

1154 return df 

1155 

1156 def _create_index(self, ret): 

1157 index, columns, col_dict = ret 

1158 return index, columns, col_dict 

1159 

1160 def get_chunk(self, size=None): 

1161 if size is None: 

1162 size = self.chunksize 

1163 if self.nrows is not None: 

1164 if self._currow >= self.nrows: 

1165 raise StopIteration 

1166 size = min(size, self.nrows - self._currow) 

1167 return self.read(nrows=size) 

1168 

1169 

1170def _is_index_col(col): 

1171 return col is not None and col is not False 

1172 

1173 

1174def _is_potential_multi_index(columns): 

1175 """ 

1176 Check whether or not the `columns` parameter 

1177 could be converted into a MultiIndex. 

1178 

1179 Parameters 

1180 ---------- 

1181 columns : array-like 

1182 Object which may or may not be convertible into a MultiIndex 

1183 

1184 Returns 

1185 ------- 

1186 boolean : Whether or not columns could become a MultiIndex 

1187 """ 

1188 return ( 

1189 len(columns) 

1190 and not isinstance(columns, MultiIndex) 

1191 and all(isinstance(c, tuple) for c in columns) 

1192 ) 

1193 

1194 

1195def _evaluate_usecols(usecols, names): 

1196 """ 

1197 Check whether or not the 'usecols' parameter 

1198 is a callable. If so, enumerates the 'names' 

1199 parameter and returns a set of indices for 

1200 each entry in 'names' that evaluates to True. 

1201 If not a callable, returns 'usecols'. 

1202 """ 

1203 if callable(usecols): 

1204 return {i for i, name in enumerate(names) if usecols(name)} 

1205 return usecols 

1206 

1207 

1208def _validate_usecols_names(usecols, names): 

1209 """ 

1210 Validates that all usecols are present in a given 

1211 list of names. If not, raise a ValueError that 

1212 shows what usecols are missing. 

1213 

1214 Parameters 

1215 ---------- 

1216 usecols : iterable of usecols 

1217 The columns to validate are present in names. 

1218 names : iterable of names 

1219 The column names to check against. 

1220 

1221 Returns 

1222 ------- 

1223 usecols : iterable of usecols 

1224 The `usecols` parameter if the validation succeeds. 

1225 

1226 Raises 

1227 ------ 

1228 ValueError : Columns were missing. Error message will list them. 

1229 """ 

1230 missing = [c for c in usecols if c not in names] 

1231 if len(missing) > 0: 

1232 raise ValueError( 

1233 "Usecols do not match columns, " 

1234 f"columns expected but not found: {missing}" 

1235 ) 

1236 

1237 return usecols 

1238 

1239 

1240def _validate_skipfooter_arg(skipfooter): 

1241 """ 

1242 Validate the 'skipfooter' parameter. 

1243 

1244 Checks whether 'skipfooter' is a non-negative integer. 

1245 Raises a ValueError if that is not the case. 

1246 

1247 Parameters 

1248 ---------- 

1249 skipfooter : non-negative integer 

1250 The number of rows to skip at the end of the file. 

1251 

1252 Returns 

1253 ------- 

1254 validated_skipfooter : non-negative integer 

1255 The original input if the validation succeeds. 

1256 

1257 Raises 

1258 ------ 

1259 ValueError : 'skipfooter' was not a non-negative integer. 

1260 """ 

1261 

1262 if not is_integer(skipfooter): 

1263 raise ValueError("skipfooter must be an integer") 

1264 

1265 if skipfooter < 0: 

1266 raise ValueError("skipfooter cannot be negative") 

1267 

1268 return skipfooter 

1269 

1270 

1271def _validate_usecols_arg(usecols): 

1272 """ 

1273 Validate the 'usecols' parameter. 

1274 

1275 Checks whether or not the 'usecols' parameter contains all integers 

1276 (column selection by index), strings (column by name) or is a callable. 

1277 Raises a ValueError if that is not the case. 

1278 

1279 Parameters 

1280 ---------- 

1281 usecols : list-like, callable, or None 

1282 List of columns to use when parsing or a callable that can be used 

1283 to filter a list of table columns. 

1284 

1285 Returns 

1286 ------- 

1287 usecols_tuple : tuple 

1288 A tuple of (verified_usecols, usecols_dtype). 

1289 

1290 'verified_usecols' is either a set if an array-like is passed in or 

1291 'usecols' if a callable or None is passed in. 

1292 

1293 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like 

1294 is passed in or None if a callable or None is passed in. 

1295 """ 

1296 msg = ( 

1297 "'usecols' must either be list-like of all strings, all unicode, " 

1298 "all integers or a callable." 

1299 ) 

1300 if usecols is not None: 

1301 if callable(usecols): 

1302 return usecols, None 

1303 

1304 if not is_list_like(usecols): 

1305 # see gh-20529 

1306 # 

1307 # Ensure it is iterable container but not string. 

1308 raise ValueError(msg) 

1309 

1310 usecols_dtype = lib.infer_dtype(usecols, skipna=False) 

1311 

1312 if usecols_dtype not in ("empty", "integer", "string", "unicode"): 

1313 raise ValueError(msg) 

1314 

1315 usecols = set(usecols) 

1316 

1317 return usecols, usecols_dtype 

1318 return usecols, None 

1319 

1320 

1321def _validate_parse_dates_arg(parse_dates): 

1322 """ 

1323 Check whether or not the 'parse_dates' parameter 

1324 is a non-boolean scalar. Raises a ValueError if 

1325 that is the case. 

1326 """ 

1327 msg = ( 

1328 "Only booleans, lists, and " 

1329 "dictionaries are accepted " 

1330 "for the 'parse_dates' parameter" 

1331 ) 

1332 

1333 if parse_dates is not None: 

1334 if is_scalar(parse_dates): 

1335 if not lib.is_bool(parse_dates): 

1336 raise TypeError(msg) 

1337 

1338 elif not isinstance(parse_dates, (list, dict)): 

1339 raise TypeError(msg) 

1340 

1341 return parse_dates 

1342 

1343 

1344class ParserBase: 

1345 def __init__(self, kwds): 

1346 self.names = kwds.get("names") 

1347 self.orig_names = None 

1348 self.prefix = kwds.pop("prefix", None) 

1349 

1350 self.index_col = kwds.get("index_col", None) 

1351 self.unnamed_cols = set() 

1352 self.index_names = None 

1353 self.col_names = None 

1354 

1355 self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False)) 

1356 self.date_parser = kwds.pop("date_parser", None) 

1357 self.dayfirst = kwds.pop("dayfirst", False) 

1358 self.keep_date_col = kwds.pop("keep_date_col", False) 

1359 

1360 self.na_values = kwds.get("na_values") 

1361 self.na_fvalues = kwds.get("na_fvalues") 

1362 self.na_filter = kwds.get("na_filter", False) 

1363 self.keep_default_na = kwds.get("keep_default_na", True) 

1364 

1365 self.true_values = kwds.get("true_values") 

1366 self.false_values = kwds.get("false_values") 

1367 self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True) 

1368 self.infer_datetime_format = kwds.pop("infer_datetime_format", False) 

1369 self.cache_dates = kwds.pop("cache_dates", True) 

1370 

1371 self._date_conv = _make_date_converter( 

1372 date_parser=self.date_parser, 

1373 dayfirst=self.dayfirst, 

1374 infer_datetime_format=self.infer_datetime_format, 

1375 cache_dates=self.cache_dates, 

1376 ) 

1377 

1378 # validate header options for mi 

1379 self.header = kwds.get("header") 

1380 if isinstance(self.header, (list, tuple, np.ndarray)): 

1381 if not all(map(is_integer, self.header)): 

1382 raise ValueError("header must be integer or list of integers") 

1383 if any(i < 0 for i in self.header): 

1384 raise ValueError( 

1385 "cannot specify multi-index header with negative integers" 

1386 ) 

1387 if kwds.get("usecols"): 

1388 raise ValueError( 

1389 "cannot specify usecols when specifying a multi-index header" 

1390 ) 

1391 if kwds.get("names"): 

1392 raise ValueError( 

1393 "cannot specify names when specifying a multi-index header" 

1394 ) 

1395 

1396 # validate index_col that only contains integers 

1397 if self.index_col is not None: 

1398 is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray)) 

1399 if not ( 

1400 is_sequence 

1401 and all(map(is_integer, self.index_col)) 

1402 or is_integer(self.index_col) 

1403 ): 

1404 raise ValueError( 

1405 "index_col must only contain row numbers " 

1406 "when specifying a multi-index header" 

1407 ) 

1408 

1409 # GH 16338 

1410 elif self.header is not None and not is_integer(self.header): 

1411 raise ValueError("header must be integer or list of integers") 

1412 

1413 # GH 27779 

1414 elif self.header is not None and self.header < 0: 

1415 raise ValueError( 

1416 "Passing negative integer to header is invalid. " 

1417 "For no header, use header=None instead" 

1418 ) 

1419 

1420 self._name_processed = False 

1421 

1422 self._first_chunk = True 

1423 

1424 # GH 13932 

1425 # keep references to file handles opened by the parser itself 

1426 self.handles = [] 

1427 

1428 def close(self): 

1429 for f in self.handles: 

1430 f.close() 

1431 

1432 @property 

1433 def _has_complex_date_col(self): 

1434 return isinstance(self.parse_dates, dict) or ( 

1435 isinstance(self.parse_dates, list) 

1436 and len(self.parse_dates) > 0 

1437 and isinstance(self.parse_dates[0], list) 

1438 ) 

1439 

1440 def _should_parse_dates(self, i): 

1441 if isinstance(self.parse_dates, bool): 

1442 return self.parse_dates 

1443 else: 

1444 if self.index_names is not None: 

1445 name = self.index_names[i] 

1446 else: 

1447 name = None 

1448 j = self.index_col[i] 

1449 

1450 if is_scalar(self.parse_dates): 

1451 return (j == self.parse_dates) or ( 

1452 name is not None and name == self.parse_dates 

1453 ) 

1454 else: 

1455 return (j in self.parse_dates) or ( 

1456 name is not None and name in self.parse_dates 

1457 ) 

1458 

1459 def _extract_multi_indexer_columns( 

1460 self, header, index_names, col_names, passed_names=False 

1461 ): 

1462 """ extract and return the names, index_names, col_names 

1463 header is a list-of-lists returned from the parsers """ 

1464 if len(header) < 2: 

1465 return header[0], index_names, col_names, passed_names 

1466 

1467 # the names are the tuples of the header that are not the index cols 

1468 # 0 is the name of the index, assuming index_col is a list of column 

1469 # numbers 

1470 ic = self.index_col 

1471 if ic is None: 

1472 ic = [] 

1473 

1474 if not isinstance(ic, (list, tuple, np.ndarray)): 

1475 ic = [ic] 

1476 sic = set(ic) 

1477 

1478 # clean the index_names 

1479 index_names = header.pop(-1) 

1480 index_names, names, index_col = _clean_index_names( 

1481 index_names, self.index_col, self.unnamed_cols 

1482 ) 

1483 

1484 # extract the columns 

1485 field_count = len(header[0]) 

1486 

1487 def extract(r): 

1488 return tuple(r[i] for i in range(field_count) if i not in sic) 

1489 

1490 columns = list(zip(*(extract(r) for r in header))) 

1491 names = ic + columns 

1492 

1493 # If we find unnamed columns all in a single 

1494 # level, then our header was too long. 

1495 for n in range(len(columns[0])): 

1496 if all(ensure_str(col[n]) in self.unnamed_cols for col in columns): 

1497 raise ParserError( 

1498 "Passed header=[{header}] are too many rows for this " 

1499 "multi_index of columns".format( 

1500 header=",".join(str(x) for x in self.header) 

1501 ) 

1502 ) 

1503 

1504 # Clean the column names (if we have an index_col). 

1505 if len(ic): 

1506 col_names = [ 

1507 r[0] if (len(r[0]) and r[0] not in self.unnamed_cols) else None 

1508 for r in header 

1509 ] 

1510 else: 

1511 col_names = [None] * len(header) 

1512 

1513 passed_names = True 

1514 

1515 return names, index_names, col_names, passed_names 

1516 

1517 def _maybe_dedup_names(self, names): 

1518 # see gh-7160 and gh-9424: this helps to provide 

1519 # immediate alleviation of the duplicate names 

1520 # issue and appears to be satisfactory to users, 

1521 # but ultimately, not needing to butcher the names 

1522 # would be nice! 

1523 if self.mangle_dupe_cols: 

1524 names = list(names) # so we can index 

1525 counts = defaultdict(int) 

1526 is_potential_mi = _is_potential_multi_index(names) 

1527 

1528 for i, col in enumerate(names): 

1529 cur_count = counts[col] 

1530 

1531 while cur_count > 0: 

1532 counts[col] = cur_count + 1 

1533 

1534 if is_potential_mi: 

1535 col = col[:-1] + (f"{col[-1]}.{cur_count}",) 

1536 else: 

1537 col = f"{col}.{cur_count}" 

1538 cur_count = counts[col] 

1539 

1540 names[i] = col 

1541 counts[col] = cur_count + 1 

1542 

1543 return names 

1544 

1545 def _maybe_make_multi_index_columns(self, columns, col_names=None): 

1546 # possibly create a column mi here 

1547 if _is_potential_multi_index(columns): 

1548 columns = MultiIndex.from_tuples(columns, names=col_names) 

1549 return columns 

1550 

1551 def _make_index(self, data, alldata, columns, indexnamerow=False): 

1552 if not _is_index_col(self.index_col) or not self.index_col: 

1553 index = None 

1554 

1555 elif not self._has_complex_date_col: 

1556 index = self._get_simple_index(alldata, columns) 

1557 index = self._agg_index(index) 

1558 elif self._has_complex_date_col: 

1559 if not self._name_processed: 

1560 (self.index_names, _, self.index_col) = _clean_index_names( 

1561 list(columns), self.index_col, self.unnamed_cols 

1562 ) 

1563 self._name_processed = True 

1564 index = self._get_complex_date_index(data, columns) 

1565 index = self._agg_index(index, try_parse_dates=False) 

1566 

1567 # add names for the index 

1568 if indexnamerow: 

1569 coffset = len(indexnamerow) - len(columns) 

1570 index = index.set_names(indexnamerow[:coffset]) 

1571 

1572 # maybe create a mi on the columns 

1573 columns = self._maybe_make_multi_index_columns(columns, self.col_names) 

1574 

1575 return index, columns 

1576 

1577 _implicit_index = False 

1578 

1579 def _get_simple_index(self, data, columns): 

1580 def ix(col): 

1581 if not isinstance(col, str): 

1582 return col 

1583 raise ValueError(f"Index {col} invalid") 

1584 

1585 to_remove = [] 

1586 index = [] 

1587 for idx in self.index_col: 

1588 i = ix(idx) 

1589 to_remove.append(i) 

1590 index.append(data[i]) 

1591 

1592 # remove index items from content and columns, don't pop in 

1593 # loop 

1594 for i in sorted(to_remove, reverse=True): 

1595 data.pop(i) 

1596 if not self._implicit_index: 

1597 columns.pop(i) 

1598 

1599 return index 

1600 

1601 def _get_complex_date_index(self, data, col_names): 

1602 def _get_name(icol): 

1603 if isinstance(icol, str): 

1604 return icol 

1605 

1606 if col_names is None: 

1607 raise ValueError(f"Must supply column order to use {icol!s} as index") 

1608 

1609 for i, c in enumerate(col_names): 

1610 if i == icol: 

1611 return c 

1612 

1613 to_remove = [] 

1614 index = [] 

1615 for idx in self.index_col: 

1616 name = _get_name(idx) 

1617 to_remove.append(name) 

1618 index.append(data[name]) 

1619 

1620 # remove index items from content and columns, don't pop in 

1621 # loop 

1622 for c in sorted(to_remove, reverse=True): 

1623 data.pop(c) 

1624 col_names.remove(c) 

1625 

1626 return index 

1627 

1628 def _agg_index(self, index, try_parse_dates=True): 

1629 arrays = [] 

1630 

1631 for i, arr in enumerate(index): 

1632 

1633 if try_parse_dates and self._should_parse_dates(i): 

1634 arr = self._date_conv(arr) 

1635 

1636 if self.na_filter: 

1637 col_na_values = self.na_values 

1638 col_na_fvalues = self.na_fvalues 

1639 else: 

1640 col_na_values = set() 

1641 col_na_fvalues = set() 

1642 

1643 if isinstance(self.na_values, dict): 

1644 col_name = self.index_names[i] 

1645 if col_name is not None: 

1646 col_na_values, col_na_fvalues = _get_na_values( 

1647 col_name, self.na_values, self.na_fvalues, self.keep_default_na 

1648 ) 

1649 

1650 arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues) 

1651 arrays.append(arr) 

1652 

1653 names = self.index_names 

1654 index = ensure_index_from_sequences(arrays, names) 

1655 

1656 return index 

1657 

1658 def _convert_to_ndarrays( 

1659 self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None 

1660 ): 

1661 result = {} 

1662 for c, values in dct.items(): 

1663 conv_f = None if converters is None else converters.get(c, None) 

1664 if isinstance(dtypes, dict): 

1665 cast_type = dtypes.get(c, None) 

1666 else: 

1667 # single dtype or None 

1668 cast_type = dtypes 

1669 

1670 if self.na_filter: 

1671 col_na_values, col_na_fvalues = _get_na_values( 

1672 c, na_values, na_fvalues, self.keep_default_na 

1673 ) 

1674 else: 

1675 col_na_values, col_na_fvalues = set(), set() 

1676 

1677 if conv_f is not None: 

1678 # conv_f applied to data before inference 

1679 if cast_type is not None: 

1680 warnings.warn( 

1681 ( 

1682 "Both a converter and dtype were specified " 

1683 f"for column {c} - only the converter will " 

1684 "be used" 

1685 ), 

1686 ParserWarning, 

1687 stacklevel=7, 

1688 ) 

1689 

1690 try: 

1691 values = lib.map_infer(values, conv_f) 

1692 except ValueError: 

1693 mask = algorithms.isin(values, list(na_values)).view(np.uint8) 

1694 values = lib.map_infer_mask(values, conv_f, mask) 

1695 

1696 cvals, na_count = self._infer_types( 

1697 values, set(col_na_values) | col_na_fvalues, try_num_bool=False 

1698 ) 

1699 else: 

1700 is_str_or_ea_dtype = is_string_dtype( 

1701 cast_type 

1702 ) or is_extension_array_dtype(cast_type) 

1703 # skip inference if specified dtype is object 

1704 # or casting to an EA 

1705 try_num_bool = not (cast_type and is_str_or_ea_dtype) 

1706 

1707 # general type inference and conversion 

1708 cvals, na_count = self._infer_types( 

1709 values, set(col_na_values) | col_na_fvalues, try_num_bool 

1710 ) 

1711 

1712 # type specified in dtype param or cast_type is an EA 

1713 if cast_type and ( 

1714 not is_dtype_equal(cvals, cast_type) 

1715 or is_extension_array_dtype(cast_type) 

1716 ): 

1717 try: 

1718 if ( 

1719 is_bool_dtype(cast_type) 

1720 and not is_categorical_dtype(cast_type) 

1721 and na_count > 0 

1722 ): 

1723 raise ValueError(f"Bool column has NA values in column {c}") 

1724 except (AttributeError, TypeError): 

1725 # invalid input to is_bool_dtype 

1726 pass 

1727 cvals = self._cast_types(cvals, cast_type, c) 

1728 

1729 result[c] = cvals 

1730 if verbose and na_count: 

1731 print(f"Filled {na_count} NA values in column {c!s}") 

1732 return result 

1733 

1734 def _infer_types(self, values, na_values, try_num_bool=True): 

1735 """ 

1736 Infer types of values, possibly casting 

1737 

1738 Parameters 

1739 ---------- 

1740 values : ndarray 

1741 na_values : set 

1742 try_num_bool : bool, default try 

1743 try to cast values to numeric (first preference) or boolean 

1744 

1745 Returns 

1746 ------- 

1747 converted : ndarray 

1748 na_count : int 

1749 """ 

1750 na_count = 0 

1751 if issubclass(values.dtype.type, (np.number, np.bool_)): 

1752 mask = algorithms.isin(values, list(na_values)) 

1753 na_count = mask.sum() 

1754 if na_count > 0: 

1755 if is_integer_dtype(values): 

1756 values = values.astype(np.float64) 

1757 np.putmask(values, mask, np.nan) 

1758 return values, na_count 

1759 

1760 if try_num_bool and is_object_dtype(values.dtype): 

1761 # exclude e.g DatetimeIndex here 

1762 try: 

1763 result = lib.maybe_convert_numeric(values, na_values, False) 

1764 except (ValueError, TypeError): 

1765 # e.g. encountering datetime string gets ValueError 

1766 # TypeError can be raised in floatify 

1767 result = values 

1768 na_count = parsers.sanitize_objects(result, na_values, False) 

1769 else: 

1770 na_count = isna(result).sum() 

1771 else: 

1772 result = values 

1773 if values.dtype == np.object_: 

1774 na_count = parsers.sanitize_objects(values, na_values, False) 

1775 

1776 if result.dtype == np.object_ and try_num_bool: 

1777 result = libops.maybe_convert_bool( 

1778 np.asarray(values), 

1779 true_values=self.true_values, 

1780 false_values=self.false_values, 

1781 ) 

1782 

1783 return result, na_count 

1784 

1785 def _cast_types(self, values, cast_type, column): 

1786 """ 

1787 Cast values to specified type 

1788 

1789 Parameters 

1790 ---------- 

1791 values : ndarray 

1792 cast_type : string or np.dtype 

1793 dtype to cast values to 

1794 column : string 

1795 column name - used only for error reporting 

1796 

1797 Returns 

1798 ------- 

1799 converted : ndarray 

1800 """ 

1801 

1802 if is_categorical_dtype(cast_type): 

1803 known_cats = ( 

1804 isinstance(cast_type, CategoricalDtype) 

1805 and cast_type.categories is not None 

1806 ) 

1807 

1808 if not is_object_dtype(values) and not known_cats: 

1809 # XXX this is for consistency with 

1810 # c-parser which parses all categories 

1811 # as strings 

1812 values = astype_nansafe(values, str) 

1813 

1814 cats = Index(values).unique().dropna() 

1815 values = Categorical._from_inferred_categories( 

1816 cats, cats.get_indexer(values), cast_type, true_values=self.true_values 

1817 ) 

1818 

1819 # use the EA's implementation of casting 

1820 elif is_extension_array_dtype(cast_type): 

1821 # ensure cast_type is an actual dtype and not a string 

1822 cast_type = pandas_dtype(cast_type) 

1823 array_type = cast_type.construct_array_type() 

1824 try: 

1825 return array_type._from_sequence_of_strings(values, dtype=cast_type) 

1826 except NotImplementedError: 

1827 raise NotImplementedError( 

1828 f"Extension Array: {array_type} must implement " 

1829 "_from_sequence_of_strings in order " 

1830 "to be used in parser methods" 

1831 ) 

1832 

1833 else: 

1834 try: 

1835 values = astype_nansafe(values, cast_type, copy=True, skipna=True) 

1836 except ValueError: 

1837 raise ValueError( 

1838 f"Unable to convert column {column} to type {cast_type}" 

1839 ) 

1840 return values 

1841 

1842 def _do_date_conversions(self, names, data): 

1843 # returns data, columns 

1844 

1845 if self.parse_dates is not None: 

1846 data, names = _process_date_conversion( 

1847 data, 

1848 self._date_conv, 

1849 self.parse_dates, 

1850 self.index_col, 

1851 self.index_names, 

1852 names, 

1853 keep_date_col=self.keep_date_col, 

1854 ) 

1855 

1856 return names, data 

1857 

1858 

1859class CParserWrapper(ParserBase): 

1860 """ 

1861 

1862 """ 

1863 

1864 def __init__(self, src, **kwds): 

1865 self.kwds = kwds 

1866 kwds = kwds.copy() 

1867 

1868 ParserBase.__init__(self, kwds) 

1869 

1870 encoding = kwds.get("encoding") 

1871 

1872 if kwds.get("compression") is None and encoding: 

1873 if isinstance(src, str): 

1874 src = open(src, "rb") 

1875 self.handles.append(src) 

1876 

1877 # Handle the file object with universal line mode enabled. 

1878 # We will handle the newline character ourselves later on. 

1879 if hasattr(src, "read") and not hasattr(src, "encoding"): 

1880 src = TextIOWrapper(src, encoding=encoding, newline="") 

1881 

1882 kwds["encoding"] = "utf-8" 

1883 

1884 # #2442 

1885 kwds["allow_leading_cols"] = self.index_col is not False 

1886 

1887 # GH20529, validate usecol arg before TextReader 

1888 self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) 

1889 kwds["usecols"] = self.usecols 

1890 

1891 self._reader = parsers.TextReader(src, **kwds) 

1892 self.unnamed_cols = self._reader.unnamed_cols 

1893 

1894 passed_names = self.names is None 

1895 

1896 if self._reader.header is None: 

1897 self.names = None 

1898 else: 

1899 if len(self._reader.header) > 1: 

1900 # we have a multi index in the columns 

1901 ( 

1902 self.names, 

1903 self.index_names, 

1904 self.col_names, 

1905 passed_names, 

1906 ) = self._extract_multi_indexer_columns( 

1907 self._reader.header, self.index_names, self.col_names, passed_names 

1908 ) 

1909 else: 

1910 self.names = list(self._reader.header[0]) 

1911 

1912 if self.names is None: 

1913 if self.prefix: 

1914 self.names = [ 

1915 f"{self.prefix}{i}" for i in range(self._reader.table_width) 

1916 ] 

1917 else: 

1918 self.names = list(range(self._reader.table_width)) 

1919 

1920 # gh-9755 

1921 # 

1922 # need to set orig_names here first 

1923 # so that proper indexing can be done 

1924 # with _set_noconvert_columns 

1925 # 

1926 # once names has been filtered, we will 

1927 # then set orig_names again to names 

1928 self.orig_names = self.names[:] 

1929 

1930 if self.usecols: 

1931 usecols = _evaluate_usecols(self.usecols, self.orig_names) 

1932 

1933 # GH 14671 

1934 if self.usecols_dtype == "string" and not set(usecols).issubset( 

1935 self.orig_names 

1936 ): 

1937 _validate_usecols_names(usecols, self.orig_names) 

1938 

1939 if len(self.names) > len(usecols): 

1940 self.names = [ 

1941 n 

1942 for i, n in enumerate(self.names) 

1943 if (i in usecols or n in usecols) 

1944 ] 

1945 

1946 if len(self.names) < len(usecols): 

1947 _validate_usecols_names(usecols, self.names) 

1948 

1949 self._set_noconvert_columns() 

1950 

1951 self.orig_names = self.names 

1952 

1953 if not self._has_complex_date_col: 

1954 if self._reader.leading_cols == 0 and _is_index_col(self.index_col): 

1955 

1956 self._name_processed = True 

1957 (index_names, self.names, self.index_col) = _clean_index_names( 

1958 self.names, self.index_col, self.unnamed_cols 

1959 ) 

1960 

1961 if self.index_names is None: 

1962 self.index_names = index_names 

1963 

1964 if self._reader.header is None and not passed_names: 

1965 self.index_names = [None] * len(self.index_names) 

1966 

1967 self._implicit_index = self._reader.leading_cols > 0 

1968 

1969 def close(self): 

1970 for f in self.handles: 

1971 f.close() 

1972 

1973 # close additional handles opened by C parser (for compression) 

1974 try: 

1975 self._reader.close() 

1976 except ValueError: 

1977 pass 

1978 

1979 def _set_noconvert_columns(self): 

1980 """ 

1981 Set the columns that should not undergo dtype conversions. 

1982 

1983 Currently, any column that is involved with date parsing will not 

1984 undergo such conversions. 

1985 """ 

1986 names = self.orig_names 

1987 if self.usecols_dtype == "integer": 

1988 # A set of integers will be converted to a list in 

1989 # the correct order every single time. 

1990 usecols = list(self.usecols) 

1991 usecols.sort() 

1992 elif callable(self.usecols) or self.usecols_dtype not in ("empty", None): 

1993 # The names attribute should have the correct columns 

1994 # in the proper order for indexing with parse_dates. 

1995 usecols = self.names[:] 

1996 else: 

1997 # Usecols is empty. 

1998 usecols = None 

1999 

2000 def _set(x): 

2001 if usecols is not None and is_integer(x): 

2002 x = usecols[x] 

2003 

2004 if not is_integer(x): 

2005 x = names.index(x) 

2006 

2007 self._reader.set_noconvert(x) 

2008 

2009 if isinstance(self.parse_dates, list): 

2010 for val in self.parse_dates: 

2011 if isinstance(val, list): 

2012 for k in val: 

2013 _set(k) 

2014 else: 

2015 _set(val) 

2016 

2017 elif isinstance(self.parse_dates, dict): 

2018 for val in self.parse_dates.values(): 

2019 if isinstance(val, list): 

2020 for k in val: 

2021 _set(k) 

2022 else: 

2023 _set(val) 

2024 

2025 elif self.parse_dates: 

2026 if isinstance(self.index_col, list): 

2027 for k in self.index_col: 

2028 _set(k) 

2029 elif self.index_col is not None: 

2030 _set(self.index_col) 

2031 

2032 def set_error_bad_lines(self, status): 

2033 self._reader.set_error_bad_lines(int(status)) 

2034 

2035 def read(self, nrows=None): 

2036 try: 

2037 data = self._reader.read(nrows) 

2038 except StopIteration: 

2039 if self._first_chunk: 

2040 self._first_chunk = False 

2041 names = self._maybe_dedup_names(self.orig_names) 

2042 index, columns, col_dict = _get_empty_meta( 

2043 names, 

2044 self.index_col, 

2045 self.index_names, 

2046 dtype=self.kwds.get("dtype"), 

2047 ) 

2048 columns = self._maybe_make_multi_index_columns(columns, self.col_names) 

2049 

2050 if self.usecols is not None: 

2051 columns = self._filter_usecols(columns) 

2052 

2053 col_dict = dict( 

2054 filter(lambda item: item[0] in columns, col_dict.items()) 

2055 ) 

2056 

2057 return index, columns, col_dict 

2058 

2059 else: 

2060 raise 

2061 

2062 # Done with first read, next time raise StopIteration 

2063 self._first_chunk = False 

2064 

2065 names = self.names 

2066 

2067 if self._reader.leading_cols: 

2068 if self._has_complex_date_col: 

2069 raise NotImplementedError("file structure not yet supported") 

2070 

2071 # implicit index, no index names 

2072 arrays = [] 

2073 

2074 for i in range(self._reader.leading_cols): 

2075 if self.index_col is None: 

2076 values = data.pop(i) 

2077 else: 

2078 values = data.pop(self.index_col[i]) 

2079 

2080 values = self._maybe_parse_dates(values, i, try_parse_dates=True) 

2081 arrays.append(values) 

2082 

2083 index = ensure_index_from_sequences(arrays) 

2084 

2085 if self.usecols is not None: 

2086 names = self._filter_usecols(names) 

2087 

2088 names = self._maybe_dedup_names(names) 

2089 

2090 # rename dict keys 

2091 data = sorted(data.items()) 

2092 data = {k: v for k, (i, v) in zip(names, data)} 

2093 

2094 names, data = self._do_date_conversions(names, data) 

2095 

2096 else: 

2097 # rename dict keys 

2098 data = sorted(data.items()) 

2099 

2100 # ugh, mutation 

2101 names = list(self.orig_names) 

2102 names = self._maybe_dedup_names(names) 

2103 

2104 if self.usecols is not None: 

2105 names = self._filter_usecols(names) 

2106 

2107 # columns as list 

2108 alldata = [x[1] for x in data] 

2109 

2110 data = {k: v for k, (i, v) in zip(names, data)} 

2111 

2112 names, data = self._do_date_conversions(names, data) 

2113 index, names = self._make_index(data, alldata, names) 

2114 

2115 # maybe create a mi on the columns 

2116 names = self._maybe_make_multi_index_columns(names, self.col_names) 

2117 

2118 return index, names, data 

2119 

2120 def _filter_usecols(self, names): 

2121 # hackish 

2122 usecols = _evaluate_usecols(self.usecols, names) 

2123 if usecols is not None and len(names) != len(usecols): 

2124 names = [ 

2125 name for i, name in enumerate(names) if i in usecols or name in usecols 

2126 ] 

2127 return names 

2128 

2129 def _get_index_names(self): 

2130 names = list(self._reader.header[0]) 

2131 idx_names = None 

2132 

2133 if self._reader.leading_cols == 0 and self.index_col is not None: 

2134 (idx_names, names, self.index_col) = _clean_index_names( 

2135 names, self.index_col, self.unnamed_cols 

2136 ) 

2137 

2138 return names, idx_names 

2139 

2140 def _maybe_parse_dates(self, values, index, try_parse_dates=True): 

2141 if try_parse_dates and self._should_parse_dates(index): 

2142 values = self._date_conv(values) 

2143 return values 

2144 

2145 

2146def TextParser(*args, **kwds): 

2147 """ 

2148 Converts lists of lists/tuples into DataFrames with proper type inference 

2149 and optional (e.g. string to datetime) conversion. Also enables iterating 

2150 lazily over chunks of large files 

2151 

2152 Parameters 

2153 ---------- 

2154 data : file-like object or list 

2155 delimiter : separator character to use 

2156 dialect : str or csv.Dialect instance, optional 

2157 Ignored if delimiter is longer than 1 character 

2158 names : sequence, default 

2159 header : int, default 0 

2160 Row to use to parse column labels. Defaults to the first row. Prior 

2161 rows will be discarded 

2162 index_col : int or list, optional 

2163 Column or columns to use as the (possibly hierarchical) index 

2164 has_index_names: bool, default False 

2165 True if the cols defined in index_col have an index name and are 

2166 not in the header. 

2167 na_values : scalar, str, list-like, or dict, optional 

2168 Additional strings to recognize as NA/NaN. 

2169 keep_default_na : bool, default True 

2170 thousands : str, optional 

2171 Thousands separator 

2172 comment : str, optional 

2173 Comment out remainder of line 

2174 parse_dates : bool, default False 

2175 keep_date_col : bool, default False 

2176 date_parser : function, optional 

2177 skiprows : list of integers 

2178 Row numbers to skip 

2179 skipfooter : int 

2180 Number of line at bottom of file to skip 

2181 converters : dict, optional 

2182 Dict of functions for converting values in certain columns. Keys can 

2183 either be integers or column labels, values are functions that take one 

2184 input argument, the cell (not column) content, and return the 

2185 transformed content. 

2186 encoding : str, optional 

2187 Encoding to use for UTF when reading/writing (ex. 'utf-8') 

2188 squeeze : bool, default False 

2189 returns Series if only one column. 

2190 infer_datetime_format: bool, default False 

2191 If True and `parse_dates` is True for a column, try to infer the 

2192 datetime format based on the first datetime string. If the format 

2193 can be inferred, there often will be a large parsing speed-up. 

2194 float_precision : str, optional 

2195 Specifies which converter the C engine should use for floating-point 

2196 values. The options are None for the ordinary converter, 

2197 'high' for the high-precision converter, and 'round_trip' for the 

2198 round-trip converter. 

2199 """ 

2200 kwds["engine"] = "python" 

2201 return TextFileReader(*args, **kwds) 

2202 

2203 

2204def count_empty_vals(vals): 

2205 return sum(1 for v in vals if v == "" or v is None) 

2206 

2207 

2208class PythonParser(ParserBase): 

2209 def __init__(self, f, **kwds): 

2210 """ 

2211 Workhorse function for processing nested list into DataFrame 

2212 """ 

2213 ParserBase.__init__(self, kwds) 

2214 

2215 self.data = None 

2216 self.buf = [] 

2217 self.pos = 0 

2218 self.line_pos = 0 

2219 

2220 self.encoding = kwds["encoding"] 

2221 self.compression = kwds["compression"] 

2222 self.memory_map = kwds["memory_map"] 

2223 self.skiprows = kwds["skiprows"] 

2224 

2225 if callable(self.skiprows): 

2226 self.skipfunc = self.skiprows 

2227 else: 

2228 self.skipfunc = lambda x: x in self.skiprows 

2229 

2230 self.skipfooter = _validate_skipfooter_arg(kwds["skipfooter"]) 

2231 self.delimiter = kwds["delimiter"] 

2232 

2233 self.quotechar = kwds["quotechar"] 

2234 if isinstance(self.quotechar, str): 

2235 self.quotechar = str(self.quotechar) 

2236 

2237 self.escapechar = kwds["escapechar"] 

2238 self.doublequote = kwds["doublequote"] 

2239 self.skipinitialspace = kwds["skipinitialspace"] 

2240 self.lineterminator = kwds["lineterminator"] 

2241 self.quoting = kwds["quoting"] 

2242 self.usecols, _ = _validate_usecols_arg(kwds["usecols"]) 

2243 self.skip_blank_lines = kwds["skip_blank_lines"] 

2244 

2245 self.warn_bad_lines = kwds["warn_bad_lines"] 

2246 self.error_bad_lines = kwds["error_bad_lines"] 

2247 

2248 self.names_passed = kwds["names"] or None 

2249 

2250 self.has_index_names = False 

2251 if "has_index_names" in kwds: 

2252 self.has_index_names = kwds["has_index_names"] 

2253 

2254 self.verbose = kwds["verbose"] 

2255 self.converters = kwds["converters"] 

2256 

2257 self.dtype = kwds["dtype"] 

2258 self.thousands = kwds["thousands"] 

2259 self.decimal = kwds["decimal"] 

2260 

2261 self.comment = kwds["comment"] 

2262 self._comment_lines = [] 

2263 

2264 f, handles = get_handle( 

2265 f, 

2266 "r", 

2267 encoding=self.encoding, 

2268 compression=self.compression, 

2269 memory_map=self.memory_map, 

2270 ) 

2271 self.handles.extend(handles) 

2272 

2273 # Set self.data to something that can read lines. 

2274 if hasattr(f, "readline"): 

2275 self._make_reader(f) 

2276 else: 

2277 self.data = f 

2278 

2279 # Get columns in two steps: infer from data, then 

2280 # infer column indices from self.usecols if it is specified. 

2281 self._col_indices = None 

2282 ( 

2283 self.columns, 

2284 self.num_original_columns, 

2285 self.unnamed_cols, 

2286 ) = self._infer_columns() 

2287 

2288 # Now self.columns has the set of columns that we will process. 

2289 # The original set is stored in self.original_columns. 

2290 if len(self.columns) > 1: 

2291 # we are processing a multi index column 

2292 ( 

2293 self.columns, 

2294 self.index_names, 

2295 self.col_names, 

2296 _, 

2297 ) = self._extract_multi_indexer_columns( 

2298 self.columns, self.index_names, self.col_names 

2299 ) 

2300 # Update list of original names to include all indices. 

2301 self.num_original_columns = len(self.columns) 

2302 else: 

2303 self.columns = self.columns[0] 

2304 

2305 # get popped off for index 

2306 self.orig_names = list(self.columns) 

2307 

2308 # needs to be cleaned/refactored 

2309 # multiple date column thing turning into a real spaghetti factory 

2310 

2311 if not self._has_complex_date_col: 

2312 (index_names, self.orig_names, self.columns) = self._get_index_name( 

2313 self.columns 

2314 ) 

2315 self._name_processed = True 

2316 if self.index_names is None: 

2317 self.index_names = index_names 

2318 

2319 if self.parse_dates: 

2320 self._no_thousands_columns = self._set_no_thousands_columns() 

2321 else: 

2322 self._no_thousands_columns = None 

2323 

2324 if len(self.decimal) != 1: 

2325 raise ValueError("Only length-1 decimal markers supported") 

2326 

2327 if self.thousands is None: 

2328 self.nonnum = re.compile(fr"[^-^0-9^{self.decimal}]+") 

2329 else: 

2330 self.nonnum = re.compile(fr"[^-^0-9^{self.thousands}^{self.decimal}]+") 

2331 

2332 def _set_no_thousands_columns(self): 

2333 # Create a set of column ids that are not to be stripped of thousands 

2334 # operators. 

2335 noconvert_columns = set() 

2336 

2337 def _set(x): 

2338 if is_integer(x): 

2339 noconvert_columns.add(x) 

2340 else: 

2341 noconvert_columns.add(self.columns.index(x)) 

2342 

2343 if isinstance(self.parse_dates, list): 

2344 for val in self.parse_dates: 

2345 if isinstance(val, list): 

2346 for k in val: 

2347 _set(k) 

2348 else: 

2349 _set(val) 

2350 

2351 elif isinstance(self.parse_dates, dict): 

2352 for val in self.parse_dates.values(): 

2353 if isinstance(val, list): 

2354 for k in val: 

2355 _set(k) 

2356 else: 

2357 _set(val) 

2358 

2359 elif self.parse_dates: 

2360 if isinstance(self.index_col, list): 

2361 for k in self.index_col: 

2362 _set(k) 

2363 elif self.index_col is not None: 

2364 _set(self.index_col) 

2365 

2366 return noconvert_columns 

2367 

2368 def _make_reader(self, f): 

2369 sep = self.delimiter 

2370 

2371 if sep is None or len(sep) == 1: 

2372 if self.lineterminator: 

2373 raise ValueError( 

2374 "Custom line terminators not supported in python parser (yet)" 

2375 ) 

2376 

2377 class MyDialect(csv.Dialect): 

2378 delimiter = self.delimiter 

2379 quotechar = self.quotechar 

2380 escapechar = self.escapechar 

2381 doublequote = self.doublequote 

2382 skipinitialspace = self.skipinitialspace 

2383 quoting = self.quoting 

2384 lineterminator = "\n" 

2385 

2386 dia = MyDialect 

2387 

2388 sniff_sep = True 

2389 

2390 if sep is not None: 

2391 sniff_sep = False 

2392 dia.delimiter = sep 

2393 # attempt to sniff the delimiter 

2394 if sniff_sep: 

2395 line = f.readline() 

2396 while self.skipfunc(self.pos): 

2397 self.pos += 1 

2398 line = f.readline() 

2399 

2400 line = self._check_comments([line])[0] 

2401 

2402 self.pos += 1 

2403 self.line_pos += 1 

2404 sniffed = csv.Sniffer().sniff(line) 

2405 dia.delimiter = sniffed.delimiter 

2406 

2407 # Note: self.encoding is irrelevant here 

2408 line_rdr = csv.reader(StringIO(line), dialect=dia) 

2409 self.buf.extend(list(line_rdr)) 

2410 

2411 # Note: self.encoding is irrelevant here 

2412 reader = csv.reader(f, dialect=dia, strict=True) 

2413 

2414 else: 

2415 

2416 def _read(): 

2417 line = f.readline() 

2418 pat = re.compile(sep) 

2419 

2420 yield pat.split(line.strip()) 

2421 

2422 for line in f: 

2423 yield pat.split(line.strip()) 

2424 

2425 reader = _read() 

2426 

2427 self.data = reader 

2428 

2429 def read(self, rows=None): 

2430 try: 

2431 content = self._get_lines(rows) 

2432 except StopIteration: 

2433 if self._first_chunk: 

2434 content = [] 

2435 else: 

2436 raise 

2437 

2438 # done with first read, next time raise StopIteration 

2439 self._first_chunk = False 

2440 

2441 columns = list(self.orig_names) 

2442 if not len(content): # pragma: no cover 

2443 # DataFrame with the right metadata, even though it's length 0 

2444 names = self._maybe_dedup_names(self.orig_names) 

2445 index, columns, col_dict = _get_empty_meta( 

2446 names, self.index_col, self.index_names, self.dtype 

2447 ) 

2448 columns = self._maybe_make_multi_index_columns(columns, self.col_names) 

2449 return index, columns, col_dict 

2450 

2451 # handle new style for names in index 

2452 count_empty_content_vals = count_empty_vals(content[0]) 

2453 indexnamerow = None 

2454 if self.has_index_names and count_empty_content_vals == len(columns): 

2455 indexnamerow = content[0] 

2456 content = content[1:] 

2457 

2458 alldata = self._rows_to_cols(content) 

2459 data = self._exclude_implicit_index(alldata) 

2460 

2461 columns = self._maybe_dedup_names(self.columns) 

2462 columns, data = self._do_date_conversions(columns, data) 

2463 

2464 data = self._convert_data(data) 

2465 index, columns = self._make_index(data, alldata, columns, indexnamerow) 

2466 

2467 return index, columns, data 

2468 

2469 def _exclude_implicit_index(self, alldata): 

2470 names = self._maybe_dedup_names(self.orig_names) 

2471 

2472 if self._implicit_index: 

2473 excl_indices = self.index_col 

2474 

2475 data = {} 

2476 offset = 0 

2477 for i, col in enumerate(names): 

2478 while i + offset in excl_indices: 

2479 offset += 1 

2480 data[col] = alldata[i + offset] 

2481 else: 

2482 data = {k: v for k, v in zip(names, alldata)} 

2483 

2484 return data 

2485 

2486 # legacy 

2487 def get_chunk(self, size=None): 

2488 if size is None: 

2489 size = self.chunksize 

2490 return self.read(rows=size) 

2491 

2492 def _convert_data(self, data): 

2493 # apply converters 

2494 def _clean_mapping(mapping): 

2495 "converts col numbers to names" 

2496 clean = {} 

2497 for col, v in mapping.items(): 

2498 if isinstance(col, int) and col not in self.orig_names: 

2499 col = self.orig_names[col] 

2500 clean[col] = v 

2501 return clean 

2502 

2503 clean_conv = _clean_mapping(self.converters) 

2504 if not isinstance(self.dtype, dict): 

2505 # handles single dtype applied to all columns 

2506 clean_dtypes = self.dtype 

2507 else: 

2508 clean_dtypes = _clean_mapping(self.dtype) 

2509 

2510 # Apply NA values. 

2511 clean_na_values = {} 

2512 clean_na_fvalues = {} 

2513 

2514 if isinstance(self.na_values, dict): 

2515 for col in self.na_values: 

2516 na_value = self.na_values[col] 

2517 na_fvalue = self.na_fvalues[col] 

2518 

2519 if isinstance(col, int) and col not in self.orig_names: 

2520 col = self.orig_names[col] 

2521 

2522 clean_na_values[col] = na_value 

2523 clean_na_fvalues[col] = na_fvalue 

2524 else: 

2525 clean_na_values = self.na_values 

2526 clean_na_fvalues = self.na_fvalues 

2527 

2528 return self._convert_to_ndarrays( 

2529 data, 

2530 clean_na_values, 

2531 clean_na_fvalues, 

2532 self.verbose, 

2533 clean_conv, 

2534 clean_dtypes, 

2535 ) 

2536 

2537 def _infer_columns(self): 

2538 names = self.names 

2539 num_original_columns = 0 

2540 clear_buffer = True 

2541 unnamed_cols = set() 

2542 

2543 if self.header is not None: 

2544 header = self.header 

2545 

2546 if isinstance(header, (list, tuple, np.ndarray)): 

2547 have_mi_columns = len(header) > 1 

2548 # we have a mi columns, so read an extra line 

2549 if have_mi_columns: 

2550 header = list(header) + [header[-1] + 1] 

2551 else: 

2552 have_mi_columns = False 

2553 header = [header] 

2554 

2555 columns = [] 

2556 for level, hr in enumerate(header): 

2557 try: 

2558 line = self._buffered_line() 

2559 

2560 while self.line_pos <= hr: 

2561 line = self._next_line() 

2562 

2563 except StopIteration: 

2564 if self.line_pos < hr: 

2565 raise ValueError( 

2566 f"Passed header={hr} but only {self.line_pos + 1} lines in " 

2567 "file" 

2568 ) 

2569 

2570 # We have an empty file, so check 

2571 # if columns are provided. That will 

2572 # serve as the 'line' for parsing 

2573 if have_mi_columns and hr > 0: 

2574 if clear_buffer: 

2575 self._clear_buffer() 

2576 columns.append([None] * len(columns[-1])) 

2577 return columns, num_original_columns, unnamed_cols 

2578 

2579 if not self.names: 

2580 raise EmptyDataError("No columns to parse from file") 

2581 

2582 line = self.names[:] 

2583 

2584 this_columns = [] 

2585 this_unnamed_cols = [] 

2586 

2587 for i, c in enumerate(line): 

2588 if c == "": 

2589 if have_mi_columns: 

2590 col_name = f"Unnamed: {i}_level_{level}" 

2591 else: 

2592 col_name = f"Unnamed: {i}" 

2593 

2594 this_unnamed_cols.append(i) 

2595 this_columns.append(col_name) 

2596 else: 

2597 this_columns.append(c) 

2598 

2599 if not have_mi_columns and self.mangle_dupe_cols: 

2600 counts = defaultdict(int) 

2601 

2602 for i, col in enumerate(this_columns): 

2603 cur_count = counts[col] 

2604 

2605 while cur_count > 0: 

2606 counts[col] = cur_count + 1 

2607 col = f"{col}.{cur_count}" 

2608 cur_count = counts[col] 

2609 

2610 this_columns[i] = col 

2611 counts[col] = cur_count + 1 

2612 elif have_mi_columns: 

2613 

2614 # if we have grabbed an extra line, but its not in our 

2615 # format so save in the buffer, and create an blank extra 

2616 # line for the rest of the parsing code 

2617 if hr == header[-1]: 

2618 lc = len(this_columns) 

2619 ic = len(self.index_col) if self.index_col is not None else 0 

2620 unnamed_count = len(this_unnamed_cols) 

2621 

2622 if lc != unnamed_count and lc - ic > unnamed_count: 

2623 clear_buffer = False 

2624 this_columns = [None] * lc 

2625 self.buf = [self.buf[-1]] 

2626 

2627 columns.append(this_columns) 

2628 unnamed_cols.update({this_columns[i] for i in this_unnamed_cols}) 

2629 

2630 if len(columns) == 1: 

2631 num_original_columns = len(this_columns) 

2632 

2633 if clear_buffer: 

2634 self._clear_buffer() 

2635 

2636 if names is not None: 

2637 if (self.usecols is not None and len(names) != len(self.usecols)) or ( 

2638 self.usecols is None and len(names) != len(columns[0]) 

2639 ): 

2640 raise ValueError( 

2641 "Number of passed names did not match " 

2642 "number of header fields in the file" 

2643 ) 

2644 if len(columns) > 1: 

2645 raise TypeError("Cannot pass names with multi-index columns") 

2646 

2647 if self.usecols is not None: 

2648 # Set _use_cols. We don't store columns because they are 

2649 # overwritten. 

2650 self._handle_usecols(columns, names) 

2651 else: 

2652 self._col_indices = None 

2653 num_original_columns = len(names) 

2654 columns = [names] 

2655 else: 

2656 columns = self._handle_usecols(columns, columns[0]) 

2657 else: 

2658 try: 

2659 line = self._buffered_line() 

2660 

2661 except StopIteration: 

2662 if not names: 

2663 raise EmptyDataError("No columns to parse from file") 

2664 

2665 line = names[:] 

2666 

2667 ncols = len(line) 

2668 num_original_columns = ncols 

2669 

2670 if not names: 

2671 if self.prefix: 

2672 columns = [[f"{self.prefix}{i}" for i in range(ncols)]] 

2673 else: 

2674 columns = [list(range(ncols))] 

2675 columns = self._handle_usecols(columns, columns[0]) 

2676 else: 

2677 if self.usecols is None or len(names) >= num_original_columns: 

2678 columns = self._handle_usecols([names], names) 

2679 num_original_columns = len(names) 

2680 else: 

2681 if not callable(self.usecols) and len(names) != len(self.usecols): 

2682 raise ValueError( 

2683 "Number of passed names did not match number of " 

2684 "header fields in the file" 

2685 ) 

2686 # Ignore output but set used columns. 

2687 self._handle_usecols([names], names) 

2688 columns = [names] 

2689 num_original_columns = ncols 

2690 

2691 return columns, num_original_columns, unnamed_cols 

2692 

2693 def _handle_usecols(self, columns, usecols_key): 

2694 """ 

2695 Sets self._col_indices 

2696 

2697 usecols_key is used if there are string usecols. 

2698 """ 

2699 if self.usecols is not None: 

2700 if callable(self.usecols): 

2701 col_indices = _evaluate_usecols(self.usecols, usecols_key) 

2702 elif any(isinstance(u, str) for u in self.usecols): 

2703 if len(columns) > 1: 

2704 raise ValueError( 

2705 "If using multiple headers, usecols must be integers." 

2706 ) 

2707 col_indices = [] 

2708 

2709 for col in self.usecols: 

2710 if isinstance(col, str): 

2711 try: 

2712 col_indices.append(usecols_key.index(col)) 

2713 except ValueError: 

2714 _validate_usecols_names(self.usecols, usecols_key) 

2715 else: 

2716 col_indices.append(col) 

2717 else: 

2718 col_indices = self.usecols 

2719 

2720 columns = [ 

2721 [n for i, n in enumerate(column) if i in col_indices] 

2722 for column in columns 

2723 ] 

2724 self._col_indices = col_indices 

2725 return columns 

2726 

2727 def _buffered_line(self): 

2728 """ 

2729 Return a line from buffer, filling buffer if required. 

2730 """ 

2731 if len(self.buf) > 0: 

2732 return self.buf[0] 

2733 else: 

2734 return self._next_line() 

2735 

2736 def _check_for_bom(self, first_row): 

2737 """ 

2738 Checks whether the file begins with the BOM character. 

2739 If it does, remove it. In addition, if there is quoting 

2740 in the field subsequent to the BOM, remove it as well 

2741 because it technically takes place at the beginning of 

2742 the name, not the middle of it. 

2743 """ 

2744 # first_row will be a list, so we need to check 

2745 # that that list is not empty before proceeding. 

2746 if not first_row: 

2747 return first_row 

2748 

2749 # The first element of this row is the one that could have the 

2750 # BOM that we want to remove. Check that the first element is a 

2751 # string before proceeding. 

2752 if not isinstance(first_row[0], str): 

2753 return first_row 

2754 

2755 # Check that the string is not empty, as that would 

2756 # obviously not have a BOM at the start of it. 

2757 if not first_row[0]: 

2758 return first_row 

2759 

2760 # Since the string is non-empty, check that it does 

2761 # in fact begin with a BOM. 

2762 first_elt = first_row[0][0] 

2763 if first_elt != _BOM: 

2764 return first_row 

2765 

2766 first_row_bom = first_row[0] 

2767 

2768 if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar: 

2769 start = 2 

2770 quote = first_row_bom[1] 

2771 end = first_row_bom[2:].index(quote) + 2 

2772 

2773 # Extract the data between the quotation marks 

2774 new_row = first_row_bom[start:end] 

2775 

2776 # Extract any remaining data after the second 

2777 # quotation mark. 

2778 if len(first_row_bom) > end + 1: 

2779 new_row += first_row_bom[end + 1 :] 

2780 return [new_row] + first_row[1:] 

2781 

2782 elif len(first_row_bom) > 1: 

2783 return [first_row_bom[1:]] 

2784 else: 

2785 # First row is just the BOM, so we 

2786 # return an empty string. 

2787 return [""] 

2788 

2789 def _is_line_empty(self, line): 

2790 """ 

2791 Check if a line is empty or not. 

2792 

2793 Parameters 

2794 ---------- 

2795 line : str, array-like 

2796 The line of data to check. 

2797 

2798 Returns 

2799 ------- 

2800 boolean : Whether or not the line is empty. 

2801 """ 

2802 return not line or all(not x for x in line) 

2803 

2804 def _next_line(self): 

2805 if isinstance(self.data, list): 

2806 while self.skipfunc(self.pos): 

2807 self.pos += 1 

2808 

2809 while True: 

2810 try: 

2811 line = self._check_comments([self.data[self.pos]])[0] 

2812 self.pos += 1 

2813 # either uncommented or blank to begin with 

2814 if not self.skip_blank_lines and ( 

2815 self._is_line_empty(self.data[self.pos - 1]) or line 

2816 ): 

2817 break 

2818 elif self.skip_blank_lines: 

2819 ret = self._remove_empty_lines([line]) 

2820 if ret: 

2821 line = ret[0] 

2822 break 

2823 except IndexError: 

2824 raise StopIteration 

2825 else: 

2826 while self.skipfunc(self.pos): 

2827 self.pos += 1 

2828 next(self.data) 

2829 

2830 while True: 

2831 orig_line = self._next_iter_line(row_num=self.pos + 1) 

2832 self.pos += 1 

2833 

2834 if orig_line is not None: 

2835 line = self._check_comments([orig_line])[0] 

2836 

2837 if self.skip_blank_lines: 

2838 ret = self._remove_empty_lines([line]) 

2839 

2840 if ret: 

2841 line = ret[0] 

2842 break 

2843 elif self._is_line_empty(orig_line) or line: 

2844 break 

2845 

2846 # This was the first line of the file, 

2847 # which could contain the BOM at the 

2848 # beginning of it. 

2849 if self.pos == 1: 

2850 line = self._check_for_bom(line) 

2851 

2852 self.line_pos += 1 

2853 self.buf.append(line) 

2854 return line 

2855 

2856 def _alert_malformed(self, msg, row_num): 

2857 """ 

2858 Alert a user about a malformed row. 

2859 

2860 If `self.error_bad_lines` is True, the alert will be `ParserError`. 

2861 If `self.warn_bad_lines` is True, the alert will be printed out. 

2862 

2863 Parameters 

2864 ---------- 

2865 msg : The error message to display. 

2866 row_num : The row number where the parsing error occurred. 

2867 Because this row number is displayed, we 1-index, 

2868 even though we 0-index internally. 

2869 """ 

2870 

2871 if self.error_bad_lines: 

2872 raise ParserError(msg) 

2873 elif self.warn_bad_lines: 

2874 base = f"Skipping line {row_num}: " 

2875 sys.stderr.write(base + msg + "\n") 

2876 

2877 def _next_iter_line(self, row_num): 

2878 """ 

2879 Wrapper around iterating through `self.data` (CSV source). 

2880 

2881 When a CSV error is raised, we check for specific 

2882 error messages that allow us to customize the 

2883 error message displayed to the user. 

2884 

2885 Parameters 

2886 ---------- 

2887 row_num : The row number of the line being parsed. 

2888 """ 

2889 

2890 try: 

2891 return next(self.data) 

2892 except csv.Error as e: 

2893 if self.warn_bad_lines or self.error_bad_lines: 

2894 msg = str(e) 

2895 

2896 if "NULL byte" in msg or "line contains NUL" in msg: 

2897 msg = ( 

2898 "NULL byte detected. This byte " 

2899 "cannot be processed in Python's " 

2900 "native csv library at the moment, " 

2901 "so please pass in engine='c' instead" 

2902 ) 

2903 

2904 if self.skipfooter > 0: 

2905 reason = ( 

2906 "Error could possibly be due to " 

2907 "parsing errors in the skipped footer rows " 

2908 "(the skipfooter keyword is only applied " 

2909 "after Python's csv library has parsed " 

2910 "all rows)." 

2911 ) 

2912 msg += ". " + reason 

2913 

2914 self._alert_malformed(msg, row_num) 

2915 return None 

2916 

2917 def _check_comments(self, lines): 

2918 if self.comment is None: 

2919 return lines 

2920 ret = [] 

2921 for l in lines: 

2922 rl = [] 

2923 for x in l: 

2924 if not isinstance(x, str) or self.comment not in x: 

2925 rl.append(x) 

2926 else: 

2927 x = x[: x.find(self.comment)] 

2928 if len(x) > 0: 

2929 rl.append(x) 

2930 break 

2931 ret.append(rl) 

2932 return ret 

2933 

2934 def _remove_empty_lines(self, lines): 

2935 """ 

2936 Iterate through the lines and remove any that are 

2937 either empty or contain only one whitespace value 

2938 

2939 Parameters 

2940 ---------- 

2941 lines : array-like 

2942 The array of lines that we are to filter. 

2943 

2944 Returns 

2945 ------- 

2946 filtered_lines : array-like 

2947 The same array of lines with the "empty" ones removed. 

2948 """ 

2949 

2950 ret = [] 

2951 for l in lines: 

2952 # Remove empty lines and lines with only one whitespace value 

2953 if ( 

2954 len(l) > 1 

2955 or len(l) == 1 

2956 and (not isinstance(l[0], str) or l[0].strip()) 

2957 ): 

2958 ret.append(l) 

2959 return ret 

2960 

2961 def _check_thousands(self, lines): 

2962 if self.thousands is None: 

2963 return lines 

2964 

2965 return self._search_replace_num_columns( 

2966 lines=lines, search=self.thousands, replace="" 

2967 ) 

2968 

2969 def _search_replace_num_columns(self, lines, search, replace): 

2970 ret = [] 

2971 for l in lines: 

2972 rl = [] 

2973 for i, x in enumerate(l): 

2974 if ( 

2975 not isinstance(x, str) 

2976 or search not in x 

2977 or (self._no_thousands_columns and i in self._no_thousands_columns) 

2978 or self.nonnum.search(x.strip()) 

2979 ): 

2980 rl.append(x) 

2981 else: 

2982 rl.append(x.replace(search, replace)) 

2983 ret.append(rl) 

2984 return ret 

2985 

2986 def _check_decimal(self, lines): 

2987 if self.decimal == _parser_defaults["decimal"]: 

2988 return lines 

2989 

2990 return self._search_replace_num_columns( 

2991 lines=lines, search=self.decimal, replace="." 

2992 ) 

2993 

2994 def _clear_buffer(self): 

2995 self.buf = [] 

2996 

2997 _implicit_index = False 

2998 

2999 def _get_index_name(self, columns): 

3000 """ 

3001 Try several cases to get lines: 

3002 

3003 0) There are headers on row 0 and row 1 and their 

3004 total summed lengths equals the length of the next line. 

3005 Treat row 0 as columns and row 1 as indices 

3006 1) Look for implicit index: there are more columns 

3007 on row 1 than row 0. If this is true, assume that row 

3008 1 lists index columns and row 0 lists normal columns. 

3009 2) Get index from the columns if it was listed. 

3010 """ 

3011 orig_names = list(columns) 

3012 columns = list(columns) 

3013 

3014 try: 

3015 line = self._next_line() 

3016 except StopIteration: 

3017 line = None 

3018 

3019 try: 

3020 next_line = self._next_line() 

3021 except StopIteration: 

3022 next_line = None 

3023 

3024 # implicitly index_col=0 b/c 1 fewer column names 

3025 implicit_first_cols = 0 

3026 if line is not None: 

3027 # leave it 0, #2442 

3028 # Case 1 

3029 if self.index_col is not False: 

3030 implicit_first_cols = len(line) - self.num_original_columns 

3031 

3032 # Case 0 

3033 if next_line is not None: 

3034 if len(next_line) == len(line) + self.num_original_columns: 

3035 # column and index names on diff rows 

3036 self.index_col = list(range(len(line))) 

3037 self.buf = self.buf[1:] 

3038 

3039 for c in reversed(line): 

3040 columns.insert(0, c) 

3041 

3042 # Update list of original names to include all indices. 

3043 orig_names = list(columns) 

3044 self.num_original_columns = len(columns) 

3045 return line, orig_names, columns 

3046 

3047 if implicit_first_cols > 0: 

3048 # Case 1 

3049 self._implicit_index = True 

3050 if self.index_col is None: 

3051 self.index_col = list(range(implicit_first_cols)) 

3052 

3053 index_name = None 

3054 

3055 else: 

3056 # Case 2 

3057 (index_name, columns_, self.index_col) = _clean_index_names( 

3058 columns, self.index_col, self.unnamed_cols 

3059 ) 

3060 

3061 return index_name, orig_names, columns 

3062 

3063 def _rows_to_cols(self, content): 

3064 col_len = self.num_original_columns 

3065 

3066 if self._implicit_index: 

3067 col_len += len(self.index_col) 

3068 

3069 max_len = max(len(row) for row in content) 

3070 

3071 # Check that there are no rows with too many 

3072 # elements in their row (rows with too few 

3073 # elements are padded with NaN). 

3074 if max_len > col_len and self.index_col is not False and self.usecols is None: 

3075 

3076 footers = self.skipfooter if self.skipfooter else 0 

3077 bad_lines = [] 

3078 

3079 iter_content = enumerate(content) 

3080 content_len = len(content) 

3081 content = [] 

3082 

3083 for (i, l) in iter_content: 

3084 actual_len = len(l) 

3085 

3086 if actual_len > col_len: 

3087 if self.error_bad_lines or self.warn_bad_lines: 

3088 row_num = self.pos - (content_len - i + footers) 

3089 bad_lines.append((row_num, actual_len)) 

3090 

3091 if self.error_bad_lines: 

3092 break 

3093 else: 

3094 content.append(l) 

3095 

3096 for row_num, actual_len in bad_lines: 

3097 msg = ( 

3098 f"Expected {col_len} fields in line {row_num + 1}, saw " 

3099 f"{actual_len}" 

3100 ) 

3101 if ( 

3102 self.delimiter 

3103 and len(self.delimiter) > 1 

3104 and self.quoting != csv.QUOTE_NONE 

3105 ): 

3106 # see gh-13374 

3107 reason = ( 

3108 "Error could possibly be due to quotes being " 

3109 "ignored when a multi-char delimiter is used." 

3110 ) 

3111 msg += ". " + reason 

3112 

3113 self._alert_malformed(msg, row_num + 1) 

3114 

3115 # see gh-13320 

3116 zipped_content = list(lib.to_object_array(content, min_width=col_len).T) 

3117 

3118 if self.usecols: 

3119 if self._implicit_index: 

3120 zipped_content = [ 

3121 a 

3122 for i, a in enumerate(zipped_content) 

3123 if ( 

3124 i < len(self.index_col) 

3125 or i - len(self.index_col) in self._col_indices 

3126 ) 

3127 ] 

3128 else: 

3129 zipped_content = [ 

3130 a for i, a in enumerate(zipped_content) if i in self._col_indices 

3131 ] 

3132 return zipped_content 

3133 

3134 def _get_lines(self, rows=None): 

3135 lines = self.buf 

3136 new_rows = None 

3137 

3138 # already fetched some number 

3139 if rows is not None: 

3140 # we already have the lines in the buffer 

3141 if len(self.buf) >= rows: 

3142 new_rows, self.buf = self.buf[:rows], self.buf[rows:] 

3143 

3144 # need some lines 

3145 else: 

3146 rows -= len(self.buf) 

3147 

3148 if new_rows is None: 

3149 if isinstance(self.data, list): 

3150 if self.pos > len(self.data): 

3151 raise StopIteration 

3152 if rows is None: 

3153 new_rows = self.data[self.pos :] 

3154 new_pos = len(self.data) 

3155 else: 

3156 new_rows = self.data[self.pos : self.pos + rows] 

3157 new_pos = self.pos + rows 

3158 

3159 # Check for stop rows. n.b.: self.skiprows is a set. 

3160 if self.skiprows: 

3161 new_rows = [ 

3162 row 

3163 for i, row in enumerate(new_rows) 

3164 if not self.skipfunc(i + self.pos) 

3165 ] 

3166 

3167 lines.extend(new_rows) 

3168 self.pos = new_pos 

3169 

3170 else: 

3171 new_rows = [] 

3172 try: 

3173 if rows is not None: 

3174 for _ in range(rows): 

3175 new_rows.append(next(self.data)) 

3176 lines.extend(new_rows) 

3177 else: 

3178 rows = 0 

3179 

3180 while True: 

3181 new_row = self._next_iter_line(row_num=self.pos + rows + 1) 

3182 rows += 1 

3183 

3184 if new_row is not None: 

3185 new_rows.append(new_row) 

3186 

3187 except StopIteration: 

3188 if self.skiprows: 

3189 new_rows = [ 

3190 row 

3191 for i, row in enumerate(new_rows) 

3192 if not self.skipfunc(i + self.pos) 

3193 ] 

3194 lines.extend(new_rows) 

3195 if len(lines) == 0: 

3196 raise 

3197 self.pos += len(new_rows) 

3198 

3199 self.buf = [] 

3200 else: 

3201 lines = new_rows 

3202 

3203 if self.skipfooter: 

3204 lines = lines[: -self.skipfooter] 

3205 

3206 lines = self._check_comments(lines) 

3207 if self.skip_blank_lines: 

3208 lines = self._remove_empty_lines(lines) 

3209 lines = self._check_thousands(lines) 

3210 return self._check_decimal(lines) 

3211 

3212 

3213def _make_date_converter( 

3214 date_parser=None, dayfirst=False, infer_datetime_format=False, cache_dates=True 

3215): 

3216 def converter(*date_cols): 

3217 if date_parser is None: 

3218 strs = parsing._concat_date_cols(date_cols) 

3219 

3220 try: 

3221 return tools.to_datetime( 

3222 ensure_object(strs), 

3223 utc=None, 

3224 dayfirst=dayfirst, 

3225 errors="ignore", 

3226 infer_datetime_format=infer_datetime_format, 

3227 cache=cache_dates, 

3228 ).to_numpy() 

3229 

3230 except ValueError: 

3231 return tools.to_datetime( 

3232 parsing.try_parse_dates(strs, dayfirst=dayfirst), cache=cache_dates 

3233 ) 

3234 else: 

3235 try: 

3236 result = tools.to_datetime( 

3237 date_parser(*date_cols), errors="ignore", cache=cache_dates 

3238 ) 

3239 if isinstance(result, datetime.datetime): 

3240 raise Exception("scalar parser") 

3241 return result 

3242 except Exception: 

3243 try: 

3244 return tools.to_datetime( 

3245 parsing.try_parse_dates( 

3246 parsing._concat_date_cols(date_cols), 

3247 parser=date_parser, 

3248 dayfirst=dayfirst, 

3249 ), 

3250 errors="ignore", 

3251 ) 

3252 except Exception: 

3253 return generic_parser(date_parser, *date_cols) 

3254 

3255 return converter 

3256 

3257 

3258def _process_date_conversion( 

3259 data_dict, 

3260 converter, 

3261 parse_spec, 

3262 index_col, 

3263 index_names, 

3264 columns, 

3265 keep_date_col=False, 

3266): 

3267 def _isindex(colspec): 

3268 return (isinstance(index_col, list) and colspec in index_col) or ( 

3269 isinstance(index_names, list) and colspec in index_names 

3270 ) 

3271 

3272 new_cols = [] 

3273 new_data = {} 

3274 

3275 orig_names = columns 

3276 columns = list(columns) 

3277 

3278 date_cols = set() 

3279 

3280 if parse_spec is None or isinstance(parse_spec, bool): 

3281 return data_dict, columns 

3282 

3283 if isinstance(parse_spec, list): 

3284 # list of column lists 

3285 for colspec in parse_spec: 

3286 if is_scalar(colspec): 

3287 if isinstance(colspec, int) and colspec not in data_dict: 

3288 colspec = orig_names[colspec] 

3289 if _isindex(colspec): 

3290 continue 

3291 data_dict[colspec] = converter(data_dict[colspec]) 

3292 else: 

3293 new_name, col, old_names = _try_convert_dates( 

3294 converter, colspec, data_dict, orig_names 

3295 ) 

3296 if new_name in data_dict: 

3297 raise ValueError(f"New date column already in dict {new_name}") 

3298 new_data[new_name] = col 

3299 new_cols.append(new_name) 

3300 date_cols.update(old_names) 

3301 

3302 elif isinstance(parse_spec, dict): 

3303 # dict of new name to column list 

3304 for new_name, colspec in parse_spec.items(): 

3305 if new_name in data_dict: 

3306 raise ValueError(f"Date column {new_name} already in dict") 

3307 

3308 _, col, old_names = _try_convert_dates( 

3309 converter, colspec, data_dict, orig_names 

3310 ) 

3311 

3312 new_data[new_name] = col 

3313 new_cols.append(new_name) 

3314 date_cols.update(old_names) 

3315 

3316 data_dict.update(new_data) 

3317 new_cols.extend(columns) 

3318 

3319 if not keep_date_col: 

3320 for c in list(date_cols): 

3321 data_dict.pop(c) 

3322 new_cols.remove(c) 

3323 

3324 return data_dict, new_cols 

3325 

3326 

3327def _try_convert_dates(parser, colspec, data_dict, columns): 

3328 colset = set(columns) 

3329 colnames = [] 

3330 

3331 for c in colspec: 

3332 if c in colset: 

3333 colnames.append(c) 

3334 elif isinstance(c, int) and c not in columns: 

3335 colnames.append(columns[c]) 

3336 else: 

3337 colnames.append(c) 

3338 

3339 new_name = "_".join(str(x) for x in colnames) 

3340 to_parse = [data_dict[c] for c in colnames if c in data_dict] 

3341 

3342 new_col = parser(*to_parse) 

3343 return new_name, new_col, colnames 

3344 

3345 

3346def _clean_na_values(na_values, keep_default_na=True): 

3347 

3348 if na_values is None: 

3349 if keep_default_na: 

3350 na_values = STR_NA_VALUES 

3351 else: 

3352 na_values = set() 

3353 na_fvalues = set() 

3354 elif isinstance(na_values, dict): 

3355 old_na_values = na_values.copy() 

3356 na_values = {} # Prevent aliasing. 

3357 

3358 # Convert the values in the na_values dictionary 

3359 # into array-likes for further use. This is also 

3360 # where we append the default NaN values, provided 

3361 # that `keep_default_na=True`. 

3362 for k, v in old_na_values.items(): 

3363 if not is_list_like(v): 

3364 v = [v] 

3365 

3366 if keep_default_na: 

3367 v = set(v) | STR_NA_VALUES 

3368 

3369 na_values[k] = v 

3370 na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} 

3371 else: 

3372 if not is_list_like(na_values): 

3373 na_values = [na_values] 

3374 na_values = _stringify_na_values(na_values) 

3375 if keep_default_na: 

3376 na_values = na_values | STR_NA_VALUES 

3377 

3378 na_fvalues = _floatify_na_values(na_values) 

3379 

3380 return na_values, na_fvalues 

3381 

3382 

3383def _clean_index_names(columns, index_col, unnamed_cols): 

3384 if not _is_index_col(index_col): 

3385 return None, columns, index_col 

3386 

3387 columns = list(columns) 

3388 

3389 cp_cols = list(columns) 

3390 index_names = [] 

3391 

3392 # don't mutate 

3393 index_col = list(index_col) 

3394 

3395 for i, c in enumerate(index_col): 

3396 if isinstance(c, str): 

3397 index_names.append(c) 

3398 for j, name in enumerate(cp_cols): 

3399 if name == c: 

3400 index_col[i] = j 

3401 columns.remove(name) 

3402 break 

3403 else: 

3404 name = cp_cols[c] 

3405 columns.remove(name) 

3406 index_names.append(name) 

3407 

3408 # Only clean index names that were placeholders. 

3409 for i, name in enumerate(index_names): 

3410 if isinstance(name, str) and name in unnamed_cols: 

3411 index_names[i] = None 

3412 

3413 return index_names, columns, index_col 

3414 

3415 

3416def _get_empty_meta(columns, index_col, index_names, dtype=None): 

3417 columns = list(columns) 

3418 

3419 # Convert `dtype` to a defaultdict of some kind. 

3420 # This will enable us to write `dtype[col_name]` 

3421 # without worrying about KeyError issues later on. 

3422 if not isinstance(dtype, dict): 

3423 # if dtype == None, default will be np.object. 

3424 default_dtype = dtype or np.object 

3425 dtype = defaultdict(lambda: default_dtype) 

3426 else: 

3427 # Save a copy of the dictionary. 

3428 _dtype = dtype.copy() 

3429 dtype = defaultdict(lambda: np.object) 

3430 

3431 # Convert column indexes to column names. 

3432 for k, v in _dtype.items(): 

3433 col = columns[k] if is_integer(k) else k 

3434 dtype[col] = v 

3435 

3436 # Even though we have no data, the "index" of the empty DataFrame 

3437 # could for example still be an empty MultiIndex. Thus, we need to 

3438 # check whether we have any index columns specified, via either: 

3439 # 

3440 # 1) index_col (column indices) 

3441 # 2) index_names (column names) 

3442 # 

3443 # Both must be non-null to ensure a successful construction. Otherwise, 

3444 # we have to create a generic empty Index. 

3445 if (index_col is None or index_col is False) or index_names is None: 

3446 index = Index([]) 

3447 else: 

3448 data = [Series([], dtype=dtype[name]) for name in index_names] 

3449 index = ensure_index_from_sequences(data, names=index_names) 

3450 index_col.sort() 

3451 

3452 for i, n in enumerate(index_col): 

3453 columns.pop(n - i) 

3454 

3455 col_dict = {col_name: Series([], dtype=dtype[col_name]) for col_name in columns} 

3456 

3457 return index, columns, col_dict 

3458 

3459 

3460def _floatify_na_values(na_values): 

3461 # create float versions of the na_values 

3462 result = set() 

3463 for v in na_values: 

3464 try: 

3465 v = float(v) 

3466 if not np.isnan(v): 

3467 result.add(v) 

3468 except (TypeError, ValueError, OverflowError): 

3469 pass 

3470 return result 

3471 

3472 

3473def _stringify_na_values(na_values): 

3474 """ return a stringified and numeric for these values """ 

3475 result = [] 

3476 for x in na_values: 

3477 result.append(str(x)) 

3478 result.append(x) 

3479 try: 

3480 v = float(x) 

3481 

3482 # we are like 999 here 

3483 if v == int(v): 

3484 v = int(v) 

3485 result.append(f"{v}.0") 

3486 result.append(str(v)) 

3487 

3488 result.append(v) 

3489 except (TypeError, ValueError, OverflowError): 

3490 pass 

3491 try: 

3492 result.append(int(x)) 

3493 except (TypeError, ValueError, OverflowError): 

3494 pass 

3495 return set(result) 

3496 

3497 

3498def _get_na_values(col, na_values, na_fvalues, keep_default_na): 

3499 """ 

3500 Get the NaN values for a given column. 

3501 

3502 Parameters 

3503 ---------- 

3504 col : str 

3505 The name of the column. 

3506 na_values : array-like, dict 

3507 The object listing the NaN values as strings. 

3508 na_fvalues : array-like, dict 

3509 The object listing the NaN values as floats. 

3510 keep_default_na : bool 

3511 If `na_values` is a dict, and the column is not mapped in the 

3512 dictionary, whether to return the default NaN values or the empty set. 

3513 

3514 Returns 

3515 ------- 

3516 nan_tuple : A length-two tuple composed of 

3517 

3518 1) na_values : the string NaN values for that column. 

3519 2) na_fvalues : the float NaN values for that column. 

3520 """ 

3521 

3522 if isinstance(na_values, dict): 

3523 if col in na_values: 

3524 return na_values[col], na_fvalues[col] 

3525 else: 

3526 if keep_default_na: 

3527 return STR_NA_VALUES, set() 

3528 

3529 return set(), set() 

3530 else: 

3531 return na_values, na_fvalues 

3532 

3533 

3534def _get_col_names(colspec, columns): 

3535 colset = set(columns) 

3536 colnames = [] 

3537 for c in colspec: 

3538 if c in colset: 

3539 colnames.append(c) 

3540 elif isinstance(c, int): 

3541 colnames.append(columns[c]) 

3542 return colnames 

3543 

3544 

3545class FixedWidthReader(abc.Iterator): 

3546 """ 

3547 A reader of fixed-width lines. 

3548 """ 

3549 

3550 def __init__(self, f, colspecs, delimiter, comment, skiprows=None, infer_nrows=100): 

3551 self.f = f 

3552 self.buffer = None 

3553 self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t " 

3554 self.comment = comment 

3555 if colspecs == "infer": 

3556 self.colspecs = self.detect_colspecs( 

3557 infer_nrows=infer_nrows, skiprows=skiprows 

3558 ) 

3559 else: 

3560 self.colspecs = colspecs 

3561 

3562 if not isinstance(self.colspecs, (tuple, list)): 

3563 raise TypeError( 

3564 "column specifications must be a list or tuple, " 

3565 f"input was a {type(colspecs).__name__}" 

3566 ) 

3567 

3568 for colspec in self.colspecs: 

3569 if not ( 

3570 isinstance(colspec, (tuple, list)) 

3571 and len(colspec) == 2 

3572 and isinstance(colspec[0], (int, np.integer, type(None))) 

3573 and isinstance(colspec[1], (int, np.integer, type(None))) 

3574 ): 

3575 raise TypeError( 

3576 "Each column specification must be " 

3577 "2 element tuple or list of integers" 

3578 ) 

3579 

3580 def get_rows(self, infer_nrows, skiprows=None): 

3581 """ 

3582 Read rows from self.f, skipping as specified. 

3583 

3584 We distinguish buffer_rows (the first <= infer_nrows 

3585 lines) from the rows returned to detect_colspecs 

3586 because it's simpler to leave the other locations 

3587 with skiprows logic alone than to modify them to 

3588 deal with the fact we skipped some rows here as 

3589 well. 

3590 

3591 Parameters 

3592 ---------- 

3593 infer_nrows : int 

3594 Number of rows to read from self.f, not counting 

3595 rows that are skipped. 

3596 skiprows: set, optional 

3597 Indices of rows to skip. 

3598 

3599 Returns 

3600 ------- 

3601 detect_rows : list of str 

3602 A list containing the rows to read. 

3603 

3604 """ 

3605 if skiprows is None: 

3606 skiprows = set() 

3607 buffer_rows = [] 

3608 detect_rows = [] 

3609 for i, row in enumerate(self.f): 

3610 if i not in skiprows: 

3611 detect_rows.append(row) 

3612 buffer_rows.append(row) 

3613 if len(detect_rows) >= infer_nrows: 

3614 break 

3615 self.buffer = iter(buffer_rows) 

3616 return detect_rows 

3617 

3618 def detect_colspecs(self, infer_nrows=100, skiprows=None): 

3619 # Regex escape the delimiters 

3620 delimiters = "".join(r"\{}".format(x) for x in self.delimiter) 

3621 pattern = re.compile("([^{}]+)".format(delimiters)) 

3622 rows = self.get_rows(infer_nrows, skiprows) 

3623 if not rows: 

3624 raise EmptyDataError("No rows from which to infer column width") 

3625 max_len = max(map(len, rows)) 

3626 mask = np.zeros(max_len + 1, dtype=int) 

3627 if self.comment is not None: 

3628 rows = [row.partition(self.comment)[0] for row in rows] 

3629 for row in rows: 

3630 for m in pattern.finditer(row): 

3631 mask[m.start() : m.end()] = 1 

3632 shifted = np.roll(mask, 1) 

3633 shifted[0] = 0 

3634 edges = np.where((mask ^ shifted) == 1)[0] 

3635 edge_pairs = list(zip(edges[::2], edges[1::2])) 

3636 return edge_pairs 

3637 

3638 def __next__(self): 

3639 if self.buffer is not None: 

3640 try: 

3641 line = next(self.buffer) 

3642 except StopIteration: 

3643 self.buffer = None 

3644 line = next(self.f) 

3645 else: 

3646 line = next(self.f) 

3647 # Note: 'colspecs' is a sequence of half-open intervals. 

3648 return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs] 

3649 

3650 

3651class FixedWidthFieldParser(PythonParser): 

3652 """ 

3653 Specialization that Converts fixed-width fields into DataFrames. 

3654 See PythonParser for details. 

3655 """ 

3656 

3657 def __init__(self, f, **kwds): 

3658 # Support iterators, convert to a list. 

3659 self.colspecs = kwds.pop("colspecs") 

3660 self.infer_nrows = kwds.pop("infer_nrows") 

3661 PythonParser.__init__(self, f, **kwds) 

3662 

3663 def _make_reader(self, f): 

3664 self.data = FixedWidthReader( 

3665 f, 

3666 self.colspecs, 

3667 self.delimiter, 

3668 self.comment, 

3669 self.skiprows, 

3670 self.infer_nrows, 

3671 )