Coverage for /var/devmt/py/utils4_1.7.0/utils4/reader.py: 100%
42 statements
« prev ^ index » next coverage.py v7.6.9, created at 2024-12-21 17:18 +0000
« prev ^ index » next coverage.py v7.6.9, created at 2024-12-21 17:18 +0000
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3"""
4:Purpose: This module contains purpose-built data readers for formats
5 which are no longer supported, namely:
7 - **.xls:** pre-Excel 5.0/95 Workbook
9:Platform: Linux/Windows | Python 3.7+
10:Developer: J Berendt
11:Email: development@s3dev.uk
13:Comments: n/a
15:Example:
17 Example for reading an old-style .xls (pre-Excel 5.0/95) Workbook into a
18 DataFrame::
20 >>> from utils4.reader import reader
21 >>> df = reader.read_xls('/path/to/file.xls')
23"""
24# pylint: disable=invalid-name
25# pylint: disable=wrong-import-order
import csv
import io
import os
from datetime import datetime as dt

import pandas as pd
import xlrd
class Reader:
    """Class wrapper for various data reading methods.

    For details on each reader, refer to the docstring for that reader.

    """

    # Cell format (XF) index which this reader treats as holding a date.
    # NOTE(review): XF indexes are assigned per workbook; 62 is presumably
    # the index used by the legacy files this reader targets -- confirm
    # before relying on it for other sources.
    _XF_INDEX_DATE = 62

    def read_xls(self,
                 filepath: str,
                 *,
                 encoding: str=None,
                 sheet_index: int=0,
                 skiprows: int=0,
                 skipcols: int=0,
                 chopcols: int=0,
                 date_formats: dict=None,
                 errors: str='coerce',
                 fill_date_errors: bool=False) -> pd.DataFrame:
        """Read a pre-Excel 5.0/95 .XLS file into a DataFrame.

        This function is designed to deal with *old* XLS files which
        the ``pandas.read_excel`` function *does not support*.

        Args:
            filepath (str): Full path to the file to be read.
            encoding (str, optional): Encoding used to read the XLS file.
                Defaults to None.
            sheet_index (int, optional): Index of the sheet to be read,
                zero-based. Defaults to 0.
            skiprows (int, optional): Number of rows to skip (from the
                beginning of the file). Defaults to 0.
            skipcols (int, optional): Number of columns to skip (from the
                left). Defaults to 0.
            chopcols (int, optional): Number of columns to skip/chop (from
                the right). Defaults to 0.
            date_formats (dict, optional): Dictionary of
                ``{col_name: strftime_mask}``. Defaults to None.
            errors (str, optional): Passed to :func:`pandas.to_datetime` as
                its ``errors`` argument, when converting the columns listed
                in ``date_formats``. Defaults to 'coerce'.
            fill_date_errors (bool, optional): Fill coerced NaT date errors
                with '1900-01-01'. Defaults to False.

        :Logic:
            The passed XLS file is opened and parsed by the ``xlrd`` library,
            then each row is written as a CSV line into an in-memory stream
            buffer, which is passed into ``pandas.read_csv`` function for
            conversion to a DataFrame.

            Cell values containing a comma, double quote or line break are
            CSV-quoted, so they arrive in the DataFrame as a single value.

        Raises:
            ValueError: If the file extension is not ``.xls``.
            IOError: If the workbook does not contain any rows of data.

        Returns:
            df (pd.DataFrame): A DataFrame containing the contents of
            the XLS file.

        """
        if os.path.splitext(filepath)[1].lower() != '.xls':
            raise ValueError('The file *must* be an XLS file.')
        # A zero chop must become None, as a slice ending at -0 is empty.
        chopcols = -chopcols if chopcols else None
        wb = xlrd.open_workbook(filepath, encoding_override=encoding, formatting_info=True)
        ws = wb.sheet_by_index(sheet_index)
        if not ws.nrows:
            raise IOError('This workbook does not contain any rows of data.')
        rows = ws.get_rows()
        if skiprows:
            for _ in range(skiprows):
                next(rows)
        # The context manager guarantees the buffer is released, even if
        # a row fails to parse.
        with io.StringIO(newline='\n') as stream:
            for r in rows:
                # Ensure xldate formats are parsed correctly, using the
                # date system (1900 or 1904) the workbook was saved with.
                data = self._extract_row(row=r[skipcols:chopcols], datemode=wb.datemode)
                stream.write(data + '\n')
            stream.seek(0)
            df = pd.read_csv(stream)
        if date_formats:
            for col, fmt in date_formats.items():
                df[col] = pd.to_datetime(df[col], format=fmt, errors=errors)
                if fill_date_errors:
                    # Assigned back (rather than inplace=True) due to pandas
                    # v3.0 deprecation warnings.
                    df[col] = df[col].fillna(dt(1900, 1, 1))
        return df

    @staticmethod
    def _extract_row(row: iter, datemode: int=0) -> str:
        """Extract and parse each row into a single CSV-formatted line.

        Args:
            row (iter): Iterable of cell objects, each exposing ``value``
                and ``xf_index`` attributes.
            datemode (int, optional): Date system passed to
                ``xlrd.xldate.xldate_as_datetime`` when converting date
                cells. Defaults to 0.

        :Logic:
            Each value is converted to a string and the values are joined
            by a comma. A value containing a comma, double quote or line
            break is quoted (with embedded quotes doubled), so the line
            can be parsed back into the same number of fields.

        Returns:
            str: A string containing all row values, comma separated,
            without a trailing line terminator.

        """
        def _value_generator(row: iter) -> str:
            """Parse each row value based on its ``xf_index`` value.

            Args:
                row (iter): Iterable object.

            Yields:
                str: Each parsed value from the iterable.

            """
            for cell in row:
                val = cell.value
                # Only numeric values can be date serials; a blank or text
                # cell carrying the date format is passed through as-is,
                # rather than failing the whole read.
                if cell.xf_index == Reader._XF_INDEX_DATE and isinstance(val, (int, float)):
                    val = xlrd.xldate.xldate_as_datetime(val, datemode)
                yield str(val)

        # The writer's terminator includes both CR and LF so that either
        # character inside a value triggers quoting; it is stripped from the
        # returned line, as the caller appends its own newline.
        terminator = '\r\n'
        buffer = io.StringIO(newline='')
        csv.writer(buffer, lineterminator=terminator).writerow(_value_generator(row=row))
        return buffer.getvalue()[:-len(terminator)]


# Ready-made instance, imported by callers as ``from utils4.reader import reader``.
reader = Reader()