Coverage for /var/devmt/py/utils4_1.7.0/utils4/reader.py: 100%

42 statements  

« prev     ^ index     » next       coverage.py v7.6.9, created at 2024-12-21 17:18 +0000

1#!/usr/bin/env python 

2# -*- coding: utf-8 -*- 

3""" 

4:Purpose: This module contains purpose-built data readers for formats 

5 which are no longer supported, namely: 

6 

7 - **.xls:** pre-Excel 5.0/95 Workbook 

8 

9:Platform: Linux/Windows | Python 3.7+ 

10:Developer: J Berendt 

11:Email: development@s3dev.uk 

12 

13:Comments: n/a 

14 

15:Example: 

16 

17 Example for reading an old-style .xls (pre-Excel 5.0/95) Workbook into a 

18 DataFrame:: 

19 

20 >>> from utils4.reader import reader 

21 >>> df = reader.read_xls('/path/to/file.xls') 

22 

23""" 

24# pylint: disable=invalid-name 

25# pylint: disable=wrong-import-order 

26 

import csv
import io
import os
from datetime import datetime as dt

import pandas as pd
import xlrd

32 

33 

class Reader:
    """Class wrapper for various data reading methods.

    For details on each reader, refer to the docstring for that reader.

    """

    def read_xls(self,
                 filepath: str,
                 *,
                 encoding: str=None,
                 sheet_index: int=0,
                 skiprows: int=0,
                 skipcols: int=0,
                 chopcols: int=0,
                 date_formats: dict=None,
                 errors: str='coerce',
                 fill_date_errors: bool=False) -> pd.DataFrame:
        """Read a pre-Excel 5.0/95 .XLS file into a DataFrame.

        This function is designed to deal with *old* XLS files which
        the ``pandas.read_excel`` function *does not support*.

        Args:
            filepath (str): Full path to the file to be read.
            encoding (str, optional): Encoding used to read the XLS file;
                passed to ``xlrd.open_workbook`` as ``encoding_override``.
                Defaults to None.
            sheet_index (int, optional): Index of the sheet to be read,
                zero-based. Defaults to 0.
            skiprows (int, optional): Number of rows to skip (from the
                beginning of the sheet). Defaults to 0.
            skipcols (int, optional): Number of columns to skip (from the
                left). Defaults to 0.
            chopcols (int, optional): Number of columns to skip/chop (from
                the right). Defaults to 0.
            date_formats (dict, optional): Dictionary of
                ``{col_name: strftime_mask}``. Each listed column is
                converted with :func:`pandas.to_datetime`.
                Defaults to None.
            errors (str, optional): Method used by
                :func:`pandas.to_datetime` to resolve date parsing errors.
                Defaults to 'coerce'.
            fill_date_errors (bool, optional): Fill missing (NaT) values in
                the ``date_formats`` columns with '1900-01-01'.
                Defaults to False.

        :Logic:
            The passed XLS file is opened and parsed by the ``xlrd``
            library. Each row is serialised as a CSV line into an in-memory
            stream buffer, which is passed into ``pandas.read_csv`` for
            conversion to a DataFrame. The first row written to the buffer
            (i.e. the first row after ``skiprows``) is therefore consumed
            by ``pandas.read_csv`` as the header.

        Raises:
            ValueError: If the file extension is not ``.xls``.
            IOError: If the worksheet does not contain any rows of data.
            StopIteration: If ``skiprows`` is greater than the number of
                rows in the worksheet.

        Returns:
            df (pd.DataFrame): A DataFrame containing the contents of
            the XLS file.

        """
        if os.path.splitext(filepath)[1].lower() != '.xls':
            raise ValueError('The file *must* be an XLS file.')
        # A chop count of zero must become None; a slice end of -0 would
        # otherwise drop every column.
        chopcols = -chopcols if chopcols else None
        # formatting_info=True is required: _extract_row inspects each cell's
        # xf_index, which xlrd only populates when formatting info is loaded.
        wb = xlrd.open_workbook(filepath, encoding_override=encoding, formatting_info=True)
        ws = wb.sheet_by_index(sheet_index)
        if not ws.nrows:
            raise IOError('This workbook does not contain any rows of data.')
        rows = ws.get_rows()
        for _ in range(skiprows):
            next(rows)
        # The context manager guarantees the buffer is released, even if
        # row extraction or CSV parsing raises.
        with io.StringIO(newline='\n') as stream:
            for r in rows:
                stream.write(self._extract_row(row=r[skipcols:chopcols]) + '\n')
            stream.seek(0)
            df = pd.read_csv(stream)
        if date_formats:
            for col, fmt in date_formats.items():
                df[col] = pd.to_datetime(df[col], format=fmt, errors=errors)
                if fill_date_errors:
                    # Assignment (rather than inplace=True) avoids the pandas
                    # v3.0 deprecation warnings.
                    df[col] = df[col].fillna(dt(1900, 1, 1))
        return df

    @staticmethod
    def _extract_row(row: iter) -> str:
        """Extract and parse a row into a single CSV-formatted line.

        Each cell value is converted to a string. Values which contain a
        comma, a double quote or a line break are quoted (and embedded
        quotes doubled) by the ``csv`` module, so the line can be parsed
        back by ``pandas.read_csv`` without shifting columns.

        Args:
            row (iter): Iterable of cell objects, each exposing ``value``
                and ``xf_index`` attributes (e.g. ``xlrd`` cells).

        Returns:
            str: A string containing all row values, separated by commas,
            *without* a trailing line terminator.

        """
        values = []
        for cell in row:
            # NOTE(review): XF index 62 is treated as the marker for a date
            # cell and the 1900-based datemode (0) is assumed -- confirm both
            # hold for all workbooks passed to this reader.
            if cell.xf_index == 62:
                value = xlrd.xldate.xldate_as_datetime(cell.value, 0)
            else:
                value = cell.value
            values.append(str(value))
        buffer = io.StringIO(newline='')
        csv.writer(buffer, lineterminator='\n').writerow(values)
        # Strip the single line terminator appended by the writer; the caller
        # adds its own.
        return buffer.getvalue()[:-1]


# Shared module-level instance, used as ``from utils4.reader import reader``.
reader = Reader()