Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/openpyxl/worksheet/_reader.py : 28%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Copyright (c) 2010-2020 openpyxl
3"""Reader for a single worksheet."""
4import traceback
5from warnings import warn
7# compatibility imports
8from openpyxl.xml.functions import iterparse
10# package imports
11from openpyxl.cell import Cell, MergedCell
12from openpyxl.cell.text import Text
13from openpyxl.worksheet.dimensions import (
14 ColumnDimension,
15 RowDimension,
16 SheetFormatProperties,
17)
19from openpyxl.xml.constants import (
20 SHEET_MAIN_NS,
21 EXT_TYPES,
22)
23from openpyxl.formatting.formatting import ConditionalFormatting
24from openpyxl.formula.translate import Translator
25from openpyxl.utils import (
26 get_column_letter,
27 coordinate_to_tuple,
28 )
29from openpyxl.utils.datetime import from_excel, from_ISO8601, WINDOWS_EPOCH
30from openpyxl.descriptors.excel import ExtensionList
32from .filters import AutoFilter
33from .header_footer import HeaderFooter
34from .hyperlink import HyperlinkList
35from .merge import MergeCells
36from .page import PageMargins, PrintOptions, PrintPageSetup
37from .pagebreak import RowBreak, ColBreak
38from .protection import SheetProtection
39from .scenario import ScenarioList
40from .views import SheetViewList
41from .datavalidation import DataValidationList
42from .table import TablePartList
43from .properties import WorksheetProperties
44from .dimensions import SheetDimension
45from .related import Related
48CELL_TAG = '{%s}c' % SHEET_MAIN_NS
49VALUE_TAG = '{%s}v' % SHEET_MAIN_NS
50FORMULA_TAG = '{%s}f' % SHEET_MAIN_NS
51MERGE_TAG = '{%s}mergeCells' % SHEET_MAIN_NS
52INLINE_STRING = "{%s}is" % SHEET_MAIN_NS
53COL_TAG = '{%s}col' % SHEET_MAIN_NS
54ROW_TAG = '{%s}row' % SHEET_MAIN_NS
55CF_TAG = '{%s}conditionalFormatting' % SHEET_MAIN_NS
56LEGACY_TAG = '{%s}legacyDrawing' % SHEET_MAIN_NS
57PROT_TAG = '{%s}sheetProtection' % SHEET_MAIN_NS
58EXT_TAG = "{%s}extLst" % SHEET_MAIN_NS
59HYPERLINK_TAG = "{%s}hyperlinks" % SHEET_MAIN_NS
60TABLE_TAG = "{%s}tableParts" % SHEET_MAIN_NS
61PRINT_TAG = '{%s}printOptions' % SHEET_MAIN_NS
62MARGINS_TAG = '{%s}pageMargins' % SHEET_MAIN_NS
63PAGE_TAG = '{%s}pageSetup' % SHEET_MAIN_NS
64HEADER_TAG = '{%s}headerFooter' % SHEET_MAIN_NS
65FILTER_TAG = '{%s}autoFilter' % SHEET_MAIN_NS
66VALIDATION_TAG = '{%s}dataValidations' % SHEET_MAIN_NS
67PROPERTIES_TAG = '{%s}sheetPr' % SHEET_MAIN_NS
68VIEWS_TAG = '{%s}sheetViews' % SHEET_MAIN_NS
69FORMAT_TAG = '{%s}sheetFormatPr' % SHEET_MAIN_NS
70ROW_BREAK_TAG = '{%s}rowBreaks' % SHEET_MAIN_NS
71COL_BREAK_TAG = '{%s}colBreaks' % SHEET_MAIN_NS
72SCENARIOS_TAG = '{%s}scenarios' % SHEET_MAIN_NS
73DATA_TAG = '{%s}sheetData' % SHEET_MAIN_NS
74DIMENSION_TAG = '{%s}dimension' % SHEET_MAIN_NS
75CUSTOM_VIEWS_TAG = '{%s}customSheetViews' % SHEET_MAIN_NS
78def _cast_number(value):
79 "Convert numbers as string to an int or float"
80 if "." in value or "E" in value or "e" in value:
81 return float(value)
82 return int(value)
85class WorkSheetParser(object):
87 def __init__(self, src, shared_strings, data_only=False,
88 epoch=WINDOWS_EPOCH, date_formats=set()):
89 self.min_row = self.min_col = None
90 self.epoch = epoch
91 self.source = src
92 self.shared_strings = shared_strings
93 self.data_only = data_only
94 self.shared_formulae = {}
95 self.array_formulae = {}
96 self.row_counter = self.col_counter = 0
97 self.tables = TablePartList()
98 self.date_formats = date_formats
99 self.row_dimensions = {}
100 self.column_dimensions = {}
101 self.number_formats = []
102 self.keep_vba = False
103 self.hyperlinks = HyperlinkList()
104 self.formatting = []
105 self.legacy_drawing = None
106 self.merged_cells = None
107 self.row_breaks = RowBreak()
108 self.col_breaks = ColBreak()
111 def parse(self):
112 dispatcher = {
113 COL_TAG: self.parse_column_dimensions,
114 PROT_TAG: self.parse_sheet_protection,
115 EXT_TAG: self.parse_extensions,
116 CF_TAG: self.parse_formatting,
117 LEGACY_TAG: self.parse_legacy,
118 ROW_BREAK_TAG: self.parse_row_breaks,
119 COL_BREAK_TAG: self.parse_col_breaks,
120 CUSTOM_VIEWS_TAG: self.parse_custom_views,
121 }
123 properties = {
124 PRINT_TAG: ('print_options', PrintOptions),
125 MARGINS_TAG: ('page_margins', PageMargins),
126 PAGE_TAG: ('page_setup', PrintPageSetup),
127 HEADER_TAG: ('HeaderFooter', HeaderFooter),
128 FILTER_TAG: ('auto_filter', AutoFilter),
129 VALIDATION_TAG: ('data_validations', DataValidationList),
130 PROPERTIES_TAG: ('sheet_properties', WorksheetProperties),
131 VIEWS_TAG: ('views', SheetViewList),
132 FORMAT_TAG: ('sheet_format', SheetFormatProperties),
133 SCENARIOS_TAG: ('scenarios', ScenarioList),
134 TABLE_TAG: ('tables', TablePartList),
135 HYPERLINK_TAG: ('hyperlinks', HyperlinkList),
136 MERGE_TAG: ('merged_cells', MergeCells),
138 }
140 it = iterparse(self.source)
142 for _, element in it:
143 tag_name = element.tag
144 if tag_name in dispatcher:
145 dispatcher[tag_name](element)
146 element.clear()
147 elif tag_name in properties:
148 prop = properties[tag_name]
149 obj = prop[1].from_tree(element)
150 setattr(self, prop[0], obj)
151 element.clear()
152 elif tag_name == ROW_TAG:
153 row = self.parse_row(element)
154 element.clear()
155 yield row
158 def parse_dimensions(self):
159 """
160 Get worksheet dimensions if they are provided.
161 """
162 it = iterparse(self.source)
164 for _event, element in it:
165 if element.tag == DIMENSION_TAG:
166 dim = SheetDimension.from_tree(element)
167 return dim.boundaries
169 elif element.tag == DATA_TAG:
170 # Dimensions missing
171 break
172 element.clear()
175 def parse_cell(self, element):
176 data_type = element.get('t', 'n')
177 coordinate = element.get('r')
178 self.col_counter += 1
179 style_id = element.get('s', 0)
180 if style_id:
181 style_id = int(style_id)
183 if data_type == "inlineStr":
184 value = None
185 else:
186 value = element.findtext(VALUE_TAG, None) or None
188 if coordinate:
189 row, column = coordinate_to_tuple(coordinate)
190 else:
191 row, column = self.row_counter, self.col_counter
193 if not self.data_only and element.find(FORMULA_TAG) is not None:
194 data_type = 'f'
195 value = self.parse_formula(element)
197 elif value is not None:
198 if data_type == 'n':
199 value = _cast_number(value)
200 if style_id in self.date_formats:
201 data_type = 'd'
202 try:
203 value = from_excel(value, self.epoch)
204 except ValueError:
205 msg = """Cell {0} is marked as a date but the serial value {1} is outside the limits for dates. The cell will be treated as an error.""".format(coordinate, value)
206 warn(msg)
207 data_type = "e"
208 value = "#VALUE!"
209 elif data_type == 's':
210 value = self.shared_strings[int(value)]
211 elif data_type == 'b':
212 value = bool(int(value))
213 elif data_type == "str":
214 data_type = "s"
215 elif data_type == 'd':
216 value = from_ISO8601(value)
218 elif data_type == 'inlineStr':
219 child = element.find(INLINE_STRING)
220 if child is not None:
221 data_type = 's'
222 richtext = Text.from_tree(child)
223 value = richtext.content
225 return {'row':row, 'column':column, 'value':value, 'data_type':data_type, 'style_id':style_id}
228 def parse_formula(self, element):
229 """
230 possible formulae types: shared, array, datatable
231 """
232 formula = element.find(FORMULA_TAG)
233 formula_type = formula.get('t')
234 coordinate = element.get('r')
235 value = "="
236 if formula.text is not None:
237 value += formula.text
239 if formula_type == "array":
240 self.array_formulae[coordinate] = dict(formula.attrib)
242 elif formula_type == "shared":
243 idx = formula.get('si')
244 if idx in self.shared_formulae:
245 trans = self.shared_formulae[idx]
246 value = trans.translate_formula(coordinate)
247 elif value != "=":
248 self.shared_formulae[idx] = Translator(value, coordinate)
250 return value
253 def parse_column_dimensions(self, col):
254 attrs = dict(col.attrib)
255 column = get_column_letter(int(attrs['min']))
256 attrs['index'] = column
257 self.column_dimensions[column] = attrs
260 def parse_row(self, row):
261 attrs = dict(row.attrib)
263 if "r" in attrs:
264 self.row_counter = int(attrs['r'])
265 else:
266 self.row_counter += 1
267 self.col_counter = 0
269 keys = {k for k in attrs if not k.startswith('{')}
270 if keys - {'r', 'spans'}:
271 # don't create dimension objects unless they have relevant information
272 self.row_dimensions[str(self.row_counter)] = attrs
274 cells = [self.parse_cell(el) for el in row]
275 return self.row_counter, cells
278 def parse_formatting(self, element):
279 try:
280 cf = ConditionalFormatting.from_tree(element)
281 self.formatting.append(cf)
282 except TypeError:
283 msg = f"Failed to load a conditional formatting rule. It will be discarded. Cause: {traceback.format_exc()}"
284 warn(msg)
287 def parse_sheet_protection(self, element):
288 protection = SheetProtection.from_tree(element)
289 password = element.get("password")
290 if password is not None:
291 protection.set_password(password, True)
292 self.protection = protection
295 def parse_extensions(self, element):
296 extLst = ExtensionList.from_tree(element)
297 for e in extLst.ext:
298 ext_type = EXT_TYPES.get(e.uri.upper(), "Unknown")
299 msg = "{0} extension is not supported and will be removed".format(ext_type)
300 warn(msg)
303 def parse_legacy(self, element):
304 obj = Related.from_tree(element)
305 self.legacy_drawing = obj.id
308 def parse_row_breaks(self, element):
309 brk = RowBreak.from_tree(element)
310 self.row_breaks = brk
313 def parse_col_breaks(self, element):
314 brk = ColBreak.from_tree(element)
315 self.col_breaks = brk
318 def parse_custom_views(self, element):
319 # clear page_breaks to avoid duplication which Excel doesn't like
320 # basically they're ignored in custom views
321 self.row_breaks = RowBreak()
322 self.col_breaks = ColBreak()
325class WorksheetReader(object):
326 """
327 Create a parser and apply it to a workbook
328 """
330 def __init__(self, ws, xml_source, shared_strings, data_only):
331 self.ws = ws
332 self.parser = WorkSheetParser(xml_source, shared_strings, data_only, ws.parent.epoch, ws.parent._date_formats)
333 self.tables = []
336 def bind_cells(self):
337 for idx, row in self.parser.parse():
338 for cell in row:
339 style = self.ws.parent._cell_styles[cell['style_id']]
340 c = Cell(self.ws, row=cell['row'], column=cell['column'], style_array=style)
341 c._value = cell['value']
342 c.data_type = cell['data_type']
343 self.ws._cells[(cell['row'], cell['column'])] = c
344 self.ws.formula_attributes = self.parser.array_formulae
345 if self.ws._cells:
346 self.ws._current_row = self.ws.max_row # use cells not row dimensions
349 def bind_formatting(self):
350 for cf in self.parser.formatting:
351 for rule in cf.rules:
352 if rule.dxfId is not None:
353 rule.dxf = self.ws.parent._differential_styles[rule.dxfId]
354 self.ws.conditional_formatting[cf] = rule
357 def bind_tables(self):
358 for t in self.parser.tables.tablePart:
359 rel = self.ws._rels[t.id]
360 self.tables.append(rel.Target)
363 def bind_merged_cells(self):
364 from openpyxl.worksheet.cell_range import MultiCellRange
365 from openpyxl.worksheet.merge import MergedCellRange
366 if not self.parser.merged_cells:
367 return
369 ranges = []
370 for cr in self.parser.merged_cells.mergeCell:
371 mcr = MergedCellRange(self.ws, cr.ref)
372 self.ws._clean_merge_range(mcr)
373 ranges.append(mcr)
374 self.ws.merged_cells = MultiCellRange(ranges)
377 def bind_hyperlinks(self):
378 for link in self.parser.hyperlinks.hyperlink:
379 if link.id:
380 rel = self.ws._rels[link.id]
381 link.target = rel.Target
382 if ":" in link.ref:
383 # range of cells
384 for row in self.ws[link.ref]:
385 for cell in row:
386 try:
387 cell.hyperlink = link
388 except AttributeError:
389 pass
390 else:
391 cell = self.ws[link.ref]
392 if isinstance(cell, MergedCell):
393 cell = self.normalize_merged_cell_link(cell.coordinate)
394 cell.hyperlink = link
396 def normalize_merged_cell_link(self, coord):
397 """
398 Returns the appropriate cell to which a hyperlink, which references a merged cell at the specified coordinates,
399 should be bound.
400 """
401 for rng in self.ws.merged_cells:
402 if coord in rng:
403 return self.ws.cell(*rng.top[0])
405 def bind_col_dimensions(self):
406 for col, cd in self.parser.column_dimensions.items():
407 if 'style' in cd:
408 key = int(cd['style'])
409 cd['style'] = self.ws.parent._cell_styles[key]
410 self.ws.column_dimensions[col] = ColumnDimension(self.ws, **cd)
413 def bind_row_dimensions(self):
414 for row, rd in self.parser.row_dimensions.items():
415 if 's' in rd:
416 key = int(rd['s'])
417 rd['s'] = self.ws.parent._cell_styles[key]
418 self.ws.row_dimensions[int(row)] = RowDimension(self.ws, **rd)
421 def bind_properties(self):
422 for k in ('print_options', 'page_margins', 'page_setup',
423 'HeaderFooter', 'auto_filter', 'data_validations',
424 'sheet_properties', 'views', 'sheet_format',
425 'row_breaks', 'col_breaks', 'scenarios', 'legacy_drawing',
426 'protection',
427 ):
428 v = getattr(self.parser, k, None)
429 if v is not None:
430 setattr(self.ws, k, v)
433 def bind_all(self):
434 self.bind_cells()
435 self.bind_merged_cells()
436 self.bind_hyperlinks()
437 self.bind_formatting()
438 self.bind_col_dimensions()
439 self.bind_row_dimensions()
440 self.bind_tables()
441 self.bind_properties()