Coverage for cc_modules/cc_spreadsheet.py: 29%
267 statements
« prev ^ index » next coverage.py v6.5.0, created at 2022-11-08 23:14 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2022-11-08 23:14 +0000
1#!/usr/bin/env python
3"""
4camcops_server/cc_modules/cc_spreadsheet.py
6===============================================================================
8 Copyright (C) 2012, University of Cambridge, Department of Psychiatry.
9 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
11 This file is part of CamCOPS.
13 CamCOPS is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 CamCOPS is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with CamCOPS. If not, see <https://www.gnu.org/licenses/>.
26===============================================================================
28**Helper functions/classes for spreadsheet-style tab-separated value (TSV)
29(and related) exports.**
31"""
33from collections import OrderedDict
34import csv
35import io
36import logging
37import os
38import random
39import re
40from typing import (
41 Any,
42 BinaryIO,
43 Callable,
44 Container,
45 Dict,
46 Iterable,
47 List,
48 Optional,
49 Sequence,
50 Union,
51)
52import zipfile
54from cardinal_pythonlib.datetimefunc import (
55 format_datetime,
56 get_now_localtz_pendulum,
57)
58from cardinal_pythonlib.excel import (
59 convert_for_openpyxl,
60 convert_for_pyexcel_ods3,
61)
62from cardinal_pythonlib.logs import BraceStyleAdapter
63from sqlalchemy.engine.result import ResultProxy
65from camcops_server.cc_modules.cc_constants import DateFormat
# Backend selection flags. When a flag is True, the corresponding pyexcel
# plugin is imported and used by the writer methods in this module; when it is
# False, the alternative library (odswriter / openpyxl) is imported instead.
ODS_VIA_PYEXCEL = True  # significantly faster
XLSX_VIA_PYEXCEL = True

if ODS_VIA_PYEXCEL:
    import pyexcel_ods3  # e.g. pip install pyexcel-ods3==0.5.3

    # The odswriter names are bound to None so that they still exist as
    # module-level names when odswriter is not imported.
    ODSWriter = ODSSheet = None
else:
    from odswriter import ODSWriter, Sheet as ODSSheet  # noqa

    # Conversely, the pyexcel plugin name is bound to None on this path.
    pyexcel_ods3 = None

if XLSX_VIA_PYEXCEL:
    import pyexcel_xlsx  # e.g. pip install pyexcel-xlsx==0.5.7

    # The openpyxl names are bound to None when openpyxl is not imported.
    openpyxl = XLWorkbook = XLWorksheet = None
else:
    from openpyxl.workbook.workbook import Workbook as XLWorkbook
    from openpyxl.worksheet.worksheet import Worksheet as XLWorksheet

    pyexcel_xlsx = None

# Module-level logger, wrapped in cardinal_pythonlib's BraceStyleAdapter.
log = BraceStyleAdapter(logging.getLogger(__name__))
92# =============================================================================
93# Spreadsheet output holding structures
94# =============================================================================
class SpreadsheetPage(object):
    """
    Represents a single "spreadsheet" page, e.g. for TSV/Excel/ODS output.

    Data is held as :attr:`rows` (a list of dictionaries, each mapping column
    name to value) together with :attr:`headings` (the ordered list of column
    names that the export functions emit).
    """

    def __init__(
        self, name: str, rows: List[Union[Dict[str, Any], OrderedDict]]
    ) -> None:
        """
        Args:
            name: name for the whole sheet
            rows: list of rows, where each row is a dictionary mapping
                column name to value

        Raises:
            :exc:`AssertionError` if ``name`` is empty
        """
        assert name, "Missing name"
        self.name = name
        self.rows = rows
        self.headings = []  # type: List[str]
        # Headings are collected in first-seen order across all rows.
        for row in rows:
            self._add_headings_if_absent(row.keys())

    def __str__(self) -> str:
        return f"SpreadsheetPage: name={self.name}\n{self.get_tsv()}"

    @classmethod
    def from_headings_rows(
        cls, name: str, headings: List[str], rows: List[Sequence[Any]]
    ) -> "SpreadsheetPage":
        """
        Creates a SpreadsheetPage object using a list of headings and the row
        data as a list of lists.

        Args:
            name: name for the sheet
            headings: column names, in output order
            rows: row data; every row must have one value per heading

        Raises:
            :exc:`AssertionError` if a row's length differs from the number
            of headings
        """
        page = cls(name=name, rows=[])
        # Take a private copy: our heading list is mutated later (columns
        # added, sorted), which must neither alter the caller's object nor
        # fail when the caller passed a non-list sequence/view.
        page.headings = list(headings)
        n_cols = len(page.headings)
        for row in rows:
            assert len(row) == n_cols, "Row length != number of headings"
            page.rows.append(dict(zip(page.headings, row)))
        return page

    @classmethod
    def from_resultproxy(
        cls, name: str, rp: "ResultProxy"
    ) -> "SpreadsheetPage":
        """
        Creates a SpreadsheetPage object from an SQLAlchemy ResultProxy.

        Args:
            rp:
                A :class:`sqlalchemy.engine.result.ResultProxy`.
            name:
                Name for this sheet.
        """
        column_names = rp.keys()
        rows = rp.fetchall()
        return cls.from_headings_rows(
            name=name, headings=column_names, rows=rows
        )

    @property
    def empty(self) -> bool:
        """
        Do we have zero rows?
        """
        return len(self.rows) == 0

    def _add_headings_if_absent(self, headings: Iterable[str]) -> None:
        """
        Add any headings we've not yet seen to our list of headings,
        preserving the order in which they are first encountered.
        """
        for h in headings:
            if h not in self.headings:
                self.headings.append(h)

    def add_or_set_value(self, heading: str, value: Any) -> None:
        """
        If we contain only a single row, this function will set the value
        for a given column (``heading``) to ``value``.

        Raises:
            :exc:`AssertionError` if we don't have exactly 1 row
        """
        assert len(self.rows) == 1, "add_value can only be used if #rows == 1"
        self._add_headings_if_absent([heading])
        self.rows[0][heading] = value

    def add_or_set_column(self, heading: str, values: List[Any]) -> None:
        """
        Set the column labelled ``heading`` so it contains the values specified
        in ``values``. The length of ``values`` must equal the number of rows
        that we already contain.

        Raises:
            :exc:`AssertionError` if the number of values doesn't match
            the number of existing rows
        """
        assert len(values) == len(self.rows), "#values != #existing rows"
        self._add_headings_if_absent([heading])
        for row, value in zip(self.rows, values):
            row[heading] = value

    def add_or_set_columns_from_page(self, other: "SpreadsheetPage") -> None:
        """
        This function presupposes that ``self`` and ``other`` are two pages
        ("spreadsheets") with *matching* rows.

        It updates values or creates columns in ``self`` such that the values
        from all columns in ``other`` are written to the corresponding rows of
        ``self``.

        Raises:
            :exc:`AssertionError` if the two pages (sheets) don't have
            the same number of rows.
        """
        assert len(self.rows) == len(other.rows), "Mismatched #rows"
        self._add_headings_if_absent(other.headings)
        for row, other_row in zip(self.rows, other.rows):
            row.update(other_row)

    def add_rows_from_page(self, other: "SpreadsheetPage") -> None:
        """
        Add all rows from ``other`` to ``self``.

        The row dictionaries themselves are shared with ``other``, not copied.
        """
        self._add_headings_if_absent(other.headings)
        self.rows.extend(other.rows)

    def sort_headings(self) -> None:
        """
        Sort our headings internally.
        """
        self.headings.sort()

    def delete_columns(self, headings: Container[str]) -> None:
        """
        Removes columns with the specified heading names.
        Used to simplify spreadsheets.

        Since our rows are a dictionary, and our export functions are based on
        the headings, all we have to do is to delete the unwanted headings.
        """
        self.headings = [h for h in self.headings if h not in headings]

    @property
    def plainrows(self) -> List[List[Any]]:
        """
        Returns a list of rows, where each row is a list of values.
        Does not include a "header" row.

        Cells missing from a row's dictionary come out as ``None``.

        Compare :attr:`rows`, which is a list of dictionaries.
        """
        return [[row.get(h) for h in self.headings] for row in self.rows]

    def spreadsheetrows(
        self, converter: Callable[[Any], Any]
    ) -> List[List[Any]]:
        """
        Like :meth:`plainrows`, but (a) ensures every cell is converted to a
        value that can be sent to a spreadsheet converter (e.g. ODS, XLSX),
        and (b) includes a header row.

        Args:
            converter: function applied to every data cell (not to headings)
        """
        rows = [self.headings.copy()]
        for row in self.rows:
            rows.append([converter(row.get(h)) for h in self.headings])
        return rows

    def get_tsv(self, dialect: str = "excel-tab") -> str:
        r"""
        Returns the entire page (sheet) as TSV: one header row and then
        lots of data rows.

        For the dialect, see
        https://docs.python.org/3/library/csv.html#csv.excel_tab.

        For CSV files, see RFC 4180: https://tools.ietf.org/html/rfc4180.

        For TSV files, see
        https://www.iana.org/assignments/media-types/text/tab-separated-values.

        Test code:

        .. code-block:: python

            import io
            import csv
            from typing import List

            def test(row: List[str], dialect: str = "excel-tab") -> str:
                f = io.StringIO()
                writer = csv.writer(f, dialect=dialect)
                writer.writerow(row)
                return f.getvalue()

            test(["hello", "world"])
            test(["hello\ttab", "world"])  # actual tab within double quotes
            test(["hello\nnewline", "world"])  # actual newline within double quotes
            test(['hello"doublequote', "world"])  # doubled double quote within double quotes

        """  # noqa
        f = io.StringIO()
        writer = csv.writer(f, dialect=dialect)
        writer.writerow(self.headings)
        for row in self.rows:
            writer.writerow([row.get(h) for h in self.headings])
        return f.getvalue()

    def write_to_openpyxl_xlsx_worksheet(self, ws: "XLWorksheet") -> None:
        """
        Writes data from this page to an existing ``openpyxl`` XLSX worksheet.
        """
        ws.append(self.headings)
        for row in self.rows:
            ws.append(
                [convert_for_openpyxl(row.get(h)) for h in self.headings]
            )

    def write_to_odswriter_ods_worksheet(self, ws: "ODSSheet") -> None:
        """
        Writes data from this page to an existing ``odswriter`` ODS sheet.
        """
        # noinspection PyUnresolvedReferences
        ws.writerow(self.headings)
        for row in self.rows:
            # noinspection PyUnresolvedReferences
            ws.writerow([row.get(h) for h in self.headings])

    def r_object_name(self) -> str:
        """
        Name of the object when imported into R.
        The main thing: no leading underscores.
        """
        n = self.name
        n = n[1:] if n.startswith("_") else n
        return f"camcops_{n}"  # less chance of conflict within R

    def r_data_table_definition(self) -> str:
        """
        Returns a string to define this object as a ``data.table`` in R.

        The page is rendered as CSV and embedded in a double-quoted R string
        literal, so backslashes and double quotes in the CSV text are
        backslash-escaped.

        See also:

        - https://stackoverflow.com/questions/32103639/read-csv-file-in-r-with-double-quotes
        """  # noqa
        object_name = self.r_object_name()
        csv_text = self.get_tsv(dialect="excel")
        # Escape backslashes FIRST (otherwise the backslashes introduced when
        # escaping the quotes would themselves be doubled), then the quotes.
        # Without the backslash step, R would read e.g. "\n" in the data as an
        # escape sequence.
        csv_text = csv_text.replace("\\", "\\\\").replace('"', r"\"")
        definition = (
            f'data.table::fread(sep=",", header=TRUE, text="{csv_text}"\n)'
        )
        return f"{object_name} <- {definition}"
class SpreadsheetCollection(object):
    """
    A collection of
    :class:`camcops_server.cc_modules.cc_spreadsheet.SpreadsheetPage` pages
    (spreadsheets), like an Excel workbook.
    """

    def __init__(self) -> None:
        self.pages = []  # type: List[SpreadsheetPage]

    def __str__(self) -> str:
        return "SpreadsheetCollection:\n" + "\n\n".join(
            page.get_tsv() for page in self.pages
        )

    # -------------------------------------------------------------------------
    # Pages
    # -------------------------------------------------------------------------

    def page_with_name(self, page_name: str) -> Optional["SpreadsheetPage"]:
        """
        Returns the page with the specific name, or ``None`` if no such
        page exists.
        """
        return next(
            (page for page in self.pages if page.name == page_name), None
        )

    def add_page(self, page: "SpreadsheetPage") -> None:
        """
        Adds a new page to our collection. If the new page has the same name
        as an existing page, rows from the new page are added to the existing
        page. Does nothing if the new page is empty.
        """
        if page.empty:
            return
        existing_page = self.page_with_name(page.name)
        if existing_page:
            # Blend with existing page
            existing_page.add_rows_from_page(page)
        else:
            # New page
            self.pages.append(page)

    def add_pages(self, pages: List["SpreadsheetPage"]) -> None:
        """
        Adds all ``pages`` to our collection, via :func:`add_page`.
        """
        for page in pages:
            self.add_page(page)

    def sort_headings_within_all_pages(self) -> None:
        """
        Sort headings within each of our pages.
        """
        for page in self.pages:
            page.sort_headings()

    def sort_pages(self) -> None:
        """
        Sort our pages by their page name.
        """
        self.pages.sort(key=lambda p: p.name)

    def get_page_names(self) -> List[str]:
        """
        Return a list of the names of all our pages.
        """
        return [p.name for p in self.pages]

    def delete_page(self, page_name: str) -> None:
        """
        Delete any page with the name specified.
        """
        self.pages = [p for p in self.pages if p.name != page_name]

    def delete_pages(self, page_names: Container[str]) -> None:
        """
        Delete pages with the names specified.
        """
        self.pages = [p for p in self.pages if p.name not in page_names]

    def delete_columns(self, headings: Container[str]) -> None:
        """
        Across all pages, removes columns with the specified heading names.
        Used to simplify spreadsheets.
        """
        for p in self.pages:
            p.delete_columns(headings)

    # -------------------------------------------------------------------------
    # TSV
    # -------------------------------------------------------------------------

    def get_tsv_file(self, page_name: str) -> str:
        """
        Returns a TSV file for a named page.

        Raises:
            :exc:`AssertionError` if the named page does not exist

        """
        page = self.page_with_name(page_name)
        assert page is not None, f"No such page with name {page_name}"
        return page.get_tsv()

    # -------------------------------------------------------------------------
    # ZIP of TSVs
    # -------------------------------------------------------------------------

    def write_zip(
        self,
        file: Union[str, BinaryIO],
        encoding: str = "utf-8",
        compression: int = zipfile.ZIP_DEFLATED,
    ) -> None:
        """
        Writes data to a file, as a ZIP file of TSV files.

        Args:
            file: filename or file-like object
            encoding: encoding to use when writing the TSV files
            compression: compression method to use

        Choice of compression method: see

        - https://docs.python.org/3/library/zipfile.html
        - https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT
        - https://en.wikipedia.org/wiki/Zip_(file_format)#Compression_methods

        Note also that ``openpyxl`` uses ``ZIP_DEFLATED``, which seems to be
        the most portable if not the best compression.
        """
        if isinstance(file, str):  # it's a filename
            with open(file, "wb") as binaryfile:
                # Recurse once, forwarding ALL options (the compression
                # method must not be lost on the filename path).
                return self.write_zip(
                    binaryfile, encoding=encoding, compression=compression
                )
        with zipfile.ZipFile(file, mode="w", compression=compression) as z:
            # Write to ZIP.
            # If there are no valid task instances, there'll be no TSV;
            # that's OK.
            for filename_stem in self.get_page_names():
                tsv_filename = filename_stem + ".tsv"
                tsv_contents = self.get_tsv_file(page_name=filename_stem)
                z.writestr(tsv_filename, tsv_contents.encode(encoding))

    def as_zip(self, encoding: str = "utf-8") -> bytes:
        """
        Returns the TSV collection as a ZIP file containing TSV files.

        Args:
            encoding: encoding to use when writing the TSV files
        """
        with io.BytesIO() as memfile:
            self.write_zip(memfile, encoding)
            zip_contents = memfile.getvalue()
        return zip_contents

    # -------------------------------------------------------------------------
    # XLSX, ODS
    # -------------------------------------------------------------------------

    def write_xlsx(self, file: Union[str, BinaryIO]) -> None:
        """
        Write the contents in XLSX (Excel) format to a file.

        Args:
            file: filename or file-like object
        """
        if XLSX_VIA_PYEXCEL:  # use pyexcel_xlsx
            data = self._get_pyexcel_data(convert_for_openpyxl)
            pyexcel_xlsx.save_data(file, data)
        else:  # use openpyxl
            # Marginal performance gain with write_only. Does not automatically
            # add a blank sheet
            wb = XLWorkbook(write_only=True)
            valid_name_dict = self.get_pages_with_valid_sheet_names()
            for page, title in valid_name_dict.items():
                ws = wb.create_sheet(title=title)
                page.write_to_openpyxl_xlsx_worksheet(ws)
            wb.save(file)

    def as_xlsx(self) -> bytes:
        """
        Returns the TSV collection as an XLSX (Excel) file.
        """
        with io.BytesIO() as memfile:
            self.write_xlsx(memfile)
            contents = memfile.getvalue()
        return contents

    @staticmethod
    def get_sheet_title(page: "SpreadsheetPage") -> str:
        r"""
        Returns a worksheet name for a :class:`SpreadsheetPage`.

        See ``openpyxl/workbook/child.py``.

        - Excel prohibits ``\``, ``*``, ``?``, ``:``, ``/``, ``[``, ``]``
        - LibreOffice also prohibits ``'`` as first or last character but let's
          just replace that globally.
        - Titles longer than 31 characters are cut to 28 characters plus
          ``...``.
        """
        title = re.sub(r"[\\*?:/\[\]']", "_", page.name)

        if len(title) > 31:
            title = f"{title[:28]}..."

        return title

    def _get_pyexcel_data(
        self, converter: Callable[[Any], Any]
    ) -> Dict[str, List[List[Any]]]:
        """
        Returns data in the format expected by ``pyexcel``, which is an ordered
        dictionary mapping sheet names to a list of rows, where each row is a
        list of cell values.

        Sheet names are made unique first. Distinct page names can collapse
        to the same title once sanitized or truncated, and a plain
        title-keyed dictionary would then silently drop all but the last
        such page.
        """
        data = OrderedDict()
        for page, title in self.get_pages_with_valid_sheet_names().items():
            data[title] = page.spreadsheetrows(converter)
        return data

    def write_ods(self, file: Union[str, BinaryIO]) -> None:
        """
        Writes an ODS (OpenOffice spreadsheet document) to a file.

        Args:
            file: filename or file-like object
        """
        if ODS_VIA_PYEXCEL:  # use pyexcel_ods3
            data = self._get_pyexcel_data(convert_for_pyexcel_ods3)
            pyexcel_ods3.save_data(file, data)
        else:  # use odswriter
            if isinstance(file, str):  # it's a filename
                with open(file, "wb") as binaryfile:
                    return self.write_ods(binaryfile)  # recurse once
            # noinspection PyCallingNonCallable
            with ODSWriter(file) as odsfile:
                valid_name_dict = self.get_pages_with_valid_sheet_names()
                for page, title in valid_name_dict.items():
                    sheet = odsfile.new_sheet(name=title)
                    page.write_to_odswriter_ods_worksheet(sheet)

    def as_ods(self) -> bytes:
        """
        Returns the TSV collection as an ODS (OpenOffice spreadsheet document)
        file.
        """
        with io.BytesIO() as memfile:
            self.write_ods(memfile)
            contents = memfile.getvalue()
        return contents

    def get_pages_with_valid_sheet_names(
        self,
    ) -> Dict["SpreadsheetPage", str]:
        """
        Returns an ordered mapping from :class:`SpreadsheetPage` objects to
        their sheet names, after sanitizing the names
        (:meth:`get_sheet_title`) and making them unique
        (:meth:`make_sheet_names_unique`).
        """
        name_dict = OrderedDict()

        for page in self.pages:
            name_dict[page] = self.get_sheet_title(page)

        self.make_sheet_names_unique(name_dict)

        return name_dict

    @staticmethod
    def make_sheet_names_unique(
        name_dict: Dict["SpreadsheetPage", str]
    ) -> None:
        """
        Modifies (in place) a mapping from :class:`SpreadsheetPage` to
        worksheet names, such that all page names are unique. The comparison
        is case-insensitive.

        - See also :func:`avoid_duplicate_name` in
          ``openpxl/workbook/child.py``
        - We keep the 31 character restriction
        """
        unique_names = set()  # lower-cased names already allocated

        for page, name in name_dict.items():
            attempt = 0

            while name.lower() in unique_names:
                attempt += 1

                if attempt > 1000:
                    # algorithm failure, better to let Excel deal with the
                    # consequences than get stuck in a loop
                    log.debug(
                        f"Failed to generate a unique sheet name from {name}"
                    )
                    break

                # Increment any trailing number (or start at 1)...
                match = re.search(r"\d+$", name)
                count = 0
                if match is not None:
                    count = int(match.group())

                # ... and overwrite the END of the name with it, so the name
                # never grows in length.
                new_suffix = str(count + 1)
                name = name[: -len(new_suffix)] + new_suffix
            # Only existing keys are reassigned, so this is safe while
            # iterating over the dictionary.
            name_dict[page] = name
            unique_names.add(name.lower())

    # -------------------------------------------------------------------------
    # R
    # -------------------------------------------------------------------------

    def as_r(self) -> str:
        """
        Returns data as an R script.

        This could be more sophisticated, e.g. creating factors with
        appropriate levels (etc.).
        """
        now = format_datetime(
            get_now_localtz_pendulum(),
            DateFormat.ISO8601_HUMANIZED_TO_SECONDS_TZ,
        )
        table_definition_str = "\n\n".join(
            page.r_data_table_definition() for page in self.pages
        )
        script = f"""#!/usr/bin/env Rscript

# R script generated by CamCOPS at {now}

# =============================================================================
# Libraries
# =============================================================================

library(data.table)

# =============================================================================
# Data
# =============================================================================

{table_definition_str}

"""
        return script

    def write_r(self, filename: str, encoding: str = "utf-8") -> None:
        """
        Write the contents in R format to a file.

        Args:
            filename: name of the file to write (passed to :func:`open`)
            encoding: encoding to use
        """
        with open(filename, "wt", encoding=encoding) as f:
            f.write(self.as_r())
def _make_benchmarking_collection(
    nsheets: int = 100,
    nrows: int = 200,
    ncols: int = 30,
    mindata: int = 0,
    maxdata: int = 1000000,
) -> SpreadsheetCollection:
    """
    Builds a collection of randomly filled pages for benchmarking.

    Args:
        nsheets: number of pages, named ``sheet1`` .. ``sheet<nsheets>``
        nrows: number of rows per page
        ncols: number of columns per page, named ``c1`` .. ``c<ncols>``
        mindata: lower bound (inclusive) for the random integers
        maxdata: upper bound (inclusive) for the random integers

    Every cell holds a random integer, stored as a string.
    """
    log.info(
        f"Creating SpreadsheetCollection with nsheets={nsheets}, "
        f"nrows={nrows}, ncols={ncols}..."
    )
    column_names = [f"c{col_index}" for col_index in range(1, ncols + 1)]
    collection = SpreadsheetCollection()
    for sheet_index in range(nsheets):
        sheet_rows = []
        for _ in range(nrows):
            # One random draw per cell, in column order.
            sheet_rows.append(
                {
                    column: str(random.randint(mindata, maxdata))
                    for column in column_names
                }
            )
        collection.add_page(
            SpreadsheetPage(name=f"sheet{sheet_index + 1}", rows=sheet_rows)
        )
    log.info("... done.")
    return collection
def file_size(filename: str) -> int:
    """
    Reports how many bytes the named file occupies.

    Args:
        filename: path of the file to inspect
    """
    return os.path.getsize(filename)
def benchmark_save(
    xlsx_filename: str = "test.xlsx",
    ods_filename: str = "test.ods",
    tsv_zip_filename: str = "test.zip",
    r_filename: str = "test.R",
) -> None:
    """
    Generates a benchmarking collection and saves it in each supported
    format, logging the size of every file produced.

    Use with:

    .. code-block:: python

        from cardinal_pythonlib.logs import main_only_quicksetup_rootlogger
        from camcops_server.cc_modules.cc_spreadsheet import benchmark_save
        main_only_quicksetup_rootlogger()
        benchmark_save()

    Args:
        xlsx_filename: XLSX file to create
        ods_filename: ODS file to create
        tsv_zip_filename: TSV ZIP file to create
        r_filename: R script to create

    Problem in Nov 2019 is that ODS is extremely slow. Rough timings:

    - TSV ZIP: about 4.1 Mb, about 0.2 s. Good.
    - XLSX (via openpyxl): about 4.6 Mb, 16 seconds.
    - XLSX (via pyexcel_xlsx): about 4.6 Mb, 16 seconds.
    - ODS (via odswriter): about 53 Mb, 56 seconds.
    - ODS (via pyexcel_ods3): about 2.8 Mb, 29 seconds.
    """
    coll = _make_benchmarking_collection()

    # (label for the log, writer method, destination), in execution order.
    jobs = (
        ("TSV ZIP", coll.write_zip, tsv_zip_filename),
        ("XLSX", coll.write_xlsx, xlsx_filename),
        ("ODS", coll.write_ods, ods_filename),
        ("R", coll.write_r, r_filename),
    )
    for label, write, destination in jobs:
        log.info(f"Writing {label}...")
        write(destination)
        log.info(f"... done. File size {file_size(destination)}")