Coverage for cc_modules/cc_tsv.py : 29%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#!/usr/bin/env python
3"""
4camcops_server/cc_modules/cc_tsv.py
6===============================================================================
8 Copyright (C) 2012-2020 Rudolf Cardinal (rudolf@pobox.com).
10 This file is part of CamCOPS.
12 CamCOPS is free software: you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation, either version 3 of the License, or
15 (at your option) any later version.
17 CamCOPS is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with CamCOPS. If not, see <https://www.gnu.org/licenses/>.
25===============================================================================
27**Helper functions/classes for spreadsheet-style tab-separated value (TSV)
28exports.**
30"""
32from collections import OrderedDict
33import csv
34import io
35import logging
36import os
37import random
38import re
39from typing import (Any, BinaryIO, Callable, Dict, Iterable, List, Optional,
40 Sequence, Union)
41import zipfile
43from cardinal_pythonlib.datetimefunc import (
44 format_datetime,
45 get_now_localtz_pendulum,
46)
47from cardinal_pythonlib.excel import (
48 convert_for_openpyxl,
49 convert_for_pyexcel_ods3,
50)
51from cardinal_pythonlib.logs import BraceStyleAdapter
52from sqlalchemy.engine.result import ResultProxy
54from camcops_server.cc_modules.cc_constants import DateFormat
# Backend switches. When a switch is True, the corresponding pyexcel plugin
# is imported and the names belonging to the alternative backend are bound to
# None; when False, the alternative backend is imported and the pyexcel
# plugin name is bound to None instead.
ODS_VIA_PYEXCEL = True  # significantly faster
XLSX_VIA_PYEXCEL = True

if not ODS_VIA_PYEXCEL:
    from odswriter import ODSWriter, Sheet as ODSSheet  # noqa
    pyexcel_ods3 = None
else:
    import pyexcel_ods3  # e.g. pip install pyexcel-ods3==0.5.3
    ODSWriter = ODSSheet = None

if not XLSX_VIA_PYEXCEL:
    from openpyxl.workbook.workbook import Workbook as XLWorkbook
    from openpyxl.worksheet.worksheet import Worksheet as XLWorksheet
    pyexcel_xlsx = None
else:
    import pyexcel_xlsx  # e.g. pip install pyexcel-xlsx==0.5.7
    openpyxl = XLWorkbook = XLWorksheet = None

# Module-level logger, wrapped in cardinal_pythonlib's BraceStyleAdapter.
log = BraceStyleAdapter(logging.getLogger(__name__))
77# =============================================================================
78# TSV output holding structures
79# =============================================================================
class TsvPage(object):
    """
    Represents a single TSV "spreadsheet".

    The page stores its data as ``rows`` (a list of dictionaries mapping
    column name to value) plus ``headings`` (the column names, in the order
    in which they were first seen). A row need not contain every heading;
    missing cells are emitted as ``None``.
    """
    def __init__(self, name: str,
                 rows: List[Union[Dict[str, Any], OrderedDict]]) -> None:
        """
        Args:
            name: name for the whole sheet
            rows: list of rows, where each row is a dictionary mapping
                column name to value

        Raises:
            :exc:`AssertionError` if ``name`` is empty
        """
        assert name, "Missing name"
        self.name = name
        self.rows = rows  # stored by reference, not copied
        self.headings = []  # type: List[str]
        for row in rows:
            self._add_headings_if_absent(row.keys())

    def __str__(self) -> str:
        return f"TsvPage: name={self.name}\n{self.get_tsv()}"

    @classmethod
    def from_headings_rows(cls, name: str, headings: List[str],
                           rows: List[Sequence[Any]]) -> "TsvPage":
        """
        Creates a TsvPage object using a list of headings and the row data
        as a list of lists.

        Args:
            name: name for the sheet
            headings: column names
            rows: list of rows, each a sequence of values in the same
                order as ``headings``

        Raises:
            :exc:`AssertionError` if any row's length differs from the
            number of headings
        """
        page = cls(name=name, rows=[])
        # Take a private copy: the page appends to its heading list when
        # columns are added later, and that must not alter the caller's
        # object (which also need not be a real list).
        page.headings = list(headings)
        n_cols = len(page.headings)
        for row in rows:
            assert len(row) == n_cols
            page.rows.append(dict(zip(page.headings, row)))
        return page

    @classmethod
    def from_resultproxy(cls, name: str, rp: "ResultProxy") -> "TsvPage":
        """
        Creates a TsvPage object from an SQLAlchemy ResultProxy.

        Args:
            rp:
                A :class:`sqlalchemy.engine.result.ResultProxy`.
            name:
                Name for this sheet.
        """
        column_names = list(rp.keys())
        rows = rp.fetchall()
        return cls.from_headings_rows(
            name=name, headings=column_names, rows=rows)

    @property
    def empty(self) -> bool:
        """
        Do we have zero rows?
        """
        return len(self.rows) == 0

    def _add_headings_if_absent(self, headings: Iterable[str]) -> None:
        """
        Add any headings we've not yet seen to our list of headings,
        preserving first-seen order.
        """
        # A set gives constant-time membership tests; it is rebuilt on each
        # call because ``self.headings`` is a public attribute that may be
        # reassigned or sorted between calls.
        seen = set(self.headings)
        for h in headings:
            if h not in seen:
                seen.add(h)
                self.headings.append(h)

    def add_or_set_value(self, heading: str, value: Any) -> None:
        """
        If we contain only a single row, this function will set the value
        for a given column (``heading``) to ``value``.

        Raises:
            :exc:`AssertionError` if we don't have exactly 1 row
        """
        assert len(self.rows) == 1, (
            "add_or_set_value can only be used if #rows == 1"
        )
        self._add_headings_if_absent([heading])
        self.rows[0][heading] = value

    def add_or_set_column(self, heading: str, values: List[Any]) -> None:
        """
        Set the column labelled ``heading`` so it contains the values specified
        in ``values``. The length of ``values`` must equal the number of rows
        that we already contain.

        Raises:
            :exc:`AssertionError` if the number of values doesn't match
            the number of existing rows
        """
        assert len(values) == len(self.rows), "#values != #existing rows"
        self._add_headings_if_absent([heading])
        for row, value in zip(self.rows, values):
            row[heading] = value

    def add_or_set_columns_from_page(self, other: "TsvPage") -> None:
        """
        This function presupposes that ``self`` and ``other`` are two pages
        ("spreadsheets") with *matching* rows.

        It updates values or creates columns in ``self`` such that the values
        from all columns in ``other`` are written to the corresponding rows of
        ``self``.

        Raises:
            :exc:`AssertionError` if the two pages (sheets) don't have
            the same number of rows.
        """
        assert len(self.rows) == len(other.rows), "Mismatched #rows"
        self._add_headings_if_absent(other.headings)
        for row, other_row in zip(self.rows, other.rows):
            row.update(other_row)

    def add_rows_from_page(self, other: "TsvPage") -> None:
        """
        Add all rows from ``other`` to ``self``.

        The row dictionaries themselves are shared with ``other``, not
        copied.
        """
        self._add_headings_if_absent(other.headings)
        self.rows.extend(other.rows)

    def sort_headings(self) -> None:
        """
        Sort our headings internally.
        """
        self.headings.sort()

    @property
    def plainrows(self) -> List[List[Any]]:
        """
        Returns a list of rows, where each row is a list of values.
        Does not include a "header" row.

        Compare :attr:`rows`, which is a list of dictionaries.
        """
        return [
            [row.get(h) for h in self.headings]
            for row in self.rows
        ]

    def spreadsheetrows(self, converter: Callable[[Any], Any]) \
            -> List[List[Any]]:
        """
        Like :meth:`plainrows`, but (a) ensures every cell is converted to a
        value that can be sent to a spreadsheet converter (e.g. ODS, XLSX),
        and (b) includes a header row.

        Args:
            converter: function applied to every data cell (not to the
                headings)
        """
        rows = [self.headings.copy()]
        for row in self.rows:
            rows.append([converter(row.get(h))
                         for h in self.headings])
        return rows

    def get_tsv(self, dialect: str = "excel-tab") -> str:
        r"""
        Returns the entire page (sheet) as TSV: one header row and then
        lots of data rows.

        For the dialect, see
        https://docs.python.org/3/library/csv.html#csv.excel_tab.

        For CSV files, see RFC 4180: https://tools.ietf.org/html/rfc4180.

        For TSV files, see
        https://www.iana.org/assignments/media-types/text/tab-separated-values.

        Test code:

        .. code-block:: python

            import io
            import csv
            from typing import List

            def test(row: List[str], dialect: str = "excel-tab") -> str:
                f = io.StringIO()
                writer = csv.writer(f, dialect=dialect)
                writer.writerow(row)
                return f.getvalue()

            test(["hello", "world"])
            test(["hello\ttab", "world"])  # actual tab within double quotes
            test(["hello\nnewline", "world"])  # actual newline within double quotes
            test(['hello"doublequote', "world"])  # doubled double quote within double quotes

        """  # noqa
        f = io.StringIO()
        writer = csv.writer(f, dialect=dialect)
        writer.writerow(self.headings)
        for row in self.rows:
            writer.writerow([row.get(h) for h in self.headings])
        return f.getvalue()

    def write_to_openpyxl_xlsx_worksheet(self, ws: "XLWorksheet") -> None:
        """
        Writes data from this page to an existing ``openpyxl`` XLSX worksheet.
        """
        ws.append(self.headings)
        for row in self.rows:
            ws.append([convert_for_openpyxl(row.get(h))
                       for h in self.headings])

    def write_to_odswriter_ods_worksheet(self, ws: "ODSSheet") -> None:
        """
        Writes data from this page to an existing ``odswriter`` ODS sheet.
        """
        # noinspection PyUnresolvedReferences
        ws.writerow(self.headings)
        for row in self.rows:
            # noinspection PyUnresolvedReferences
            ws.writerow([row.get(h) for h in self.headings])

    def r_object_name(self) -> str:
        """
        Name of the object when imported into R.
        The main thing: no leading underscores.
        (A single leading underscore is removed.)
        """
        n = self.name
        n = n[1:] if n.startswith("_") else n
        return f"camcops_{n}"  # less chance of conflict within R

    def r_data_table_definition(self) -> str:
        """
        Returns a string to define this object as a ``data.table`` in R.

        The page is rendered as CSV and embedded in a double-quoted R string
        literal passed to ``data.table::fread(text=...)``.

        See also:

        - https://stackoverflow.com/questions/32103639/read-csv-file-in-r-with-double-quotes
        """  # noqa
        object_name = self.r_object_name()
        csv_text = self.get_tsv(dialect="excel")
        # Escape for an R double-quoted string literal. Backslashes must be
        # doubled FIRST: otherwise a backslash already present in the data
        # would be read by R as the start of an escape sequence (or would
        # cancel the escaping of a following quote and end the literal).
        csv_text = csv_text.replace("\\", "\\\\")
        csv_text = csv_text.replace('"', r'\"')
        definition = (
            f'data.table::fread(sep=",", header=TRUE, text="{csv_text}"\n)'
        )
        return f"{object_name} <- {definition}"
class TsvCollection(object):
    """
    A collection of :class:`camcops_server.cc_modules.cc_tsv.TsvPage` pages
    (spreadsheets), like an Excel workbook.
    """
    def __init__(self) -> None:
        self.pages = []  # type: List[TsvPage]

    def __str__(self) -> str:
        return (
            "TsvCollection:\n" +
            "\n\n".join(page.get_tsv() for page in self.pages)
        )

    # -------------------------------------------------------------------------
    # Pages
    # -------------------------------------------------------------------------

    def page_with_name(self, page_name: str) -> Optional["TsvPage"]:
        """
        Returns the page with the specific name, or ``None`` if no such
        page exists.
        """
        return next((page for page in self.pages if page.name == page_name),
                    None)

    def add_page(self, page: "TsvPage") -> None:
        """
        Adds a new page to our collection. If the new page has the same name
        as an existing page, rows from the new page are added to the existing
        page. Does nothing if the new page is empty.
        """
        if page.empty:
            return
        existing_page = self.page_with_name(page.name)
        if existing_page:
            # Blend with existing page
            existing_page.add_rows_from_page(page)
        else:
            # New page
            self.pages.append(page)

    def add_pages(self, pages: List["TsvPage"]) -> None:
        """
        Adds all ``pages`` to our collection, via :func:`add_page`.
        """
        for page in pages:
            self.add_page(page)

    def sort_headings_within_all_pages(self) -> None:
        """
        Sort headings within each of our pages.
        """
        for page in self.pages:
            page.sort_headings()

    def sort_pages(self) -> None:
        """
        Sort our pages by their page name.
        """
        self.pages.sort(key=lambda p: p.name)

    def get_page_names(self) -> List[str]:
        """
        Return a list of the names of all our pages.
        """
        return [p.name for p in self.pages]

    # -------------------------------------------------------------------------
    # TSV
    # -------------------------------------------------------------------------

    def get_tsv_file(self, page_name: str) -> str:
        """
        Returns a TSV file for a named page.

        Raises:
            :exc:`AssertionError` if the named page does not exist

        """
        page = self.page_with_name(page_name)
        assert page is not None, f"No such page with name {page_name}"
        return page.get_tsv()

    # -------------------------------------------------------------------------
    # ZIP of TSVs
    # -------------------------------------------------------------------------

    def write_zip(self,
                  file: Union[str, BinaryIO],
                  encoding: str = "utf-8",
                  compression: int = zipfile.ZIP_DEFLATED) -> None:
        """
        Writes data to a file, as a ZIP file of TSV files.

        Args:
            file: filename or file-like object
            encoding: encoding to use when writing the TSV files
            compression: compression method to use

        Choice of compression method: see

        - https://docs.python.org/3/library/zipfile.html
        - https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT
        - https://en.wikipedia.org/wiki/Zip_(file_format)#Compression_methods

        Note also that ``openpyxl`` uses ``ZIP_DEFLATED``, which seems to be
        the most portable if not the best compression.
        """
        if isinstance(file, str):  # it's a filename
            with open(file, "wb") as binaryfile:
                # Recurse once, forwarding ALL options (including the
                # caller's choice of compression method).
                self.write_zip(binaryfile, encoding, compression)
            return
        with zipfile.ZipFile(file, mode="w", compression=compression) as z:
            # Write to ZIP.
            # If there are no valid task instances, there'll be no TSV;
            # that's OK.
            for filename_stem in self.get_page_names():
                tsv_filename = filename_stem + ".tsv"
                tsv_contents = self.get_tsv_file(page_name=filename_stem)
                z.writestr(tsv_filename, tsv_contents.encode(encoding))

    def as_zip(self, encoding: str = "utf-8") -> bytes:
        """
        Returns the TSV collection as a ZIP file containing TSV files.

        Args:
            encoding: encoding to use when writing the TSV files
        """
        with io.BytesIO() as memfile:
            self.write_zip(memfile, encoding)
            return memfile.getvalue()

    # -------------------------------------------------------------------------
    # XLSX, ODS
    # -------------------------------------------------------------------------

    def write_xlsx(self, file: Union[str, BinaryIO]) -> None:
        """
        Write the contents in XLSX (Excel) format to a file.

        Args:
            file: filename or file-like object
        """
        if XLSX_VIA_PYEXCEL:  # use pyexcel_xlsx
            data = self._get_pyexcel_data(convert_for_openpyxl)
            pyexcel_xlsx.save_data(file, data)
        else:  # use openpyxl
            # Marginal performance gain with write_only. Does not automatically
            # add a blank sheet
            wb = XLWorkbook(write_only=True)
            valid_name_dict = self.get_pages_with_valid_sheet_names()
            for page, title in valid_name_dict.items():
                ws = wb.create_sheet(title=title)
                page.write_to_openpyxl_xlsx_worksheet(ws)
            wb.save(file)

    def as_xlsx(self) -> bytes:
        """
        Returns the TSV collection as an XLSX (Excel) file.
        """
        with io.BytesIO() as memfile:
            self.write_xlsx(memfile)
            return memfile.getvalue()

    @staticmethod
    def get_sheet_title(page: "TsvPage") -> str:
        r"""
        Returns a worksheet name for a :class:`TsvPage`.

        See ``openpyxl/workbook/child.py``.

        - Excel prohibits ``\``, ``*``, ``?``, ``:``, ``/``, ``[``, ``]``
        - LibreOffice also prohibits ``'`` as first or last character but let's
          just replace that globally.
        - Titles longer than 31 characters are cut to 28 characters plus
          ``...``.
        """
        title = re.sub(r"[\\*?:/\[\]']", "_", page.name)

        if len(title) > 31:
            title = f"{title[:28]}..."

        return title

    def _get_pyexcel_data(self, converter: Callable[[Any], Any]) \
            -> Dict[str, List[List[Any]]]:
        """
        Returns data in the format expected by ``pyexcel``, which is an ordered
        dictionary mapping sheet names to a list of rows, where each row is a
        list of cell values.

        Args:
            converter: function applied to every data cell
        """
        data = OrderedDict()
        # Use the de-duplicated sheet names. Distinct page names can map to
        # the same sanitised/truncated title; keying on the raw title would
        # make a later page silently replace an earlier one.
        for page, title in self.get_pages_with_valid_sheet_names().items():
            data[title] = page.spreadsheetrows(converter)
        return data

    def write_ods(self, file: Union[str, BinaryIO]) -> None:
        """
        Writes an ODS (OpenOffice spreadsheet document) to a file.

        Args:
            file: filename or file-like object
        """
        if ODS_VIA_PYEXCEL:  # use pyexcel_ods3
            data = self._get_pyexcel_data(convert_for_pyexcel_ods3)
            pyexcel_ods3.save_data(file, data)
        else:  # use odswriter
            if isinstance(file, str):  # it's a filename
                with open(file, "wb") as binaryfile:
                    return self.write_ods(binaryfile)  # recurse once
            # noinspection PyCallingNonCallable
            with ODSWriter(file) as odsfile:
                valid_name_dict = self.get_pages_with_valid_sheet_names()
                for page, title in valid_name_dict.items():
                    sheet = odsfile.new_sheet(name=title)
                    page.write_to_odswriter_ods_worksheet(sheet)

    def as_ods(self) -> bytes:
        """
        Returns the TSV collection as an ODS (OpenOffice spreadsheet document)
        file.
        """
        with io.BytesIO() as memfile:
            self.write_ods(memfile)
            return memfile.getvalue()

    def get_pages_with_valid_sheet_names(self) -> Dict["TsvPage", str]:
        """
        Returns an ordered mapping from :class:`TsvPage` objects to their
        sheet names. The names are sanitised via :meth:`get_sheet_title` and
        then de-duplicated via :meth:`make_sheet_names_unique`.
        """
        name_dict = OrderedDict()

        for page in self.pages:
            name_dict[page] = self.get_sheet_title(page)

        self.make_sheet_names_unique(name_dict)

        return name_dict

    @staticmethod
    def make_sheet_names_unique(name_dict: Dict["TsvPage", str]) -> None:
        """
        Modifies (in place) a mapping from :class:`TsvPage` to worksheet names,
        such that all page names are unique (compared case-insensitively).

        - See also :func:`avoid_duplicate_name` in
          ``openpyxl/workbook/child.py``
        - We keep the 31 character restriction: a clashing name has its
          trailing characters overwritten by a counter, so its length never
          grows.
        """
        used_names = set()  # lower-cased names already allocated

        # Iterate over a snapshot, since values are reassigned as we go.
        for page, name in list(name_dict.items()):
            attempt = 0

            while name.lower() in used_names:
                attempt += 1

                if attempt > 1000:
                    # algorithm failure, better to let Excel deal with the
                    # consequences than get stuck in a loop
                    log.debug(
                        f"Failed to generate a unique sheet name from {name}"
                    )
                    break

                # Increment any trailing number (treating "none" as 0) and
                # write it over the end of the name.
                match = re.search(r'\d+$', name)
                count = int(match.group()) if match is not None else 0
                new_suffix = str(count + 1)
                name = name[:-len(new_suffix)] + new_suffix

            name_dict[page] = name
            used_names.add(name.lower())

    # -------------------------------------------------------------------------
    # R
    # -------------------------------------------------------------------------

    def as_r(self) -> str:
        """
        Returns data as an R script.

        This could be more sophisticated, e.g. creating factors with
        appropriate levels (etc.).
        """
        now = format_datetime(get_now_localtz_pendulum(),
                              DateFormat.ISO8601_HUMANIZED_TO_SECONDS_TZ)
        table_definition_str = "\n\n".join(
            page.r_data_table_definition()
            for page in self.pages
        )
        script = f"""#!/usr/bin/env Rscript

# R script generated by CamCOPS at {now}

# =============================================================================
# Libraries
# =============================================================================

library(data.table)

# =============================================================================
# Data
# =============================================================================

{table_definition_str}

"""
        return script

    def write_r(self, filename: str, encoding: str = "utf-8") -> None:
        """
        Write the contents in R format to a file.

        Args:
            filename: name of the file to write (it is opened in text mode)
            encoding: encoding to use
        """
        with open(filename, "wt", encoding=encoding) as f:
            f.write(self.as_r())
def _make_benchmarking_collection(nsheets: int = 100,
                                  nrows: int = 200,
                                  ncols: int = 30,
                                  mindata: int = 0,
                                  maxdata: int = 1000000) -> TsvCollection:
    """
    Builds a :class:`TsvCollection` filled with random data, for benchmarking.

    Args:
        nsheets: number of pages to create (named ``sheet1``, ``sheet2``, ...)
        nrows: number of rows per page
        ncols: number of columns per page (named ``c1``, ``c2``, ...)
        mindata: lowest random integer to generate (inclusive)
        maxdata: highest random integer to generate (inclusive)

    Every cell holds a random integer rendered as a string.
    """
    log.info(f"Creating TsvCollection with nsheets={nsheets}, nrows={nrows}, "
             f"ncols={ncols}...")
    column_names = [f"c{colnum}" for colnum in range(1, ncols + 1)]
    collection = TsvCollection()
    for sheet_index in range(nsheets):
        sheet_rows = []
        for _ in range(nrows):
            # One random value per column, drawn in column order.
            sheet_rows.append({
                column: str(random.randint(mindata, maxdata))
                for column in column_names
            })
        collection.add_page(
            TsvPage(name=f"sheet{sheet_index + 1}", rows=sheet_rows)
        )
    log.info("... done.")
    return collection
def file_size(filename: str) -> int:
    """
    Reports how many bytes the named file occupies.

    Args:
        filename: path of the file to examine
    """
    return os.path.getsize(filename)
def benchmark_save(xlsx_filename: str = "test.xlsx",
                   ods_filename: str = "test.ods",
                   tsv_zip_filename: str = "test.zip",
                   r_filename: str = "test.R") -> None:
    """
    Creates a benchmarking collection and saves it in each supported format
    in turn (TSV ZIP, XLSX, ODS, R), logging the size of each file written.

    Use with:

    .. code-block:: python

        from cardinal_pythonlib.logs import main_only_quicksetup_rootlogger
        from camcops_server.cc_modules.cc_tsv import benchmark_save
        main_only_quicksetup_rootlogger()
        benchmark_save()

    Args:
        xlsx_filename: XLSX file to create
        ods_filename: ODS file to create
        tsv_zip_filename: TSV ZIP file to create
        r_filename: R script to create

    Problem in Nov 2019 is that ODS is extremely slow. Rough timings:

    - TSV ZIP: about 4.1 Mb, about 0.2 s. Good.
    - XLSX (via openpyxl): about 4.6 Mb, 16 seconds.
    - XLSX (via pyexcel_xlsx): about 4.6 Mb, 16 seconds.
    - ODS (via odswriter): about 53 Mb, 56 seconds.
    - ODS (via pyexcel_ods3): about 2.8 Mb, 29 seconds.
    """
    coll = _make_benchmarking_collection()

    # (label for the log, writer method, destination), in execution order.
    jobs = (
        ("TSV ZIP", coll.write_zip, tsv_zip_filename),
        ("XLSX", coll.write_xlsx, xlsx_filename),
        ("ODS", coll.write_ods, ods_filename),
        ("R", coll.write_r, r_filename),
    )
    for label, writer, destination in jobs:
        log.info(f"Writing {label}...")
        writer(destination)
        log.info(f"... done. File size {file_size(destination)}")