Coverage for cc_modules/cc_spreadsheet.py: 29%

267 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-11-08 23:14 +0000

1#!/usr/bin/env python 

2 

3""" 

4camcops_server/cc_modules/cc_spreadsheet.py 

5 

6=============================================================================== 

7 

8 Copyright (C) 2012, University of Cambridge, Department of Psychiatry. 

9 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

10 

11 This file is part of CamCOPS. 

12 

13 CamCOPS is free software: you can redistribute it and/or modify 

14 it under the terms of the GNU General Public License as published by 

15 the Free Software Foundation, either version 3 of the License, or 

16 (at your option) any later version. 

17 

18 CamCOPS is distributed in the hope that it will be useful, 

19 but WITHOUT ANY WARRANTY; without even the implied warranty of 

20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

21 GNU General Public License for more details. 

22 

23 You should have received a copy of the GNU General Public License 

24 along with CamCOPS. If not, see <https://www.gnu.org/licenses/>. 

25 

26=============================================================================== 

27 

28**Helper functions/classes for spreadsheet-style tab-separated value (TSV) 

29(and related) exports.** 

30 

31""" 

32 

33from collections import OrderedDict 

34import csv 

35import io 

36import logging 

37import os 

38import random 

39import re 

40from typing import ( 

41 Any, 

42 BinaryIO, 

43 Callable, 

44 Container, 

45 Dict, 

46 Iterable, 

47 List, 

48 Optional, 

49 Sequence, 

50 Union, 

51) 

52import zipfile 

53 

54from cardinal_pythonlib.datetimefunc import ( 

55 format_datetime, 

56 get_now_localtz_pendulum, 

57) 

58from cardinal_pythonlib.excel import ( 

59 convert_for_openpyxl, 

60 convert_for_pyexcel_ods3, 

61) 

62from cardinal_pythonlib.logs import BraceStyleAdapter 

63from sqlalchemy.engine.result import ResultProxy 

64 

65from camcops_server.cc_modules.cc_constants import DateFormat 

66 

67ODS_VIA_PYEXCEL = True # significantly faster 

68XLSX_VIA_PYEXCEL = True 

69 

70if ODS_VIA_PYEXCEL: 

71 import pyexcel_ods3 # e.g. pip install pyexcel-ods3==0.5.3 

72 

73 ODSWriter = ODSSheet = None 

74else: 

75 from odswriter import ODSWriter, Sheet as ODSSheet # noqa 

76 

77 pyexcel_ods3 = None 

78 

79if XLSX_VIA_PYEXCEL: 

80 import pyexcel_xlsx # e.g. pip install pyexcel-xlsx==0.5.7 

81 

82 openpyxl = XLWorkbook = XLWorksheet = None 

83else: 

84 from openpyxl.workbook.workbook import Workbook as XLWorkbook 

85 from openpyxl.worksheet.worksheet import Worksheet as XLWorksheet 

86 

87 pyexcel_xlsx = None 

88 

89log = BraceStyleAdapter(logging.getLogger(__name__)) 

90 

91 

92# ============================================================================= 

93# Spreadsheet output holding structures 

94# ============================================================================= 

95 

96 

class SpreadsheetPage(object):
    """
    Represents a single "spreadsheet" page, e.g. for TSV/Excel/ODS output.

    Data are held as a list of rows (:attr:`rows`), each a dictionary mapping
    column name to value, plus an ordered list of column names
    (:attr:`headings`). All export functions iterate over :attr:`headings`,
    so a row that lacks a given column simply yields ``None`` for that cell.
    """

    def __init__(
        self, name: str, rows: List[Union[Dict[str, Any], OrderedDict]]
    ) -> None:
        """
        Args:
            name: name for the whole sheet
            rows: list of rows, where each row is a dictionary mapping
                column name to value

        Raises:
            :exc:`AssertionError` if ``name`` is empty
        """
        assert name, "Missing name"
        self.name = name
        self.rows = rows
        self.headings = []  # type: List[str]
        # Headings are collected in order of first appearance across rows.
        for row in rows:
            self._add_headings_if_absent(row.keys())

    def __str__(self) -> str:
        return f"SpreadsheetPage: name={self.name}\n{self.get_tsv()}"

    @classmethod
    def from_headings_rows(
        cls, name: str, headings: Iterable[str], rows: List[Sequence[Any]]
    ) -> "SpreadsheetPage":
        """
        Creates a SpreadsheetPage object using a list of headings and the row
        data as a list of lists.

        Args:
            name: name for the sheet
            headings: column names, in order
            rows: row data; every row must have one value per heading

        Raises:
            :exc:`AssertionError` if a row's length differs from the number
            of headings
        """
        page = cls(name=name, rows=[])
        # Take a private list copy. This (a) stops later heading changes
        # (e.g. via add_or_set_value) from mutating the caller's list, and
        # (b) copes with non-list key views, which lack append()/copy().
        page.headings = list(headings)
        n_cols = len(page.headings)
        for row in rows:
            assert len(row) == n_cols
            page.rows.append(dict(zip(page.headings, row)))
        return page

    @classmethod
    def from_resultproxy(
        cls, name: str, rp: "ResultProxy"
    ) -> "SpreadsheetPage":
        """
        Creates a SpreadsheetPage object from an SQLAlchemy ResultProxy.

        Args:
            rp:
                A :class:`sqlalchemy.engine.result.ResultProxy`.
            name:
                Name for this sheet.
        """
        column_names = rp.keys()
        rows = rp.fetchall()
        return cls.from_headings_rows(
            name=name, headings=column_names, rows=rows
        )

    @property
    def empty(self) -> bool:
        """
        Do we have zero rows?
        """
        return len(self.rows) == 0

    def _add_headings_if_absent(self, headings: Iterable[str]) -> None:
        """
        Add any headings we've not yet seen to our list of headings,
        preserving the order in which they are first encountered.
        """
        for h in headings:
            if h not in self.headings:
                self.headings.append(h)

    def add_or_set_value(self, heading: str, value: Any) -> None:
        """
        If we contain only a single row, this function will set the value
        for a given column (``heading``) to ``value``.

        Raises:
            :exc:`AssertionError` if we don't have exactly 1 row
        """
        assert len(self.rows) == 1, "add_value can only be used if #rows == 1"
        self._add_headings_if_absent([heading])
        self.rows[0][heading] = value

    def add_or_set_column(self, heading: str, values: List[Any]) -> None:
        """
        Set the column labelled ``heading`` so it contains the values specified
        in ``values``. The length of ``values`` must equal the number of rows
        that we already contain.

        Raises:
            :exc:`AssertionError` if the number of values doesn't match
            the number of existing rows
        """
        assert len(values) == len(self.rows), "#values != #existing rows"
        self._add_headings_if_absent([heading])
        for row, value in zip(self.rows, values):
            row[heading] = value

    def add_or_set_columns_from_page(self, other: "SpreadsheetPage") -> None:
        """
        This function presupposes that ``self`` and ``other`` are two pages
        ("spreadsheets") with *matching* rows.

        It updates values or creates columns in ``self`` such that the values
        from all columns in ``other`` are written to the corresponding rows of
        ``self``.

        Raises:
            :exc:`AssertionError` if the two pages (sheets) don't have
            the same number of rows.
        """
        assert len(self.rows) == len(other.rows), "Mismatched #rows"
        self._add_headings_if_absent(other.headings)
        for row, other_row in zip(self.rows, other.rows):
            row.update(other_row)

    def add_rows_from_page(self, other: "SpreadsheetPage") -> None:
        """
        Add all rows from ``other`` to ``self``.

        The row dictionaries themselves are shared with ``other``, not copied.
        """
        self._add_headings_if_absent(other.headings)
        self.rows.extend(other.rows)

    def sort_headings(self) -> None:
        """
        Sort our headings internally.
        """
        self.headings.sort()

    def delete_columns(self, headings: Container[str]) -> None:
        """
        Removes columns with the specified heading names.
        Used to simplify spreadsheets.

        Since our rows are a dictionary, and our export functions are based on
        the headings, all we have to do is to delete the unwanted headings.
        """
        self.headings = [h for h in self.headings if h not in headings]

    @property
    def plainrows(self) -> List[List[Any]]:
        """
        Returns a list of rows, where each row is a list of values.
        Does not include a "header" row.

        Compare :attr:`rows`, which is a list of dictionaries.
        """
        return [[row.get(h) for h in self.headings] for row in self.rows]

    def spreadsheetrows(
        self, converter: Callable[[Any], Any]
    ) -> List[List[Any]]:
        """
        Like :meth:`plainrows`, but (a) ensures every cell is converted to a
        value that can be sent to a spreadsheet converter (e.g. ODS, XLSX), and
        (b) includes a header row.

        Args:
            converter: function applied to every data cell (not to headings)
        """
        rows = [list(self.headings)]  # header row is a copy, not an alias
        for row in self.rows:
            rows.append([converter(row.get(h)) for h in self.headings])
        return rows

    def get_tsv(self, dialect: str = "excel-tab") -> str:
        r"""
        Returns the entire page (sheet) as TSV: one header row and then
        lots of data rows.

        For the dialect, see
        https://docs.python.org/3/library/csv.html#csv.excel_tab.

        For CSV files, see RFC 4180: https://tools.ietf.org/html/rfc4180.

        For TSV files, see
        https://www.iana.org/assignments/media-types/text/tab-separated-values.

        Test code:

        .. code-block:: python

            import io
            import csv
            from typing import List

            def test(row: List[str], dialect: str = "excel-tab") -> str:
                f = io.StringIO()
                writer = csv.writer(f, dialect=dialect)
                writer.writerow(row)
                return f.getvalue()

            test(["hello", "world"])
            test(["hello\ttab", "world"])  # actual tab within double quotes
            test(["hello\nnewline", "world"])  # actual newline within double quotes
            test(['hello"doublequote', "world"])  # doubled double quote within double quotes

        """  # noqa
        f = io.StringIO()
        writer = csv.writer(f, dialect=dialect)
        writer.writerow(self.headings)
        for row in self.rows:
            writer.writerow([row.get(h) for h in self.headings])
        return f.getvalue()

    def write_to_openpyxl_xlsx_worksheet(self, ws: "XLWorksheet") -> None:
        """
        Writes data from this page to an existing ``openpyxl`` XLSX worksheet.
        """
        ws.append(self.headings)
        for row in self.rows:
            ws.append(
                [convert_for_openpyxl(row.get(h)) for h in self.headings]
            )

    def write_to_odswriter_ods_worksheet(self, ws: "ODSSheet") -> None:
        """
        Writes data from this page to an existing ``odswriter`` ODS sheet.
        """
        # noinspection PyUnresolvedReferences
        ws.writerow(self.headings)
        for row in self.rows:
            # noinspection PyUnresolvedReferences
            ws.writerow([row.get(h) for h in self.headings])

    def r_object_name(self) -> str:
        """
        Name of the object when imported into R.
        The main thing: no leading underscores.
        """
        n = self.name
        n = n[1:] if n.startswith("_") else n
        return f"camcops_{n}"  # less chance of conflict within R

    def r_data_table_definition(self) -> str:
        """
        Returns a string to define this object as a ``data.table`` in R.

        The page is rendered as CSV and embedded in a double-quoted R string
        literal, so backslashes and double quotes in the CSV text are
        backslash-escaped.

        See also:

        - https://stackoverflow.com/questions/32103639/read-csv-file-in-r-with-double-quotes
        """  # noqa
        object_name = self.r_object_name()
        csv_text = self.get_tsv(dialect="excel")
        # Escape backslashes FIRST (otherwise we would re-escape the
        # backslashes that we add for the quotes), then double quotes.
        csv_text = csv_text.replace("\\", r"\\").replace('"', r"\"")
        definition = (
            f'data.table::fread(sep=",", header=TRUE, text="{csv_text}"\n)'
        )
        return f"{object_name} <- {definition}"

348 

349 

class SpreadsheetCollection(object):
    """
    A collection of
    :class:`camcops_server.cc_modules.cc_spreadsheet.SpreadsheetPage` pages
    (spreadsheets), like an Excel workbook.

    Can be exported as a ZIP of TSV files, as XLSX, as ODS, or as an R
    script.
    """

    def __init__(self) -> None:
        self.pages = []  # type: List[SpreadsheetPage]

    def __str__(self) -> str:
        return "SpreadsheetCollection:\n" + "\n\n".join(
            page.get_tsv() for page in self.pages
        )

    # -------------------------------------------------------------------------
    # Pages
    # -------------------------------------------------------------------------

    def page_with_name(self, page_name: str) -> Optional[SpreadsheetPage]:
        """
        Returns the page with the specific name, or ``None`` if no such
        page exists.
        """
        return next(
            (page for page in self.pages if page.name == page_name), None
        )

    def add_page(self, page: SpreadsheetPage) -> None:
        """
        Adds a new page to our collection. If the new page has the same name
        as an existing page, rows from the new page are added to the existing
        page. Does nothing if the new page is empty.
        """
        if page.empty:
            return
        existing_page = self.page_with_name(page.name)
        if existing_page:
            # Blend with existing page
            existing_page.add_rows_from_page(page)
        else:
            # New page
            self.pages.append(page)

    def add_pages(self, pages: List[SpreadsheetPage]) -> None:
        """
        Adds all ``pages`` to our collection, via :func:`add_page`.
        """
        for page in pages:
            self.add_page(page)

    def sort_headings_within_all_pages(self) -> None:
        """
        Sort headings within each of our pages.
        """
        for page in self.pages:
            page.sort_headings()

    def sort_pages(self) -> None:
        """
        Sort our pages by their page name.
        """
        self.pages.sort(key=lambda p: p.name)

    def get_page_names(self) -> List[str]:
        """
        Return a list of the names of all our pages.
        """
        return [p.name for p in self.pages]

    def delete_page(self, page_name: str) -> None:
        """
        Delete any page with the name specified.
        """
        self.pages = [p for p in self.pages if p.name != page_name]

    def delete_pages(self, page_names: Container[str]) -> None:
        """
        Delete pages with the names specified.
        """
        self.pages = [p for p in self.pages if p.name not in page_names]

    def delete_columns(self, headings: Container[str]) -> None:
        """
        Across all pages, removes columns with the specified heading names.
        Used to simplify spreadsheets.
        """
        for p in self.pages:
            p.delete_columns(headings)

    # -------------------------------------------------------------------------
    # TSV
    # -------------------------------------------------------------------------

    def get_tsv_file(self, page_name: str) -> str:
        """
        Returns a TSV file for a named page.

        Raises:
            :exc:`AssertionError` if the named page does not exist

        """
        page = self.page_with_name(page_name)
        assert page is not None, f"No such page with name {page_name}"
        return page.get_tsv()

    # -------------------------------------------------------------------------
    # ZIP of TSVs
    # -------------------------------------------------------------------------

    def write_zip(
        self,
        file: Union[str, BinaryIO],
        encoding: str = "utf-8",
        compression: int = zipfile.ZIP_DEFLATED,
    ) -> None:
        """
        Writes data to a file, as a ZIP file of TSV files.

        Args:
            file: filename or file-like object
            encoding: encoding to use when writing the TSV files
            compression: compression method to use

        Choice of compression method: see

        - https://docs.python.org/3/library/zipfile.html
        - https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT
        - https://en.wikipedia.org/wiki/Zip_(file_format)#Compression_methods

        Note also that ``openpyxl`` uses ``ZIP_DEFLATED``, which seems to be
        the most portable if not the best compression.
        """
        if isinstance(file, str):  # it's a filename
            with open(file, "wb") as binaryfile:
                # Recurse once, now with a file object. Pass "compression"
                # through so the caller's choice is honoured for filenames
                # as well as for file-like objects.
                return self.write_zip(binaryfile, encoding, compression)
        with zipfile.ZipFile(file, mode="w", compression=compression) as z:
            # Write to ZIP.
            # If there are no valid task instances, there'll be no TSV;
            # that's OK.
            for filename_stem in self.get_page_names():
                tsv_filename = filename_stem + ".tsv"
                tsv_contents = self.get_tsv_file(page_name=filename_stem)
                z.writestr(tsv_filename, tsv_contents.encode(encoding))

    def as_zip(self, encoding: str = "utf-8") -> bytes:
        """
        Returns the TSV collection as a ZIP file containing TSV files.

        Args:
            encoding: encoding to use when writing the TSV files
        """
        with io.BytesIO() as memfile:
            self.write_zip(memfile, encoding)
            zip_contents = memfile.getvalue()
        return zip_contents

    # -------------------------------------------------------------------------
    # XLSX, ODS
    # -------------------------------------------------------------------------

    def write_xlsx(self, file: Union[str, BinaryIO]) -> None:
        """
        Write the contents in XLSX (Excel) format to a file.

        Args:
            file: filename or file-like object
        """
        if XLSX_VIA_PYEXCEL:  # use pyexcel_xlsx
            data = self._get_pyexcel_data(convert_for_openpyxl)
            pyexcel_xlsx.save_data(file, data)
        else:  # use openpyxl
            # Marginal performance gain with write_only. Does not automatically
            # add a blank sheet
            wb = XLWorkbook(write_only=True)
            valid_name_dict = self.get_pages_with_valid_sheet_names()
            for page, title in valid_name_dict.items():
                ws = wb.create_sheet(title=title)
                page.write_to_openpyxl_xlsx_worksheet(ws)
            wb.save(file)

    def as_xlsx(self) -> bytes:
        """
        Returns the TSV collection as an XLSX (Excel) file.
        """
        with io.BytesIO() as memfile:
            self.write_xlsx(memfile)
            contents = memfile.getvalue()
        return contents

    @staticmethod
    def get_sheet_title(page: SpreadsheetPage) -> str:
        r"""
        Returns a worksheet name for a :class:`SpreadsheetPage`.

        See ``openpyxl/workbook/child.py``.

        - Excel prohibits ``\``, ``*``, ``?``, ``:``, ``/``, ``[``, ``]``
        - LibreOffice also prohibits ``'`` as first or last character but let's
          just replace that globally.
        - Titles longer than 31 characters are cut to 28 characters plus
          ``...``.
        """
        title = re.sub(r"[\\*?:/\[\]']", "_", page.name)

        if len(title) > 31:
            title = f"{title[:28]}..."

        return title

    def _get_pyexcel_data(
        self, converter: Callable[[Any], Any]
    ) -> Dict[str, List[List[Any]]]:
        """
        Returns data in the format expected by ``pyexcel``, which is an ordered
        dictionary mapping sheet names to a list of rows, where each row is a
        list of cell values.

        Args:
            converter: function applied to every data cell
        """
        data = OrderedDict()
        # Use de-duplicated sheet titles: two pages whose sanitized/truncated
        # titles collide would otherwise share a dictionary key, and the
        # earlier page's data would be silently overwritten.
        valid_name_dict = self.get_pages_with_valid_sheet_names()
        for page, title in valid_name_dict.items():
            data[title] = page.spreadsheetrows(converter)
        return data

    def write_ods(self, file: Union[str, BinaryIO]) -> None:
        """
        Writes an ODS (OpenOffice spreadsheet document) to a file.

        Args:
            file: filename or file-like object
        """
        if ODS_VIA_PYEXCEL:  # use pyexcel_ods3
            data = self._get_pyexcel_data(convert_for_pyexcel_ods3)
            pyexcel_ods3.save_data(file, data)
        else:  # use odswriter
            if isinstance(file, str):  # it's a filename
                with open(file, "wb") as binaryfile:
                    return self.write_ods(binaryfile)  # recurse once
            # noinspection PyCallingNonCallable
            with ODSWriter(file) as odsfile:
                valid_name_dict = self.get_pages_with_valid_sheet_names()
                for page, title in valid_name_dict.items():
                    sheet = odsfile.new_sheet(name=title)
                    page.write_to_odswriter_ods_worksheet(sheet)

    def as_ods(self) -> bytes:
        """
        Returns the TSV collection as an ODS (OpenOffice spreadsheet document)
        file.
        """
        with io.BytesIO() as memfile:
            self.write_ods(memfile)
            contents = memfile.getvalue()
        return contents

    def get_pages_with_valid_sheet_names(self) -> Dict[SpreadsheetPage, str]:
        """
        Returns an ordered mapping from :class:`SpreadsheetPage` objects to
        their sheet names. Names are sanitized via :meth:`get_sheet_title`
        and then made unique via :meth:`make_sheet_names_unique`.
        """
        name_dict = OrderedDict()

        for page in self.pages:
            name_dict[page] = self.get_sheet_title(page)

        self.make_sheet_names_unique(name_dict)

        return name_dict

    @staticmethod
    def make_sheet_names_unique(name_dict: Dict[SpreadsheetPage, str]) -> None:
        """
        Modifies (in place) a mapping from :class:`SpreadsheetPage` to
        worksheet names, such that all page names are unique
        (case-insensitively).

        - See also :func:`avoid_duplicate_name` in
          ``openpyxl/workbook/child.py``
        - We keep the 31 character restriction: the numeric suffix overwrites
          the end of the name rather than lengthening it.
        """
        unique_names = set()  # lower-case versions of names already taken

        for page, name in name_dict.items():
            attempt = 0

            while name.lower() in unique_names:
                attempt += 1

                if attempt > 1000:
                    # algorithm failure, better to let Excel deal with the
                    # consequences than get stuck in a loop
                    log.debug(
                        f"Failed to generate a unique sheet name from {name}"
                    )
                    break

                # Increment any trailing number (or start from 1).
                match = re.search(r"\d+$", name)
                count = 0
                if match is not None:
                    count = int(match.group())

                new_suffix = str(count + 1)
                # Replace the last len(new_suffix) characters, so the name
                # does not grow.
                name = name[: -len(new_suffix)] + new_suffix
                name_dict[page] = name
            unique_names.add(name.lower())

    # -------------------------------------------------------------------------
    # R
    # -------------------------------------------------------------------------

    def as_r(self) -> str:
        """
        Returns data as an R script.

        This could be more sophisticated, e.g. creating factors with
        appropriate levels (etc.).
        """
        now = format_datetime(
            get_now_localtz_pendulum(),
            DateFormat.ISO8601_HUMANIZED_TO_SECONDS_TZ,
        )
        table_definition_str = "\n\n".join(
            page.r_data_table_definition() for page in self.pages
        )
        script = f"""#!/usr/bin/env Rscript

# R script generated by CamCOPS at {now}

# =============================================================================
# Libraries
# =============================================================================

library(data.table)

# =============================================================================
# Data
# =============================================================================

{table_definition_str}

"""
        return script

    def write_r(self, filename: str, encoding: str = "utf-8") -> None:
        """
        Write the contents in R format to a file.

        Args:
            filename: name of the file to write
            encoding: encoding to use
        """
        with open(filename, "wt", encoding=encoding) as f:
            f.write(self.as_r())

699 

700 

def _make_benchmarking_collection(
    nsheets: int = 100,
    nrows: int = 200,
    ncols: int = 30,
    mindata: int = 0,
    maxdata: int = 1000000,
) -> SpreadsheetCollection:
    """
    Builds a collection of random data for benchmarking the export functions.

    Args:
        nsheets: number of pages (named ``sheet1``, ``sheet2``, ...)
        nrows: number of rows per page
        ncols: number of columns per page (named ``c1``, ``c2``, ...)
        mindata: lowest random integer to generate (inclusive)
        maxdata: highest random integer to generate (inclusive)

    Returns:
        the collection; every cell is a random integer stored as a string
    """
    log.info(
        f"Creating SpreadsheetCollection with nsheets={nsheets}, "
        f"nrows={nrows}, ncols={ncols}..."
    )
    column_names = [f"c{colnum}" for colnum in range(1, ncols + 1)]
    collection = SpreadsheetCollection()
    for sheetnum in range(1, nsheets + 1):
        sheet_rows = []
        for _ in range(nrows):
            # One random value per column, generated in column order.
            sheet_rows.append(
                {
                    heading: str(random.randint(mindata, maxdata))
                    for heading in column_names
                }
            )
        collection.add_page(
            SpreadsheetPage(name=f"sheet{sheetnum}", rows=sheet_rows)
        )
    log.info("... done.")
    return collection

725 

726 

def file_size(filename: str) -> int:
    """
    Returns the size, in bytes, of the file with the given name.
    """
    return os.path.getsize(filename)

732 

733 

def benchmark_save(
    xlsx_filename: str = "test.xlsx",
    ods_filename: str = "test.ods",
    tsv_zip_filename: str = "test.zip",
    r_filename: str = "test.R",
) -> None:
    """
    Writes a randomly generated benchmarking collection to disk in each
    supported format, logging the resulting file sizes.

    Use with:

    .. code-block:: python

        from cardinal_pythonlib.logs import main_only_quicksetup_rootlogger
        from camcops_server.cc_modules.cc_spreadsheet import benchmark_save
        main_only_quicksetup_rootlogger()
        benchmark_save()

    Args:
        xlsx_filename: XLSX file to create
        ods_filename: ODS file to create
        tsv_zip_filename: TSV ZIP file to create
        r_filename: R script to create

    Problem in Nov 2019 is that ODS is extremely slow. Rough timings:

    - TSV ZIP: about 4.1 Mb, about 0.2 s. Good.
    - XLSX (via openpyxl): about 4.6 Mb, 16 seconds.
    - XLSX (via pyexcel_xlsx): about 4.6 Mb, 16 seconds.
    - ODS (via odswriter): about 53 Mb, 56 seconds.
    - ODS (via pyexcel_ods3): about 2.8 Mb, 29 seconds.
    """
    coll = _make_benchmarking_collection()

    # (label for the log, writer method, destination), in execution order.
    jobs = (
        ("TSV ZIP", coll.write_zip, tsv_zip_filename),
        ("XLSX", coll.write_xlsx, xlsx_filename),
        ("ODS", coll.write_ods, ods_filename),
        ("R", coll.write_r, r_filename),
    )
    for label, writer, destination in jobs:
        log.info(f"Writing {label}...")
        writer(destination)
        log.info(f"... done. File size {file_size(destination)}")