Coverage for cc_modules/cc_xml.py: 39%

156 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-11-08 23:14 +0000

1#!/usr/bin/env python 

2 

3""" 

4camcops_server/cc_modules/cc_xml.py 

5 

6=============================================================================== 

7 

8 Copyright (C) 2012, University of Cambridge, Department of Psychiatry. 

9 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

10 

11 This file is part of CamCOPS. 

12 

13 CamCOPS is free software: you can redistribute it and/or modify 

14 it under the terms of the GNU General Public License as published by 

15 the Free Software Foundation, either version 3 of the License, or 

16 (at your option) any later version. 

17 

18 CamCOPS is distributed in the hope that it will be useful, 

19 but WITHOUT ANY WARRANTY; without even the implied warranty of 

20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

21 GNU General Public License for more details. 

22 

23 You should have received a copy of the GNU General Public License 

24 along with CamCOPS. If not, see <https://www.gnu.org/licenses/>. 

25 

26=============================================================================== 

27 

28**XML helper functions/classes.** 

29 

30""" 

31 

32import base64 

33import datetime 

34import logging 

35from typing import Any, List, Optional, TYPE_CHECKING, Union 

36import xml.sax.saxutils 

37 

38from cardinal_pythonlib.logs import BraceStyleAdapter 

39from cardinal_pythonlib.reprfunc import auto_repr 

40from cardinal_pythonlib.sqlalchemy.orm_inspect import gen_columns 

41import pendulum # avoid name confusion with Date 

42from pendulum import DateTime as Pendulum 

43from semantic_version.base import Version 

44from sqlalchemy.sql.schema import Column 

45from sqlalchemy.sql.type_api import TypeEngine 

46 

47from camcops_server.cc_modules.cc_simpleobjects import XmlSimpleValue 

48from camcops_server.cc_modules.cc_sqla_coltypes import gen_camcops_blob_columns 

49 

50if TYPE_CHECKING: 

51 from camcops_server.cc_modules.cc_request import ( # noqa: F401 

52 CamcopsRequest, 

53 ) 

54 from camcops_server.cc_modules.cc_summaryelement import ( # noqa: F401 

55 SummaryElement, 

56 ) 

57 

# Module-level logger; wrapped so that log calls can use brace-style ("{}")
# message formatting.
log = BraceStyleAdapter(
    logging.getLogger(__name__)
)

59 

60 

61# ============================================================================= 

62# Constants 

63# ============================================================================= 

64 

# Name of the XML element that holds SNOMED CT codes.
XML_NAME_SNOMED_CODES = "snomed_ct_codes"

# Namespace declarations applied to the root element.
# NB: XML namespace names are compared as literal strings, so the scheme
# matters. The XML Schema instance namespace is defined by the W3C with an
# "http://" name; an "https://" variant is a *different* namespace, which
# would stop schema-aware consumers from recognizing xsi:nil / xsi:type.
# The leading space is deliberate: the string is appended directly after the
# element name.
XML_NAMESPACES = [
    ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"'
    # ' xmlns:dt="http://www.w3.org/2001/XMLSchema-datatypes"'
]

# Extra root-element attributes used when comments are included: comments are
# carried in an "ignore:" attribute, which is declared ignorable via the
# markup-compatibility namespace (again a fixed "http://" name).
XML_IGNORE_NAMESPACES = [
    'xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"',
    'xmlns:ignore="https://camcops.readthedocs.org/ignore"',
    # ... actual URL unimportant
    'mc:Ignorable="ignore"',
]

77# http://www.w3.org/TR/xmlschema-1/ 

78# http://www.w3.org/TR/2004/REC-xmlschema-2-20041028/datatypes.html 

79 

80 

class XmlDataTypes(object):
    """
    Names of the XML data types used by this module, as string constants.
    These are the values written into ``xsi:type`` attributes.
    """

    # Binary
    BASE64BINARY = "base64Binary"

    # Logical
    BOOLEAN = "boolean"

    # Temporal
    DATE = "date"
    DATETIME = "dateTime"

    # Numeric
    DOUBLE = "double"
    INTEGER = "integer"

    # Textual
    STRING = "string"

    # Temporal (time of day)
    TIME = "time"

94 

95 

96# ============================================================================= 

97# XML element 

98# ============================================================================= 

99 

100 

class XmlElement(object):
    """
    One node of an XML tree: a named element with an optional value, data
    type, descriptive comment, or a literal XML override.
    """

    def __init__(
        self,
        name: str,
        value: Any = None,
        datatype: str = None,
        comment: str = None,
        literal: str = None,
    ) -> None:
        """
        Args:
            name:
                the element's name
            value:
                the element's content: either a raw value, or a list of
                :class:`camcops_server.cc_modules.cc_xml.XmlElement` objects
                (default: ``None``)
            datatype:
                the element's data type (default: ``None``)
            comment:
                a description of the element (default: ``None``)
            literal:
                literal XML; if set, it takes precedence over everything else
        """
        # XML booleans must be lower-case "true"/"false" (or 0/1), so
        # non-null boolean values are stored as lower-cased strings.
        stored_value = (
            str(value).lower()
            if datatype == XmlDataTypes.BOOLEAN and value is not None
            else value
        )
        self.name = name
        self.value = stored_value
        self.datatype = datatype
        self.comment = comment
        self.literal = literal

    def __repr__(self) -> str:
        """
        Representation of this element alone.
        """
        return auto_repr(self, with_addr=True)

138 

139 

class XmlLiteral(XmlElement):
    """
    An :class:`XmlElement` that carries only a piece of literal XML text.
    """

    def __init__(self, literal: str) -> None:
        """
        Args:
            literal: the literal XML text
        """
        # A literal has no element name of its own, hence the empty name.
        super().__init__(
            name="",
            literal=literal,
        )

147 

148 

149# ============================================================================= 

150# Some literals 

151# ============================================================================= 

152 

# Ready-made XML comments, held as literal elements so that they can be
# placed directly among other XmlElement objects.
XML_COMMENT_ANCILLARY = XmlLiteral(
    "<!-- Ancillary records -->"
)
XML_COMMENT_ANONYMOUS = XmlLiteral(
    "<!-- Anonymous task; no patient info -->"
)
XML_COMMENT_BLOBS = XmlLiteral(
    "<!-- Associated BLOBs -->"
)
XML_COMMENT_CALCULATED = XmlLiteral(
    "<!-- Calculated fields -->"
)
XML_COMMENT_PATIENT = XmlLiteral(
    "<!-- Associated patient details -->"
)
XML_COMMENT_SNOMED_CT = XmlLiteral(
    "<!-- SNOMED-CT codes -->"
)
XML_COMMENT_SPECIAL_NOTES = XmlLiteral(
    "<!-- Any special notes added -->"
)
XML_COMMENT_STORED = XmlLiteral(
    "<!-- Stored fields -->"
)

161 

162 

163# ============================================================================= 

164# XML processing 

165# ============================================================================= 

166# The xml.etree.ElementTree and lxml libraries can both do this sort of thing. 

167# However, they do look quite fiddly and we only want to create something 

168# simple. Therefore, let's roll our own: 

169 

170 

def make_xml_branches_from_columns(
    obj, skip_fields: List[str] = None
) -> List[XmlElement]:
    """
    Builds one :class:`camcops_server.cc_modules.cc_xml.XmlElement` per
    database column of an SQLAlchemy ORM object, taking the element name,
    data type and comment from the SQLAlchemy Column and the value from the
    object itself.

    Args:
        obj: the SQLAlchemy ORM object
        skip_fields: database column names to omit

    Returns:
        the list of XML branches, in the order the columns are generated
    """
    excluded = skip_fields or []  # type: List[str]
    return [
        XmlElement(
            name=col.name,
            value=getattr(obj, attr),
            datatype=get_xml_datatype_from_sqla_column(col),
            comment=col.comment,
        )
        for attr, col in gen_columns(obj)
        if col.name not in excluded
    ]

200 

201 

def make_xml_branches_from_summaries(
    summaries: List["SummaryElement"],
    skip_fields: List[str] = None,
    sort_by_name: bool = True,
) -> List[XmlElement]:
    """
    Builds one :class:`camcops_server.cc_modules.cc_xml.XmlElement` per
    summary element supplied by a task.

    Args:
        summaries: list of :class:`SummaryElement` objects
        skip_fields: names of summary elements to omit
        sort_by_name: order the resulting branches by element name?

    Returns:
        the list of XML branches
    """
    excluded = skip_fields or []
    elements = [
        XmlElement(
            name=summary.name,
            value=summary.value,
            datatype=get_xml_datatype_from_sqla_column_type(summary.coltype),
            comment=summary.comment,
        )
        for summary in summaries
        if summary.name not in excluded
    ]
    if not sort_by_name:
        return elements
    return sorted(elements, key=lambda branch: branch.name)

234 

235 

def make_xml_branches_from_blobs(
    req: "CamcopsRequest", obj, skip_fields: List[str] = None
) -> List[XmlElement]:
    """
    Builds XML branches for the BLOB-related attributes of an SQLAlchemy ORM
    object (e.g. a task).

    Each branch is named after the BLOB relationship attribute; its value is
    the BLOB's own XML element, or ``None`` if there is no BLOB.

    Args:
        req: the :class:`camcops_server.cc_modules.cc_request.CamcopsRequest`
        obj: the SQLAlchemy ORM object
        skip_fields: database column names to omit

    Returns:
        a list of :class:`camcops_server.cc_modules.cc_xml.XmlElement` objects

    """
    excluded = skip_fields or []  # type: List[str]
    elements = []  # type: List[XmlElement]
    for _id_attrname, blob_column in gen_camcops_blob_columns(obj):
        if blob_column.name not in excluded:
            rel_name = blob_column.blob_relationship_attr_name
            blob = getattr(obj, rel_name)
            if blob is None:
                blob_xml = None
            else:
                blob_xml = blob.get_xml_element(req)
            elements.append(
                XmlElement(
                    name=rel_name,
                    value=blob_xml,
                    comment=blob_column.comment,
                )
            )
    return elements

268 

269 

def xml_header(eol: str = "\n") -> str:
    """
    Returns the XML declaration (version 1.0, UTF-8), followed by the
    end-of-line string ``eol``.
    """
    declaration = '<?xml version="1.0" encoding="UTF-8"?>{}'
    return declaration.format(eol)

275 

276 

def get_xml_datatype_from_sqla_column_type(coltype: TypeEngine) -> str:
    """
    Maps an SQLAlchemy column type (such as ``Integer``) to the name of the
    corresponding XML schema datatype, via the column type's Python type.
    Compare :func:`get_xml_datatype_from_sqla_column`.

    Raises:
        NotImplementedError: if no XML type is known for the Python type
    """
    # http://www.xml.dvint.com/docs/SchemaDataTypesQR-2.pdf
    # http://www.w3.org/TR/2004/REC-xmlschema-2-20041028/datatypes.html
    python_type = coltype.python_type
    # python_type is a class, not an instance, hence issubclass().
    # The table is checked top to bottom and ORDER MATTERS: more specific
    # types must precede less specific ones. For example, bool is a subclass
    # of int, so bool is listed before int.
    lookup = (
        ((datetime.datetime, Pendulum), XmlDataTypes.DATETIME),
        ((datetime.date, pendulum.Date), XmlDataTypes.DATE),
        ((datetime.time, pendulum.Time), XmlDataTypes.TIME),
        ((bool,), XmlDataTypes.BOOLEAN),
        ((int,), XmlDataTypes.INTEGER),
        ((float,), XmlDataTypes.DOUBLE),
        ((str, Version), XmlDataTypes.STRING),
    )
    for candidate_types, xml_type in lookup:
        if issubclass(python_type, candidate_types):
            return xml_type
    # BLOBs are handled separately.
    raise NotImplementedError(
        f"Don't know XML type for SQLAlchemy type {coltype!r} with Python "
        f"type {python_type!r}"
    )

307 

308 

def get_xml_datatype_from_sqla_column(column: Column) -> Optional[str]:
    """
    Maps an SQLAlchemy Column to the name of the XML schema datatype for the
    column's type (such as ``Integer()``). Compare
    :func:`get_xml_datatype_from_sqla_column_type`, to which this delegates.
    """
    return get_xml_datatype_from_sqla_column_type(column.type)

316 

317 

def get_xml_blob_element(
    name: str, blobdata: Optional[bytes], comment: str = None
) -> XmlElement:
    """
    Wraps a BLOB in an XmlElement, with the binary data base-64-encoded.

    Args:
        name: XML element name
        blobdata: the raw binary, or ``None``
        comment: XML comment

    Returns:
        an element of type ``base64Binary``; its value is ``None`` when
        ``blobdata`` is ``None`` or empty
    """
    # http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#base64Binary
    # Raw binary -> base-64 bytes -> ASCII text.
    encoded = (
        base64.b64encode(blobdata).decode("ascii") if blobdata else None
    )
    return XmlElement(
        name=name,
        value=encoded,
        datatype=XmlDataTypes.BASE64BINARY,
        comment=comment,
    )

343 

344 

def xml_escape_value(value: str) -> str:
    """
    Returns ``value`` escaped for use as XML character data.
    """
    # http://stackoverflow.com/questions/1091945/
    # https://wiki.python.org/moin/EscapingXml
    escaped = xml.sax.saxutils.escape(value)
    return escaped

352 

353 

def xml_quote_attribute(attr: str) -> str:
    """
    Returns ``attr`` escaped and wrapped in quotes, ready for use as an XML
    attribute value.

    This is stricter than escaping element content.
    """
    quoted = xml.sax.saxutils.quoteattr(attr)
    return quoted

361 

362 

def get_xml_tree(
    element: Union[
        XmlElement, XmlSimpleValue, List[Union[XmlElement, XmlSimpleValue]]
    ],
    level: int = 0,
    indent_spaces: int = 4,
    eol: str = "\n",
    include_comments: bool = False,
) -> str:
    # noinspection HttpUrlsUsage
    """
    Renders an :class:`camcops_server.cc_modules.cc_xml.XmlElement` (or a
    list of them, or a simple value) as XML text, recursively.

    Args:
        element: root :class:`camcops_server.cc_modules.cc_xml.XmlElement`
        level: starting level/depth (used for recursion)
        indent_spaces: number of spaces per indentation level
        eol: end-of-line string
        include_comments: include comments describing each field?

    Raises:
        ValueError: if ``element`` is not an XmlElement, a list, or an
            XmlSimpleValue

    NULL values are represented with ``xsi:nil``, which requires a namespace
    (declared on the level-0 element):

    - https://stackoverflow.com/questions/774192
    - http://books.xmlschemata.org/relaxng/relax-CHP-11-SECT-1.html

    Comments:

    - http://blog.galasoft.ch/posts/2010/02/quick-tip-commenting-out-properties-in-xaml/
    - https://stackoverflow.com/questions/2073140/

    Regarding newlines:

    - Nothing special is done, i.e. newlines are provided in raw format.
    - However, some browsers may fail to display them correctly (i.e. they look
      like they're missing) -- e.g. Firefox, Chrome -- see
      https://stackoverflow.com/questions/2004386. Just try saving and
      inspecting the results with a text editor, or use the browser's "View
      Source" function (which, for Chrome, shows both newlines and line numbers
      too).

    """  # noqa
    indent = " " * level * indent_spaces

    if isinstance(element, XmlElement):
        if element.literal:
            # User-supplied XML: emitted verbatim, but indented.
            return indent + element.literal + eol

        # ---------------------------------------------------------------------
        # Attributes
        # ---------------------------------------------------------------------
        ns_parts = []
        if level == 0:
            # Namespaces go on the root element only (descendants inherit).
            ns_parts.extend(XML_NAMESPACES)
            if include_comments:
                ns_parts.extend(XML_IGNORE_NAMESPACES)
        ns_attr = " ".join(ns_parts)
        type_attr = (
            f' xsi:type="{element.datatype}"' if element.datatype else ""
        )
        if include_comments and element.comment:
            comment_attr = (
                f" ignore:comment={xml_quote_attribute(element.comment)}"
            )
        else:
            comment_attr = ""
        tag = element.name
        # Everything up to (but excluding) the end of the opening tag:
        tag_start = f"{indent}<{tag}{ns_attr}{type_attr}{comment_attr}"

        # ---------------------------------------------------------------------
        # Assembly
        # ---------------------------------------------------------------------
        content = element.value
        if content is None:
            # NULL: self-closing element marked as nil.
            return f'{tag_start} xsi:nil="true"/>{eol}'

        if isinstance(content, (XmlElement, list)):
            # Nested element(s): children go on their own lines, and the
            # closing tag is indented to match the opening tag.
            children = get_xml_tree(
                content,
                level=level + 1,
                indent_spaces=indent_spaces,
                eol=eol,
                include_comments=include_comments,
            )
            return f"{tag_start}>{eol}{children}{indent}</{tag}>{eol}"

        # Simple value: wrapped in XmlSimpleValue, a marker that distinguishes
        # values belonging to an XmlElement from user-inserted raw XML. It is
        # rendered inline, with no extra whitespace around it.
        text = get_xml_tree(
            XmlSimpleValue(content),
            level=level + 1,
            indent_spaces=indent_spaces,
            eol=eol,
            include_comments=include_comments,
        )
        return f"{tag_start}>{text}</{tag}>{eol}"

    if isinstance(element, list):
        # Siblings: render each at the current level and concatenate.
        return "".join(
            get_xml_tree(
                child,
                level,
                indent_spaces=indent_spaces,
                eol=eol,
                include_comments=include_comments,
            )
            for child in element
        )

    if isinstance(element, XmlSimpleValue):
        # The lowest-level thing: a value. Escaped; no indentation.
        return xml_escape_value(str(element.value))

    raise ValueError(f"Bad value to get_xml_tree: {element!r}")

486 

487 

def get_xml_document(
    root: XmlElement,
    indent_spaces: int = 4,
    eol: str = "\n",
    include_comments: bool = False,
) -> str:
    """
    Renders a complete XML document (XML declaration plus element tree) as
    text, starting from the root
    :class:`camcops_server.cc_modules.cc_xml.XmlElement`.

    Args:
        root: root :class:`camcops_server.cc_modules.cc_xml.XmlElement`
        indent_spaces: number of spaces per indentation level
        eol: end-of-line string
        include_comments: include comments describing each field?

    Raises:
        AssertionError: if ``root`` is not an XmlElement
    """
    if isinstance(root, XmlElement):
        header = xml_header(eol)
        body = get_xml_tree(
            root,
            indent_spaces=indent_spaces,
            eol=eol,
            include_comments=include_comments,
        )
        return header + body
    # Anything else (e.g. a list) cannot be a document root.
    raise AssertionError(
        "get_xml_document: root not an XmlElement; "
        "XML requires a single root"
    )