Coverage for cc_modules/cc_xml.py: 39%
156 statements
« prev ^ index » next coverage.py v6.5.0, created at 2022-11-08 23:14 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2022-11-08 23:14 +0000
1#!/usr/bin/env python
3"""
4camcops_server/cc_modules/cc_xml.py
6===============================================================================
8 Copyright (C) 2012, University of Cambridge, Department of Psychiatry.
9 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
11 This file is part of CamCOPS.
13 CamCOPS is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 CamCOPS is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with CamCOPS. If not, see <https://www.gnu.org/licenses/>.
26===============================================================================
28**XML helper functions/classes.**
30"""
32import base64
33import datetime
34import logging
35from typing import Any, List, Optional, TYPE_CHECKING, Union
36import xml.sax.saxutils
38from cardinal_pythonlib.logs import BraceStyleAdapter
39from cardinal_pythonlib.reprfunc import auto_repr
40from cardinal_pythonlib.sqlalchemy.orm_inspect import gen_columns
41import pendulum # avoid name confusion with Date
42from pendulum import DateTime as Pendulum
43from semantic_version.base import Version
44from sqlalchemy.sql.schema import Column
45from sqlalchemy.sql.type_api import TypeEngine
47from camcops_server.cc_modules.cc_simpleobjects import XmlSimpleValue
48from camcops_server.cc_modules.cc_sqla_coltypes import gen_camcops_blob_columns
50if TYPE_CHECKING:
51 from camcops_server.cc_modules.cc_request import ( # noqa: F401
52 CamcopsRequest,
53 )
54 from camcops_server.cc_modules.cc_summaryelement import ( # noqa: F401
55 SummaryElement,
56 )
# Module-level logger. It is wrapped in BraceStyleAdapter so that log calls
# in this module can use "{}"-style placeholders.
log = BraceStyleAdapter(logging.getLogger(__name__))
61# =============================================================================
62# Constants
63# =============================================================================
# Element name for SNOMED CT codes.
XML_NAME_SNOMED_CODES = "snomed_ct_codes"

# Namespace declarations that get_xml_tree() adds to the root element. The
# leading space matters: the text is placed directly after the element name.
XML_NAMESPACES = [
    ' xmlns:xsi="https://www.w3.org/2001/XMLSchema-instance"'
    # ' xmlns:dt="https://www.w3.org/2001/XMLSchema-datatypes"'
]

# Additional root attributes that get_xml_tree() adds when comments are
# requested; they declare the "ignore" prefix used by "ignore:comment" and
# mark it as ignorable.
XML_IGNORE_NAMESPACES = [
    'xmlns:mc="https://schemas.openxmlformats.org/markup-compatibility/2006"',
    'xmlns:ignore="https://camcops.readthedocs.org/ignore"',
    # ... actual URL unimportant
    'mc:Ignorable="ignore"',
]
77# http://www.w3.org/TR/xmlschema-1/
78# http://www.w3.org/TR/2004/REC-xmlschema-2-20041028/datatypes.html
class XmlDataTypes:
    """
    Names of the XML Schema data types used by this module.

    These strings are what ends up in an element's ``xsi:type`` attribute
    when the element is rendered by :func:`get_xml_tree`.
    """

    BASE64BINARY = "base64Binary"
    BOOLEAN = "boolean"
    DATE = "date"
    DATETIME = "dateTime"
    DOUBLE = "double"
    INTEGER = "integer"
    STRING = "string"
    TIME = "time"
96# =============================================================================
97# XML element
98# =============================================================================
class XmlElement(object):
    """
    Represents XML data in a tree.

    An element is either a named node (with a simple value, a nested
    :class:`XmlElement`, or a list of child elements) or a piece of literal
    XML text that is emitted verbatim.
    """

    def __init__(
        self,
        name: str,
        value: Any = None,
        datatype: Optional[str] = None,
        comment: Optional[str] = None,
        literal: Optional[str] = None,
    ) -> None:
        """
        Args:
            name: name of this XML element
            value: value of this element: may be a raw value or a list of
                :class:`camcops_server.cc_modules.cc_xml.XmlElement` objects
                (default: ``None``)
            datatype: data type of this element (default: ``None``)
            comment: description of this element (default: ``None``)
            literal: literal XML; overrides all other options
        """
        # Special: boolean requires lower case "true"/"false" (or 0/1), so
        # Python's "True"/"False" are stored as lower-case strings.
        if datatype == XmlDataTypes.BOOLEAN and value is not None:
            value = str(value).lower()
        self.name = name
        self.value = value
        self.datatype = datatype
        self.comment = comment
        self.literal = literal

    def __repr__(self) -> str:
        """
        Shows just this element.
        """
        return auto_repr(self, with_addr=True)
class XmlLiteral(XmlElement):
    """
    An element that carries only a ready-made fragment of XML text.
    """

    def __init__(self, literal: str) -> None:
        """
        Args:
            literal: the XML text to store in the ``literal`` attribute
        """
        # A literal has no element name of its own, hence the empty name.
        super().__init__(literal=literal, name="")
149# =============================================================================
150# Some literals
151# =============================================================================
# Ready-made XML comments, wrapped as XmlLiteral objects so that they can be
# placed in a list of branches alongside ordinary elements.
XML_COMMENT_ANCILLARY = XmlLiteral("<!-- Ancillary records -->")
XML_COMMENT_ANONYMOUS = XmlLiteral("<!-- Anonymous task; no patient info -->")
XML_COMMENT_BLOBS = XmlLiteral("<!-- Associated BLOBs -->")
XML_COMMENT_CALCULATED = XmlLiteral("<!-- Calculated fields -->")
XML_COMMENT_PATIENT = XmlLiteral("<!-- Associated patient details -->")
XML_COMMENT_SNOMED_CT = XmlLiteral("<!-- SNOMED-CT codes -->")
XML_COMMENT_SPECIAL_NOTES = XmlLiteral("<!-- Any special notes added -->")
XML_COMMENT_STORED = XmlLiteral("<!-- Stored fields -->")
163# =============================================================================
164# XML processing
165# =============================================================================
166# The xml.etree.ElementTree and lxml libraries can both do this sort of thing.
167# However, they do look quite fiddly and we only want to create something
168# simple. Therefore, let's roll our own:
def make_xml_branches_from_columns(
    obj, skip_fields: Optional[List[str]] = None
) -> List[XmlElement]:
    """
    Returns a list of XML branches, each an
    :class:`camcops_server.cc_modules.cc_xml.XmlElement`, from an SQLAlchemy
    ORM object, using the list of SQLAlchemy Column objects that
    define/describe its fields.

    Each branch is named after the database column, takes its value from
    the corresponding attribute of ``obj``, and carries the column's comment.

    Args:
        obj: the SQLAlchemy ORM object
        skip_fields: database column names to skip (``None`` or empty: skip
            nothing)

    Returns:
        a list of :class:`camcops_server.cc_modules.cc_xml.XmlElement`
        objects, in the order in which the columns are generated
    """
    skip_fields = skip_fields or []
    return [
        XmlElement(
            name=column.name,
            # The ORM attribute name may differ from the column name, so the
            # value is read via the attribute name.
            value=getattr(obj, attrname),
            datatype=get_xml_datatype_from_sqla_column(column),
            comment=column.comment,
        )
        for attrname, column in gen_columns(obj)
        if column.name not in skip_fields
    ]
def make_xml_branches_from_summaries(
    summaries: List["SummaryElement"],
    skip_fields: Optional[List[str]] = None,
    sort_by_name: bool = True,
) -> List[XmlElement]:
    """
    Returns a list of XML branches, each an
    :class:`camcops_server.cc_modules.cc_xml.XmlElement`, from a list of
    summary data provided by a task.

    Args:
        summaries: list of :class:`SummaryElement` objects
        skip_fields: summary element names to skip (``None`` or empty: skip
            nothing)
        sort_by_name: sort branches by element name?

    Returns:
        a list of :class:`camcops_server.cc_modules.cc_xml.XmlElement`
        objects; in input order unless ``sort_by_name`` is set
    """
    skip_fields = skip_fields or []
    branches = [
        XmlElement(
            name=summary.name,
            value=summary.value,
            datatype=get_xml_datatype_from_sqla_column_type(summary.coltype),
            comment=summary.comment,
        )
        for summary in summaries
        if summary.name not in skip_fields
    ]
    if sort_by_name:
        branches.sort(key=lambda el: el.name)
    return branches
def make_xml_branches_from_blobs(
    req: "CamcopsRequest", obj, skip_fields: Optional[List[str]] = None
) -> List[XmlElement]:
    """
    Return XML branches from those attributes of an SQLAlchemy ORM object
    (e.g. task) that represent BLOBs.

    Filtering is by database column name, but each resulting branch is named
    after the column's BLOB relationship attribute.

    Args:
        req: the :class:`camcops_server.cc_modules.cc_request.CamcopsRequest`
        obj: the SQLAlchemy ORM object
        skip_fields: database column names to skip (``None`` or empty: skip
            nothing)

    Returns:
        a list of :class:`camcops_server.cc_modules.cc_xml.XmlElement` objects

    """
    skip_fields = skip_fields or []
    branches: List[XmlElement] = []
    # The ID attribute name yielded by the generator is not needed here.
    for _id_attrname, column in gen_camcops_blob_columns(obj):
        if column.name in skip_fields:
            continue
        relationship_attr = column.blob_relationship_attr_name
        blob = getattr(obj, relationship_attr)
        branches.append(
            XmlElement(
                name=relationship_attr,
                # A missing BLOB gives an element with no value.
                value=None if blob is None else blob.get_xml_element(req),
                comment=column.comment,
            )
        )
    return branches
def xml_header(eol: str = "\n") -> str:
    """
    Returns the XML declaration (version 1.0, UTF-8 encoding), followed by
    the end-of-line string.

    Args:
        eol: end-of-line string appended after the declaration
    """
    return '<?xml version="1.0" encoding="UTF-8"?>{}'.format(eol)
def get_xml_datatype_from_sqla_column_type(coltype: TypeEngine) -> str:
    """
    Maps an SQLAlchemy column type (such as ``Integer``) to the name of the
    matching XML schema datatype. Compare
    :func:`get_xml_datatype_from_sqla_column`.

    Args:
        coltype: the SQLAlchemy column type; its ``python_type`` is what is
            inspected

    Returns:
        one of the :class:`XmlDataTypes` constants

    Raises:
        NotImplementedError: if the Python type matches none of the known
            categories (BLOBs are handled separately)
    """
    # http://www.xml.dvint.com/docs/SchemaDataTypesQR-2.pdf
    # http://www.w3.org/TR/2004/REC-xmlschema-2-20041028/datatypes.html
    python_type = coltype.python_type
    # python_type is a class, not an instance, hence issubclass() below.
    # The table runs from more specific to less specific, and the first match
    # wins: datetime must precede date, and bool must precede int, because
    # issubclass(bool, int) is True.
    lookup = (
        ((datetime.datetime, Pendulum), XmlDataTypes.DATETIME),
        ((datetime.date, pendulum.Date), XmlDataTypes.DATE),
        ((datetime.time, pendulum.Time), XmlDataTypes.TIME),
        ((bool,), XmlDataTypes.BOOLEAN),
        ((int,), XmlDataTypes.INTEGER),
        ((float,), XmlDataTypes.DOUBLE),
        ((str, Version), XmlDataTypes.STRING),
    )
    for base_classes, xml_datatype in lookup:
        if issubclass(python_type, base_classes):
            return xml_datatype
    # BLOBs are handled separately.
    raise NotImplementedError(
        f"Don't know XML type for SQLAlchemy type {coltype!r} with Python "
        f"type {python_type!r}"
    )
def get_xml_datatype_from_sqla_column(column: Column) -> Optional[str]:
    """
    Looks up the XML schema datatype for an SQLAlchemy Column by passing the
    column's ``type`` (such as ``Integer()``) to
    :func:`get_xml_datatype_from_sqla_column_type`.

    Args:
        column: the SQLAlchemy Column
    """
    return get_xml_datatype_from_sqla_column_type(column.type)
def get_xml_blob_element(
    name: str, blobdata: Optional[bytes], comment: Optional[str] = None
) -> XmlElement:
    """
    Returns an XmlElement representing a base-64-encoded BLOB.

    Args:
        name: XML element name
        blobdata: the raw binary, or ``None``; empty binary data is treated
            like ``None`` and gives an element with no value
        comment: XML comment

    Returns:
        an :class:`camcops_server.cc_modules.cc_xml.XmlElement` of type
        ``base64Binary`` whose value is the base-64 text (as ``str``), or
        ``None`` if there was no data
    """
    # http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#base64Binary
    if blobdata:
        # blobdata is raw binary; base-64 output is pure ASCII
        value = base64.b64encode(blobdata).decode("ascii")
    else:
        value = None
    return XmlElement(
        name=name,
        value=value,
        datatype=XmlDataTypes.BASE64BINARY,
        comment=comment,
    )
def xml_escape_value(value: str) -> str:
    """
    Returns ``value`` escaped for use as XML character data, via
    :func:`xml.sax.saxutils.escape`.
    """
    # http://stackoverflow.com/questions/1091945/
    # https://wiki.python.org/moin/EscapingXml
    escaped = xml.sax.saxutils.escape(value)
    return escaped
def xml_quote_attribute(attr: str) -> str:
    """
    Returns ``attr`` escaped and wrapped in quotes, ready to follow ``=`` in
    an XML attribute, via :func:`xml.sax.saxutils.quoteattr`.

    More stringent than value escaping.
    """
    quoted = xml.sax.saxutils.quoteattr(attr)
    return quoted
def get_xml_tree(
    element: Union[
        XmlElement, XmlSimpleValue, List[Union[XmlElement, XmlSimpleValue]]
    ],
    level: int = 0,
    indent_spaces: int = 4,
    eol: str = "\n",
    include_comments: bool = False,
) -> str:
    # noinspection HttpUrlsUsage
    """
    Renders an :class:`camcops_server.cc_modules.cc_xml.XmlElement` (or a
    list of them, or a simple value) as XML text.

    Args:
        element: root :class:`camcops_server.cc_modules.cc_xml.XmlElement`
        level: starting level/depth (used for recursion)
        indent_spaces: number of spaces to indent formatted XML
        eol: end-of-line string
        include_comments: include comments describing each field?

    Returns:
        the XML text

    Raises:
        ValueError: if ``element`` is not an ``XmlElement``, a list, or an
            ``XmlSimpleValue``

    NULL values are represented with ``xsi:nil``, which requires a namespace
    (declared on elements rendered at level 0):

    - https://stackoverflow.com/questions/774192
    - http://books.xmlschemata.org/relaxng/relax-CHP-11-SECT-1.html

    Comments are written as ``ignore:comment`` attributes:

    - http://blog.galasoft.ch/posts/2010/02/quick-tip-commenting-out-properties-in-xaml/
    - https://stackoverflow.com/questions/2073140/

    Regarding newlines:

    - Nothing special is done, i.e. newlines are provided in raw format.
    - Some browsers (e.g. Firefox, Chrome) may fail to display them correctly,
      so that they look like they're missing; see
      https://stackoverflow.com/questions/2004386. Save the output and
      inspect it with a text editor, or use the browser's "View Source"
      function.

    """  # noqa
    indent = " " * level * indent_spaces

    if isinstance(element, XmlElement):
        if element.literal:
            # A user-inserted piece of XML. Insert, but indent.
            return indent + element.literal + eol

        # Attributes. Namespace declarations go on the root only; nested
        # elements inherit them.
        ns_declarations = []
        if level == 0:
            ns_declarations.extend(XML_NAMESPACES)
            if include_comments:
                ns_declarations.extend(XML_IGNORE_NAMESPACES)
        type_attr = (
            f' xsi:type="{element.datatype}"' if element.datatype else ""
        )
        comment_attr = (
            f" ignore:comment={xml_quote_attribute(element.comment)}"
            if include_comments and element.comment
            else ""
        )
        attributes = " ".join(ns_declarations) + type_attr + comment_attr
        open_tag = f"{indent}<{element.name}{attributes}"

        if element.value is None:
            # NULL handling: self-closing tag marked as nil.
            return f'{open_tag} xsi:nil="true"/>{eol}'

        if isinstance(element.value, (XmlElement, list)):
            # Children go on their own lines; the closing tag is indented.
            content = element.value
            after_open = eol
            before_close = indent
        else:
            # XmlSimpleValue is a marker that subsequently distinguishes
            # things that were part of an XmlElement from user-inserted raw
            # XML. Simple values stay on the same line as their tags.
            content = XmlSimpleValue(element.value)
            after_open = ""
            before_close = ""
        inner = get_xml_tree(
            content,
            level=level + 1,
            indent_spaces=indent_spaces,
            eol=eol,
            include_comments=include_comments,
        )
        return (
            f"{open_tag}>{after_open}"
            f"{inner}{before_close}</{element.name}>{eol}"
        )

    if isinstance(element, list):
        # Siblings are rendered at the same level and concatenated.
        return "".join(
            get_xml_tree(
                child,
                level,
                indent_spaces=indent_spaces,
                eol=eol,
                include_comments=include_comments,
            )
            for child in element
        )

    if isinstance(element, XmlSimpleValue):
        # The lowest-level thing: a value. No extra indent.
        return xml_escape_value(str(element.value))

    raise ValueError(f"Bad value to get_xml_tree: {element!r}")
def get_xml_document(
    root: XmlElement,
    indent_spaces: int = 4,
    eol: str = "\n",
    include_comments: bool = False,
) -> str:
    """
    Builds a complete XML document as text: the XML declaration followed by
    the rendered tree below the given root
    :class:`camcops_server.cc_modules.cc_xml.XmlElement`.

    Args:
        root: root :class:`camcops_server.cc_modules.cc_xml.XmlElement`
        indent_spaces: number of spaces to indent formatted XML
        eol: end-of-line string
        include_comments: include comments describing each field?

    Raises:
        AssertionError: if ``root`` is not an ``XmlElement``
    """
    if isinstance(root, XmlElement):
        header = xml_header(eol)
        body = get_xml_tree(
            root,
            indent_spaces=indent_spaces,
            eol=eol,
            include_comments=include_comments,
        )
        return header + body
    raise AssertionError(
        "get_xml_document: root not an XmlElement; "
        "XML requires a single root"
    )