Coverage for cc_modules/cc_anon.py: 17%
163 statements
« prev ^ index » next coverage.py v6.5.0, created at 2022-11-08 23:14 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2022-11-08 23:14 +0000
1#!/usr/bin/env python
3"""
4camcops_server/cc_modules/cc_anon.py
6===============================================================================
8 Copyright (C) 2012, University of Cambridge, Department of Psychiatry.
9 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
11 This file is part of CamCOPS.
13 CamCOPS is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 CamCOPS is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with CamCOPS. If not, see <https://www.gnu.org/licenses/>.
26===============================================================================
28**Anonymisation functions.**
30Largely superseded by CRATE (https://doi.org/10.1186%2Fs12911-017-0437-1).
32"""
34from collections import OrderedDict
35import csv
36import sys
37from typing import Dict, List, Generator, TextIO, Tuple, TYPE_CHECKING, Union
39from cardinal_pythonlib.sqlalchemy.orm_inspect import coltype_as_typeengine
40from cardinal_pythonlib.sqlalchemy.schema import (
41 convert_sqla_type_for_dialect,
42 does_sqlatype_require_index_len,
43 is_sqlatype_date,
44 is_sqlatype_text_of_length_at_least,
45 RE_COLTYPE_WITH_ONE_PARAM,
46)
47from cardinal_pythonlib.sqlalchemy.session import SQLITE_MEMORY_URL
49# from sqlalchemy.dialects.mssql.base import MSDialect
50from sqlalchemy.dialects.mysql.base import MySQLDialect
51from sqlalchemy.engine import create_engine
52from sqlalchemy.engine.interfaces import Dialect
53from sqlalchemy.orm import Session as SqlASession, sessionmaker
54from sqlalchemy.sql.schema import Column
56from camcops_server.cc_modules.cc_constants import TABLET_ID_FIELD
57from camcops_server.cc_modules.cc_db import FN_PK
58from camcops_server.cc_modules.cc_dump import DumpController
59from camcops_server.cc_modules.cc_patient import Patient
60from camcops_server.cc_modules.cc_patientidnum import (
61 extra_id_colname,
62 EXTRA_IDNUM_FIELD_PREFIX,
63)
64from camcops_server.cc_modules.cc_simpleobjects import TaskExportOptions
65from camcops_server.cc_modules.cc_sqla_coltypes import CamcopsColumn
67if TYPE_CHECKING:
68 from camcops_server.cc_modules.cc_exportrecipientinfo import (
69 ExportRecipientInfo,
70 )
71 from camcops_server.cc_modules.cc_request import CamcopsRequest
74# =============================================================================
75# Constants
76# =============================================================================
78MIN_STRING_LENGTH_TO_CONSIDER_SCRUBBING = 256
81# =============================================================================
82# Write data dictionaries for anonymisation tools
83# =============================================================================
def _gen_columns_for_anon_staging_db(
    req: "CamcopsRequest", recipient: "ExportRecipientInfo"
) -> Generator[Union[Column, CamcopsColumn], None, None]:
    """
    Yields every column of an anonymisation staging database.

    A throwaway engine is created from ``SQLITE_MEMORY_URL`` and handed to a
    :class:`DumpController`, whose ``gen_all_dest_columns()`` output is
    passed straight through to the caller.

    Args:
        req: the request, passed to the :class:`DumpController`
        recipient: supplies the ``db_include_blobs``,
            ``db_patient_id_per_row`` and ``db_add_summaries`` export
            settings
    """
    engine = create_engine(SQLITE_MEMORY_URL, echo=False)
    session = sessionmaker(bind=engine)()  # type: SqlASession
    controller = DumpController(
        dst_engine=engine,
        dst_session=session,
        export_options=TaskExportOptions(
            include_blobs=recipient.db_include_blobs,
            db_patient_id_per_row=recipient.db_patient_id_per_row,
            # Every table is wanted, whether or not it would hold data.
            db_make_all_tables_even_empty=True,
            db_include_summaries=recipient.db_add_summaries,
        ),
        req=req,
    )
    yield from controller.gen_all_dest_columns()
112# -----------------------------------------------------------------------------
113# CRIS
114# -----------------------------------------------------------------------------
def _get_type_size_as_text_from_sqltype(sqltype: str) -> Tuple[str, str]:
    """
    Separates an SQL type from its single size parameter.

    For example, ``VARCHAR(10)`` becomes ``('VARCHAR', '10')``. Anything that
    does not match ``RE_COLTYPE_WITH_ONE_PARAM`` is returned unchanged,
    paired with an empty size: ``(sqltype, '')``.
    """
    match = RE_COLTYPE_WITH_ONE_PARAM.match(sqltype)
    if match is None:
        # Not of the form TYPE(size): pass the text through untouched.
        return sqltype, ""
    return match.group("type").upper(), match.group("size").strip().upper()
133# noinspection PyUnusedLocal
134def _get_cris_dd_row(
135 column: Union[Column, CamcopsColumn, None],
136 recipient: "ExportRecipientInfo",
137 dest_dialect: Dialect = None,
138) -> Dict:
139 """
140 Args:
141 column:
142 A column specification (or ``None`` to create a dummy dictionary).
143 dest_dialect:
144 The SQL dialect of the destination database. If ``None``, then
145 MySQL is used as the default.
147 Returns:
148 An :class:`OrderedDict` with information for a CRIS data dictionary
149 row.
150 """
151 dest_dialect = dest_dialect or MySQLDialect() # MSDialect() for SQL Server
152 valid_values = None
153 if column is None:
154 # Dummy row
155 colname = None
156 tablename = None
157 taskname = None
158 comment = None
159 feft = None
160 security_status = None
161 finaltype = None
162 tlfa = None
163 size = None
164 else:
165 colname = column.name
166 tablename = column.table.name
167 taskname = tablename
168 comment = column.comment
169 coltype = coltype_as_typeengine(column.type)
170 is_free_text = is_sqlatype_text_of_length_at_least(
171 coltype, min_length=MIN_STRING_LENGTH_TO_CONSIDER_SCRUBBING
172 )
173 exempt_from_anonymisation = False
174 identifies_patient = False
176 if isinstance(column, CamcopsColumn):
177 exempt_from_anonymisation = column.exempt_from_anonymisation
178 identifies_patient = column.identifies_patient
179 if column.permitted_value_checker:
180 valid_values = (
181 column.permitted_value_checker.permitted_values_csv()
182 )
184 needs_scrubbing = is_free_text and not exempt_from_anonymisation
186 # Tag list - fields anon
187 tlfa = "Y" if needs_scrubbing else ""
189 # Destination SQL type
190 desttype = convert_sqla_type_for_dialect(
191 coltype=coltype,
192 dialect=dest_dialect,
193 strip_collation=True,
194 expand_for_scrubbing=needs_scrubbing,
195 )
196 destsqltype = desttype.compile(dialect=dest_dialect)
197 finaltype, size = _get_type_size_as_text_from_sqltype(destsqltype)
199 # Security status
200 system_id = colname == TABLET_ID_FIELD or colname.endswith("_id")
201 patient_idnum_field = colname.startswith(EXTRA_IDNUM_FIELD_PREFIX)
202 internal_field = colname.startswith("_")
203 if identifies_patient and (
204 tablename == Patient.__tablename__ and colname == Patient.dob.name
205 ):
206 security_status = 3 # truncate (e.g. DOB, postcode)
207 elif identifies_patient and tablename == Patient.__tablename__:
208 security_status = 2 # use to scrub
209 elif system_id or internal_field or identifies_patient:
210 security_status = 1 # drop (e.g. for pointless internal keys)
211 else:
212 security_status = 4 # bring through
214 # Front end field type
215 if system_id or patient_idnum_field:
216 feft = 34 # patient ID; other internal keys
217 elif is_sqlatype_date(coltype):
218 feft = 4 # dates
219 elif is_free_text:
220 feft = 3 # giant free text, I think
221 elif valid_values is not None:
222 feft = 2 # picklist
223 else:
224 feft = 1 # text, numbers
226 return OrderedDict(
227 [
228 ("Tab", "CamCOPS"),
229 ("Form name", taskname),
230 ("CRIS tree label", colname),
231 ("Source system table name", tablename),
232 ("SQL column name", colname),
233 ("Front end field type", feft),
234 ("Valid values", valid_values),
235 ("Result column name", colname),
236 ("Family doc tab name", ""),
237 ("Family doc form name", ""),
238 ("Security status", security_status),
239 ("Exclude", ""),
240 ("End SQL Type", finaltype),
241 ("Header field (Y/N)", ""),
242 ("Header field name", ""),
243 ("Header field active (Y/N)", ""),
244 ("View name", ""),
245 ("Exclude from family doc", ""),
246 ("Tag list - fields anon", tlfa),
247 ("Anon type", ""), # formerly "Additional info"
248 ("Form start date", ""),
249 ("Form end date", ""),
250 ("Source", ""),
251 ("Size", size),
252 ("Header logic", ""),
253 ("Patient/contact", ""),
254 ("Comments", comment),
255 ]
256 )
def write_cris_data_dictionary(
    req: "CamcopsRequest",
    recipient: "ExportRecipientInfo",
    file: TextIO = sys.stdout,
) -> None:
    """
    Writes a draft CRIS data dictionary, in CSV format, to ``file``.

    The header is taken from the keys of a dummy row; one row then follows
    for every column of the anonymisation staging database.

    CRIS is an anonymisation tool. See

    - Stewart R, Soremekun M, Perera G, Broadbent M, Callard F, Denis M, Hotopf
      M, Thornicroft G, Lovestone S (2009).
      The South London and Maudsley NHS Foundation Trust Biomedical Research
      Centre (SLAM BRC) case register: development and descriptive data.
      *BMC Psychiatry* 9: 51.
      https://www.ncbi.nlm.nih.gov/pubmed/19674459

    - Fernandes AC, Cloete D, Broadbent MT, Hayes RD, Chang CK, Jackson RG,
      Roberts A, Tsang J, Soncul M, Liebscher J, Stewart R, Callard F (2013).
      Development and evaluation of a de-identification procedure for a case
      register sourced from mental health electronic records.
      *BMC Med Inform Decis Mak.* 13: 71.
      https://www.ncbi.nlm.nih.gov/pubmed/23842533

    Args:
        req: a :class:`camcops_server.cc_modules.cc_request.CamcopsRequest`
        recipient: a :class:`camcops_server.cc_modules.cc_exportrecipientinfo.ExportRecipientInfo`
        file: output file
    """  # noqa
    # A dummy row exists purely to supply the CSV field names.
    header_row = _get_cris_dd_row(column=None, recipient=recipient)
    writer = csv.DictWriter(file, fieldnames=list(header_row))
    writer.writeheader()
    writer.writerows(
        _get_cris_dd_row(column=col, recipient=recipient)
        for col in _gen_columns_for_anon_staging_db(req, recipient)
    )
296# -----------------------------------------------------------------------------
297# CRATE
298# -----------------------------------------------------------------------------
def _get_crate_dd_row(
    column: Union[Column, CamcopsColumn, None],
    recipient: "ExportRecipientInfo",
    dest_dialect: Dialect = None,
    src_db: str = "camcops",
    default_indexlen: int = 100,
) -> Dict:
    """
    Builds one row of a CRATE data dictionary.

    Args:
        column:
            A column specification (or ``None`` to create a dummy dictionary).
        recipient:
            a :class:`camcops_server.cc_modules.cc_exportrecipientinfo.ExportRecipientInfo`
        dest_dialect:
            The SQL dialect of the destination database. If ``None``, then
            MySQL is used as the default.
        src_db:
            Value to be used for the "src_db" field.
        default_indexlen:
            Default index length for fields that require one.

    Returns:
        An :class:`OrderedDict` with information for a CRATE data dictionary
        row.
    """  # noqa
    dest_dialect = dest_dialect or MySQLDialect()

    # Values describing the dummy row; a real column overrides them below.
    colname = tablename = comment = None
    coltype = desttype = destsqltype = None
    needs_scrubbing = False
    exempt_from_anonymisation = False
    identifies_patient = False
    identifies_respondent = False  # never set to True in this function
    force_include = False

    if column is not None:
        colname = column.name
        tablename = column.table.name
        comment = column.comment
        coltype = coltype_as_typeengine(column.type)
        is_free_text = is_sqlatype_text_of_length_at_least(
            coltype, min_length=MIN_STRING_LENGTH_TO_CONSIDER_SCRUBBING
        )

        # Anonymisation metadata is only read from CamcopsColumn objects.
        if isinstance(column, CamcopsColumn):
            exempt_from_anonymisation = column.exempt_from_anonymisation
            identifies_patient = column.identifies_patient
            force_include = column.include_in_anon_staging_db

        needs_scrubbing = is_free_text and not exempt_from_anonymisation
        desttype = convert_sqla_type_for_dialect(
            coltype=coltype,
            dialect=dest_dialect,
            strip_collation=True,
            expand_for_scrubbing=needs_scrubbing,
        )
        destsqltype = desttype.compile(dialect=dest_dialect)

    # src_flags
    primary_key = colname == FN_PK
    primary_pid = (
        recipient.db_patient_id_per_row
        and recipient.primary_idnum  # otherwise just in PatientIdNum
        and colname == extra_id_colname(recipient.primary_idnum)
    )
    defines_primary_pids = False  # no single unique table for this...
    master_pid = False  # not supported for now
    flag_table = (
        (primary_key, "KC"),
        (primary_pid, "P"),
        (defines_primary_pids, "*"),
        (master_pid, "M"),
    )
    src_flags = "".join(flag for wanted, flag in flag_table if wanted)

    # scrub_src
    scrub_src = None
    if identifies_patient and tablename == Patient.__tablename__:
        scrub_src = "patient"
    elif identifies_respondent:
        scrub_src = "thirdparty"

    # scrub_method
    scrub_method = None  # default is fine

    # Include in output? Identifying fields are omitted unless they are
    # forced in or are one of the key fields.
    include = (
        force_include
        or primary_key
        or primary_pid
        or master_pid
        or (not identifies_patient and not identifies_respondent)
    )

    # alter_method
    alter_method = None
    if needs_scrubbing:
        alter_method = "scrub"
    elif tablename == Patient.__tablename__ and colname == Patient.dob.name:
        alter_method = "truncate_date"

    # Indexing
    crate_index = crate_indexlen = None
    if column is not None and column.index:
        crate_index = "U" if column.unique else "I"
        if does_sqlatype_require_index_len(desttype):
            crate_indexlen = default_indexlen

    row = OrderedDict()
    row["src_db"] = src_db
    row["src_table"] = tablename
    row["src_field"] = colname
    row["src_datatype"] = str(coltype)
    row["src_flags"] = src_flags or None  # no flags at all -> None
    row["scrub_src"] = scrub_src
    row["scrub_method"] = scrub_method
    row["decision"] = "include" if include else "OMIT"
    row["inclusion_values"] = None
    row["exclusion_values"] = None
    row["alter_method"] = alter_method
    row["dest_table"] = tablename
    row["dest_field"] = colname
    row["dest_datatype"] = destsqltype
    row["index"] = crate_index
    row["indexlen"] = crate_indexlen
    row["comment"] = comment
    return row
def write_crate_data_dictionary(
    req: "CamcopsRequest",
    recipient: "ExportRecipientInfo",
    file: TextIO = sys.stdout,
) -> None:
    """
    Writes a draft CRATE data dictionary, in CSV format, to ``file``.

    The header is taken from the keys of a dummy row; one row then follows
    for every column of the anonymisation staging database.

    CRATE is an anonymisation tool. See:

    - Cardinal RN (2017).
      Clinical records anonymisation and text extraction (CRATE): an
      open-source software system.
      *BMC Medical Informatics and Decision Making* 17: 50.
      https://www.pubmed.gov/28441940;
      https://doi.org/10.1186/s12911-017-0437-1.

    - https://crateanon.readthedocs.io/

    Args:
        req: a :class:`camcops_server.cc_modules.cc_request.CamcopsRequest`
        recipient: a :class:`camcops_server.cc_modules.cc_exportrecipientinfo.ExportRecipientInfo`
        file: output file
    """  # noqa
    # A dummy row exists purely to supply the CSV field names.
    header_row = _get_crate_dd_row(column=None, recipient=recipient)
    writer = csv.DictWriter(file, fieldnames=list(header_row))
    writer.writeheader()
    writer.writerows(
        _get_crate_dd_row(column=col, recipient=recipient)
        for col in _gen_columns_for_anon_staging_db(req, recipient)
    )