Coverage for cc_modules/cc_anon.py: 17%

163 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-11-08 23:14 +0000

1#!/usr/bin/env python 

2 

3""" 

4camcops_server/cc_modules/cc_anon.py 

5 

6=============================================================================== 

7 

8 Copyright (C) 2012, University of Cambridge, Department of Psychiatry. 

9 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

10 

11 This file is part of CamCOPS. 

12 

13 CamCOPS is free software: you can redistribute it and/or modify 

14 it under the terms of the GNU General Public License as published by 

15 the Free Software Foundation, either version 3 of the License, or 

16 (at your option) any later version. 

17 

18 CamCOPS is distributed in the hope that it will be useful, 

19 but WITHOUT ANY WARRANTY; without even the implied warranty of 

20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

21 GNU General Public License for more details. 

22 

23 You should have received a copy of the GNU General Public License 

24 along with CamCOPS. If not, see <https://www.gnu.org/licenses/>. 

25 

26=============================================================================== 

27 

28**Anonymisation functions.** 

29 

30Largely superseded by CRATE (https://doi.org/10.1186/s12911-017-0437-1).

31 

32""" 

33 

34from collections import OrderedDict 

35import csv 

36import sys 

37from typing import Dict, List, Generator, TextIO, Tuple, TYPE_CHECKING, Union 

38 

39from cardinal_pythonlib.sqlalchemy.orm_inspect import coltype_as_typeengine 

40from cardinal_pythonlib.sqlalchemy.schema import ( 

41 convert_sqla_type_for_dialect, 

42 does_sqlatype_require_index_len, 

43 is_sqlatype_date, 

44 is_sqlatype_text_of_length_at_least, 

45 RE_COLTYPE_WITH_ONE_PARAM, 

46) 

47from cardinal_pythonlib.sqlalchemy.session import SQLITE_MEMORY_URL 

48 

49# from sqlalchemy.dialects.mssql.base import MSDialect 

50from sqlalchemy.dialects.mysql.base import MySQLDialect 

51from sqlalchemy.engine import create_engine 

52from sqlalchemy.engine.interfaces import Dialect 

53from sqlalchemy.orm import Session as SqlASession, sessionmaker 

54from sqlalchemy.sql.schema import Column 

55 

56from camcops_server.cc_modules.cc_constants import TABLET_ID_FIELD 

57from camcops_server.cc_modules.cc_db import FN_PK 

58from camcops_server.cc_modules.cc_dump import DumpController 

59from camcops_server.cc_modules.cc_patient import Patient 

60from camcops_server.cc_modules.cc_patientidnum import ( 

61 extra_id_colname, 

62 EXTRA_IDNUM_FIELD_PREFIX, 

63) 

64from camcops_server.cc_modules.cc_simpleobjects import TaskExportOptions 

65from camcops_server.cc_modules.cc_sqla_coltypes import CamcopsColumn 

66 

67if TYPE_CHECKING: 

68 from camcops_server.cc_modules.cc_exportrecipientinfo import ( 

69 ExportRecipientInfo, 

70 ) 

71 from camcops_server.cc_modules.cc_request import CamcopsRequest 

72 

73 

74# ============================================================================= 

75# Constants 

76# ============================================================================= 

77 

78MIN_STRING_LENGTH_TO_CONSIDER_SCRUBBING = 256 

79 

80 

81# ============================================================================= 

82# Write data dictionaries for anonymisation tools 

83# ============================================================================= 

84 

85 

def _gen_columns_for_anon_staging_db(
    req: "CamcopsRequest", recipient: "ExportRecipientInfo"
) -> Generator[Union[Column, CamcopsColumn], None, None]:
    """
    Generates all columns for an anonymisation staging database.

    A temporary engine (on ``SQLITE_MEMORY_URL``) and session are created
    only so that a :class:`DumpController` can be constructed; the columns
    it reports via ``gen_all_dest_columns()`` are passed straight through.
    The session is closed and the engine disposed of when the generator
    finishes, is closed early, or fails.

    Args:
        req:
            a :class:`camcops_server.cc_modules.cc_request.CamcopsRequest`
        recipient:
            a :class:`camcops_server.cc_modules.cc_exportrecipientinfo.ExportRecipientInfo`;
            its ``db_include_blobs``, ``db_patient_id_per_row`` and
            ``db_add_summaries`` settings are copied into the export options

    Yields:
        every column produced by ``DumpController.gen_all_dest_columns()``
    """  # noqa
    engine = create_engine(SQLITE_MEMORY_URL, echo=False)
    session = sessionmaker(bind=engine)()  # type: SqlASession
    try:
        export_options = TaskExportOptions(
            include_blobs=recipient.db_include_blobs,
            db_patient_id_per_row=recipient.db_patient_id_per_row,
            db_make_all_tables_even_empty=True,
            db_include_summaries=recipient.db_add_summaries,
        )
        dc = DumpController(
            dst_engine=engine,
            dst_session=session,
            export_options=export_options,
            req=req,
        )
        yield from dc.gen_all_dest_columns()
    finally:
        # Release the temporary session/engine rather than leaving them to
        # the garbage collector; this also runs if the caller stops
        # iterating early.
        session.close()
        engine.dispose()

110 

111 

112# ----------------------------------------------------------------------------- 

113# CRIS 

114# ----------------------------------------------------------------------------- 

115 

116 

def _get_type_size_as_text_from_sqltype(sqltype: str) -> Tuple[str, str]:
    """
    Separates an SQL type with a single parameter into its type and size.

    For example, ``VARCHAR(10)`` becomes ``('VARCHAR', '10')``. Both parts
    are upper-cased, and the size is stripped of surrounding whitespace.

    If ``sqltype`` does not match ``RE_COLTYPE_WITH_ONE_PARAM``, the result
    is ``(sqltype, '')``, with ``sqltype`` returned exactly as supplied.
    """
    matched = RE_COLTYPE_WITH_ONE_PARAM.match(sqltype)
    if matched is None:
        # Not of the "TYPE(param)" form: pass the input through, no size.
        return sqltype, ""
    type_part = matched.group("type").upper()
    size_part = matched.group("size").strip().upper()
    return type_part, size_part

131 

132 

133# noinspection PyUnusedLocal 

def _get_cris_dd_row(
    column: Union[Column, CamcopsColumn, None],
    recipient: "ExportRecipientInfo",
    dest_dialect: Dialect = None,
) -> Dict:
    """
    Builds one row of a CRIS data dictionary.

    Args:
        column:
            A column specification (or ``None`` to create a dummy dictionary,
            in which every computed cell is ``None``; useful for obtaining
            the field names).
        recipient:
            Accepted for symmetry with the CRATE equivalent; this function
            does not read it.
        dest_dialect:
            The SQL dialect of the destination database. If ``None``, then
            MySQL is used as the default.

    Returns:
        An :class:`OrderedDict` with information for a CRIS data dictionary
        row.
    """
    dialect = dest_dialect or MySQLDialect()  # MSDialect() for SQL Server

    # Begin in the "dummy row" state; a real column overwrites these below.
    form_name = None
    table_name = None
    field_name = None
    description = None
    front_end_field_type = None
    permitted_values = None
    security_status = None
    end_sql_type = None
    anon_tag = None
    size = None

    if column is not None:
        field_name = column.name
        table_name = column.table.name
        form_name = table_name
        description = column.comment
        sqla_type = coltype_as_typeengine(column.type)
        is_free_text = is_sqlatype_text_of_length_at_least(
            sqla_type, min_length=MIN_STRING_LENGTH_TO_CONSIDER_SCRUBBING
        )

        # Only CamcopsColumn objects carry the anonymisation metadata.
        exempt = False
        identifies_patient = False
        if isinstance(column, CamcopsColumn):
            exempt = column.exempt_from_anonymisation
            identifies_patient = column.identifies_patient
            checker = column.permitted_value_checker
            if checker:
                permitted_values = checker.permitted_values_csv()

        needs_scrubbing = is_free_text and not exempt

        # Tag list - fields anon
        anon_tag = "Y" if needs_scrubbing else ""

        # Destination SQL type, split into type and size
        dest_type = convert_sqla_type_for_dialect(
            coltype=sqla_type,
            dialect=dialect,
            strip_collation=True,
            expand_for_scrubbing=needs_scrubbing,
        )
        end_sql_type, size = _get_type_size_as_text_from_sqltype(
            dest_type.compile(dialect=dialect)
        )

        # Classify the column by its name
        system_id = field_name == TABLET_ID_FIELD or field_name.endswith(
            "_id"
        )
        patient_idnum_field = field_name.startswith(EXTRA_IDNUM_FIELD_PREFIX)
        internal_field = field_name.startswith("_")

        # Security status
        if identifies_patient and table_name == Patient.__tablename__:
            if field_name == Patient.dob.name:
                security_status = 3  # truncate (e.g. DOB, postcode)
            else:
                security_status = 2  # use to scrub
        elif system_id or internal_field or identifies_patient:
            security_status = 1  # drop (e.g. for pointless internal keys)
        else:
            security_status = 4  # bring through

        # Front end field type
        if system_id or patient_idnum_field:
            front_end_field_type = 34  # patient ID; other internal keys
        elif is_sqlatype_date(sqla_type):
            front_end_field_type = 4  # dates
        elif is_free_text:
            front_end_field_type = 3  # giant free text, I think
        elif permitted_values is not None:
            front_end_field_type = 2  # picklist
        else:
            front_end_field_type = 1  # text, numbers

    # Insertion order defines the CSV column order.
    row = OrderedDict()
    row["Tab"] = "CamCOPS"
    row["Form name"] = form_name
    row["CRIS tree label"] = field_name
    row["Source system table name"] = table_name
    row["SQL column name"] = field_name
    row["Front end field type"] = front_end_field_type
    row["Valid values"] = permitted_values
    row["Result column name"] = field_name
    row["Family doc tab name"] = ""
    row["Family doc form name"] = ""
    row["Security status"] = security_status
    row["Exclude"] = ""
    row["End SQL Type"] = end_sql_type
    row["Header field (Y/N)"] = ""
    row["Header field name"] = ""
    row["Header field active (Y/N)"] = ""
    row["View name"] = ""
    row["Exclude from family doc"] = ""
    row["Tag list - fields anon"] = anon_tag
    row["Anon type"] = ""  # formerly "Additional info"
    row["Form start date"] = ""
    row["Form end date"] = ""
    row["Source"] = ""
    row["Size"] = size
    row["Header logic"] = ""
    row["Patient/contact"] = ""
    row["Comments"] = description
    return row

257 

258 

def write_cris_data_dictionary(
    req: "CamcopsRequest",
    recipient: "ExportRecipientInfo",
    file: TextIO = sys.stdout,
) -> None:
    """
    Writes a draft CRIS data dictionary, in CSV format, to ``file``.

    A header row is written first, followed by one row per column of the
    anonymisation staging database.

    CRIS is an anonymisation tool. See

    - Stewart R, Soremekun M, Perera G, Broadbent M, Callard F, Denis M, Hotopf
      M, Thornicroft G, Lovestone S (2009).
      The South London and Maudsley NHS Foundation Trust Biomedical Research
      Centre (SLAM BRC) case register: development and descriptive data.
      *BMC Psychiatry* 9: 51.
      https://www.ncbi.nlm.nih.gov/pubmed/19674459

    - Fernandes AC, Cloete D, Broadbent MT, Hayes RD, Chang CK, Jackson RG,
      Roberts A, Tsang J, Soncul M, Liebscher J, Stewart R, Callard F (2013).
      Development and evaluation of a de-identification procedure for a case
      register sourced from mental health electronic records.
      *BMC Med Inform Decis Mak.* 13: 71.
      https://www.ncbi.nlm.nih.gov/pubmed/23842533

    Args:
        req: a :class:`camcops_server.cc_modules.cc_request.CamcopsRequest`
        recipient: a :class:`camcops_server.cc_modules.cc_exportrecipientinfo.ExportRecipientInfo`
        file: output file
    """  # noqa
    # A dummy row (no column) is built only to learn the field names.
    template_row = _get_cris_dd_row(column=None, recipient=recipient)
    writer = csv.DictWriter(file, fieldnames=list(template_row))
    writer.writeheader()
    writer.writerows(
        _get_cris_dd_row(column=staging_col, recipient=recipient)
        for staging_col in _gen_columns_for_anon_staging_db(req, recipient)
    )

294 

295 

296# ----------------------------------------------------------------------------- 

297# CRATE 

298# ----------------------------------------------------------------------------- 

299 

300 

def _get_crate_dd_row(
    column: Union[Column, CamcopsColumn, None],
    recipient: "ExportRecipientInfo",
    dest_dialect: Dialect = None,
    src_db: str = "camcops",
    default_indexlen: int = 100,
) -> Dict:
    """
    Builds one row of a CRATE data dictionary.

    Args:
        column:
            A column specification (or ``None`` to create a dummy dictionary;
            useful for obtaining the field names).
        recipient:
            a :class:`camcops_server.cc_modules.cc_exportrecipientinfo.ExportRecipientInfo`;
            its ``db_patient_id_per_row`` and ``primary_idnum`` settings
            decide whether the column is flagged as the primary patient ID
        dest_dialect:
            The SQL dialect of the destination database. If ``None``, then
            MySQL is used as the default.
        src_db:
            Value to be used for the "src_db" field.
        default_indexlen:
            Default index length for fields that require one.

    Returns:
        An :class:`OrderedDict` with information for a CRATE data dictionary
        row.
    """  # noqa
    dialect = dest_dialect or MySQLDialect()

    # Column-level flags; only a CamcopsColumn can override them.
    exempt = False
    identifies_patient = False
    identifies_respondent = False  # never set from the column at present
    force_include = False

    if column is not None:
        field_name = column.name
        table_name = column.table.name
        description = column.comment
        sqla_type = coltype_as_typeengine(column.type)
        is_free_text = is_sqlatype_text_of_length_at_least(
            sqla_type, min_length=MIN_STRING_LENGTH_TO_CONSIDER_SCRUBBING
        )

        if isinstance(column, CamcopsColumn):
            exempt = column.exempt_from_anonymisation
            identifies_patient = column.identifies_patient
            force_include = column.include_in_anon_staging_db

        needs_scrubbing = is_free_text and not exempt
        dest_type = convert_sqla_type_for_dialect(
            coltype=sqla_type,
            dialect=dialect,
            strip_collation=True,
            expand_for_scrubbing=needs_scrubbing,
        )
        dest_sql_type = dest_type.compile(dialect=dialect)
    else:
        # Dummy row
        field_name = None
        table_name = None
        description = None
        sqla_type = None
        needs_scrubbing = False
        dest_type = None
        dest_sql_type = None

    # src_flags
    flags = []  # type: List[str]
    primary_key = field_name == FN_PK
    if primary_key:
        flags.extend(["K", "C"])
    primary_pid = (
        recipient.db_patient_id_per_row
        and recipient.primary_idnum  # otherwise just in PatientIdNum
        and field_name == extra_id_colname(recipient.primary_idnum)
    )
    if primary_pid:
        flags.append("P")
    defines_primary_pids = False  # no single unique table for this...
    if defines_primary_pids:
        flags.append("*")
    master_pid = False  # not supported for now
    if master_pid:
        flags.append("M")

    # scrub_src
    if identifies_patient and table_name == Patient.__tablename__:
        scrub_src = "patient"
    else:
        scrub_src = "thirdparty" if identifies_respondent else None

    # scrub_method
    scrub_method = None  # default is fine

    # Include in output? Identifying columns are omitted unless something
    # explicitly requires them to be kept.
    keep = (
        force_include
        or primary_key
        or primary_pid
        or master_pid
        or (not identifies_patient and not identifies_respondent)
    )
    decision = "include" if keep else "OMIT"

    # alter_method
    if needs_scrubbing:
        alter_method = "scrub"
    elif (
        table_name == Patient.__tablename__
        and field_name == Patient.dob.name
    ):
        alter_method = "truncate_date"
    else:
        alter_method = None

    # Indexing
    index_kind = None
    index_len = None
    if column is not None and column.index:
        index_kind = "U" if column.unique else "I"
        if does_sqlatype_require_index_len(dest_type):
            index_len = default_indexlen

    # Insertion order defines the CSV column order.
    row = OrderedDict()
    row["src_db"] = src_db
    row["src_table"] = table_name
    row["src_field"] = field_name
    row["src_datatype"] = str(sqla_type)
    row["src_flags"] = "".join(flags) or None
    row["scrub_src"] = scrub_src
    row["scrub_method"] = scrub_method
    row["decision"] = decision
    row["inclusion_values"] = None
    row["exclusion_values"] = None
    row["alter_method"] = alter_method
    row["dest_table"] = table_name
    row["dest_field"] = field_name
    row["dest_datatype"] = dest_sql_type
    row["index"] = index_kind
    row["indexlen"] = index_len
    row["comment"] = description
    return row

439 

440 

def write_crate_data_dictionary(
    req: "CamcopsRequest",
    recipient: "ExportRecipientInfo",
    file: TextIO = sys.stdout,
) -> None:
    """
    Writes a draft CRATE data dictionary, in CSV format, to ``file``.

    A header row is written first, followed by one row per column of the
    anonymisation staging database.

    CRATE is an anonymisation tool. See:

    - Cardinal RN (2017).
      Clinical records anonymisation and text extraction (CRATE): an
      open-source software system.
      *BMC Medical Informatics and Decision Making* 17: 50.
      https://www.pubmed.gov/28441940;
      https://doi.org/10.1186/s12911-017-0437-1.

    - https://crateanon.readthedocs.io/

    Args:
        req: a :class:`camcops_server.cc_modules.cc_request.CamcopsRequest`
        recipient: a :class:`camcops_server.cc_modules.cc_exportrecipientinfo.ExportRecipientInfo`
        file: output file
    """  # noqa
    # A dummy row (no column) is built only to learn the field names.
    template_row = _get_crate_dd_row(column=None, recipient=recipient)
    writer = csv.DictWriter(file, fieldnames=list(template_row))
    writer.writeheader()
    writer.writerows(
        _get_crate_dd_row(column=staging_col, recipient=recipient)
        for staging_col in _gen_columns_for_anon_staging_db(req, recipient)
    )