#!/usr/bin/env python
# crate_anon/crateweb/research/models.py
"""
===============================================================================
Copyright (C) 2015-2018 Rudolf Cardinal (rudolf@pobox.com).
This file is part of CRATE.
CRATE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
CRATE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with CRATE. If not, see <http://www.gnu.org/licenses/>.
===============================================================================
"""
from collections import OrderedDict
import contextlib
import datetime
import io
import logging
from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, Union
import zipfile
from cardinal_pythonlib.dbfunc import dictfetchall, get_fieldnames_from_cursor
from cardinal_pythonlib.django.fields.jsonclassfield import JsonClassField
from cardinal_pythonlib.excel import excel_to_bytes
from cardinal_pythonlib.exceptions import add_info_to_exception
from cardinal_pythonlib.hash import (
get_longest_supported_hasher_output_length,
hash64,
)
from cardinal_pythonlib.json.serialize import (
json_encode,
METHOD_STRIP_UNDERSCORE,
register_for_json,
)
from cardinal_pythonlib.reprfunc import simple_repr
from cardinal_pythonlib.sql.sql_grammar import format_sql, SqlGrammar
from cardinal_pythonlib.tsv import make_tsv_row
from django.db import connections, DatabaseError, models
from django.db.models import QuerySet
from django.conf import settings
from django.http.request import HttpRequest
from django.db.backends.utils import CursorWrapper
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
from openpyxl.workbook.workbook import Workbook
from openpyxl.worksheet.worksheet import Worksheet
from crate_anon.anonymise.models import PatientInfoConstants
from crate_anon.common.sql import (
ColumnId,
columns_to_table_column_hierarchy,
escape_percent_for_python_dbapi,
make_grammar,
sql_string_literal,
SqlArgsTupleType,
TableId,
translate_sql_qmark_to_percent,
WhereCondition,
)
from crate_anon.crateweb.research.html_functions import (
HtmlElementCounter,
N_CSS_HIGHLIGHT_CLASSES,
prettify_sql_html,
)
from crate_anon.crateweb.research.research_db_info import (
RESEARCH_DB_CONNECTION_NAME,
research_database_info,
SingleResearchDatabase,
)
from crate_anon.crateweb.research.sql_writer import (
add_to_select,
SelectElement,
)
log = logging.getLogger(__name__)

# =============================================================================
# Hacking django-pyodbc-azure, to stop it calling cursor.nextset() every time
# you ask it to do cursor.fetchone()
# =============================================================================

# Database engine name that django-pyodbc-azure uses for SQL Server.
DJANGO_PYODBC_AZURE_ENGINE = 'sql_server.pyodbc'
def replacement_sqlserver_pyodbc_cursorwrapper_fetchone(self) -> List[Any]:
    """
    A function to replace ``CursorWrapper.fetchone()`` in
    ``sql_server/pyodbc/base.py`` from ``django-pyodbc-azure``.

    This replacement function does not call ``cursor.nextset()``.

    Args:
        self: the ``CursorWrapper`` instance being patched (this function is
            installed as an unbound method on that class).

    Returns:
        the next row, passed through ``self.format_row()``, or ``None`` if
        the result set is exhausted.
    """
    # log.critical("Using monkeypatched fetchone(); self: {}; self.cursor: "
    #              "{}".format(repr(self), repr(self.cursor)))
    row = self.cursor.fetchone()
    if row is not None:
        row = self.format_row(row)
    # BUT DO NOT CALL self.cursor.nextset()
    return row
def hack_django_pyodbc_azure_cursorwrapper() -> None:
    """
    Monkey-patch part of the ``sql_server.pyodbc`` library from
    ``django-pyodbc-azure``. It replaces the ``fetchone()`` method with a
    version that doesn't call ``cursor.nextset()`` automatically.

    If ``django-pyodbc-azure`` is not installed, this function does nothing.

    **It looks like this becomes unnecessary in django-pyodbc-azure==2.0.6.1
    or similar, because the call to ``cursor.nextset()`` is now only performed
    ``if not self.connection.supports_mars``.**

    *Notes*

    - I thought I wanted to modify an *instance*, not a *class*
      (https://tryolabs.com/blog/2013/07/05/run-time-method-patching-python/).
    - To modify a class, we do ``SomeClass.method = newmethod``.
    - But to modify an instance, we use ``instance.method =
      types.MethodType(newmethod, instance)``.
    - However, it turned out the instance was actually part of a long chain
      of cursor wrappers, including the Django debug toolbar. Classes included
      ``debug_toolbar.panels.sql.tracking.NormalCursorWrapper``;
      ``django.db.backends.utils.CursorDebugWrapper``.
      And in any case, modifying the class is a sensible thing.
    """
    try:
        # noinspection PyUnresolvedReferences
        from sql_server.pyodbc.base import CursorWrapper
        log.info("Monkey-patching sql_server.pyodbc.base.CursorWrapper."
                 "fetchone to disable automatic call to cursor.nextset()")
        CursorWrapper.fetchone = replacement_sqlserver_pyodbc_cursorwrapper_fetchone  # noqa
    except ImportError:
        # django-pyodbc-azure not installed; nothing to patch.
        return
# Apply the monkey-patch at import time, unless explicitly disabled via
# Django settings. The setting defaults to True if absent:
# http://stackoverflow.com/questions/5601590/how-to-define-a-default-value-for-a-custom-django-setting  # noqa
if getattr(settings, 'DISABLE_DJANGO_PYODBC_AZURE_CURSOR_FETCHONE_NEXTSET',
           True):
    hack_django_pyodbc_azure_cursorwrapper()
# =============================================================================
# Cursors
# =============================================================================
def debug_query() -> None:
    """Run a trivial SELECT against the research database (connection check)."""
    connections[RESEARCH_DB_CONNECTION_NAME].cursor().execute("SELECT 'debug'")
def get_executed_researchdb_cursor(sql: str,
                                   args: List[Any] = None) -> CursorWrapper:
    """
    Execute ``sql`` (with optional ``args``) on the research database and
    return the cursor, wrapped so it closes when used as a context manager.

    On a database error, the SQL and arguments are attached to the exception
    before it is re-raised, to aid debugging.
    """
    if args is None:
        args = []
    cursor = connections[RESEARCH_DB_CONNECTION_NAME].cursor()
    try:
        cursor.execute(sql, args if args else None)
    except DatabaseError as exc:
        add_info_to_exception(exc, {'sql': sql, 'args': args})
        raise
    # noinspection PyTypeChecker
    return contextlib.closing(cursor)
# =============================================================================
# Data going to Excel files
# =============================================================================
# Replacement for characters that openpyxl rejects in cell strings
# (see gen_excel_row_elements below); we simply strip them.
ILLEGAL_CHARACTERS_REPLACED_WITH = ""
def gen_excel_row_elements(worksheet: Worksheet,
                           row: Iterable) -> Generator[Any, None, None]:
    r"""
    Given an Excel worksheet row, generate individual cell contents, cell by
    cell.

    Reasons for this function:

    1. Need a tuple/list/generator, as openpyxl checks its types manually.

       - We want to have a Worksheet object from openpyxl, and say something
         like

         .. code-block:: python

            ws.append(row)

         where "row" has come from a database query.
       - However, openpyxl doesn't believe in duck-typing; see
         ``Worksheet.append()`` in ``openpyxl/worksheet/worksheet.py``. So
         sometimes the plain append works (e.g. from MySQL results), but
         sometimes it fails, e.g. when the row is of type ``pyodbc.Row``.
       - So we must coerce it to a tuple, list, or generator.
       - A generator will be the most efficient.

    2. If a string fails certain checks, openpyxl will raise an
       IllegalCharacterError exception. We need to work around that. We'll use
       the "forgiveness, not permission" maxim.
       Specifically, it dislikes strings matching its ILLEGAL_CHARACTERS_RE,
       which contains unprintable low characters matching this:

       .. code-block:: python

          r'[\000-\010]|[\013-\014]|[\016-\037]'

       Note the use of octal; ``\037`` is decimal 31.
       openpyxl gets to its Cell.check_string() function for these types:

       .. code-block:: python

          STRING_TYPES = (basestring, unicode, bytes)

       In Python 3, this means (str, str, bytes).
       So we should check str and bytes. (For bytes, we'll follow its method
       of converting to str in the encoding of the worksheet's choice.)
    """
    # Docstring must be a raw string for Sphinx! See
    # http://openalea.gforge.inria.fr/doc/openalea/doc/_build/html/source/sphinx/rest_syntax.html#text-syntax-bold-italic-verbatim-and-special-characters # noqa
    for element in row:
        if isinstance(element, bytes):
            # Convert to str using the worksheet's encoding.
            element = element.decode(worksheet.encoding)
            # ... or: str(element, encoding)
        if isinstance(element, str):
            # Strip characters openpyxl would reject outright.
            yield ILLEGAL_CHARACTERS_RE.sub(ILLEGAL_CHARACTERS_REPLACED_WITH,
                                            element)
        else:
            # Non-string types (numbers, dates, None) pass through unchanged.
            yield element
# =============================================================================
# Query highlighting class
# =============================================================================
HIGHLIGHT_FWD_REF = "Highlight"


class Highlight(models.Model):
    """
    Represents the highlighting of a query.
    """
    id = models.AutoField(primary_key=True)  # automatic
    user = models.ForeignKey(settings.AUTH_USER_MODEL,
                             on_delete=models.CASCADE)
    colour = models.PositiveSmallIntegerField(verbose_name="Colour number")
    text = models.CharField(max_length=255, verbose_name="Text to highlight")
    active = models.BooleanField(default=True)

    def __str__(self) -> str:
        return "colour={}, text={}".format(self.colour, self.text)

    def get_safe_colour(self) -> int:
        """Return the colour number, clamped to the available CSS classes."""
        if self.colour is None:
            return 0
        return min(self.colour, N_CSS_HIGHLIGHT_CLASSES - 1)

    @staticmethod
    def as_ordered_dict(highlight_list) -> Dict[int, List[HIGHLIGHT_FWD_REF]]:
        """
        Group highlights by their (safe) colour number.

        Returns an OrderedDict mapping colour number to a list of Highlight
        objects, sorted by colour.
        """
        d = dict()
        for highlight in highlight_list:
            n = highlight.get_safe_colour()
            if n not in d:
                d[n] = []  # type: List[HIGHLIGHT_FWD_REF]
            d[n].append(highlight)
        return OrderedDict(sorted(d.items()))

    @staticmethod
    def get_active_highlights(request: HttpRequest) -> QuerySet:
        """Return all active highlights belonging to the requesting user."""
        return Highlight.objects.filter(user=request.user, active=True)

    def activate(self) -> None:
        """Mark this highlight as active and save."""
        self.active = True
        self.save()

    def deactivate(self) -> None:
        """Mark this highlight as inactive and save."""
        self.active = False
        self.save()
# =============================================================================
# Query class
# =============================================================================
QUERY_FWD_REF = "Query"


class Query(models.Model):
    """
    Class to query the research database.

    A Query belongs to one user; at most one Query per user is ``active`` at
    a time (enforced in :meth:`save`). Audited queries are never physically
    deleted, only flagged (see :meth:`delete_if_permitted`).
    """
    class Meta:
        app_label = "research"

    id = models.AutoField(primary_key=True)  # automatic
    user = models.ForeignKey(settings.AUTH_USER_MODEL,
                             on_delete=models.CASCADE)
    sql = models.TextField(verbose_name='SQL query')
    sql_hash = models.BigIntegerField(
        verbose_name='64-bit non-cryptographic hash of SQL query')
    args = JsonClassField(verbose_name='SQL arguments (as JSON)', null=True)
    # ... https://github.com/shrubberysoft/django-picklefield
    raw = models.BooleanField(
        default=False, verbose_name='SQL is raw, not parameter-substituted')
    qmark = models.BooleanField(
        default=True,
        verbose_name='Parameter-substituted SQL uses ?, not %s, '
                     'as placeholders')
    active = models.BooleanField(default=True)  # see save() below
    created = models.DateTimeField(auto_now_add=True)
    deleted = models.BooleanField(
        default=False,
        verbose_name="Deleted from the user's perspective. "
                     "Audited queries are never properly deleted.")
    audited = models.BooleanField(default=False)

    def __repr__(self) -> str:
        return simple_repr(self, ['id', 'user', 'sql', 'args', 'raw', 'qmark',
                                  'active', 'created', 'deleted', 'audited'])

    def save(self, *args, **kwargs) -> None:
        """
        Custom save method.

        Ensures that only one Query has active == True for a given user.
        Also sets the hash.
        """
        # http://stackoverflow.com/questions/1455126/unique-booleanfield-value-in-django  # noqa
        if self.active:
            # Deactivate any other active query for this user first.
            Query.objects.filter(user=self.user, active=True)\
                .update(active=False)
        self.sql_hash = hash64(self.sql)
        super().save(*args, **kwargs)

    # -------------------------------------------------------------------------
    # Fetching
    # -------------------------------------------------------------------------

    @staticmethod
    def get_active_query_or_none(request: HttpRequest) \
            -> Optional[QUERY_FWD_REF]:
        """Return the requesting user's active Query, or None."""
        if not request.user.is_authenticated:
            return None
        try:
            return Query.objects.get(user=request.user, active=True)
        except Query.DoesNotExist:
            return None

    @staticmethod
    def get_active_query_id_or_none(request: HttpRequest) -> Optional[int]:
        """Return the ID of the requesting user's active Query, or None."""
        if not request.user.is_authenticated:
            return None
        try:
            query = Query.objects.get(user=request.user, active=True)
            return query.id
        except Query.DoesNotExist:
            return None

    # -------------------------------------------------------------------------
    # Activating, deleting, auditing
    # -------------------------------------------------------------------------

    def activate(self) -> None:
        """Make this the user's active query (save() deactivates others)."""
        self.active = True
        self.save()

    def mark_audited(self) -> None:
        """Flag this query as having been audited (idempotent)."""
        if self.audited:
            return
        self.audited = True
        self.save()

    def mark_deleted(self) -> None:
        """Flag this query as deleted from the user's perspective."""
        if self.deleted:
            return
        self.deleted = True
        self.active = False
        self.save()

    def delete_if_permitted(self) -> None:
        """If a query has been audited, it isn't properly deleted."""
        if self.deleted:
            log.debug("already flagged as deleted")
            return
        if self.audited:
            log.debug("marking as deleted")
            self.mark_deleted()
        else:
            # actually delete
            log.debug("actually deleting")
            self.delete()

    def audit(self, count_only: bool = False, n_records: int = 0,
              failed: bool = False, fail_msg: str = "") -> None:
        """Write a QueryAudit record for this query and mark it audited."""
        a = QueryAudit(query=self,
                       count_only=count_only,
                       n_records=n_records,
                       failed=failed,
                       fail_msg=fail_msg)
        a.save()
        self.mark_audited()

    # -------------------------------------------------------------------------
    # SQL queries
    # -------------------------------------------------------------------------

    def get_original_sql(self) -> str:
        """Return the SQL exactly as stored."""
        # noinspection PyTypeChecker
        return self.sql

    def get_sql_args_for_django(self) -> Tuple[str, Optional[List[Any]]]:
        """
        Get sql/args in a format suitable for Django, with %s placeholders,
        or as escaped raw SQL.
        """
        if self.raw:
            # noinspection PyTypeChecker
            sql = escape_percent_for_python_dbapi(self.sql)
            args = None
        else:
            if self.qmark:
                # noinspection PyTypeChecker
                sql = translate_sql_qmark_to_percent(self.sql)
            else:
                sql = self.sql
            args = self.args
        return sql, args

    def get_executed_cursor(self, sql_append_raw: str = None) -> CursorWrapper:
        """
        Get cursor with a query executed
        """
        (sql, args) = self.get_sql_args_for_django()
        if sql_append_raw:
            sql += sql_append_raw
        return get_executed_researchdb_cursor(sql, args)

    def make_tsv(self) -> str:
        """Return all results of this query as a single TSV string."""
        with self.get_executed_cursor() as cursor:
            fieldnames = get_fieldnames_from_cursor(cursor)
            # Accumulate rows in a list and join once, rather than repeated
            # string concatenation (quadratic for large result sets).
            pieces = [make_tsv_row(fieldnames)]
            row = cursor.fetchone()
            while row is not None:
                pieces.append(make_tsv_row(row))
                row = cursor.fetchone()
        return "".join(pieces)

    def make_excel(self) -> bytes:
        """Return an XLSX workbook of this query's results, as bytes."""
        wb = Workbook()
        wb.remove_sheet(wb.active)  # remove the autocreated blank sheet
        sheetname = "query_{}".format(self.id)
        ws = wb.create_sheet(sheetname)
        now = datetime.datetime.now()
        with self.get_executed_cursor() as cursor:
            fieldnames = get_fieldnames_from_cursor(cursor)
            ws.append(fieldnames)
            row = cursor.fetchone()
            while row is not None:
                ws.append(gen_excel_row_elements(ws, row))
                row = cursor.fetchone()
                # BUG in django-pyodbc-azure==1.10.4.0 (providing
                # sql_server/*), 2017-02-17: this causes
                # ProgrammingError "No results. Previous SQL was not a query."
                # The problem relates to sql_server/pyodbc/base.py
                # CursorWrapper.fetchone() calling self.cursor.nextset(); if
                # you comment this out, it works fine.
                # Related:
                # - https://github.com/pymssql/pymssql/issues/98
        sql_ws = wb.create_sheet(title="SQL")
        sql_ws.append(["SQL", "Executed_at"])
        sql_ws.append([self.get_original_sql(), now])
        return excel_to_bytes(wb)

    def dictfetchall(self) -> List[Dict[str, Any]]:
        """Generates all results as a list of OrderedDicts."""
        with self.get_executed_cursor() as cursor:
            return dictfetchall(cursor)
# =============================================================================
# Query auditing class
# =============================================================================
class QueryAudit(models.Model):
    """
    Audit log for a query.
    """
    id = models.AutoField(primary_key=True)  # automatic
    query = models.ForeignKey('Query', on_delete=models.PROTECT)
    when = models.DateTimeField(auto_now_add=True)
    count_only = models.BooleanField(default=False)
    n_records = models.IntegerField(default=0)
    # ... not PositiveIntegerField; SQL Server gives -1, for example
    failed = models.BooleanField(default=False)
    fail_msg = models.TextField()

    def __str__(self):
        return "<QueryAudit id={}>".format(self.id)
# =============================================================================
# Lookup class for secret RID-to-PID conversion
# =============================================================================
# class PidLookupRouter(object):
# # https://docs.djangoproject.com/en/1.8/topics/db/multi-db/
# # https://newcircle.com/s/post/1242/django_multiple_database_support
# # noinspection PyMethodMayBeStatic,PyUnusedLocal
# def db_for_read(self, model: Type[models.Model], **hints) -> Optional[str]: # noqa
# """
# read model PidLookup -> look at database secret
# """
# # log.debug("PidLookupRouter: {}".format(model._meta.model_name))
# # if model._meta.model_name == PidLookup._meta.model_name:
# if model == PidLookup:
# return 'secret'
# return None
#
# # noinspection PyUnusedLocal
# @staticmethod
# def allow_migrate(db: str, app_label: str, model_name: str = None,
# **hints) -> bool:
# # 2017-02-12, to address bug:
# # - https://code.djangoproject.com/ticket/27054
# # See also:
# # - https://docs.djangoproject.com/en/1.10/topics/db/multi-db/#using-other-management-commands # noqa
# return db == 'default'
class PidLookup(models.Model):
    """
    Lookup class for secret RID-to-PID conversion.

    Used via one or other of the 'secret' database connections.
    Intended for READ-ONLY access to that table.

    Since we have fixed the tablenames for the anonymiser, we remove the
    settings.SECRET_MAP option. See PatientInfo in
    crate_anon/anonymise/models.py. Moreover, we fix the maximum length,
    regardless of the specifics of the config used.

    Use as e.g. Lookup(pid=XXX)
    """
    pid = models.PositiveIntegerField(
        primary_key=True,
        db_column=PatientInfoConstants.PID_FIELDNAME)
    mpid = models.PositiveIntegerField(
        db_column=PatientInfoConstants.MPID_FIELDNAME)
    rid = models.CharField(
        db_column=PatientInfoConstants.RID_FIELDNAME,
        max_length=get_longest_supported_hasher_output_length())
    mrid = models.CharField(
        db_column=PatientInfoConstants.MRID_FIELDNAME,
        max_length=get_longest_supported_hasher_output_length())
    trid = models.PositiveIntegerField(
        db_column=PatientInfoConstants.TRID_FIELDNAME)

    class Meta:
        managed = False
        db_table = PatientInfoConstants.SECRET_MAP_TABLENAME

    # https://stackoverflow.com/questions/12158463/how-can-i-make-a-model-read-only  # noqa
    def save(self, *args, **kwargs) -> None:
        """Deliberately disabled: this table is read-only."""
        return

    def delete(self, *args, **kwargs) -> None:
        """Deliberately disabled: this table is read-only."""
        return
def get_pid_lookup(dbinfo: SingleResearchDatabase,
                   pid: Union[int, str] = None,
                   mpid: Union[int, str] = None,
                   trid: int = None,
                   rid: str = None,
                   mrid: str = None) -> Optional[PidLookup]:
    """
    Fetch the PidLookup record matching whichever identifier is supplied,
    from the secret lookup database associated with ``dbinfo``.

    Identifiers are tried in priority order: trid, rid, mrid, pid, mpid.
    Raises ValueError if no identifier is given.
    """
    dbalias = dbinfo.secret_lookup_db
    assert dbalias
    queryset = PidLookup.objects.using(dbalias)
    # First identifier that is not None wins.
    for fieldname, value in (('trid', trid),
                             ('rid', rid),
                             ('mrid', mrid),
                             ('pid', pid),
                             ('mpid', mpid)):
        if value is not None:
            return queryset.get(**{fieldname: value})
    raise ValueError("no input")
def get_mpid(dbinfo: SingleResearchDatabase,
             trid: int = None,
             rid: str = None,
             mrid: str = None) -> int:
    """
    Return the master patient ID (mpid) for the patient identified by
    whichever of trid/rid/mrid is supplied (see get_pid_lookup).
    """
    lookup = get_pid_lookup(dbinfo=dbinfo, trid=trid, rid=rid, mrid=mrid)
    # noinspection PyTypeChecker
    return lookup.mpid
def get_pid(dbinfo: SingleResearchDatabase,
            trid: int = None,
            rid: str = None,
            mrid: str = None) -> int:
    """
    Return the patient ID (pid) for the patient identified by whichever of
    trid/rid/mrid is supplied (see get_pid_lookup).
    """
    lookup = get_pid_lookup(dbinfo=dbinfo, trid=trid, rid=rid, mrid=mrid)
    # noinspection PyTypeChecker
    return lookup.pid
# =============================================================================
# Patient Explorer multi-query class
# =============================================================================
"""
1. Patient ID query
- Single database is easy; we can use RID or TRID, and therefore TRID for
performance.
Note that UNION gives only DISTINCT results by default ("UNION ALL" gives
everything).
... http://stackoverflow.com/questions/49925/what-is-the-difference-between-union-and-union-all
-- Clear, but extensibility of boolean logic less clear:
SELECT trid
FROM diagnosis_table
WHERE diagnosis LIKE 'F20%'
INTERSECT
SELECT trid
FROM progress_note_table
WHERE note LIKE '%schizophreni%' OR note LIKE '%depression%'
ORDER BY trid
... logic across tables requires careful arrangement of UNION vs. INTERSECT
... logic for multiple fields within one table can be done with AND/OR
-- Slower (?), but simpler to manipulate logic?
SELECT DISTINCT something.trid
FROM diagnosis_table INNER JOIN progress_note_table
ON diagnosis_table.trid = progress_note_table.trid
WHERE
diagnosis_table.diagnosis LIKE 'F20%'
AND (progress_note_table.note LIKE '%schizophreni%'
                OR progress_note_table.note LIKE '%depression%')
ORDER BY something.trid
-- ... boolean logic can all be encapsulated in a single WHERE clause
-- ... can also share existing join code
-- ... ?reasonable speed since the TRID fields will be indexed
-- ... preferable.
1b. Which ID for the patient ID query
... the TRID (for speed, inc. sorting) of the first database
... can use the TRID from the first "where clause" table
(don't have to join to a master patient table)
... join everything across databases as before
2. Results queries
-- Something like:
SELECT rid, date_of_note, note
FROM progress_note_table
WHERE trid IN ( ... patient_id_query ... )
ORDER BY trid
SELECT rid, date_of_diagnosis, diagnosis, diagnosis_description
FROM diagnosis_table
WHERE trid IN ( ... patient_id_query ... )
ORDER BY trid
This means we will repeat the patient_id_query, which may be inefficient.
Options:
- store the TRIDs in Python, then pass them as arguments
... at which point the SQL string/packet length becomes relevant;
... http://stackoverflow.com/questions/1869753/maximum-size-for-a-sql-server-query-in-clause-is-there-a-better-approach
... http://stackoverflow.com/questions/16335011/what-is-maximum-query-size-for-mysql
... http://stackoverflow.com/questions/96553/practical-limit-to-length-of-sql-query-specifically-mysql
- let the database worry about it
... probably best for now!
3. Display
One patient per page, with multiple results tables.
===========
- Boolean logic on patient selection
... within
""" # noqa
# =============================================================================
# PatientMultiQuery
# =============================================================================
@register_for_json(method=METHOD_STRIP_UNDERSCORE)
class PatientMultiQuery(object):
    """
    Represents a "Patient Explorer" multi-query:

    - a set of output columns to display (``output_columns``);
    - a set of WHERE conditions (``patient_conditions``), ANDed together, used
      to build an SQL query that finds matching patients' master research IDs
      (MRIDs); or, alternatively,
    - a manually written patient-finding query
      (``manual_patient_id_query``) that overrides the automatic one.

    Serialized to/from JSON (via register_for_json) for storage in a
    PatientExplorer's JsonClassField.
    """
    def __init__(self,
                 output_columns: List[ColumnId] = None,
                 patient_conditions: List[WhereCondition] = None,
                 manual_patient_id_query: str = ''):
        # Columns to show for the selected patients.
        self._output_columns = output_columns or []  # type: List[ColumnId]
        # WHERE conditions selecting patients; combined with AND.
        self._patient_conditions = patient_conditions or []  # type: List[WhereCondition]  # noqa
        # Optional hand-written SQL that overrides the automatic query.
        self._manual_patient_id_query = manual_patient_id_query or ''

    def __repr__(self) -> str:
        return (
            "<{qualname}("
            "output_columns={output_columns}, "
            "patient_conditions={patient_conditions}, "
            "manual_patient_id_query={manual_patient_id_query}"
            ") at {addr}>".format(
                qualname=self.__class__.__qualname__,
                output_columns=repr(self._output_columns),
                patient_conditions=repr(self._patient_conditions),
                manual_patient_id_query=repr(self._manual_patient_id_query),
                addr=hex(id(self)),
            )
        )

    def __eq__(self, other: 'PatientMultiQuery') -> bool:
        # Equality is by value, over all three stored attributes.
        return (
            self._output_columns == other._output_columns and
            self._patient_conditions == other._patient_conditions and
            self._manual_patient_id_query == other._manual_patient_id_query
        )

    def __hash__(self) -> int:
        """
        WARNING: Python's hash() function converts the result of __hash__()
        to the integer width of the host machine, so 64-bit results can get
        down-converted to 32 bits. Use hash64() directly if you want a 64-bit
        result.
        """
        return self.hash64

    @property
    def hash64(self) -> int:
        """64-bit non-cryptographic hash of this object's JSON encoding."""
        return hash64(json_encode(self))

    @property
    def output_columns(self) -> List[ColumnId]:
        """The columns to display for selected patients."""
        return self._output_columns

    @property
    def has_output_columns(self) -> bool:
        """Is at least one output column defined?"""
        return bool(self._output_columns)

    @property
    def ok_to_run(self) -> bool:
        """Can this multi-query be run (columns AND a patient query)?"""
        return self.has_output_columns and self.has_patient_id_query

    @property
    def patient_conditions(self) -> List[WhereCondition]:
        """The WHERE conditions used to select patients."""
        return self._patient_conditions

    @property
    def manual_patient_id_query(self) -> str:
        """The manual override query, or '' if none is set."""
        return self._manual_patient_id_query

    def add_output_column(self, column_id: ColumnId) -> None:
        """Add an output column (if not already present), keeping the list
        sorted."""
        if column_id not in self._output_columns:
            self._output_columns.append(column_id)
            self._output_columns.sort()

    def clear_output_columns(self) -> None:
        """Remove all output columns."""
        self._output_columns = []

    def add_patient_condition(self, where: WhereCondition) -> None:
        """Add a patient WHERE condition (if not already present), keeping
        the list sorted."""
        if where not in self._patient_conditions:
            self._patient_conditions.append(where)
            self._patient_conditions.sort()

    def clear_patient_conditions(self) -> None:
        """Remove all patient WHERE conditions."""
        self._patient_conditions = []

    def set_override_query(self, query: str) -> None:
        """Set the manual patient-finding query (overrides conditions)."""
        self._manual_patient_id_query = query

    def _get_select_mrid_column(self) -> Optional[ColumnId]:
        # The MRID column linked to the table of the first patient condition,
        # or None if there are no conditions.
        if not self._patient_conditions:
            return None
        return research_database_info.get_linked_mrid_column(
            self._patient_conditions[0].table_id)

    @property
    def has_patient_id_query(self) -> bool:
        """Do we have a usable patient-finding query (manual or automatic)?"""
        if self._manual_patient_id_query:
            return True
        if self._patient_conditions:
            mrid_col = self._get_select_mrid_column()
            if mrid_col and mrid_col.is_valid:
                return True
        return False

    def patient_id_query(self, with_order_by: bool = True) -> str:
        """
        Returns an SQL SELECT statement that finds distinct MRIDs, based on
        the list of WHERE conditions already stored, joined with AND by
        default. Returns '' if no query can be built. The manual override
        query, if set, takes priority.
        """
        if self._manual_patient_id_query:
            # User has specified one manually.
            return self._manual_patient_id_query
        if not self._patient_conditions:
            return ''
        grammar = research_database_info.grammar
        select_mrid_column = self._get_select_mrid_column()
        if not select_mrid_column.is_valid:
            log.warning(
                "PatientMultiQuery.patient_id_query(): invalid "
                "select_mrid_column: {}".format(repr(select_mrid_column)))
            # One way this can happen: (1) a user saves a PMQ; (2) the
            # administrator removes one of the databases!
            return ''
        mrid_alias = "_mrid"
        sql = add_to_select(
            '',
            grammar=grammar,
            select_elements=[SelectElement(column_id=select_mrid_column,
                                           alias=mrid_alias)],
            distinct=True,
            where_conditions=(self._patient_conditions + [WhereCondition(
                column_id=select_mrid_column, op="IS NOT NULL")]),
            where_type="AND",
            magic_join=True,
            formatted=False
        )
        if with_order_by:
            sql += " ORDER BY " + mrid_alias
            # ... ORDER BY is important for consistency across runs
        sql = format_sql(sql)
        # log.critical(sql)
        return sql

    @property
    def all_full_queries(self) -> List[Tuple[TableId, str, List[Any]]]:
        """All output queries, unrestricted as to patient (uses the stored
        patient-finding query)."""
        return self.all_queries(mrids=None)

    def all_queries_specific_patients(
            self,
            mrids: List[int]) -> List[Tuple[TableId, str, List[Any]]]:
        """All output queries, restricted to the specified MRIDs."""
        return self.all_queries(mrids=mrids)

    def all_queries(self,
                    mrids: List[Any] = None) -> List[Tuple[TableId, str,
                                                           List[Any]]]:
        """
        Build one query per output table; returns a list of
        (table_id, sql, args) tuples.
        """
        queries = []
        table_columns_map = columns_to_table_column_hierarchy(
            self._output_columns, sort=True)
        for table, columns in table_columns_map:
            table_sql_args = self.make_query(table_id=table,
                                             columns=columns,
                                             mrids=mrids)
            queries.append(table_sql_args)
        return queries

    def where_patient_clause(self, table_id: TableId,
                             grammar: SqlGrammar,
                             mrids: List[Any] = None) -> SqlArgsTupleType:
        """
        Returns (sql, args) for a WHERE clause restricting ``table_id`` to
        the selected patients: either "mrid IN (?, ?, ...)" with ``mrids`` as
        args, or "mrid IN (<patient_id_query>)" with no args.
        """
        mrid_column = research_database_info.get_mrid_column_from_table(
            table_id)
        if mrids:
            in_clause = ",".join(["?"] * len(mrids))
            # ... see notes for translate_sql_qmark_to_percent()
            args = mrids
        else:
            # If we haven't specified specific patients, use our patient-
            # finding query.
            in_clause = self.patient_id_query(with_order_by=False)
            # ... SQL Server moans if you use use ORDER BY in a subquery:
            # "The ORDER BY clause is invalid in views, inline functions,
            # derived tables, subqueries, ... unless TOP, OFFSET or FOR XML
            # is specified."
            args = []
        sql = "{mrid} IN ({in_clause})".format(
            mrid=mrid_column.identifier(grammar),
            in_clause=in_clause)
        return sql, args

    def make_query(self,
                   table_id: TableId,
                   columns: List[ColumnId],
                   mrids: List[Any] = None) -> Tuple[TableId, str, List[Any]]:
        """
        Build the output query for one table: SELECT the MRID column plus the
        requested columns, restricted to the selected patients. Returns
        (table_id, sql, args). Raises ValueError if no columns are given.
        """
        if not columns:
            raise ValueError("No columns specified")
        grammar = research_database_info.grammar
        mrid_column = research_database_info.get_mrid_column_from_table(
            table_id)
        # MRID column first, then the requested columns (no duplicates).
        all_columns = [mrid_column]
        for c in columns:
            if c not in all_columns:
                all_columns.append(c)
        where_clause, args = self.where_patient_clause(table_id, grammar,
                                                       mrids)
        select_elements = [SelectElement(column_id=col)
                           for col in all_columns]
        where_conditions = [WhereCondition(raw_sql=where_clause)]
        sql = add_to_select('',
                            grammar=grammar,
                            select_elements=select_elements,
                            where_conditions=where_conditions,
                            magic_join=True,
                            formatted=True)
        return table_id, sql, args

    # -------------------------------------------------------------------------
    # Display
    # -------------------------------------------------------------------------

    @property
    def output_cols_html(self) -> str:
        """The output columns, as pretty-printed HTML-formatted SQL."""
        grammar = research_database_info.grammar
        return prettify_sql_html("\n".join(
            [column_id.identifier(grammar)
             for column_id in self.output_columns]))

    @property
    def pt_conditions_html(self) -> str:
        """The patient conditions (ANDed), as pretty-printed HTML SQL."""
        grammar = research_database_info.grammar
        return prettify_sql_html("\nAND ".join([
            wc.sql(grammar) for wc in self.patient_conditions]))

    def summary_html(self, element_counter: HtmlElementCounter) -> str:
        """HTML summary of this multi-query (output columns + patient
        selection), with collapsible overflow divs."""
        def collapser(x: str) -> str:
            return element_counter.overflow_div(contents=x)
        outcols = self.output_cols_html
        manual_query = self.manual_patient_id_query
        if manual_query:
            manual_or_auto = " (MANUAL)"
            ptselect = prettify_sql_html(manual_query)
        else:
            manual_or_auto = ""
            ptselect = self.pt_conditions_html
        # NOTE(review): manual_or_auto is passed to format() but the template
        # below contains no {manual_or_auto} placeholder, so it is currently
        # unused — possibly lost in transcription; confirm against upstream.
        return """
            Output columns:<br>
            {outcols}
            Patient selection:<br>
            {ptselect}
        """.format(
            outcols=collapser(outcols),
            manual_or_auto=manual_or_auto,
            ptselect=collapser(ptselect),
        )

    # -------------------------------------------------------------------------
    # Data finder: COUNT(*) for all patient tables
    # -------------------------------------------------------------------------

    def gen_data_finder_queries(self, mrids: List[Any] = None) \
            -> Generator[Tuple[str, str, List[Any]], None, None]:
        """
        Generates (table_identifier, sql, args).
        When executed, query gives:
            research_id, table_name, n_records, min_date, max_date
        """
        grammar = research_database_info.grammar
        mrid_alias = 'master_research_id'
        table_name_alias = 'table_name'
        n_records_alias = 'n_records'
        min_date_alias = 'min_date'
        max_date_alias = 'max_date'
        for table_id in research_database_info.get_mrid_linkable_patient_tables():  # noqa
            mrid_col = research_database_info.get_mrid_column_from_table(
                table=table_id)
            date_col = research_database_info.get_default_date_column(
                table=table_id)
            if date_col:
                min_date = "MIN({})".format(date_col.identifier(grammar))
                max_date = "MAX({})".format(date_col.identifier(grammar))
            else:
                # Table has no default date column; report NULL dates.
                min_date = "NULL"
                max_date = "NULL"
                # ... OK (at least in MySQL) to do:
                # SELECT col1, COUNT(*), NULL FROM table GROUP BY col1;
            where_clause, args = self.where_patient_clause(
                table_id, grammar, mrids)
            table_identifier = table_id.identifier(grammar)
            select_elements = [
                SelectElement(column_id=mrid_col, alias=mrid_alias),
                SelectElement(raw_select=sql_string_literal(table_identifier),
                              alias=table_name_alias),
                SelectElement(raw_select='COUNT(*)',
                              from_table_for_raw_select=table_id,
                              alias=n_records_alias),
                SelectElement(raw_select=min_date,
                              from_table_for_raw_select=table_id,
                              alias=min_date_alias),
                SelectElement(raw_select=max_date,
                              from_table_for_raw_select=table_id,
                              alias=max_date_alias),
            ]
            where_conditions = [WhereCondition(raw_sql=where_clause)]
            sql = add_to_select('',
                                grammar=grammar,
                                select_elements=select_elements,
                                where_conditions=where_conditions,
                                magic_join=True,
                                formatted=False)
            sql += "\nGROUP BY " + mrid_col.identifier(grammar)
            sql += "\nORDER BY " + mrid_alias
            sql = format_sql(sql)
            yield table_identifier, sql, args

    # -------------------------------------------------------------------------
    # Monster data: SELECT * for all patient tables
    # -------------------------------------------------------------------------

    def gen_monster_queries(self, mrids: List[int] = None) \
            -> Generator[List[Tuple[TableId, str, List[Any]]], None, None]:
        """
        For each MRID-linkable patient table, generate (table_id, sql, args)
        for "SELECT * ... WHERE <patient restriction> ORDER BY mrid".
        """
        grammar = research_database_info.grammar
        for table_id in research_database_info.get_mrid_linkable_patient_tables():  # noqa
            mrid_col = research_database_info.get_mrid_column_from_table(
                table=table_id)
            where_clause, args = self.where_patient_clause(
                table_id, grammar, mrids)
            # We add the WHERE using our magic query machine, to get the joins
            # right:
            select_elements = [
                SelectElement(raw_select='*',
                              from_table_for_raw_select=table_id),
            ]
            where_conditions = [
                WhereCondition(raw_sql=where_clause,
                               from_table_for_raw_sql=mrid_col.table_id),
            ]
            sql = add_to_select(
                '',
                grammar=grammar,
                select_elements=select_elements,
                where_conditions=where_conditions,
                magic_join=True,
                formatted=False)
            sql += " ORDER BY " + mrid_col.identifier(grammar)
            sql = format_sql(sql)
            yield table_id, sql, args
# =============================================================================
# PatientExplorer
# =============================================================================

# String forward reference to the PatientExplorer class, for use in type
# hints that appear before (or inside) the class definition below.
PATIENT_EXPLORER_FWD_REF = "PatientExplorer"
class PatientExplorer(models.Model):
    """
    Class to explore the research database on a per-patient basis.

    Wraps a PatientMultiQuery (stored as JSON) with Django model
    machinery: per-user "active" status, soft deletion, and auditing.
    """
    class Meta:
        app_label = "research"

    id = models.AutoField(primary_key=True)  # automatic
    user = models.ForeignKey(settings.AUTH_USER_MODEL,
                             on_delete=models.CASCADE)
    patient_multiquery = JsonClassField(
        verbose_name='PatientMultiQuery as JSON',
        null=True)  # type: PatientMultiQuery
    pmq_hash = models.BigIntegerField(
        verbose_name='64-bit non-cryptographic hash of JSON of '
                     'patient_multiquery')
    active = models.BooleanField(default=True)  # see save() below
    created = models.DateTimeField(auto_now_add=True)
    deleted = models.BooleanField(
        default=False,
        verbose_name="Deleted from the user's perspective. "
                     "Audited queries are never properly deleted.")
    audited = models.BooleanField(default=False)

    def __init__(self, *args, **kwargs) -> None:
        """Ensures patient_multiquery is never None/empty."""
        super().__init__(*args, **kwargs)
        if not self.patient_multiquery:
            self.patient_multiquery = PatientMultiQuery()

    def __str__(self) -> str:
        return "<PatientExplorer id={}>".format(self.id)

    def save(self, *args, **kwargs) -> None:
        """
        Custom save method. Ensures that only one PatientExplorer has
        active == True for a given user.
        Also sets the hash.
        """
        if self.active:
            # Deactivate all other active PEs for this user first.
            PatientExplorer.objects\
                .filter(user=self.user, active=True)\
                .update(active=False)
        # Beware: Python's hash() function will downconvert to 32 bits on
        # 32-bit machines; use pmq.hash64() directly, not hash(pmq).
        self.pmq_hash = self.patient_multiquery.hash64
        super().save(*args, **kwargs)

    # -------------------------------------------------------------------------
    # Fetching
    # -------------------------------------------------------------------------

    @staticmethod
    def get_active_pe_or_none(request: HttpRequest) \
            -> Optional[PATIENT_EXPLORER_FWD_REF]:
        """Returns the requesting user's active PatientExplorer, or None."""
        if not request.user.is_authenticated:
            return None
        try:
            return PatientExplorer.objects.get(user=request.user,
                                               active=True)
        except PatientExplorer.DoesNotExist:
            return None

    @staticmethod
    def get_active_pe_id_or_none(request: HttpRequest) -> Optional[int]:
        """Returns the ID of the user's active PatientExplorer, or None."""
        # Delegate rather than duplicating the lookup logic.
        pe = PatientExplorer.get_active_pe_or_none(request)
        return pe.id if pe else None

    # -------------------------------------------------------------------------
    # Activating, deleting, auditing
    # -------------------------------------------------------------------------

    def activate(self) -> None:
        """Makes this the user's (single) active PE; see save()."""
        self.active = True
        self.save()

    def mark_audited(self) -> None:
        """Flags this PE as audited (idempotent; skips a pointless save)."""
        if self.audited:
            return
        self.audited = True
        self.save()

    def mark_deleted(self) -> None:
        """Soft-deletes: flags as deleted and deactivates (idempotent)."""
        if self.deleted:
            return
        self.deleted = True
        self.active = False
        self.save()

    def delete_if_permitted(self) -> None:
        """If a PE has been audited, it isn't properly deleted."""
        if self.deleted:
            log.debug("already flagged as deleted")
            return
        if self.audited:
            # Audited PEs are only ever soft-deleted.
            log.debug("marking as deleted")
            self.mark_deleted()
        else:
            # actually delete
            log.debug("actually deleting")
            self.delete()

    def audit(self, count_only: bool = False, n_records: int = 0,
              failed: bool = False, fail_msg: str = "") -> None:
        """
        Writes a PatientExplorerAudit row for this PE and marks this PE
        as audited.

        Args:
            count_only: was this a count-only execution?
            n_records: number of records returned
            failed: did the query fail?
            fail_msg: failure message, if it failed
        """
        a = PatientExplorerAudit(patient_explorer=self,
                                 count_only=count_only,
                                 n_records=n_records,
                                 failed=failed,
                                 fail_msg=fail_msg)
        a.save()
        self.mark_audited()

    # -------------------------------------------------------------------------
    # Using the internal PatientMultiQuery
    # -------------------------------------------------------------------------

    def all_queries(self,
                    mrids: List[Any] = None) -> List[Tuple[TableId, str,
                                                           List[Any]]]:
        """Returns all (table_id, sql, args) queries from the PMQ."""
        return self.patient_multiquery.all_queries(mrids=mrids)

    @staticmethod
    def get_executed_cursor(sql: str, args: List[Any] = None) -> CursorWrapper:
        """
        Returns a research-database cursor with the query executed.
        Translates ?-style placeholders to %s-style first.
        """
        sql = translate_sql_qmark_to_percent(sql)
        return get_executed_researchdb_cursor(sql, args)

    def get_patient_mrids(self) -> List[int]:
        """Returns all MRIDs matching this PE's patient ID query."""
        sql = self.patient_multiquery.patient_id_query(with_order_by=True)
        with self.get_executed_cursor(sql) as cursor:
            return [row[0] for row in cursor.fetchall()]

    def get_zipped_tsv_binary(self) -> bytes:
        """
        Returns a ZIP file (as bytes) containing one TSV file per query
        of the internal PatientMultiQuery.

        Rows are fetched one at a time (cursor.fetchone) so giant result
        sets aren't passed around beyond what's necessary.
        """
        grammar = make_grammar(settings.RESEARCH_DB_DIALECT)
        memfile = io.BytesIO()
        # Context manager ensures the ZIP central directory is finalized
        # even if a query raises partway through.
        with zipfile.ZipFile(memfile, "w") as z:
            for table_id, sql, args in self.patient_multiquery.all_queries():
                with self.get_executed_cursor(sql, args) as cursor:
                    fieldnames = get_fieldnames_from_cursor(cursor)
                    # Collect rows and join once; repeated "tsv += ..."
                    # is quadratic in the number of rows.
                    tsv_rows = [make_tsv_row(fieldnames)]
                    row = cursor.fetchone()
                    while row is not None:
                        tsv_rows.append(make_tsv_row(row))
                        row = cursor.fetchone()
                    tsv = "".join(tsv_rows)
                filename = table_id.identifier(grammar) + ".tsv"
                z.writestr(filename, tsv.encode("utf-8"))
        return memfile.getvalue()

    def get_xlsx_binary(self) -> bytes:
        """
        Returns an XLSX workbook (as bytes): one sheet per query of the
        internal PatientMultiQuery, plus a "SQL" sheet recording the SQL,
        arguments and execution time of each.

        Other notes:
        - cell size:
          http://stackoverflow.com/questions/13197574/python-openpyxl-column-width-size-adjust
          ... and the "auto_size" / "bestFit" options don't really do the
          job, according to the interweb.
        """  # noqa
        wb = Workbook()
        # NOTE(review): remove_sheet() is deprecated in openpyxl >= 2.4
        # (wb.remove() is the replacement) -- confirm installed version
        # before switching.
        wb.remove_sheet(wb.active)  # remove the autocreated blank sheet
        sqlsheet_rows = [["Table", "SQL", "Args", "Executed_at"]]
        for table_id, sql, args in self.patient_multiquery.all_queries():
            sqlsheet_rows.append([str(table_id), sql, repr(args),
                                  datetime.datetime.now()])
            ws = wb.create_sheet(title=str(table_id))
            with self.get_executed_cursor(sql, args) as cursor:
                fieldnames = get_fieldnames_from_cursor(cursor)
                ws.append(fieldnames)
                row = cursor.fetchone()
                while row is not None:
                    ws.append(gen_excel_row_elements(ws, row))
                    row = cursor.fetchone()
        sql_ws = wb.create_sheet(title="SQL")
        for r in sqlsheet_rows:
            sql_ws.append(r)
        return excel_to_bytes(wb)

    # -------------------------------------------------------------------------
    # Using the internal PatientMultiQuery
    # -------------------------------------------------------------------------

    def get_patient_id_query(self, with_order_by: bool = True) -> str:
        """Returns SQL for the PMQ's patient ID query."""
        return self.patient_multiquery.patient_id_query(
            with_order_by=with_order_by)

    # -------------------------------------------------------------------------
    # Display
    # -------------------------------------------------------------------------

    @property
    def summary_html(self) -> str:
        """HTML summary of the PMQ, with collapsible elements."""
        # Nasty hack. We want collapsing things, so we want HTML element IDs.
        # We could build the HTML table in code for the Patient Explorer
        # chooser, but I was trying to do it in Django templates.
        # However, it's not easy to pass parameters (such as an
        # HtmlElementCounter) back to Python from Django templates.
        # So we can hack it a bit:
        element_counter = HtmlElementCounter(prefix="pe_{}_".format(self.id))
        return self.patient_multiquery.summary_html(
            element_counter=element_counter)

    @property
    def has_patient_id_query(self) -> bool:
        """Does the PMQ have a patient ID query?"""
        return self.patient_multiquery.has_patient_id_query

    @property
    def has_output_columns(self) -> bool:
        """Does the PMQ have any output columns?"""
        return self.patient_multiquery.has_output_columns

    # -------------------------------------------------------------------------
    # Data finder
    # -------------------------------------------------------------------------

    @property
    def data_finder_excel(self) -> bytes:
        """
        Returns an XLSX workbook (as bytes) from the data-finder queries
        (research_id, table_name, n_records, min_date, max_date): one
        sheet per patient (named by MRID), an "All_patients" sheet, and a
        "SQL" sheet.

        (Fix: docstring previously claimed it returned (fieldnames,
        rows).)
        """
        fieldnames = []
        wb = Workbook()
        wb.remove_sheet(wb.active)  # remove the autocreated blank sheet
        all_ws = wb.create_sheet("All_patients")
        sql_ws = wb.create_sheet("SQL")
        sql_ws.append(["Table", "SQL", "Args", "Executed_at"])
        for table_identifier, sql, args in \
                self.patient_multiquery.gen_data_finder_queries():
            sql_ws.append([table_identifier,
                           format_sql(sql),
                           repr(args),
                           datetime.datetime.now()])
            with self.get_executed_cursor(sql, args) as cursor:
                # All data-finder queries share the same output columns,
                # so fieldnames are captured from the first cursor only.
                if not fieldnames:
                    fieldnames = get_fieldnames_from_cursor(cursor)
                    all_ws.append(fieldnames)
                row = cursor.fetchone()
                while row is not None:
                    mrid = str(row[0])  # first column is the MRID
                    if mrid in wb:
                        ws = wb[mrid]
                    else:
                        ws = wb.create_sheet(mrid)
                        ws.append(fieldnames)
                    rowtuple = tuple(row)
                    ws.append(rowtuple)
                    all_ws.append(rowtuple)
                    row = cursor.fetchone()
        return excel_to_bytes(wb)
# =============================================================================
# PatientExplorer auditing class
# =============================================================================

class PatientExplorerAudit(models.Model):
    """
    Audit log for a PatientExplorer.

    Rows are created by PatientExplorer.audit(); a PatientExplorer that
    has audit records is never hard-deleted (see delete_if_permitted).
    """
    id = models.AutoField(primary_key=True)  # automatic
    # PROTECT: the database refuses to delete a PatientExplorer that has
    # audit records; such PEs are only ever soft-deleted.
    patient_explorer = models.ForeignKey('PatientExplorer',
                                         on_delete=models.PROTECT)
    when = models.DateTimeField(auto_now_add=True)  # time of the audit event
    count_only = models.BooleanField(default=False)  # was it a count-only run?
    n_records = models.IntegerField(default=0)
    # ... not PositiveIntegerField; SQL Server gives -1, for example
    failed = models.BooleanField(default=False)  # did the query fail?
    fail_msg = models.TextField()  # failure message, if failed

    def __str__(self):
        return "<PatientExplorerAudit id={}>".format(self.id)