#!/usr/bin/env python
# crate_anon/anonymise/scrub.py
"""
===============================================================================
Copyright (C) 2015-2018 Rudolf Cardinal (rudolf@pobox.com).
This file is part of CRATE.
CRATE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
CRATE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with CRATE. If not, see <http://www.gnu.org/licenses/>.
===============================================================================
Scrubber classes for CRATE anonymiser.
"""
from collections import OrderedDict
import datetime
import logging
import string
from typing import (Any, Dict, Iterable, Generator, List, Optional, Pattern,
Union)
from cardinal_pythonlib.datetimefunc import coerce_to_datetime
from cardinal_pythonlib.hash import GenericHasher
from cardinal_pythonlib.rnc_db import (
is_sqltype_date,
is_sqltype_text_over_one_char,
)
from cardinal_pythonlib.text import UNICODE_CATEGORY_STRINGS
from flashtext import KeywordProcessor
from crate_anon.anonymise.constants import SCRUBMETHOD
from crate_anon.anonymise.anonregex import (
get_anon_fragments_from_string,
get_code_regex_elements,
get_date_regex_elements,
get_number_of_length_n_regex_elements,
get_phrase_regex_elements,
get_regex_from_elements,
get_regex_string_from_elements,
get_string_regex_elements,
get_uk_postcode_regex_elements,
)
from crate_anon.common.stringfunc import (
get_digit_string_from_vaguely_numeric_string,
reduce_to_alphanumeric,
)
log = logging.getLogger(__name__)
# =============================================================================
# Generic scrubber base class
# =============================================================================
class ScrubberBase(object):
    """Abstract base class for all scrubbers.

    Concrete subclasses must implement :meth:`scrub` and :meth:`get_hash`.
    """

    def __init__(self, hasher: GenericHasher) -> None:
        """
        :param hasher: object implementing GenericHasher interface
        """
        self.hasher = hasher

    def scrub(self, text: str) -> str:
        """Returns a scrubbed version of the text. Subclasses override."""
        raise NotImplementedError()

    def get_hash(self) -> str:
        """Returns a hash of the scrubber itself, used for change
        detection. Subclasses override."""
        raise NotImplementedError()
# =============================================================================
# WordList -- this serves a dual function as a whitelist (is a word in the
# list?) and a blacklist (scrub text using the wordlist).
# =============================================================================
def lower_case_words_from_file(fileobj: Iterable[str]) -> Generator[str, None,
                                                                     None]:
    """Generate every whitespace-separated word from *fileobj*, lower-cased.

    :param fileobj: an iterable of lines, e.g. an open text file
    """
    for line in fileobj:
        yield from (token.lower() for token in line.split())
# Word characters for FlashText: its defaults (digits, ASCII letters, "_")
# plus all Latin alphabetic characters.
# Why do we do this? So e.g. "naïve" isn't truncated to "naï[~~~]".
# Check: FLASHTEXT_WORDCHAR_STR = "".join(sorted(FLASHTEXT_WORD_CHARACTERS))
FLASHTEXT_WORD_CHARACTERS = (
    set(string.digits) |  # part of flashtext default
    set(string.ascii_letters) |  # part of flashtext default
    {'_'} |  # part of flashtext default
    set(UNICODE_CATEGORY_STRINGS['Latin_Alphabetic'])
)
class WordList(ScrubberBase):
    """Scrubber that removes all words in a wordlist, case-insensitively.

    Serves a dual function: as a whitelist (via :meth:`contains`: is a word
    in the list?) and as a blacklist (via :meth:`scrub`: remove all listed
    words from some text).

    Scrubbing uses either a regex (``regex_method=True``) or a FlashText
    keyword processor (the default). The compiled regex/processor and the
    hash are cached; the cache is invalidated when words are added.
    """
    def __init__(self,
                 filenames: Optional[Iterable[str]] = None,
                 words: Optional[Iterable[str]] = None,
                 replacement_text: str = '[---]',
                 hasher: Optional[GenericHasher] = None,
                 suffixes: Optional[List[str]] = None,
                 at_word_boundaries_only: bool = True,
                 max_errors: int = 0,
                 regex_method: bool = False) -> None:
        """
        :param filenames: filenames of word files to read (whitespace-
            separated words, one or more per line)
        :param words: words to add directly
        :param replacement_text: text to replace matched words with
        :param hasher: object implementing GenericHasher interface
        :param suffixes: suffixes to permit on words (regex method only)
        :param at_word_boundaries_only: match only at word boundaries
            (regex method only)
        :param max_errors: maximum typographical errors to permit when
            matching (regex method only)
        :param regex_method: use regexes rather than FlashText?
        """
        filenames = filenames or []
        words = words or []
        super().__init__(hasher)
        self.replacement_text = replacement_text
        self.suffixes = suffixes
        self.at_word_boundaries_only = at_word_boundaries_only
        self.max_errors = max_errors
        self.regex_method = regex_method
        # Cached/derived state; rebuilt lazily by build():
        self._regex = None  # type: Optional[Pattern]
        self._processor = None  # type: Optional[KeywordProcessor]
        self._cached_hash = None  # type: Optional[str]
        self._built = False
        self.words = set()
        # Sets are faster than lists for "is x in s" operations:
        # http://stackoverflow.com/questions/2831212/python-sets-vs-lists
        # noinspection PyTypeChecker
        for f in filenames:
            self.add_file(f, clear_cache=False)
        # noinspection PyTypeChecker
        for w in words:
            self.add_word(w, clear_cache=False)
        # log.debug("Created wordlist with {} words".format(len(self.words)))

    def clear_cache(self) -> None:
        """Clear cached information (compiled regex/processor and hash)."""
        self._built = False
        self._regex = None
        self._processor = None
        self._cached_hash = None

    def add_word(self, word: str, clear_cache: bool = True) -> None:
        """Add a single word (stored lower-cased); empty words are ignored.

        ``clear_cache=False`` lets bulk callers defer the cache reset.
        """
        if not word:
            return
        self.words.add(word.lower())
        if clear_cache:
            self.clear_cache()

    def add_file(self, filename: str, clear_cache: bool = True) -> None:
        """Add all whitespace-separated words from a file (lower-cased)."""
        with open(filename) as f:
            wordgen = lower_case_words_from_file(f)
            for w in wordgen:
                self.words.add(w)
        if clear_cache:
            self.clear_cache()

    def contains(self, word: str) -> bool:
        """Is the word in the wordlist? (Case-insensitive.)"""
        return word.lower() in self.words

    def get_hash(self) -> str:
        """Returns a hash of the wordlist's contents."""
        # A set is unordered.
        # We want the hash to be the same if we have the same words, even if
        # they were entered in a different order, so we need to sort:
        if not self._cached_hash:
            self._cached_hash = self.hasher.hash(sorted(self.words))
        return self._cached_hash

    def scrub(self, text: str) -> str:
        """Returns the text with all wordlist words replaced by the
        replacement text, building the regex/processor first if needed."""
        if not self._built:
            self.build()
        if self.regex_method:
            if not self._regex:
                # Possible, e.g. an empty wordlist: nothing to scrub.
                return text
            return self._regex.sub(self.replacement_text, text)
        else:
            if not self._processor:
                return text
            return self._processor.replace_keywords(text)

    def build(self) -> None:
        """Compile the scrubbing machinery (regex or FlashText processor)
        from the current wordlist."""
        if self.regex_method:
            elements = []
            for w in self.words:
                elements.extend(get_string_regex_elements(
                    w,
                    suffixes=self.suffixes,
                    at_word_boundaries_only=self.at_word_boundaries_only,
                    max_errors=self.max_errors
                ))
            log.debug("Building regex with {} elements".format(len(elements)))
            self._regex = get_regex_from_elements(elements)
        else:
            if self.words:
                self._processor = KeywordProcessor(case_sensitive=False)
                # Extended word characters so e.g. accented letters don't
                # act as keyword boundaries:
                self._processor.set_non_word_boundaries(
                    FLASHTEXT_WORD_CHARACTERS)
                replacement = self.replacement_text
                log.debug("Building FlashText processor with {} "
                          "keywords".format(len(self.words)))
                for w in self.words:
                    self._processor.add_keyword(w, replacement)
            else:
                self._processor = None
        self._built = True
# =============================================================================
# NonspecificScrubber
# Scrubs a bunch of things that are independent of any patient-specific data,
# such as removing all UK postcodes, or numbers of a certain length.
# =============================================================================
class NonspecificScrubber(ScrubberBase):
    """Scrubs things that are independent of any patient-specific data,
    such as all UK postcodes, or all numbers of certain lengths."""

    def __init__(self,
                 replacement_text: str,
                 hasher: GenericHasher,
                 anonymise_codes_at_word_boundaries_only: bool = True,
                 anonymise_numbers_at_word_boundaries_only: bool = True,
                 blacklist: WordList = None,
                 scrub_all_numbers_of_n_digits: List[int] = None,
                 scrub_all_uk_postcodes: bool = False) -> None:
        """
        :param replacement_text: text to replace scrubbed content with
        :param hasher: object implementing GenericHasher interface
        :param anonymise_codes_at_word_boundaries_only: match codes (e.g.
            postcodes) only at word boundaries?
        :param anonymise_numbers_at_word_boundaries_only: match numbers
            only at word boundaries?
        :param blacklist: optional WordList applied before the regex
        :param scrub_all_numbers_of_n_digits: list of values of n; all
            numbers of exactly n digits will be scrubbed
        :param scrub_all_uk_postcodes: scrub all UK postcodes?
        """
        scrub_all_numbers_of_n_digits = scrub_all_numbers_of_n_digits or []
        super().__init__(hasher)
        self.replacement_text = replacement_text
        self.anonymise_codes_at_word_boundaries_only = (
            anonymise_codes_at_word_boundaries_only)
        self.anonymise_numbers_at_word_boundaries_only = (
            anonymise_numbers_at_word_boundaries_only)
        self.blacklist = blacklist
        self.scrub_all_numbers_of_n_digits = scrub_all_numbers_of_n_digits
        self.scrub_all_uk_postcodes = scrub_all_uk_postcodes
        self._cached_hash = None
        self._regex = None
        self._regex_built = False
        self.build_regex()

    def get_hash(self) -> str:
        """Returns a hash of the scrubber's settings ("signature")."""
        if not self._cached_hash:
            signature = [
                self.anonymise_codes_at_word_boundaries_only,
                self.anonymise_numbers_at_word_boundaries_only,
                self.blacklist.get_hash() if self.blacklist else None,
                self.scrub_all_numbers_of_n_digits,
                self.scrub_all_uk_postcodes,
            ]
            self._cached_hash = self.hasher.hash(signature)
        return self._cached_hash

    def scrub(self, text: str) -> str:
        """Returns a scrubbed version of the text: blacklist first (if
        any), then the nonspecific regex."""
        if not self._regex_built:
            self.build_regex()
        if self.blacklist:
            text = self.blacklist.scrub(text)
        if not self._regex:
            # Possible: the regex may be blank if nothing was configured.
            return text
        return self._regex.sub(self.replacement_text, text)

    def build_regex(self) -> None:
        """Compile the scrubbing regex from the configured options."""
        elements = []  # type: List[str]
        if self.scrub_all_uk_postcodes:
            elements += get_uk_postcode_regex_elements(
                at_word_boundaries_only=(
                    self.anonymise_codes_at_word_boundaries_only))
        # noinspection PyTypeChecker
        for n in self.scrub_all_numbers_of_n_digits:
            elements += get_number_of_length_n_regex_elements(
                n,
                at_word_boundaries_only=(
                    self.anonymise_numbers_at_word_boundaries_only))
        self._regex = get_regex_from_elements(elements)
        self._regex_built = True
# =============================================================================
# PersonalizedScrubber
# =============================================================================
class PersonalizedScrubber(ScrubberBase):
    """Accepts patient-specific (patient and third-party) information, and
    uses that to scrub text."""

    def __init__(self,
                 replacement_text_patient: str,
                 replacement_text_third_party: str,
                 hasher: GenericHasher,
                 anonymise_codes_at_word_boundaries_only: bool = True,
                 anonymise_dates_at_word_boundaries_only: bool = True,
                 anonymise_numbers_at_word_boundaries_only: bool = True,
                 anonymise_numbers_at_numeric_boundaries_only: bool = True,
                 anonymise_strings_at_word_boundaries_only: bool = True,
                 min_string_length_for_errors: int = 4,
                 min_string_length_to_scrub_with: int = 3,
                 scrub_string_suffixes: List[str] = None,
                 string_max_regex_errors: int = 0,
                 whitelist: WordList = None,
                 nonspecific_scrubber: NonspecificScrubber = None,
                 debug: bool = False) -> None:
        """
        :param replacement_text_patient: replacement text for patient
            information
        :param replacement_text_third_party: replacement text for
            third-party information
        :param hasher: object implementing GenericHasher interface
        :param anonymise_codes_at_word_boundaries_only: match codes only at
            word boundaries?
        :param anonymise_dates_at_word_boundaries_only: match dates only at
            word boundaries?
        :param anonymise_numbers_at_word_boundaries_only: match numbers
            only at word boundaries?
        :param anonymise_numbers_at_numeric_boundaries_only: match numbers
            only at numeric boundaries (so e.g. a number embedded in a
            longer digit string isn't scrubbed)?
        :param anonymise_strings_at_word_boundaries_only: match strings
            only at word boundaries?
        :param min_string_length_for_errors: minimum string length at which
            typographical errors are permitted when matching
        :param min_string_length_to_scrub_with: strings shorter than this
            are not used to scrub
        :param scrub_string_suffixes: suffixes to permit on strings
        :param string_max_regex_errors: maximum typographical errors to
            permit when matching strings
        :param whitelist: optional WordList of words never to scrub
        :param nonspecific_scrubber: optional NonspecificScrubber applied
            before the patient/third-party regexes
        :param debug: log the final regexes when they are built?
        """
        scrub_string_suffixes = scrub_string_suffixes or []
        super().__init__(hasher)
        self.replacement_text_patient = replacement_text_patient
        self.replacement_text_third_party = replacement_text_third_party
        self.anonymise_codes_at_word_boundaries_only = (
            anonymise_codes_at_word_boundaries_only)
        self.anonymise_dates_at_word_boundaries_only = (
            anonymise_dates_at_word_boundaries_only)
        self.anonymise_numbers_at_word_boundaries_only = (
            anonymise_numbers_at_word_boundaries_only)
        self.anonymise_numbers_at_numeric_boundaries_only = (
            anonymise_numbers_at_numeric_boundaries_only)
        self.anonymise_strings_at_word_boundaries_only = (
            anonymise_strings_at_word_boundaries_only)
        self.min_string_length_for_errors = min_string_length_for_errors
        self.min_string_length_to_scrub_with = min_string_length_to_scrub_with
        self.scrub_string_suffixes = scrub_string_suffixes
        self.string_max_regex_errors = string_max_regex_errors
        self.whitelist = whitelist
        self.nonspecific_scrubber = nonspecific_scrubber
        self.debug = debug
        # Regex information
        self.re_patient = None  # type: Optional[Pattern]
        self.re_tp = None  # type: Optional[Pattern]
        self.regexes_built = False
        self.re_patient_elements = []  # type: List[str]
        self.re_tp_elements = []  # type: List[str]
        # ... both changed from set to list to reflect referee's point re
        # potential importance of scrubber order
        self.elements_tuplelist = []  # of tuples: (patient?, type, value)
        # ... used for get_raw_info(); since we've made the order important,
        # we should detect changes in order here as well
        self.clear_cache()

    def clear_cache(self) -> None:
        """Mark the compiled regexes as stale; they will be rebuilt lazily."""
        self.regexes_built = False

    @staticmethod
    def get_scrub_method(datatype_long: str,
                         scrub_method: Optional[SCRUBMETHOD]) -> SCRUBMETHOD:
        """
        Return the default scrub method for a given SQL datatype,
        unless overridden.
        """
        if scrub_method is not None:
            return scrub_method
        elif is_sqltype_date(datatype_long):
            return SCRUBMETHOD.DATE
        elif is_sqltype_text_over_one_char(datatype_long):
            return SCRUBMETHOD.WORDS
        else:
            return SCRUBMETHOD.NUMERIC

    def add_value(self,
                  value: Any,
                  scrub_method: SCRUBMETHOD,
                  patient: bool = True,
                  clear_cache: bool = True) -> None:
        """
        Add a specific value via a specific scrub_method.
        The patient flag controls whether it's treated as a patient value or
        a third-party value.

        :raises ValueError: for an unknown scrub_method (a bug)
        """
        if value is None:
            return
        # Record the raw value for get_raw_info()/change detection; keep
        # order (and avoid duplicates):
        new_tuple = (patient, scrub_method, repr(value))
        if new_tuple not in self.elements_tuplelist:
            self.elements_tuplelist.append(new_tuple)
        # Note: object reference
        r = self.re_patient_elements if patient else self.re_tp_elements
        if scrub_method is SCRUBMETHOD.DATE:
            elements = self.get_elements_date(value)
        elif scrub_method is SCRUBMETHOD.WORDS:
            elements = self.get_elements_words(value)
        elif scrub_method is SCRUBMETHOD.PHRASE:
            elements = self.get_elements_phrase(value)
        elif scrub_method is SCRUBMETHOD.NUMERIC:
            elements = self.get_elements_numeric(value)
        elif scrub_method is SCRUBMETHOD.CODE:
            elements = self.get_elements_code(value)
        else:
            raise ValueError("Bug: unknown scrub_method to add_value: "
                             "{}".format(scrub_method))
        r.extend(elements)
        if clear_cache:
            self.clear_cache()

    def get_elements_date(self,
                          value: Union[datetime.datetime,
                                       datetime.date]) -> Optional[List[str]]:
        """Regex elements for a date value; None if the value is not a
        valid date."""
        try:
            value = coerce_to_datetime(value)
        except Exception as e:
            log.warning(
                "Invalid date received to PersonalizedScrubber."
                "get_elements_date(): value={}, exception={}".format(
                    value, e))
            return None
        return get_date_regex_elements(
            value,
            at_word_boundaries_only=(
                self.anonymise_dates_at_word_boundaries_only)
        )

    def get_elements_words(self, value: str) -> List[str]:
        """Regex elements for a string containing textual words, one
        element set per usable fragment."""
        elements = []
        for s in get_anon_fragments_from_string(str(value)):
            length = len(s)
            if length < self.min_string_length_to_scrub_with:
                # With numbers: if you use the length limit, you may see
                # numeric parts of addresses, e.g. 4 Drury Lane as
                # 4 [___] [___]. However, if you exempt numbers then you
                # mess up a whole bunch of quantitative information, such
                # as "the last 4-5 years" getting wiped to "the last
                # [___]-5 years". So let's apply the length limit
                # consistently.
                continue
            if self.whitelist and self.whitelist.contains(s):
                continue
            # Permit fuzzy matching only for longer fragments:
            if length >= self.min_string_length_for_errors:
                max_errors = self.string_max_regex_errors
            else:
                max_errors = 0
            elements.extend(get_string_regex_elements(
                s,
                self.scrub_string_suffixes,
                max_errors=max_errors,
                at_word_boundaries_only=(
                    self.anonymise_strings_at_word_boundaries_only)
            ))
        return elements

    def get_elements_phrase(self, value: Any) -> List[str]:
        """Regex elements for a whole phrase (matched as one unit)."""
        value = str(value)
        if not value:
            return []
        length = len(value)
        if length < self.min_string_length_to_scrub_with:
            return []
        if self.whitelist and self.whitelist.contains(value):
            return []
        # Permit fuzzy matching only for longer phrases:
        if length >= self.min_string_length_for_errors:
            max_errors = self.string_max_regex_errors
        else:
            max_errors = 0
        return get_phrase_regex_elements(
            value,
            max_errors=max_errors,
            at_word_boundaries_only=(
                self.anonymise_strings_at_word_boundaries_only)
        )

    def get_elements_numeric(self, value: Any) -> List[str]:
        """Regex elements for a number (or a text field containing one).

        Everything but the digits is removed first, so e.g. the phone
        number "(01223) 123456" matches "01223123456".
        """
        return get_code_regex_elements(
            get_digit_string_from_vaguely_numeric_string(str(value)),
            at_word_boundaries_only=(
                self.anonymise_numbers_at_word_boundaries_only),
            at_numeric_boundaries_only=(
                self.anonymise_numbers_at_numeric_boundaries_only)
        )

    def get_elements_code(self, value: Any) -> List[str]:
        """Regex elements for an alphanumeric code.

        Whitespace is removed first, so e.g. the postcode "PE12 3AB"
        matches "PE123AB".
        """
        return get_code_regex_elements(
            reduce_to_alphanumeric(str(value)),
            at_word_boundaries_only=(
                self.anonymise_codes_at_word_boundaries_only)
        )

    def get_patient_regex_string(self) -> str:
        """Return the string version of the patient regex, sorted."""
        return get_regex_string_from_elements(self.re_patient_elements)

    def get_tp_regex_string(self) -> str:
        """Return the string version of the third-party regex, sorted."""
        return get_regex_string_from_elements(self.re_tp_elements)

    def build_regexes(self) -> None:
        """Compile the patient and third-party regexes from their
        accumulated elements."""
        self.re_patient = get_regex_from_elements(self.re_patient_elements)
        self.re_tp = get_regex_from_elements(self.re_tp_elements)
        self.regexes_built = True
        # Note that the regexes themselves may be None even if they have
        # been built.
        if self.debug:
            log.debug("Patient scrubber: {}".format(
                self.get_patient_regex_string()))
            log.debug("Third party scrubber: {}".format(
                self.get_tp_regex_string()))

    def scrub(self, text: str) -> Optional[str]:
        """Scrub some text and return the scrubbed result (None for None
        input). Order: nonspecific scrubber, then patient, then third
        party."""
        if text is None:
            return None
        if not self.regexes_built:
            self.build_regexes()
        if self.nonspecific_scrubber:
            text = self.nonspecific_scrubber.scrub(text)
        if self.re_patient:
            text = self.re_patient.sub(self.replacement_text_patient, text)
        if self.re_tp:
            text = self.re_tp.sub(self.replacement_text_third_party, text)
        return text

    def get_hash(self) -> str:
        """Returns a hash of the scrubber's raw information (settings plus
        raw scrub-from values), used for change detection."""
        return self.hasher.hash(self.get_raw_info())

    def get_raw_info(self) -> Dict[str, Any]:
        """Summary for debugging and the basis for the change-detection
        hash.

        For the latter reason we need order etc. to be consistent, hence
        an OrderedDict built from a tuple of tuples. For anything we put
        in here, changes will cause data to be re-scrubbed.
        """
        d = (
            ('anonymise_codes_at_word_boundaries_only',
             self.anonymise_codes_at_word_boundaries_only),
            ('anonymise_dates_at_word_boundaries_only',
             self.anonymise_dates_at_word_boundaries_only),
            ('anonymise_numbers_at_word_boundaries_only',
             self.anonymise_numbers_at_word_boundaries_only),
            # BUGFIX: this setting changes the regexes built by
            # get_elements_numeric(), so it must contribute to the
            # change-detection hash (it was previously omitted, meaning
            # changes to it did not trigger re-scrubbing):
            ('anonymise_numbers_at_numeric_boundaries_only',
             self.anonymise_numbers_at_numeric_boundaries_only),
            ('anonymise_strings_at_word_boundaries_only',
             self.anonymise_strings_at_word_boundaries_only),
            ('min_string_length_for_errors',
             self.min_string_length_for_errors),
            ('min_string_length_to_scrub_with',
             self.min_string_length_to_scrub_with),
            ('scrub_string_suffixes', sorted(self.scrub_string_suffixes)),
            ('string_max_regex_errors', self.string_max_regex_errors),
            ('whitelist_hash',
             self.whitelist.get_hash() if self.whitelist else None),
            ('nonspecific_scrubber_hash',
             self.nonspecific_scrubber.get_hash() if self.nonspecific_scrubber
             else None),
            ('elements', self.elements_tuplelist),
        )
        return OrderedDict(d)
_TEST_FLASHTEXT = r"""
import flashtext
replacement = "[~~~]"
keywords = [
"""