Source code for crate_anon.anonymise.anonregex

#!/usr/bin/env python
# crate_anon/anonymise/anonregex.py

"""
===============================================================================

    Copyright (C) 2015-2018 Rudolf Cardinal (rudolf@pobox.com).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <http://www.gnu.org/licenses/>.

===============================================================================
"""

# =============================================================================
# Imports
# =============================================================================

import calendar
import datetime
import dateutil.parser  # for unit tests
import logging
from typing import List, Optional, Pattern, Union
import unittest

from cardinal_pythonlib.lists import unique_list
from cardinal_pythonlib.logs import configure_logger_for_colour

# https://pypi.python.org/pypi/regex/
# https://bitbucket.org/mrabarnett/mrab-regex
import regex  # sudo apt-get install python-regex
# noinspection PyProtectedMember
from regex import _regex_core

from crate_anon.common.stringfunc import (
    get_digit_string_from_vaguely_numeric_string,  # for unit testing
    reduce_to_alphanumeric,  # for unit testing
)

log = logging.getLogger(__name__)

# =============================================================================
# Constants
# =============================================================================

REGEX_METACHARS = ["\\", "^", "$", ".",
                   "|", "?", "*", "+",
                   "(", ")", "[", "{"]
# http://www.regular-expressions.info/characters.html
# The list starts with \ so that backslashes are escaped first, before later
# replacements introduce more backslashes.

WB = r"\b"  # word boundary; escape the slash if not using a raw string

# http://www.regular-expressions.info/lookaround.html
# Not all engines support lookbehind; e.g. regexr.com doesn't; but Python does
NOT_DIGIT_LOOKBEHIND = r"(?<!\d)"
NOT_DIGIT_LOOKAHEAD = r"(?!\d)"
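
# EXAMPLE (illustrative addition, not in the original source): the lookaround
# pair above stops a number matching when it is embedded in a longer number:
#   regex.search(NOT_DIGIT_LOOKBEHIND + "23" + NOT_DIGIT_LOOKAHEAD, "page 23")
#       ... matches
#   regex.search(NOT_DIGIT_LOOKBEHIND + "23" + NOT_DIGIT_LOOKAHEAD, "1234")
#       ... returns None (the "23" inside "1234" is bordered by digits)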

# The Kleene star has highest precedence.
# So, for example, ab*c matches abbbc, but not (all of) ababc. See regexr.com
OPTIONAL_NONWORD = r"\W*"  # zero or more non-alphanumeric characters...
# ... doesn't need to be [\W]*, for precedence reasons as above.
AT_LEAST_ONE_NONWORD = r"\W+"  # one or more non-alphanumeric characters

NON_ALPHANUMERIC_SPLITTERS = regex.compile(AT_LEAST_ONE_NONWORD, regex.UNICODE)

OPTIONAL_WHITESPACE = r"\s*"  # zero or more whitespace chars
OPTIONAL_NON_NEWLINE_WHITESPACE = r"[ \t]*"  # zero or more spaces/tabs


# =============================================================================
# String manipulation
# =============================================================================

def get_anon_fragments_from_string(s: str) -> List[str]:
    """
    Takes a complex string, such as a name or address with its components
    separated by spaces, commas, etc., and returns a list of substrings to be
    used for anonymisation.

    - For example, from "John Smith", return ["John", "Smith"];
      from "John D'Souza", return ["John", "D", "Souza"];
      from "42 West Street", return ["42", "West", "Street"].

    - Try the examples listed below the function.

    - Note that this is a LIBERAL algorithm, i.e. one prone to anonymise too
      much (e.g. all instances of "Street" if someone has that as part of
      their address).

    - NOTE THAT WE USE THE "WORD BOUNDARY" FACILITY WHEN REPLACING, AND THAT
      TREATS APOSTROPHES AND HYPHENS AS WORD BOUNDARIES. Therefore, we don't
      need the largest-level chunks, like D'Souza.
    """
    return NON_ALPHANUMERIC_SPLITTERS.split(s)
    # Previous implementation, kept for reference:
    # smallfragments = []
    # combinedsmallfragments = []
    # for chunk in s.split():  # split on whitespace
    #     for smallchunk in NON_WHITESPACE_SPLITTERS.split(chunk):
    #         if smallchunk.lower() in config.words_not_to_scrub:
    #             continue
    #         smallfragments.append(smallchunk)
    #         # OVERLAP here, but we need it for the combination bit, and
    #         # we remove the overlap at the end.
    # # Now we have chunks with e.g. apostrophes in, and all chunks split by
    # # everything. Finally, we want all of these lumped together.
    # for L in xrange(len(smallfragments) + 1):
    #     for subset in itertools.combinations(smallfragments, L):
    #         if subset:
    #             combinedsmallfragments.append("".join(subset))
    # return list(set(smallfragments + combinedsmallfragments))

# EXAMPLES:
# get_anon_fragments_from_string("Bob D'Souza")
# get_anon_fragments_from_string("Jemima Al-Khalaim")
# get_anon_fragments_from_string("47 Russell Square")


# =============================================================================
# Regexes
# =============================================================================


def escape_literal_string_for_regex(s: str) -> str:
    r"""
    Escape any regex characters.

    Start with \ -> \\
        ... this should be the first replacement in REGEX_METACHARS.
    """
    for c in REGEX_METACHARS:
        s = s.replace(c, "\\" + c)  # str.replace() returns a new string
    return s
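
# EXAMPLE (illustrative addition, not in the original source):
# escape_literal_string_for_regex("PE12.3AB?")
# ... returns r"PE12\.3AB\?", so that "." and "?" are treated as literal
#     characters rather than regex metacharacters.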


# =============================================================================
# Anonymisation regexes
# =============================================================================
# Note, for strings, several typo-detecting methods:
# http://en.wikipedia.org/wiki/Levenshtein_distance
# http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/
# http://en.wikipedia.org/wiki/TRE_(computing)
# https://pypi.python.org/pypi/regex
# ... let's go with the fuzzy regex method (Python regex module).
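
# EXAMPLE (illustrative addition, not in the original source) of the fuzzy
# syntax used below: "{e<2}" after a group permits up to one
# insertion/deletion/substitution error, so
#   regex.search(r"(?:mother){e<2}", "A typo is mther.")
# finds the misspelling "mther".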


def get_date_regex_elements(
        dt: Union[datetime.datetime, datetime.date],
        at_word_boundaries_only: bool = False) -> List[str]:
    """
    Takes a datetime object and returns a list of regex strings with which
    to scrub.
    """
    # Reminders: ? zero or one, + one or more, * zero or more
    # Non-capturing groups: (?:...)
    # ... https://docs.python.org/2/howto/regex.html
    # ... http://stackoverflow.com/questions/3512471/non-capturing-group
    # Day, allowing leading zeroes and e.g. "1st, 2nd"
    day = "0*" + str(dt.day) + "(?:st|nd|rd|th)?"
    # Month, allowing leading zeroes for numeric and e.g. Feb/February
    month_numeric = "0*" + str(dt.month)
    # month_word = dt.strftime("%B")  # can't cope with years < 1900
    month_word = calendar.month_name[dt.month]
    month_word = month_word[0:3] + "(?:" + month_word[3:] + ")?"
    month = "(?:" + month_numeric + "|" + month_word + ")"
    # Year
    year = str(dt.year)
    if len(year) == 4:
        year = "(?:" + year[0:2] + ")?" + year[2:4]
        # ... converts e.g. 1986 to (19)?86, to match 1986 or 86
    sep = OPTIONAL_NONWORD
    # Regexes
    basic_regexes = [
        day + sep + month + sep + year,   # e.g. 13 Sep 2014
        month + sep + day + sep + year,   # e.g. Sep 13, 2014
        year + sep + month + sep + day,   # e.g. 2014/09/13
    ]
    if at_word_boundaries_only:
        return [WB + x + WB for x in basic_regexes]
    else:
        return basic_regexes
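
# EXAMPLE (illustrative addition, not in the original source):
# rx = get_regex_from_elements(
#     get_date_regex_elements(datetime.date(2013, 1, 7)))
# rx.sub("[DATE]", "Born 7 Jan 2013; also written 2013-01-07 and 07/01/13.")
# ... returns "Born [DATE]; also written [DATE] and [DATE]."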


def get_code_regex_elements(
        s: str,
        liberal: bool = True,
        very_liberal: bool = True,
        at_word_boundaries_only: bool = True,
        at_numeric_boundaries_only: bool = False) -> List[str]:
    """
    Takes a STRING representation of a number or an alphanumeric code, which
    may include leading zeros (as for phone numbers), and produces a list of
    regex strings for scrubbing.

    We allow all sorts of separators. For example, 0123456789 might appear as

        (01234) 56789
        0123 456 789
        01234-56789
        0123.456.789

    This can also be used for postcodes, which should have whitespace
    prestripped, so e.g. PE123AB might appear as

        PE123AB
        PE12 3AB
        PE 12 3 AB
    """
    if not s:
        return []
    s = escape_literal_string_for_regex(s)  # escape any decimal points, etc.
    if very_liberal:
        separators = OPTIONAL_NONWORD
    elif liberal:
        separators = OPTIONAL_NON_NEWLINE_WHITESPACE
    else:
        separators = ""
    s = separators.join([c for c in s])  # ... separators can appear anywhere
    if at_word_boundaries_only:
        return [WB + s + WB]
    else:
        if at_numeric_boundaries_only:
            # Even though we're not restricting to word boundaries, because
            # (for example) we want 123456 to match "M123456", it can be
            # undesirable to match numbers that are bordered only by numbers;
            # that is, with this setting, "23" should never match "234" or
            # "1234" or "123".
            # - http://www.regular-expressions.info/lookaround.html
            # - http://stackoverflow.com/questions/15099150/regex-find-one-digit-number  # noqa
            return [NOT_DIGIT_LOOKBEHIND + s + NOT_DIGIT_LOOKAHEAD]
        else:
            # But if you want to anonymise "123456" out of a phone number
            # written like "01223123456", you might have to turn that off...
            return [s]
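
# EXAMPLE (illustrative addition, not in the original source):
# get_code_regex_elements("0123456789")
# ... returns [r"\b0\W*1\W*2\W*3\W*4\W*5\W*6\W*7\W*8\W*9\b"], which matches
#     "0123456789", "(01234) 56789", "0123 456 789", and similar.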


def get_number_of_length_n_regex_elements(
        n: int,
        liberal: bool = True,
        very_liberal: bool = False,
        at_word_boundaries_only: bool = True) -> List[str]:
    """
    Get a list of regex strings for scrubbing n-digit numbers -- for example,
    to remove all 10-digit numbers as putative NHS numbers, or all 11-digit
    numbers as putative UK phone numbers.
    """
    s = ["[0-9]"] * n
    if very_liberal:
        separators = OPTIONAL_NONWORD
    elif liberal:
        separators = OPTIONAL_NON_NEWLINE_WHITESPACE
    else:
        separators = ""
    s = separators.join([c for c in s])
    if at_word_boundaries_only:
        return [WB + s + WB]
    else:
        return [NOT_DIGIT_LOOKBEHIND + s + NOT_DIGIT_LOOKAHEAD]
        # ... if there was a digit before/after, it's not an n-digit number
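
# EXAMPLE (illustrative addition, not in the original source):
# get_number_of_length_n_regex_elements(10)
# ... returns a single pattern of ten "[0-9]" classes separated by optional
#     spaces/tabs and wrapped in word boundaries; it matches "1234567890" and
#     "0123 456 789" but not the 11-digit "12345678901".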


def get_uk_postcode_regex_elements(
        at_word_boundaries_only: bool = True) -> List[str]:
    """
    Get a list of regex strings for scrubbing UK postcodes.
    """
    e = [
        "AN NAA",
        "ANN NAA",
        "AAN NAA",
        "AANN NAA",
        "ANA NAA",
        "AANA NAA",
    ]
    for i in range(len(e)):
        e[i] = e[i].replace("A", "[A-Z]")  # letter
        e[i] = e[i].replace("N", "[0-9]")  # number
        e[i] = e[i].replace(" ", OPTIONAL_WHITESPACE)
        if at_word_boundaries_only:
            e[i] = WB + e[i] + WB
    return e
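
# EXAMPLE (illustrative addition, not in the original source):
# get_uk_postcode_regex_elements()
# ... returns six patterns, e.g. r"\b[A-Z][0-9]\s*[0-9][A-Z][A-Z]\b" for the
#     "AN NAA" format; together (compiled case-insensitively below) they match
#     "M1 1AA", "DN55 1PT", "EC1A 1BB", etc.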


def get_string_regex_elements(
        s: str,
        suffixes: Optional[List[str]] = None,
        at_word_boundaries_only: bool = True,
        max_errors: int = 0) -> List[str]:
    """
    Takes a string and returns a list of regex strings with which to scrub.

    Options:

    - list of suffixes to permit, typically ["s"]
    - typographical errors (max_errors)
    - whether to constrain to word boundaries or not
      ... if false, will scrub ANN from bANNed
    """
    if not s:
        return []
    s = escape_literal_string_for_regex(s)
    if max_errors > 0:
        s = "(" + s + "){e<" + str(max_errors + 1) + "}"
        # - a leading (?e) forces a search for a better match than the first;
        #   the other way is to specify the regex.ENHANCEMATCH flag...
        #   however, when doing this in get_regex_from_elements(), we got a
        #   segmentation fault... and, less consistently, when we put it here.
        #   So skip that!
        # - (...) is the pattern
        # - the {e<...} suffix permits up to n insertion/deletion/substitution
        #   errors
        # ... https://pypi.python.org/pypi/regex
        # ... http://www.gossamer-threads.com/lists/python/python/1002881
    if suffixes:
        suffixstr = (
            "(?:" +
            "|".join([escape_literal_string_for_regex(x) for x in suffixes]) +
            "|)"  # allows for no suffix at all
        )
    else:
        suffixstr = ""
    if at_word_boundaries_only:
        return [WB + s + suffixstr + WB]
    else:
        return [s + suffixstr]
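
# EXAMPLE (illustrative addition, not in the original source):
# get_string_regex_elements("mother", suffixes=["s"], max_errors=1)
# ... returns [r"\b(mother){e<2}(?:s|)\b"], which matches "mother", "mothers",
#     and one-error typos such as "mther".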


def get_phrase_regex_elements(
        phrase: str,
        at_word_boundaries_only: bool = True,
        max_errors: int = 0) -> List[str]:
    """
    Takes a phrase (e.g. '4 Privet Drive') and returns a list of regex
    strings with which to scrub it; the phrase's fragments are joined by
    "one or more non-word characters", so punctuation and spacing can vary.
    """
    strings = get_anon_fragments_from_string(phrase)
    if not strings:
        return []
    strings = [escape_literal_string_for_regex(x) for x in strings]
    s = AT_LEAST_ONE_NONWORD.join(strings)
    if max_errors > 0:
        s = "(" + s + "){e<" + str(max_errors + 1) + "}"
    if at_word_boundaries_only:
        return [WB + s + WB]
    else:
        return [s]
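
# EXAMPLE (illustrative addition, not in the original source):
# get_phrase_regex_elements("4 Privet Drive")
# ... returns [r"\b4\W+Privet\W+Drive\b"], which (compiled case-insensitively
#     below) matches "4 Privet Drive", "4, Privet Drive", "4 privet drive",
#     and so on.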


# =============================================================================
# Combining regex elements into a giant regex
# =============================================================================


def get_regex_string_from_elements(elementlist: List[str]) -> str:
    """
    Convert a list of regex elements into a single regex string.
    """
    if not elementlist:
        return ""
    return u"|".join(unique_list(elementlist))
    # The or operator | has the lowest precedence.
    # ... http://www.regular-expressions.info/alternation.html
    # We also want to minimize the number of brackets.
    # THEREFORE, ANYTHING CONTRIBUTING FRAGMENTS HERE SHOULD NOT HAVE |
    # OPERATORS AT ITS TOP LEVEL. If it does, it should encapsulate them in a
    # non-capturing group, (?:...)


def get_regex_from_elements(elementlist: List[str]) -> Optional[Pattern]:
    """
    Convert a list of regex elements into a compiled regex, which will
    operate in case-insensitive fashion on Unicode strings.
    """
    if not elementlist:
        return None
    try:
        s = get_regex_string_from_elements(elementlist)
        return regex.compile(s, regex.IGNORECASE | regex.UNICODE)
    except _regex_core.error:
        log.exception("Failed regex: elementlist={}".format(elementlist))
        raise
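
# EXAMPLE (illustrative addition, not in the original source) of the
# end-to-end flow, combining elements from several generators into one
# scrubbing pass:
# elements = (get_date_regex_elements(datetime.date(1986, 9, 13)) +
#             get_string_regex_elements("Smith") +
#             get_uk_postcode_regex_elements())
# rx = get_regex_from_elements(elements)
# rx.sub("[XXX]", "Mr Smith, born 13 Sep 1986, of CR2 6XH.")
# ... returns "Mr [XXX], born [XXX], of [XXX]."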


# =============================================================================
# Unit tests
# =============================================================================


class TestAnonRegexes(unittest.TestCase):
    STRING_1 = r"""
I was born on 07 Jan 2013, m'lud. It was 7 January 13, or 7/1/13, or 1/7/13,
or Jan 7 2013, or 2013/01/07, or 2013-01-07, or 7th January
13 (split over a line) or Jan 7th 13 or 07.01.13 or 7.1.2013 or a host of
other variations. And ISO-8601 formats like 20130107T0123, or just 20130107.

BUT NOT 8 Jan 2013, or 2013/02/07, or 2013 Jan 17, or just a number like 7,
or a month like January, or a nonspecific date like Jan 2013 or 7 January.
And not ISO-8601-formatted other dates like 20130108T0123, or just 20130108.

I am 34 years old. My mother was 348, or 834, or perhaps 8348.
Was she 34.6? Don't think so.

Her IDs include NHS#123456, or 123 456, or (123) 456, or 123456.

I am 34 years old. My mother was 348, or 834, or perhaps 8348.
She wasn't my step-mother, or my grandmother, or my mother-in-law.
She was my MOTHER!
A typo is mther.

Unicode apostrophe: the thread’s possession

E-mail: bob@pobox.com, mr.jones@somewhere.nhs.uk, blah@place.com
Mr.Jones@somewhere.nhs.uk

Some numbers by size:
1
12
123
1234
12345
123456
1234567
12345678
123456789
1234567890
12345678901
123456789012
1234567890123
12345678901234
123456789012345

Some postcodes (from https://www.mrs.org.uk/pdf/postcodeformat.pdf)
M1 1AA
M60 1NW
CR2 6XH
DN55 1PT
W1A 1HQ
EC1A 1BB
"""

    @staticmethod
    def report(title: str, string: str) -> None:
        print("=" * 79)
        print(title)
        print("=" * 79)
        print(string)

    def test_most(self) -> None:
        s = self.STRING_1
        testnumber = 34
        testnumber_as_text = "123456"
        testdate_str = "7 Jan 2013"
        testdate = dateutil.parser.parse(testdate_str)
        teststring = "mother"
        testphrase = "348 or 834"
        date_19th_c = "3 Sep 1847"
        old_testdate = dateutil.parser.parse(date_19th_c)
        testemail = "mr.jones@somewhere.nhs.uk"

        regex_date = get_regex_from_elements(
            get_date_regex_elements(testdate))
        regex_number = get_regex_from_elements(
            get_code_regex_elements(str(testnumber)))
        regex_number_as_text = get_regex_from_elements(
            get_code_regex_elements(
                get_digit_string_from_vaguely_numeric_string(
                    testnumber_as_text)))
        regex_string = get_regex_from_elements(
            get_string_regex_elements(teststring))
        regex_email = get_regex_from_elements(
            get_string_regex_elements(testemail))
        regex_phrase = get_regex_from_elements(
            get_phrase_regex_elements(testphrase))
        regex_10digit = get_regex_from_elements(
            get_number_of_length_n_regex_elements(10))
        regex_postcode = get_regex_from_elements(
            get_uk_postcode_regex_elements())
        all_elements = (
            get_date_regex_elements(testdate) +
            get_code_regex_elements(str(testnumber)) +
            get_code_regex_elements(
                get_digit_string_from_vaguely_numeric_string(
                    testnumber_as_text)) +
            get_string_regex_elements(teststring) +
            get_string_regex_elements(testemail) +
            get_phrase_regex_elements(testphrase) +
            get_number_of_length_n_regex_elements(10) +
            get_uk_postcode_regex_elements()
        )
        regex_all = get_regex_from_elements(all_elements)

        self.report("Removing date: " + testdate_str,
                    regex_date.sub("DATE_GONE", s))
        self.report("Removing number: {}".format(testnumber),
                    regex_number.sub("NUMBER_GONE", s))
        self.report("Removing numbers as text: " + testnumber_as_text,
                    regex_number_as_text.sub("NUMBER_AS_TEXT_GONE", s))
        self.report("Removing string: " + teststring,
                    regex_string.sub("STRING_GONE", s))
        self.report("Removing email: " + testemail,
                    regex_email.sub("EMAIL_GONE", s))
        self.report("Removing phrase: " + testphrase,
                    regex_phrase.sub("PHRASE_GONE", s))
        self.report("Removing 10-digit numbers",
                    regex_10digit.sub("TEN_DIGIT_NUMBERS_GONE", s))
        self.report("Removing postcodes",
                    regex_postcode.sub("POSTCODES_GONE", s))
        self.report("Removing everything",
                    regex_all.sub("EVERYTHING_GONE", s))
        self.report("All-elements regex",
                    get_regex_string_from_elements(all_elements))
        self.report("Date regex",
                    get_regex_string_from_elements(
                        get_date_regex_elements(testdate)))
        self.report("Date regex for 19th century",
                    get_regex_string_from_elements(
                        get_date_regex_elements(old_testdate)))
        self.report("Phrase regex",
                    get_regex_string_from_elements(
                        get_phrase_regex_elements(testphrase)))
        self.report("10-digit-number regex",
                    get_regex_string_from_elements(
                        get_number_of_length_n_regex_elements(10)))


def examples_for_paper():
    testwords = "John Al'Rahem"
    min_string_length_to_scrub_with = 4
    scrub_string_suffixes = []  # type: List[str]
    max_errors = 0
    at_word_boundaries_only = True
    words_regexes = []  # type: List[str]
    for s in get_anon_fragments_from_string(testwords):
        length = len(s)
        if length < min_string_length_to_scrub_with:
            continue
        words_regexes.extend(get_string_regex_elements(
            s,
            suffixes=scrub_string_suffixes,
            at_word_boundaries_only=at_word_boundaries_only,
            max_errors=max_errors
        ))
    print("--- For words {}:".format(testwords))
    for r in words_regexes:
        print(r)

    testphrase = "4 Privet Drive"
    phrase_regexes = get_phrase_regex_elements(
        testphrase,
        max_errors=max_errors,
        at_word_boundaries_only=at_word_boundaries_only
    )
    print("--- For phrase {}:".format(testphrase))
    for r in phrase_regexes:
        print(r)

    testnumber = "(01223) 123456"
    anonymise_numbers_at_word_boundaries_only = False
    anonymise_numbers_at_numeric_boundaries_only = True
    number_regexes = get_code_regex_elements(
        get_digit_string_from_vaguely_numeric_string(str(testnumber)),
        at_word_boundaries_only=anonymise_numbers_at_word_boundaries_only,
        at_numeric_boundaries_only=anonymise_numbers_at_numeric_boundaries_only
    )
    print("--- For number {}:".format(testnumber))
    for r in number_regexes:
        print(r)

    testcode = "CB12 3DE"
    anonymise_codes_at_word_boundaries_only = True
    code_regexes = get_code_regex_elements(
        reduce_to_alphanumeric(str(testcode)),
        at_word_boundaries_only=anonymise_codes_at_word_boundaries_only
    )
    print("--- For code {}:".format(testcode))
    for r in code_regexes:
        print(r)

    n_digits = 10
    nonspec_10_digit_number_regexes = get_number_of_length_n_regex_elements(
        n_digits,
        at_word_boundaries_only=anonymise_numbers_at_word_boundaries_only
    )
    print("--- NONSPECIFIC: numbers of length {}:".format(n_digits))
    for r in nonspec_10_digit_number_regexes:
        print(r)

    uk_postcode_regexes = get_uk_postcode_regex_elements(
        at_word_boundaries_only=anonymise_codes_at_word_boundaries_only
    )
    print("--- NONSPECIFIC: UK postcodes:")
    for r in uk_postcode_regexes:
        print(r)

    testdate = datetime.date(year=2016, month=12, day=31)
    date_regexes = get_date_regex_elements(testdate)
    print("--- For date {}:".format(testdate))
    for r in date_regexes:
        print(r)


if __name__ == '__main__':
    rootlogger = logging.getLogger()
    configure_logger_for_colour(rootlogger, level=logging.DEBUG)
    # unittest.main()
    examples_for_paper()