# Source code for crate_anon.nlp_manager.regex_func

#!/usr/bin/env python
# crate_anon/nlp_manager/regex_func.py

"""
===============================================================================

    Copyright (C) 2015-2018 Rudolf Cardinal (rudolf@pobox.com).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <http://www.gnu.org/licenses/>.

===============================================================================
"""

from typing import Any, Dict, Optional, Pattern, Tuple

import regex
# noinspection PyProtectedMember
from regex import _regex_core

# =============================================================================
#  Core regex functions
# =============================================================================
# - All will use VERBOSE mode for legibility. (No impact on speed: compiled.)
# - Don't forget to use raw strings for all regex definitions!
# - Beware comments inside regexes. The comment parser isn't quite as benign
#   as you might think. Use very plain text only.
# - (?: XXX ) makes XXX into a non-capturing (unnamed) group.


# Flags passed to regex.compile() by this module. VERBOSE is what allows the
# regex fragments built here to contain free spacing for legibility.
REGEX_COMPILE_FLAGS = (
    regex.IGNORECASE
    | regex.MULTILINE
    | regex.VERBOSE
    | regex.UNICODE
)


def at_wb_start_end(regex_str: str) -> str:
    r"""
    Wrap a regex fragment so that it must start and end at a word boundary.

    The fragment is placed in a non-capturing group with ``\b`` on either
    side. The spaces in the result are for legibility and assume the pattern
    will be compiled in VERBOSE mode.

    Caution using this. Digits count as word characters, so there is no word
    boundary between "mm" and "3" in "mm3"; a "mm" fragment wrapped by this
    function will therefore not match "mm3".

    Args:
        regex_str: the regex fragment to wrap.

    Returns:
        The wrapped regex string.
    """
    # Raw string is essential: in a normal string literal, "\b" is the
    # backspace character (0x08), not the regex word-boundary escape.
    return r"\b(?: {} )\b".format(regex_str)
def at_start_wb(regex_str: str) -> str:
    r"""
    Wrap a regex fragment so that it must start at a word boundary.

    The fragment is placed in a non-capturing group preceded by ``\b``; no
    boundary is required at the end. The spaces in the result are for
    legibility and assume the pattern will be compiled in VERBOSE mode.

    Beware, though; e.g. "3kg" is reasonable text, and it does NOT have a
    word boundary before "kg", so a "kg" fragment wrapped by this function
    will not match it.

    Args:
        regex_str: the regex fragment to wrap.

    Returns:
        The wrapped regex string.
    """
    # Raw string is essential: in a normal string literal, "\b" is the
    # backspace character (0x08), not the regex word-boundary escape.
    return r"(?: \b (?: {} ) )".format(regex_str)
def compile_regex(regex_str: str) -> Pattern:
    """
    Compile a regex string using the module-wide REGEX_COMPILE_FLAGS.

    If compilation fails, the offending regex text is printed before the
    original exception is re-raised.
    """
    try:
        compiled = regex.compile(regex_str, REGEX_COMPILE_FLAGS)
    except _regex_core.error:
        # Show which regex was at fault, then let the error propagate.
        print("FAILING REGEX:\n{}".format(regex_str))
        raise
    return compiled


def compile_regex_dict(regexstr_to_value_dict: Dict[str, Any]) \
        -> Dict[Pattern, Any]:
    """
    Convert a dictionary keyed by regex strings into one keyed by the
    corresponding compiled regexes (via compile_regex), keeping each value.
    """
    compiled_to_value = {}
    for pattern_text, value in regexstr_to_value_dict.items():
        compiled_to_value[compile_regex(pattern_text)] = value
    return compiled_to_value
def get_regex_dict_match(text: Optional[str],
                         regex_to_value_dict: Dict[Pattern, Any],
                         default: Any = None) \
        -> Tuple[bool, Any]:
    """
    Look up text against a dictionary of compiled regexes.

    The regexes are tried in dictionary iteration order using match(), which
    anchors at the start of the text. The value belonging to the first regex
    that matches is returned.

    Args:
        text: the text to test; None or an empty string never matches.
        regex_to_value_dict: maps compiled regexes to result values.
        default: the result to return when nothing matches.

    Returns:
        Tuple (matched, result): (True, value) for the first matching regex,
        otherwise (False, default).
    """
    if not text:
        return False, default
    for pattern, result in regex_to_value_dict.items():
        if pattern.match(text):
            return True, result
    return False, default