#!/usr/bin/env python
# crate_anon/nlp_manager/nlp_definition.py
"""
===============================================================================
Copyright (C) 2015-2018 Rudolf Cardinal (rudolf@pobox.com).
This file is part of CRATE.
CRATE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
CRATE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with CRATE. If not, see <http://www.gnu.org/licenses/>.
===============================================================================
"""
# =============================================================================
# Imports
# =============================================================================
import codecs
import datetime
import logging
import os
import sys
from typing import Dict, Iterable, List, Optional
from cardinal_pythonlib.datetimefunc import get_now_utc_notz_datetime
from cardinal_pythonlib.lists import chunks
from sqlalchemy.engine.base import Engine
from sqlalchemy.orm.session import Session
from sqlalchemy.schema import MetaData
from crate_anon.anonymise.dbholder import DatabaseHolder
from crate_anon.common.extendedconfigparser import ExtendedConfigParser
from crate_anon.common.sql import TransactionSizeLimiter
from crate_anon.nlp_manager.constants import (
DEFAULT_MAX_BYTES_BEFORE_COMMIT,
DEFAULT_MAX_ROWS_BEFORE_COMMIT,
DEFAULT_TEMPORARY_TABLENAME,
HashClass,
MAX_SQL_FIELD_LEN,
NLP_CONFIG_ENV_VAR,
)
# if sys.version_info.major >= 3 and sys.version_info.minor >= 5:
# from crate_anon.nlp_manager import input_field_config
# from crate_anon.nlp_manager import base_nlp_parser # SEE NEXT LINES
# - see PEP0484 / forward references
# - some circular imports work under Python 3.5 but not 3.4:
# https://docs.python.org/3/whatsnew/3.5.html#other-language-changes
# https://bugs.python.org/issue17636
# - see also:
# http://stackoverflow.com/questions/6351805/cyclic-module-dependencies-and-relative-imports-in-python # noqa
# http://stackoverflow.com/questions/35776791/type-hinting-union-with-forward-references # noqa
# - OK, still problems.
# Let's strip this back to something sensible.
# Does BaseNlpParser really need to know about NlpDefinition?
# - Not directly.
# - For typing, if it stores a reference (optional).
# - It could also be given subcomponents instead.
# Does NlpDefinition really need to know about BaseNlpParser?
# - Yes, but only for delayed imports.
# - For now, solved by weakening type hints for NlpDefinition.
# - # noinspection PyUnresolvedReferences
# ... see http://codeoptimism.com/blog/pycharm-suppress-inspections-list/
# for a full list.
log = logging.getLogger(__name__)
# =============================================================================
# Config class
# =============================================================================
class NlpDefinition(object):
    """
    Class representing NLP master configuration as read from config file.

    The config file is located via the environment variable named by
    ``NLP_CONFIG_ENV_VAR``. One section of that file (``nlpname``) defines
    the NLP run: its progress database, hashing passphrase, commit limits,
    input field definitions and NLP processors.
    """

    # noinspection PyUnresolvedReferences
    def __init__(self, nlpname: str, logtag: str = "") -> None:
        """
        Read config from file.

        Args:
            nlpname: name of the config file section describing this NLP
                definition.
            logtag: free-form tag stored for later retrieval via
                :meth:`get_logtag`.

        Raises:
            ValueError: if the ``nlpname`` section is absent, if a processor
                section is missing, or if the ``processors`` option is
                malformed (e.g. an odd number of words, so that a
                type/name pair cannot be unpacked).
            SystemExit: if the config environment variable is unset/empty.
        """
        # DELAYED IMPORTS (to make life simpler for classes deriving from
        # NlpParser and using NlpDefinition -- they can now do it directly,
        # not just via forward reference).
        from crate_anon.nlp_manager.all_processors import make_processor
        from crate_anon.nlp_manager.input_field_config import InputFieldConfig

        self._nlpname = nlpname
        self._logtag = logtag

        log.info("Loading config for section: {}".format(nlpname))

        # Get filename.
        # An explicit truthiness check (rather than "assert") is used so
        # that an unset OR empty variable is still rejected when Python runs
        # with -O, which strips assert statements.
        self._config_filename = os.environ.get(NLP_CONFIG_ENV_VAR)
        if not self._config_filename:
            # NOTE(review): this message refers to the *anonymisation*
            # config/demo command; confirm that is the intended advice for
            # the NLP config variable.
            print(
                "You must set the {} environment variable to point to a CRATE "
                "anonymisation config file. Run crate_print_demo_anon_config "
                "to see a specimen config.".format(NLP_CONFIG_ENV_VAR))
            sys.exit(1)

        # Read config from file.
        self._parser = ExtendedConfigParser()
        self._parser.optionxform = str  # make it case-sensitive
        log.info("Reading config file: {}".format(self._config_filename))
        # Use a context manager so the file handle is closed once parsed.
        with codecs.open(self._config_filename, "r", "utf8") as config_file:
            self._parser.read_file(config_file)

        if not self._parser.has_section(nlpname):
            raise ValueError("No section named {} present".format(nlpname))

        # ---------------------------------------------------------------------
        # Our own stuff
        # ---------------------------------------------------------------------
        # Cache of databases already opened by get_database(), by name.
        self._databases = {}  # type: Dict[str, DatabaseHolder]
        self._progressdb_name = self.opt_str(nlpname, 'progressdb',
                                             required=True)
        self._progdb = self.get_database(self._progressdb_name)
        self._temporary_tablename = self.opt_str(
            nlpname, 'temporary_tablename',
            default=DEFAULT_TEMPORARY_TABLENAME)
        self._hashphrase = self.opt_str(nlpname, 'hashphrase', required=True)
        self._hasher = HashClass(self._hashphrase)
        self._max_rows_before_commit = self.opt_int(
            nlpname, 'max_rows_before_commit', DEFAULT_MAX_ROWS_BEFORE_COMMIT)
        self._max_bytes_before_commit = self.opt_int(
            nlpname, 'max_bytes_before_commit',
            DEFAULT_MAX_BYTES_BEFORE_COMMIT)
        # Timestamp fixed once, at construction; see get_now().
        self._now = get_now_utc_notz_datetime()

        # ---------------------------------------------------------------------
        # Input field definitions
        # ---------------------------------------------------------------------
        self._inputfielddefs = self.opt_strlist(nlpname, 'inputfielddefs',
                                                required=True, lower=False)
        self._inputfieldmap = {}  # type: Dict[str, InputFieldConfig]
        for ifd_name in self._inputfielddefs:
            # A name listed more than once is only configured the first time.
            if ifd_name not in self._inputfieldmap:
                self._inputfieldmap[ifd_name] = InputFieldConfig(self,
                                                                 ifd_name)

        # ---------------------------------------------------------------------
        # NLP processors
        # ---------------------------------------------------------------------
        self._processors = []  # type: List[BaseNlpParser]
        # The option is a flat word list, consumed two words at a time as
        # (processor type, processor config section name).
        processorpairs = self.opt_strlist(nlpname, 'processors', required=True,
                                          lower=False)
        try:
            for proctype, procname in chunks(processorpairs, 2):
                self.require_section(procname)
                processor = make_processor(proctype, self, procname)
                self._processors.append(processor)
        except ValueError:
            log.critical("Bad 'processors' specification")
            raise

        # ---------------------------------------------------------------------
        # Transaction sizes, for early commit
        # ---------------------------------------------------------------------
        self._transaction_limiters = {}  # type: Dict[Session, TransactionSizeLimiter]  # noqa
        # dictionary of session -> TransactionSizeLimiter

    def get_name(self) -> str:
        """
        Return the name of the config section defining this NLP definition.
        """
        return self._nlpname

    def get_logtag(self) -> str:
        """
        Return the log tag supplied at construction.
        """
        return self._logtag

    def get_parser(self) -> ExtendedConfigParser:
        """
        Return the underlying config parser.
        """
        return self._parser

    def hash(self, text: str) -> str:
        """
        Hash ``text`` using the hasher built from the configured
        ``hashphrase``.
        """
        return self._hasher.hash(text)

    def get_temporary_tablename(self) -> str:
        """
        Return the configured temporary table name (or the default).
        """
        return self._temporary_tablename

    def set_echo(self, echo: bool) -> None:
        """
        Set the ``echo`` flag on the engine of the progress database and of
        every database opened so far via :meth:`get_database`.
        """
        self._progdb.engine.echo = echo
        for db in self._databases.values():
            db.engine.echo = echo
        # Now, SQLAlchemy will mess things up by adding an additional handler.
        # So, bye-bye:
        for logname in ('sqlalchemy.engine.base.Engine',
                        'sqlalchemy.engine.base.OptionEngine'):
            logger = logging.getLogger(logname)
            logger.handlers = []  # type: List[logging.Handler]

    def require_section(self, section: str) -> None:
        """
        Ensure that the config file has a section named ``section``.

        Raises:
            ValueError: if the section is missing (also logged as critical).
        """
        if not self._parser.has_section(section):
            msg = "Missing config section: {}".format(section)
            log.critical(msg)
            raise ValueError(msg)

    def opt_str(self, section: str, option: str, required: bool = False,
                default: Optional[str] = None) -> str:
        """
        Read a string option via the parser's ``get_str``.

        Args:
            section: config section name.
            option: option name within that section.
            required: passed through to the parser.
            default: passed through to the parser.
        """
        return self._parser.get_str(section, option, default=default,
                                    required=required)

    def opt_strlist(self, section: str, option: str, required: bool = False,
                    lower: bool = True, as_words: bool = True) -> List[str]:
        """
        Read a list-of-strings option via the parser's ``get_str_list``.

        Args:
            section: config section name.
            option: option name within that section.
            required: passed through to the parser.
            lower: passed through to the parser.
            as_words: passed through to the parser.
        """
        return self._parser.get_str_list(section, option, as_words=as_words,
                                         lower=lower, required=required)

    def opt_int(self, section: str, option: str,
                default: Optional[int]) -> Optional[int]:
        """
        Read an integer option, using ``default`` as the parser's fallback.
        """
        return self._parser.getint(section, option, fallback=default)

    def opt_bool(self, section: str, option: str, default: bool) -> bool:
        """
        Read a boolean option, using ``default`` as the parser's fallback.
        """
        return self._parser.getboolean(section, option, fallback=default)

    def get_database(self, name_and_cfg_section: str,
                     with_session: bool = True,
                     with_conn: bool = False,
                     reflect: bool = False) -> DatabaseHolder:
        """
        Return the database described by a config section, caching it.

        The section name doubles as the database name. If a database of that
        name has already been fetched, the cached object is returned and the
        ``with_session``/``with_conn``/``reflect`` arguments are ignored.

        Args:
            name_and_cfg_section: database name and config section name.
            with_session: passed through to the parser's ``get_database``.
            with_conn: passed through to the parser's ``get_database``.
            reflect: passed through to the parser's ``get_database``.
        """
        if name_and_cfg_section in self._databases:
            return self._databases[name_and_cfg_section]
        # Kept as an assertion so the exception type seen by callers is
        # unchanged; note that it is skipped under "python -O".
        assert len(name_and_cfg_section) <= MAX_SQL_FIELD_LEN
        db = self._parser.get_database(name_and_cfg_section,
                                       with_session=with_session,
                                       with_conn=with_conn,
                                       reflect=reflect)
        self._databases[name_and_cfg_section] = db
        return db

    def get_env_dict(self, section: str,
                     parent_env: Optional[Dict] = None) -> Dict:
        """
        Return the parser's environment dictionary for ``section``, passing
        ``parent_env`` through to the parser.
        """
        return self._parser.get_env_dict(section, parent_env=parent_env)

    def get_progdb_session(self) -> Session:
        """
        Return the session of the progress database.
        """
        return self._progdb.session

    def get_progdb_engine(self) -> Engine:
        """
        Return the engine of the progress database.
        """
        return self._progdb.engine

    def get_progdb_metadata(self) -> MetaData:
        """
        Return the metadata of the progress database.
        """
        return self._progdb.metadata

    def commit_all(self) -> None:
        """
        Execute a COMMIT on all databases (destination + progress).
        """
        self.commit(self.get_progdb_session())
        for db in self._databases.values():
            self.commit(db.session)

    def get_transation_limiter(self,
                               session: Session) -> TransactionSizeLimiter:
        """
        Return the transaction size limiter for ``session``, creating it
        (with the configured row/byte limits) on first use.

        The method name is misspelt ("transation"); it is kept as-is because
        it is part of the class's existing interface.
        """
        if session not in self._transaction_limiters:
            self._transaction_limiters[session] = TransactionSizeLimiter(
                session,
                max_rows_before_commit=self._max_rows_before_commit,
                max_bytes_before_commit=self._max_bytes_before_commit)
        return self._transaction_limiters[session]

    def notify_transaction(self, session: Session,
                           n_rows: int, n_bytes: int,
                           force_commit: bool = False) -> None:
        """
        Report pending work on ``session`` to its transaction limiter.

        Args:
            session: the session on which the work was done.
            n_rows: row count passed to the limiter's ``notify``.
            n_bytes: byte count passed to the limiter's ``notify``.
            force_commit: passed to the limiter's ``notify``.
        """
        tl = self.get_transation_limiter(session)
        tl.notify(n_rows=n_rows, n_bytes=n_bytes, force_commit=force_commit)

    def commit(self, session: Session) -> None:
        """
        Commit ``session`` via its transaction limiter.
        """
        tl = self.get_transation_limiter(session)
        tl.commit()

    # noinspection PyUnresolvedReferences
    def get_processors(self) -> List['base_nlp_parser.BaseNlpParser']:  # typing / circular reference problem  # noqa
        """
        Return the NLP processors built from the ``processors`` option.
        """
        return self._processors

    # noinspection PyUnresolvedReferences
    def get_ifconfigs(self) -> Iterable['input_field_config.InputFieldConfig']:  # typing / circular reference problem  # noqa
        """
        Return the input field configurations built from the
        ``inputfielddefs`` option.
        """
        return self._inputfieldmap.values()

    def get_now(self) -> datetime.datetime:
        """
        Return the timestamp captured when this object was constructed.
        """
        return self._now