
#!/usr/bin/env python
# crate_anon/nlp_manager/nlp_manager.py

"""
===============================================================================

    Copyright (C) 2015-2018 Rudolf Cardinal (rudolf@pobox.com).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <http://www.gnu.org/licenses/>.

===============================================================================

Manage natural-language processing (NLP) via external tools.

Speed testing:

    - 8 processes, extracting person, location from a mostly text database
    - commit off during full (non-incremental) processing (much faster)
    - needs lots of RAM; e.g. Java subprocess uses 1.4 Gb per process as an
      average (rises from ~250Mb to ~1.4Gb and falls; steady rise means memory
      leak!); tested on a 16 Gb machine. See also the max_external_prog_uses
      parameter.

    from __future__ import division
    test_size_mb = 1887
    n_person_tags_found =
    n_locations_tags_found =
    time_s = 10333  # 10333 s for main bit; 10465 including indexing; is 2.9 hours  # noqa
    speed_mb_per_s = test_size_mb / time_s

    ... 0.18 Mb/s
    ... and note that's 1.9 Gb of *text*, not of attachments

    - With incremental option, and nothing to do:
        same run took 18 s
    - During the main run, snapshot CPU usage:
        java about 81% across all processes, everything else close to 0
            (using about 12 Gb RAM total)
        ... or 75-85% * 8 [from top]
        mysqld about 18% [from top]
        nlp_manager.py about 4-5% * 8 [from top]

TO DO:
    - comments for NLP output fields (in table definition, destfields); see
      the illustrative sketch just after the module-level constants below

"""


# =============================================================================
# Imports
# =============================================================================

import argparse
import logging
import os
import sys
from typing import Any, Dict, List, Tuple

from cardinal_pythonlib.datetimefunc import get_now_utc_pendulum
from cardinal_pythonlib.exceptions import die
from cardinal_pythonlib.logs import configure_logger_for_colour
from cardinal_pythonlib.sqlalchemy.core_query import count_star
from cardinal_pythonlib.timing import MultiTimerContext, timer
from sqlalchemy.schema import Column, Index, Table
from sqlalchemy.types import BigInteger, String

from crate_anon.anonymise.constants import (
    DEFAULT_CHUNKSIZE,
    DEFAULT_REPORT_EVERY,
    SEP,
    TABLE_KWARGS,
)
from crate_anon.common.formatting import print_record_counts
from crate_anon.nlp_manager.all_processors import (
    get_nlp_parser_debug_instance,
    possible_processor_names,
    possible_processor_table,
)
from crate_anon.nlp_manager.constants import (
    DEFAULT_REPORT_EVERY_NLP,
    DEMO_CONFIG,
    MAX_STRING_PK_LENGTH,
    NLP_CONFIG_ENV_VAR,
)
from crate_anon.nlp_manager.input_field_config import (
    InputFieldConfig,
    FN_SRCDB,
    FN_SRCTABLE,
    FN_SRCPKFIELD,
    FN_SRCPKVAL,
    FN_SRCPKSTR,
    FN_SRCFIELD,
)
from crate_anon.nlp_manager.models import NlpRecord
from crate_anon.nlp_manager.nlp_definition import NlpDefinition
from crate_anon.version import VERSION, VERSION_DATE

log = logging.getLogger(__name__)

TIMING_DROP_REMAKE = "drop_remake"
TIMING_DELETE_WHERE_NO_SOURCE = "delete_where_no_source"
TIMING_PROGRESS_DB_ADD = "progress_db_add"

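
# =============================================================================
# Illustrative sketch: column comments (see TO DO in module docstring)
# =============================================================================
# A minimal sketch only, not CRATE's actual destination schema: SQLAlchemy
# (>= 1.2) Columns accept a "comment" parameter, which propagates into the
# DDL on back-ends that support it (e.g. MySQL's COMMENT clause). The field
# name and type below are hypothetical.

def _example_commented_destination_column() -> Column:
    from sqlalchemy.types import Float
    return Column("value", Float,
                  comment="Numeric value extracted by the NLP processor")
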

# =============================================================================
# Database operations
# =============================================================================

def delete_where_no_source(nlpdef: NlpDefinition,
                           ifconfig: InputFieldConfig,
                           report_every: int = DEFAULT_REPORT_EVERY,
                           chunksize: int = DEFAULT_CHUNKSIZE) -> None:
    """
    Delete destination records where source records no longer exist.

    - Can't do this in a single SQL command, since the engine can't
      necessarily see both databases.
    - Can't use a single temporary table, since the progress database isn't
      necessarily the same as any of the destination database(s).
    - Can't do this in a multiprocess way, because we're trying to do a
      DELETE WHERE NOT IN.
    - So we fetch all source PKs (which, by definition, do exist), stash them
      in memory, and do a DELETE WHERE NOT IN based on those specified values
      (or, if there are no PKs in the source, delete everything from the
      destination).

    Problems:

    - This is IMPERFECT if we have string source PKs and there are hash
      collisions (e.g. PKs for records X and Y both hash to the same thing;
      record X is deleted; then its processed version might not be).
    - With massive tables, we might run out of memory or (much more likely)
      SQL parameter slots. This is now happening; the error looks like:
      pyodbc.ProgrammingError: ('The SQL contains 30807 parameter markers,
      but 2717783 parameters were supplied', 'HY000')

    A better way might be:

    - for each table, make a temporary table in the same database;
    - populate that table with (source PK integer/hash, source PK string)
      pairs;
    - delete where pairs don't match -- is that portable SQL?
      http://stackoverflow.com/questions/7356108/sql-query-for-deleting-rows-with-not-in-using-2-columns  # noqa
    - More efficient would be to make one table per destination database.

    On the "delete where multiple fields don't match":

    - Single-field syntax is

        DELETE FROM a WHERE a1 NOT IN (SELECT b1 FROM b)

    - Multiple-field syntax is

        DELETE FROM a WHERE NOT EXISTS (
            SELECT 1 FROM b
            WHERE a.a1 = b.b1
            AND a.a2 = b.b2
        )

    - In SQLAlchemy, use exists():
      http://stackoverflow.com/questions/14600619
      http://docs.sqlalchemy.org/en/latest/core/selectable.html

    - Furthermore, in SQL, NULL = NULL is false, and NULL <> NULL is also
      false, so we have to do an explicit null check. In SQLAlchemy you do
      that with "field == None" (disabling any linter complaint about
      comparison to None). See http://stackoverflow.com/questions/21668606

    We're aiming, therefore, for:

        DELETE FROM a WHERE NOT EXISTS (
            SELECT 1 FROM b
            WHERE a.a1 = b.b1
            AND (
                a.a2 = b.b2
                OR (a.a2 IS NULL AND b.b2 IS NULL)
            )
        )
    """

    # -------------------------------------------------------------------------
    # Sub-functions
    # -------------------------------------------------------------------------

    def insert(records_: List[Dict[str, Any]]) -> None:
        n_rows = len(records_)
        log.debug("... inserting {} records".format(n_rows))
        for db in databases:
            session_ = db['session']
            temptable_ = db['temptable']  # type: Table
            session_.execute(temptable_.insert(), records_)
            nlpdef.notify_transaction(session_, n_rows=n_rows,
                                      n_bytes=sys.getsizeof(records_))

    def commit() -> None:
        for db in databases:
            nlpdef.commit(db['session'])

    # -------------------------------------------------------------------------
    # Main code
    # -------------------------------------------------------------------------
    # Use info log level, otherwise it looks like our code hangs with very
    # large databases.
log.info("delete_where_no_source: examining source table {}.{}; " "MAY BE SLOW".format(ifconfig.get_srcdb(), ifconfig.get_srctable())) # Start our list with the progress database databases = [{ 'session': nlpdef.get_progdb_session(), 'engine': nlpdef.get_progdb_engine(), 'metadata': nlpdef.get_progdb_metadata(), 'temptable': None, # type: Table }] # Add the processors' destination databases for processor in nlpdef.get_processors(): # of type BaseNlpParser session = processor.get_session() if any(x['session'] == session for x in databases): continue # already exists databases.append({ 'session': session, 'engine': processor.get_engine(), 'metadata': processor.get_metadata(), }) # Make a temporary table in each database (note: the Table objects become # affiliated to their engine, I think, so make separate ones for each). log.info("... using {n} destination database(s)".format(n=len(databases))) log.info("... dropping (if exists) and creating temporary table(s)") for database in databases: engine = database['engine'] temptable = Table( nlpdef.get_temporary_tablename(), database['metadata'], Column(FN_SRCPKVAL, BigInteger), # not PK, as may be a hash Column(FN_SRCPKSTR, String(MAX_STRING_PK_LENGTH)), **TABLE_KWARGS ) temptable.drop(engine, checkfirst=True) temptable.create(engine, checkfirst=True) database['temptable'] = temptable # Insert PKs into temporary tables n = count_star(ifconfig.get_source_session(), ifconfig.get_srctable()) log.info("... populating temporary table(s): {} records to go; working in " "chunks of {}".format(n, chunksize)) i = 0 records = [] # type: List[Dict[str, Any]] for pkval, pkstr in ifconfig.gen_src_pks(): i += 1 if report_every and i % report_every == 0: log.info("... src row# {} / {}".format(i, n)) records.append({FN_SRCPKVAL: pkval, FN_SRCPKSTR: pkstr}) if i % chunksize == 0: insert(records) records = [] # type: List[Dict[str, Any]] if records: # remainder insert(records) # Commit commit() # Index, for speed log.info("... creating index(es) on temporary table(s)") for database in databases: temptable = database['temptable'] # type: Table index = Index('_temptable_idx', temptable.columns[FN_SRCPKVAL]) index.create(database['engine']) # DELETE FROM desttable WHERE destpk NOT IN (SELECT srcpk FROM temptable) log.info("... deleting from progress/destination DBs where appropriate") # Delete from progress database prog_db = databases[0] prog_temptable = prog_db['temptable'] ifconfig.delete_progress_records_where_srcpk_not(prog_temptable) # Delete from others for processor in nlpdef.get_processors(): database = [x for x in databases if x['session'] == processor.get_session()][0] temptable = database['temptable'] processor.delete_where_srcpk_not(ifconfig, temptable) # Drop temporary tables log.info("... dropping temporary table(s)") for database in databases: database['temptable'].drop(database['engine'], checkfirst=True) # Commit commit()


# =============================================================================
# Core functions
# =============================================================================

def process_nlp(nlpdef: NlpDefinition,
                incremental: bool = False,
                report_every: int = DEFAULT_REPORT_EVERY_NLP,
                tasknum: int = 0,
                ntasks: int = 1) -> None:
    """
    Main NLP processing function. Fetch text, send it to the NLP processor(s)
    (storing the results), and make a note in the progress database.
    """
    log.info(SEP + "NLP")
    session = nlpdef.get_progdb_session()
    for ifconfig in nlpdef.get_ifconfigs():
        i = 0  # record count within this process
        recnum = tasknum  # record count overall
        totalcount = ifconfig.get_count()  # total number of records in table
        for text, other_values in ifconfig.gen_text(tasknum=tasknum,
                                                    ntasks=ntasks):
            i += 1
            pkval = other_values[FN_SRCPKVAL]
            pkstr = other_values[FN_SRCPKSTR]
            if report_every and i % report_every == 0:
                log.info(
                    "Processing {db}.{t}.{c}, PK: {pkf}={pkv} "
                    "({overall}record {approx}{recnum}/{totalcount})"
                    "{thisproc}".format(
                        db=other_values[FN_SRCDB],
                        t=other_values[FN_SRCTABLE],
                        c=other_values[FN_SRCFIELD],
                        pkf=other_values[FN_SRCPKFIELD],
                        pkv=pkstr if pkstr else pkval,
                        overall="overall " if ntasks > 1 else "",
                        approx="~" if pkstr and ntasks > 1 else "",
                        # ... string hashing means approx. distribution
                        recnum=recnum + 1,
                        totalcount=totalcount,
                        thisproc=(
                            " ({i}/~{proccount} this process)".format(
                                i=i,
                                proccount=totalcount // ntasks)
                            if ntasks > 1 else ""
                        )
                    )
                )
            recnum += ntasks
            # log.critical("other_values={}".format(repr(other_values)))
            srchash = nlpdef.hash(text)

            progrec = None
            if incremental:
                progrec = ifconfig.get_progress_record(pkval, pkstr)
                if progrec is not None:
                    if progrec.srchash == srchash:
                        log.debug("Record previously processed; skipping")
                        continue
                    else:
                        log.debug("Record has changed")
                else:
                    log.debug("Record is new")

            for processor in nlpdef.get_processors():
                if incremental:
                    processor.delete_dest_record(ifconfig, pkval, pkstr,
                                                 commit=incremental)
                processor.process(text, other_values)

            # Make a note in the progress database that we've processed a
            # source record.
            if progrec:  # modifying an existing record
                progrec.whenprocessedutc = nlpdef.get_now()
                progrec.srchash = srchash
            else:  # creating a new record
                progrec = NlpRecord(
                    # Quasi-key fields:
                    srcdb=ifconfig.get_srcdb(),
                    srctable=ifconfig.get_srctable(),
                    srcpkval=pkval,
                    srcpkstr=pkstr,
                    srcfield=ifconfig.get_srcfield(),
                    nlpdef=nlpdef.get_name(),
                    # Other fields:
                    srcpkfield=ifconfig.get_srcpkfield(),
                    whenprocessedutc=nlpdef.get_now(),
                    srchash=srchash,
                )
                with MultiTimerContext(timer, TIMING_PROGRESS_DB_ADD):
                    session.add(progrec)

            # In incremental mode, do we commit immediately, because other
            # processes may need this table promptly... ?

            # force_commit = False  # definitely wrong; crashes as below
            # force_commit = incremental
            force_commit = ntasks > 1
            # - A single source record should not be processed by >1 CRATE
            #   process. So in theory there should be no conflicts.
            # - However, databases can lock in various ways. Can we guarantee
            #   it'll do something sensible?
            # - See also
            #   https://en.wikipedia.org/wiki/Isolation_(database_systems)
            #   http://skien.cc/blog/2014/02/06/sqlalchemy-and-race-conditions-follow-up/  # noqa
            #   http://docs.sqlalchemy.org/en/latest/core/connections.html?highlight=execution_options#sqlalchemy.engine.Connection.execution_options  # noqa
            # - However, empirically, setting this to False gives
            #   "Transaction (Process ID xx) was deadlocked on lock resources
            #   with another process and has been chosen as the deadlock
            #   victim. Rerun the transaction." -- with a SELECT query.
            # - SQL Server uses READ COMMITTED as the default isolation level.
            # - https://technet.microsoft.com/en-us/library/jj856598(v=sql.110).aspx  # noqa
            nlpdef.notify_transaction(
                session=session, n_rows=1,
                n_bytes=sys.getsizeof(progrec),  # approx
                force_commit=force_commit)

    nlpdef.commit_all()
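

# =============================================================================
# Illustrative sketch: transaction isolation levels
# =============================================================================
# A minimal sketch only, relating to the deadlock notes in process_nlp():
# SQLAlchemy allows a per-connection isolation-level override via
# Connection.execution_options(). The level string here is an assumption
# (supported values depend on the dialect); "READ COMMITTED" is SQL Server's
# default, per the links above.

def _example_connection_with_isolation_level(engine):
    return engine.connect().execution_options(
        isolation_level="READ COMMITTED")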

def drop_remake(progargs,
                nlpdef: NlpDefinition,
                incremental: bool = False,
                skipdelete: bool = False) -> None:
    """
    Drop output tables and recreate them.
    """
    # Not parallel.

    # -------------------------------------------------------------------------
    # 1. Progress database
    # -------------------------------------------------------------------------
    progengine = nlpdef.get_progdb_engine()
    if not incremental:
        log.debug("Dropping progress tables")
        NlpRecord.__table__.drop(progengine, checkfirst=True)
    log.info("Creating progress table (with index)")
    NlpRecord.__table__.create(progengine, checkfirst=True)

    # -------------------------------------------------------------------------
    # 2. Output database(s)
    # -------------------------------------------------------------------------
    pretty_names = []  # type: List[str]
    for processor in nlpdef.get_processors():
        new_pretty_names = processor.make_tables(drop_first=not incremental)
        for npn in new_pretty_names:
            if npn in pretty_names:
                log.warning("An NLP processor has tried to re-make a table "
                            "made by one of its colleagues: {}".format(npn))
        pretty_names.extend(new_pretty_names)

    # -------------------------------------------------------------------------
    # 3. Delete WHERE NOT IN for incremental
    # -------------------------------------------------------------------------
    for ifconfig in nlpdef.get_ifconfigs():
        with MultiTimerContext(timer, TIMING_DELETE_WHERE_NO_SOURCE):
            if incremental:
                if not skipdelete:
                    delete_where_no_source(
                        nlpdef, ifconfig,
                        report_every=progargs.report_every_fast,
                        chunksize=progargs.chunksize)
            else:  # full
                ifconfig.delete_all_progress_records()

    # -------------------------------------------------------------------------
    # 4. Overall commit (superfluous)
    # -------------------------------------------------------------------------
    nlpdef.commit_all()

def show_source_counts(nlpdef: NlpDefinition) -> None:
    """
    Show the number of records in all source tables.
    """
    print("SOURCE TABLE RECORD COUNTS:")
    counts = []  # type: List[Tuple[str, int]]
    for ifconfig in nlpdef.get_ifconfigs():
        session = ifconfig.get_source_session()
        dbname = ifconfig.get_srcdb()
        tablename = ifconfig.get_srctable()
        n = count_star(session, tablename)
        counts.append(("{}.{}".format(dbname, tablename), n))
    print_record_counts(counts)

def show_dest_counts(nlpdef: NlpDefinition) -> None:
    """
    Show the number of records in all destination tables.
    """
    print("DESTINATION TABLE RECORD COUNTS:")
    counts = []  # type: List[Tuple[str, int]]
    for processor in nlpdef.get_processors():
        session = processor.get_session()
        dbname = processor.get_dbname()
        for tablename in processor.get_tablenames():
            n = count_star(session, tablename)
            counts.append(("DESTINATION: {}.{}".format(dbname, tablename), n))
    print_record_counts(counts)


# =============================================================================
# Main
# =============================================================================

def main() -> None:
    """
    Command-line entry point.
    """
    version = "Version {} ({})".format(VERSION, VERSION_DATE)
    description = "NLP manager. {version}. By Rudolf Cardinal.".format(
        version=version)

    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--version", action="version", version=version)
    parser.add_argument("--config",
                        help="Config file (overriding environment "
                             "variable {})".format(NLP_CONFIG_ENV_VAR))
    parser.add_argument('--verbose', '-v', action='store_true',
                        help="Be verbose")
    parser.add_argument("--nlpdef", nargs="?", default=None,
                        help="NLP definition name (from config file)")
    parser.add_argument('--report_every_fast', nargs="?", type=int,
                        default=DEFAULT_REPORT_EVERY,
                        help="Report insert progress (for fast operations) "
                             "every n rows in verbose mode "
                             "(default {})".format(DEFAULT_REPORT_EVERY))
    parser.add_argument('--report_every_nlp', nargs="?", type=int,
                        default=DEFAULT_REPORT_EVERY_NLP,
                        help="Report progress for NLP every n rows in "
                             "verbose mode (default "
                             "{})".format(DEFAULT_REPORT_EVERY_NLP))
    parser.add_argument('--chunksize', nargs="?", type=int,
                        default=DEFAULT_CHUNKSIZE,
                        help="Number of records copied in a chunk when "
                             "copying PKs from one database to another "
                             "(default {})".format(DEFAULT_CHUNKSIZE))
    parser.add_argument("--process", nargs="?", type=int, default=0,
                        help="For multiprocess mode: specify process number")
    parser.add_argument("--nprocesses", nargs="?", type=int, default=1,
                        help="For multiprocess mode: specify "
                             "total number of processes (launched somehow, "
                             "of which this is to be one)")
    parser.add_argument("--processcluster", default="",
                        help="Process cluster name")
    parser.add_argument("--democonfig", action="store_true",
                        help="Print a demo config file")
    parser.add_argument("--listprocessors", action="store_true",
                        help="Show possible built-in NLP processor names")
    parser.add_argument("--describeprocessors", action="store_true",
                        help="Show details of built-in NLP processors")
    parser.add_argument("--showinfo", required=False, nargs='?',
                        metavar="NLP_CLASS_NAME",
                        help="Show detailed information for a parser")
    parser.add_argument("--count", action="store_true",
                        help="Count records in source/destination databases, "
                             "then stop")

    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument("-i", "--incremental", dest="incremental",
                            action="store_true",
                            help="Process only new/changed information, "
                                 "where possible (* default)")
    mode_group.add_argument("-f", "--full", dest="incremental",
                            action="store_false",
                            help="Drop and remake everything")
    parser.set_defaults(incremental=True)

    parser.add_argument("--dropremake", action="store_true",
                        help="Drop/remake destination tables only")
    parser.add_argument("--skipdelete", dest="skipdelete",
                        action="store_true",
                        help="For incremental updates, skip deletion of rows "
                             "present in the destination but not the source")
    parser.add_argument("--nlp", action="store_true",
                        help="Perform NLP processing only")
    parser.add_argument("--echo", action="store_true",
                        help="Echo SQL")
    parser.add_argument("--timing", action="store_true",
                        help="Show detailed timing breakdown")
    args = parser.parse_args()

    # Validate args
    if args.nprocesses < 1:
        raise ValueError("--nprocesses must be >=1")
    if args.process < 0 or args.process >= args.nprocesses:
        raise ValueError(
            "--process argument must be from 0 to (nprocesses - 1) inclusive")
    if args.config:
        os.environ[NLP_CONFIG_ENV_VAR] = args.config

    # Verbosity and logging
    mynames = []  # type: List[str]
    if args.processcluster:
        mynames.append(args.processcluster)
    if args.nprocesses > 1:
        mynames.append("proc{}".format(args.process))
    loglevel = logging.DEBUG if args.verbose else logging.INFO
    rootlogger = logging.getLogger()
    configure_logger_for_colour(rootlogger, level=loglevel,
                                extranames=mynames)

    # -------------------------------------------------------------------------

    # Demo config?
    if args.democonfig:
        print(DEMO_CONFIG)
        return

    # List or describe processors?
    if args.listprocessors:
        print("\n".join(possible_processor_names()))
        return
    if args.describeprocessors:
        print(possible_processor_table())
        return
    if args.showinfo:
        nlp_parser = get_nlp_parser_debug_instance(args.showinfo)
        if nlp_parser:
            print("Info for class {}:\n".format(args.showinfo))
            nlp_parser.print_info()
        else:
            print("No such processor class: {}".format(args.showinfo))
        return

    # Otherwise, we need a valid NLP definition.
    if args.nlpdef is None:
        raise ValueError(
            "Must specify nlpdef parameter (unless --democonfig, "
            "--listprocessors, or --describeprocessors used)")

    everything = not any([args.dropremake, args.nlp])

    # Report args
    log.debug("arguments: {}".format(args))

    # Load/validate config
    config = NlpDefinition(args.nlpdef,
                           logtag="_".join(mynames).replace(" ", "_"))
    config.set_echo(args.echo)

    # Count only?
    if args.count:
        show_source_counts(config)
        show_dest_counts(config)
        return

    # -------------------------------------------------------------------------

    log.info("Starting: incremental={}".format(args.incremental))
    start = get_now_utc_pendulum()
    timer.set_timing(args.timing, reset=True)

    # 1. Drop/remake tables. Single-tasking only.
    with MultiTimerContext(timer, TIMING_DROP_REMAKE):
        if args.dropremake or everything:
            drop_remake(args, config, incremental=args.incremental,
                        skipdelete=args.skipdelete)

    # From here, in a multiprocessing environment, trap any errors simply so
    # we can report the process number clearly.

    # 2. NLP
    if args.nlp or everything:
        try:
            process_nlp(config, incremental=args.incremental,
                        report_every=args.report_every_nlp,
                        tasknum=args.process, ntasks=args.nprocesses)
        except Exception as exc:
            log.critical("TERMINAL ERROR FROM THIS PROCESS")  # so we see proc#
            die(exc)

    log.info("Finished")
    end = get_now_utc_pendulum()
    time_taken = end - start
    log.info("Time taken: {:.3f} seconds".format(time_taken.total_seconds()))

    if args.timing:
        timer.report()


# =============================================================================
# Command-line entry point
# =============================================================================

if __name__ == '__main__':
    main()