#!/usr/bin/env python
# crate_anon/anonymise/anonymise_cli.py
"""
===============================================================================
Copyright (C) 2015-2018 Rudolf Cardinal (rudolf@pobox.com).
This file is part of CRATE.
CRATE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
CRATE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with CRATE. If not, see <http://www.gnu.org/licenses/>.
===============================================================================
"""
# Uses a delayed import (see below), so we can set up logging before
# using the config object.
import argparse
import logging
import os
from typing import List
from cardinal_pythonlib.exceptions import die
from cardinal_pythonlib.extract_text import is_text_extractor_available
from cardinal_pythonlib.logs import configure_logger_for_colour
from crate_anon.anonymise.constants import (
CONFIG_ENV_VAR,
DEFAULT_CHUNKSIZE,
DEFAULT_REPORT_EVERY,
DEMO_CONFIG,
)
from crate_anon.version import VERSION, VERSION_DATE
log = logging.getLogger(__name__)
DEBUG_RUN_WITH_PDB = False
if DEBUG_RUN_WITH_PDB:
from cardinal_pythonlib.debugging import pdb_run
else:
pdb_run = None
# =============================================================================
# Main
# =============================================================================
[docs]def main() -> None:
"""
Command-line entry point.
"""
version = "Version {} ({})".format(VERSION, VERSION_DATE)
description = "Database anonymiser. {version}. By Rudolf Cardinal.".format(
version=version,
)
parser = argparse.ArgumentParser(
description=description,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("--version", action="version", version=version)
parser.add_argument("--democonfig", action="store_true",
help="Print a demo config file")
parser.add_argument("--config",
help="Config file (overriding environment "
"variable {})".format(CONFIG_ENV_VAR))
parser.add_argument('--verbose', '-v', action="store_true",
help="Be verbose")
parser.add_argument('--reportevery', nargs="?", type=int,
default=DEFAULT_REPORT_EVERY,
help="Report insert progress every n rows in verbose "
"mode (default {})".format(DEFAULT_REPORT_EVERY))
parser.add_argument('--chunksize', nargs="?", type=int,
default=DEFAULT_CHUNKSIZE,
help="Number of records copied in a chunk when copying"
" PKs from one database to another"
" (default {})".format(DEFAULT_CHUNKSIZE))
parser.add_argument("--process", nargs="?", type=int, default=0,
help="For multiprocess mode: specify process number")
parser.add_argument("--nprocesses", nargs="?", type=int, default=1,
help="For multiprocess mode: specify "
"total number of processes (launched somehow, of "
"which this is to be one)")
parser.add_argument("--processcluster", default="",
help="Process cluster name")
parser.add_argument("--draftdd", action="store_true",
help="Print a draft data dictionary")
parser.add_argument("--incrementaldd", action="store_true",
help="Print an INCREMENTAL draft data dictionary")
parser.add_argument("--debugscrubbers", action="store_true",
help="Report sensitive scrubbing information, for "
"debugging")
parser.add_argument("--savescrubbers", action="store_true",
help="Saves sensitive scrubbing information in admin "
"database, for debugging")
parser.add_argument("--count", action="store_true",
help="Count records in source/destination databases, "
"then stop")
parser.add_argument("--dropremake", action="store_true",
help="Drop/remake destination tables, then stop")
parser.add_argument("--optout", action="store_true",
help="Build opt-out list, then stop")
parser.add_argument("--nonpatienttables", action="store_true",
help="Process non-patient tables only")
parser.add_argument("--patienttables", action="store_true",
help="Process patient tables only")
parser.add_argument("--index", action="store_true",
help="Create indexes only")
parser.add_argument("--skip_dd_check", action="store_true",
help="Skip data dictionary validity check")
mode_group = parser.add_mutually_exclusive_group()
mode_group.add_argument(
"-i", "--incremental", dest="incremental", action="store_true",
help="Process only new/changed information, where possible "
"(* default)")
mode_group.add_argument(
"-f", "--full", dest="incremental", action="store_false",
help="Drop and remake everything")
parser.set_defaults(incremental=True)
parser.add_argument(
"--skipdelete", dest="skipdelete", action="store_true",
help="For incremental updates, skip deletion of rows present in the "
"destination but not the source")
parser.add_argument(
"--seed",
help="String to use as the basis of the seed for the random number "
"generator used for the transient integer RID (TRID). Leave "
"blank to use the default seed (system time).")
parser.add_argument("--echo", action="store_true", help="Echo SQL")
parser.add_argument(
"--checkextractor", nargs='*',
help="File extensions to check for availability of a text extractor "
"(use a '.' prefix, and use the special extension 'None' to "
"check the fallback processor")
args = parser.parse_args()
# -------------------------------------------------------------------------
# Verbosity
mynames = [] # type: List[str]
if args.processcluster:
mynames.append(args.processcluster)
if args.nprocesses > 1:
mynames.append("proc{}".format(args.process))
loglevel = logging.DEBUG if args.verbose else logging.INFO
rootlogger = logging.getLogger()
configure_logger_for_colour(rootlogger, loglevel, extranames=mynames)
# Check text converters
if args.checkextractor:
for ext in args.checkextractor:
if ext.lower() == 'none':
ext = None
available = is_text_extractor_available(ext)
print("Text extractor for extension {} present: {}".format(
ext, available))
return
if args.config:
os.environ[CONFIG_ENV_VAR] = args.config
# Demo config?
if args.democonfig:
print(DEMO_CONFIG)
return
# Delayed import; pass everything else on
from crate_anon.anonymise.anonymise import anonymise # delayed import
try:
anonymise(args)
except Exception as exc:
log.critical("TERMINAL ERROR FROM THIS PROCESS") # so we see proc#
die(exc)
# =============================================================================
# Command-line entry point
# =============================================================================
if __name__ == '__main__':
if DEBUG_RUN_WITH_PDB:
pdb_run(main)
else:
main()