Source code for crate_anon.anonymise.ddr

#!/usr/bin/env python
# crate_anon/anonymise/ddr.py

"""
===============================================================================

    Copyright (C) 2015-2018 Rudolf Cardinal (rudolf@pobox.com).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <http://www.gnu.org/licenses/>.

===============================================================================
"""

# =============================================================================
# Imports
# =============================================================================

import ast
import logging
from typing import Any, List, Dict, Iterable, TYPE_CHECKING, Union

from cardinal_pythonlib.convert import convert_to_int
from cardinal_pythonlib.lists import count_bool
from cardinal_pythonlib.rnc_db import (
    ensure_valid_field_name,
    ensure_valid_table_name,
    is_sqltype_valid,
)
from cardinal_pythonlib.sqlalchemy.schema import (
    convert_sqla_type_for_dialect,
    does_sqlatype_merit_fulltext_index,
    does_sqlatype_require_index_len,
    giant_text_sqltype,
    get_sqla_coltype_from_dialect_str,
    is_sqlatype_binary,
    is_sqlatype_date,
    is_sqlatype_numeric,
    is_sqlatype_text_of_length_at_least,
    is_sqlatype_text_over_one_char,
)
from sqlalchemy import Column
from sqlalchemy.sql.sqltypes import TypeEngine

from crate_anon.anonymise.altermethod import AlterMethod
from crate_anon.anonymise.constants import (
    ALTERMETHOD,
    DECISION,
    DEFAULT_INDEX_LEN,
    INDEX,
    MAX_IDENTIFIER_LENGTH,
    ODD_CHARS_TRANSLATE,
    SCRUBMETHOD,
    SCRUBSRC,
    SRCFLAG,
)
import crate_anon.common.sql

if TYPE_CHECKING:
    from crate_anon.anonymise.config import Config

log = logging.getLogger(__name__)


# =============================================================================
# DataDictionaryRow
# =============================================================================

DDR_FWD_REF = "DataDictionaryRow"
DATABASE_SAFE_CONFIG_FWD_REF = "DatabaseSafeConfig"


class DataDictionaryRow(object):
    """
    Class representing a single row of a data dictionary.

    A row describes one source field (database, table, field, datatype and
    flags) together with the destination table/field it maps to and the
    alteration methods applied on the way.
    """

    # Column names of the data dictionary, in output order. Each name is also
    # an attribute or property of this class (see __str__ and get_tsv).
    ROWNAMES = [
        "src_db",
        "src_table",
        "src_field",
        "src_datatype",
        "src_flags",
        "scrub_src",
        "scrub_method",
        "decision",
        "inclusion_values",
        "exclusion_values",
        "alter_method",
        "dest_table",
        "dest_field",
        "dest_datatype",
        "index",
        "indexlen",
        "comment",
    ]

    def __init__(self, config: "Config") -> None:
        """
        Set up basic defaults.

        Args:
            config: the configuration object; stored and later consulted for
                dialects and field names.
        """
        self.config = config
        self.src_db = None
        self.src_table = None
        self.src_field = None
        self.src_datatype = None  # in SQL string format
        self._src_sqla_coltype = None
        # src_flags: a property; see below
        self.scrub_src = None
        self.scrub_method = None
        self.omit = False  # in the DD file, this is 'decision'
        # alter_method: a property; see below
        self.dest_table = None
        self.dest_field = None
        self.dest_datatype = None
        self.index = None
        self.indexlen = None
        self.comment = ''
        self._from_file = False
        # For src_flags:
        self._pk = False
        self._add_src_hash = False
        self._primary_pid = False
        self._defines_primary_pids = False
        self._master_pid = False
        self._constant = False
        self._addition_only = False
        self._opt_out_info = False
        self._required_scrubber = False
        self._inclusion_values = []  # type: List[Any]
        self._exclusion_values = []  # type: List[Any]
        self._alter_methods = []  # type: List[AlterMethod]

    # -------------------------------------------------------------------------
    # Properties
    # -------------------------------------------------------------------------

    @property
    def src_db_lowercase(self) -> str:
        """Source database name, lower-cased."""
        return self.src_db.lower()

    @property
    def src_table_lowercase(self) -> str:
        """Source table name, lower-cased."""
        return self.src_table.lower()

    @property
    def src_field_lowercase(self) -> str:
        """Source field name, lower-cased."""
        return self.src_field.lower()

    @property
    def pk(self) -> bool:
        """Is the PK source flag set?"""
        return self._pk

    @property
    def add_src_hash(self) -> bool:
        """Is the ADD_SRC_HASH source flag set?"""
        return self._add_src_hash

    @property
    def primary_pid(self) -> bool:
        """Is the PRIMARY_PID source flag set?"""
        return self._primary_pid

    @property
    def defines_primary_pids(self) -> bool:
        """Is the DEFINES_PRIMARY_PIDS source flag set?"""
        return self._defines_primary_pids

    @property
    def master_pid(self) -> bool:
        """Is the MASTER_PID source flag set?"""
        return self._master_pid

    @property
    def constant(self) -> bool:
        """Is the CONSTANT source flag set?"""
        return self._constant

    @property
    def addition_only(self) -> bool:
        """Is the ADDITION_ONLY source flag set?"""
        return self._addition_only

    @property
    def opt_out_info(self) -> bool:
        """Is the OPT_OUT source flag set?"""
        return self._opt_out_info

    @property
    def required_scrubber(self) -> bool:
        """Is the REQUIRED_SCRUBBER source flag set?"""
        return self._required_scrubber

    @property
    def src_flags(self) -> str:
        """
        Return the source flags as a single string: the concatenation of the
        string form of every SRCFLAG member whose boolean is set.
        """
        return ''.join(str(x) for x in (
            SRCFLAG.PK if self._pk else '',
            SRCFLAG.ADD_SRC_HASH if self._add_src_hash else '',
            SRCFLAG.PRIMARY_PID if self._primary_pid else '',
            SRCFLAG.DEFINES_PRIMARY_PIDS if self._defines_primary_pids else '',
            SRCFLAG.MASTER_PID if self._master_pid else '',
            SRCFLAG.CONSTANT if self._constant else '',
            SRCFLAG.ADDITION_ONLY if self._addition_only else '',
            SRCFLAG.OPT_OUT if self._opt_out_info else '',
            SRCFLAG.REQUIRED_SCRUBBER if self._required_scrubber else '',
        ))

    @src_flags.setter
    def src_flags(self, value: str) -> None:
        """
        Set every flag boolean according to whether the corresponding
        SRCFLAG value occurs in ``value``.
        """
        self._pk = SRCFLAG.PK.value in value
        self._add_src_hash = SRCFLAG.ADD_SRC_HASH.value in value
        self._primary_pid = SRCFLAG.PRIMARY_PID.value in value
        self._defines_primary_pids = SRCFLAG.DEFINES_PRIMARY_PIDS.value in value
        self._master_pid = SRCFLAG.MASTER_PID.value in value
        self._constant = SRCFLAG.CONSTANT.value in value
        self._addition_only = SRCFLAG.ADDITION_ONLY.value in value
        self._opt_out_info = SRCFLAG.OPT_OUT.value in value
        self._required_scrubber = SRCFLAG.REQUIRED_SCRUBBER.value in value

    @property
    def inclusion_values(self) -> Union[List[Any], str]:
        """
        Return the inclusion values, or '' if there are none (so that TSV
        output shows an empty cell rather than '[]').
        """
        return self._inclusion_values or ''  # for TSV output

    @inclusion_values.setter
    def inclusion_values(self, value: str) -> None:
        """
        Parse a Python literal (via ``ast.literal_eval``) into the inclusion
        values; an empty/false string or a falsy literal gives an empty list.
        """
        if value:
            self._inclusion_values = ast.literal_eval(value) or []
        else:
            self._inclusion_values = []

    @property
    def exclusion_values(self) -> Union[List[Any], str]:
        """
        Return the exclusion values, or '' if there are none (so that TSV
        output shows an empty cell rather than '[]').
        """
        return self._exclusion_values or ''  # for TSV output

    @exclusion_values.setter
    def exclusion_values(self, value: str) -> None:
        """
        Parse a Python literal (via ``ast.literal_eval``) into the exclusion
        values; an empty/false string or a falsy literal gives an empty list.
        """
        if value:
            self._exclusion_values = ast.literal_eval(value) or []
        else:
            self._exclusion_values = []

    @property
    def alter_method(self) -> str:
        """
        Return the alter_method field from the working fields: the non-empty
        text forms of the alter methods, comma-separated.
        """
        return ",".join(filter(
            None, (x.get_text() for x in self._alter_methods)))

    @alter_method.setter
    def alter_method(self, value: str) -> None:
        """
        Convert the alter_method field (from the data dictionary) to a list
        of AlterMethod objects.

        Raises:
            ValueError: if date truncation is combined with any other method,
                or if more than one text extraction method is given.
        """
        # Get the list of elements in the user's order. Strip BEFORE testing
        # for emptiness, so that whitespace-only elements ("a, ,b") are
        # dropped rather than turned into blank alter methods.
        self._alter_methods = []
        elements = [x.strip() for x in value.split(",")]
        methods = [AlterMethod(config=self.config, text_value=e)
                   for e in elements if e]

        # Now establish order. Text extraction first; everything else in
        # order.
        text_extraction_indices = [i for i, am in enumerate(methods)
                                   if am.extract_text]
        for index in sorted(text_extraction_indices, reverse=True):
            # Go in reverse order of index, so deletion doesn't shift the
            # indices still to be processed.
            self._alter_methods.append(methods[index])
            del methods[index]
        self._alter_methods.extend(methods)

        # Now, checks. These are deliberately independent of the order of
        # the methods (the reordering above could otherwise hide a
        # truncate_date that the user wrote first).
        n_truncate = 0
        n_other = 0
        n_extract = 0
        for am in self._alter_methods:
            if am.truncate_date:
                n_truncate += 1
            else:
                n_other += 1
            if am.extract_text:
                n_extract += 1
        if n_truncate and n_other:
            raise ValueError("Date truncation must stand alone in "
                             "alter_method: {}".format(value))
        if n_extract > 1:
            raise ValueError("Can only have one text extraction method in "
                             "{}".format(value))

    @property
    def from_file(self) -> bool:
        """Was this row populated by set_from_dict()?"""
        return self._from_file

    @property
    def decision(self) -> str:
        """Return the DECISION value matching the ``omit`` attribute."""
        return DECISION.OMIT.value if self.omit else DECISION.INCLUDE.value

    @decision.setter
    def decision(self, value: str) -> None:
        """
        Set ``omit`` from a decision string.

        Raises:
            ValueError: if DECISION.lookup() rejects the value.
        """
        try:
            e = DECISION.lookup(value)
            self.omit = e is DECISION.OMIT
        except ValueError:
            raise ValueError("decision was {}; must be one of {}".format(
                value, [DECISION.OMIT.value, DECISION.INCLUDE.value]))

    # -------------------------------------------------------------------------
    # Comparisons
    # -------------------------------------------------------------------------

    def __lt__(self, other: "DataDictionaryRow") -> bool:
        """Order rows by their source signature (see get_signature)."""
        return self.get_signature() < other.get_signature()

    def matches_tabledef(self, tabledef: Union[str, List[str]]) -> bool:
        """
        Does our source table match the table definition(s)? Delegates to
        crate_anon.common.sql.matches_tabledef().
        """
        return crate_anon.common.sql.matches_tabledef(self.src_table, tabledef)

    def matches_fielddef(self, fielddef: Union[str, List[str]]) -> bool:
        """
        Does our source table/field match the field definition(s)? Delegates
        to crate_anon.common.sql.matches_fielddef().
        """
        return crate_anon.common.sql.matches_fielddef(
            self.src_table, self.src_field, fielddef)

    # -------------------------------------------------------------------------
    # Representations
    # -------------------------------------------------------------------------

    def __str__(self) -> str:
        """
        Return a string representation: "name: value" for every ROWNAMES
        entry, comma-separated.
        """
        return ", ".join(["{}: {}".format(a, getattr(self, a))
                          for a in DataDictionaryRow.ROWNAMES])
[docs] def get_signature(self) -> str: """ Return a signature based on the source database/table/field. """ return "{}.{}.{}".format(self.src_db, self.src_table, self.src_field)
def get_dest_signature(self) -> str: return "{}.{}".format(self.dest_table, self.dest_field) def get_offender_description(self) -> str: offenderdest = "" if not self.omit else " -> {}".format( self.get_dest_signature()) return "{}{}".format(self.get_signature(), offenderdest)
[docs] def get_tsv(self) -> str: """ Return a TSV row for writing. """ values = [] for x in DataDictionaryRow.ROWNAMES: v = getattr(self, x) if v is None: v = "" v = str(v) values.append(v) return "\t".join(values)
    # -------------------------------------------------------------------------
    # Setting
    # -------------------------------------------------------------------------
[docs] def set_from_dict(self, valuedict: Dict[str, Any]) -> None: """ Set internal fields from a dict of elements representing a row from the TSV data dictionary file. """ self.src_db = valuedict['src_db'] self.src_table = valuedict['src_table'] self.src_field = valuedict['src_field'] self.src_datatype = valuedict['src_datatype'].upper() self.src_flags = valuedict['src_flags'] # a property self.scrub_src = SCRUBSRC.lookup(valuedict['scrub_src'], allow_none=True) self.scrub_method = SCRUBMETHOD.lookup(valuedict['scrub_method'], allow_none=True) self.decision = valuedict['decision'] # a property; sets self.omit self.inclusion_values = valuedict['inclusion_values'] # a property self.exclusion_values = valuedict['exclusion_values'] # a property self.alter_method = valuedict['alter_method'] # a property self.dest_table = valuedict['dest_table'] self.dest_field = valuedict['dest_field'] self.dest_datatype = valuedict['dest_datatype'].upper() self.index = INDEX.lookup(valuedict['index'], allow_none=True) self.indexlen = convert_to_int(valuedict['indexlen']) self.comment = valuedict['comment'] self._from_file = True
# ------------------------------------------------------------------------- # Anonymisation decisions # ------------------------------------------------------------------------- def being_scrubbed(self) -> bool: return any(am.scrub for am in self._alter_methods) def contains_patient_info(self) -> bool: return self._primary_pid or self._master_pid or bool(self.scrub_src) def contains_vital_patient_info(self) -> bool: return bool(self.scrub_src) def required(self) -> bool: # return not self.omit or self.contains_patient_info() return not self.omit or self.contains_vital_patient_info() def skip_row_by_value(self, value: Any) -> bool: if self._inclusion_values and value not in self._inclusion_values: # log.debug("skipping row based on inclusion_values") return True if value in self._exclusion_values: # log.debug("skipping row based on exclusion_values") return True return False def get_alter_methods(self) -> List[AlterMethod]: return self._alter_methods def skip_row_if_extract_text_fails(self) -> bool: return any(x.skip_if_text_extract_fails for x in self._alter_methods) def get_extracting_text_altermethods(self) -> List[AlterMethod]: return [am for am in self._alter_methods if am.extract_text] def remove_scrub_from_alter_methods(self) -> None: log.debug( "remove_scrub_from_alter_methods " "[used for non-patient tables]: {}".format( self.get_signature())) for sm in self._alter_methods: sm.scrub = False # ------------------------------------------------------------------------- # Other decisions # ------------------------------------------------------------------------- def using_fulltext_index(self) -> bool: return self.index is INDEX.FULLTEXT # ------------------------------------------------------------------------- # SQLAlchemy types # ------------------------------------------------------------------------- def get_src_sqla_coltype(self) -> TypeEngine: return self._src_sqla_coltype or get_sqla_coltype_from_dialect_str( self.src_datatype, 
self.config.get_src_dialect(self.src_db)) def set_src_sqla_coltype(self, sqla_coltype: TypeEngine) -> None: self._src_sqla_coltype = sqla_coltype def get_dest_sqla_coltype(self) -> TypeEngine: dialect = self.config.get_dest_dialect() if self.dest_datatype: # User (or our autogeneration process) wants to override # the type. return get_sqla_coltype_from_dialect_str(self.dest_datatype, dialect) else: # Return the SQLAlchemy column type class determined from the # source database by reflection. # Will be autoconverted to the destination dialect. # With some exceptions, addressed as below: return convert_sqla_type_for_dialect( coltype=self.get_src_sqla_coltype(), dialect=dialect, expand_for_scrubbing=self.being_scrubbed()) def get_dest_sqla_column(self) -> Column: name = self.dest_field coltype = self.get_dest_sqla_coltype() comment = self.comment or '' kwargs = { 'doc': comment, # When SQLAlchemy 1.2 released, add this: # 'comment': comment, # https://bitbucket.org/zzzeek/sqlalchemy/issues/1546/feature-request-commenting-db-objects # noqa } if self._pk: kwargs['primary_key'] = True kwargs['autoincrement'] = False if self.primary_pid: kwargs['nullable'] = False return Column(name, coltype, **kwargs) # ------------------------------------------------------------------------- # Validation # -------------------------------------------------------------------------
[docs] def check_valid(self) -> None: """ Check internal validity and complain if invalid, showing the source of the problem. """ try: self._check_valid() except (AssertionError, ValueError): log.exception( "Offending DD row [{}]: {}".format( self.get_offender_description(), str(self))) raise
def check_prohibited_fieldnames(self, fieldnames: Iterable[str]) -> None: if self.dest_field in fieldnames: log.exception( "Offending DD row [{}]: {}".format( self.get_offender_description(), str(self))) raise ValueError("Prohibited dest_field name") def _check_valid(self) -> None: """ Check internal validity and complain if invalid. """ assert self.src_db, "Need src_db" assert self.src_table, "Need src_table" assert self.src_field, "Need src_field" assert self.src_datatype, "Need src_datatype" if not self.omit: assert self.dest_table, "Need dest_table" assert self.dest_field, "Need dest_field" src_sqla_coltype = self.get_src_sqla_coltype() dest_sqla_coltype = self.get_dest_sqla_coltype() if self.src_db not in self.config.get_source_db_names(): raise ValueError( "Data dictionary row references non-existent source " "database") srccfg = self.config.sources[self.src_db].srccfg ensure_valid_table_name(self.src_table) ensure_valid_field_name(self.src_field) if len(self.src_table) > MAX_IDENTIFIER_LENGTH: log.warning( "Table name in {}.{} is too long for MySQL ({} characters > " "{} maximum".format( self.src_table, self.src_field, len(self.src_table), MAX_IDENTIFIER_LENGTH)) if len(self.src_field) > MAX_IDENTIFIER_LENGTH: log.warning( "Field name in {}.{} is too long for MySQL ({} characters > " "{} maximum".format( self.src_table, self.src_field, len(self.src_field), MAX_IDENTIFIER_LENGTH)) # REMOVED 2016-06-04; fails with complex SQL Server types, which can # look like 'NVARCHAR(10) COLLATE "Latin1_General_CI_AS"'. 
# # if not is_sqltype_valid(self.src_datatype): # raise ValueError( # "Field has invalid source data type: {}".format( # self.src_datatype)) # 2016-11-11: error message clarified # 2017-05-06: check removed; we can now handle non-integer PIDs # # if ((self._primary_pid or self._master_pid) and # not is_sqltype_integer(self.src_datatype)): # raise ValueError( # "For {}: All fields with src_flags={} or src_flags={} set " # "should be integer, (a) for work distribution purposes, and " # "(b) so we know the structure of our secret mapping table " # "in advance.".format(self.src_field, # SRCFLAG.PRIMARY_PID, # SRCFLAG.MASTER_PID)) if self._defines_primary_pids and not self._primary_pid: raise ValueError( "All fields with src_flags={} set must have src_flags={} " "set".format(SRCFLAG.DEFINES_PRIMARY_PIDS, SRCFLAG.PRIMARY_PID)) if self._opt_out_info and not self.config.optout_col_values: raise ValueError( "Fields with src_flags={} exist, but config's " "optout_col_values setting is empty".format(SRCFLAG.OPT_OUT)) if count_bool([self._primary_pid, self._master_pid, bool(self.alter_method)]) > 1: raise ValueError( "Field can be any ONE of: src_flags={}, src_flags={}, " "alter_method".format(SRCFLAG.PRIMARY_PID, SRCFLAG.MASTER_PID)) if self._required_scrubber and not self.scrub_src: raise ValueError("If you specify src_flags={}, you must specify " "scrub_src".format(SRCFLAG.REQUIRED_SCRUBBER)) if self._add_src_hash: if not self._pk: raise ValueError( "src_flags={} can only be set on " "src_flags={} fields".format( SRCFLAG.ADD_SRC_HASH, SRCFLAG.PK)) if self.index is not INDEX.UNIQUE: raise ValueError( "src_flags={} fields require index=={}".format( SRCFLAG.ADD_SRC_HASH, INDEX.UNIQUE)) if self._constant: raise ValueError( "cannot mix {} flag with {} flag".format( SRCFLAG.ADD_SRC_HASH, SRCFLAG.CONSTANT)) if self._constant: if not self._pk: raise ValueError( "src_flags={} can only be set on " "src_flags={} fields".format( SRCFLAG.CONSTANT, SRCFLAG.PK)) if self.index is not 
INDEX.UNIQUE: raise ValueError( "src_flags={} fields require index=={}".format( SRCFLAG.CONSTANT, INDEX.UNIQUE)) if self.omit: return # --------------------------------------------------------------------- # Below here: checks only applying to non-omitted columns # --------------------------------------------------------------------- ensure_valid_table_name(self.dest_table) if self.dest_table == self.config.temporary_tablename: raise ValueError( "Destination tables can't be named {}, as that's the " "name set in the config's temporary_tablename " "variable".format(self.config.temporary_tablename)) ensure_valid_field_name(self.dest_field) if self.dest_field == self.config.source_hash_fieldname: raise ValueError( "Destination fields can't be named {}, as that's the " "name set in the config's source_hash_fieldname " "variable".format(self.config.source_hash_fieldname)) if self.dest_datatype and not is_sqltype_valid(self.dest_datatype): raise ValueError( "Field has invalid destination data type: " "{}".format(self.dest_datatype)) if self.matches_fielddef(srccfg.ddgen_per_table_pid_field): if not self._primary_pid: raise ValueError( "All fields with src_field={} used in output should " "have src_flag={} set".format(self.src_field, SRCFLAG.PRIMARY_PID)) if self.dest_field != self.config.research_id_fieldname: raise ValueError( "Primary PID field should have " "dest_field = {}".format( self.config.research_id_fieldname)) if (self.matches_fielddef(srccfg.ddgen_master_pid_fieldname) and not self._master_pid): raise ValueError( "All fields with src_field = {} used in output should have" " src_flags={} set".format( srccfg.ddgen_master_pid_fieldname, SRCFLAG.MASTER_PID)) for am in self._alter_methods: if am.truncate_date: if not (is_sqlatype_date(src_sqla_coltype) or is_sqlatype_text_over_one_char(src_sqla_coltype)): raise ValueError("Can't set truncate_date for " "non-date/non-text field") if am.extract_from_filename: if not is_sqlatype_text_over_one_char(src_sqla_coltype): 
raise ValueError( "For alter_method = " "{ALTERMETHOD.FILENAME_TO_TEXT}, source field " "must contain a filename and therefore " "must be text type of >1 character".format( ALTERMETHOD=ALTERMETHOD)) if am.extract_from_blob: if not is_sqlatype_binary(src_sqla_coltype): raise ValueError( "For alter_method = {ALTERMETHOD.BINARY_TO_TEXT}, " "source field must be of binary type".format( ALTERMETHOD=ALTERMETHOD)) # This error/warning too hard to be sure of with SQL Server odd # string types: # if self._scrub and not self._extract_text: # if not is_sqltype_text_over_one_char(self.src_datatype): # raise ValueError("Can't scrub in non-text field or " # "single-character text field") if ((self._primary_pid or self._master_pid) and self.dest_datatype != self.config.sqltype_encrypted_pid_as_sql): raise ValueError( "All src_flags={}/src_flags={} fields used in output must " "have destination_datatype = {}".format( SRCFLAG.PRIMARY_PID, SRCFLAG.MASTER_PID, self.config.sqltype_encrypted_pid_as_sql)) if (self.index in (INDEX.NORMAL, INDEX.UNIQUE) and self.indexlen is None and does_sqlatype_require_index_len(dest_sqla_coltype)): raise ValueError( "Must specify indexlen to index a TEXT or BLOB field") # ------------------------------------------------------------------------- # Other stuff requiring config or database info # -------------------------------------------------------------------------
[docs] def set_from_src_db_info(self, db: str, table: str, field: str, datatype_sqltext: str, sqla_coltype: TypeEngine, dbconf: DATABASE_SAFE_CONFIG_FWD_REF, comment=None) -> None: """ Create a draft data dictionary row from a field in the source database. """ self.src_db = db self.src_table = table self.src_field = field self.src_datatype = datatype_sqltext self._src_sqla_coltype = sqla_coltype self._pk = False self._add_src_hash = False self._primary_pid = False self._defines_primary_pids = False self._master_pid = False self._constant = False self._addition_only = False self.comment = comment self._from_file = False # --------------------------------------------------------------------- # Is the field special, such as a PK? # --------------------------------------------------------------------- if self.matches_fielddef(dbconf.ddgen_pk_fields): self._pk = True self._constant = ( (dbconf.ddgen_constant_content or self.matches_tabledef( dbconf.ddgen_constant_content_tables)) and not self.matches_tabledef( dbconf.ddgen_nonconstant_content_tables) ) self._add_src_hash = not self._constant self._addition_only = ( (dbconf.ddgen_addition_only or self.matches_tabledef(dbconf.ddgen_addition_only_tables)) and not self.matches_tabledef(dbconf.ddgen_deletion_possible_tables) ) if self.matches_fielddef(dbconf.ddgen_per_table_pid_field): self._primary_pid = True if self.matches_fielddef(dbconf.ddgen_master_pid_fieldname): self._master_pid = True if self.matches_fielddef(dbconf.ddgen_pid_defining_fieldnames): self._defines_primary_pids = True # --------------------------------------------------------------------- # Does it indicate the patient wishes to opt out entirely? # --------------------------------------------------------------------- if self.matches_fielddef(dbconf.ddgen_patient_opt_out_fields): self._opt_out_info = True # --------------------------------------------------------------------- # Does the field contain sensitive data? 
# --------------------------------------------------------------------- if (self._master_pid or self._defines_primary_pids or (self._primary_pid and dbconf.ddgen_add_per_table_pids_to_scrubber) or self.matches_fielddef(dbconf.ddgen_scrubsrc_patient_fields)): self.scrub_src = SCRUBSRC.PATIENT elif self.matches_fielddef(dbconf.ddgen_scrubsrc_thirdparty_fields): self.scrub_src = SCRUBSRC.THIRDPARTY elif self.matches_fielddef( dbconf.ddgen_scrubsrc_thirdparty_xref_pid_fields): self.scrub_src = SCRUBSRC.THIRDPARTY_XREF_PID else: self.scrub_src = None # --------------------------------------------------------------------- # Is it a mandatory scrubbing field? # --------------------------------------------------------------------- if self.matches_fielddef(dbconf.ddgen_required_scrubsrc_fields): self._required_scrubber = True # --------------------------------------------------------------------- # What kind of sensitive data? Date, text, number, code? # --------------------------------------------------------------------- if not self.scrub_src: self.scrub_method = "" elif (self.scrub_src is SCRUBSRC.THIRDPARTY_XREF_PID or is_sqlatype_numeric(sqla_coltype) or self.matches_fielddef(dbconf.ddgen_per_table_pid_field) or self.matches_fielddef(dbconf.ddgen_master_pid_fieldname) or self.matches_fielddef(dbconf.ddgen_scrubmethod_number_fields)): self.scrub_method = SCRUBMETHOD.NUMERIC elif (is_sqlatype_date(sqla_coltype) or self.matches_fielddef(dbconf.ddgen_scrubmethod_date_fields)): self.scrub_method = SCRUBMETHOD.DATE elif self.matches_fielddef(dbconf.ddgen_scrubmethod_code_fields): self.scrub_method = SCRUBMETHOD.CODE elif self.matches_fielddef(dbconf.ddgen_scrubmethod_phrase_fields): self.scrub_method = SCRUBMETHOD.PHRASE else: self.scrub_method = SCRUBMETHOD.WORDS # --------------------------------------------------------------------- # Do we want to change the destination fieldname? 
# --------------------------------------------------------------------- if self._primary_pid: self.dest_field = self.config.research_id_fieldname elif self._master_pid: self.dest_field = self.config.master_research_id_fieldname else: self.dest_field = field if dbconf.ddgen_force_lower_case: self.dest_field = self.dest_field.lower() if dbconf.ddgen_convert_odd_chars_to_underscore: self.dest_field = str(self.dest_field) # if this fails, # there's a Unicode problem self.dest_field = self.dest_field.translate(ODD_CHARS_TRANSLATE) # ... this will choke on a Unicode string # --------------------------------------------------------------------- # Do we want to change the destination field SQL type? # --------------------------------------------------------------------- if self._primary_pid or self._master_pid: self.dest_datatype = self.config.sqltype_encrypted_pid_as_sql else: self.dest_datatype = '' # ... and see also potential changes made below # --------------------------------------------------------------------- # How should we manipulate the destination? 
# --------------------------------------------------------------------- extracting_text = False if self.matches_fielddef(dbconf.ddgen_truncate_date_fields): self._alter_methods.append(AlterMethod(config=self.config, truncate_date=True)) elif self.matches_fielddef(dbconf.ddgen_filename_to_text_fields): self._alter_methods.append(AlterMethod(config=self.config, extract_from_filename=True)) self.dest_datatype = giant_text_sqltype( self.config.get_dest_dialect()) extracting_text = True elif self.matches_fielddef(dbconf.bin2text_dict.keys()): for binfielddef, extfield in dbconf.bin2text_dict.items(): if self.matches_fielddef(binfielddef): self._alter_methods.append(AlterMethod( config=self.config, extract_from_blob=True, extract_ext_field=extfield)) self.dest_datatype = giant_text_sqltype( self.config.get_dest_dialect()) extracting_text = True elif (not self._primary_pid and not self._master_pid and is_sqlatype_text_of_length_at_least( sqla_coltype, dbconf.ddgen_min_length_for_scrubbing) and not self.matches_fielddef( dbconf.ddgen_safe_fields_exempt_from_scrubbing)): # Text field meeting the criteria to scrub self._alter_methods.append(AlterMethod(config=self.config, scrub=True)) if extracting_text: # Scrub all extract-text fields, unless asked not to if (not self.matches_fielddef( dbconf.ddgen_safe_fields_exempt_from_scrubbing)): self._alter_methods.append(AlterMethod(config=self.config, scrub=True)) # Set skip_if_text_extract_fails flag? if self.matches_fielddef( dbconf.ddgen_skip_row_if_extract_text_fails_fields): self._alter_methods.append(AlterMethod( config=self.config, skip_if_text_extract_fails=True)) for fieldspec, cfg_section in dbconf.ddgen_extra_hash_fields.items(): if self.matches_fielddef(fieldspec): self._alter_methods.append(AlterMethod( config=self.config, hash_=True, hash_config_section=cfg_section )) # --------------------------------------------------------------------- # Manipulate the destination table name? 
# --------------------------------------------------------------------- # http://stackoverflow.com/questions/10017147 self.dest_table = table if dbconf.ddgen_force_lower_case: self.dest_table = self.dest_table.lower() if dbconf.ddgen_convert_odd_chars_to_underscore: self.dest_table = str(self.dest_table) # ... if this fails, there's a Unicode problem self.dest_table = self.dest_table.translate(ODD_CHARS_TRANSLATE) for suffix in dbconf.ddgen_rename_tables_remove_suffixes: if self.dest_table.endswith(suffix): self.dest_table = self.dest_table[:-len(suffix)] # remove it break # only remove one suffix! # --------------------------------------------------------------------- # Should we index the destination? # --------------------------------------------------------------------- dest_sqla_type = self.get_dest_sqla_coltype() if self._pk: self.index = INDEX.UNIQUE elif (self._primary_pid or self._master_pid or self._defines_primary_pids or self.dest_field == self.config.research_id_fieldname): self.index = INDEX.NORMAL elif (dbconf.ddgen_allow_fulltext_indexing and does_sqlatype_merit_fulltext_index(dest_sqla_type)): self.index = INDEX.FULLTEXT elif self.matches_fielddef(dbconf.ddgen_index_fields): self.index = INDEX.NORMAL else: self.index = "" self.indexlen = ( DEFAULT_INDEX_LEN if (self.index is not INDEX.FULLTEXT and does_sqlatype_require_index_len(dest_sqla_type)) else None ) # --------------------------------------------------------------------- # Should we omit it (at least until a human has looked at the DD)? # --------------------------------------------------------------------- # In descending order of priority: if self.matches_fielddef(dbconf.ddgen_omit_fields): # explicit # Explicit omission trumps everything else # (There are rare occasions with "additional" databases where we # may want to omit a PK/PID/MPID field.) 
self.omit = True elif self._pk or self._primary_pid or self._master_pid: # We always want PKs, and the translated PID/MPID (RID+TRID or # MRID respectively). self.omit = False elif bool(self.scrub_src): # Scrub-source fields are generally sensitive and therefore worthy # of omission, EXCEPT that if a date is marked for truncation, the # user probably wants it (truncated) to come through! if any(am.truncate_date for am in self._alter_methods): self.omit = False else: self.omit = True elif self.matches_fielddef(dbconf.ddgen_include_fields): # explicit # Explicit inclusion next. self.omit = False else: self.omit = dbconf.ddgen_omit_by_default