# Source code for crate_anon.nlp_manager.models

#!/usr/bin/env python
# crate_anon/nlp_manager/models.py

"""
===============================================================================

    Copyright (C) 2015-2018 Rudolf Cardinal (rudolf@pobox.com).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <http://www.gnu.org/licenses/>.

===============================================================================
"""

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.schema import Column, Index, MetaData
from sqlalchemy.types import BigInteger, DateTime, String

from crate_anon.anonymise.constants import TABLE_KWARGS
from crate_anon.nlp_manager.constants import (
    HashClass,
    MAX_STRING_PK_LENGTH,
    SqlTypeDbIdentifier,
)

# Separate SQLAlchemy MetaData object for the NLP progress-tracking schema,
# and the declarative base class bound to it. ORM classes deriving from
# ProgressBase (such as NlpRecord in this module) register their tables in
# progress_meta.
progress_meta = MetaData()
ProgressBase = declarative_base(metadata=progress_meta)


# =============================================================================
# Global constants
# =============================================================================

# SQLAlchemy column type used to store hashes, as reported by a throwaway
# HashClass instance.
# NOTE(review): the salt passed here is presumably irrelevant to the resulting
# column type (hence "dummysalt") -- confirm against HashClass.
SqlTypeHash = HashClass("dummysalt").sqla_column_type()


# =============================================================================
# Record of progress
# =============================================================================

class NlpRecord(ProgressBase):
    """
    Class to record the fact of processing a source record (and to keep a
    hash allowing identification of altered source contents later).

    Each row identifies a source location (database, table, PK value and,
    where applicable, string PK, plus the field holding the data) together
    with the NLP definition that processed it. That combination of columns
    is covered by the unique index ``_idx1``. The row also stores when the
    processing took place and a hash of the source contents at that time.
    """
    __tablename__ = 'crate_nlp_progress'
    # Tuple form of __table_args__: positional schema items first (here, the
    # index), then a trailing dict of table keyword arguments (TABLE_KWARGS).
    __table_args__ = (
        Index(
            '_idx1',  # index name
            # index fields:
            'srcpkval',  # integer and most specific
            'nlpdef',  # usually >1 NLP def to 1 db/table/field combo
            'srcfield',  # } roughly, more to less specific?
            'srctable',  # }
            'srcdb',  # }
            'srcpkstr',  # last as we may not use it
            # - performance is critical here
            # - put them in descending order of specificity
            #   http://stackoverflow.com/questions/2292662/how-important-is-the-order-of-columns-in-indexes  # noqa
            # - start with srcpkval, as it's (a) specific and (b) integer
            # - srcpkfield: don't need to index, because the source table
            #   can only have one PK
            # - srcpkstr: must include, since srcpkval can be non-unique,
            #   due to hash collisions, if we're using a string
            #   ... but ?should be last because we may not use it in
            #   queries (for tables with integer PK)
            unique=True
            # Despite having a NULL field in a UNIQUE index, this is OK for
            # SQL Server 2008+ (http://stackoverflow.com/questions/767657) and
            # MySQL also seems happy.
        ),
        TABLE_KWARGS
    )
    # http://stackoverflow.com/questions/6626810/multiple-columns-index-when-using-the-declarative-orm-extension-of-sqlalchemy  # noqa
    # http://docs.sqlalchemy.org/en/latest/orm/extensions/declarative/table_config.html  # noqa

    pk = Column(
        'pk', BigInteger, primary_key=True, autoincrement=True,
        doc="PK of NLP record (no specific use)")
    srcdb = Column(
        'srcdb', SqlTypeDbIdentifier,
        doc="Source database"
        # primary_key=True
    )
    srctable = Column(
        'srctable', SqlTypeDbIdentifier,
        doc="Source table name"
        # primary_key=True
    )
    srcpkfield = Column(
        'srcpkfield', SqlTypeDbIdentifier,
        doc="Primary key column name in source table (for info only)")
    srcpkval = Column(
        'srcpkval', BigInteger,
        doc="Primary key value in source table (or hash if PK is a string)"
        # primary_key=True
    )
    srcpkstr = Column(
        'srcpkstr', String(MAX_STRING_PK_LENGTH),
        doc="Original string PK, used when the table has a string PK, to deal "
            "with hash collisions. Max length: {}".format(
                MAX_STRING_PK_LENGTH)
        # primary_key=True, default=''  # can't have a NULL in a composite PK
    )
    srcfield = Column(
        'srcfield', SqlTypeDbIdentifier,
        doc="Name of column in source field containing actual data"
        # primary_key=True
    )
    nlpdef = Column(
        'nlpdef', SqlTypeDbIdentifier,
        doc="Name of natural language processing definition that source was "
            "processed for"
        # primary_key=True
    )
    whenprocessedutc = Column(
        'whenprocessedutc', DateTime,
        doc="Time that NLP record was processed")
    srchash = Column(
        'srchash', SqlTypeHash,
        doc='Secure hash of source field contents at the time of processing')