Source code for crate_anon.nlp_manager.output_user_config

#!/usr/bin/env python
# crate_anon/nlp_manager/output_user_config.py

"""
===============================================================================

    Copyright (C) 2015-2018 Rudolf Cardinal (rudolf@pobox.com).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <http://www.gnu.org/licenses/>.

===============================================================================
"""

import ast
import shlex
from typing import Dict, List

from cardinal_pythonlib.rnc_db import (
    ensure_valid_field_name,
    ensure_valid_table_name,
    is_sqltype_valid
)
from cardinal_pythonlib.lists import chunks
from cardinal_pythonlib.sqlalchemy.schema import (
    get_sqla_coltype_from_dialect_str,
)
from sqlalchemy import Column, Index

from crate_anon.common.extendedconfigparser import ExtendedConfigParser
from crate_anon.nlp_manager.input_field_config import InputFieldConfig


# =============================================================================
# OutputUserConfig
# =============================================================================

[docs]class OutputUserConfig(object): """ Class defining configuration for the output of a given GATE app. """ def __init__(self, parser: ExtendedConfigParser, section: str) -> None: """ Read config from a configparser section. """ def opt_str(option: str, required: bool = False) -> str: return parser.get_str(section, option, required=required) def opt_strlist(option: str, required: bool = False, as_words: bool = True) -> List[str]: return parser.get_str_list(section, option, required=required, lower=False, as_words=as_words) # We do NOT change the case. if not parser.has_section(section): raise ValueError("config missing section: " + section) # --------------------------------------------------------------------- # desttable # --------------------------------------------------------------------- self._desttable = opt_str('desttable', required=True) ensure_valid_table_name(self._desttable) # --------------------------------------------------------------------- # renames # --------------------------------------------------------------------- self._renames = {} # type: Dict[str, str] rename_lines = opt_strlist('renames', required=False, as_words=False) for line in rename_lines: if not line.strip(): continue words = shlex.split(line) if len(words) != 2: raise ValueError( "Bad 'renames' option in config section {}; line was {} " "but should have contained two things".format( repr(section), repr(line))) annotation_name = words[0] field_name = words[1] ensure_valid_field_name(field_name) self._renames[annotation_name] = field_name # --------------------------------------------------------------------- # null_literals # --------------------------------------------------------------------- null_literal_lines = opt_strlist('null_literals', required=False, as_words=False) self._null_literals = [] # type: List[str] for line in null_literal_lines: self._null_literals += shlex.split(line) # --------------------------------------------------------------------- # destfields # --------------------------------------------------------------------- self._destfields = [] # type: List[str] self._dest_datatypes = [] # type: List[str] dest_fields_datatypes = opt_strlist('destfields', required=True) # log.critical(dest_fields_datatypes) for c in chunks(dest_fields_datatypes, 2): field = c[0] datatype = c[1].upper() ensure_valid_field_name(field) if not is_sqltype_valid(datatype): raise Exception( "Invalid datatype for {}: {}".format(field, datatype)) self._destfields.append(field) self._dest_datatypes.append(datatype) src_fields = [c.name for c in InputFieldConfig.get_core_columns_for_dest()] for sf in src_fields: if sf in self._destfields: raise Exception( "For section {}, destination field {} is auto-supplied; " "do not add it manually".format(section, sf)) if len(set(self._destfields)) != len(self._destfields): raise ValueError("Duplicate fields exist in destination fields: " "{}".format(self._destfields)) # --------------------------------------------------------------------- # indexdefs # --------------------------------------------------------------------- self._indexfields = [] # type: List[str] self._indexlengths = [] # type: List[int] indexdefs = opt_strlist('indexdefs') if indexdefs: for c in chunks(indexdefs, 2): # pairs: field, length indexfieldname = c[0] lengthstr = c[1] if indexfieldname not in self._destfields: raise ValueError( "Index field {} not in destination fields {}".format( indexfieldname, self._destfields)) try: length = ast.literal_eval(lengthstr) if length is not None: length = int(length) except ValueError: raise ValueError( "Bad index length: {}".format(lengthstr)) self._indexfields.append(indexfieldname) self._indexlengths.append(length) def get_tablename(self) -> str: return self._desttable def get_columns(self, engine) -> List[Column]: columns = [] # type: List[Column] for i, field in enumerate(self._destfields): datatype = self._dest_datatypes[i] columns.append(Column( field, get_sqla_coltype_from_dialect_str(datatype, engine.dialect) )) return columns def get_indexes(self) -> List[Index]: indexes = [] # type: List[Index] for i, field in enumerate(self._indexfields): index_name = '_idx_{}'.format(field) length = self._indexlengths[i] kwargs = {'mysql_length': length} if length is not None else {} indexes.append(Index(index_name, field, **kwargs)) return indexes def renames(self) -> Dict[str, str]: return self._renames def null_literals(self) -> List[str]: return self._null_literals