Source code for crate_anon.nlp_manager.build_medex_itself

#!/usr/bin/env python
# crate_anon/nlp_manager/build_medex_itself.py

"""
===============================================================================

    Copyright (C) 2015-2018 Rudolf Cardinal (rudolf@pobox.com).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <http://www.gnu.org/licenses/>.

===============================================================================

Script to compile Java source for MedEx-UIMA.
"""

import argparse
import logging
import os
import subprocess
from typing import Dict, List, Tuple, Union

from cardinal_pythonlib.fileops import purge
from cardinal_pythonlib.logs import configure_logger_for_colour

log = logging.getLogger(__name__)

# Default MedEx installation directory (under the user's home directory)
# and default Java tool names.
DEFAULT_MEDEX_DIR = os.path.join(
    os.path.expanduser("~"), "dev", "Medex_UIMA_1.3.6"
)
DEFAULT_JAVA = "java"
DEFAULT_JAVAC = "javac"
# Additional route-of-administration terms, written to MedEx's lexicon as
# route entries. Grouped by abbreviation family; list order is unchanged.
EXTRA_ROUTES = [
    # "i/m" variants (intramuscular)
    'i/m', 'i.m.', 'i. m.',
    'intramuscularly', 'intramuscular inj.', 'intramuscular injection',
    # "inh" variants
    'inh', 'inh.',
    # "i/v" variants
    'i/v', 'i.v.', 'i. v.',
    # nasal
    'nasal', 'nasally',
    # nebulised (British and American spellings)
    'nebs', 'nebulised', 'nebuliser', 'nebulized', 'nebulizer',
    # "ng" variants (nasogastric)
    'ng', 'n/g', 'n.g.', 'n. g.', 'nasogastric', 'nasogastrically',
    # "nj" variants
    'nj', 'n/j', 'n.j.', 'n. j.',
    # "p/o" variants
    'p/o', 'p.o.', 'p. o.',
    # "pr" variants
    'pr', 'p/r', 'p.r.', 'p. r.',
    # "s/c" variants
    's/c', 's.c.', 's. c.',
    # "top" variants
    'top', 'top.',
]
# Extra UK-style dosing frequencies, as (literal, TIMEX3 code) tuples.
# Latin abbreviations: see
# http://www.evidence.nhs.uk/formulary/bnf/current/general-reference/latin-abbreviations  # noqa
# TIMEX3 codes: see
# http://www.timeml.org/tempeval2/tempeval2-trial/guidelines/timex3guidelines-072009.pdf
EXTRA_FREQUENCIES = [
    ('q.q.h.', 'R1P4H'),  # qqh, quarta quaque hora

    # qds, quater die sumendum. MUST COME BEFORE the competing "qd"
    # (= per day) expression, e.g. in frequency_rules:
    #   expression="[Qq]\.?[ ]?[Dd]\.?",val="R1P24H"
    ('q.d.s.', 'R1P6H'),

    ('t.d.s.', 'R1P8H'),  # tds, ter die sumendum
    ('b.d.', 'R1P12H'),  # bd, bis die
    ('o.d.', 'R1P24H'),  # od, omni die
    ('mane', 'R1P24H'),  # mane
    ('o.m.', 'R1P24H'),  # om, omni mane
    ('nocte', 'R1P24H'),  # nocte
    ('o.n.', 'R1P24H'),  # on, omni nocte -- beware also the word "on"...

    # Fortnightly and variants ("W": page 9 of the TIMEX3 PDF above)
    ('fortnightly', 'R1P2W'),
    ('2 weekly', 'R1P2W'),
    ('two weekly', 'R1P2W'),

    # Monthly ("M": page 8 of the TIMEX3 PDF above)
    ('monthly', 'R1P1M'),
]
# Frequencies that must keep their dots. With the dots removed, "o.n."
# becomes the word "on", which is too confusing: "Start olanzapine 5mg
# nocte." is fine; "Start olanzapine 5mg on." is tolerable, but too easily
# confused with "Start olanzapine 5mg on Tuesday."
DO_NOT_REMOVE_DOTS = ["o.n."]

# Whitespace-stripped text of the lines after which CRATE's additions are
# inserted (in SemanticRuleEngine.java and frequency_rules respectively).
SEM_ENG_TRIGGER_LINE_TRIMMED = 'Map regexlist = new Hashtable();'
FREQ_RULE_TRIGGER_LINE_TRIMMED = 'FREQUENCY:'

# Lines that bracket a block of CRATE additions inside a modified file.
SOURCE_START_MARKER = '// START CRATE MODIFICATIONS'
SOURCE_END_MARKER = '// END CRATE MODIFICATIONS'


def terminate(x: str) -> str:
    """Return ``x`` with one newline character appended."""
    return "{}\n".format(x)


def lex_freq(x: str) -> str:
    """
    Build a frequency line for MedEx's ``lexicon.cfg``: the term, a tab,
    then the tag ``FREQ`` (no trailing newline).
    """
    return x + "\tFREQ"
def lex_route(x: str) -> str:
    """
    Build a route line for MedEx's ``lexicon.cfg``: the term, a tab, then
    the tag ``RUT`` (no trailing newline).
    """
    return x + "\tRUT"
def semantic_rule_engine_line(frequency: str,
                              dots_optional: bool = True) -> str:
    """
    Build one Java statement that registers ``frequency`` as a ``FREQ``
    regex, for insertion into MedEx's ``SemanticRuleEngine.java``.

    Args:
        frequency: literal frequency text, e.g. ``"b.d."``
        dots_optional: if true, each dot in ``frequency`` is optional in the
            generated regex; if false, each dot is required

    Returns:
        one line of Java source, without a trailing newline
    """
    # The regexes in SemanticRuleEngine.java are case-insensitive, so letters
    # pass through unchanged. Every backslash is doubled because the regex
    # ends up inside a Java string literal.
    dot_regex = r'\\.?\\s*' if dots_optional else r'\\.\\s*'
    substitutions = {' ': r'\\s+', '.': dot_regex}
    regex_str = ''.join(substitutions.get(ch, ch) for ch in frequency)
    java_template = r' regexlist.put("^({})( |$)", "FREQ"); // RNC'
    return java_template.format(regex_str)


def frequency_rules_line(frequency: str, timex: str,
                         dots_optional: bool) -> str:
    """
    Build one ``expression="...",val="..."`` line for MedEx's
    ``frequency_rules`` file.

    Args:
        frequency: literal frequency text, e.g. ``"b.d."``
        timex: TIMEX3 code to associate with the expression
        dots_optional: if true, each dot in ``frequency`` is optional in the
            generated regex; if false, each dot is required

    Returns:
        one rule line, without a trailing newline
    """
    # The regexes in Rule.java are case-sensitive, so each letter becomes a
    # two-case character class. No \b word boundaries are needed: by this
    # stage tokenization has already separated the words.
    dot_regex = r'\.?\s?' if dots_optional else r'\.\s?'

    def char_to_regex(ch: str) -> str:
        if ch == ' ':
            return r'\s+'
        if ch == '.':
            return dot_regex
        if ch.isalpha():
            return r'[{}{}]'.format(ch.upper(), ch.lower())
        return ch

    regex_str = ''.join(map(char_to_regex, frequency))
    return r'expression="{}",val="{}"'.format(regex_str, timex)
def add_lines_if_not_in(filename: str, lines: List[str]) -> None:
    """
    Append to a text file each element of ``lines`` that it does not already
    contain as a complete line.

    Args:
        filename: file to read and then append to; it must already exist
        lines: candidate lines, WITHOUT their own trailing newline characters

    Each candidate is written at most once, even if it is repeated within
    ``lines``. If the file's last line lacks a trailing newline, one is added
    before the first appended line, so that the new text is not glued onto
    the old. The file is left untouched when there is nothing to add.
    """
    with open(filename, 'r') as f:
        existing = f.readlines()  # trailing newlines kept (maybe not the last)
    log.info("Read {} lines from {}".format(len(existing), filename))

    # Compare without the line terminator, so that an unterminated final line
    # is still recognised as present. A set gives O(1) membership tests.
    present = {text[:-1] if text.endswith('\n') else text
               for text in existing}
    last_line_unterminated = bool(existing) and not existing[-1].endswith('\n')

    to_add = []  # type: List[str]
    for line in lines:
        if line not in present:
            present.add(line)  # guards against duplicates within "lines"
            to_add.append(line)
    if not to_add:
        return

    with open(filename, 'a') as f:
        if last_line_unterminated:
            f.write('\n')
        for line in to_add:
            log.info("Adding {} line: {}".format(filename, repr(line)))
            f.write(terminate(line))
def add_lines_after_trigger(filename: str, trigger: str, start_marker: str,
                            end_marker: str, lines: List[str]) -> None:
    """
    Insert a marker-delimited block of lines into a text file, immediately
    after the first line that matches ``trigger``.

    Args:
        filename: file to rewrite in place; it must already exist
        trigger: text that a line must equal, after stripping leading and
            trailing whitespace, to count as the trigger line
        start_marker: line written before the inserted lines
        end_marker: line written after the inserted lines
        lines: lines to insert, WITHOUT their own trailing newline characters

    If a block from ``start_marker`` to ``end_marker`` already sits directly
    after the trigger line, it is replaced, so repeated runs do not
    accumulate copies. If that block has no ``end_marker``, nothing is
    removed (removing "up to the end marker" would delete the rest of the
    file) and a warning is logged. If no line matches ``trigger``, the block
    is appended at the end of the file and a warning is logged.

    The new content is assembled in full before the file is reopened for
    writing.
    """
    with open(filename, 'r') as f:
        existing = f.readlines()
    log.info("Read {} lines from {}".format(len(existing), filename))

    def is_marker(text: str, marker: str) -> bool:
        # Ignore the line terminator, so a marker on an unterminated final
        # line is still recognised.
        return text.rstrip('\n') == marker

    # Index of the line following the trigger (end of file if no trigger).
    insert_at = len(existing)
    for i, text in enumerate(existing):
        if text.strip() == trigger:
            insert_at = i + 1
            break
    else:
        log.warning(
            "Trigger line {!r} not found in {}; adding block at end of "
            "file".format(trigger, filename))

    head = existing[:insert_at]
    if head and not head[-1].endswith('\n'):
        # Only possible for the file's final line; stop the start marker from
        # being glued onto it.
        head[-1] = terminate(head[-1])

    # Skip over a previous block of ours, if there is a complete one.
    resume_at = insert_at
    if (insert_at < len(existing) and
            is_marker(existing[insert_at], start_marker)):
        for j in range(insert_at + 1, len(existing)):
            if is_marker(existing[j], end_marker):
                resume_at = j + 1  # line after end_marker
                break
        else:
            log.warning(
                "Start marker without end marker in {}; leaving existing "
                "lines in place".format(filename))

    block = [terminate(start_marker)]
    for line in lines:
        log.info("Adding {} line: {}".format(filename, repr(line)))
        block.append(terminate(line))
    block.append(terminate(end_marker))

    with open(filename, 'w') as f:
        f.writelines(head + block + existing[resume_at:])
def replace_in_file(filename: str,
                    changes: List[Tuple[str, str]],
                    count: int = -1,
                    encoding: str = 'utf8',
                    backup_suffix: str = "~") -> None:
    """
    Apply a series of literal string replacements to a text file, keeping a
    backup of the original if anything changed.

    Args:
        filename: file to modify
        changes: ``(old, new)`` pairs, applied in order; each replacement
            operates on the result of the previous one
        count: maximum number of occurrences to replace per pair, passed to
            ``str.replace`` (the default of -1 replaces all)
        encoding: encoding used to read and write the file
        backup_suffix: appended to ``filename`` to name the backup file

    If the replacements leave the content unchanged, the file is not
    rewritten and no backup is made. Otherwise the original file is moved to
    the backup name, overwriting any previous backup, and the new content is
    written under the original name.
    """
    log.info("Replacing code in file: {}".format(filename))

    with open(filename, encoding=encoding) as input_file:
        original_content = input_file.read()

    new_content = original_content
    for old, new in changes:
        new_content = new_content.replace(old, new, count)

    if new_content == original_content:
        log.info("... nothing to do")
        return

    backup_name = filename + backup_suffix
    # os.replace(), unlike os.rename(), overwrites an existing backup on
    # every platform; os.rename() raises FileExistsError on Windows.
    os.replace(filename, backup_name)
    log.info("... backup is: {}".format(repr(backup_name)))

    with open(filename, 'w', encoding=encoding) as output_file:
        output_file.write(new_content)
def main() -> None:
    """
    Command-line entry point: patch a MedEx-UIMA installation, then compile
    it.

    Steps, in order:

    - parse the command-line arguments and configure logging;
    - add extra route and frequency entries to ``resources/lexicon.cfg``;
    - insert frequency regexes into ``SemanticRuleEngine.java`` and into the
      TIMEX ``frequency_rules`` file, between CRATE marker lines;
    - apply known bug fixes to the MedEx Java source (a backup is kept of
      each file that changes);
    - optionally purge ``*.class`` files from the MedEx directory;
    - run the Java compiler on ``Main.java``, with output sent to ``bin``.

    Raises:
        subprocess.CalledProcessError: if the Java compiler exits with a
            non-zero status
    """
    # -------------------------------------------------------------------------
    # Arguments
    # -------------------------------------------------------------------------
    parser = argparse.ArgumentParser(
        description="Compile MedEx-UIMA itself (in Java)")
    parser.add_argument(
        '--medexdir', default=DEFAULT_MEDEX_DIR,
        help="Root directory of MedEx installation (default: {})".format(
            DEFAULT_MEDEX_DIR))
    parser.add_argument(
        '--javac', default=DEFAULT_JAVAC,
        help="Java compiler (default: {})".format(DEFAULT_JAVAC))
    parser.add_argument(
        '--deletefirst', action='store_true',
        help="Delete existing .class files first (optional)")
    parser.add_argument('--verbose', '-v', action='store_true',
                        help="Be verbose")
    args = parser.parse_args()

    # Resolve the MedEx directory ONCE, to an absolute path. We chdir into it
    # before compiling; if a relative --medexdir were used as-is, the
    # classpath and output directory built from it would then be resolved
    # against the new working directory and point to the wrong place.
    medexdir = os.path.abspath(args.medexdir)

    # -------------------------------------------------------------------------
    # Logging
    # -------------------------------------------------------------------------
    loglevel = logging.DEBUG if args.verbose else logging.INFO
    rootlogger = logging.getLogger()
    configure_logger_for_colour(rootlogger, level=loglevel)

    # -------------------------------------------------------------------------
    # Add lexicon entries
    # -------------------------------------------------------------------------
    lexfilename = os.path.join(medexdir, 'resources', 'lexicon.cfg')
    lexlines = [lex_route(route) for route in EXTRA_ROUTES]
    for frequency, _ in EXTRA_FREQUENCIES:
        lexlines.append(lex_freq(frequency))
        if '.' in frequency:
            # Variant with a space after each dot:
            lexlines.append(lex_freq(frequency.replace('.', '. ')))
            if frequency not in DO_NOT_REMOVE_DOTS:
                # Variant with no dots, e.g. "om" for "o.m."
                lexlines.append(lex_freq(frequency.replace('.', '')))
    add_lines_if_not_in(lexfilename, lexlines)

    # -------------------------------------------------------------------------
    # Add frequency tags to SemanticRuleEngine.java
    # -------------------------------------------------------------------------
    semengfilename = os.path.join(medexdir, 'src', 'org', 'apache', 'medex',
                                  'SemanticRuleEngine.java')
    semlines = [
        semantic_rule_engine_line(frequency,
                                  frequency not in DO_NOT_REMOVE_DOTS)
        for frequency, _ in EXTRA_FREQUENCIES
    ]
    add_lines_after_trigger(semengfilename, SEM_ENG_TRIGGER_LINE_TRIMMED,
                            SOURCE_START_MARKER, SOURCE_END_MARKER, semlines)

    # -------------------------------------------------------------------------
    # Add frequency tags to frequency_rules
    # -------------------------------------------------------------------------
    freqrulefilename = os.path.join(medexdir, 'resources', 'TIMEX', 'rules',
                                    'frequency_rules')
    frlines = [
        frequency_rules_line(frequency, timex,
                             frequency not in DO_NOT_REMOVE_DOTS)
        for frequency, timex in EXTRA_FREQUENCIES
    ]
    add_lines_after_trigger(freqrulefilename, FREQ_RULE_TRIGGER_LINE_TRIMMED,
                            SOURCE_START_MARKER, SOURCE_END_MARKER, frlines)

    # -------------------------------------------------------------------------
    # Fix bugs! Argh.
    # -------------------------------------------------------------------------
    # Each "wrong" string is replaced by its "right" string in the named
    # file. The "comment" entries are explanatory only and are never read.
    bugfixes = [
        {
            "filename": os.path.join(medexdir, 'src', 'org', 'apache',
                                     'NLPTools', 'Document.java'),
            "changes": [
                {
                    "comment": """
                        Medex confuses & and &&, leading to
                            Exception in thread "main" java.lang.StringIndexOutOfBoundsException: String index out of range: 2
                                at java.lang.String.charAt(Unknown Source)
                                at org.apache.NLPTools.Document.<init>(Document.java:134)
                                at org.apache.medex.MedTagger.run_batch_medtag(MedTagger.java:256)
                                at CrateMedexPipeline.processInput(CrateMedexPipeline.java:302)
                                at CrateMedexPipeline.<init>(CrateMedexPipeline.java:128)
                                at CrateMedexPipeline.main(CrateMedexPipeline.java:320)
                    """,  # noqa
                    "wrong": r"while(cur_pos<llen & (txt.charAt(cur_pos)==' ' || txt.charAt(cur_pos)=='\n' || txt.charAt(cur_pos)=='\r') ){",  # noqa
                    "right": r"while(cur_pos<llen && (txt.charAt(cur_pos)==' ' || txt.charAt(cur_pos)=='\n' || txt.charAt(cur_pos)=='\r') ){"  # noqa
                    # The fix: "&" after "cur_pos<llen" becomes "&&".
                },
            ],
        },
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        {
            "filename": os.path.join(medexdir, 'src', 'org', 'apache',
                                     'algorithms', 'SuffixArray.java'),
            "changes": [
                {
                    "comment": """
                        java.lang.StringIndexOutOfBoundsException: String index out of range: 1
                            at java.lang.String.charAt(Unknown Source)
                            at org.apache.algorithms.SuffixArray.construct_tree_word(SuffixArray.java:375)
                            at org.apache.algorithms.SuffixArray.re_build(SuffixArray.java:97)
                            at org.apache.algorithms.SuffixArray.<init>(SuffixArray.java:60)
                            at org.apache.medex.MedTagger.medtagging(MedTagger.java:359)
                            at org.apache.medex.MedTagger.run_batch_medtag(MedTagger.java:264)
                            at CrateMedexPipeline.processInput(CrateMedexPipeline.java:302)
                            at CrateMedexPipeline.<init>(CrateMedexPipeline.java:128)
                            at CrateMedexPipeline.main(CrateMedexPipeline.java:320)

                        Offending code in SuffixArray.java:

                            for (int i=0;i<this.N;i++){
                                int pos=this.SA[i];
                                if (this.otext.charAt(pos) != ' ' && this.otext.charAt(pos) != '\n' &&
                                    this.otext.charAt(pos) != this.end_char &&
                                    (pos == 0 || (this.otext.charAt(pos-1) == ' ' || this.otext.charAt(pos-1) == '\n'))){
                                    this.insert_SF_tree(this.SA[i], 0, 0); //# 0 denote the root in __SA;
                                }
                            }

                        The bug may relate to what's in SA[i]... but as a simple fix:
                    """,  # noqa
                    "wrong": r"if (this.otext.charAt(pos) != ' ' && this.otext.charAt(pos) != '\n' && this.otext.charAt(pos) != this.end_char && (pos == 0 || (this.otext.charAt(pos-1) == ' ' || this.otext.charAt(pos-1) == '\n'))){",  # noqa
                    "right": r"if (pos < this.otext.length() && this.otext.charAt(pos) != ' ' && this.otext.charAt(pos) != '\n' && this.otext.charAt(pos) != this.end_char && (pos == 0 || (this.otext.charAt(pos-1) == ' ' || this.otext.charAt(pos-1) == '\n'))){"  # noqa
                    # The fix: the added "pos < this.otext.length() &&" guard.
                },
            ],
        },
    ]  # type: List[Dict[str, Union[str, List[Dict[str, str]]]]]
    _ = """
        BUGS IN MEDEX-UIMA NOT YET FIXED:

        java.lang.ArrayIndexOutOfBoundsException: -1
            at java.util.Vector.elementData(Unknown Source)
            at java.util.Vector.get(Unknown Source)
            at org.apache.NLPTools.SentenceBoundary.detect_boundaries(SentenceBoundary.java:329)
            at org.apache.medex.MedTagger.medtagging(MedTagger.java:354)
            at org.apache.medex.MedTagger.run_batch_medtag(MedTagger.java:264)
            at CrateMedexPipeline.processInput(CrateMedexPipeline.java:312)
            at CrateMedexPipeline.runPipeline(CrateMedexPipeline.java:138)
            at CrateMedexPipeline.<init>(CrateMedexPipeline.java:112)
            at CrateMedexPipeline.main(CrateMedexPipeline.java:330)

        java.lang.NullPointerException
            at org.apache.algorithms.SuffixArray.search(SuffixArray.java:636)
            at org.apache.medex.MedTagger.medtagging(MedTagger.java:362)
            at org.apache.medex.MedTagger.run_batch_medtag(MedTagger.java:264)
            at CrateMedexPipeline.processInput(CrateMedexPipeline.java:312)
            at CrateMedexPipeline.runPipeline(CrateMedexPipeline.java:138)
            at CrateMedexPipeline.<init>(CrateMedexPipeline.java:112)
            at CrateMedexPipeline.main(CrateMedexPipeline.java:330)

        ... frankly, it's just badly written.
        That's clearly why it uses the "catch all exceptions" strategy, but
        one would imagine the errors are unintentional (certainly the &/&&
        one!) or else they wouldn't print a stack trace and chug on.
    """  # noqa
    for bf in bugfixes:
        filename = bf["filename"]
        changes = [
            (change["wrong"], change["right"]) for change in bf["changes"]
        ]  # type: List[Tuple[str, str]]
        replace_in_file(filename, changes)

    # -------------------------------------------------------------------------
    # Clean up first?
    # -------------------------------------------------------------------------
    if args.deletefirst:
        purge(medexdir, '*.class')

    # -------------------------------------------------------------------------
    # Compile
    # -------------------------------------------------------------------------
    bindir = os.path.join(medexdir, 'bin')
    # Older javac versions do not create the "-d" directory themselves.
    os.makedirs(bindir, exist_ok=True)
    classpath = os.pathsep.join([
        os.path.join(medexdir, 'src'),
        os.path.join(medexdir, 'lib', '*'),  # jar files
    ])
    classpath_options = ['-classpath', classpath]
    os.chdir(medexdir)  # the source path below is relative to this directory
    cmdargs = (
        [args.javac] +
        classpath_options +
        ['src/org/apache/medex/Main.java'] +
        # ... compiling this compiles everything else necessary
        ['-d', bindir]  # put the binaries here
    )
    log.info("Executing command: {}".format(cmdargs))
    subprocess.check_call(cmdargs)


if __name__ == '__main__':
    main()