Source code for crate_anon.nlp_manager.parse_clinical

#!/usr/bin/env python
# crate_anon/nlp_manager/parse_clinical.py

"""
===============================================================================

    Copyright (C) 2015-2018 Rudolf Cardinal (rudolf@pobox.com).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <http://www.gnu.org/licenses/>.

===============================================================================
"""

import logging
import sys
from typing import Any, Dict, Generator, List, Optional, Tuple

from sqlalchemy import Column, Integer, Float, String, Text

from crate_anon.nlp_manager.nlp_definition import NlpDefinition
from crate_anon.nlp_manager.regex_parser import (
    BaseNlpParser,
    common_tense,
    compile_regex,
    FN_CONTENT,
    FN_END,
    FN_RELATION,
    FN_RELATION_TEXT,
    FN_START,
    FN_TENSE,
    FN_TENSE_TEXT,
    FN_UNITS,
    FN_VALUE_TEXT,
    FN_VARIABLE_NAME,
    FN_VARIABLE_TEXT,
    HELP_CONTENT,
    HELP_END,
    HELP_RELATION,
    HELP_RELATION_TEXT,
    HELP_START,
    HELP_TENSE,
    HELP_UNITS,
    HELP_VALUE_TEXT,
    HELP_VARIABLE_TEXT,
    MAX_RELATION_LENGTH,
    MAX_RELATION_TEXT_LENGTH,
    MAX_TENSE_LENGTH,
    MAX_UNITS_LENGTH,
    MAX_VALUE_TEXT_LENGTH,
    NumericalResultParser,
    OPTIONAL_RESULTS_IGNORABLES,
    RELATION,
    SimpleNumericalResultParser,
    TENSE_INDICATOR,
    to_float,
    to_pos_float,
    ValidatorBase,
    WORD_BOUNDARY,
)
from crate_anon.nlp_manager.regex_numbers import SIGNED_FLOAT
from crate_anon.nlp_manager.regex_units import (
    assemble_units,
    CM,
    FEET,
    INCHES,
    KG,
    kg_from_st_lb_oz,
    KG_PER_SQ_M,
    LB,
    M,
    m_from_ft_in,
    m_from_m_cm,
    MM_HG,
    STONES,
)

log = logging.getLogger(__name__)


# =============================================================================
#  Anthropometrics
# =============================================================================

# -----------------------------------------------------------------------------
# Height
# -----------------------------------------------------------------------------

class Height(NumericalResultParser):
    """
    Height parser; copes with metric and imperial expressions.

    The regex looks for the word "height", then an optional tense indicator
    and an optional relation, then a value written as metres + centimetres,
    metres only, centimetres only, feet + inches, feet only, or inches only.
    The converted value is stored under the ``value_m`` column.
    """

    # Groups 4-12: the metric alternatives.
    METRIC_HEIGHT = r"""
        (                                   # capture group 4
            (?:
                ( {SIGNED_FLOAT} )          # capture group 5
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {M} )                     # capture group 6
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {SIGNED_FLOAT} )          # capture group 7
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {CM} )                    # capture group 8
            )
            | (?:
                ( {SIGNED_FLOAT} )          # capture group 9
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {M} )                     # capture group 10
            )
            | (?:
                ( {SIGNED_FLOAT} )          # capture group 11
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {CM} )                    # capture group 12
            )
        )
    """.format(
        SIGNED_FLOAT=SIGNED_FLOAT,
        OPTIONAL_RESULTS_IGNORABLES=OPTIONAL_RESULTS_IGNORABLES,
        M=M,
        CM=CM
    )
    # Groups 13-21: the imperial alternatives.
    IMPERIAL_HEIGHT = r"""
        (                                   # capture group 13
            (?:
                ( {SIGNED_FLOAT} )          # capture group 14
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {FEET} )                  # capture group 15
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {SIGNED_FLOAT} )          # capture group 16
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {INCHES} )                # capture group 17
            )
            | (?:
                ( {SIGNED_FLOAT} )          # capture group 18
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {FEET} )                  # capture group 19
            )
            | (?:
                ( {SIGNED_FLOAT} )          # capture group 20
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {INCHES} )                # capture group 21
            )
        )
    """.format(
        SIGNED_FLOAT=SIGNED_FLOAT,
        OPTIONAL_RESULTS_IGNORABLES=OPTIONAL_RESULTS_IGNORABLES,
        FEET=FEET,
        INCHES=INCHES
    )
    HEIGHT = r"(?: \b height \b)"
    REGEX = r"""
        ( {HEIGHT} )                       # group 1 for "height" or equivalent
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {TENSE_INDICATOR} )?             # optional group 2 for tense
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {RELATION} )?                    # optional group 3 for relation
        {OPTIONAL_RESULTS_IGNORABLES}
        (?:
            {METRIC_HEIGHT}
            | {IMPERIAL_HEIGHT}
        )
    """.format(
        HEIGHT=HEIGHT,
        OPTIONAL_RESULTS_IGNORABLES=OPTIONAL_RESULTS_IGNORABLES,
        TENSE_INDICATOR=TENSE_INDICATOR,
        RELATION=RELATION,
        METRIC_HEIGHT=METRIC_HEIGHT,
        IMPERIAL_HEIGHT=IMPERIAL_HEIGHT,
        SIGNED_FLOAT=SIGNED_FLOAT,
        KG_PER_SQ_M=KG_PER_SQ_M,
    )

    COMPILED_REGEX = compile_regex(REGEX)
    NAME = "Height"
    PREFERRED_UNIT_COLUMN = "value_m"

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False,
                 debug: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the superclass
            cfgsection: config section name, passed through to the superclass
            commit: passed through to the superclass
            debug: if true, print this parser's regex to stdout
        """
        super().__init__(
            nlpdef=nlpdef,
            cfgsection=cfgsection,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            regex_str_for_debugging=self.REGEX,
            commit=commit
        )
        if debug:
            print("Regex for {}: {}".format(type(self).__name__, self.REGEX))

    def parse(self, text: str,
              debug: bool = False) -> Generator[Tuple[str, Dict[str, Any]],
                                                None, None]:
        """
        Find height expressions in ``text``.

        Needs its own implementation because a single value may be spread
        over two number/unit pairs (e.g. feet and inches).

        Args:
            text: text to search
            debug: if true, print each regex match to stdout

        Yields:
            ``(tablename, result_dict)`` tuples, one per regex match
        """
        for match in self.COMPILED_REGEX.finditer(text):
            if debug:
                print("Match {} for {}".format(match, repr(text)))

            variable_text, tense_text, relation_text = match.group(1, 2, 3)
            metric_expression, imperial_expression = match.group(4, 13)

            expression = None
            value_m = None
            units = None

            if metric_expression:
                expression = metric_expression
                both_m, both_m_units, both_cm, both_cm_units = \
                    match.group(5, 6, 7, 8)
                only_m, only_m_units = match.group(9, 10)
                only_cm, only_cm_units = match.group(11, 12)
                if both_m and both_cm:
                    value_m = m_from_m_cm(
                        metres=to_pos_float(both_m),
                        centimetres=to_pos_float(both_cm)
                    )
                    units = assemble_units([both_m_units, both_cm_units])
                elif only_m:
                    value_m = to_pos_float(only_m)
                    units = only_m_units
                elif only_cm:
                    value_m = m_from_m_cm(centimetres=to_pos_float(only_cm))
                    units = only_cm_units
            elif imperial_expression:
                expression = imperial_expression
                both_ft, both_ft_units, both_in, both_in_units = \
                    match.group(14, 15, 16, 17)
                only_ft, only_ft_units = match.group(18, 19)
                only_in, only_in_units = match.group(20, 21)
                if both_ft and both_in:
                    value_m = m_from_ft_in(
                        feet=to_pos_float(both_ft),
                        inches=to_pos_float(both_in)
                    )
                    units = assemble_units([both_ft_units, both_in_units])
                elif only_ft:
                    value_m = m_from_ft_in(feet=to_pos_float(only_ft))
                    units = only_ft_units
                elif only_in:
                    value_m = m_from_ft_in(inches=to_pos_float(only_in))
                    units = only_in_units

            tense, relation = common_tense(tense_text, relation_text)

            yield self.tablename, {
                FN_VARIABLE_NAME: self.variable,
                FN_CONTENT: match.group(0),  # the whole matched text
                FN_START: match.start(),
                FN_END: match.end(),
                FN_VARIABLE_TEXT: variable_text,
                FN_RELATION_TEXT: relation_text,
                FN_RELATION: relation,
                FN_VALUE_TEXT: expression,
                FN_UNITS: units,
                self.target_unit: value_m,
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }

    def test(self, verbose: bool = False) -> None:
        """Run the parser against example strings with known answers."""
        five_foot_eight = m_from_ft_in(feet=5, inches=8)
        self.test_numerical_parser([
            ("Height", []),  # should fail; no values
            ("her height was 1.6m", [1.6]),
            ("Height = 1.23 m", [1.23]),
            ("her height is 1.5m", [1.5]),
            ('''Height 5'8" ''', [five_foot_eight]),
            ("Height 5 ft 8 in", [five_foot_eight]),
            ("Height 5 feet 8 inches", [five_foot_eight]),
        ], verbose=verbose)
        self.detailed_test("Height 5 ft 11 in", [{
            self.target_unit: m_from_ft_in(feet=5, inches=11),
            FN_UNITS: "ft in",
        }], verbose=verbose)
# TODO: Height NLP: deal with "tall" and plain "is", e.g.
#       she is 6'2"; she is 1.5m tall
class HeightValidator(ValidatorBase):
    """
    Validator companion to Height (see ValidatorBase for explanation).

    Its only regex is the bare "height" keyword pattern, Height.HEIGHT.
    """

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the superclass
            cfgsection: config section name, passed through to the superclass
            commit: passed through to the superclass
        """
        keyword_regexes = [Height.HEIGHT]
        super().__init__(
            nlpdef=nlpdef,
            cfgsection=cfgsection,
            regex_str_list=keyword_regexes,
            validated_variable=Height.NAME,
            commit=commit
        )
# -----------------------------------------------------------------------------
# Weight (mass)
# -----------------------------------------------------------------------------
class Weight(NumericalResultParser):
    """
    Weight (mass) parser; copes with metric and imperial expressions.

    The regex looks for "weight"/"weighs", then an optional tense indicator
    and an optional relation, then a value written in kilograms, stones +
    pounds, stones only, or pounds only. The converted value is stored under
    the ``value_kg`` column.
    """

    # Groups 4-6: the metric form.
    METRIC_WEIGHT = r"""
        (                                   # capture group 4
            ( {SIGNED_FLOAT} )              # capture group 5
            {OPTIONAL_RESULTS_IGNORABLES}
            ( {KG} )                        # capture group 6
        )
    """.format(
        SIGNED_FLOAT=SIGNED_FLOAT,
        OPTIONAL_RESULTS_IGNORABLES=OPTIONAL_RESULTS_IGNORABLES,
        KG=KG
    )
    # Groups 7-15: the imperial alternatives.
    IMPERIAL_WEIGHT = r"""
        (                                   # capture group 7
            (?:
                ( {SIGNED_FLOAT} )          # capture group 8
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {STONES} )                # capture group 9
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {SIGNED_FLOAT} )          # capture group 10
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {LB} )                    # capture group 11
            )
            | (?:
                ( {SIGNED_FLOAT} )          # capture group 12
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {STONES} )                # capture group 13
            )
            | (?:
                ( {SIGNED_FLOAT} )          # capture group 14
                {OPTIONAL_RESULTS_IGNORABLES}
                ( {LB} )                    # capture group 15
            )
        )
    """.format(
        SIGNED_FLOAT=SIGNED_FLOAT,
        OPTIONAL_RESULTS_IGNORABLES=OPTIONAL_RESULTS_IGNORABLES,
        STONES=STONES,
        LB=LB
    )
    WEIGHT = r"(?: \b weigh[ts] \b )"  # weight, weighs
    REGEX = r"""
        ( {WEIGHT} )                       # group 1 for "weight" or equivalent
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {TENSE_INDICATOR} )?             # optional group 2 for tense
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {RELATION} )?                    # optional group 3 for relation
        {OPTIONAL_RESULTS_IGNORABLES}
        (?:
            {METRIC_WEIGHT}
            | {IMPERIAL_WEIGHT}
        )
    """.format(
        WEIGHT=WEIGHT,
        OPTIONAL_RESULTS_IGNORABLES=OPTIONAL_RESULTS_IGNORABLES,
        TENSE_INDICATOR=TENSE_INDICATOR,
        RELATION=RELATION,
        METRIC_WEIGHT=METRIC_WEIGHT,
        IMPERIAL_WEIGHT=IMPERIAL_WEIGHT,
        SIGNED_FLOAT=SIGNED_FLOAT,
        KG_PER_SQ_M=KG_PER_SQ_M,
    )

    COMPILED_REGEX = compile_regex(REGEX)
    NAME = "Weight"
    PREFERRED_UNIT_COLUMN = "value_kg"

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False,
                 debug: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the superclass
            cfgsection: config section name, passed through to the superclass
            commit: passed through to the superclass
            debug: if true, print this parser's regex to stdout
        """
        super().__init__(
            nlpdef=nlpdef,
            cfgsection=cfgsection,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            regex_str_for_debugging=self.REGEX,
            commit=commit
        )
        if debug:
            print("Regex for {}: {}".format(type(self).__name__, self.REGEX))

    def parse(self, text: str,
              debug: bool = False) -> Generator[Tuple[str, Dict[str, Any]],
                                                None, None]:
        """
        Find weight expressions in ``text``.

        Needs its own implementation because a single value may be spread
        over two number/unit pairs (e.g. stones and pounds).

        Args:
            text: text to search
            debug: if true, print each regex match to stdout

        Yields:
            ``(tablename, result_dict)`` tuples, one per regex match
        """
        for match in self.COMPILED_REGEX.finditer(text):
            if debug:
                print("Match {} for {}".format(match, repr(text)))

            variable_text, tense_text, relation_text = match.group(1, 2, 3)
            metric_expression, imperial_expression = match.group(4, 7)

            expression = None
            value_kg = None
            units = None

            # Values are deliberately kept as signed floats, because text such
            # as "weight -0.3 kg" (a weight change) does occur.
            if metric_expression:
                expression = metric_expression
                value_kg = to_float(match.group(5))
                units = match.group(6)
            elif imperial_expression:
                expression = imperial_expression
                both_st, both_st_units, both_lb, both_lb_units = \
                    match.group(8, 9, 10, 11)
                only_st, only_st_units = match.group(12, 13)
                only_lb, only_lb_units = match.group(14, 15)
                if both_st and both_lb:
                    value_kg = kg_from_st_lb_oz(
                        stones=to_float(both_st),
                        pounds=to_float(both_lb)
                    )
                    units = assemble_units([both_st_units, both_lb_units])
                elif only_st:
                    value_kg = kg_from_st_lb_oz(stones=to_float(only_st))
                    units = only_st_units
                elif only_lb:
                    value_kg = kg_from_st_lb_oz(pounds=to_float(only_lb))
                    units = only_lb_units

            tense, relation = common_tense(tense_text, relation_text)

            yield self.tablename, {
                FN_VARIABLE_NAME: self.variable,
                FN_CONTENT: match.group(0),  # the whole matched text
                FN_START: match.start(),
                FN_END: match.end(),
                FN_VARIABLE_TEXT: variable_text,
                FN_RELATION_TEXT: relation_text,
                FN_RELATION: relation,
                FN_VALUE_TEXT: expression,
                FN_UNITS: units,
                self.target_unit: value_kg,
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }

    def test(self, verbose: bool = False) -> None:
        """Run the parser against example strings with known answers."""
        self.test_numerical_parser([
            ("Weight", []),  # should fail; no values
            ("her weight was 60.2kg", [60.2]),
            ("Weight = 52.3kg", [52.3]),
            ("Weight: 80.8kgs", [80.8]),
            ("she weighs 61kg", [61]),
            ("she weighs 61 kg", [61]),
            ("she weighs 61 kgs", [61]),
            ("she weighs 61 kilo", [61]),
            ("she weighs 61 kilos", [61]),
            ("she weighs 8 stones ", [kg_from_st_lb_oz(stones=8)]),
            ("she weighs 200 lb", [kg_from_st_lb_oz(pounds=200)]),
            ("she weighs 200 pounds", [kg_from_st_lb_oz(pounds=200)]),
            ("she weighs 6 st 12 lb",
             [kg_from_st_lb_oz(stones=6, pounds=12)]),
            ("change in weight -0.4kg", [-0.4]),
            ("change in weight - 0.4kg", [0.4]),  # ASCII hyphen (hyphen-minus)
            ("change in weight ‐ 0.4kg", [0.4]),  # Unicode hyphen
            # ("failme", [999]),
            ("change in weight −0.4kg", [-0.4]),  # Unicode minus
            ("change in weight –0.4kg", [-0.4]),  # en dash
            ("change in weight —0.4kg", [0.4]),  # em dash
        ], verbose=verbose)
        self.detailed_test("Weight: 80.8kgs", [{
            self.target_unit: 80.8,
            FN_UNITS: "kgs",
        }], verbose=verbose)
class WeightValidator(ValidatorBase):
    """
    Validator companion to Weight (see ValidatorBase for explanation).

    Its only regex is the bare "weight"/"weighs" keyword pattern,
    Weight.WEIGHT.
    """

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the superclass
            cfgsection: config section name, passed through to the superclass
            commit: passed through to the superclass
        """
        keyword_regexes = [Weight.WEIGHT]
        super().__init__(
            nlpdef=nlpdef,
            cfgsection=cfgsection,
            regex_str_list=keyword_regexes,
            validated_variable=Weight.NAME,
            commit=commit
        )
# -----------------------------------------------------------------------------
# Body mass index (BMI)
# -----------------------------------------------------------------------------
class Bmi(SimpleNumericalResultParser):
    """
    Body mass index parser (units: kg / m^2).

    All matching is delegated to SimpleNumericalResultParser; this class only
    supplies the regex, the variable name, the target column and the unit
    mapping.
    """

    BMI = r"""
        (?: {WORD_BOUNDARY}
            (?: BMI | body \s+ mass \s+ index )
        {WORD_BOUNDARY} )
    """.format(WORD_BOUNDARY=WORD_BOUNDARY)
    REGEX = r"""
        ( {BMI} )                          # group for "BMI" or equivalent
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {TENSE_INDICATOR} )?             # optional group for tense indicator
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {RELATION} )?                    # optional group for relation
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {SIGNED_FLOAT} )                 # group for value
        {OPTIONAL_RESULTS_IGNORABLES}
        (                                  # group for units
            {KG_PER_SQ_M}
        )?
    """.format(
        BMI=BMI,
        OPTIONAL_RESULTS_IGNORABLES=OPTIONAL_RESULTS_IGNORABLES,
        TENSE_INDICATOR=TENSE_INDICATOR,
        RELATION=RELATION,
        SIGNED_FLOAT=SIGNED_FLOAT,
        KG_PER_SQ_M=KG_PER_SQ_M,
    )

    NAME = "BMI"
    PREFERRED_UNIT_COLUMN = "value_kg_per_sq_m"
    UNIT_MAPPING = {
        KG_PER_SQ_M: 1,  # preferred unit
    }
    # TODO: deal with "a BMI of 30"?

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the superclass
            cfgsection: config section name, passed through to the superclass
            commit: passed through to the superclass
        """
        super().__init__(
            nlpdef=nlpdef,
            cfgsection=cfgsection,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            take_absolute=True
        )

    def test(self, verbose: bool = False) -> None:
        """Run the parser against example strings with known answers."""
        examples = [
            ("BMI", []),  # should fail; no values
            ("body mass index was 30", [30]),
            ("his BMI (30) is too high", [30]),
            ("BMI 25 kg/sq m", [25]),
            ("BMI was 18.4 kg/m^-2", [18.4]),
            ("ACE 79", []),
            ("BMI-23", [23]),
        ]
        self.test_numerical_parser(examples, verbose=verbose)
class BmiValidator(ValidatorBase):
    """
    Validator companion to Bmi (see ValidatorBase for explanation).

    Its only regex is the bare BMI keyword pattern, Bmi.BMI.
    """

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the superclass
            cfgsection: config section name, passed through to the superclass
            commit: passed through to the superclass
        """
        keyword_regexes = [Bmi.BMI]
        super().__init__(
            nlpdef=nlpdef,
            cfgsection=cfgsection,
            regex_str_list=keyword_regexes,
            validated_variable=Bmi.NAME,
            commit=commit
        )
# =============================================================================
#  Bedside investigations: BP
# =============================================================================
class Bp(BaseNlpParser):
    """
    Blood pressure parser, in mmHg.

    One match can produce two variables (systolic and diastolic BP), and the
    logic is a little more complex than NumeratorOutOfDenominatorParser
    offers, so this class derives from BaseNlpParser directly.
    """

    BP = r"(?: \b blood \s+ pressure \b | \b B\.?P\.? \b )"
    SYSTOLIC_BP = r"(?: \b systolic \s+ {BP} | \b S\.?B\.?P\.? \b )".format(
        BP=BP)
    DIASTOLIC_BP = r"(?: \b diastolic \s+ {BP} | \b D\.?B\.?P\.? \b )".format(
        BP=BP)

    TWO_NUMBER_BP = r"""
        ( {SIGNED_FLOAT} )
        \s* (?: \b over \b | \/ ) \s*
        ( {SIGNED_FLOAT} )
    """.format(SIGNED_FLOAT=SIGNED_FLOAT)
    ONE_NUMBER_BP = SIGNED_FLOAT

    COMPILED_BP = compile_regex(BP)
    COMPILED_SBP = compile_regex(SYSTOLIC_BP)
    COMPILED_DBP = compile_regex(DIASTOLIC_BP)
    COMPILED_ONE_NUMBER_BP = compile_regex(ONE_NUMBER_BP)
    COMPILED_TWO_NUMBER_BP = compile_regex(TWO_NUMBER_BP)

    REGEX = r"""
        (                               # group for "BP" or equivalent
            {SYSTOLIC_BP}               # ... from more to less specific
            | {DIASTOLIC_BP}
            | {BP}
        )
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {TENSE_INDICATOR} )?          # optional group for tense indicator
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {RELATION} )?                 # optional group for relation
        {OPTIONAL_RESULTS_IGNORABLES}
        (
            {SIGNED_FLOAT}                      # systolic
            (?:
                \s* (?: \b over \b | \/ ) \s*   # /
                {SIGNED_FLOAT}                  # diastolic
            )?
        )
        {OPTIONAL_RESULTS_IGNORABLES}
        (                               # group for units
            {MM_HG}
        )?
    """.format(
        BP=BP,
        SYSTOLIC_BP=SYSTOLIC_BP,
        DIASTOLIC_BP=DIASTOLIC_BP,
        OPTIONAL_RESULTS_IGNORABLES=OPTIONAL_RESULTS_IGNORABLES,
        TENSE_INDICATOR=TENSE_INDICATOR,
        RELATION=RELATION,
        SIGNED_FLOAT=SIGNED_FLOAT,
        MM_HG=MM_HG,
    )
    COMPILED_REGEX = compile_regex(REGEX)

    FN_SYSTOLIC_BP_MMHG = 'systolic_bp_mmhg'
    FN_DIASTOLIC_BP_MMHG = 'diastolic_bp_mmhg'

    NAME = "BP"
    UNIT_MAPPING = {
        MM_HG: 1,  # preferred unit
    }

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition; may be None (for debugging only), in
                which case the destination table name is left blank
            cfgsection: config section name; ``desttable`` is read from it
            commit: passed through to the superclass
        """
        super().__init__(
            nlpdef=nlpdef,
            cfgsection=cfgsection,
            commit=commit
        )
        # nlpdef is only None for debugging!
        self.tablename = (
            '' if nlpdef is None
            else nlpdef.opt_str(cfgsection, 'desttable', required=True)
        )

    @classmethod
    def print_info(cls, file=sys.stdout):
        """Write a one-line description and the main regex to ``file``."""
        description = "Blood pressure finder. Regular expression: \n{}".format(
            cls.REGEX)
        print(description, file=file)

    def dest_tables_columns(self) -> Dict[str, List[Column]]:
        """Return the destination table name mapped to its result columns."""
        columns = [
            Column(FN_CONTENT, Text, doc=HELP_CONTENT),
            Column(FN_START, Integer, doc=HELP_START),
            Column(FN_END, Integer, doc=HELP_END),
            Column(FN_VARIABLE_TEXT, Text, doc=HELP_VARIABLE_TEXT),
            Column(FN_RELATION_TEXT, String(MAX_RELATION_TEXT_LENGTH),
                   doc=HELP_RELATION_TEXT),
            Column(FN_RELATION, String(MAX_RELATION_LENGTH),
                   doc=HELP_RELATION),
            Column(FN_VALUE_TEXT, String(MAX_VALUE_TEXT_LENGTH),
                   doc=HELP_VALUE_TEXT),
            Column(FN_UNITS, String(MAX_UNITS_LENGTH), doc=HELP_UNITS),
            Column(self.FN_SYSTOLIC_BP_MMHG, Float,
                   doc="Systolic blood pressure in mmHg"),
            Column(self.FN_DIASTOLIC_BP_MMHG, Float,
                   doc="Diastolic blood pressure in mmHg"),
            Column(FN_TENSE, String(MAX_TENSE_LENGTH), doc=HELP_TENSE),
        ]
        return {self.tablename: columns}

    def parse(self, text: str,
              debug: bool = False) -> Generator[Tuple[str, Dict[str, Any]],
                                                None, None]:
        """
        Find blood pressure expressions in ``text``.

        Needs its own implementation because two numbers (systolic and
        diastolic) may be fetched from one match. Matches from which neither
        number can be extracted are skipped.

        Args:
            text: text to search
            debug: if true, print each regex match to stdout

        Yields:
            ``(tablename, result_dict)`` tuples
        """
        for match in self.COMPILED_REGEX.finditer(text):
            if debug:
                print("Match {} for {}".format(match, repr(text)))

            (variable_text, tense_indicator, relation_text,
             value_text, units) = match.group(1, 2, 3, 4, 5)

            systolic = None
            diastolic = None
            # Test the keyword from most to least specific.
            # NOTE(review): presumably match() anchors only at the start, so
            # a value such as "120/80" after "systolic BP" would be handed
            # whole to to_pos_float() -- confirm that is handled as intended.
            if self.COMPILED_SBP.match(variable_text):
                if self.COMPILED_ONE_NUMBER_BP.match(value_text):
                    systolic = to_pos_float(value_text)
            elif self.COMPILED_DBP.match(variable_text):
                if self.COMPILED_ONE_NUMBER_BP.match(value_text):
                    diastolic = to_pos_float(value_text)
            elif self.COMPILED_BP.match(variable_text):
                pair = self.COMPILED_TWO_NUMBER_BP.match(value_text)
                if pair:
                    systolic = to_pos_float(pair.group(1))
                    diastolic = to_pos_float(pair.group(2))

            if systolic is None and diastolic is None:
                # This is OK; e.g. "BP 110", which we will ignore.
                continue

            tense, relation = common_tense(tense_indicator, relation_text)

            yield self.tablename, {
                FN_CONTENT: match.group(0),  # the whole matched text
                FN_START: match.start(),
                FN_END: match.end(),
                FN_VARIABLE_TEXT: variable_text,
                FN_RELATION_TEXT: relation_text,
                FN_RELATION: relation,
                FN_VALUE_TEXT: value_text,
                FN_UNITS: units,
                self.FN_SYSTOLIC_BP_MMHG: systolic,
                self.FN_DIASTOLIC_BP_MMHG: diastolic,
                FN_TENSE: tense,
            }

    def test_bp_parser(
            self,
            test_expected_list: List[
                Tuple[str, List[Tuple[float, float]]]
            ],
            verbose: bool = False) -> None:
        """
        Check parse() against strings with known (systolic, diastolic) pairs.

        Args:
            test_expected_list: list of ``(test_string, expected_pairs)``
            verbose: if true, also print the main regex

        Raises:
            AssertionError: if the parsed pairs differ from the expected ones
        """
        parser_name = type(self).__name__
        print("Testing parser: {}".format(parser_name))
        if verbose:
            print("... regex:\n{}".format(self.REGEX))
        for test_string, expected_values in test_expected_list:
            actual_values = [
                (row[self.FN_SYSTOLIC_BP_MMHG], row[self.FN_DIASTOLIC_BP_MMHG])
                for _, row in self.parse(test_string)
            ]
            assert actual_values == expected_values, (
                "Parser {name}: Expected {expected}, got {actual}, when "
                "parsing {test_string}; full result={full}".format(
                    name=parser_name,
                    expected=expected_values,
                    actual=actual_values,
                    test_string=repr(test_string),
                    full=repr(list(self.parse(test_string))),
                )
            )
        print("... OK")

    def test(self, verbose: bool = False) -> None:
        """Run the parser against example strings with known answers."""
        examples = [
            ("BP", []),  # should fail; no values
            ("his blood pressure was 120/80", [(120, 80)]),
            ("BP 120/80 mmhg", [(120, 80)]),
            ("systolic BP 120", [(120, None)]),
            ("diastolic BP 80", [(None, 80)]),
            ("BP-130/70", [(130, 70)]),
            ("BP 110 /80", [(110, 80)]),
            ("BP 110 /80 -", [(110, 80)]),  # real example
            ("BP 120 / 70 -", [(120, 70)]),  # real example
            ("BP :115 / 70 -", [(115, 70)]),  # real example
            ("B.P 110", []),  # real example
        ]
        self.test_bp_parser(examples, verbose=verbose)
# Notes on blood pressure parsing:
# 1. Unsure if best to take abs value.
#    One reason not to might be if people express changes, e.g.
#    "BP change -40/-10", but I very much doubt it.
#    Went with abs value using to_pos_float().
# 2. "BP 110" - too unreliable; not definitely a blood pressure.
class BpValidator(ValidatorBase):
    """
    Validator companion to Bp (see ValidatorBase for explanation).

    NOTE(review): the other validators in this file pass only the
    variable-name regex (e.g. Height.HEIGHT), whereas this one passes the
    full Bp.REGEX -- confirm that this is intended rather than Bp.BP.
    """

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the superclass
            cfgsection: config section name, passed through to the superclass
            commit: passed through to the superclass
        """
        validator_regexes = [Bp.REGEX]
        super().__init__(
            nlpdef=nlpdef,
            cfgsection=cfgsection,
            regex_str_list=validator_regexes,
            validated_variable=Bp.NAME,
            commit=commit
        )
# =============================================================================
#  Command-line entry point
# =============================================================================

def test_all(verbose: bool = False) -> None:
    """
    Run the self-test of every parser defined in this module.

    Each parser is built with no NLP definition and no config section, then
    its test() method is called.

    Args:
        verbose: passed through to each parser's test() method
    """
    for parser_class in (Height, Weight, Bmi, Bp):
        parser = parser_class(None, None)
        parser.test(verbose=verbose)


if __name__ == '__main__':
    test_all(verbose=True)