# Source code for crate_anon.nlp_manager.parse_haematology

#!/usr/bin/env python
# crate_anon/nlp_manager/parse_haematology.py

"""
===============================================================================

    Copyright (C) 2015-2018 Rudolf Cardinal (rudolf@pobox.com).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <http://www.gnu.org/licenses/>.

===============================================================================
"""

import logging
from typing import Optional

from crate_anon.nlp_manager.nlp_definition import NlpDefinition
from crate_anon.nlp_manager.regex_parser import (
    OPTIONAL_RESULTS_IGNORABLES,
    RELATION,
    SimpleNumericalResultParser,
    TENSE_INDICATOR,
    ValidatorBase,
    WORD_BOUNDARY,
)
from crate_anon.nlp_manager.regex_numbers import SIGNED_FLOAT
from crate_anon.nlp_manager.regex_units import (
    BILLION_PER_L,
    CELLS_PER_CUBIC_MM,
    MG_PER_DL,
    MG_PER_L,
    MM_PER_H,
    PERCENT,
)

log = logging.getLogger(__name__)


# =============================================================================
#  Erythrocyte sedimentation rate (ESR)
# =============================================================================

class Esr(SimpleNumericalResultParser):
    """
    Erythrocyte sedimentation rate (ESR).

    Builds a regex that looks for an ESR label, optionally followed by a
    tense indicator and a relation, then a number, then optional units.
    Only mm/h is listed in ``UNIT_MAPPING``; mg/dL and mg/L are matched by the
    regex purely so that they are recognised as the *wrong* units (the
    self-test expects no value to be produced for them).
    """

    # Pattern for the ESR label itself; also reused by EsrValidator.
    # NOTE(review): the layout and inline "#" comments in these patterns
    # presumably rely on the base class compiling with re.VERBOSE -- confirm.
    ESR = r"""
        (?: {WORD_BOUNDARY}
            (?: (?: Erythrocyte [\s]+ sed(?:\.|imentation)? [\s]+ rate)
                | (?:ESR) )
        {WORD_BOUNDARY} )
    """.format(WORD_BOUNDARY=WORD_BOUNDARY)

    # Full pattern: label, optional tense, optional relation, value, units.
    REGEX = r"""
        ( {ESR} )                           # group for "ESR" or equivalent
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {TENSE_INDICATOR} )?              # optional group for tense indicator
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {RELATION} )?                     # optional group for relation
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {SIGNED_FLOAT} )                  # group for value
        {OPTIONAL_RESULTS_IGNORABLES}
        (                                   # optional group for units
            {MM_PER_H}                          # good
            | {MG_PER_DL}                       # bad
            | {MG_PER_L}                        # bad
        )?
    """.format(
        ESR=ESR,
        OPTIONAL_RESULTS_IGNORABLES=OPTIONAL_RESULTS_IGNORABLES,
        TENSE_INDICATOR=TENSE_INDICATOR,
        RELATION=RELATION,
        SIGNED_FLOAT=SIGNED_FLOAT,
        MM_PER_H=MM_PER_H,
        MG_PER_DL=MG_PER_DL,
        MG_PER_L=MG_PER_L,
    )
    NAME = "ESR"
    PREFERRED_UNIT_COLUMN = "value_mm_h"
    UNIT_MAPPING = {
        MM_PER_H: 1,       # preferred unit
        # not MG_PER_DL, MG_PER_L
    }

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Configure the base parser for ESR.

        Args:
            nlpdef: NLP definition, passed through to the base class
                (may be ``None``).
            cfgsection: config section name, passed through to the base
                class (may be ``None``).
            commit: passed through to the base class.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfgsection=cfgsection,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # A value written as "ESR-18" is expected to yield 18 (see the
            # self-test below), so the sign is discarded.
            take_absolute=True
        )

    def test(self, verbose: bool = False) -> None:
        """
        Run the self-test cases through ``test_numerical_parser``.

        Each entry pairs a source string with the list of values expected
        to be extracted from it.
        """
        self.test_numerical_parser([
            ("ESR (should fail)", []),  # should fail; no values
            ("ESR 6 (should succeed)", [6]),
            ("ESR = 6", [6]),
            ("ESR 6 mm/h", [6]),
            ("ESR <10", [10]),
            ("ESR <10 mm/hr", [10]),
            ("ESR >100", [100]),
            ("ESR >100 mm/hour", [100]),
            ("ESR was 62", [62]),
            ("ESR was 62 mm/h", [62]),
            ("ESR was 62 (H) mm/h", [62]),
            ("ESR was 62 mg/dl (should fail, wrong units)", []),
            ("Erythrocyte sed. rate was 19", [19]),
            ("his erythrocyte sedimentation rate was 19", [19]),
            ("erythrocyte sedimentation rate was 19", [19]),
            ("ESR 1.9 mg/L", []),  # wrong units
            ("ESR 1.9 (H) mg/L", []),  # wrong units
            ("ESR        |       1.9 (H)      | mg/L", []),
            ("my ESR was 15, but his ESR was 89!", [15, 89]),
            ("ESR-18", [18]),
        ], verbose=verbose)
class EsrValidator(ValidatorBase):
    """
    Validator for Esr (see ValidatorBase for explanation).

    Uses only the label pattern ``Esr.ESR`` (not the full value-extracting
    regex) and reports against the variable name ``Esr.NAME``.
    """

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the base class.
            cfgsection: config section name, passed through to the base
                class.
            commit: passed through to the base class.
        """
        super().__init__(nlpdef=nlpdef,
                         cfgsection=cfgsection,
                         regex_str_list=[Esr.ESR],
                         validated_variable=Esr.NAME,
                         commit=commit)
# =============================================================================
#  White blood cell count and differential
# =============================================================================
# Do NOT accept my handwritten abbreviations with slashed zeros, e.g.
#       L0      lymphocytes
#       N0      neutrophils
#       M0      monocytes
#       B0      basophils
#       E0      eosinophils
# ... too likely that these are interpreted in wrong contexts, particularly
# if we are not allowing units, like "M0 3": macrophages 3 x 10^9/L, or part
# of "T2 N0 M0 ..." cancer staging?
class WbcBase(SimpleNumericalResultParser):
    """
    DO NOT USE DIRECTLY. White cell count base class.

    Subclasses supply a regex for the cell-type label and a variable name;
    this class wraps that label in the shared "label, optional tense,
    optional relation, value, optional units" pattern and fixes the target
    unit and unit conversion factors.
    """
    PREFERRED_UNIT_COLUMN = "value_billion_per_l"
    UNIT_MAPPING = {
        BILLION_PER_L: 1,     # preferred unit: 10^9 / L
        CELLS_PER_CUBIC_MM: 0.001,  # 1000 cells/mm^3  ->  1 x 10^9 / L
        # but NOT percent (too hard to interpret relative differentials
        # reliably)
    }

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 cell_type_regex_text: str,
                 variable: str,
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the base class.
            cfgsection: config section name, passed through to the base
                class.
            cell_type_regex_text: regex text matching the cell-type label
                (e.g. ``Wbc.WBC``); embedded into the full pattern by
                :meth:`make_wbc_regex`.
            variable: variable name passed to the base class.
            commit: passed through to the base class.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfgsection=cfgsection,
            regex_str=self.make_wbc_regex(cell_type_regex_text),
            variable=variable,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            take_absolute=True
        )

    @staticmethod
    def make_wbc_regex(cell_type_regex_text: str) -> str:
        """
        Return the full regex text for a given cell-type label regex.

        Percent is matched as a unit but is absent from ``UNIT_MAPPING``, so
        that percentage differentials can be recognised and ignored.

        NOTE(review): the template spans several lines and contains inline
        "#" comments, so it presumably must be compiled with re.VERBOSE by
        the base class -- confirm.
        """
        return r"""
            ({CELL_TYPE})                   # group for cell type name
            {OPTIONAL_RESULTS_IGNORABLES}
            ({TENSE_INDICATOR})?            # optional group for tense indicator
            {OPTIONAL_RESULTS_IGNORABLES}
            ({RELATION})?                   # optional group for relation
            {OPTIONAL_RESULTS_IGNORABLES}
            ({SIGNED_FLOAT})                # group for value
            {OPTIONAL_RESULTS_IGNORABLES}
            (                               # optional units, good and bad
                {BILLION_PER_L}                 # good
                | {CELLS_PER_CUBIC_MM}          # good
                | {PERCENT}                     # bad, so we can ignore it
            )?
        """.format(
            CELL_TYPE=cell_type_regex_text,
            OPTIONAL_RESULTS_IGNORABLES=OPTIONAL_RESULTS_IGNORABLES,
            TENSE_INDICATOR=TENSE_INDICATOR,
            RELATION=RELATION,
            SIGNED_FLOAT=SIGNED_FLOAT,
            BILLION_PER_L=BILLION_PER_L,
            CELLS_PER_CUBIC_MM=CELLS_PER_CUBIC_MM,
            PERCENT=PERCENT,
        )
# -----------------------------------------------------------------------------
# WBC
# -----------------------------------------------------------------------------
class Wbc(WbcBase):
    """
    White cell count (WBC, WCC).

    The label pattern accepts spelled-out forms ("white cells",
    "white blood cell count", optionally followed by a bracketed or bare
    abbreviation) as well as the bare abbreviations WBC(s), WBCC and WCC.
    """

    # Label pattern; also reused by WbcValidator.
    WBC = r"""
        (?: \b (?:
            (?:                 # White blood cells, white cell count, etc.
                White\b [\s]* (?:\bblood\b)? [\s]* \bcell[s]?\b
                [\s]* (?:\bcount\b)? [\s]*
                (?:     # optional suffix WBC, (WBC), (WBCC), (WCC), etc.
                    [\(]? (?: WBC | WBCC | WCC) [\)]?
                )?
            )
            | (?:               # just WBC(s), WBCC, WCC
                (?: WBC[s]? | WBCC | WCC )
            )
        ) \b )
    """
    NAME = "WBC"

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the base class.
            cfgsection: config section name, passed through to the base
                class.
            commit: passed through to the base class.
        """
        super().__init__(nlpdef=nlpdef,
                         cfgsection=cfgsection,
                         commit=commit,
                         cell_type_regex_text=self.WBC,
                         variable=self.NAME)

    def test(self, verbose: bool = False) -> None:
        """
        Run the self-test cases through ``test_numerical_parser``.

        Each entry pairs a source string with the list of values expected
        to be extracted from it (in units of 10^9/L).
        """
        self.test_numerical_parser([
            ("WBC (should fail)", []),  # should fail; no values
            ("WBC 6", [6]),
            ("WBC = 6", [6]),
            ("WBC 6 x 10^9/L", [6]),
            ("WBC 6 x 10 ^ 9 / L", [6]),
            ("WCC 6.2", [6.2]),
            ("white cells 6.2", [6.2]),
            ("white cells 6.2", [6.2]),
            ("white cells 9800/mm3", [9.8]),
            ("white cells 9800 cell/mm3", [9.8]),
            ("white cells 9800 cells/mm3", [9.8]),
            ("white cells 9800 per cubic mm", [9.8]),
            ("white cells 17,600/mm3", [17.6]),
            ("WBC – 6", [6]),  # en dash
            ("WBC—6", [6]),  # em dash
            ("WBC -- 6", [6]),  # double hyphen used as dash
            ("WBC - 6", [6]),
            ("WBC-6.5", [6.5]),
        ], verbose=verbose)
class WbcValidator(ValidatorBase):
    """
    Validator for Wbc (see ValidatorBase for explanation).

    Uses only the label pattern ``Wbc.WBC`` and reports against the variable
    name ``Wbc.NAME``.
    """

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the base class.
            cfgsection: config section name, passed through to the base
                class.
            commit: passed through to the base class.
        """
        super().__init__(nlpdef=nlpdef,
                         cfgsection=cfgsection,
                         regex_str_list=[Wbc.WBC],
                         validated_variable=Wbc.NAME,
                         commit=commit)
# -----------------------------------------------------------------------------
# Neutrophils
# -----------------------------------------------------------------------------
class Neutrophils(WbcBase):
    """
    Neutrophil count (absolute).

    The label pattern accepts an optional leading "absolute", then any of
    neut / neutr / neutro / neutrophil (with optional plural "s") or "N0",
    then an optional trailing "count".
    """

    # Label pattern; also reused by NeutrophilsValidator.
    # NOTE(review): "N0" IS accepted here (and the self-test expects it to
    # match), unlike L0/M0/B0/E0 in the sibling classes, and despite the
    # warning in this file's white-cell section comment about confusion with
    # "T2 N0 M0" cancer staging. Confirm whether this is intentional.
    NEUTROPHILS = r"""
        (?:
            (?: \b absolute \s* )?
            \b (?: Neut(?:r(?:o(?:phil)?)?)?s? | N0 ) \b
            (?: \s* count \b )?
        )
    """
    NAME = "neutrophils"

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the base class.
            cfgsection: config section name, passed through to the base
                class.
            commit: passed through to the base class.
        """
        super().__init__(nlpdef=nlpdef,
                         cfgsection=cfgsection,
                         commit=commit,
                         cell_type_regex_text=self.NEUTROPHILS,
                         variable=self.NAME)

    def test(self, verbose: bool = False) -> None:
        """
        Run the self-test cases through ``test_numerical_parser``.

        Each entry pairs a source string with the list of values expected
        to be extracted from it (in units of 10^9/L).
        """
        self.test_numerical_parser([
            ("neutrophils (should fail)", []),  # should fail; no values
            ("absolute neutrophil count 6", [6]),
            ("neuts = 6", [6]),
            ("N0 6 x 10^9/L", [6]),
            ("neutrophil count 6 x 10 ^ 9 / L", [6]),
            ("neutrs 6.2", [6.2]),
            ("neutrophil 6.2", [6.2]),
            ("neutrophils 6.2", [6.2]),
            ("n0 9800/mm3", [9.8]),
            ("absolute neutrophils 9800 cell/mm3", [9.8]),
            ("neutrophils count 9800 cells/mm3", [9.8]),
            ("n0 9800 per cubic mm", [9.8]),
            ("n0 17,600/mm3", [17.6]),
            ("neuts-17", [17]),
        ], verbose=verbose)
class NeutrophilsValidator(ValidatorBase):
    """
    Validator for Neutrophils (see ValidatorBase for explanation).

    Uses only the label pattern ``Neutrophils.NEUTROPHILS`` and reports
    against the variable name ``Neutrophils.NAME``.
    """

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the base class.
            cfgsection: config section name, passed through to the base
                class.
            commit: passed through to the base class.
        """
        super().__init__(nlpdef=nlpdef,
                         cfgsection=cfgsection,
                         regex_str_list=[Neutrophils.NEUTROPHILS],
                         validated_variable=Neutrophils.NAME,
                         commit=commit)
# -----------------------------------------------------------------------------
# Lymphocytes
# -----------------------------------------------------------------------------
class Lymphocytes(WbcBase):
    """
    Lymphocyte count (absolute).

    The label pattern accepts an optional leading "absolute", then any of
    lymph / lympho / lymphocyte (with optional plural "s"), then an optional
    trailing "count". The handwritten abbreviation "L0" is deliberately not
    accepted (the self-test expects it to fail).
    """

    # Label pattern; also reused by LymphocytesValidator.
    LYMPHOCYTES = r"""
        (?:
            (?: \b absolute \s* )?
            \b Lymph(?:o(?:cyte)?)?s? \b
            (?: \s* count \b )?
        )
    """
    NAME = "lymphocytes"

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the base class.
            cfgsection: config section name, passed through to the base
                class.
            commit: passed through to the base class.
        """
        super().__init__(nlpdef=nlpdef,
                         cfgsection=cfgsection,
                         commit=commit,
                         cell_type_regex_text=self.LYMPHOCYTES,
                         variable=self.NAME)

    def test(self, verbose: bool = False) -> None:
        """
        Run the self-test cases through ``test_numerical_parser``.

        Each entry pairs a source string with the list of values expected
        to be extracted from it (in units of 10^9/L).
        """
        self.test_numerical_parser([
            ("lymphocytes (should fail)", []),  # should fail; no values
            ("absolute lymphocyte count 6", [6]),
            ("lymphs = 6", [6]),
            ("L0 6 x 10^9/L (should fail)", []),
            ("lymphocyte count 6 x 10 ^ 9 / L", [6]),
            ("lymphs 6.2", [6.2]),
            ("lymph 6.2", [6.2]),
            ("lympho 6.2", [6.2]),
            ("lymphos 9800/mm3", [9.8]),
            ("absolute lymphocytes 9800 cell/mm3", [9.8]),
            ("lymphocytes count 9800 cells/mm3", [9.8]),
            ("l0 9800 per cubic mm (should fail)", []),
            ("l0 17,600/mm3 (should fail)", []),
            ("lymphs-6.3", [6.3]),
        ], verbose=verbose)
class LymphocytesValidator(ValidatorBase):
    """
    Validator for Lymphocytes (see ValidatorBase for explanation).

    Uses only the label pattern ``Lymphocytes.LYMPHOCYTES`` and reports
    against the variable name ``Lymphocytes.NAME``.
    """

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the base class.
            cfgsection: config section name, passed through to the base
                class.
            commit: passed through to the base class.
        """
        super().__init__(nlpdef=nlpdef,
                         cfgsection=cfgsection,
                         regex_str_list=[Lymphocytes.LYMPHOCYTES],
                         validated_variable=Lymphocytes.NAME,
                         commit=commit)
# -----------------------------------------------------------------------------
# Monocytes
# -----------------------------------------------------------------------------
class Monocytes(WbcBase):
    """
    Monocyte count (absolute).

    The label pattern accepts an optional leading "absolute", then
    mono / monocyte (with optional plural "s"), then an optional trailing
    "count". The handwritten abbreviation "M0" is deliberately not accepted
    (the self-test expects it to fail).
    """

    # Label pattern; also reused by MonocytesValidator.
    MONOCYTES = r"""
        (?:
            (?: \b absolute \s* )?
            \b Mono(?:cyte)?s? \b
            (?: \s* count \b )?
        )
    """
    NAME = "monocytes"

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the base class.
            cfgsection: config section name, passed through to the base
                class.
            commit: passed through to the base class.
        """
        super().__init__(nlpdef=nlpdef,
                         cfgsection=cfgsection,
                         commit=commit,
                         cell_type_regex_text=self.MONOCYTES,
                         variable=self.NAME)

    def test(self, verbose: bool = False) -> None:
        """
        Run the self-test cases through ``test_numerical_parser``.

        Each entry pairs a source string with the list of values expected
        to be extracted from it (in units of 10^9/L).
        """
        self.test_numerical_parser([
            ("monocytes (should fail)", []),  # should fail; no values
            ("absolute monocyte count 6", [6]),
            ("monos = 6", [6]),
            ("M0 6 x 10^9/L (should fail)", []),
            ("monocyte count 6 x 10 ^ 9 / L", [6]),
            ("monos 6.2", [6.2]),
            ("mono 6.2", [6.2]),
            ("monos 9800/mm3", [9.8]),
            ("absolute mono 9800 cell/mm3", [9.8]),
            ("monocytes count 9800 cells/mm3", [9.8]),
            ("m0 9800 per cubic mm (should fail)", []),
            ("m0 17,600/mm3 (should fail)", []),
            ("monocytes-5.2", [5.2]),
        ], verbose=verbose)
class MonocytesValidator(ValidatorBase):
    """
    Validator for Monocytes (see ValidatorBase for explanation).

    Uses only the label pattern ``Monocytes.MONOCYTES`` and reports against
    the variable name ``Monocytes.NAME``.
    """

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the base class.
            cfgsection: config section name, passed through to the base
                class.
            commit: passed through to the base class.
        """
        super().__init__(nlpdef=nlpdef,
                         cfgsection=cfgsection,
                         regex_str_list=[Monocytes.MONOCYTES],
                         validated_variable=Monocytes.NAME,
                         commit=commit)
# -----------------------------------------------------------------------------
# Basophils
# -----------------------------------------------------------------------------
class Basophils(WbcBase):
    """
    Basophil count (absolute).

    The label pattern accepts an optional leading "absolute", then
    baso / basophil (with optional plural "s"), then an optional trailing
    "count". The handwritten abbreviation "B0" is deliberately not accepted
    (the self-test expects it to fail).
    """

    # Label pattern; also reused by BasophilsValidator.
    BASOPHILS = r"""
        (?:
            (?: \b absolute \s* )?
            \b Baso(?:phil)?s? \b
            (?: \s* count \b )?
        )
    """
    NAME = "basophils"

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the base class.
            cfgsection: config section name, passed through to the base
                class.
            commit: passed through to the base class.
        """
        super().__init__(nlpdef=nlpdef,
                         cfgsection=cfgsection,
                         commit=commit,
                         cell_type_regex_text=self.BASOPHILS,
                         variable=self.NAME)

    def test(self, verbose: bool = False) -> None:
        """
        Run the self-test cases through ``test_numerical_parser``.

        Each entry pairs a source string with the list of values expected
        to be extracted from it (in units of 10^9/L).
        """
        self.test_numerical_parser([
            ("basophils (should fail)", []),  # should fail; no values
            ("absolute basophil count 6", [6]),
            ("basos = 6", [6]),
            ("B0 6 x 10^9/L (should fail)", []),
            ("basophil count 6 x 10 ^ 9 / L", [6]),
            ("basos 6.2", [6.2]),
            ("baso 6.2", [6.2]),
            ("basos 9800/mm3", [9.8]),
            ("absolute basophil 9800 cell/mm3", [9.8]),
            ("basophils count 9800 cells/mm3", [9.8]),
            ("b0 9800 per cubic mm (should fail)", []),
            ("b0 17,600/mm3 (should fail)", []),
            ("basophils-5.2", [5.2]),
        ], verbose=verbose)
class BasophilsValidator(ValidatorBase):
    """
    Validator for Basophils (see ValidatorBase for explanation).

    Uses only the label pattern ``Basophils.BASOPHILS`` and reports against
    the variable name ``Basophils.NAME``.
    """

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the base class.
            cfgsection: config section name, passed through to the base
                class.
            commit: passed through to the base class.
        """
        super().__init__(nlpdef=nlpdef,
                         cfgsection=cfgsection,
                         regex_str_list=[Basophils.BASOPHILS],
                         validated_variable=Basophils.NAME,
                         commit=commit)
# -----------------------------------------------------------------------------
# Eosinophils
# -----------------------------------------------------------------------------
class Eosinophils(WbcBase):
    """
    Eosinophil count (absolute).

    The label pattern accepts an optional leading "absolute", then any of
    eo / eosin / eosino / eosinophil (with optional plural "s"), then an
    optional trailing "count". The handwritten abbreviation "E0" is
    deliberately not accepted (the self-test expects it to fail).
    """

    # Label pattern; also reused by EosinophilsValidator.
    EOSINOPHILS = r"""
        (?:
            (?: \b absolute \s* )?
            \b Eo(?:sin(?:o(?:phil)?)?)?s? \b
            (?: \s* count \b )?
        )
    """
    NAME = "eosinophils"

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the base class.
            cfgsection: config section name, passed through to the base
                class.
            commit: passed through to the base class.
        """
        super().__init__(nlpdef=nlpdef,
                         cfgsection=cfgsection,
                         commit=commit,
                         cell_type_regex_text=self.EOSINOPHILS,
                         variable=self.NAME)

    def test(self, verbose: bool = False) -> None:
        """
        Run the self-test cases through ``test_numerical_parser``.

        Each entry pairs a source string with the list of values expected
        to be extracted from it (in units of 10^9/L).
        """
        self.test_numerical_parser([
            ("eosinophils (should fail)", []),  # should fail; no values
            ("absolute eosinophil count 6", [6]),
            ("eos = 6", [6]),
            ("E0 6 x 10^9/L (should fail)", []),
            ("eosinophil count 6 x 10 ^ 9 / L", [6]),
            ("eosins 6.2", [6.2]),
            ("eosino 6.2", [6.2]),
            ("eosinos 9800/mm3", [9.8]),
            ("absolute eosinophil 9800 cell/mm3", [9.8]),
            ("eosinophils count 9800 cells/mm3", [9.8]),
            ("e0 9800 per cubic mm (should fail)", []),
            ("e0 17,600/mm3 (should fail)", []),
            ("eosinophils-5.3", [5.3]),
        ], verbose=verbose)
class EosinophilsValidator(ValidatorBase):
    """
    Validator for Eosinophils (see ValidatorBase for explanation).

    Uses only the label pattern ``Eosinophils.EOSINOPHILS`` and reports
    against the variable name ``Eosinophils.NAME``.
    """

    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef: NLP definition, passed through to the base class.
            cfgsection: config section name, passed through to the base
                class.
            commit: passed through to the base class.
        """
        super().__init__(nlpdef=nlpdef,
                         cfgsection=cfgsection,
                         regex_str_list=[Eosinophils.EOSINOPHILS],
                         validated_variable=Eosinophils.NAME,
                         commit=commit)
# =============================================================================
#  Command-line entry point
# =============================================================================

def test_all(verbose: bool = False) -> None:
    """
    Run the self-tests of every parser defined in this module.

    Each parser is constructed with ``nlpdef=None`` and ``cfgsection=None``
    and its ``test()`` method is called.

    Args:
        verbose: passed through to each parser's ``test()`` method.
    """
    # ESR
    esr = Esr(None, None)
    esr.test(verbose=verbose)

    # WBC and differential
    wbc = Wbc(None, None)
    wbc.test(verbose=verbose)
    n0 = Neutrophils(None, None)
    n0.test(verbose=verbose)
    l0 = Lymphocytes(None, None)
    l0.test(verbose=verbose)
    m0 = Monocytes(None, None)
    m0.test(verbose=verbose)
    b0 = Basophils(None, None)
    b0.test(verbose=verbose)
    e0 = Eosinophils(None, None)
    e0.test(verbose=verbose)


if __name__ == '__main__':
    test_all(verbose=True)