#!/usr/bin/env python
# crate_anon/nlp_manager/regex_units.py
"""
===============================================================================
Copyright (C) 2015-2018 Rudolf Cardinal (rudolf@pobox.com).
This file is part of CRATE.
CRATE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
CRATE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with CRATE. If not, see <http://www.gnu.org/licenses/>.
===============================================================================
"""
from typing import List, Optional, Tuple
from crate_anon.nlp_manager.regex_test import test_text_regex
from crate_anon.nlp_manager.regex_numbers import (
BILLION,
MULTIPLY_OR_SPACE,
PLAIN_INTEGER,
POWER,
)
# =============================================================================
# Physical units
# =============================================================================
OUT_OF_SEPARATOR = r"(?: \/ | \b out \s+ of \b )"
def per(numerator: str, denominator: str,
include_power_minus1: bool = True) -> str:
# Copes with blank/optional numerators, too.
options = [
r"(?: {numerator} \s* (?: \/ | \b per \b) \s* {denominator} )".format(
numerator=numerator, denominator=denominator),
]
if include_power_minus1:
options.append(
r"(?: {numerator} \s* \b {denominator} \s* -1 )".format(
numerator=numerator, denominator=denominator))
return r"(?: {} )".format(r" | ".join(options))
# Use of "\s* \b" rather than "\s+" is so we can have a BLANK numerator.
def _out_of_str(n_as_regex: str):
# / n
# out of n
return r"(?: {out_of} \s* {n} \b)".format(out_of=OUT_OF_SEPARATOR,
n=n_as_regex)
def out_of(n: int) -> str:
return _out_of_str(str(n))
def out_of_anything() -> str:
# out_of(n) where n is any number
return _out_of_str(PLAIN_INTEGER)
def power(x: str, n: int, allow_no_operator: bool=False) -> str:
return r"(?: {x} \s* {power}{optional} \s* {n})".format(
x=x,
power=POWER,
optional="?" if allow_no_operator else "",
n=n,
)
[docs]def units_times(*args: str) -> str:
"""For units, where they are notionally multiplied."""
multiply = MULTIPLY_OR_SPACE + "?"
joined = multiply.join(args)
return r"(?: {} )".format(joined)
def units_by_dimension(*args: Tuple[str, int], # specify type of *one* arg!
allow_no_operator: bool=False) -> str:
multiply = " " + MULTIPLY_OR_SPACE + " "
power_elements = [] # type: List[str]
for i, unit_exponent in enumerate(args):
unit, exponent = unit_exponent
assert(exponent != 0)
power_elements.append(
power(unit, exponent, allow_no_operator=allow_no_operator))
joined_power_elements = multiply.join(power_elements)
power_style = r"(?: {} )".format(joined_power_elements)
options = [power_style]
# noinspection PyChainedComparisons
if len(args) == 2 and args[0][1] > 0 and args[1][1] < 0:
# x per y
options.append(per(args[0][0], args[1][0],
include_power_minus1=False))
return r"(?: {} )".format(r" | ".join(options))
# -----------------------------------------------------------------------------
# Distance
# -----------------------------------------------------------------------------
M = r"(?: met(?:re|er)s? | m )" # m, metre(s), meter(s)
CM = r"(?: cm | centimet(?:re|er)s? )" # cm, centimetre(s), centimeter(s)
MM = r"(?: mm | millimet(?:re|er)s? )" # mm, millimetre(s), millimeter(s)
FEET = r"""(?: f(?:ee|oo)?t | \' | ’ | ′ )"""
# ... feet, foot, ft
# ... apostrophe, right single quote (U+2019), prime (U+2032)
INCHES = r'''(?: in(?:ch(?:e)?)?s? | \" | ” | ″)'''
# ... in, ins, inch, inches, [inchs = typo but clear]
# ... ", right double quote (U+2014), double prime (U+2033)
# -----------------------------------------------------------------------------
# Mass
# -----------------------------------------------------------------------------
MCG = r"(?: mcg | microgram(?:me)?s? | [μu]g )" # you won't stop people using ug... # noqa
MG = r"(?: mg | milligram(?:me)?s? )" # mg, milligram, milligrams, milligramme, milligrammes # noqa
G = r"(?: gram(?:me)?s? | g )" # g, gram, grams, gramme, grammes # noqa
KG = r"(?: kgs? | kilo(?:gram(?:me)?)?s? )" # kg, kgs, kilos ... kilogrammes etc. # noqa
LB = r"(?: pounds? | lbs? )" # pound(s), lb(s)
STONES = r"(?: stones? | st\.? )" # stone(s), st, st.
# -----------------------------------------------------------------------------
# Volume
# -----------------------------------------------------------------------------
L = r"(?: lit(?:re|er)s? | L )" # L, litre(s), liter(s)
DL = r"(?: d(?:eci)?{L} )".format(L=L)
ML = r"(?: m(?:illi)?{L} )".format(L=L)
CUBIC_MM = r"""(?: (?:\b cubic \s+ {mm}) | {mm_cubed} )""".format(
mm=MM,
mm_cubed=power(MM, 3, allow_no_operator=True)
)
# cubic mm, etc. | mm^3, mm3, mm 3, etc.
# -----------------------------------------------------------------------------
# Inverse volume
# -----------------------------------------------------------------------------
PER_CUBIC_MM = per("", CUBIC_MM)
# -----------------------------------------------------------------------------
# Time
# -----------------------------------------------------------------------------
HOUR = r"(?:h(?:rs?|ours?)?)" # h, hr, hrs, hour, hours
# -----------------------------------------------------------------------------
# Counts, proportions
# -----------------------------------------------------------------------------
PERCENT = r"""(?:%|pe?r?\s?ce?n?t)"""
# must have pct, other characters optional
# -----------------------------------------------------------------------------
# Arbitrary count things
# -----------------------------------------------------------------------------
CELLS = r"(?:\b cells? \b)"
OPTIONAL_CELLS = CELLS + "?"
MILLIMOLES = r"(?: m(?:illi)?mol(?:es?)? )"
MILLIEQ = r"(?:m(?:illi)?Eq)"
UNITS = r"(?:[I]?U)" # U units, IU international units
MILLIUNITS = r"(?:m[I]?U)"
MICROUNITS = r"(?:[μu][I]?U)"
SCORE = r"(?:scored?)" # score(d)
# -----------------------------------------------------------------------------
# Concentration
# -----------------------------------------------------------------------------
MILLIMOLAR = r"(?:mM)" # NB case-insensitive... confusable with millimetres
MG_PER_DL = per(MG, DL)
MG_PER_L = per(MG, L)
MILLIMOLES_PER_L = per(MILLIMOLES, L)
MILLIEQ_PER_L = per(MILLIEQ, L)
BILLION_PER_L = per(BILLION, L)
CELLS_PER_CUBIC_MM = per(OPTIONAL_CELLS, CUBIC_MM)
MILLIUNITS_PER_L = per(MILLIUNITS, L)
MICROUNITS_PER_ML = per(MICROUNITS, ML)
# -----------------------------------------------------------------------------
# Speed
# -----------------------------------------------------------------------------
MM_PER_H = per(MM, HOUR)
# -----------------------------------------------------------------------------
# Pressure
# -----------------------------------------------------------------------------
MM_HG = r"(?: mm \s* Hg )" # mmHg, mm Hg
# ... likelihood of "millimetres of mercury" quite small?
# -----------------------------------------------------------------------------
# Things to powers
# -----------------------------------------------------------------------------
SQ_M = r"""
(?: # square metres
(?: sq(?:uare)? \s+ {m} ) # sq m, square metres, etc.
| (?: {m} \s+ sq(?:uared?)? ) # m sq, metres square(d), etc.
| {m_sq} # m ^ 2, etc.
)
""".format(m=M, m_sq=power(M, 2))
# BMI
KG_PER_SQ_M = r"(?: {kg_per_sqm} | {kg_sqm_pow_minus2} )".format(
kg_per_sqm=per(KG, SQ_M, include_power_minus1=False),
kg_sqm_pow_minus2=units_times(KG, power(M, -2)),
)
# =============================================================================
# Generic conversion functions
# =============================================================================
def kg_from_st_lb_oz(stones: float = 0,
pounds: float = 0,
ounces: float = 0) -> Optional[float]:
# 16 ounces in a pound
# 14 pounds in a stone
# 1 avoirdupois pound = 0.45359237 kg
# https://en.wikipedia.org/wiki/Pound_(mass)
# Have you the peas? "Goods of weight"; aveir de peis (OFr.; see OED).
try:
total_pounds = (stones * 14) + pounds + (ounces / 16)
return 0.45359237 * total_pounds
except (TypeError, ValueError):
return None
def m_from_ft_in(feet: float = 0, inches: float = 0) -> Optional[float]:
# 12 inches in a foot
# 1 inch = 25.4 mm
try:
total_inches = (feet * 12) + inches
return total_inches * 25.4 / 1000
except (TypeError, ValueError):
return None
def m_from_m_cm(metres: float = 0, centimetres: float = 0) -> Optional[float]:
try:
return metres + (centimetres / 100)
except (TypeError, ValueError):
return None
[docs]def assemble_units(components: List[Optional[str]]) -> str:
"""Takes e.g. ["ft", "in"] and makes "ft in"."""
active_components = [c for c in components if c]
return " ".join(active_components)
# =============================================================================
# Tests
# =============================================================================
def test_unit_regexes(verbose: bool = False) -> None:
test_text_regex("per(n, d)", per("n", "d"), [
("blah n per d blah", ["n per d"]),
("blah n/d blah", ["n/d"]),
("n / d", ["n / d"]),
("n d -1", ["n d -1"]),
("n d -1", ["n d -1"]),
("n blah d", []),
], verbose=verbose)
test_text_regex("out_of(5)", out_of(5), [
("4 out of 5", ["out of 5"]),
("4/5", ["/5"]),
("4 / 5", ["/ 5"]),
], verbose=verbose)
test_text_regex("M", M, [
("5 metres long", ["metres"]),
("5 meters long", ["meters"]),
("5m long", ["m"]),
], verbose=verbose)
test_text_regex("CM", CM, [
("5 centimetres long", ["centimetres"]),
("5 centimeters long", ["centimeters"]),
("5cm long", ["cm"]),
], verbose=verbose)
test_text_regex("MM", MM, [
("5 millimetres long", ["millimetres"]),
("5 millimeters long", ["millimeters"]),
("5mm long", ["mm"]),
], verbose=verbose)
test_text_regex("FEET", FEET, [
("5 feet long", ["feet"]),
("5 foot long", ["foot"]),
("5' long", ["'"]), # ASCII apostrophe
("5’ long", ["’"]), # right single quote (U+2019)
("5′ long", ["′"]), # prime (U+2032)
], verbose=verbose)
test_text_regex("INCHES", INCHES, [
("5 inches long", ["inches"]),
("5 in long", ["in"]),
('5" long', ['"']), # ASCII double quote
("5” long", ["”"]), # right double quote (U+2014)
("5″ long", ["″"]), # double prime (U+2033)
], verbose=verbose)
test_text_regex("MCG", MCG, [
("5 micrograms", ["micrograms"]),
("5 mcg", ["mcg"]),
("5 ug", ["ug"]),
("5 μg", ["μg"]),
], verbose=verbose)
test_text_regex("MG", MG, [
("5 milligrams", ["milligrams"]),
("5 mg", ["mg"]),
], verbose=verbose)
test_text_regex("G", G, [
("5 grams", ["grams"]),
("5 g", ["g"]),
], verbose=verbose)
test_text_regex("KG", KG, [
("5 kilograms", ["kilograms"]),
("5 kg", ["kg"]),
], verbose=verbose)
test_text_regex("LB", LB, [
("5 pounds", ["pounds"]),
("5 lb", ["lb"]),
], verbose=verbose)
test_text_regex("STONES", STONES, [
("5 stones", ["stones"]),
("5 stone", ["stone"]),
("5 st", ["st"]),
], verbose=verbose)
test_text_regex("L", L, [
("5 litres", ["litres"]),
("5 liters", ["liters"]),
("5 l", ["l"]),
("5 L", ["L"]),
], verbose=verbose)
test_text_regex("DL", DL, [
("5 decilitres", ["decilitres"]),
("5 deciliters", ["deciliters"]),
("5 dl", ["dl"]),
("5 dL", ["dL"]),
], verbose=verbose)
test_text_regex("ML", ML, [
("5 millilitres", ["millilitres"]),
("5 milliliters", ["milliliters"]),
("5 ml", ["ml"]),
("5 mL", ["mL"]),
], verbose=verbose)
test_text_regex("CUBIC_MM", CUBIC_MM, [
("5 mm^3", ["mm^3"]),
("5 cubic mm", ["cubic mm"]),
("5 cubic millimetres", ["cubic millimetres"]),
], verbose=verbose)
test_text_regex("HOUR", HOUR, [
("5 hours", ["hours"]),
("5 hr", ["hr"]),
("5 h", ["h"]),
], verbose=verbose)
test_text_regex("PERCENT", PERCENT, [
("5 percent", ["percent"]),
("5 per cent", ["per cent"]),
("5 pct", ["pct"]),
("5%", ["%"]),
], verbose=verbose)
test_text_regex("CELLS", CELLS, [
("5 cells", ["cells"]),
("5 cell", ["cell"]),
], verbose=verbose)
test_text_regex("MILLIMOLES", MILLIMOLES, [
("5 millimoles", ["millimoles"]),
("5 millimol", ["millimol"]),
("5 mmol", ["mmol"]),
], verbose=verbose)
test_text_regex("MILLIEQ", MILLIEQ, [
("5 mEq", ["mEq"]),
], verbose=verbose)
test_text_regex("UNITS", UNITS, [
("5 U", ["U"]),
("5 IU", ["IU"]),
], verbose=verbose)
test_text_regex("MILLIUNITS", MILLIUNITS, [
("5 mU", ["mU"]),
("5 mIU", ["mIU"]),
], verbose=verbose)
test_text_regex("MICROUNITS", MICROUNITS, [
("5 uU", ["uU"]),
("5 μU", ["μU"]),
("5 uIU", ["uIU"]),
("5 μIU", ["μIU"]),
], verbose=verbose)
test_text_regex("SCORE", SCORE, [
("I scored 5", ["scored"]),
("MMSE score 5", ["score"]),
], verbose=verbose)
test_text_regex("MILLIMOLAR", MILLIMOLAR, [
("5 mM", ["mM"]),
], verbose=verbose)
test_text_regex("MM_HG", MM_HG, [
("5 mmHg", ["mmHg"]),
("5 mm Hg", ["mm Hg"]),
], verbose=verbose)
test_text_regex("SQ_M", SQ_M, [
("5 square metres", ["square metres"]),
("5 sq m", ["sq m"]),
("5 m^2", ["m^2"]),
], verbose=verbose)
test_text_regex("KG_PER_SQ_M", KG_PER_SQ_M, [
("5 kg per square metre", ["kg per square metre"]),
("5 kg/sq m", ["kg/sq m"]),
("5 kg/m^2", ["kg/m^2"]),
("5 kg*m^-2", ["kg*m^-2"]),
], verbose=verbose)
def test_all(verbose: bool = False) -> None:
test_unit_regexes(verbose=verbose)
if __name__ == '__main__':
test_all()