readabs.recalibrate

Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000.

  1"""Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000."""
  2
  3# --- imports
  4import sys
  5from operator import mul, truediv
  6
  7from pandas import Series, DataFrame
  8import numpy as np
  9
 10from readabs.datatype import Datatype as DataT
 11
 12
 13# --- public
 14def recalibrate(
 15    data: DataT,
 16    units: str,
 17) -> tuple[DataT, str]:
 18    """Recalibrate a Series or DataFrame so the data in in the range -1000 to 1000.
 19    Change the name of the units to reflect the recalibration.
 20
 21    Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar.
 22    If you provide a Series, you will get a Series back. If you provide a DataFrame,
 23    you will get a DataFrame back.
 24
 25    Parameters
 26    ----------
 27    data : Series or DataFrame
 28        The data to recalibrate.
 29    units : str
 30        The units of the data. This string should be in the form of
 31        "Number", "Thousands", "Millions", "Billions", etc. The units
 32        should be in title case.
 33
 34    Returns
 35    -------
 36    Series or DataFrame
 37        The recalibrated data will be a Series if a Series was provided,
 38        or a DataFrame if a DataFrame was provided.
 39
 40    Examples
 41    --------
 42    ```python
 43    from pandas import Series
 44    from readabs import recalibrate
 45    s = Series([1_000, 10_000, 100_000, 1_000_000])
 46    recalibrated, units = recalibrate(s, "$")
 47    print(f"{recalibrated=}, {units=}")
 48    ```"""
 49
 50    if not isinstance(data, (Series, DataFrame)):
 51        raise TypeError("data must be a Series or DataFrame")
 52    units, restore_name = _prepare_units(units)
 53    flat_data = data.to_numpy().flatten()
 54    flat_data, units = _recalibrate(flat_data, units)
 55
 56    if restore_name:
 57        units = f"{restore_name} {units}"
 58        for n in "numbers", "number":
 59            if n in units:
 60                units = units.replace(n, "").strip()
 61                break
 62    units = units.title()
 63
 64    restore_pandas = DataFrame if len(data.shape) == 2 else Series
 65    result = restore_pandas(flat_data.reshape(data.shape))
 66    result.index = data.index
 67    if len(data.shape) == 2:
 68        result.columns = data.columns
 69    if len(data.shape) == 1:
 70        result.name = data.name
 71    return result, units
 72
 73
 74def recalibrate_value(value: float, units: str) -> tuple[float, str]:
 75    """Recalibrate a floating point value. The value will be recalibrated
 76    so it is in the range -1000 to 1000. The units will be changed to reflect
 77    the recalibration.
 78
 79    Parameters
 80    ----------
 81    value : float
 82        The value to recalibrate.
 83    units : str
 84        The units of the value. This string should be in the form of
 85        "Number", "Thousands", "Millions", "Billions", etc. The units
 86        should be in title case.
 87
 88    Returns
 89    -------
 90    tuple[float, str]
 91        A tuple containing the recalibrated value and the recalibrated units.
 92
 93    Examples
 94    --------
 95    ```python
 96    from readabs import recalibrate_value
 97    recalibrated, units = recalibrate_value(10_000_000, "Thousand")
 98    print(recalibrated, units)
 99    ```"""
100
101    series = Series([value])
102    output, units = recalibrate(series, units)
103    return output.values[0], units
104
105
106# --- private
# Smallest and largest scale names the recalibration will move to.
# Both are kept in lower case because they are compared with units.lower().
_MIN_RECALIBRATE = "number"
_MAX_RECALIBRATE = "decillion"

# Scale name (title case) -> power of ten, in ascending order. Each name is
# three powers of ten above the previous one. The order matters: the table
# is searched from the smallest scale upwards.
_keywords = dict(
    zip(
        (
            _MIN_RECALIBRATE.title(),
            "Thousand",
            "Million",
            "Billion",
            "Trillion",
            "Quadrillion",
            "Quintillion",
            "Sextillion",
            "Septillion",
            "Octillion",
            "Nonillion",
            _MAX_RECALIBRATE.title(),
        ),
        range(0, 34, 3),
    )
)

# Reverse lookup: power of ten -> scale name.
_r_keywords = dict(zip(_keywords.values(), _keywords.keys()))
124
125
def _prepare_units(units: str) -> tuple[str, str]:
    """Normalise a units string ahead of recalibration.

    Zero-based notations such as "'000" are rewritten as scale words, and
    a unit name ("$" or "Tonnes") is split off so it can be restored
    afterwards.

    Returns
    -------
    tuple[str, str]
        The cleaned units and the unit name that was removed. When a unit
        name is removed, the cleaned units are lower-cased and fall back
        to "number" if nothing else remains. When none is found, the
        second element is the empty string.
    """

    # the longer patterns must be tried before the shorter ones they contain
    rewrites = (
        ("000 Hours", "Thousand Hours"),
        ("$'000,000", "$ Million"),
        ("$'000", " $ Thousand"),
        ("'000,000", "Millions"),
        ("'000", "Thousands"),
        ("000,000", "Millions"),
        ("000", "Thousands"),
    )
    cleaned = units.strip()
    for old, new in rewrites:
        cleaned = cleaned.replace(old, new)

    # manage the names for some gnarly units (there may be more of these)
    lowered = cleaned.lower()
    for name in ("$", "Tonnes"):
        token = name.lower()
        if token in lowered:
            remainder = lowered.replace(token, "").strip()
            return remainder or "number", name

    return cleaned, ""
154
155
def _find_calibration(units: str) -> str | None:
    """Return the first scale keyword that appears in units, matched in
    either title case or lower case, or None when no keyword appears."""
    matches = (
        keyword
        for keyword in _keywords
        if keyword in units or keyword.lower() in units
    )
    return next(matches, None)
163
164
165# private
def _perfect_already(data: np.ndarray) -> bool:
    """Report whether the largest absolute value, ignoring NaNs, is
    already at least 1 and below 1000, so no rescaling is needed."""
    magnitudes = np.abs(data)
    peak = np.nanmax(magnitudes)
    return (1 <= peak) and (peak < 1000)
170
171
def _all_zero(data: np.ndarray) -> bool:
    """Report whether every non-NaN value is zero, printing a message if
    so. Such data cannot be recalibrated."""
    peak = np.nanmax(np.abs(data))
    if peak != 0:
        return False
    print("recalibrate(): All zero data")
    return True
178
179
def _not_numbers(data: np.ndarray) -> bool:
    """Report whether the data is unusable for recalibration, printing a
    message if so.

    Data is unusable when its dtype is not numeric, when it contains an
    infinity, or when every value is NaN.
    """
    # only look at the values once the dtype is known to be numeric
    if np.issubdtype(data.dtype, np.number):
        if not np.isinf(data).any() and not np.isnan(data).all():
            return False
    print("recalibrate(): Data is partly or completely non-numeric.")
    return True
190
191
def _can_recalibrate(flat_data: np.ndarray, units: str) -> bool:
    """Decide whether the data should go through the rescaling loop.

    Returns False when the units hold no recognised scale keyword (a
    message is printed), or when the data is non-numeric, all zero, or
    already in range.
    """

    if _find_calibration(units) is None:
        print(f"recalibrate(): Units not appropriately calibrated: {units}")
        return False

    # the checks run in this order and stop at the first one that fires
    blockers = (_not_numbers, _all_zero, _perfect_already)
    return not any(blocker(flat_data) for blocker in blockers)
204
205
def _recalibrate(flat_data: np.ndarray, units: str) -> tuple[np.ndarray, str]:
    """Rescale the data by factors of 1000 until its largest absolute
    value is at least 1 and below 1000, renaming the units at each step.

    The data and units come back unchanged when _can_recalibrate() says
    no. Scaling stops early, with a printed message, once the units reach
    the smallest or largest supported scale.
    """

    if not _can_recalibrate(flat_data, units):
        return flat_data, units

    while True:
        peak = np.nanmax(np.abs(flat_data))
        if peak >= 1000:
            # too big: divide the data and move the units up one scale
            limit, step, scale = _MAX_RECALIBRATE, 3, truediv
            complaint = "recalibrate() is not designed for very big units"
        elif peak < 1:
            # too small: multiply the data and move the units down one scale
            limit, step, scale = _MIN_RECALIBRATE, -3, mul
            complaint = "recalibrate() is not designed for very small units"
        else:
            break

        if limit in units.lower():
            print(complaint)
            break
        flat_data, units = _do_recal(flat_data, units, step, scale)

    return flat_data, units
227
228
def _do_recal(flat_data, units, step, operator):
    """Move the data and units by one scale step.

    step is the change in the power of ten (+3 or -3 from the caller) and
    operator is the matching arithmetic function, applied to the data
    with 1000. The scale keyword in units is replaced by the name of the
    new scale. Exits the interpreter if the new power of ten has no name.
    """
    current = _find_calibration(units)
    target = _keywords[current] + step
    if target not in _r_keywords:
        print(f"Unexpected factor: {target}")
        sys.exit(-1)
    new_name = _r_keywords[target]
    # the keyword may appear in title case or lower case; both become title case
    for old_name in (current, current.lower()):
        units = units.replace(old_name, new_name)
    return operator(flat_data, 1000), units
240
241
242# --- test
# Manual smoke tests: run this module as a script and inspect the output.
# Nothing is asserted; each helper prints its inputs and results.
if __name__ == "__main__":

    def test_example():
        """Run the two examples given in the public docstrings."""

        s = Series([1_000, 10_000, 100_000, 1_000_000])
        recalibrated, units = recalibrate(s, "$")
        print(f"{recalibrated=}, {units=}")

        recalibrated, units = recalibrate_value(10_000_000, "Thousand")
        print(f"{recalibrated=}, {units=}")
        print("=" * 40)

    test_example()

    def test_recalibrate():
        """Run recalibrate() over usable and unusable inputs."""

        def run_test(dataset: tuple[tuple[list, str], ...]) -> None:
            """Recalibrate each (values, units) pair and print before/after."""
            for d, u in dataset:
                data = Series(d)
                recalibrated, units = recalibrate(data, u)
                print(f"{data.values}, {u} ==> {recalibrated.values}, {units}")
                print("=" * 40)

        # good examples
        good = (
            ([1, 2, 3, 4, 5], "Number"),  # no change
            ([1_000, 10_000, 100_000, 1_000_000], "$"),
            ([1_000, 10_000, 100_000, 1_000_000], "Number Spiders"),
            ([1_000, 10_000, 100_000, 1_000_000], "Thousand"),
            ([0.2, 0.3], "Thousands"),  # scaled down
            ([0.000_000_2, 0.000_000_3], "Trillion"),  # scaled down repeatedly
        )
        run_test(good)

        # bad sets of data - should produce error messages and do nothing
        bad = (
            ([1, 2, 3, 4, 5], "Hundreds"),  # no scale keyword in the units
            ([0, 0, 0], "Thousands"),  # all zero
            ([np.nan, 0, 0], "Thousands"),  # all zero once NaN is ignored
            ([np.inf, 1, 2], "Thousands"),  # contains an infinity
            ([0, 0, "a"], "Thousands"),  # contains a string
        )
        run_test(bad)

    test_recalibrate()

    def test_recalibrate_value():
        """Run recalibrate_value() on one usable and one unusable input."""

        # good example
        recalibrated, units = recalibrate_value(10_000_000, "Thousand")
        print(recalibrated, units)
        print("=" * 40)

        # bad example: "Spiders" holds no scale keyword
        recalibrated, units = recalibrate_value(3_900, "Spiders")
        print(recalibrated, units)
        print("=" * 40)

    test_recalibrate_value()
def recalibrate(data: ~Datatype, units: str) -> tuple[~Datatype, str]:
def recalibrate(
    data: DataT,
    units: str,
) -> tuple[DataT, str]:
    """Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000.
    Change the name of the units to reflect the recalibration.

    Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar.
    If you provide a Series, you will get a Series back. If you provide a DataFrame,
    you will get a DataFrame back.

    Parameters
    ----------
    data : Series or DataFrame
        The data to recalibrate.
    units : str
        The units of the data. This string should be in the form of
        "Number", "Thousands", "Millions", "Billions", etc. The units
        should be in title case.

    Returns
    -------
    tuple[Series or DataFrame, str]
        The recalibrated data (a Series if a Series was provided, or a
        DataFrame if a DataFrame was provided) and the units string,
        in title case, adjusted to match the recalibration. If the data
        cannot be recalibrated, a message is printed and the values are
        returned unscaled.

    Raises
    ------
    TypeError
        If data is neither a Series nor a DataFrame.

    Examples
    --------
    ```python
    from pandas import Series
    from readabs import recalibrate
    s = Series([1_000, 10_000, 100_000, 1_000_000])
    recalibrated, units = recalibrate(s, "$")
    print(f"{recalibrated=}, {units=}")
    ```"""

    # local import: the only use of re in this function
    import re  # pylint: disable=import-outside-toplevel

    if not isinstance(data, (Series, DataFrame)):
        raise TypeError("data must be a Series or DataFrame")
    units, restore_name = _prepare_units(units)
    flat_data = data.to_numpy().flatten()
    flat_data, units = _recalibrate(flat_data, units)

    if restore_name:
        units = f"{restore_name} {units}"
        # Drop the "number" placeholder now that a real unit name is back.
        # _prepare_units() hands over lower case ("number") but _do_recal()
        # writes keywords in title case ("Number"), so match either case;
        # otherwise "$ Thousand" scaled down would come back as "$ Number".
        units = re.sub(r"numbers?", "", units, flags=re.IGNORECASE)
        # removing a word can leave doubled or trailing spaces behind
        units = " ".join(units.split())
    units = units.title()

    # rebuild the same kind of pandas object, with the original labels
    restore_pandas = DataFrame if len(data.shape) == 2 else Series
    result = restore_pandas(flat_data.reshape(data.shape))
    result.index = data.index
    if len(data.shape) == 2:
        result.columns = data.columns
    if len(data.shape) == 1:
        result.name = data.name
    return result, units

Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000. Change the name of the units to reflect the recalibration.

Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar. If you provide a Series, you will get a Series back. If you provide a DataFrame, you will get a DataFrame back.

Parameters

data : Series or DataFrame The data to recalibrate. units : str The units of the data. This string should be in the form of "Number", "Thousands", "Millions", "Billions", etc. The units should be in title case.

Returns

Series or DataFrame The recalibrated data will be a Series if a Series was provided, or a DataFrame if a DataFrame was provided.

Examples

from pandas import Series
from readabs import recalibrate
s = Series([1_000, 10_000, 100_000, 1_000_000])
recalibrated, units = recalibrate(s, "$")
print(f"{recalibrated=}, {units=}")
def recalibrate_value(value: float, units: str) -> tuple[float, str]:
 75def recalibrate_value(value: float, units: str) -> tuple[float, str]:
 76    """Recalibrate a floating point value. The value will be recalibrated
 77    so it is in the range -1000 to 1000. The units will be changed to reflect
 78    the recalibration.
 79
 80    Parameters
 81    ----------
 82    value : float
 83        The value to recalibrate.
 84    units : str
 85        The units of the value. This string should be in the form of
 86        "Number", "Thousands", "Millions", "Billions", etc. The units
 87        should be in title case.
 88
 89    Returns
 90    -------
 91    tuple[float, str]
 92        A tuple containing the recalibrated value and the recalibrated units.
 93
 94    Examples
 95    --------
 96    ```python
 97    from readabs import recalibrate_value
 98    recalibrated, units = recalibrate_value(10_000_000, "Thousand")
 99    print(recalibrated, units)
100    ```"""
101
102    series = Series([value])
103    output, units = recalibrate(series, units)
104    return output.values[0], units

Recalibrate a floating point value. The value will be recalibrated so it is in the range -1000 to 1000. The units will be changed to reflect the recalibration.

Parameters

value : float The value to recalibrate. units : str The units of the value. This string should be in the form of "Number", "Thousands", "Millions", "Billions", etc. The units should be in title case.

Returns

tuple[float, str] A tuple containing the recalibrated value and the recalibrated units.

Examples

from readabs import recalibrate_value
recalibrated, units = recalibrate_value(10_000_000, "Thousand")
print(recalibrated, units)