readabs.recalibrate
Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000.
"""Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000."""

# --- imports
import sys  # no longer used by _do_recal (it raises instead of exiting); import kept as-is
from operator import mul, truediv

from pandas import Series, DataFrame
import numpy as np

from readabs.datatype import Datatype as DataT


# --- public
def recalibrate(
    data: DataT,
    units: str,
) -> tuple[DataT, str]:
    """Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000.
    Change the name of the units to reflect the recalibration.

    Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar.
    If you provide a Series, you will get a Series back. If you provide a DataFrame,
    you will get a DataFrame back.

    If the units or the data cannot be recalibrated (unrecognised units,
    non-numeric or infinite data, all-zero data, or data already in range),
    the values are returned unchanged.

    Parameters
    ----------
    data : Series or DataFrame
        The data to recalibrate.
    units : str
        The units of the data. This string should be in the form of
        "Number", "Thousands", "Millions", "Billions", etc. The units
        should be in title case.

    Returns
    -------
    tuple[Series or DataFrame, str]
        The recalibrated data (a Series if a Series was provided, or a
        DataFrame if a DataFrame was provided) and the recalibrated units.

    Raises
    ------
    TypeError
        If data is neither a Series nor a DataFrame.

    Examples
    --------
    ```python
    from pandas import Series
    from readabs import recalibrate
    s = Series([1_000, 10_000, 100_000, 1_000_000])
    recalibrated, units = recalibrate(s, "$")
    print(f"{recalibrated=}, {units=}")
    ```"""

    if not isinstance(data, (Series, DataFrame)):
        raise TypeError("data must be a Series or DataFrame")
    units, restore_name = _prepare_units(units)
    # work on a flat array so Series and DataFrame share one code path
    flat_data = data.to_numpy().flatten()
    flat_data, units = _recalibrate(flat_data, units)

    if restore_name:
        # put the unit name (eg. "$") back in front of the magnitude word
        units = f"{restore_name} {units}"
        # drop the "number" placeholder so "$ number" becomes just "$"
        for n in "numbers", "number":
            if n in units:
                units = units.replace(n, "").strip()
                break
    # NOTE(review): nesting was reconstructed from a flattened listing; this
    # is assumed to apply whether or not a name was restored - confirm.
    units = units.title()

    # rebuild the same kind of pandas object, with the original labels
    restore_pandas = DataFrame if len(data.shape) == 2 else Series
    result = restore_pandas(flat_data.reshape(data.shape))
    result.index = data.index
    if len(data.shape) == 2:
        result.columns = data.columns
    if len(data.shape) == 1:
        result.name = data.name
    return result, units


def recalibrate_value(value: float, units: str) -> tuple[float, str]:
    """Recalibrate a floating point value. The value will be recalibrated
    so it is in the range -1000 to 1000. The units will be changed to reflect
    the recalibration.

    Parameters
    ----------
    value : float
        The value to recalibrate.
    units : str
        The units of the value. This string should be in the form of
        "Number", "Thousands", "Millions", "Billions", etc. The units
        should be in title case.

    Returns
    -------
    tuple[float, str]
        A tuple containing the recalibrated value and the recalibrated units.

    Examples
    --------
    ```python
    from readabs import recalibrate_value
    recalibrated, units = recalibrate_value(10_000_000, "Thousand")
    print(recalibrated, units)
    ```"""

    # wrap the scalar in a one-element Series to reuse recalibrate()
    series = Series([value])
    output, units = recalibrate(series, units)
    return output.values[0], units


# --- private
_MIN_RECALIBRATE = "number"  # all lower case
_MAX_RECALIBRATE = "decillion"  # all lower case
# magnitude word -> power of ten it represents
_keywords = {
    _MIN_RECALIBRATE.title(): 0,
    "Thousand": 3,
    "Million": 6,
    "Billion": 9,
    "Trillion": 12,
    "Quadrillion": 15,
    "Quintillion": 18,
    "Sextillion": 21,
    "Septillion": 24,
    "Octillion": 27,
    "Nonillion": 30,
    _MAX_RECALIBRATE.title(): 33,
}
# power of ten -> magnitude word
_r_keywords = {v: k for k, v in _keywords.items()}


def _prepare_units(units: str) -> tuple[str, str]:
    """Prepare the units for recalibration.

    Numeric shorthand such as "'000" is rewritten as a magnitude word, and
    a recognised unit name ("$" or "Tonnes") is removed from the string.

    Returns the prepared units and the unit name that was removed (an
    empty string if none was found)."""

    # order matters: longer patterns must be replaced before their substrings
    substitutions = [
        ("000 Hours", "Thousand Hours"),
        ("$'000,000", "$ Million"),
        ("$'000", " $ Thousand"),
        ("'000,000", "Millions"),
        ("'000", "Thousands"),
        ("000,000", "Millions"),
        ("000", "Thousands"),
    ]
    units = units.strip()
    for pattern, replacement in substitutions:
        units = units.replace(pattern, replacement)

    # manage the names for some gnarly units
    possible_units = ("$", "Tonnes")  # there may be more possible units
    found = False
    for pu in possible_units:
        if pu.lower() in units.lower():
            # note: the whole string is lower-cased when a name is removed
            units = units.lower().replace(pu.lower(), "").strip()
            if units == "":
                # a bare unit name means the data is in plain numbers
                units = "number"
            found = True
            break

    return units, pu if found else ""


def _find_calibration(units: str) -> str | None:
    """Return the first magnitude keyword found in units, in either its
    title-case or lower-case form, or None if there is no such keyword."""

    found = None
    for keyword in _keywords:
        if keyword in units or keyword.lower() in units:
            found = keyword
            break
    return found


# private
def _perfect_already(data: np.ndarray) -> bool:
    """No need to recalibrate if the data is already perfect."""
    check_max = np.nanmax(np.abs(data))
    return 1 <= check_max < 1000


def _all_zero(data: np.ndarray) -> bool:
    """Cannot recalibrate if all the data is zero."""
    if np.nanmax(np.abs(data)) == 0:
        print("recalibrate(): All zero data")
        return True
    return False


def _not_numbers(data: np.ndarray) -> bool:
    """Cannot recalibrate if the data is not numeric."""
    # the dtype test comes first so isinf/isnan only ever see numeric arrays
    if (
        (not np.issubdtype(data.dtype, np.number))
        or np.isinf(data).any()
        or np.isnan(data).all()
    ):
        print("recalibrate(): Data is partly or completely non-numeric.")
        return True
    return False


def _can_recalibrate(flat_data: np.ndarray, units: str) -> bool:
    """Check if the data can be recalibrated."""

    if _find_calibration(units) is None:
        print(f"recalibrate(): Units not appropriately calibrated: {units}")
        return False

    # _not_numbers must run first: the later checks assume numeric data
    for f in (_not_numbers, _all_zero, _perfect_already):
        if f(flat_data):
            return False

    return True


def _recalibrate(flat_data: np.ndarray, units: str) -> tuple[np.ndarray, str]:
    """Recalibrate the data. Loop over the data until
    its maximum value is between -1000 and 1000."""

    if _can_recalibrate(flat_data, units):
        while True:
            maximum = np.nanmax(np.abs(flat_data))
            if maximum >= 1000:
                # too big: move up one magnitude word, unless at the top
                if _MAX_RECALIBRATE in units.lower():
                    print("recalibrate() is not designed for very big units")
                    break
                flat_data, units = _do_recal(flat_data, units, 3, truediv)
                continue
            if maximum < 1:
                # too small: move down one magnitude word, unless at the bottom
                if _MIN_RECALIBRATE in units.lower():
                    print("recalibrate() is not designed for very small units")
                    break
                flat_data, units = _do_recal(flat_data, units, -3, mul)
                continue
            break
    return flat_data, units


def _do_recal(
    flat_data: np.ndarray, units: str, step: int, operator
) -> tuple[np.ndarray, str]:
    """Move the data and its units by one magnitude word.

    step is the change in the power of ten (3 or -3 as called above) and
    operator is the matching binary function (truediv or mul) that is
    applied to the data with 1000.

    Raises ValueError if there is no magnitude word for the new power of
    ten. This replaces an earlier print-and-sys.exit(-1), so a library
    call can no longer terminate the calling process."""

    calibration = _find_calibration(units)
    factor = _keywords[calibration]
    if factor + step not in _r_keywords:
        raise ValueError(f"Unexpected factor: {factor + step}")
    replacement = _r_keywords[factor + step]
    # the keyword may appear in title case or lower case; replace both
    units = units.replace(calibration, replacement)
    units = units.replace(calibration.lower(), replacement)
    flat_data = operator(flat_data, 1000)
    return flat_data, units


# --- test
if __name__ == "__main__":

    def test_example():
        """Test the example in the docstring."""

        s = Series([1_000, 10_000, 100_000, 1_000_000])
        recalibrated, units = recalibrate(s, "$")
        print(f"{recalibrated=}, {units=}")

        recalibrated, units = recalibrate_value(10_000_000, "Thousand")
        print(f"{recalibrated=}, {units=}")
        print("=" * 40)

    test_example()

    def test_recalibrate():
        """Test the recalibrate() function."""

        def run_test(dataset: tuple[tuple[list, str], ...]) -> None:
            for d, u in dataset:
                data = Series(d)
                recalibrated, units = recalibrate(data, u)
                print(f"{data.values}, {u} ==> {recalibrated.values}, {units}")
                print("=" * 40)

        # good examples
        good = (
            ([1, 2, 3, 4, 5], "Number"),  # no change
            ([1_000, 10_000, 100_000, 1_000_000], "$"),
            ([1_000, 10_000, 100_000, 1_000_000], "Number Spiders"),
            ([1_000, 10_000, 100_000, 1_000_000], "Thousand"),
            ([0.2, 0.3], "Thousands"),
            ([0.000_000_2, 0.000_000_3], "Trillion"),
        )
        run_test(good)

        # bad sets of data - should produce error messages and do nothing
        bad = (
            ([1, 2, 3, 4, 5], "Hundreds"),
            ([0, 0, 0], "Thousands"),
            ([np.nan, 0, 0], "Thousands"),
            ([np.inf, 1, 2], "Thousands"),
            ([0, 0, "a"], "Thousands"),
        )
        run_test(bad)

    test_recalibrate()

    def test_recalibrate_value():
        """Test the recalibrate_value() function."""

        # good example
        recalibrated, units = recalibrate_value(10_000_000, "Thousand")
        print(recalibrated, units)
        print("=" * 40)

        # bad example
        recalibrated, units = recalibrate_value(3_900, "Spiders")
        print(recalibrated, units)
        print("=" * 40)

    test_recalibrate_value()
def recalibrate(
    data: DataT,
    units: str,
) -> tuple[DataT, str]:
    """Rescale a Series or DataFrame so its values sit in the range -1000 to 1000,
    renaming the units to match the rescaling.

    Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar.
    A Series in gives a Series out; a DataFrame in gives a DataFrame out.

    Parameters
    ----------
    data : Series or DataFrame
        The data to recalibrate.
    units : str
        The units of the data. This string should be in the form of
        "Number", "Thousands", "Millions", "Billions", etc. The units
        should be in title case.

    Returns
    -------
    tuple[Series or DataFrame, str]
        The recalibrated data, in the same kind of container that was
        provided, and the recalibrated units.

    Raises
    ------
    TypeError
        If data is neither a Series nor a DataFrame.

    Examples
    --------
    ```python
    from pandas import Series
    from readabs import recalibrate
    s = Series([1_000, 10_000, 100_000, 1_000_000])
    recalibrated, units = recalibrate(s, "$")
    print(f"{recalibrated=}, {units=}")
    ```"""

    if not isinstance(data, (Series, DataFrame)):
        raise TypeError("data must be a Series or DataFrame")

    units, unit_name = _prepare_units(units)
    # a flat array lets Series and DataFrame share one code path
    scaled, units = _recalibrate(data.to_numpy().flatten(), units)

    if unit_name:
        # put the unit name back in front of the magnitude word
        units = f"{unit_name} {units}"
        # remove the first "number" placeholder present, longest form first
        filler = next((w for w in ("numbers", "number") if w in units), None)
        if filler is not None:
            units = units.replace(filler, "").strip()
    # NOTE(review): nesting was reconstructed from a flattened listing; this
    # is assumed to apply whether or not a name was restored - confirm.
    units = units.title()

    # rebuild the same kind of pandas object, with the original labels
    reshaped = scaled.reshape(data.shape)
    is_frame = len(data.shape) == 2
    result = DataFrame(reshaped) if is_frame else Series(reshaped)
    result.index = data.index
    if is_frame:
        result.columns = data.columns
    else:
        result.name = data.name
    return result, units
Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000. Change the name of the units to reflect the recalibration.
Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar. If you provide a Series, you will get a Series back. If you provide a DataFrame, you will get a DataFrame back.
Parameters
data : Series or DataFrame The data to recalibrate. units : str The units of the data. This string should be in the form of "Number", "Thousands", "Millions", "Billions", etc. The units should be in title case.
Returns
tuple[Series or DataFrame, str] The recalibrated data (a Series if a Series was provided, or a DataFrame if a DataFrame was provided), together with the recalibrated units string.
Examples
from pandas import Series
from readabs import recalibrate
s = Series([1_000, 10_000, 100_000, 1_000_000])
recalibrated, units = recalibrate(s, "$")
print(f"{recalibrated=}, {units=}")
def recalibrate_value(value: float, units: str) -> tuple[float, str]:
    """Rescale a single floating point value so it sits in the range
    -1000 to 1000, renaming the units to match the rescaling.

    Parameters
    ----------
    value : float
        The value to recalibrate.
    units : str
        The units of the value. This string should be in the form of
        "Number", "Thousands", "Millions", "Billions", etc. The units
        should be in title case.

    Returns
    -------
    tuple[float, str]
        A tuple containing the recalibrated value and the recalibrated units.

    Examples
    --------
    ```python
    from readabs import recalibrate_value
    recalibrated, units = recalibrate_value(10_000_000, "Thousand")
    print(recalibrated, units)
    ```"""

    # a one-element Series lets the scalar reuse recalibrate()
    rescaled, new_units = recalibrate(Series([value]), units)
    return rescaled.values[0], new_units
Recalibrate a floating point value. The value will be recalibrated so it is in the range -1000 to 1000. The units will be changed to reflect the recalibration.
Parameters
value : float The value to recalibrate. units : str The units of the value. This string should be in the form of "Number", "Thousands", "Millions", "Billions", etc. The units should be in title case.
Returns
tuple[float, str] A tuple containing the recalibrated value and the recalibrated units.
Examples
from readabs import recalibrate_value
recalibrated, units = recalibrate_value(10_000_000, "Thousand")
print(recalibrated, units)