Coverage for src/scores/probability/functions.py: 99%
95 statements
« prev ^ index » next coverage.py v7.3.2, created at 2024-02-28 12:51 +1100
« prev ^ index » next coverage.py v7.3.2, created at 2024-02-28 12:51 +1100
1"""
2This module contains a variety of functions which modify data in various ways to process data structures
3to support probabilistic verification.
4"""
5from collections.abc import Iterable
6from typing import Literal, Optional
8import numpy as np
9import pandas as pd
10import xarray as xr
12from scores.probability.checks import (
13 cdf_values_within_bounds,
14 check_nan_decreasing_inputs,
15)
16from scores.typing import XarrayLike
def round_values(array: xr.DataArray, rounding_precision: float, final_round_decpl: int = 7) -> xr.DataArray:
    """Snap every value of `array` to the nearest multiple of `rounding_precision`.

    Unlike `xarray.DataArray.round` or `numpy.round`, which take a number of decimal
    places, the precision here is given as a float step size. For example, 3.73 rounded
    to precision 0.2 is 3.8, and 37.3 rounded to precision 20 is 40.

    A precision of 0 means "no rounding" and the input is returned unchanged. When the
    precision is positive, the result is additionally rounded to `final_round_decpl`
    decimal places to strip floating-point artefacts left by the multiplication
    (e.g. 19 * 0.2 == 3.8000000000000003).

    Args:
        array (xr.DataArray): data to be rounded.
        rounding_precision (float): step size to round to; must be >= 0.
        final_round_decpl (int): number of decimal places used for the clean-up round
            that is applied when `rounding_precision` > 0.

    Returns:
        xr.DataArray: the rounded data (the original object when `rounding_precision` is 0).

    Raises:
        ValueError: If `rounding_precision` < 0.
    """
    if rounding_precision < 0:
        raise ValueError(f"rounding_precision '{rounding_precision}' is negative")

    if not rounding_precision > 0:
        # Nothing to do: hand the input straight back.
        return array

    # Count how many whole steps fit, then scale back to the original units.
    whole_steps = (array / rounding_precision).round()
    snapped = whole_steps * rounding_precision

    # Clean up floating-point noise introduced by the multiplication above.
    return snapped.round(decimals=final_round_decpl)
def propagate_nan(cdf: XarrayLike, threshold_dim: str) -> XarrayLike:
    """Propagates the NaN values from a "cdf" variable along the `threshold_dim`.

    If any value along `threshold_dim` is NaN, every value along `threshold_dim`
    at that position of the remaining dimensions is set to NaN.

    Args:
        cdf (XarrayLike): CDF values, so that P(X <= threshold) = cdf_value for
            each threshold in the `threshold_dim` dimension.
        threshold_dim (str): name of the threshold dimension in `cdf`.

    Returns:
        XarrayLike: `cdf` with NaNs propagated along `threshold_dim`; same container
        type as the input.

    Raises:
        ValueError: If `threshold_dim` is not a dimension of `cdf`.
    """
    if threshold_dim not in cdf.dims:
        raise ValueError(f"'{threshold_dim}' is not a dimension of `cdf`")

    # `np.isnan` applied to an xarray object already returns an xarray object, so the
    # result is reduced directly. Re-wrapping it in `xr.DataArray(...)` added nothing
    # for DataArray input and stood in the way of Dataset input, which the
    # `XarrayLike` annotation suggests should be accepted (NOTE(review): confirm that
    # `XarrayLike` includes `xr.Dataset`).
    has_nan = np.isnan(cdf).any(dim=threshold_dim)

    # Keep values only where no NaN was found along `threshold_dim`.
    return cdf.where(~has_nan, np.nan)
def observed_cdf(
    obs: xr.DataArray,
    threshold_dim: str,
    threshold_values: Optional[Iterable[float]] = None,
    include_obs_in_thresholds: bool = True,
    precision: float = 0,
) -> xr.DataArray:
    """Returns a data array of observations converted into CDF format.

    Such that:
        returned_value = 0 if threshold < observation
        returned_value = 1 if threshold >= observation
        returned_value = NaN if the observation is NaN

    Args:
        obs (xr.DataArray): observations
        threshold_dim (str): name of dimension in returned array that contains the threshold values.
        threshold_values (Optional[Iterable[float]]): values to include among thresholds.
        include_obs_in_thresholds (bool): if `True`, include (rounded) observed values among thresholds.
        precision (float): precision applied to observed values prior to constructing the CDF and
            thresholds. Select 0 for highest precision (i.e. no rounding).

    Returns:
        xr.DataArray: Observed CDFs and thresholds in the `threshold_dim` dimension.

    Raises:
        ValueError: if `precision < 0`.
        ValueError: if all observations are NaN and no non-NaN `threshold_values`
            are supplied.
    """
    if precision < 0:
        raise ValueError("`precision` must be nonnegative.")

    if threshold_values is None:
        supplied_thresholds = np.array([])
    else:
        if not hasattr(threshold_values, "__len__"):
            # Generators and other one-shot iterables must be materialised first;
            # `np.asarray` would otherwise wrap them in a 0-d object array.
            threshold_values = list(threshold_values)
        supplied_thresholds = np.asarray(threshold_values)

    # An empty array counts as "all NaN", so this also covers `threshold_values=None`.
    if np.isnan(obs).all() and np.isnan(supplied_thresholds).all():
        raise ValueError("must include non-NaN observations in thresholds or supply threshold values")

    if precision > 0:
        obs = round_values(obs, precision)

    candidates = supplied_thresholds
    if include_obs_in_thresholds:
        candidates = np.concatenate((obs.values.flatten(), supplied_thresholds))

    # Drop NaNs, then deduplicate and sort in one step. `np.unique` returns the sorted
    # unique values, and working on an ndarray avoids the deprecated use of
    # `pd.unique` on a plain Python list.
    thresholds = np.unique(candidates[~np.isnan(candidates)])

    da_thresholds = xr.DataArray(
        thresholds,
        dims=[threshold_dim],
        coords={threshold_dim: thresholds},
    )

    dab_obs, dab_thresholds = xr.broadcast(obs, da_thresholds)

    # Step function: 1.0 at and above the observation, 0.0 below it.
    cdf = (dab_thresholds >= dab_obs).astype(float)

    # Comparisons against NaN are False, so restore NaN where the observation is missing.
    cdf = cdf.where(~np.isnan(dab_obs))

    return cdf
def integrate_square_piecewise_linear(function_values: xr.DataArray, threshold_dim: str) -> xr.DataArray:
    """Integrate the square of a piecewise linear function along `threshold_dim`.

    Computes integral(F(t)^2), where
        - F(t) equals the entry of `function_values` at each coordinate t of `threshold_dim`,
        - F is linear between consecutive coordinates of `threshold_dim`.
    The `threshold_dim` dimension is collapsed; all other dimensions are preserved.
    Returns NaN if there are less than two non-NaN function values.

    This function assumes that
        - `threshold_dim` is a dimension of `function_values`
        - coordinates of `threshold_dim` are increasing.

    Args:
        function_values (xr.DataArray): array of function values F(t).
        threshold_dim (str): dimension along which to integrate.

    Returns:
        xr.DataArray: Integral values with `threshold_dim` collapsed.
    """
    # Notation: on the segment x[i-1] <= t <= x[i] the function is
    #   F(t) = m * (t - x[i-1]) + b,  with b = F(x[i-1]).
    one_step_back = {threshold_dim: 1}

    # Function value at the left end of each segment (b). NaN for the first entry.
    left_values = function_values.shift(**one_step_back)

    # Segment widths, x[i] - x[i-1].
    coords = function_values[threshold_dim]
    widths = coords - coords.shift(**one_step_back)

    # Rise over each segment and the resulting gradient (m).
    rises = function_values - left_values
    slopes = rises / widths

    # Closed form of the integral of (m * s + b)^2 for s from 0 to the segment width.
    segment_integrals = (
        (slopes**2) * (widths**3) / 3 + slopes * left_values * (widths**2) + (left_values**2) * widths
    )

    # `min_count=1`: at least one non-NaN segment is needed to return a number, which
    # in turn needs at least two non-NaN function values.
    return segment_integrals.sum(threshold_dim, min_count=1)
def add_thresholds(
    cdf: xr.DataArray,
    threshold_dim: str,
    new_thresholds: Iterable[float],
    fill_method: Literal["linear", "step", "forward", "backward", "none"],
    min_nonnan: int = 2,
) -> xr.DataArray:
    """Extend a CDF data array with extra thresholds along `threshold_dim`.

    The coordinates of `threshold_dim` become the sorted, de-duplicated, non-NaN union
    of the existing coordinates and `new_thresholds`. Unless `fill_method` is "none",
    the expanded array is then passed to `fill_cdf` so that NaN values are filled.

    Args:
        cdf (xr.DataArray): array of CDF values.
        threshold_dim (str): name of the threshold dimension in `cdf`.
        new_thresholds (Iterable[float]): new thresholds to add to `cdf`.
        fill_method (Literal["linear", "step", "forward", "backward", "none"]): one of "linear",
            "step", "forward" or "backward", as described in `fill_cdf`. If no filling, set to "none".
        min_nonnan (int): passed onto `fill_cdf` for performing filling.

    Returns:
        xr.DataArray: `cdf` on the expanded set of thresholds, with values at those
        thresholds determined by the specified fill method.
    """
    current_thresholds = cdf[threshold_dim].values

    # Union of old and new thresholds: unique, ascending, NaN-free.
    combined = pd.unique(np.concatenate((current_thresholds, new_thresholds)))
    combined = np.sort(combined)
    combined = combined[~np.isnan(combined)]

    expanded_coord = xr.DataArray(data=combined, dims=[threshold_dim], coords={threshold_dim: combined})

    # Broadcasting against the expanded coordinate puts `cdf` on the full threshold set.
    expanded_cdf, _ = xr.broadcast(cdf, expanded_coord)

    if fill_method == "none":
        return expanded_cdf

    return fill_cdf(expanded_cdf, threshold_dim, fill_method, min_nonnan)
def fill_cdf(
    cdf: xr.DataArray,
    threshold_dim: str,
    method: Literal["linear", "step", "forward", "backward"],
    min_nonnan: int,
) -> xr.DataArray:
    """Fills NaNs in a CDF of a real-valued random variable along `threshold_dim`.

    Args:
        cdf (xr.DataArray): CDF values, where P(Y <= threshold) = cdf_value for each threshold
            in `threshold_dim`.
        threshold_dim (str): the threshold dimension in the CDF, along which filling is performed.
        method (Literal["linear", "step", "forward", "backward"]): one of
            - "linear": use linear interpolation, and if needed also extrapolate linearly.
              Clip to 0 and 1.
            - "step": use forward filling then set remaining leading NaNs to 0.
              Produces a step function CDF (i.e. piecewise constant).
            - "forward": use forward filling then fill any remaining leading NaNs with
              backward filling.
            - "backward": use backward filling then fill any remaining trailing NaNs with
              forward filling.
        min_nonnan (int): the minimum number of non-NaN entries required along `threshold_dim`
            for filling to be performed. All CDF values are set to `np.nan` where this
            condition fails. Must be at least 2 for the "linear" method, and at least 1 for
            every other method.

    Returns:
        xr.DataArray: Containing the same values as `cdf` but with NaNs filled.

    Raises:
        ValueError: If `method` is not "linear", "step", "forward" or "backward".
        ValueError: If `cdf_values_within_bounds(cdf)` is falsy, i.e. the input CDF has
            values less than 0 or greater than 1.
        ValueError: If `threshold_dim` is not a dimension of `cdf`.
        ValueError: If `min_nonnan` < 2 when `method="linear"`, or `min_nonnan` < 1 for any
            other method.
    """
    # Validation runs in this fixed order: method, value bounds, dimension, min_nonnan.
    if method not in ["linear", "step", "forward", "backward"]:
        raise ValueError("`method` must be 'linear', 'step', 'forward' or 'backward'")

    if not cdf_values_within_bounds(cdf):
        raise ValueError("Input CDF has some values less than 0 or greater than 1.")

    if threshold_dim not in cdf.dims:
        raise ValueError(f"'{threshold_dim}' is not a dimension of `cdf`")

    if method == "linear":
        if min_nonnan < 2:
            raise ValueError("`min_nonnan` must be at least 2 when `method='linear'`")
    elif min_nonnan < 1:
        raise ValueError(f"`min_nonnan` must be at least 1 when `method='{method}'`")

    # Blank out every CDF that has too few non-NaN values to be filled.
    has_enough_values = cdf.count(threshold_dim) >= min_nonnan
    masked = cdf.where(has_enough_values)

    if method == "linear":
        interpolated = masked.interpolate_na(threshold_dim, method="linear", fill_value="extrapolate")
        return interpolated.clip(min=0, max=1)

    if method == "step":
        # `fillna(0)` also turns the blanked-out CDFs into zeros, so mask them again.
        stepped = masked.ffill(threshold_dim).fillna(0)
        return stepped.where(has_enough_values)

    if method == "forward":
        return masked.ffill(threshold_dim).bfill(threshold_dim)

    # Only "backward" remains after the validation above.
    return masked.bfill(threshold_dim).ffill(threshold_dim)
def decreasing_cdfs(cdf: xr.DataArray, threshold_dim: str, tolerance: float) -> xr.DataArray:
    """Flag CDFs that decrease along `threshold_dim` by more than `tolerance`.

    A CDF of a real-valued random variable should be nondecreasing along `threshold_dim`.
    This is sometimes violated due to rounding issues or bad forecast process.
    `decreasing_cdfs` checks whether the sum of the incremental decreases exceeds the
    tolerance. For example, if the CDF values are
        [0, 0.4, 0.3, 0.9, 0.88, 1]
    then the sum of incremental decreases is -0.12. Given a specified positive `tolerance`,
    the CDF values decrease beyond tolerance if the sum of incremental decreases < -`tolerance`.

    Intended use is for CDFs with increasing coordinates along `threshold_dim` dimension, and
    where either each CDF is always NaN or always non-NaN.

    Args:
        cdf (xr.DataArray): data array of CDF values.
        threshold_dim (str): threshold dimension of `cdf`.
        tolerance (float): nonnegative tolerance value.

    Returns:
        xr.DataArray: Containing `threshold_dim` collapsed and values True if and only if
        the CDF is decreasing outside tolerance. If the CDF consists only of NaNs then
        the value is False.

    Raises:
        ValueError: Raised by the input validation, which is delegated to
            `check_nan_decreasing_inputs`. As originally documented for this function,
            that covers: `threshold_dim` not being a dimension of `cdf`; a negative
            `tolerance`; coordinates not increasing along `threshold_dim`; and some,
            but not all, CDF values along `threshold_dim` being NaN.
    """
    check_nan_decreasing_inputs(cdf, threshold_dim, tolerance)

    # Change from each threshold to the next; the first entry has no predecessor (NaN).
    previous_values = cdf.shift(**{threshold_dim: 1})
    increments = cdf - previous_values

    # Keep only the downward moves (increases are clipped to 0) and add them up.
    total_decrease = increments.clip(max=0).sum(dim=threshold_dim)

    return total_decrease < -tolerance
def cdf_envelope(
    cdf: xr.DataArray,
    threshold_dim: str,
) -> xr.DataArray:
    """Returns the nondecreasing upper and lower envelope of a forecast CDF.

    Forecast cumulative distribution functions (CDFs) for real-valued random variables
    that are reconstructed from known points on the distribution should be nondecreasing
    with respect to the threshold dimension. However, sometimes this may fail due to rounding
    or poor forecast process. This function returns the "envelope" of the original CDF, which
    consists of two bounding CDFs, both of which are nondecreasing.

    The following example shows values from an original CDF that has a decreasing subsequence
    (and so is not a true CDF). The resulting "upper" and "lower" CDFs minimally adjust
    "original" so that "lower" <= "original" <= "upper".
        "original": [0, .5, .2, .8, 1]
        "upper": [0, .5, .5, .8, 1]
        "lower": [0, .2, .2, .8, 1]

    This function does not perform checks that `0 <= cdf <= 1`.

    Args:
        cdf (xr.DataArray): forecast CDF with thresholds in the `threshold_dim` dimension.
        threshold_dim (str): dimension in `cdf` that contains the threshold ordinates.

    Returns:
        An xarray DataArray consisting of three CDF arrays indexed along the `"cdf_type"`
        dimension with the following indices:
            - "original": same data as `cdf` (sorted by `threshold_dim`).
            - "upper": minimally adjusted "original" CDF that is nondecreasing and
              satisfies "upper" >= "original".
            - "lower": minimally adjusted "original" CDF that is nondecreasing and
              satisfies "lower" <= "original".
        NaN values in `cdf` are maintained in "original", "upper" and "lower".

    Raises:
        ValueError: If `threshold_dim` is not a dimension of `cdf`.
    """
    if threshold_dim not in cdf.dims:
        raise ValueError(f"'{threshold_dim}' is not a dimension of `cdf`")

    # logic below assumes that cdf[threshold_dim].values are ascending
    cdf = cdf.sortby(threshold_dim)

    result = xr.full_like(cdf.expand_dims({"cdf_type": ["original", "upper", "lower"]}), np.nan)

    axis = cdf.dims.index(threshold_dim)
    values = cdf.values
    not_nan = ~np.isnan(values)

    # `fmax`/`fmin` ignore NaN operands, so gaps do not poison the running extremes.
    # "upper" is the running maximum taken from the left.
    cdf_upper = np.fmax.accumulate(values, axis=axis)

    # "lower" is the running minimum taken from the right. It is computed directly
    # with `fmin` instead of as `1 - fmax.accumulate(1 - x)`, because the
    # `1 - (1 - x)` round trip is not exact in floating point and could shift
    # "lower" away from (even above) "original" on an already nondecreasing CDF.
    cdf_lower = np.flip(
        np.fmin.accumulate(np.flip(values, axis=axis), axis=axis),
        axis=axis,
    )

    result.loc["original"] = cdf.copy()
    # Put back the NaNs that the accumulations filled in.
    result.loc["upper"] = np.where(not_nan, cdf_upper, np.nan)
    result.loc["lower"] = np.where(not_nan, cdf_lower, np.nan)

    return result