Coverage for src/scores/probability/functions.py: 99%

1""" 

2This module contains a variety of functions which modify data in various ways to process data structures 

3to support probablistic verification. 

4""" 

5from collections.abc import Iterable 

6from typing import Literal, Optional 

7 

8import numpy as np 

9import pandas as pd 

10import xarray as xr 

11 

12from scores.probability.checks import ( 

13 cdf_values_within_bounds, 

14 check_nan_decreasing_inputs, 

15) 

16from scores.typing import XarrayLike 

17 

18 

def round_values(array: xr.DataArray, rounding_precision: float, final_round_decpl: int = 7) -> xr.DataArray:
    """Round a data array to a specified precision.

    Rounding is done differently to `xarray.DataArray.round` or `numpy.round`, where
    the number of decimal places is specified. Here, the rounding precision is
    specified as a float, and each value is rounded to the nearest multiple of
    `rounding_precision`.

    For example, 3.73 rounded to precision 0.2 is 3.8, and 37.3 rounded to precision 20
    is 40.

    Assumes that `rounding_precision` >= 0, with 0 indicating that no rounding is to be
    performed. If `rounding_precision` > 0, a final round to `final_round_decpl` decimal
    places is performed to remove artefacts of the Python rounding process.

    Args:
        array (xr.DataArray): array of data to be rounded.
        rounding_precision (float): rounding precision.
        final_round_decpl (int): final round to specified number of decimal
            places when `rounding_precision` > 0.

    Returns:
        xr.DataArray: DataArray with rounded values.

    Raises:
        ValueError: If `rounding_precision` < 0.
    """
    if rounding_precision < 0:
        raise ValueError(f"rounding_precision '{rounding_precision}' is negative")

    if rounding_precision > 0:
        array = (array / rounding_precision).round() * rounding_precision
        array = array.round(decimals=final_round_decpl)

    return array

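# Editor's illustrative sketch (not part of the scores package): a quick check of
# `round_values` against the worked examples in the docstring above. The
# `_example_round_values` helper name is hypothetical and exists only for illustration.
def _example_round_values() -> tuple:
    return (
        round_values(xr.DataArray([3.73]), 0.2),  # -> [3.8]
        round_values(xr.DataArray([37.3]), 20),   # -> [40.0]
    )
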

def propagate_nan(cdf: XarrayLike, threshold_dim: str) -> XarrayLike:
    """Propagates the NaN values from a "cdf" variable along the `threshold_dim`.

    Args:
        cdf (XarrayLike): CDF values, so that P(X <= threshold) = cdf_value for
            each threshold in the `threshold_dim` dimension.
        threshold_dim (str): name of the threshold dimension in `cdf`.

    Returns:
        XarrayLike: `cdf` variable with NaNs propagated.

    Raises:
        ValueError: If `threshold_dim` is not a dimension of `cdf`.
    """
    if threshold_dim not in cdf.dims:
        raise ValueError(f"'{threshold_dim}' is not a dimension of `cdf`")

    where_nan = xr.DataArray(np.isnan(cdf)).any(dim=threshold_dim)
    result = cdf.where(~where_nan, np.nan)
    return result

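# Editor's illustrative sketch (not part of the scores package): if any value along
# the threshold dimension is NaN, `propagate_nan` sets that whole CDF to NaN. The
# helper name and the dimension names "station"/"threshold" are arbitrary.
def _example_propagate_nan() -> xr.DataArray:
    cdf = xr.DataArray(
        [[0.0, 0.5, 1.0], [0.0, np.nan, 1.0]],
        dims=["station", "threshold"],
        coords={"threshold": [0, 1, 2]},
    )
    # the second row contains a NaN, so it becomes [nan, nan, nan]
    return propagate_nan(cdf, "threshold")
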

def observed_cdf(
    obs: xr.DataArray,
    threshold_dim: str,
    threshold_values: Optional[Iterable[float]] = None,
    include_obs_in_thresholds: bool = True,
    precision: float = 0,
) -> xr.DataArray:
    """Returns a data array of observations converted into CDF format.

    The returned value is 0 where threshold < observation, and 1 where
    threshold >= observation.

    Args:
        obs (xr.DataArray): observations.
        threshold_dim (str): name of the dimension in the returned array that contains the threshold values.
        threshold_values (Optional[Iterable[float]]): values to include among the thresholds.
        include_obs_in_thresholds (bool): if `True`, include (rounded) observed values among the thresholds.
        precision (float): precision applied to observed values prior to constructing the CDF and
            thresholds. Select 0 for highest precision (i.e. no rounding).

    Returns:
        xr.DataArray: Observed CDFs and thresholds in the `threshold_dim` dimension.

    Raises:
        ValueError: if `precision` < 0.
        ValueError: if all observations are NaN and no non-NaN `threshold_values`
            are supplied.
    """
    if precision < 0:
        raise ValueError("`precision` must be nonnegative.")

    threshold_values_as_array = np.array(threshold_values)

    if np.isnan(obs).all() and (threshold_values is None or np.isnan(threshold_values_as_array).all()):
        raise ValueError("must include non-NaN observations in thresholds or supply threshold values")

    if precision > 0:
        obs = round_values(obs, precision)

    thresholds = threshold_values_as_array if threshold_values is not None else []

    if include_obs_in_thresholds:
        thresholds = np.concatenate((obs.values.flatten(), thresholds))

    # remove any NaN
    thresholds = [x for x in thresholds if not np.isnan(x)]

    thresholds = np.sort(pd.unique(thresholds))

    da_thresholds = xr.DataArray(
        thresholds,
        dims=[threshold_dim],
        coords={threshold_dim: thresholds},
    )

    dab_obs, dab_thresholds = xr.broadcast(obs, da_thresholds)

    cdf = dab_thresholds >= dab_obs

    # convert to 0 and 1
    cdf = cdf.astype(float)

    cdf = cdf.where(~np.isnan(dab_obs))

    return cdf

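# Editor's illustrative sketch (not part of the scores package): an observation of 2.0
# with extra thresholds [1.0, 3.0] yields the step-function CDF [0, 1, 1] at thresholds
# [1, 2, 3]. The helper name and the "station"/"threshold" dimension names are arbitrary.
def _example_observed_cdf() -> xr.DataArray:
    obs = xr.DataArray([2.0], dims=["station"])
    return observed_cdf(obs, "threshold", threshold_values=[1.0, 3.0])
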

def integrate_square_piecewise_linear(function_values: xr.DataArray, threshold_dim: str) -> xr.DataArray:
    """Calculates integral values and collapses `threshold_dim`.

    Calculates integral(F(t)^2), where
        - if t is a threshold value in `threshold_dim`, then F(t) is the corresponding value in `function_values`,
        - F is piecewise linear between each of the t values in `threshold_dim`.

    Returns the value of the integral with `threshold_dim` collapsed and other dimensions preserved.
    Returns NaN if there are fewer than two non-NaN function values.

    This function assumes that
        - `threshold_dim` is a dimension of `function_values`,
        - the coordinates of `threshold_dim` are increasing.

    Args:
        function_values (xr.DataArray): array of function values F(t).
        threshold_dim (str): dimension along which to integrate.

    Returns:
        xr.DataArray: Integral values with `threshold_dim` collapsed.
    """
    # notation: since F is piecewise linear we have
    # F(t) = mt + b, whenever x[i-1] <= t <= x[i]

    # difference in x
    diff_xs = function_values[threshold_dim] - function_values[threshold_dim].shift(**{threshold_dim: 1})

    # difference in function values
    diff_ys = function_values - function_values.shift(**{threshold_dim: 1})

    # gradients m
    m_values = diff_ys / diff_xs

    # y intercepts b
    b_values = function_values.shift(**{threshold_dim: 1})

    # integral for x[i-1] <= t <= x[i]
    piece_integral = (
        (m_values**2) * (diff_xs**3) / 3 + m_values * b_values * (diff_xs**2) + (b_values**2) * diff_xs
    )

    # Need at least one non-NaN piece_integral to return a float.
    # Note: need at least two non-NaN function values to get a non-NaN piece_integral.
    return piece_integral.sum(threshold_dim, min_count=1)

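# Editor's note (illustrative, not part of the scores package): each piece is
# re-parameterised as F(x[i-1] + s) = m*s + b for 0 <= s <= dx, where b = F(x[i-1]),
# so the piece contributes
#     integral_0^dx (m*s + b)^2 ds = m^2 * dx^3 / 3 + m * b * dx^2 + b^2 * dx,
# which is exactly `piece_integral` above. A quick sanity check with F(t) = t on
# [0, 1], for which integral(F(t)^2) = 1/3. The helper name is hypothetical.
def _example_integrate_square_piecewise_linear() -> xr.DataArray:
    t = [0.0, 0.5, 1.0]
    f = xr.DataArray(t, dims=["threshold"], coords={"threshold": t})
    return integrate_square_piecewise_linear(f, "threshold")  # -> 1/3
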

def add_thresholds(
    cdf: xr.DataArray,
    threshold_dim: str,
    new_thresholds: Iterable[float],
    fill_method: Literal["linear", "step", "forward", "backward", "none"],
    min_nonnan: int = 2,
) -> xr.DataArray:
    """Takes a CDF data array with dimension `threshold_dim` and adds values from `new_thresholds`.

    The CDF is then filled to replace any NaN values. The array `cdf` requires at least
    two non-NaN values along `threshold_dim`.

    Args:
        cdf (xr.DataArray): array of CDF values.
        threshold_dim (str): name of the threshold dimension in `cdf`.
        new_thresholds (Iterable[float]): new thresholds to add to `cdf`.
        fill_method (Literal["linear", "step", "forward", "backward", "none"]): one of "linear",
            "step", "forward" or "backward", as described in `fill_cdf`. If no filling is required,
            set to "none".
        min_nonnan (int): passed on to `fill_cdf` when filling is performed.

    Returns:
        xr.DataArray: The CDF with additional thresholds, with values at those thresholds
        determined by the specified fill method.
    """
    thresholds = np.concatenate((cdf[threshold_dim].values, new_thresholds))
    thresholds = np.sort(pd.unique(thresholds))
    thresholds = thresholds[~np.isnan(thresholds)]

    da_thresholds = xr.DataArray(data=thresholds, dims=[threshold_dim], coords={threshold_dim: thresholds})

    da_cdf = xr.broadcast(cdf, da_thresholds)[0]

    if fill_method != "none":
        da_cdf = fill_cdf(da_cdf, threshold_dim, fill_method, min_nonnan)

    return da_cdf

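# Editor's illustrative sketch (not part of the scores package): inserting a threshold
# at 1.5 into a CDF known at thresholds [1, 2] and filling linearly gives 0.5 at the new
# threshold. The helper name and the "threshold" dimension name are arbitrary.
def _example_add_thresholds() -> xr.DataArray:
    cdf = xr.DataArray([0.0, 1.0], dims=["threshold"], coords={"threshold": [1.0, 2.0]})
    return add_thresholds(cdf, "threshold", [1.5], "linear")  # -> [0.0, 0.5, 1.0]
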

def fill_cdf(
    cdf: xr.DataArray,
    threshold_dim: str,
    method: Literal["linear", "step", "forward", "backward"],
    min_nonnan: int,
) -> xr.DataArray:
    """Fills NaNs in a CDF of a real-valued random variable along `threshold_dim` with appropriate values between 0 and 1.

    Args:
        cdf (xr.DataArray): CDF values, where P(Y <= threshold) = cdf_value for each threshold in `threshold_dim`.
        threshold_dim (str): the threshold dimension in the CDF, along which filling is performed.
        method (Literal["linear", "step", "forward", "backward"]): one of
            - "linear": use linear interpolation, and if needed also extrapolate linearly. Clip to 0 and 1.
              Needs at least two non-NaN values for interpolation, so returns NaNs where this condition fails.
            - "step": use forward filling then set remaining leading NaNs to 0.
              Produces a step function CDF (i.e. piecewise constant).
            - "forward": use forward filling then fill any remaining leading NaNs with backward filling.
            - "backward": use backward filling then fill any remaining trailing NaNs with forward filling.
        min_nonnan (int): the minimum number of non-NaN entries required along `threshold_dim` for filling to
            be performed. All CDF values are set to `np.nan` where this condition fails.
            `min_nonnan` must be at least 2 for the "linear" method, and at least 1 for the other methods.

    Returns:
        xr.DataArray: Containing the same values as `cdf` but with NaNs filled.

    Raises:
        ValueError: If `threshold_dim` is not a dimension of `cdf`.
        ValueError: If `min_nonnan` < 1 when `method` is "step", "forward" or "backward",
            or if `min_nonnan` < 2 when `method="linear"`.
        ValueError: If `method` is not "linear", "step", "forward" or "backward".
        ValueError: If any non-NaN value of `cdf` lies outside the unit interval [0, 1].
    """
    if method not in ["linear", "step", "forward", "backward"]:
        raise ValueError("`method` must be 'linear', 'step', 'forward' or 'backward'")

    if not cdf_values_within_bounds(cdf):
        raise ValueError("Input CDF has some values less than 0 or greater than 1.")

    if threshold_dim not in cdf.dims:
        raise ValueError(f"'{threshold_dim}' is not a dimension of `cdf`")

    if min_nonnan < 1 and method != "linear":
        raise ValueError(f"`min_nonnan` must be at least 1 when `method='{method}'`")

    if min_nonnan < 2 and method == "linear":
        raise ValueError("`min_nonnan` must be at least 2 when `method='linear'`")

    # set cdf values to NaN where the min_nonnan requirement fails
    nonnan_count = cdf.count(threshold_dim)
    cdf = cdf.where(nonnan_count >= min_nonnan)

    # NaN filling
    if method == "linear":
        cdf = cdf.interpolate_na(threshold_dim, method="linear", fill_value="extrapolate").clip(min=0, max=1)

    if method == "step":
        cdf = cdf.ffill(threshold_dim).fillna(0)
        # all-NaN cdfs will now be all zero, so bring back the NaNs
        cdf = cdf.where(nonnan_count >= min_nonnan)

    if method == "forward":
        cdf = cdf.ffill(threshold_dim).bfill(threshold_dim)

    if method == "backward":
        cdf = cdf.bfill(threshold_dim).ffill(threshold_dim)

    return cdf

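# Editor's illustrative sketch (not part of the scores package): the same partially
# known CDF filled with the "linear" and "step" methods, to show how the two behave
# around leading NaNs. The helper name and the "threshold" dimension name are arbitrary.
def _example_fill_cdf() -> tuple:
    cdf = xr.DataArray(
        [np.nan, 0.4, np.nan, 0.8],
        dims=["threshold"],
        coords={"threshold": [1.0, 2.0, 3.0, 4.0]},
    )
    linear = fill_cdf(cdf, "threshold", "linear", 2)  # -> [0.2, 0.4, 0.6, 0.8]
    step = fill_cdf(cdf, "threshold", "step", 1)      # -> [0.0, 0.4, 0.4, 0.8]
    return linear, step
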

def decreasing_cdfs(cdf: xr.DataArray, threshold_dim: str, tolerance: float) -> xr.DataArray:
    """Checks whether CDF values decrease along `threshold_dim` beyond a specified tolerance.

    A CDF of a real-valued random variable should be nondecreasing along `threshold_dim`.
    This is sometimes violated due to rounding issues or a poor forecast process.
    `decreasing_cdfs` checks whether CDF values decrease beyond the specified tolerance;
    that is, whether the sum of the incremental decreases is less than -`tolerance`.
    For example, if the CDF values are
        [0, 0.4, 0.3, 0.9, 0.88, 1]
    then the sum of the incremental decreases is -0.12. Given a specified positive `tolerance`,
    the CDF values decrease beyond tolerance if the sum of incremental decreases < -`tolerance`.

    Intended use is for CDFs with increasing coordinates along the `threshold_dim` dimension,
    and where each CDF is either always NaN or always non-NaN.

    Args:
        cdf (xr.DataArray): data array of CDF values.
        threshold_dim (str): threshold dimension, such that P(Y <= threshold) = cdf_value.
        tolerance (float): nonnegative tolerance value.

    Returns:
        xr.DataArray: Containing `threshold_dim` collapsed and values True if and only if
        the CDF is decreasing outside tolerance. If the CDF consists only of NaNs then
        the value is False.

    Raises:
        ValueError: If `threshold_dim` is not a dimension of `cdf`.
        ValueError: If `tolerance` is negative.
        ValueError: If coordinates are not increasing along `threshold_dim`.
        ValueError: If some, but not all, CDF values in `cdf` along `threshold_dim` are NaN.
    """
    check_nan_decreasing_inputs(cdf, threshold_dim, tolerance)

    # difference between consecutive terms along threshold_dim
    diff = cdf - cdf.shift(**{threshold_dim: 1})

    result = diff.clip(max=0).sum(dim=threshold_dim) < -tolerance

    return result

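# Editor's illustrative sketch (not part of the scores package): the docstring example
# has incremental decreases summing to -0.12, which exceeds a tolerance of 0.1 but not
# a tolerance of 0.2. The helper name and the "threshold" dimension name are arbitrary.
def _example_decreasing_cdfs() -> tuple:
    cdf = xr.DataArray(
        [0.0, 0.4, 0.3, 0.9, 0.88, 1.0],
        dims=["threshold"],
        coords={"threshold": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]},
    )
    return (
        decreasing_cdfs(cdf, "threshold", 0.1),  # -> True  (-0.12 < -0.1)
        decreasing_cdfs(cdf, "threshold", 0.2),  # -> False (-0.12 >= -0.2)
    )
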

def cdf_envelope(
    cdf: xr.DataArray,
    threshold_dim: str,
) -> xr.DataArray:
    """Returns the "envelope" of a forecast cumulative distribution function (CDF).

    Forecast CDFs for real-valued random variables that are reconstructed from known points
    on the distribution should be nondecreasing with respect to the threshold dimension.
    However, sometimes this may fail due to rounding or a poor forecast process. This function
    returns the "envelope" of the original CDF, which consists of two bounding CDFs, both of
    which are nondecreasing.

    The following example shows values from an original CDF that has a decreasing subsequence
    (and so is not a true CDF). The resulting "upper" and "lower" CDFs minimally adjust
    "original" so that "lower" <= "original" <= "upper":
        "original": [0, .5, .2, .8, 1]
        "upper":    [0, .5, .5, .8, 1]
        "lower":    [0, .2, .2, .8, 1]

    This function does not perform checks that `0 <= cdf <= 1`.

    Args:
        cdf (xr.DataArray): forecast CDF with thresholds in the `threshold_dim`.
        threshold_dim (str): dimension of `cdf` that contains the threshold ordinates.

    Returns:
        xr.DataArray: Consisting of three CDF arrays indexed along the `"cdf_type"` dimension
        with the following indices:
            - "original": same data as `cdf`.
            - "upper": minimally adjusted "original" CDF that is nondecreasing and
              satisfies "upper" >= "original".
            - "lower": minimally adjusted "original" CDF that is nondecreasing and
              satisfies "lower" <= "original".
        NaN values in `cdf` are maintained in "original", "upper" and "lower".

    Raises:
        ValueError: If `threshold_dim` is not a dimension of `cdf`.
    """
    if threshold_dim not in cdf.dims:
        raise ValueError(f"'{threshold_dim}' is not a dimension of `cdf`")

    # the logic below assumes that cdf[threshold_dim].values are ascending
    cdf = cdf.sortby(threshold_dim)

    result = xr.full_like(cdf.expand_dims({"cdf_type": ["original", "upper", "lower"]}), np.nan)

    dim_idx = cdf.dims.index(threshold_dim)

    # use fmax so as not to propagate NaNs
    cdf_upper = np.fmax.accumulate(cdf.values, axis=dim_idx)
    cdf_lower = np.flip(
        1 - np.fmax.accumulate(1 - np.flip(cdf.values, axis=dim_idx), axis=dim_idx),
        axis=dim_idx,
    )

    result.loc["original"] = cdf.copy()
    result.loc["upper"] = np.where(~np.isnan(cdf), cdf_upper, np.nan)
    result.loc["lower"] = np.where(~np.isnan(cdf), cdf_lower, np.nan)

    return result
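# Editor's illustrative sketch (not part of the scores package): reproduces the
# docstring example, returning "original", "upper" and "lower" stacked along the
# "cdf_type" dimension. The helper name and "threshold" dimension name are arbitrary.
def _example_cdf_envelope() -> xr.DataArray:
    cdf = xr.DataArray(
        [0.0, 0.5, 0.2, 0.8, 1.0],
        dims=["threshold"],
        coords={"threshold": [1.0, 2.0, 3.0, 4.0, 5.0]},
    )
    # expect "upper" == [0.0, 0.5, 0.5, 0.8, 1.0] and "lower" == [0.0, 0.2, 0.2, 0.8, 1.0]
    return cdf_envelope(cdf, "threshold")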