Coverage for /var/devmt/py/utils4_1.7.0/utils4/stats.py: 100%

70 statements  

« prev     ^ index     » next       coverage.py v7.6.9, created at 2024-12-21 17:18 +0000

1#!/usr/bin/env python 

2# -*- coding: utf-8 -*- 

3""" 

4:Purpose: Provide access to various statistical calculations, namely: 

5 

6 - **CUSUM:** :meth:`~Stats.cusum` 

7 - **Gaussian KDE:** :meth:`~Stats.kde` 

8 - **Linear Regression:** :class:`~LinearRegression` 

9 

10:Platform: Linux/Windows | Python 3.7+ 

11:Developer: J Berendt 

12:Email: development@s3dev.uk 

13 

14:Comments: n/a 

15 

16:Example: 

17 

18 Create a sample dataset for the stats methods:: 

19 

20 >>> import matplotlib.pyplot as plt 

21 >>> import numpy as np 

22 >>> import pandas as pd 

23 

24 >>> np.random.seed(73) 

25 >>> data = np.random.normal(size=100)*100 

26 >>> x = np.arange(data.size) 

27 >>> y = pd.Series(data).rolling(window=25, min_periods=25).mean().cumsum() 

28 

29 >>> # Preview the trend. 

30 >>> plt.plot(x, y) 

31 

32""" 

33# pylint: disable=line-too-long 

34# pylint: disable=wrong-import-order 

35 

36import numpy as np 

37import pandas as pd 

38from scipy.stats import gaussian_kde 

39from typing import Union 

40# local 

41from utils4.reporterror import reporterror 

42 

43 

class LinearRegression:
    """Calculate the linear regression of a dataset.

    Args:
        x (np.ndarray): Array of X-values.
        y (np.ndarray): Array of Y-values.

    Plain ``list`` / ``tuple`` inputs are converted to float arrays.
    Any other object (e.g. ``numpy.ndarray``, ``pandas.Series``) is used
    exactly as passed, so its own ``.mean()`` and arithmetic semantics
    apply to the calculation.

    :Slope Calculation:
        The calculation for the slope itself is borrowed from the
        :func:`scipy.stats.linregress` function. Whose `source code`_ was
        obtained on GitHub.

        .. _source code: https://github.com/scipy/scipy/blob/v1.8.0/scipy/stats/_stats_mstats_common.py#L16-L203

    :Example Use:

        .. tip::

            For a sample dataset and imports to go along with this
            example, refer to the docstring for
            :mod:`this module <stats>`.

        Calculate a linear regression line on an X/Y dataset::

            >>> from utils4.stats import LinearRegression

            >>> linreg = LinearRegression(x, y)
            >>> linreg.calculate()

            >>> # Obtain the regression line array.
            >>> y_ = linreg.regression_line

            >>> # View the intercept value.
            >>> linreg.intercept
            -31.26630

            >>> # View the slope value.
            >>> linreg.slope
            1.95332

            >>> # Plot the trend and regression line.
            >>> plt.plot(x, y, 'grey')
            >>> plt.plot(x, y_, 'red')
            >>> plt.show()

    """

    def __init__(self, x: np.ndarray, y: np.ndarray):
        """LinearRegression class initialiser.

        Args:
            x (np.ndarray): Array of X-values.
            y (np.ndarray): Array of Y-values.

        """
        self._x = self._to_array(x)
        self._y = self._to_array(y)
        self._xbar = 0.0
        self._ybar = 0.0
        self._c = 0.0
        self._m = 0.0
        self._line = np.array(())

    @property
    def slope(self):
        """Accessor to the slope value (0.0 until :meth:`calculate` runs)."""
        return self._m

    @property
    def intercept(self):
        """Accessor to the slope's y-intercept (0.0 until :meth:`calculate` runs)."""
        return self._c

    @property
    def regression_line(self):
        """Accessor to the calculated regression line, as y-values.

        An empty array is returned until :meth:`calculate` has been called.

        """
        return self._line

    def calculate(self) -> None:
        """Calculate the linear regression for the X/Y data arrays.

        The result of the calculation is accessible via the
        :attr:`regression_line` property; the fitted coefficients via the
        :attr:`slope` and :attr:`intercept` properties.

        """
        # Order matters: the slope needs the means, the intercept needs
        # the slope, and the line needs both coefficients.
        self._calc_means()
        self._calc_slope()
        self._calc_intercept()
        self._calc_regression_line()

    def _calc_intercept(self) -> None:
        """Calculate the intercept as: ybar - m * xbar."""
        self._c = self._ybar - self._m * self._xbar

    def _calc_means(self) -> None:
        """Calculate and store the mean of the X and Y arrays."""
        self._xbar = self._x.mean()
        self._ybar = self._y.mean()

    def _calc_regression_line(self) -> None:
        """Calculate the regression line as: y = mx + c."""
        self._line = self._m * self._x + self._c

    def _calc_slope(self) -> None:
        """Calculate the slope value as: R * ( std(y) / std(x) ).

        This is computed using the equivalent form from the ``scipy``
        source code, whose comments read::

            # Average sums of square differences from the mean
            #   ssxm = mean( (x-mean(x))^2 )
            #   ssxym = mean( (x-mean(x)) * (y-mean(y)) )

            ...

            slope = ssxym / ssxm

        Note:
            No guard is applied for a zero ``ssxm`` (all X-values
            identical); the division is performed as-is.

        """
        ssxm = np.mean((self._x - self._xbar)**2)
        ssxym = np.mean((self._x - self._xbar) * (self._y - self._ybar))
        self._m = ssxym / ssxm

    @staticmethod
    def _calc_std(data: np.ndarray, ddof: int = 1) -> float:  # pragma nocover
        """Calculate the standard deviation.

        Args:
            data (np.ndarray): Array of values.
            ddof (int): Degrees of freedom. Defaults to 1.

        Returns:
            float: Standard deviation of the given values.

        """
        return np.std(data, ddof=ddof)

    @staticmethod
    def _to_array(data):
        """Convert a plain list or tuple into a float array.

        Args:
            data: The X or Y values passed to the initialiser.

        Returns:
            A float ``numpy.ndarray`` if ``data`` is a ``list`` or
            ``tuple``; otherwise ``data`` itself, unchanged.

        """
        # Built-in sequences have no .mean() and no element-wise
        # arithmetic, both of which the calculation relies on.
        if isinstance(data, (list, tuple)):
            return np.asarray(data, dtype=float)
        return data

172 

173 

class Stats:
    """Wrapper class for various statistical calculations."""

    @staticmethod
    def cusum(df: pd.DataFrame,
              cols: Union[list, str],
              *,
              window: Union[int, None] = None,
              min_periods: int = 1,
              inplace: bool = False,
              show_plot: bool = False) -> Union[pd.DataFrame, None]:
        r"""Calculate a CUSUM on a set of data.

        A CUSUM is a generalised method for smoothing a noisy trend, or
        for detecting a change in the trend.

        Note:
            A CUSUM is *not* a cumulative sum (cumsum), although a
            cumulative sum is used. A CUSUM is a cumulative sum of
            derived values, where each derived value is calculated as the
            delta of a single value relative to the rolling mean of all
            previous values.

        Args:
            df (pd.DataFrame): The DataFrame containing the column(s) on
                which a CUSUM is to be calculated.
            cols (Union[list, str]): The column (or list of columns) on
                which the CUSUM is to be calculated.
            window (int, optional): Size of the window on which the
                rolling mean is to be calculated. This corresponds to the
                ``pandas.df.rolling(window)`` parameter.
                Defaults to None.

                - If None is received, a 5% window is calculated based on
                  the length of the DataFrame (with a minimum window of
                  1 row). This method helps smooth the trend, while
                  keeping a representation of the original trend.
                - For a *true* CUSUM, a running average should be
                  calculated on the length of the DataFrame, except for
                  the current value. For this method, pass
                  ``window=len(df)``.

            min_periods (int, optional): Number of periods to wait before
                calculating the rolling average. Defaults to 1.
            inplace (bool, optional): Update the passed DataFrame
                (in-place), rather than returning a *copy* of the passed
                DataFrame. Defaults to False.
            show_plot (bool, optional): Display a graph of the raw value,
                and the calculated CUSUM results. Defaults to False.

        :Calculation:
            The CUSUM is calculated by taking a rolling mean :math:`RA`
            (optionally locked at the first value), and calculating the
            delta of the current value, relative to the rolling mean of
            all previous values. A cumulative sum is applied to the
            deltas. The cumulative sum for each data point is returned as
            the CUSUM value.

        :Equation:

            :math:`c_i = \sum_{i=1}^{n}(x_i - RA_i)`

            where :math:`RA` (Rolling Mean) is defined as:

            :math:`RA_{i+1} = \frac{1}{n}\sum_{j=1}^{n}x_j`

        :Example Use:

            Generate a *sample* trend dataset::

                >>> import numpy as np
                >>> import pandas as pd

                >>> np.random.seed(13)
                >>> s1 = pd.Series(np.random.randn(1000)).rolling(window=100).mean()
                >>> np.random.seed(73)
                >>> s2 = pd.Series(np.random.randn(1000)).rolling(window=100).mean()
                >>> df = pd.DataFrame({'sample1': s1, 'sample2': s2})


            Example for calculating a CUSUM on two columns::

                >>> from utils4.stats import stats

                >>> df_c = stats.cusum(df=df,
                ...                    cols=['sample1', 'sample2'],
                ...                    window=len(df),
                ...                    inplace=False,
                ...                    show_plot=True)
                >>> df_c.tail()
                      sample1   sample2  sample1_cusum  sample2_cusum
                995  0.057574  0.065887      23.465337      29.279936
                996  0.062781  0.072213      23.556592      29.369397
                997  0.028513  0.072658      23.613478      29.459204
                998  0.024518  0.070769      23.666305      29.547022
                999  0.000346  0.074849      23.694901      29.638822

        Returns:
            Union[pd.DataFrame, None]: If the ``inplace`` argument is
            ``False``, a *copy* of the original DataFrame with the new
            CUSUM columns appended is returned. Otherwise, the passed
            DataFrame is *updated*, and ``None`` is returned.

        """
        # Convert a single column name to a list.
        cols = [cols] if not isinstance(cols, list) else cols
        if not inplace:
            df = df.copy(deep=True)
        nrows = len(df)
        if window is None:
            # Default window is 5% of the rows. Clamp to 1, as frames with
            # fewer than 20 rows would otherwise yield a window of 0,
            # which pandas rejects when min_periods >= 1.
            window = max(int(nrows * 0.05), 1)
        # The cumulative sum spans the whole frame; the clamp only guards
        # a zero-length frame, where a window of 0 would be rejected.
        sum_window = max(nrows, 1)
        for col in cols:
            new_col = f'{col}_cusum'
            # Rolling mean shifted by 1, so each value is compared against
            # the mean of the values *before* it.
            prior_mean = (df[col].rolling(window=window, min_periods=min_periods)
                          .mean()
                          .shift(1))
            # CUSUM: running sum of the deltas from the prior rolling mean.
            df[new_col] = ((df[col] - prior_mean)
                           .rolling(window=sum_window, min_periods=min_periods)
                           .sum())
            # Show simple plot if requested.
            if show_plot:  # pragma: nocover
                df[[col, new_col]].plot(title=f'TEMP PLOT\n{col} vs {new_col}',
                                        color=['lightgrey', 'red'],
                                        secondary_y=new_col,
                                        legend=False,
                                        grid=False)
        return None if inplace else df

    def kde(self,
            data: Union[list, np.ndarray, pd.Series],
            n: int = 500) -> tuple:
        """Calculate the kernel density estimate (KDE) for an array X.

        This function returns the *probability density* (PDF) using
        Gaussian KDE.

        Args:
            data (Union[list, np.ndarray, pd.Series]): An array-like
                object containing the data against which the Gaussian KDE
                is calculated. This can be a list, numpy array, or pandas
                Series.
            n (int, optional): Number of values returned in the X, Y
                arrays. Defaults to 500.

        :Example Use:

            .. tip::

                For a sample dataset and imports to go along with this
                example, refer to the docstring for
                :mod:`this module <stats>`.

            Calculate a Gaussian KDE on Y::

                >>> from utils4.stats import stats

                >>> # Preview the histogram.
                >>> _ = plt.hist(data)

                >>> X, Y, max_x = stats.kde(data=data, n=500)
                >>> plt.plot(X, Y)

                >>> # Show X value at peak of curve.
                >>> max_x
                -9.718684033029376

        :Max X:
            This function also returns the X value of the curve's peak;
            where ``max_x`` is the ``X`` value corresponding to the max
            ``Y`` value on the curve. The result (``max_x``) is
            returned as the third tuple element.

        :Further Detail:
            This method uses the :func:`scipy.stats.gaussian_kde` method
            for the KDE calculation. For further detail on the
            calculation itself, refer to that function's docstring.

        :Background:
            Originally, :func:`plotly.figure_factory.dist_plot` was used
            to calculate the KDE. However, to remove the ``plotly``
            dependency from this library, their code was copied and
            refactored (simplified) into this function. Both the
            :func:`dist_plot` and :func:`pandas.DataFrame.plot.kde`
            method call :func:`scipy.stats.gaussian_kde` for the
            calculation, which this function also calls.

        Returns:
            tuple: A tuple containing the X-array, Y-array
            (both of ``n`` size), as well as the X value at max Y, as::

                (curve_x, curve_y, max_x)

            If the calculation fails, the error is passed to
            ``reporterror`` and ``(empty array, empty array, 0.0)`` is
            returned instead.

        """
        try:
            data_ = self._obj_to_array(data=data)
            # Evaluate the density on n evenly spaced points spanning the data.
            curve_x = np.linspace(data_.min(), data_.max(), n)
            curve_y = gaussian_kde(data_).evaluate(curve_x)
            max_x = curve_x[curve_y.argmax()]
            return (curve_x, curve_y, max_x)
        except Exception as err:
            # Deliberate best-effort: report the error and hand back an
            # empty result rather than raising to the caller.
            reporterror(err)
            return (np.array(()), np.array(()), 0.0)

    @staticmethod
    def _obj_to_array(data: Union[list, np.ndarray, pd.Series]) -> np.ndarray:
        """Convert an array-like object to a numpy array.

        Args:
            data (Union[list, np.ndarray, pd.Series]): Array-like object
                to be converted into a ``numpy.ndarray``.

        :Conversion:
            - A ``numpy.ndarray`` is used as-is.
            - A ``pd.Series`` is cast to ``float`` and converted.
            - Any other object (e.g. a list or tuple) is converted using
              ``numpy.asarray``.

        :NaN Values:
            After conversion, any ``nan`` values are dropped from the
            array.

        Returns:
            np.ndarray: A ``numpy.ndarray``, with ``nan`` values removed.

        """
        if isinstance(data, np.ndarray):
            data_ = data
        elif isinstance(data, pd.Series):
            data_ = data.astype(float).to_numpy()
        else:
            data_ = np.asarray(data)
        # Boolean-mask out the NaN entries.
        return data_[~np.isnan(data_)]


# Module-level instance, imported as: from utils4.stats import stats
stats = Stats()