Coverage for /var/devmt/py/utils4_1.7.0/utils4/stats.py: 100%
70 statements
« prev ^ index » next coverage.py v7.6.9, created at 2024-12-21 17:18 +0000
« prev ^ index » next coverage.py v7.6.9, created at 2024-12-21 17:18 +0000
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3"""
4:Purpose: Provide access to various statistical calculations, namely:
6 - **CUSUM:** :meth:`~Stats.cusum`
7 - **Gaussian KDE:** :meth:`~Stats.kde`
8 - **Linear Regression:** :class:`~LinearRegression`
10:Platform: Linux/Windows | Python 3.7+
11:Developer: J Berendt
12:Email: development@s3dev.uk
14:Comments: n/a
16:Example:
18 Create a sample dataset for the stats methods::
20 >>> import matplotlib.pyplot as plt
21 >>> import numpy as np
22 >>> import pandas as pd
24 >>> np.random.seed(73)
25 >>> data = np.random.normal(size=100)*100
26 >>> x = np.arange(data.size)
27 >>> y = pd.Series(data).rolling(window=25, min_periods=25).mean().cumsum()
29 >>> # Preview the trend.
30 >>> plt.plot(x, y)
32"""
33# pylint: disable=line-too-long
34# pylint: disable=wrong-import-order
36import numpy as np
37import pandas as pd
38from scipy.stats import gaussian_kde
39from typing import Union
40# local
41from utils4.reporterror import reporterror
class LinearRegression:
    """Calculate the linear regression of a dataset.

    Args:
        x (np.ndarray): Array of X-values. A ``list`` or ``tuple`` of
            numbers is also accepted and is converted to a float
            ``numpy.ndarray``.
        y (np.ndarray): Array of Y-values. A ``list`` or ``tuple`` of
            numbers is also accepted and is converted to a float
            ``numpy.ndarray``.

    :Slope Calculation:
        The calculation for the slope itself is borrowed from the
        :func:`scipy.stats.linregress` function. Whose `source code`_ was
        obtained on GitHub.

        .. _source code: https://github.com/scipy/scipy/blob/v1.8.0/scipy/stats/_stats_mstats_common.py#L16-L203

    :Example Use:

        .. tip::

            For a sample dataset and imports to go along with this
            example, refer to the docstring for
            :mod:`this module <stats>`.

        Calculate a linear regression line on an X/Y dataset::

            >>> from utils4.stats import LinearRegression

            >>> linreg = LinearRegression(x, y)
            >>> linreg.calculate()

            >>> # Obtain the regression line array.
            >>> y_ = linreg.regression_line

            >>> # View the intercept value.
            >>> linreg.intercept
            -31.26630

            >>> # View the slope value.
            >>> linreg.slope
            1.95332

            >>> # Plot the trend and regression line.
            >>> plt.plot(x, y, 'grey')
            >>> plt.plot(x, y_, 'red')
            >>> plt.show()

    """

    def __init__(self, x: np.ndarray, y: np.ndarray):
        """LinearRegression class initialiser.

        Args:
            x (np.ndarray): Array of X-values.
            y (np.ndarray): Array of Y-values.

        """
        self._x = self._as_array(x)
        self._y = self._as_array(y)
        self._xbar = 0.0
        self._ybar = 0.0
        self._c = 0.0
        self._m = 0.0
        self._line = np.array(())

    @property
    def slope(self):
        """Accessor to the slope value."""
        return self._m

    @property
    def intercept(self):
        """Accessor to the slope's y-intercept."""
        return self._c

    @property
    def regression_line(self):
        """Accessor to the calculated regression line, as y-values."""
        return self._line

    def calculate(self):
        """Calculate the linear regression for the X/Y data arrays.

        The result of the calculation is accessible via the
        :attr:`regression_line` property. The :attr:`slope` and
        :attr:`intercept` properties are updated as well.

        """
        # Order matters: the slope needs the means, and the intercept
        # and regression line need the slope.
        self._calc_means()
        self._calc_slope()
        self._calc_intercept()
        self._calc_regression_line()

    @staticmethod
    def _as_array(data):
        """Convert a plain ``list`` or ``tuple`` to a float numpy array.

        Any other object (e.g. ``numpy.ndarray`` or ``pandas.Series``) is
        returned untouched, so the behaviour for array inputs is
        unchanged.

        Args:
            data: The X or Y values, as passed to the initialiser.

        Returns:
            The values as a ``numpy.ndarray`` if a ``list`` or ``tuple``
            was given, otherwise the original object.

        """
        if isinstance(data, (list, tuple)):
            # Plain sequences have neither .mean() nor element-wise
            # arithmetic, both of which the calculations rely on.
            return np.asarray(data, dtype=float)
        return data

    def _calc_intercept(self) -> None:
        """Calculate the intercept as: ybar - m * xbar."""
        self._c = self._ybar - self._m * self._xbar

    def _calc_means(self) -> None:
        """Calculate (and store) the mean of the X and Y arrays."""
        self._xbar = self._x.mean()
        self._ybar = self._y.mean()

    def _calc_regression_line(self) -> None:
        """Calculate the regression line as: y = mx + c."""
        self._line = self._m * self._x + self._c

    def _calc_slope(self) -> None:
        """Calculate the slope value as: R * ( std(y) / std(x) ).

        This is computed in the equivalent form used by ``scipy``, as the
        mean cross-product of the deviations divided by the mean squared
        deviation of X.

        Per the ``scipy`` source code comments::

            # Average sums of square differences from the mean
            #   ssxm = mean( (x-mean(x))^2 )
            #   ssxym = mean( (x-mean(x)) * (y-mean(y)) )

            ...

            slope = ssxym / ssxm

        """
        ssxm = np.mean((self._x - self._xbar)**2)
        ssxym = np.mean((self._x - self._xbar) * (self._y - self._ybar))
        self._m = ssxym / ssxm

    @staticmethod
    def _calc_std(data: np.ndarray, ddof: int=1) -> float:  # pragma nocover
        """Calculate the standard deviation.

        Args:
            data (np.ndarray): Array of values.
            ddof (int): Degrees of freedom. Defaults to 1.

        Returns:
            float: Standard deviation of the given values.

        """
        return np.std(data, ddof=ddof)
class Stats:
    """Wrapper class for various statistical calculations."""

    @staticmethod
    def cusum(df: pd.DataFrame,
              cols: Union[list, str],
              *,
              window: Union[int, None]=None,
              min_periods: int=1,
              inplace: bool=False,
              show_plot: bool=False) -> Union[pd.DataFrame, None]:
        r"""Calculate a CUSUM on a set of data.

        A CUSUM is a generalised method for smoothing a noisy trend, or
        for detecting a change in the trend.

        Note:
            A CUSUM is *not* a cumulative sum (cumsum), although a
            cumulative sum is used. A CUSUM is a cumulative sum of
            derived values, where each derived value is calculated as the
            delta of a single value relative to the rolling mean of all
            previous values.

        Args:
            df (pd.DataFrame): The DataFrame containing the column(s) on
                which a CUSUM is to be calculated.
            cols (Union[list, str]): The column (or list of columns) on
                which the CUSUM is to be calculated.
            window (int, optional): Size of the window on which the
                rolling mean is to be calculated. This corresponds to the
                ``pandas.df.rolling(window)`` parameter.
                Defaults to None.

                - If None is received, a 5% window is calculated based on
                  the length of the DataFrame. This method helps smooth
                  the trend, while keeping a representation of the
                  original trend. The calculated window is never smaller
                  than 1, nor smaller than ``min_periods``, so short
                  DataFrames (fewer than 20 rows) are supported.
                - For a *true* CUSUM, a running average should be
                  calculated on the length of the DataFrame, except for
                  the current value. For this method, pass
                  ``window=len(df)``.

            min_periods (int, optional): Number of periods to wait before
                calculating the rolling average. Defaults to 1.
            inplace (bool, optional): Update the passed DataFrame
                (in-place), rather than returning a *copy* of the passed
                DataFrame. Defaults to False.
            show_plot (bool, optional): Display a graph of the raw value,
                and the calculated CUSUM results. Defaults to False.

        :Calculation:
            The CUSUM is calculated by taking a rolling mean :math:`RA`
            (optionally locked at the first value), and calculating the
            delta of the current value, relative to the rolling mean of
            all previous values. A cumulative sum is applied to the
            deltas. The cumulative sum for each data point is returned as
            the CUSUM value.

        :Equation:

            :math:`c_n = \sum_{i=1}^{n}(x_i - RA_i)`

            where :math:`RA` (Rolling Mean), for a *true* CUSUM
            (``window=len(df)``), is defined as:

            :math:`RA_{i+1} = \frac{1}{i}\sum_{j=1}^{i}x_j`

        :Example Use:

            Generate a *sample* trend dataset::

                >>> import numpy as np
                >>> import pandas as pd

                >>> np.random.seed(13)
                >>> s1 = pd.Series(np.random.randn(1000)).rolling(window=100).mean()
                >>> np.random.seed(73)
                >>> s2 = pd.Series(np.random.randn(1000)).rolling(window=100).mean()
                >>> df = pd.DataFrame({'sample1': s1, 'sample2': s2})


            Example for calculating a CUSUM on two columns::

                >>> from utils4.stats import stats

                >>> df_c = stats.cusum(df=df,
                ...                    cols=['sample1', 'sample2'],
                ...                    window=len(df),
                ...                    inplace=False,
                ...                    show_plot=True)
                >>> df_c.tail()
                      sample1   sample2  sample1_cusum  sample2_cusum
                995  0.057574  0.065887      23.465337      29.279936
                996  0.062781  0.072213      23.556592      29.369397
                997  0.028513  0.072658      23.613478      29.459204
                998  0.024518  0.070769      23.666305      29.547022
                999  0.000346  0.074849      23.694901      29.638822

        Returns:
            Union[pd.DataFrame, None]: If the ``inplace`` argument is
            ``False``, a *copy* of the original DataFrame with the new
            CUSUM columns appended is returned. Otherwise, the passed
            DataFrame is *updated*, and ``None`` is returned.

        """
        # Convert a single column name to a list.
        cols = [cols] if not isinstance(cols, list) else cols
        if not inplace:
            df = df.copy(deep=True)
        if window is None:
            # Set default window as 5%. The window is clamped, as a 5%
            # window truncates to 0 for fewer than 20 rows, and pandas
            # rejects a window which is smaller than min_periods.
            window = max(int(len(df) * 0.05), min_periods or 1, 1)
        for col in cols:
            new_col = f'{col}_cusum'
            # CUSUM calculation (rolling_sum on rolling_mean, with a shift of 1).
            # The shift aligns each value with the mean of the values
            # *before* it; the outer full-length window acts as a
            # cumulative sum of the deltas.
            df[new_col] = ((df[col] - df[col].rolling(window=window, min_periods=min_periods)
                            .mean()
                            .shift(1))
                           .rolling(window=len(df), min_periods=min_periods).sum())
            # Show simple plot if requested.
            if show_plot:  # pragma: nocover
                df[[col, new_col]].plot(title=f'TEMP PLOT\n{col} vs {new_col}',
                                        color=['lightgrey', 'red'],
                                        secondary_y=new_col,
                                        legend=False,
                                        grid=False)
        return None if inplace else df

    def kde(self,
            data: Union[list, np.ndarray, pd.Series],
            n: int=500) -> tuple:
        """Calculate the kernel density estimate (KDE) for an array X.

        This function returns the *probability density* (PDF) using
        Gaussian KDE.

        Args:
            data (Union[list, np.ndarray, pd.Series]): An array-like
                object containing the data against which the Gaussian KDE
                is calculated. This can be a list, numpy array, or pandas
                Series.
            n (int, optional): Number of values returned in the X, Y
                arrays. Defaults to 500.

        :Example Use:

            .. tip::

                For a sample dataset and imports to go along with this
                example, refer to the docstring for
                :mod:`this module <stats>`.

            Calculate a Gaussian KDE on Y::

                >>> from utils4.stats import stats

                >>> # Preview the histogram.
                >>> _ = plt.hist(data)

                >>> X, Y, max_x = stats.kde(data=data, n=500)
                >>> plt.plot(X, Y)

                >>> # Show X value at peak of curve.
                >>> max_x
                -9.718684033029376

        :Max X:
            This function also returns the X value of the curve's peak;
            where ``max_x`` is the ``X`` value corresponding to the max
            ``Y`` value on the curve. The result (``max_x``) is
            returned as the third tuple element.

        :Further Detail:
            This method uses the :func:`scipy.stats.gaussian_kde` method
            for the KDE calculation. For further detail on the
            calculation itself, refer to that function's docstring.

        :Background:
            Originally, :func:`plotly.figure_factory.dist_plot` was used
            to calculate the KDE. However, to remove the ``plotly``
            dependency from this library, their code was copied and
            refactored (simplified) into this function. Both the
            :func:`dist_plot` and :func:`pandas.DataFrame.plot.kde`
            method call :func:`scipy.stats.gaussian_kde` for the
            calculation, which this function also calls.

        Returns:
            tuple: A tuple containing the X-array, Y-array
            (both of ``n`` size), as well as the X value at max Y, as::

                (curve_x, curve_y, max_x)

            If the calculation fails, the error is passed to
            ``reporterror`` and two empty arrays and ``0.0`` are
            returned instead.

        """
        try:
            data_ = self._obj_to_array(data=data)
            # Evaluate the density on n evenly spaced points which span
            # the range of the (nan-free) data.
            curve_x = np.linspace(data_.min(), data_.max(), n)
            curve_y = gaussian_kde(data_).evaluate(curve_x)
            max_x = curve_x[curve_y.argmax()]
            return (curve_x, curve_y, max_x)
        except Exception as err:
            # Deliberate best-effort: report the error, return empties.
            reporterror(err)
            return (np.array(()), np.array(()), 0.0)

    @staticmethod
    def _obj_to_array(data: Union[list, np.ndarray, pd.Series]) -> np.ndarray:
        """Convert an iterable object to a numpy array.

        Args:
            data (Union[list, np.ndarray, pd.Series]): Array-like object
                to be converted into a ``numpy.ndarray``. Other
                sequences (e.g. a ``tuple``) are converted in the same
                way as a ``list``.

        :NaN Values:
            In addition to converting the given object to a
            ``numpy.ndarray``, any ``nan`` values are dropped from
            the resulting array.

        Returns:
            np.ndarray: A ``numpy.ndarray``, with ``nan`` values removed.

        """
        if isinstance(data, np.ndarray):
            data_ = data
        elif isinstance(data, pd.Series):
            data_ = data.astype(float).to_numpy()
        else:
            # Lists, tuples and any other sequence numpy can convert.
            data_ = np.array(data)
        return data_[~np.isnan(data_)]


# Module-level instance, enabling the ``from utils4.stats import stats``
# usage shown in the docstring examples.
stats = Stats()