Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/robust/robust_linear_model.py : 23%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Robust linear models with support for the M-estimators listed under
3:ref:`norms <norms>`.
5References
6----------
7PJ Huber. 'Robust Statistics' John Wiley and Sons, Inc., New York. 1981.
9PJ Huber. 1973, 'The 1972 Wald Memorial Lectures: Robust Regression:
10 Asymptotics, Conjectures, and Monte Carlo.' The Annals of Statistics,
11 1.5, 799-821.
13R Venables, B Ripley. 'Modern Applied Statistics in S' Springer, New York,
14 2002.
15"""
16import numpy as np
17import scipy.stats as stats
19from statsmodels.tools.decorators import cache_readonly
20from statsmodels.tools.sm_exceptions import ConvergenceWarning
21import statsmodels.regression.linear_model as lm
22import statsmodels.regression._tools as reg_tools
23import statsmodels.robust.norms as norms
24import statsmodels.robust.scale as scale
25import statsmodels.base.model as base
26import statsmodels.base.wrapper as wrap
28__all__ = ['RLM']
31def _check_convergence(criterion, iteration, tol, maxiter):
32 cond = np.abs(criterion[iteration] - criterion[iteration - 1])
33 return not (np.any(cond > tol) and iteration < maxiter)
36class RLM(base.LikelihoodModel):
37 __doc__ = """
38 Robust Linear Model
40 Estimate a robust linear model via iteratively reweighted least squares
41 given a robust criterion estimator.
43 %(params)s
44 M : statsmodels.robust.norms.RobustNorm, optional
45 The robust criterion function for downweighting outliers.
46 The current options are LeastSquares, HuberT, RamsayE, AndrewWave,
47 TrimmedMean, Hampel, and TukeyBiweight. The default is HuberT().
48 See statsmodels.robust.norms for more information.
49 %(extra_params)s
51 Attributes
52 ----------
54 df_model : float
55 The degrees of freedom of the model. The number of regressors p less
56 one for the intercept. Note that the reported model degrees
57 of freedom does not count the intercept as a regressor, though
58 the model is assumed to have an intercept.
59 df_resid : float
60 The residual degrees of freedom. The number of observations n
61 less the number of regressors p. Note that here p does include
62 the intercept as using a degree of freedom.
63 endog : ndarray
64 See above. Note that endog is a reference to the data so that if
65 data is already an array and it is changed, then `endog` changes
66 as well.
67 exog : ndarray
68 See above. Note that endog is a reference to the data so that if
69 data is already an array and it is changed, then `endog` changes
70 as well.
71 M : statsmodels.robust.norms.RobustNorm
72 See above. Robust estimator instance instantiated.
73 nobs : float
74 The number of observations n
75 pinv_wexog : ndarray
76 The pseudoinverse of the design / exogenous data array. Note that
77 RLM has no whiten method, so this is just the pseudo inverse of the
78 design.
79 normalized_cov_params : ndarray
80 The p x p normalized covariance of the design / exogenous data.
81 This is approximately equal to (X.T X)^(-1)
83 Examples
84 --------
85 >>> import statsmodels.api as sm
86 >>> data = sm.datasets.stackloss.load(as_pandas=False)
87 >>> data.exog = sm.add_constant(data.exog)
88 >>> rlm_model = sm.RLM(data.endog, data.exog, \
89 M=sm.robust.norms.HuberT())
91 >>> rlm_results = rlm_model.fit()
92 >>> rlm_results.params
93 array([ 0.82938433, 0.92606597, -0.12784672, -41.02649835])
94 >>> rlm_results.bse
95 array([ 0.11100521, 0.30293016, 0.12864961, 9.79189854])
96 >>> rlm_results_HC2 = rlm_model.fit(cov="H2")
97 >>> rlm_results_HC2.params
98 array([ 0.82938433, 0.92606597, -0.12784672, -41.02649835])
99 >>> rlm_results_HC2.bse
100 array([ 0.11945975, 0.32235497, 0.11796313, 9.08950419])
101 >>> mod = sm.RLM(data.endog, data.exog, M=sm.robust.norms.Hampel())
102 >>> rlm_hamp_hub = mod.fit(scale_est=sm.robust.scale.HuberScale())
103 >>> rlm_hamp_hub.params
104 array([ 0.73175452, 1.25082038, -0.14794399, -40.27122257])
105 """ % {'params': base._model_params_doc,
106 'extra_params': base._missing_param_doc}
108 def __init__(self, endog, exog, M=None, missing='none',
109 **kwargs):
110 self.M = M if M is not None else norms.HuberT()
111 super(base.LikelihoodModel, self).__init__(endog, exog,
112 missing=missing, **kwargs)
113 self._initialize()
114 # things to remove_data
115 self._data_attr.extend(['weights', 'pinv_wexog'])
117 def _initialize(self):
118 """
119 Initializes the model for the IRLS fit.
121 Resets the history and number of iterations.
122 """
123 self.pinv_wexog = np.linalg.pinv(self.exog)
124 self.normalized_cov_params = np.dot(self.pinv_wexog,
125 np.transpose(self.pinv_wexog))
126 self.df_resid = (np.float(self.exog.shape[0] -
127 np.linalg.matrix_rank(self.exog)))
128 self.df_model = np.float(np.linalg.matrix_rank(self.exog) - 1)
129 self.nobs = float(self.endog.shape[0])
131 def score(self, params):
132 raise NotImplementedError
134 def information(self, params):
135 raise NotImplementedError
137 def predict(self, params, exog=None):
138 """
139 Return linear predicted values from a design matrix.
141 Parameters
142 ----------
143 params : array_like
144 Parameters of a linear model
145 exog : array_like, optional.
146 Design / exogenous data. Model exog is used if None.
148 Returns
149 -------
150 An array of fitted values
151 """
152 # copied from linear_model # TODO: then is it needed?
153 if exog is None:
154 exog = self.exog
155 return np.dot(exog, params)
157 def loglike(self, params):
158 raise NotImplementedError
160 def deviance(self, tmp_results):
161 """
162 Returns the (unnormalized) log-likelihood from the M estimator.
163 """
164 tmp_resid = self.endog - tmp_results.fittedvalues
165 return self.M(tmp_resid / tmp_results.scale).sum()
167 def _update_history(self, tmp_results, history, conv):
168 history['params'].append(tmp_results.params)
169 history['scale'].append(tmp_results.scale)
170 if conv == 'dev':
171 history['deviance'].append(self.deviance(tmp_results))
172 elif conv == 'sresid':
173 history['sresid'].append(tmp_results.resid / tmp_results.scale)
174 elif conv == 'weights':
175 history['weights'].append(tmp_results.model.weights)
176 return history
178 def _estimate_scale(self, resid):
179 """
180 Estimates the scale based on the option provided to the fit method.
181 """
182 if isinstance(self.scale_est, str):
183 if self.scale_est.lower() == 'mad':
184 return scale.mad(resid, center=0)
185 else:
186 raise ValueError("Option %s for scale_est not understood" %
187 self.scale_est)
188 elif isinstance(self.scale_est, scale.HuberScale):
189 return self.scale_est(self.df_resid, self.nobs, resid)
190 else:
191 return scale.scale_est(self, resid) ** 2
193 def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1',
194 update_scale=True, conv='dev', start_params=None):
195 """
196 Fits the model using iteratively reweighted least squares.
198 The IRLS routine runs until the specified objective converges to `tol`
199 or `maxiter` has been reached.
201 Parameters
202 ----------
203 conv : str
204 Indicates the convergence criteria.
205 Available options are "coefs" (the coefficients), "weights" (the
206 weights in the iteration), "sresid" (the standardized residuals),
207 and "dev" (the un-normalized log-likelihood for the M
208 estimator). The default is "dev".
209 cov : str, optional
210 'H1', 'H2', or 'H3'
211 Indicates how the covariance matrix is estimated. Default is 'H1'.
212 See rlm.RLMResults for more information.
213 init : str
214 Specifies method for the initial estimates of the parameters.
215 Default is None, which means that the least squares estimate
216 is used. Currently it is the only available choice.
217 maxiter : int
218 The maximum number of iterations to try. Default is 50.
219 scale_est : str or HuberScale()
220 'mad' or HuberScale()
221 Indicates the estimate to use for scaling the weights in the IRLS.
222 The default is 'mad' (median absolute deviation. Other options are
223 'HuberScale' for Huber's proposal 2. Huber's proposal 2 has
224 optional keyword arguments d, tol, and maxiter for specifying the
225 tuning constant, the convergence tolerance, and the maximum number
226 of iterations. See statsmodels.robust.scale for more information.
227 tol : float
228 The convergence tolerance of the estimate. Default is 1e-8.
229 update_scale : Bool
230 If `update_scale` is False then the scale estimate for the
231 weights is held constant over the iteration. Otherwise, it
232 is updated for each fit in the iteration. Default is True.
233 start_params : array-like, optional
234 Initial guess of the solution of the optimizer. If not provided,
235 the initial parameters are computed using OLS.
237 Returns
238 -------
239 results : statsmodels.rlm.RLMresults
240 Results instance
241 """
242 if cov.upper() not in ["H1", "H2", "H3"]:
243 raise ValueError("Covariance matrix %s not understood" % cov)
244 else:
245 self.cov = cov.upper()
246 conv = conv.lower()
247 if conv not in ["weights", "coefs", "dev", "sresid"]:
248 raise ValueError("Convergence argument %s not understood" % conv)
249 self.scale_est = scale_est
251 if start_params is None:
252 wls_results = lm.WLS(self.endog, self.exog).fit()
253 else:
254 start_params = np.asarray(start_params, dtype=np.double).squeeze()
255 if (start_params.shape[0] != self.exog.shape[1] or
256 start_params.ndim != 1):
257 raise ValueError('start_params must by a 1-d array with {0} '
258 'values'.format(self.exog.shape[1]))
259 fake_wls = reg_tools._MinimalWLS(self.endog, self.exog,
260 weights=np.ones_like(self.endog),
261 check_weights=False)
262 wls_results = fake_wls.results(start_params)
264 if not init:
265 self.scale = self._estimate_scale(wls_results.resid)
267 history = dict(params=[np.inf], scale=[])
268 if conv == 'coefs':
269 criterion = history['params']
270 elif conv == 'dev':
271 history.update(dict(deviance=[np.inf]))
272 criterion = history['deviance']
273 elif conv == 'sresid':
274 history.update(dict(sresid=[np.inf]))
275 criterion = history['sresid']
276 elif conv == 'weights':
277 history.update(dict(weights=[np.inf]))
278 criterion = history['weights']
280 # done one iteration so update
281 history = self._update_history(wls_results, history, conv)
282 iteration = 1
283 converged = 0
284 while not converged:
285 if self.scale == 0.0:
286 import warnings
287 warnings.warn('Estimated scale is 0.0 indicating that the most'
288 ' last iteration produced a perfect fit of the '
289 'weighted data.', ConvergenceWarning)
290 break
291 self.weights = self.M.weights(wls_results.resid / self.scale)
292 wls_results = reg_tools._MinimalWLS(self.endog, self.exog,
293 weights=self.weights,
294 check_weights=True).fit()
295 if update_scale is True:
296 self.scale = self._estimate_scale(wls_results.resid)
297 history = self._update_history(wls_results, history, conv)
298 iteration += 1
299 converged = _check_convergence(criterion, iteration, tol, maxiter)
300 results = RLMResults(self, wls_results.params,
301 self.normalized_cov_params, self.scale)
303 history['iteration'] = iteration
304 results.fit_history = history
305 results.fit_options = dict(cov=cov.upper(), scale_est=scale_est,
306 norm=self.M.__class__.__name__, conv=conv)
307 # norm is not changed in fit, no old state
309 # doing the next causes exception
310 # self.cov = self.scale_est = None #reset for additional fits
311 # iteration and history could contain wrong state with repeated fit
312 return RLMResultsWrapper(results)
315class RLMResults(base.LikelihoodModelResults):
316 """
317 Class to contain RLM results
319 Attributes
320 ----------
322 bcov_scaled : ndarray
323 p x p scaled covariance matrix specified in the model fit method.
324 The default is H1. H1 is defined as
325 ``k**2 * (1/df_resid*sum(M.psi(sresid)**2)*scale**2)/
326 ((1/nobs*sum(M.psi_deriv(sresid)))**2) * (X.T X)^(-1)``
328 where ``k = 1 + (df_model +1)/nobs * var_psiprime/m**2``
329 where ``m = mean(M.psi_deriv(sresid))`` and
330 ``var_psiprime = var(M.psi_deriv(sresid))``
332 H2 is defined as
333 ``k * (1/df_resid) * sum(M.psi(sresid)**2) *scale**2/
334 ((1/nobs)*sum(M.psi_deriv(sresid)))*W_inv``
336 H3 is defined as
337 ``1/k * (1/df_resid * sum(M.psi(sresid)**2)*scale**2 *
338 (W_inv X.T X W_inv))``
340 where `k` is defined as above and
341 ``W_inv = (M.psi_deriv(sresid) exog.T exog)^(-1)``
343 See the technical documentation for cleaner formulae.
344 bcov_unscaled : ndarray
345 The usual p x p covariance matrix with scale set equal to 1. It
346 is then just equivalent to normalized_cov_params.
347 bse : ndarray
348 An array of the standard errors of the parameters. The standard
349 errors are taken from the robust covariance matrix specified in the
350 argument to fit.
351 chisq : ndarray
352 An array of the chi-squared values of the parameter estimates.
353 df_model
354 See RLM.df_model
355 df_resid
356 See RLM.df_resid
357 fit_history : dict
358 Contains information about the iterations. Its keys are `deviance`,
359 `params`, `iteration` and the convergence criteria specified in
360 `RLM.fit`, if different from `deviance` or `params`.
361 fit_options : dict
362 Contains the options given to fit.
363 fittedvalues : ndarray
364 The linear predicted values. dot(exog, params)
365 model : statsmodels.rlm.RLM
366 A reference to the model instance
367 nobs : float
368 The number of observations n
369 normalized_cov_params : ndarray
370 See RLM.normalized_cov_params
371 params : ndarray
372 The coefficients of the fitted model
373 pinv_wexog : ndarray
374 See RLM.pinv_wexog
375 pvalues : ndarray
376 The p values associated with `tvalues`. Note that `tvalues` are assumed
377 to be distributed standard normal rather than Student's t.
378 resid : ndarray
379 The residuals of the fitted model. endog - fittedvalues
380 scale : float
381 The type of scale is determined in the arguments to the fit method in
382 RLM. The reported scale is taken from the residuals of the weighted
383 least squares in the last IRLS iteration if update_scale is True. If
384 update_scale is False, then it is the scale given by the first OLS
385 fit before the IRLS iterations.
386 sresid : ndarray
387 The scaled residuals.
388 tvalues : ndarray
389 The "t-statistics" of params. These are defined as params/bse where
390 bse are taken from the robust covariance matrix specified in the
391 argument to fit.
392 weights : ndarray
393 The reported weights are determined by passing the scaled residuals
394 from the last weighted least squares fit in the IRLS algorithm.
396 See Also
397 --------
398 statsmodels.base.model.LikelihoodModelResults
399 """
401 def __init__(self, model, params, normalized_cov_params, scale):
402 super(RLMResults, self).__init__(model, params,
403 normalized_cov_params, scale)
404 self.model = model
405 self.df_model = model.df_model
406 self.df_resid = model.df_resid
407 self.nobs = model.nobs
408 self._cache = {}
409 # for remove_data
410 self.data_in_cache = ['sresid']
412 self.cov_params_default = self.bcov_scaled
413 # TODO: "pvals" should come from chisq on bse?
415 @cache_readonly
416 def fittedvalues(self):
417 return np.dot(self.model.exog, self.params)
419 @cache_readonly
420 def resid(self):
421 return self.model.endog - self.fittedvalues # before bcov
423 @cache_readonly
424 def sresid(self):
425 if self.scale == 0.0:
426 sresid = self.resid.copy()
427 sresid[:] = 0.0
428 return sresid
429 return self.resid / self.scale
431 @cache_readonly
432 def bcov_unscaled(self):
433 return self.normalized_cov_params
435 @cache_readonly
436 def weights(self):
437 return self.model.weights
439 @cache_readonly
440 def bcov_scaled(self):
441 model = self.model
442 m = np.mean(model.M.psi_deriv(self.sresid))
443 var_psiprime = np.var(model.M.psi_deriv(self.sresid))
444 k = 1 + (self.df_model + 1) / self.nobs * var_psiprime / m ** 2
446 if model.cov == "H1":
447 ss_psi = np.sum(model.M.psi(self.sresid) ** 2)
448 s_psi_deriv = np.sum(model.M.psi_deriv(self.sresid))
449 return k ** 2 * (1 / self.df_resid * ss_psi * self.scale ** 2) /\
450 ((1 / self.nobs * s_psi_deriv) ** 2) *\
451 model.normalized_cov_params
452 else:
453 W = np.dot(model.M.psi_deriv(self.sresid) * model.exog.T,
454 model.exog)
455 W_inv = np.linalg.inv(W)
456 # [W_jk]^-1 = [SUM(psi_deriv(Sr_i)*x_ij*x_jk)]^-1
457 # where Sr are the standardized residuals
458 if model.cov == "H2":
459 # These are correct, based on Huber (1973) 8.13
460 return k * (1 / self.df_resid) * np.sum(
461 model.M.psi(self.sresid) ** 2) * self.scale ** 2 \
462 / ((1 / self.nobs) *
463 np.sum(model.M.psi_deriv(self.sresid))) * W_inv
464 elif model.cov == "H3":
465 return k ** -1 * 1 / self.df_resid * np.sum(
466 model.M.psi(self.sresid) ** 2) * self.scale ** 2 \
467 * np.dot(
468 np.dot(W_inv, np.dot(model.exog.T, model.exog)),
469 W_inv)
471 @cache_readonly
472 def pvalues(self):
473 return stats.norm.sf(np.abs(self.tvalues)) * 2
475 @cache_readonly
476 def bse(self):
477 return np.sqrt(np.diag(self.bcov_scaled))
479 @cache_readonly
480 def chisq(self):
481 return (self.params / self.bse) ** 2
483 def summary(self, yname=None, xname=None, title=0, alpha=.05,
484 return_fmt='text'):
485 """
486 This is for testing the new summary setup
487 """
488 top_left = [('Dep. Variable:', None),
489 ('Model:', None),
490 ('Method:', ['IRLS']),
491 ('Norm:', [self.fit_options['norm']]),
492 ('Scale Est.:', [self.fit_options['scale_est']]),
493 ('Cov Type:', [self.fit_options['cov']]),
494 ('Date:', None),
495 ('Time:', None),
496 ('No. Iterations:', ["%d" % self.fit_history['iteration']])
497 ]
498 top_right = [('No. Observations:', None),
499 ('Df Residuals:', None),
500 ('Df Model:', None)
501 ]
503 if title is not None:
504 title = "Robust linear Model Regression Results"
506 # boiler plate
507 from statsmodels.iolib.summary import Summary
508 smry = Summary()
509 smry.add_table_2cols(self, gleft=top_left, gright=top_right,
510 yname=yname, xname=xname, title=title)
511 smry.add_table_params(self, yname=yname, xname=xname, alpha=alpha,
512 use_t=self.use_t)
514 # add warnings/notes, added to text format only
515 etext = []
516 wstr = ("If the model instance has been used for another fit with "
517 "different fit parameters, then the fit options might not be "
518 "the correct ones anymore .")
519 etext.append(wstr)
521 if etext:
522 smry.add_extra_txt(etext)
524 return smry
526 def summary2(self, xname=None, yname=None, title=None, alpha=.05,
527 float_format="%.4f"):
528 """Experimental summary function for regression results
530 Parameters
531 ----------
532 yname : str
533 Name of the dependent variable (optional)
534 xname : list[str], optional
535 Names for the exogenous variables. Default is `var_##` for ## in
536 the number of regressors. Must match the number of parameters
537 in the model
538 title : str, optional
539 Title for the top table. If not None, then this replaces the
540 default title
541 alpha : float
542 significance level for the confidence intervals
543 float_format : str
544 print format for floats in parameters summary
546 Returns
547 -------
548 smry : Summary instance
549 this holds the summary tables and text, which can be printed or
550 converted to various output formats.
552 See Also
553 --------
554 statsmodels.iolib.summary2.Summary : class to hold summary results
555 """
556 from statsmodels.iolib import summary2
557 smry = summary2.Summary()
558 smry.add_base(results=self, alpha=alpha, float_format=float_format,
559 xname=xname, yname=yname, title=title)
561 return smry
564class RLMResultsWrapper(lm.RegressionResultsWrapper):
565 pass
568wrap.populate_wrapper(RLMResultsWrapper, RLMResults) # noqa:E305