Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/stats/anova.py : 10%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import numpy as np
2from scipy import stats
3import pandas as pd
4from pandas import DataFrame, Index
5import patsy
7from statsmodels.regression.linear_model import OLS
8from statsmodels.compat.python import lrange
9from statsmodels.formula.formulatools import (_remove_intercept_patsy,
10 _has_intercept, _intercept_idx)
11from statsmodels.iolib import summary2
14def _get_covariance(model, robust):
15 if robust is None:
16 return model.cov_params()
17 elif robust == "hc0":
18 return model.cov_HC0
19 elif robust == "hc1":
20 return model.cov_HC1
21 elif robust == "hc2":
22 return model.cov_HC2
23 elif robust == "hc3":
24 return model.cov_HC3
25 else: # pragma: no cover
26 raise ValueError("robust options %s not understood" % robust)
29# NOTE: these need to take into account weights !
def anova_single(model, **kwargs):
    """
    Anova table for one fitted linear model.

    Parameters
    ----------
    model : fitted linear model results instance
        A fitted linear model. It must have been fit through the formula
        interface, because ``model.model.data.design_info`` is required.
    **kwargs
        typ : int or str {1,2,3} or {"I","II","III"}
            Type of sum of squares to use. Default is 1.
        test : str {"F", "Chisq", "Cp"} or None
            Test statistics to provide. Default is "F". Only "F" is acted
            upon by the single-model routines in this module.
        robust : {None, "hc0", "hc1", "hc2", "hc3"}
            Heteroscedasticity-corrected covariance to use. The value is
            lower-cased before being passed on. Default is None.
        scale : float
            Accepted so the same keywords can be passed as to `anova_lm`;
            it is not used for a single model.

    Returns
    -------
    DataFrame
        The table produced by the type-specific routine.

    Raises
    ------
    NotImplementedError
        If `typ` is 4 or "IV".
    ValueError
        If `typ` is not a recognised type.

    Notes
    -----
    Use of this function is discouraged. Use anova_lm instead.
    """
    typ = kwargs.get("typ", 1)
    # Validate the requested type before touching the model so that a bad
    # `typ` is reported as such, not masked by an unrelated failure.
    if typ in [4, "IV"]:
        raise NotImplementedError("Type IV not yet implemented")
    if typ not in [1, "I", 2, "II", 3, "III"]:  # pragma: no cover
        raise ValueError("Type %s not understood" % str(typ))

    test = kwargs.get("test", "F")
    robust = kwargs.get("robust", None)
    if robust:
        robust = robust.lower()

    design_info = model.model.data.design_info
    # +1 for resids
    n_rows = (len(design_info.terms) - _has_intercept(design_info) + 1)
    pr_test = "PR(>%s)" % test

    if typ in [1, "I"]:
        # Only the Type I routine fills a caller-provided table.
        endog = model.model.endog
        exog = model.model.exog
        nobs = exog.shape[0]
        names = ['df', 'sum_sq', 'mean_sq', test, pr_test]
        table = DataFrame(np.zeros((n_rows, 5)), columns=names)
        return anova1_lm_single(model, endog, exog, nobs, design_info, table,
                                n_rows, test, pr_test, robust)
    if typ in [2, "II"]:
        return anova2_lm_single(model, design_info, n_rows, test, pr_test,
                                robust)
    return anova3_lm_single(model, design_info, n_rows, test, pr_test,
                            robust)
def anova1_lm_single(model, endog, exog, nobs, design_info, table, n_rows, test,
                     pr_test, robust):
    """
    Type I (sequential) Anova table for one fitted linear model.

    Parameters
    ----------
    model : fitted linear model results instance
        Supplies ``ssr``, ``df_resid`` and, when available, ``effects``.
    endog : array_like
        Response; only used to compute the effects when `model` has none.
    exog : array_like
        Design matrix; only used to compute the effects when `model` has
        none.
    nobs : int
        Not used by this routine.
    design_info : patsy DesignInfo
        Describes which design-matrix columns belong to which term.
    table : DataFrame
        Pre-allocated table that is filled in place and returned.
    n_rows : int
        Not used by this routine.
    test : str
        Name of the test-statistic column. The statistic is only computed
        when this is "F".
    pr_test : str
        Name of the p-value column.
    robust : str or None
        Not used by this routine.

    Returns
    -------
    DataFrame
        `table`, indexed by the non-intercept term names plus 'Residual'.

    Notes
    -----
    Use of this function is discouraged. Use anova_lm instead.
    """
    # maybe we should rethink using pinv > qr in OLS/linear models?
    effects = getattr(model, 'effects', None)
    if effects is None:
        q_factor, _ = np.linalg.qr(exog)
        effects = np.dot(q_factor.T, endog)

    # Indicator matrix: one row per term, ones over that term's columns.
    n_terms = len(design_info.terms)
    n_columns = len(design_info.column_names)
    term_columns = np.zeros((n_terms, n_columns))
    for row, term_name in enumerate(design_info.term_names):
        term_columns[row, design_info.slice(term_name)] = 1

    # NOTE: assumes intercept is first column
    keep = ~_intercept_idx(design_info)
    sum_sq = np.dot(term_columns, effects**2)[keep]
    # an ndarray is needed here because the mask is a boolean index
    term_labels = np.array(design_info.term_names)[keep].tolist()

    table.index = Index(term_labels + ['Residual'])
    table.loc[term_labels, ['df', 'sum_sq']] = np.c_[
        term_columns[keep].sum(1), sum_sq]
    # residual row
    table.loc['Residual', ['sum_sq', 'df']] = model.ssr, model.df_resid

    if test == 'F':
        residual_mean_sq = model.ssr / model.df_resid
        table[test] = (table['sum_sq'] / table['df']) / residual_mean_sq
        table[pr_test] = stats.f.sf(table["F"], table["df"],
                                    model.df_resid)
        table.loc['Residual', [test, pr_test]] = np.nan, np.nan

    table['mean_sq'] = table['sum_sq'] / table['df']
    return table
145#NOTE: the below is not agnostic about formula...
def anova2_lm_single(model, design_info, n_rows, test, pr_test, robust):
    """
    Anova type II table for one fitted linear model.

    Parameters
    ----------
    model : fitted linear model results instance
        A fitted linear model
    design_info : patsy DesignInfo
        Describes which design-matrix columns belong to which term.
    n_rows : int
        Number of rows to allocate for the table (as computed by the
        caller: the non-intercept terms plus the residual row).
    test : str
        Name of the test-statistic column. The statistic is only computed
        when this is "F".
    pr_test : str
        Name of the p-value column.
    robust : {None, "hc0", "hc1", "hc2", "hc3"}
        Selects the coefficient covariance used for the F tests.

    Returns
    -------
    DataFrame
        Columns ``sum_sq``, ``df``, `test` and `pr_test`; one row per
        non-intercept term plus a 'Residual' row.

    Notes
    -----
    Use of this function is discouraged. Use anova_lm instead.

    Type II
    Sum of Squares compares marginal contribution of terms. Thus, it is
    not particularly useful for models with significant interaction terms.

    The sums of squares are not computed directly; they are backed out
    from the F statistics as ``F * df * ssr / df_resid``.
    """
    # imported once per call rather than once per term inside the loop
    from scipy import linalg

    terms_info = design_info.terms[:]  # copy
    terms_info = _remove_intercept_patsy(terms_info)

    names = ['sum_sq', 'df', test, pr_test]

    table = DataFrame(np.zeros((n_rows, 4)), columns=names)
    robust_cov = _get_covariance(model, robust)
    n_params = model.model.exog.shape[1]
    col_order = []
    index = []
    for i, term in enumerate(terms_info):
        # grab all variables except interaction effects that contain term
        # need two hypotheses matrices L1 is most restrictive, ie., term==0
        # L2 is everything except term==0
        cols = design_info.slice(term)
        L1 = lrange(cols.start, cols.stop)
        L2 = []
        term_set = set(term.factors)
        for t in terms_info:  # for the term you have
            other_set = set(t.factors)
            if term_set.issubset(other_set) and not term_set == other_set:
                col = design_info.slice(t)
                # on a higher order term containing current `term`
                L1.extend(lrange(col.start, col.stop))
                L2.extend(lrange(col.start, col.stop))

        # turn the column indices into rows of an identity matrix
        L1 = np.eye(n_params)[L1]
        L2 = np.eye(n_params)[L2]

        if L2.size:
            LVL = np.dot(np.dot(L1, robust_cov), L2.T)
            orth_compl, _ = linalg.qr(LVL)
            r = L1.shape[0] - L2.shape[0]
            # L1|2
            # use the non-unique orthogonal completion since L12 is rank r
            L12 = np.dot(orth_compl[:, -r:].T, L1)
        else:
            L12 = L1
            r = L1.shape[0]

        if test == 'F':
            f = model.f_test(L12, cov_p=robust_cov)
            table.loc[table.index[i], test] = f.fvalue
            table.loc[table.index[i], pr_test] = f.pvalue

        # need to back out SSR from f_test
        table.loc[table.index[i], 'df'] = r
        col_order.append(cols.start)
        index.append(term.name())

    table.index = Index(index + ['Residual'])
    # order rows by design-matrix column; the residual row sorts last
    table = table.iloc[np.argsort(col_order + [n_params + 1])]
    # back out sum of squares from f_test
    ssr = table[test] * table['df'] * model.ssr/model.df_resid
    table['sum_sq'] = ssr
    # fill in residual
    table.loc['Residual', ['sum_sq', 'df', test, pr_test]] = (model.ssr,
                                                              model.df_resid,
                                                              np.nan, np.nan)

    return table
def anova3_lm_single(model, design_info, n_rows, test, pr_test, robust):
    """
    Anova type III table for one fitted linear model.

    Every term of the design, including the intercept when there is one,
    is tested against the hypothesis that its coefficients are zero.

    Parameters
    ----------
    model : fitted linear model results instance
        A fitted linear model
    design_info : patsy DesignInfo
        Describes which design-matrix columns belong to which term.
    n_rows : int
        Row count supplied by the caller; it is increased by one here when
        the design has an intercept.
    test : str
        Name of the test-statistic column. The statistic is only computed
        when this is "F".
    pr_test : str
        Name of the p-value column.
    robust : {None, "hc0", "hc1", "hc2", "hc3"}
        Selects the coefficient covariance used for the F tests.

    Returns
    -------
    DataFrame
        Columns ``sum_sq``, ``df``, `test` and `pr_test`; one row per term
        plus a 'Residual' row.
    """
    n_rows += _has_intercept(design_info)
    columns = ['sum_sq', 'df', test, pr_test]
    table = DataFrame(np.zeros((n_rows, 4)), columns=columns)

    cov = _get_covariance(model, robust)
    n_params = model.model.exog.shape[1]
    row_labels = []
    for position, term in enumerate(design_info.terms):
        row = table.index[position]
        # hypothesis: all coefficients belonging to this term are zero
        contrast = np.eye(n_params)[design_info.slice(term)]

        if test == 'F':
            f_result = model.f_test(contrast, cov_p=cov)
            table.loc[row, test] = f_result.fvalue
            table.loc[row, pr_test] = f_result.pvalue

        table.loc[row, 'df'] = contrast.shape[0]
        row_labels.append(term.name())

    # terms are already in design order, so no re-sorting is required
    table.index = Index(row_labels + ['Residual'])
    # back out sum of squares from f_test
    table['sum_sq'] = table[test] * table['df'] * model.ssr/model.df_resid
    # fill in residual
    residual_row = (model.ssr, model.df_resid, np.nan, np.nan)
    table.loc['Residual', ['sum_sq', 'df', test, pr_test]] = residual_row
    return table
def anova_lm(*args, **kwargs):
    """
    Anova table for one or more fitted linear models.

    Parameters
    ----------
    args : fitted linear model results instance
        One or more fitted linear models
    scale : float
        Estimate of variance, If None, will be estimated from the largest
        model (assumed to be the last one in `args`). Default is None.
    test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".
    typ : str or int {"I","II","III"} or {1,2,3}
        The type of Anova test to perform. See notes.
    robust : {None, "hc0", "hc1", "hc2", "hc3"}
        Use heteroscedasticity-corrected coefficient covariance matrix.
        If robust covariance is desired, it is recommended to use `hc3`.

    Returns
    -------
    anova : DataFrame
        When args is a single model, return is DataFrame with columns:

        sum_sq : float64
            Sum of squares for model terms.
        df : float64
            Degrees of freedom for model terms.
        F : float64
            F statistic value for significance of adding model terms.
        PR(>F) : float64
            P-value for significance of adding model terms.

        When args is multiple models, return is DataFrame with columns:

        df_resid : float64
            Degrees of freedom of residuals in models.
        ssr : float64
            Sum of squares of residuals in models.
        df_diff : float64
            Degrees of freedom difference from previous model in args
        ss_diff : float64
            Difference in ssr from previous model in args
        F : float64
            F statistic comparing to previous model in args
        Pr(>F) : float64
            P-value for significance comparing to previous model in args

    Raises
    ------
    ValueError
        If several models are given together with a `typ` other than
        1 / "I".

    Notes
    -----
    Model statistics are given in the order of args. Models must have been fit
    using the formula api.

    See Also
    --------
    model_results.compare_f_test, model_results.compare_lm_test

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> from statsmodels.formula.api import ols
    >>> moore = sm.datasets.get_rdataset("Moore", "carData", cache=True) # load
    >>> data = moore.data
    >>> data = data.rename(columns={"partner.status" :
    ...                             "partner_status"}) # make name pythonic
    >>> moore_lm = ols('conformity ~ C(fcategory, Sum)*C(partner_status, Sum)',
    ...                 data=data).fit()
    >>> table = sm.stats.anova_lm(moore_lm, typ=2) # Type 2 Anova DataFrame
    >>> print(table)
    """
    typ = kwargs.get('typ', 1)

    ### Farm Out Single model Anova Type I, II, III, and IV ###

    if len(args) == 1:
        model = args[0]
        return anova_single(model, **kwargs)

    if typ not in [1, "I"]:
        raise ValueError("Multiple models only supported for type I. "
                         "Got type %s" % str(typ))

    test = kwargs.get("test", "F")
    scale = kwargs.get("scale", None)
    n_models = len(args)
    pr_test = "Pr(>%s)" % test
    names = ['df_resid', 'ssr', 'df_diff', 'ss_diff', test, pr_test]
    table = DataFrame(np.zeros((n_models, 6)), columns=names)

    if not scale:  # assume biggest model is last
        scale = args[-1].scale

    table["ssr"] = [mdl.ssr for mdl in args]
    table["df_resid"] = [mdl.df_resid for mdl in args]
    # The first row has no previous model to compare with; its df_diff
    # stays 0 and its ss_diff becomes NaN.
    table.loc[table.index[1:], "df_diff"] = -np.diff(table["df_resid"].values)
    table["ss_diff"] = -table["ssr"].diff()
    if test == "F":
        table["F"] = table["ss_diff"] / table["df_diff"] / scale
        table[pr_test] = stats.f.sf(table["F"], table["df_diff"],
                                    table["df_resid"])
        # for earlier scipy - stats.f.sf(np.nan, 10, 2) -> 0 not nan
        # Single .loc assignment: chained indexing may write to a temporary
        # copy and leave the table unchanged.
        table.loc[table['F'].isnull(), pr_test] = np.nan

    return table
379def _not_slice(slices, slices_to_exclude, n):
380 ind = np.array([True]*n)
381 for term in slices_to_exclude:
382 s = slices[term]
383 ind[s] = False
384 return ind
def _ssr_reduced_model(y, x, term_slices, params, keys):
    """
    Residual sum of squares of the OLS model without the terms in `keys`.

    The coefficients of the remaining columns are reused unchanged from
    the full fit instead of being re-estimated, which is why the columns
    of `x` are assumed to be orthogonal.

    Parameters
    ----------
    y : array_like
        dependent variable
    x : array_like
        independent variables
    term_slices : a dict of slices
        term_slices[key] is a boolean array that selects the parameters
        associated with the factor `key`
    params : ndarray
        OLS solution of y = x * params
    keys : keys for term_slices
        factors to be excluded

    Returns
    -------
    rss : float
        residual sum of squares
    df : int
        degrees of freedom
    """
    kept = _not_slice(term_slices, keys, x.shape[1])
    reduced_params = params[kept]
    residuals = np.subtract(y, x[:, kept].dot(reduced_params))
    rss = residuals.T.dot(residuals)
    return rss, len(y) - len(reduced_params)
class AnovaRM(object):
    """
    Repeated measures Anova using least squares regression

    The full model regression residual sum of squares is
    used to compare with the reduced model for calculating the
    within-subject effect sum of squares [1]_.

    Currently, only fully balanced within-subject designs are supported.
    Calculation of between-subject effects and corrections for violation of
    sphericity are not yet implemented.

    Parameters
    ----------
    data : DataFrame
    depvar : str
        The dependent variable in `data`
    subject : str
        Specify the subject id
    within : list[str]
        The within-subject factors. Required; the ``None`` default only
        exists for signature compatibility.
    between : list[str]
        The between-subject factors, this is not yet implemented
    aggregate_func : {None, 'mean', callable}
        If the data set contains more than a single observation per subject
        and cell of the specified model, this function will be used to
        aggregate the data before running the Anova. `None` (the default) will
        not perform any aggregation; 'mean' is a shortcut to `numpy.mean`.
        An exception will be raised if aggregation is required, but no
        aggregation function was specified.

    Returns
    -------
    results : AnovaResults instance

    Raises
    ------
    ValueError
        If the data need to be aggregated, but `aggregate_func` was not
        specified; if `within` is missing or contains a factor named 'C';
        if the data are empty or unbalanced.
    NotImplementedError
        If `between` is given.

    Notes
    -----
    This implementation currently only supports fully balanced designs. If the
    data contain more than one observation per subject and cell of the design,
    these observations need to be aggregated into a single observation
    before the Anova is calculated, either manually or by passing an aggregation
    function via the `aggregate_func` keyword argument.
    Note that if the input data set was not balanced before performing the
    aggregation, the implied heteroscedasticity of the data is ignored.

    References
    ----------
    .. [1] Rutherford, Andrew. Anova and ANCOVA: a GLM approach. John Wiley & Sons, 2011.
    """

    def __init__(self, data, depvar, subject, within=None, between=None,
                 aggregate_func=None):
        self.data = data
        self.depvar = depvar
        self.within = within
        if within is not None and 'C' in within:
            raise ValueError("Factor name cannot be 'C'! This is in conflict "
                             "with patsy's contrast function name.")
        self.between = between
        if between is not None:
            raise NotImplementedError('Between subject effect not '
                                      'yet supported!')
        if within is None:
            # Fail with a clear message instead of a TypeError further down.
            raise ValueError('At least one within-subject factor must be '
                             'given via `within`.')
        self.subject = subject

        if aggregate_func == 'mean':
            self.aggregate_func = np.mean
        else:
            self.aggregate_func = aggregate_func

        # More than one row per (subject, cell) means aggregation is needed.
        if not data.equals(data.drop_duplicates(subset=[subject] + within)):
            if self.aggregate_func is not None:
                self._aggregate()
            else:
                msg = ('The data set contains more than one observation per '
                       'subject and cell. Either aggregate the data manually, '
                       'or pass the `aggregate_func` parameter.')
                raise ValueError(msg)

        self._check_data_balanced()

    def _aggregate(self):
        """Collapse `self.data` to one row per subject and cell."""
        self.data = (self.data
                     .groupby([self.subject] + self.within,
                              as_index=False)[self.depvar]
                     .agg(self.aggregate_func))

    def _check_data_balanced(self):
        """raise if data is not balanced

        This raises a ValueError if the data is not balanced (or is empty),
        and returns None if it is balanced

        Return might change
        """
        factor_levels = 1
        for wi in self.within:
            factor_levels *= len(self.data[wi].unique())

        # Count the rows falling into each cell of the within-subject design.
        # Columns are pulled out once so no per-element frame indexing is
        # needed inside the loop.
        columns = [self.data[col].tolist() for col in self.within]
        cell_count = {}
        for index in range(self.data.shape[0]):
            key = tuple(column[index] for column in columns)
            cell_count[key] = cell_count.get(key, 0) + 1

        error_message = "Data is unbalanced."
        if len(cell_count) != factor_levels:
            raise ValueError(error_message)
        if not cell_count:
            raise ValueError('The data set is empty.')
        counts = set(cell_count.values())
        if len(counts) > 1:
            raise ValueError(error_message)
        count = next(iter(counts))
        if self.data.shape[0] > count * factor_levels:
            raise ValueError('There are more than 1 element in a cell! Missing'
                             ' factors?')

    def fit(self):
        """estimate the model and compute the Anova table

        Returns
        -------
        AnovaResults instance

        Raises
        ------
        ValueError
            If the design matrix does not have full column rank.
        """
        y = self.data[self.depvar].values

        # Construct OLS endog and exog from string using patsy
        within = ['C(%s, Sum)' % i for i in self.within]
        subject = 'C(%s, Sum)' % self.subject
        factors = within + [subject]
        x = patsy.dmatrix('*'.join(factors), data=self.data)
        term_slices = x.design_info.term_name_slices
        # Replace every slice with a boolean column mask so the masks can
        # be re-indexed after columns are dropped below.
        for key in term_slices:
            ind = np.array([False]*x.shape[1])
            ind[term_slices[key]] = True
            term_slices[key] = np.array(ind)
        # The highest-order interaction (all factors with subject) is left
        # out of the design matrix.
        term_exclude = [':'.join(factors)]
        ind = _not_slice(term_slices, term_exclude, x.shape[1])
        x = x[:, ind]

        # Fit OLS
        model = OLS(y, x)
        results = model.fit()
        if model.rank < x.shape[1]:
            raise ValueError('Independent variables are collinear.')
        for i in term_exclude:
            term_slices.pop(i)
        for key in term_slices:
            term_slices[key] = term_slices[key][ind]
        params = results.params
        df_resid = results.df_resid
        ssr = results.ssr

        columns = ['F Value', 'Num DF', 'Den DF', 'Pr > F']
        anova_table = pd.DataFrame(np.zeros((0, 4)), columns=columns)

        for key in term_slices:
            # Skip the intercept and every term that involves the subject
            # factor. The subject is matched as a whole factor of the term;
            # a substring test on the raw subject name would also skip
            # within factors whose name merely contains it (e.g. subject
            # 'id' and factor 'side').
            if subject not in key.split(':') and key != 'Intercept':
                # Independent variables are orthogonal
                ssr1, df_resid1 = _ssr_reduced_model(
                    y, x, term_slices, params, [key])
                df1 = df_resid1 - df_resid
                msm = (ssr1 - ssr) / df1
                if (key == ':'.join(factors[:-1]) or
                        (key + ':' + subject not in term_slices)):
                    # no term-by-subject interaction left in the model:
                    # use the overall residual as error term
                    mse = ssr / df_resid
                    df2 = df_resid
                else:
                    # error term is the term-by-subject interaction
                    ssr1, df_resid1 = _ssr_reduced_model(
                        y, x, term_slices, params,
                        [key + ':' + subject])
                    df2 = df_resid1 - df_resid
                    mse = (ssr1 - ssr) / df2
                F = msm / mse
                p = stats.f.sf(F, df1, df2)
                term = key.replace('C(', '').replace(', Sum)', '')
                anova_table.loc[term, 'F Value'] = F
                anova_table.loc[term, 'Num DF'] = df1
                anova_table.loc[term, 'Den DF'] = df2
                anova_table.loc[term, 'Pr > F'] = p

        return AnovaResults(anova_table)
class AnovaResults(object):
    """
    Container for the result of an Anova computation.

    Attributes
    ----------
    anova_table : DataFrame
        The table handed to the constructor, stored unchanged.
    """

    def __init__(self, anova_table):
        self.anova_table = anova_table

    def __str__(self):
        # the printed form is the text rendering of the summary
        return str(self.summary())

    def summary(self):
        """Build a summary holding the Anova table.

        Returns
        -------
        summary : summary2.Summary instance
        """
        report = summary2.Summary()
        report.add_title('Anova')
        report.add_df(self.anova_table)
        return report
if __name__ == "__main__":
    # Ad-hoc demo: fit two models on the Moore data and build a type II
    # Anova table. Expects a file 'moore.csv' in the working directory.
    import pandas
    from statsmodels.formula.api import ols
    # The csv file can be produced in R with:
    #library(car)
    #write.csv(Moore, "moore.csv", row.names=FALSE)
    # The header row of the file is skipped and replaced by the names below.
    moore = pandas.read_csv('moore.csv', skiprows=1,
                            names=['partner_status','conformity',
                                   'fcategory','fscore'])
    # full model: both factors and their interaction, sum-to-zero contrasts
    moore_lm = ols('conformity ~ C(fcategory, Sum)*C(partner_status, Sum)',
                   data=moore).fit()

    # smaller model with partner_status only (not used further below)
    mooreB = ols('conformity ~ C(partner_status, Sum)', data=moore).fit()

    # for each term you just want to test vs the model without its
    # higher-order terms

    # using Monette-Fox slides and Marden class notes for linear algebra /
    # orthogonal complement
    # https://netfiles.uiuc.edu/jimarden/www/Classes/STAT324/

    table = anova_lm(moore_lm, typ=2)