import numpy as np
from scipy import stats
import pandas as pd
from pandas import DataFrame, Index
import patsy

from statsmodels.regression.linear_model import OLS
from statsmodels.compat.python import lrange
from statsmodels.formula.formulatools import (_remove_intercept_patsy,
                                              _has_intercept, _intercept_idx)
from statsmodels.iolib import summary2


def _get_covariance(model, robust):
    if robust is None:
        return model.cov_params()
    elif robust == "hc0":
        return model.cov_HC0
    elif robust == "hc1":
        return model.cov_HC1
    elif robust == "hc2":
        return model.cov_HC2
    elif robust == "hc3":
        return model.cov_HC3
    else:  # pragma: no cover
        raise ValueError("robust options %s not understood" % robust)



# NOTE: these need to take into account weights!

def anova_single(model, **kwargs):
    """
    Anova table for one fitted linear model.

    Parameters
    ----------
    model : fitted linear model results instance
        A fitted linear model
    typ : int or str {1,2,3} or {"I","II","III"}
        Type of sum of squares to use.

    **kwargs**

    scale : float
        Estimate of variance. If None, it will be estimated from the largest
        model. Default is None.
    test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".

    Notes
    -----
    Use of this function is discouraged. Use anova_lm instead.
    """
    test = kwargs.get("test", "F")
    scale = kwargs.get("scale", None)
    typ = kwargs.get("typ", 1)
    robust = kwargs.get("robust", None)
    if robust:
        robust = robust.lower()

    endog = model.model.endog
    exog = model.model.exog
    nobs = exog.shape[0]

    response_name = model.model.endog_names
    design_info = model.model.data.design_info
    exog_names = model.model.exog_names
    # +1 for resids
    n_rows = (len(design_info.terms) - _has_intercept(design_info) + 1)

    pr_test = "PR(>%s)" % test
    names = ['df', 'sum_sq', 'mean_sq', test, pr_test]

    table = DataFrame(np.zeros((n_rows, 5)), columns=names)

    if typ in [1, "I"]:
        return anova1_lm_single(model, endog, exog, nobs, design_info, table,
                                n_rows, test, pr_test, robust)
    elif typ in [2, "II"]:
        return anova2_lm_single(model, design_info, n_rows, test, pr_test,
                                robust)
    elif typ in [3, "III"]:
        return anova3_lm_single(model, design_info, n_rows, test, pr_test,
                                robust)
    elif typ in [4, "IV"]:
        raise NotImplementedError("Type IV not yet implemented")
    else:  # pragma: no cover
        raise ValueError("Type %s not understood" % str(typ))



def anova1_lm_single(model, endog, exog, nobs, design_info, table, n_rows,
                     test, pr_test, robust):
    """
    Anova table for one fitted linear model.

    Parameters
    ----------
    model : fitted linear model results instance
        A fitted linear model

    **kwargs**

    scale : float
        Estimate of variance. If None, it will be estimated from the largest
        model. Default is None.
    test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".

    Notes
    -----
    Use of this function is discouraged. Use anova_lm instead.
    """
    # maybe we should rethink using pinv > qr in OLS/linear models?
    effects = getattr(model, 'effects', None)
    if effects is None:
        q, r = np.linalg.qr(exog)
        effects = np.dot(q.T, endog)

    arr = np.zeros((len(design_info.terms), len(design_info.column_names)))
    slices = [design_info.slice(name) for name in design_info.term_names]
    for i, slice_ in enumerate(slices):
        arr[i, slice_] = 1

    sum_sq = np.dot(arr, effects**2)
    # NOTE: assumes intercept is first column
    idx = _intercept_idx(design_info)
    sum_sq = sum_sq[~idx]
    term_names = np.array(design_info.term_names)  # want boolean indexing
    term_names = term_names[~idx]

    index = term_names.tolist()
    table.index = Index(index + ['Residual'])
    table.loc[index, ['df', 'sum_sq']] = np.c_[arr[~idx].sum(1), sum_sq]
    # fill in residual
    table.loc['Residual', ['sum_sq', 'df']] = model.ssr, model.df_resid
    if test == 'F':
        table[test] = ((table['sum_sq'] / table['df']) /
                       (model.ssr / model.df_resid))
        table[pr_test] = stats.f.sf(table["F"], table["df"],
                                    model.df_resid)
        table.loc['Residual', [test, pr_test]] = np.nan, np.nan
    table['mean_sq'] = table['sum_sq'] / table['df']
    return table


# NOTE: the below is not agnostic about formula...
def anova2_lm_single(model, design_info, n_rows, test, pr_test, robust):
    """
    Anova type II table for one fitted linear model.

    Parameters
    ----------
    model : fitted linear model results instance
        A fitted linear model

    **kwargs**

    scale : float
        Estimate of variance. If None, it will be estimated from the largest
        model. Default is None.
    test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".

    Notes
    -----
    Use of this function is discouraged. Use anova_lm instead.

    Type II
    Sum of Squares compares marginal contribution of terms. Thus, it is
    not particularly useful for models with significant interaction terms.
    """
    terms_info = design_info.terms[:]  # copy
    terms_info = _remove_intercept_patsy(terms_info)

    names = ['sum_sq', 'df', test, pr_test]

    table = DataFrame(np.zeros((n_rows, 4)), columns=names)
    cov = _get_covariance(model, None)
    robust_cov = _get_covariance(model, robust)
    col_order = []
    index = []
    for i, term in enumerate(terms_info):
        # grab all variables except interaction effects that contain term
        # need two hypothesis matrices: L1 is the most restrictive, i.e.,
        # term == 0; L2 is everything except term == 0
        cols = design_info.slice(term)
        L1 = lrange(cols.start, cols.stop)
        L2 = []
        term_set = set(term.factors)
        for t in terms_info:  # for the term you have
            other_set = set(t.factors)
            if term_set.issubset(other_set) and not term_set == other_set:
                col = design_info.slice(t)
                # on a higher order term containing current `term`
                L1.extend(lrange(col.start, col.stop))
                L2.extend(lrange(col.start, col.stop))

        L1 = np.eye(model.model.exog.shape[1])[L1]
        L2 = np.eye(model.model.exog.shape[1])[L2]

        if L2.size:
            LVL = np.dot(np.dot(L1, robust_cov), L2.T)
            from scipy import linalg
            orth_compl, _ = linalg.qr(LVL)
            r = L1.shape[0] - L2.shape[0]
            # L1|2
            # use the non-unique orthogonal completion since L12 is rank r
            L12 = np.dot(orth_compl[:, -r:].T, L1)
        else:
            L12 = L1
            r = L1.shape[0]
        if test == 'F':
            f = model.f_test(L12, cov_p=robust_cov)
            table.loc[table.index[i], test] = test_value = f.fvalue
            table.loc[table.index[i], pr_test] = f.pvalue

        # need to back out SSR from f_test
        table.loc[table.index[i], 'df'] = r
        col_order.append(cols.start)
        index.append(term.name())

    table.index = Index(index + ['Residual'])
    table = table.iloc[np.argsort(col_order + [model.model.exog.shape[1] + 1])]
    # back out sum of squares from f_test
    ssr = table[test] * table['df'] * model.ssr / model.df_resid
    table['sum_sq'] = ssr
    # fill in residual
    table.loc['Residual', ['sum_sq', 'df', test, pr_test]] = (model.ssr,
                                                              model.df_resid,
                                                              np.nan, np.nan)

    return table


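# Illustrative sketch, not part of the original module: Type I sums of
# squares are sequential, so the table changes with the order of the terms
# in the formula, while the Type II table does not. The helper name and the
# synthetic data below are assumptions made for this example only.
def _example_type1_vs_type2():
    from statsmodels.formula.api import ols

    rng = np.random.RandomState(0)
    n = 100
    x1 = rng.normal(size=n)
    x2 = x1 + rng.normal(scale=0.5, size=n)  # correlated regressors
    y = 1 + 2 * x1 + 3 * x2 + rng.normal(size=n)
    data = pd.DataFrame({"y": y, "x1": x1, "x2": x2})

    fit_a = ols("y ~ x1 + x2", data=data).fit()
    fit_b = ols("y ~ x2 + x1", data=data).fit()

    # the Type I rows for x1 and x2 differ between the two orderings ...
    print(anova_lm(fit_a, typ=1))
    print(anova_lm(fit_b, typ=1))
    # ... whereas the Type II rows agree for both orderings
    print(anova_lm(fit_a, typ=2))
    print(anova_lm(fit_b, typ=2))

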

def anova3_lm_single(model, design_info, n_rows, test, pr_test, robust):
    n_rows += _has_intercept(design_info)
    terms_info = design_info.terms

    names = ['sum_sq', 'df', test, pr_test]

    table = DataFrame(np.zeros((n_rows, 4)), columns=names)
    cov = _get_covariance(model, robust)
    col_order = []
    index = []
    for i, term in enumerate(terms_info):
        # grab term, hypothesis is that term == 0
        cols = design_info.slice(term)
        L1 = np.eye(model.model.exog.shape[1])[cols]
        L12 = L1
        r = L1.shape[0]

        if test == 'F':
            f = model.f_test(L12, cov_p=cov)
            table.loc[table.index[i], test] = test_value = f.fvalue
            table.loc[table.index[i], pr_test] = f.pvalue

        # need to back out SSR from f_test
        table.loc[table.index[i], 'df'] = r
        # col_order.append(cols.start)
        index.append(term.name())

    table.index = Index(index + ['Residual'])
    # NOTE: Do not need to sort because terms are an ordered dict now
    # table = table.iloc[np.argsort(col_order + [model.model.exog.shape[1]+1])]
    # back out sum of squares from f_test
    ssr = table[test] * table['df'] * model.ssr / model.df_resid
    table['sum_sq'] = ssr
    # fill in residual
    table.loc['Residual', ['sum_sq', 'df', test, pr_test]] = (model.ssr,
                                                              model.df_resid,
                                                              np.nan, np.nan)
    return table



def anova_lm(*args, **kwargs):
    """
    Anova table for one or more fitted linear models.

    Parameters
    ----------
    args : fitted linear model results instance
        One or more fitted linear models
    scale : float
        Estimate of variance. If None, it will be estimated from the largest
        model. Default is None.
    test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".
    typ : str or int {"I", "II", "III"} or {1, 2, 3}
        The type of Anova test to perform. See notes.
    robust : {None, "hc0", "hc1", "hc2", "hc3"}
        Use heteroscedasticity-corrected coefficient covariance matrix.
        If robust covariance is desired, it is recommended to use `hc3`.

    Returns
    -------
    anova : DataFrame
        When args is a single model, return is DataFrame with columns:

        sum_sq : float64
            Sum of squares for model terms.
        df : float64
            Degrees of freedom for model terms.
        F : float64
            F statistic value for significance of adding model terms.
        PR(>F) : float64
            P-value for significance of adding model terms.

        When args is multiple models, return is DataFrame with columns:

        df_resid : float64
            Degrees of freedom of residuals in models.
        ssr : float64
            Sum of squares of residuals in models.
        df_diff : float64
            Degrees of freedom difference from previous model in args
        ss_diff : float64
            Difference in ssr from previous model in args
        F : float64
            F statistic comparing to previous model in args
        Pr(>F) : float64
            P-value for significance comparing to previous model in args

    Notes
    -----
    Model statistics are given in the order of args. Models must have been
    fit using the formula api.

    See Also
    --------
    model_results.compare_f_test, model_results.compare_lm_test

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> from statsmodels.formula.api import ols
    >>> moore = sm.datasets.get_rdataset("Moore", "carData", cache=True)  # load
    >>> data = moore.data
    >>> data = data.rename(columns={"partner.status":
    ...                             "partner_status"})  # make name pythonic
    >>> moore_lm = ols('conformity ~ C(fcategory, Sum)*C(partner_status, Sum)',
    ...                data=data).fit()
    >>> table = sm.stats.anova_lm(moore_lm, typ=2)  # Type 2 Anova DataFrame
    >>> print(table)
    """
    typ = kwargs.get('typ', 1)

    ### Farm Out Single model Anova Type I, II, III, and IV ###

    if len(args) == 1:
        model = args[0]
        return anova_single(model, **kwargs)

    if typ not in [1, "I"]:
        raise ValueError("Multiple models only supported for type I. "
                         "Got type %s" % str(typ))

    test = kwargs.get("test", "F")
    scale = kwargs.get("scale", None)
    n_models = len(args)
    pr_test = "Pr(>%s)" % test
    names = ['df_resid', 'ssr', 'df_diff', 'ss_diff', test, pr_test]
    table = DataFrame(np.zeros((n_models, 6)), columns=names)

    if not scale:  # assume biggest model is last
        scale = args[-1].scale

    table["ssr"] = [mdl.ssr for mdl in args]
    table["df_resid"] = [mdl.df_resid for mdl in args]
    table.loc[table.index[1:], "df_diff"] = -np.diff(table["df_resid"].values)
    table["ss_diff"] = -table["ssr"].diff()
    if test == "F":
        table["F"] = table["ss_diff"] / table["df_diff"] / scale
        table[pr_test] = stats.f.sf(table["F"], table["df_diff"],
                                    table["df_resid"])
        # for earlier scipy - stats.f.sf(np.nan, 10, 2) -> 0 not nan
        table.loc[table['F'].isnull(), pr_test] = np.nan

    return table


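# Illustrative sketch, not part of the original module: the multi-model
# branch of anova_lm above compares a sequence of nested formula-API fits
# row by row (df_resid, ssr, df_diff, ss_diff, F, Pr(>F)). The helper name
# is hypothetical; the data set mirrors the docstring example.
def _example_anova_lm_nested_models():
    import statsmodels.api as sm
    from statsmodels.formula.api import ols

    data = sm.datasets.get_rdataset("Moore", "carData", cache=True).data
    data = data.rename(columns={"partner.status": "partner_status"})

    # smaller model first; anova_lm assumes the largest model comes last
    small = ols("conformity ~ C(partner_status, Sum)", data=data).fit()
    full = ols("conformity ~ C(fcategory, Sum)*C(partner_status, Sum)",
               data=data).fit()

    print(anova_lm(small, full))

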

def _not_slice(slices, slices_to_exclude, n):
    ind = np.array([True] * n)
    for term in slices_to_exclude:
        s = slices[term]
        ind[s] = False
    return ind


def _ssr_reduced_model(y, x, term_slices, params, keys):
    """
    Residual sum of squares of OLS model excluding factors in `keys`
    Assumes x matrix is orthogonal

    Parameters
    ----------
    y : array_like
        dependent variable
    x : array_like
        independent variables
    term_slices : a dict of slices
        term_slices[key] is a boolean array that specifies the parameters
        associated with the factor `key`
    params : ndarray
        OLS solution of y = x * params
    keys : keys for term_slices
        factors to be excluded

    Returns
    -------
    rss : float
        residual sum of squares
    df : int
        degrees of freedom
    """
    ind = _not_slice(term_slices, keys, x.shape[1])
    params1 = params[ind]
    ssr = np.subtract(y, x[:, ind].dot(params1))
    ssr = ssr.T.dot(ssr)
    df_resid = len(y) - len(params1)
    return ssr, df_resid



class AnovaRM(object):
    """
    Repeated measures Anova using least squares regression

    The full model regression residual sum of squares is
    used to compare with the reduced model for calculating the
    within-subject effect sum of squares [1].

    Currently, only fully balanced within-subject designs are supported.
    Calculation of between-subject effects and corrections for violation of
    sphericity are not yet implemented.

    Parameters
    ----------
    data : DataFrame
    depvar : str
        The dependent variable in `data`
    subject : str
        Specify the subject id
    within : list[str]
        The within-subject factors
    between : list[str]
        The between-subject factors, this is not yet implemented
    aggregate_func : {None, 'mean', callable}
        If the data set contains more than a single observation per subject
        and cell of the specified model, this function will be used to
        aggregate the data before running the Anova. `None` (the default)
        will not perform any aggregation; 'mean' is a shortcut to
        `numpy.mean`. An exception will be raised if aggregation is
        required, but no aggregation function was specified.

    Returns
    -------
    results : AnovaResults instance

    Raises
    ------
    ValueError
        If the data need to be aggregated, but `aggregate_func` was not
        specified.

    Notes
    -----
    This implementation currently only supports fully balanced designs. If
    the data contain more than one observation per subject and cell of the
    design, these observations need to be aggregated into a single
    observation before the Anova is calculated, either manually or by
    passing an aggregation function via the `aggregate_func` keyword
    argument. Note that if the input data set was not balanced before
    performing the aggregation, the implied heteroscedasticity of the data
    is ignored.

    References
    ----------
    .. [1] Rutherford, Andrew. Anova and ANCOVA: a GLM approach. John Wiley
       & Sons, 2011.
    """


    def __init__(self, data, depvar, subject, within=None, between=None,
                 aggregate_func=None):
        self.data = data
        self.depvar = depvar
        self.within = within
        if 'C' in within:
            raise ValueError("Factor name cannot be 'C'! This is in conflict "
                             "with patsy's contrast function name.")
        self.between = between
        if between is not None:
            raise NotImplementedError('Between subject effect not '
                                      'yet supported!')
        self.subject = subject

        if aggregate_func == 'mean':
            self.aggregate_func = np.mean
        else:
            self.aggregate_func = aggregate_func

        if not data.equals(data.drop_duplicates(subset=[subject] + within)):
            if self.aggregate_func is not None:
                self._aggregate()
            else:
                msg = ('The data set contains more than one observation per '
                       'subject and cell. Either aggregate the data manually, '
                       'or pass the `aggregate_func` parameter.')
                raise ValueError(msg)

        self._check_data_balanced()

    def _aggregate(self):
        self.data = (self.data
                     .groupby([self.subject] + self.within,
                              as_index=False)[self.depvar]
                     .agg(self.aggregate_func))

    def _check_data_balanced(self):
        """raise if data is not balanced

        This raises a ValueError if the data is not balanced, and
        returns None if it is balanced.

        Return might change
        """
        factor_levels = 1
        for wi in self.within:
            factor_levels *= len(self.data[wi].unique())

        cell_count = {}
        for index in range(self.data.shape[0]):
            key = []
            for col in self.within:
                key.append(self.data[col].iloc[index])
            key = tuple(key)
            if key in cell_count:
                cell_count[key] = cell_count[key] + 1
            else:
                cell_count[key] = 1
        error_message = "Data is unbalanced."
        if len(cell_count) != factor_levels:
            raise ValueError(error_message)
        count = cell_count[key]
        for key in cell_count:
            if count != cell_count[key]:
                raise ValueError(error_message)
        if self.data.shape[0] > count * factor_levels:
            raise ValueError('There are more than 1 element in a cell! '
                             'Missing factors?')


    def fit(self):
        """estimate the model and compute the Anova table

        Returns
        -------
        AnovaResults instance
        """
        y = self.data[self.depvar].values

        # Construct OLS endog and exog from string using patsy
        within = ['C(%s, Sum)' % i for i in self.within]
        subject = 'C(%s, Sum)' % self.subject
        factors = within + [subject]
        x = patsy.dmatrix('*'.join(factors), data=self.data)
        term_slices = x.design_info.term_name_slices
        for key in term_slices:
            ind = np.array([False] * x.shape[1])
            ind[term_slices[key]] = True
            term_slices[key] = np.array(ind)
        term_exclude = [':'.join(factors)]
        ind = _not_slice(term_slices, term_exclude, x.shape[1])
        x = x[:, ind]

        # Fit OLS
        model = OLS(y, x)
        results = model.fit()
        if model.rank < x.shape[1]:
            raise ValueError('Independent variables are collinear.')
        for i in term_exclude:
            term_slices.pop(i)
        for key in term_slices:
            term_slices[key] = term_slices[key][ind]
        params = results.params
        df_resid = results.df_resid
        ssr = results.ssr

        columns = ['F Value', 'Num DF', 'Den DF', 'Pr > F']
        anova_table = pd.DataFrame(np.zeros((0, 4)), columns=columns)

        for key in term_slices:
            if self.subject not in key and key != 'Intercept':
                # independent variables are orthogonal
                ssr1, df_resid1 = _ssr_reduced_model(
                    y, x, term_slices, params, [key])
                df1 = df_resid1 - df_resid
                msm = (ssr1 - ssr) / df1
                if (key == ':'.join(factors[:-1]) or
                        (key + ':' + subject not in term_slices)):
                    mse = ssr / df_resid
                    df2 = df_resid
                else:
                    ssr1, df_resid1 = _ssr_reduced_model(
                        y, x, term_slices, params,
                        [key + ':' + subject])
                    df2 = df_resid1 - df_resid
                    mse = (ssr1 - ssr) / df2
                F = msm / mse
                p = stats.f.sf(F, df1, df2)
                term = key.replace('C(', '').replace(', Sum)', '')
                anova_table.loc[term, 'F Value'] = F
                anova_table.loc[term, 'Num DF'] = df1
                anova_table.loc[term, 'Den DF'] = df2
                anova_table.loc[term, 'Pr > F'] = p

        return AnovaResults(anova_table)


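# Illustrative sketch, not part of the original module: a minimal balanced
# within-subject design for AnovaRM. The variable names and synthetic data
# are assumptions made for this example only.
def _example_anova_rm():
    rng = np.random.RandomState(0)
    n_subjects = 10
    conditions = ['a', 'b', 'c']
    data = pd.DataFrame({
        'subject': np.repeat(np.arange(n_subjects), len(conditions)),
        'condition': np.tile(conditions, n_subjects),
        'rt': rng.normal(size=n_subjects * len(conditions)),
    })

    # exactly one observation per subject/condition cell, so no
    # aggregate_func is needed
    res = AnovaRM(data, depvar='rt', subject='subject',
                  within=['condition']).fit()
    print(res)  # AnovaResults; __str__ delegates to summary()

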

class AnovaResults(object):
    """
    Anova results class

    Attributes
    ----------
    anova_table : DataFrame
    """
    def __init__(self, anova_table):
        self.anova_table = anova_table

    def __str__(self):
        return self.summary().__str__()

    def summary(self):
        """create summary results

        Returns
        -------
        summary : summary2.Summary instance
        """
        summ = summary2.Summary()
        summ.add_title('Anova')
        summ.add_df(self.anova_table)

        return summ



if __name__ == "__main__":
    import pandas
    from statsmodels.formula.api import ols
    # in R
    # library(car)
    # write.csv(Moore, "moore.csv", row.names=FALSE)
    moore = pandas.read_csv('moore.csv', skiprows=1,
                            names=['partner_status', 'conformity',
                                   'fcategory', 'fscore'])
    moore_lm = ols('conformity ~ C(fcategory, Sum)*C(partner_status, Sum)',
                   data=moore).fit()

    mooreB = ols('conformity ~ C(partner_status, Sum)', data=moore).fit()

    # for each term you just want to test vs the model without its
    # higher-order terms

    # using Monette-Fox slides and Marden class notes for linear algebra /
    # orthogonal complement
    # https://netfiles.uiuc.edu/jimarden/www/Classes/STAT324/

    table = anova_lm(moore_lm, typ=2)
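
    # Added illustrations (assumptions, not part of the original script):
    # a heteroscedasticity-robust Type II table, as recommended in the
    # anova_lm docstring when a robust covariance is desired, and a
    # sequential comparison of the nested fits mooreB and moore_lm above.
    table_hc3 = anova_lm(moore_lm, typ=2, robust="hc3")
    table_nested = anova_lm(mooreB, moore_lm)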