Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2 

3"""General linear model 

4 

5author: Yichuan Liu 

6""" 

7import numpy as np 

8from numpy.linalg import eigvals, inv, solve, matrix_rank, pinv, svd 

9from scipy import stats 

10import pandas as pd 

11from patsy import DesignInfo 

12 

13from statsmodels.compat.pandas import Substitution 

14from statsmodels.base.model import Model 

15from statsmodels.iolib import summary2 

16__docformat__ = 'restructuredtext en' 

17 

18_hypotheses_doc = \ 

19"""hypotheses : list[tuple] 

20 Hypothesis `L*B*M = C` to be tested where B is the parameters in 

21 regression Y = X*B. Each element is a tuple of length 2, 3, or 4: 

22 

23 * (name, contrast_L) 

24 * (name, contrast_L, transform_M) 

25 * (name, contrast_L, transform_M, constant_C) 

26 

27 containing a string `name`, the contrast matrix L, the transform 

28 matrix M (for transforming dependent variables), and right-hand side 

29 constant matrix constant_C, respectively. 

30 

31 contrast_L : 2D array or an array of strings 

32 Left-hand side contrast matrix for hypotheses testing. 

33 If 2D array, each row is an hypotheses and each column is an 

34 independent variable. At least 1 row 

35 (1 by k_exog, the number of independent variables) is required. 

36 If an array of strings, it will be passed to 

37 patsy.DesignInfo().linear_constraint. 

38 

39 transform_M : 2D array or an array of strings or None, optional 

40 Left hand side transform matrix. 

41 If `None` or left out, it is set to a k_endog by k_endog 

42 identity matrix (i.e. do not transform y matrix). 

43 If an array of strings, it will be passed to 

44 patsy.DesignInfo().linear_constraint. 

45 

46 constant_C : 2D array or None, optional 

47 Right-hand side constant matrix. 

48 if `None` or left out it is set to a matrix of zeros 

49 Must has the same number of rows as contrast_L and the same 

50 number of columns as transform_M 

51 

52 If `hypotheses` is None: 1) the effect of each independent variable 

53 on the dependent variables will be tested. Or 2) if model is created 

54 using a formula, `hypotheses` will be created according to 

55 `design_info`. 1) and 2) is equivalent if no additional variables 

56 are created by the formula (e.g. dummy variables for categorical 

57 variables and interaction terms) 

58""" 

59 

60 

61def _multivariate_ols_fit(endog, exog, method='svd', tolerance=1e-8): 

62 """ 

63 Solve multivariate linear model y = x * params 

64 where y is dependent variables, x is independent variables 

65 

66 Parameters 

67 ---------- 

68 endog : array_like 

69 each column is a dependent variable 

70 exog : array_like 

71 each column is a independent variable 

72 method : str 

73 'svd' - Singular value decomposition 

74 'pinv' - Moore-Penrose pseudoinverse 

75 tolerance : float, a small positive number 

76 Tolerance for eigenvalue. Values smaller than tolerance is considered 

77 zero. 

78 Returns 

79 ------- 

80 a tuple of matrices or values necessary for hypotheses testing 

81 

82 .. [*] https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introreg_sect012.htm 

83 Notes 

84 ----- 

85 Status: experimental and incomplete 

86 """ 

87 y = endog 

88 x = exog 

89 nobs, k_endog = y.shape 

90 nobs1, k_exog= x.shape 

91 if nobs != nobs1: 

92 raise ValueError('x(n=%d) and y(n=%d) should have the same number of ' 

93 'rows!' % (nobs1, nobs)) 

94 

95 # Calculate the matrices necessary for hypotheses testing 

96 df_resid = nobs - k_exog 

97 if method == 'pinv': 

98 # Regression coefficients matrix 

99 pinv_x = pinv(x) 

100 params = pinv_x.dot(y) 

101 

102 # inverse of x'x 

103 inv_cov = pinv_x.dot(pinv_x.T) 

104 if matrix_rank(inv_cov,tol=tolerance) < k_exog: 

105 raise ValueError('Covariance of x singular!') 

106 

107 # Sums of squares and cross-products of residuals 

108 # Y'Y - (X * params)'B * params 

109 t = x.dot(params) 

110 sscpr = np.subtract(y.T.dot(y), t.T.dot(t)) 

111 return (params, df_resid, inv_cov, sscpr) 

112 elif method == 'svd': 

113 u, s, v = svd(x, 0) 

114 if (s > tolerance).sum() < len(s): 

115 raise ValueError('Covariance of x singular!') 

116 invs = 1. / s 

117 

118 params = v.T.dot(np.diag(invs)).dot(u.T).dot(y) 

119 inv_cov = v.T.dot(np.diag(np.power(invs, 2))).dot(v) 

120 t = np.diag(s).dot(v).dot(params) 

121 sscpr = np.subtract(y.T.dot(y), t.T.dot(t)) 

122 return (params, df_resid, inv_cov, sscpr) 

123 else: 

124 raise ValueError('%s is not a supported method!' % method) 

125 

126 

127def multivariate_stats(eigenvals, 

128 r_err_sscp, 

129 r_contrast, df_resid, tolerance=1e-8): 

130 """ 

131 For multivariate linear model Y = X * B 

132 Testing hypotheses 

133 L*B*M = 0 

134 where L is contrast matrix, B is the parameters of the 

135 multivariate linear model and M is dependent variable transform matrix. 

136 T = L*inv(X'X)*L' 

137 H = M'B'L'*inv(T)*LBM 

138 E = M'(Y'Y - B'X'XB)M 

139 

140 Parameters 

141 ---------- 

142 eigenvals : ndarray 

143 The eigenvalues of inv(E + H)*H 

144 r_err_sscp : int 

145 Rank of E + H 

146 r_contrast : int 

147 Rank of T matrix 

148 df_resid : int 

149 Residual degree of freedom (n_samples minus n_variables of X) 

150 tolerance : float 

151 smaller than which eigenvalue is considered 0 

152 

153 Returns 

154 ------- 

155 A DataFrame 

156 

157 References 

158 ---------- 

159 .. [*] https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introreg_sect012.htm 

160 """ 

161 v = df_resid 

162 p = r_err_sscp 

163 q = r_contrast 

164 s = np.min([p, q]) 

165 ind = eigenvals > tolerance 

166 n_e = ind.sum() 

167 eigv2 = eigenvals[ind] 

168 eigv1 = np.array([i / (1 - i) for i in eigv2]) 

169 m = (np.abs(p - q) - 1) / 2 

170 n = (v - p - 1) / 2 

171 

172 cols = ['Value', 'Num DF', 'Den DF', 'F Value', 'Pr > F'] 

173 index = ["Wilks' lambda", "Pillai's trace", 

174 "Hotelling-Lawley trace", "Roy's greatest root"] 

175 results = pd.DataFrame(columns=cols, 

176 index=index) 

177 

178 def fn(x): 

179 return np.real([x])[0] 

180 

181 results.loc["Wilks' lambda", 'Value'] = fn(np.prod(1 - eigv2)) 

182 

183 results.loc["Pillai's trace", 'Value'] = fn(eigv2.sum()) 

184 

185 results.loc["Hotelling-Lawley trace", 'Value'] = fn(eigv1.sum()) 

186 

187 results.loc["Roy's greatest root", 'Value'] = fn(eigv1.max()) 

188 

189 r = v - (p - q + 1)/2 

190 u = (p*q - 2) / 4 

191 df1 = p * q 

192 if p*p + q*q - 5 > 0: 

193 t = np.sqrt((p*p*q*q - 4) / (p*p + q*q - 5)) 

194 else: 

195 t = 1 

196 df2 = r*t - 2*u 

197 lmd = results.loc["Wilks' lambda", 'Value'] 

198 lmd = np.power(lmd, 1 / t) 

199 F = (1 - lmd) / lmd * df2 / df1 

200 results.loc["Wilks' lambda", 'Num DF'] = df1 

201 results.loc["Wilks' lambda", 'Den DF'] = df2 

202 results.loc["Wilks' lambda", 'F Value'] = F 

203 pval = stats.f.sf(F, df1, df2) 

204 results.loc["Wilks' lambda", 'Pr > F'] = pval 

205 

206 V = results.loc["Pillai's trace", 'Value'] 

207 df1 = s * (2*m + s + 1) 

208 df2 = s * (2*n + s + 1) 

209 F = df2 / df1 * V / (s - V) 

210 results.loc["Pillai's trace", 'Num DF'] = df1 

211 results.loc["Pillai's trace", 'Den DF'] = df2 

212 results.loc["Pillai's trace", 'F Value'] = F 

213 pval = stats.f.sf(F, df1, df2) 

214 results.loc["Pillai's trace", 'Pr > F'] = pval 

215 

216 U = results.loc["Hotelling-Lawley trace", 'Value'] 

217 if n > 0: 

218 b = (p + 2*n) * (q + 2*n) / 2 / (2*n + 1) / (n - 1) 

219 df1 = p * q 

220 df2 = 4 + (p*q + 2) / (b - 1) 

221 c = (df2 - 2) / 2 / n 

222 F = df2 / df1 * U / c 

223 else: 

224 df1 = s * (2*m + s + 1) 

225 df2 = s * (s*n + 1) 

226 F = df2 / df1 / s * U 

227 results.loc["Hotelling-Lawley trace", 'Num DF'] = df1 

228 results.loc["Hotelling-Lawley trace", 'Den DF'] = df2 

229 results.loc["Hotelling-Lawley trace", 'F Value'] = F 

230 pval = stats.f.sf(F, df1, df2) 

231 results.loc["Hotelling-Lawley trace", 'Pr > F'] = pval 

232 

233 sigma = results.loc["Roy's greatest root", 'Value'] 

234 r = np.max([p, q]) 

235 df1 = r 

236 df2 = v - r + q 

237 F = df2 / df1 * sigma 

238 results.loc["Roy's greatest root", 'Num DF'] = df1 

239 results.loc["Roy's greatest root", 'Den DF'] = df2 

240 results.loc["Roy's greatest root", 'F Value'] = F 

241 pval = stats.f.sf(F, df1, df2) 

242 results.loc["Roy's greatest root", 'Pr > F'] = pval 

243 return results 

244 

245 

246def _multivariate_ols_test(hypotheses, fit_results, exog_names, 

247 endog_names): 

248 def fn(L, M, C): 

249 # .. [1] https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introreg_sect012.htm 

250 params, df_resid, inv_cov, sscpr = fit_results 

251 # t1 = (L * params)M 

252 t1 = L.dot(params).dot(M) - C 

253 # H = t1'L(X'X)^L't1 

254 t2 = L.dot(inv_cov).dot(L.T) 

255 q = matrix_rank(t2) 

256 H = t1.T.dot(inv(t2)).dot(t1) 

257 

258 # E = M'(Y'Y - B'(X'X)B)M 

259 E = M.T.dot(sscpr).dot(M) 

260 return E, H, q, df_resid 

261 

262 return _multivariate_test(hypotheses, exog_names, endog_names, fn) 

263 

264 

265@Substitution(hypotheses_doc=_hypotheses_doc) 

266def _multivariate_test(hypotheses, exog_names, endog_names, fn): 

267 """ 

268 Multivariate linear model hypotheses testing 

269 

270 For y = x * params, where y are the dependent variables and x are the 

271 independent variables, testing L * params * M = 0 where L is the contrast 

272 matrix for hypotheses testing and M is the transformation matrix for 

273 transforming the dependent variables in y. 

274 

275 Algorithm: 

276 T = L*inv(X'X)*L' 

277 H = M'B'L'*inv(T)*LBM 

278 E = M'(Y'Y - B'X'XB)M 

279 And then finding the eigenvalues of inv(H + E)*H 

280 

281 .. [*] https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introreg_sect012.htm 

282 

283 Parameters 

284 ---------- 

285 %(hypotheses_doc)s 

286 k_xvar : int 

287 The number of independent variables 

288 k_yvar : int 

289 The number of dependent variables 

290 fn : function 

291 a function fn(contrast_L, transform_M) that returns E, H, q, df_resid 

292 where q is the rank of T matrix 

293 

294 Returns 

295 ------- 

296 results : MANOVAResults 

297 """ 

298 

299 k_xvar = len(exog_names) 

300 k_yvar = len(endog_names) 

301 results = {} 

302 for hypo in hypotheses: 

303 if len(hypo) ==2: 

304 name, L = hypo 

305 M = None 

306 C = None 

307 elif len(hypo) == 3: 

308 name, L, M = hypo 

309 C = None 

310 elif len(hypo) == 4: 

311 name, L, M, C = hypo 

312 else: 

313 raise ValueError('hypotheses must be a tuple of length 2, 3 or 4.' 

314 ' len(hypotheses)=%d' % len(hypo)) 

315 if any(isinstance(j, str) for j in L): 

316 L = DesignInfo(exog_names).linear_constraint(L).coefs 

317 else: 

318 if not isinstance(L, np.ndarray) or len(L.shape) != 2: 

319 raise ValueError('Contrast matrix L must be a 2-d array!') 

320 if L.shape[1] != k_xvar: 

321 raise ValueError('Contrast matrix L should have the same ' 

322 'number of columns as exog! %d != %d' % 

323 (L.shape[1], k_xvar)) 

324 if M is None: 

325 M = np.eye(k_yvar) 

326 elif any(isinstance(j, str) for j in M): 

327 M = DesignInfo(endog_names).linear_constraint(M).coefs.T 

328 else: 

329 if M is not None: 

330 if not isinstance(M, np.ndarray) or len(M.shape) != 2: 

331 raise ValueError('Transform matrix M must be a 2-d array!') 

332 if M.shape[0] != k_yvar: 

333 raise ValueError('Transform matrix M should have the same ' 

334 'number of rows as the number of columns ' 

335 'of endog! %d != %d' % 

336 (M.shape[0], k_yvar)) 

337 if C is None: 

338 C = np.zeros([L.shape[0], M.shape[1]]) 

339 elif not isinstance(C, np.ndarray): 

340 raise ValueError('Constant matrix C must be a 2-d array!') 

341 

342 if C.shape[0] != L.shape[0]: 

343 raise ValueError('contrast L and constant C must have the same ' 

344 'number of rows! %d!=%d' 

345 % (L.shape[0], C.shape[0])) 

346 if C.shape[1] != M.shape[1]: 

347 raise ValueError('transform M and constant C must have the same ' 

348 'number of columns! %d!=%d' 

349 % (M.shape[1], C.shape[1])) 

350 E, H, q, df_resid = fn(L, M, C) 

351 EH = np.add(E, H) 

352 p = matrix_rank(EH) 

353 

354 # eigenvalues of inv(E + H)H 

355 eigv2 = np.sort(eigvals(solve(EH, H))) 

356 stat_table = multivariate_stats(eigv2, p, q, df_resid) 

357 

358 results[name] = {'stat':stat_table, 'contrast_L':L, 

359 'transform_M':M, 'constant_C':C} 

360 return results 

361 

362 

363class _MultivariateOLS(Model): 

364 """ 

365 Multivariate linear model via least squares 

366 

367 

368 Parameters 

369 ---------- 

370 endog : array_like 

371 Dependent variables. A nobs x k_endog array where nobs is 

372 the number of observations and k_endog is the number of dependent 

373 variables 

374 exog : array_like 

375 Independent variables. A nobs x k_exog array where nobs is the 

376 number of observations and k_exog is the number of independent 

377 variables. An intercept is not included by default and should be added 

378 by the user (models specified using a formula include an intercept by 

379 default) 

380 

381 Attributes 

382 ---------- 

383 endog : ndarray 

384 See Parameters. 

385 exog : ndarray 

386 See Parameters. 

387 """ 

388 _formula_max_endog = None 

389 

390 def __init__(self, endog, exog, missing='none', hasconst=None, **kwargs): 

391 if len(endog.shape) == 1 or endog.shape[1] == 1: 

392 raise ValueError('There must be more than one dependent variable' 

393 ' to fit multivariate OLS!') 

394 super(_MultivariateOLS, self).__init__(endog, exog, missing=missing, 

395 hasconst=hasconst, **kwargs) 

396 

397 def fit(self, method='svd'): 

398 self._fittedmod = _multivariate_ols_fit( 

399 self.endog, self.exog, method=method) 

400 return _MultivariateOLSResults(self) 

401 

402 

403class _MultivariateOLSResults(object): 

404 """ 

405 _MultivariateOLS results class 

406 """ 

407 def __init__(self, fitted_mv_ols): 

408 if (hasattr(fitted_mv_ols, 'data') and 

409 hasattr(fitted_mv_ols.data, 'design_info')): 

410 self.design_info = fitted_mv_ols.data.design_info 

411 else: 

412 self.design_info = None 

413 self.exog_names = fitted_mv_ols.exog_names 

414 self.endog_names = fitted_mv_ols.endog_names 

415 self._fittedmod = fitted_mv_ols._fittedmod 

416 

417 def __str__(self): 

418 return self.summary().__str__() 

419 

420 @Substitution(hypotheses_doc=_hypotheses_doc) 

421 def mv_test(self, hypotheses=None): 

422 """ 

423 Linear hypotheses testing 

424 

425 Parameters 

426 ---------- 

427 %(hypotheses_doc)s 

428 

429 Returns 

430 ------- 

431 results: _MultivariateOLSResults 

432 

433 Notes 

434 ----- 

435 Tests hypotheses of the form 

436 

437 L * params * M = C 

438 

439 where `params` is the regression coefficient matrix for the 

440 linear model y = x * params, `L` is the contrast matrix, `M` is the 

441 dependent variable transform matrix and C is the constant matrix. 

442 """ 

443 k_xvar = len(self.exog_names) 

444 if hypotheses is None: 

445 if self.design_info is not None: 

446 terms = self.design_info.term_name_slices 

447 hypotheses = [] 

448 for key in terms: 

449 L_contrast = np.eye(k_xvar)[terms[key], :] 

450 hypotheses.append([key, L_contrast, None]) 

451 else: 

452 hypotheses = [] 

453 for i in range(k_xvar): 

454 name = 'x%d' % (i) 

455 L = np.zeros([1, k_xvar]) 

456 L[i] = 1 

457 hypotheses.append([name, L, None]) 

458 

459 results = _multivariate_ols_test(hypotheses, self._fittedmod, 

460 self.exog_names, self.endog_names) 

461 

462 return MultivariateTestResults(results, 

463 self.endog_names, 

464 self.exog_names) 

465 

466 def summary(self): 

467 raise NotImplementedError 

468 

469 

470class MultivariateTestResults(object): 

471 """ Multivariate test results class 

472 Returned by `mv_test` method of `_MultivariateOLSResults` class 

473 

474 Attributes 

475 ---------- 

476 results : dict 

477 For hypothesis name `key`: 

478 results[key]['stat'] contains the multivariate test results 

479 results[key]['contrast_L'] contains the contrast_L matrix 

480 results[key]['transform_M'] contains the transform_M matrix 

481 results[key]['constant_C'] contains the constant_C matrix 

482 endog_names : str 

483 exog_names : str 

484 summary_frame : multiindex dataframe 

485 Returns results as a multiindex dataframe 

486 """ 

487 def __init__(self, mv_test_df, endog_names, exog_names): 

488 self.results = mv_test_df 

489 self.endog_names = endog_names 

490 self.exog_names = exog_names 

491 

492 def __str__(self): 

493 return self.summary().__str__() 

494 

495 def __getitem__(self, item): 

496 return self.results[item] 

497 

498 @property 

499 def summary_frame(self): 

500 """ 

501 Return results as a multiindex dataframe 

502 """ 

503 df = [] 

504 for key in self.results: 

505 tmp = self.results[key]['stat'].copy() 

506 tmp.loc[:, 'Effect'] = key 

507 df.append(tmp.reset_index()) 

508 df = pd.concat(df, axis=0) 

509 df = df.set_index(['Effect', 'index']) 

510 df.index.set_names(['Effect', 'Statistic'], inplace=True) 

511 return df 

512 

513 def summary(self, show_contrast_L=False, show_transform_M=False, 

514 show_constant_C=False): 

515 """ 

516 

517 Parameters 

518 ---------- 

519 contrast_L : True or False 

520 Whether to show contrast_L matrix 

521 transform_M : True or False 

522 Whether to show transform_M matrix 

523 """ 

524 summ = summary2.Summary() 

525 summ.add_title('Multivariate linear model') 

526 for key in self.results: 

527 summ.add_dict({'':''}) 

528 df = self.results[key]['stat'].copy() 

529 df = df.reset_index() 

530 c = df.columns.values 

531 c[0] = key 

532 df.columns = c 

533 df.index = ['', '', '', ''] 

534 summ.add_df(df) 

535 if show_contrast_L: 

536 summ.add_dict({key:' contrast L='}) 

537 df = pd.DataFrame(self.results[key]['contrast_L'], 

538 columns=self.exog_names) 

539 summ.add_df(df) 

540 if show_transform_M: 

541 summ.add_dict({key:' transform M='}) 

542 df = pd.DataFrame(self.results[key]['transform_M'], 

543 index=self.endog_names) 

544 summ.add_df(df) 

545 if show_constant_C: 

546 summ.add_dict({key:' constant C='}) 

547 df = pd.DataFrame(self.results[key]['constant_C']) 

548 summ.add_df(df) 

549 return summ