Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2 

3import warnings 

4 

5import numpy as np 

6from numpy.linalg import eigh, inv, norm, matrix_rank 

7import pandas as pd 

8from scipy.optimize import minimize 

9 

10from statsmodels.tools.decorators import cache_readonly 

11from statsmodels.base.model import Model 

12from statsmodels.iolib import summary2 

13from statsmodels.graphics.utils import _import_mpl 

14 

15from .factor_rotation import rotate_factors, promax 

16 

17 

18_opt_defaults = {'gtol': 1e-7} 

19 

20 

21def _check_args_1(endog, n_factor, corr, nobs): 

22 

23 msg = "Either endog or corr must be provided." 

24 if endog is not None and corr is not None: 

25 raise ValueError(msg) 

26 if endog is None and corr is None: 

27 warnings.warn('Both endog and corr are provided, ' + 

28 'corr will be used for factor analysis.') 

29 

30 if n_factor <= 0: 

31 raise ValueError('n_factor must be larger than 0! %d < 0' % 

32 (n_factor)) 

33 

34 if nobs is not None and endog is not None: 

35 warnings.warn("nobs is ignored when endog is provided") 

36 

37 

38def _check_args_2(endog, n_factor, corr, nobs, k_endog): 

39 

40 if n_factor > k_endog: 

41 raise ValueError('n_factor cannot be greater than the number' 

42 ' of variables! %d > %d' % 

43 (n_factor, k_endog)) 

44 

45 if np.max(np.abs(np.diag(corr) - 1)) > 1e-10: 

46 raise ValueError("corr must be a correlation matrix") 

47 

48 if corr.shape[0] != corr.shape[1]: 

49 raise ValueError('Correlation matrix corr must be a square ' 

50 '(rows %d != cols %d)' % corr.shape) 

51 

52 

53class Factor(Model): 

54 """ 

55 Factor analysis 

56 

57 Parameters 

58 ---------- 

59 endog : array_like 

60 Variables in columns, observations in rows. May be `None` if 

61 `corr` is not `None`. 

62 n_factor : int 

63 The number of factors to extract 

64 corr : array_like 

65 Directly specify the correlation matrix instead of estimating 

66 it from `endog`. If provided, `endog` is not used for the 

67 factor analysis, it may be used in post-estimation. 

68 method : str 

69 The method to extract factors, currently must be either 'pa' 

70 for principal axis factor analysis or 'ml' for maximum 

71 likelihood estimation. 

72 smc : True or False 

73 Whether or not to apply squared multiple correlations (method='pa') 

74 endog_names : str 

75 Names of endogenous variables. If specified, it will be used 

76 instead of the column names in endog 

77 nobs : int 

78 The number of observations, not used if endog is present. Needs to 

79 be provided for inference if endog is None. 

80 missing : 'none', 'drop', or 'raise' 

81 Missing value handling for endog, default is row-wise deletion 'drop' 

82 If 'none', no nan checking is done. If 'drop', any observations with 

83 nans are dropped. If 'raise', an error is raised. 

84 

85 

86 Notes 

87 ----- 

88 **Experimental** 

89 

90 Supported rotations: 'varimax', 'quartimax', 'biquartimax', 

91 'equamax', 'oblimin', 'parsimax', 'parsimony', 'biquartimin', 

92 'promax' 

93 

94 If method='ml', the factors are rotated to satisfy condition IC3 

95 of Bai and Li (2012). This means that the scores have covariance 

96 I, so the model for the covariance matrix is L * L' + diag(U), 

97 where L are the loadings and U are the uniquenesses. In addition, 

98 L' * diag(U)^{-1} L must be diagonal. 

99 

100 References 

101 ---------- 

102 .. [*] Hofacker, C. (2004). Exploratory Factor Analysis, Mathematical 

103 Marketing. http://www.openaccesstexts.org/pdf/Quant_Chapter_11_efa.pdf 

104 .. [*] J Bai, K Li (2012). Statistical analysis of factor models of high 

105 dimension. Annals of Statistics. https://arxiv.org/pdf/1205.6617.pdf 

106 """ 

107 def __init__(self, endog=None, n_factor=1, corr=None, method='pa', 

108 smc=True, endog_names=None, nobs=None, missing='drop'): 

109 

110 _check_args_1(endog, n_factor, corr, nobs) 

111 

112 if endog is not None: 

113 super(Factor, self).__init__(endog, exog=None, missing=missing) 

114 endog = self.endog # after preprocessing like missing, asarray 

115 k_endog = endog.shape[1] 

116 nobs = endog.shape[0] 

117 corr = self.corr = np.corrcoef(endog, rowvar=0) 

118 elif corr is not None: 

119 corr = self.corr = np.asarray(corr) 

120 k_endog = self.corr.shape[0] 

121 self.endog = None 

122 else: 

123 msg = "Either endog or corr must be provided." 

124 raise ValueError(msg) 

125 

126 _check_args_2(endog, n_factor, corr, nobs, k_endog) 

127 

128 self.n_factor = n_factor 

129 self.loadings = None 

130 self.communality = None 

131 self.method = method 

132 self.smc = smc 

133 self.nobs = nobs 

134 self.method = method 

135 self.corr = corr 

136 self.k_endog = k_endog 

137 

138 if endog_names is None: 

139 if hasattr(corr, 'index'): 

140 endog_names = corr.index 

141 if hasattr(corr, 'columns'): 

142 endog_names = corr.columns 

143 self.endog_names = endog_names 

144 

145 @property 

146 def endog_names(self): 

147 """Names of endogenous variables""" 

148 if self._endog_names is not None: 

149 return self._endog_names 

150 else: 

151 if self.endog is not None: 

152 return self.data.ynames 

153 else: 

154 d = 0 

155 n = self.corr.shape[0] - 1 

156 while n > 0: 

157 d += 1 

158 n //= 10 

159 return [('var%0' + str(d) + 'd') % i 

160 for i in range(self.corr.shape[0])] 

161 

162 @endog_names.setter 

163 def endog_names(self, value): 

164 # Check validity of endog_names: 

165 if value is not None: 

166 if len(value) != self.corr.shape[0]: 

167 raise ValueError('The length of `endog_names` must ' 

168 'equal the number of variables.') 

169 self._endog_names = np.asarray(value) 

170 else: 

171 self._endog_names = None 

172 

173 def fit(self, maxiter=50, tol=1e-8, start=None, opt_method='BFGS', 

174 opt=None, em_iter=3): 

175 """ 

176 Estimate factor model parameters. 

177 

178 Parameters 

179 ---------- 

180 maxiter : int 

181 Maximum number of iterations for iterative estimation algorithms 

182 tol : float 

183 Stopping criteria (error tolerance) for iterative estimation 

184 algorithms 

185 start : array_like 

186 Starting values, currently only used for ML estimation 

187 opt_method : str 

188 Optimization method for ML estimation 

189 opt : dict-like 

190 Keyword arguments passed to optimizer, only used for ML estimation 

191 em_iter : int 

192 The number of EM iterations before starting gradient optimization, 

193 only used for ML estimation. 

194 

195 Returns 

196 ------- 

197 results: FactorResults 

198 """ 

199 method = self.method.lower() 

200 if method == 'pa': 

201 return self._fit_pa(maxiter=maxiter, tol=tol) 

202 elif method == 'ml': 

203 return self._fit_ml(start, em_iter, opt_method, opt) 

204 else: 

205 msg = "Unknown factor extraction approach '%s'" % self.method 

206 raise ValueError(msg) 

207 

208 def _fit_pa(self, maxiter=50, tol=1e-8): 

209 """ 

210 Extract factors using the iterative principal axis method 

211 

212 Parameters 

213 ---------- 

214 maxiter : int 

215 Maximum number of iterations for communality estimation 

216 tol : float 

217 If `norm(communality - last_communality) < tolerance`, 

218 estimation stops 

219 

220 Returns 

221 ------- 

222 results : FactorResults instance 

223 """ 

224 

225 R = self.corr.copy() # inplace modification below 

226 

227 # Parameter validation 

228 self.n_comp = matrix_rank(R) 

229 if self.n_factor > self.n_comp: 

230 raise ValueError('n_factor must be smaller or equal to the rank' 

231 ' of endog! %d > %d' % 

232 (self.n_factor, self.n_comp)) 

233 if maxiter <= 0: 

234 raise ValueError('n_max_iter must be larger than 0! %d < 0' % 

235 (maxiter)) 

236 if tol <= 0 or tol > 0.01: 

237 raise ValueError('tolerance must be larger than 0 and smaller than' 

238 ' 0.01! Got %f instead' % (tol)) 

239 

240 # Initial communality estimation 

241 if self.smc: 

242 c = 1 - 1 / np.diag(inv(R)) 

243 else: 

244 c = np.ones(len(R)) 

245 

246 # Iterative communality estimation 

247 eigenvals = None 

248 for i in range(maxiter): 

249 # Get eigenvalues/eigenvectors of R with diag replaced by 

250 # communality 

251 for j in range(len(R)): 

252 R[j, j] = c[j] 

253 L, V = eigh(R, UPLO='U') 

254 c_last = np.array(c) 

255 ind = np.argsort(L) 

256 ind = ind[::-1] 

257 L = L[ind] 

258 n_pos = (L > 0).sum() 

259 V = V[:, ind] 

260 eigenvals = np.array(L) 

261 

262 # Select eigenvectors with positive eigenvalues 

263 n = np.min([n_pos, self.n_factor]) 

264 sL = np.diag(np.sqrt(L[:n])) 

265 V = V[:, :n] 

266 

267 # Calculate new loadings and communality 

268 A = V.dot(sL) 

269 c = np.power(A, 2).sum(axis=1) 

270 if norm(c_last - c) < tol: 

271 break 

272 

273 self.eigenvals = eigenvals 

274 self.communality = c 

275 self.uniqueness = 1 - c 

276 self.loadings = A 

277 return FactorResults(self) 

278 

279 # Unpacks the model parameters from a flat vector, used for ML 

280 # estimation. The first k_endog elements of par are the square 

281 # roots of the uniquenesses. The remaining elements are the 

282 # factor loadings, packed one factor at a time. 

283 def _unpack(self, par): 

284 return (par[0:self.k_endog]**2, 

285 np.reshape(par[self.k_endog:], (-1, self.k_endog)).T) 

286 

287 # Packs the model parameters into a flat parameter, used for ML 

288 # estimation. 

289 def _pack(self, load, uniq): 

290 return np.concatenate((np.sqrt(uniq), load.T.flat)) 

291 

292 def loglike(self, par): 

293 """ 

294 Evaluate the log-likelihood function. 

295 

296 Parameters 

297 ---------- 

298 par : ndarray or tuple of 2 ndarray's 

299 The model parameters, either a packed representation of 

300 the model parameters or a 2-tuple containing a `k_endog x 

301 n_factor` matrix of factor loadings and a `k_endog` vector 

302 of uniquenesses. 

303 

304 Returns 

305 ------- 

306 loglike : float 

307 """ 

308 

309 if type(par) is np.ndarray: 

310 uniq, load = self._unpack(par) 

311 else: 

312 load, uniq = par[0], par[1] 

313 

314 loadu = load / uniq[:, None] 

315 lul = np.dot(load.T, loadu) 

316 

317 # log|GG' + S| 

318 # Using matrix determinant lemma: 

319 # |GG' + S| = |I + G'S^{-1}G|*|S| 

320 lul.flat[::lul.shape[0]+1] += 1 

321 _, ld = np.linalg.slogdet(lul) 

322 v = np.sum(np.log(uniq)) + ld 

323 

324 # tr((GG' + S)^{-1}C) 

325 # Using Sherman-Morrison-Woodbury 

326 w = np.sum(1 / uniq) 

327 b = np.dot(load.T, self.corr / uniq[:, None]) 

328 b = np.linalg.solve(lul, b) 

329 b = np.dot(loadu, b) 

330 w -= np.trace(b) 

331 

332 # Scaled log-likelihood 

333 return -(v + w) / (2*self.k_endog) 

334 

335 def score(self, par): 

336 """ 

337 Evaluate the score function (first derivative of loglike). 

338 

339 Parameters 

340 ---------- 

341 par : ndarray or tuple of 2 ndarray's 

342 The model parameters, either a packed representation of 

343 the model parameters or a 2-tuple containing a `k_endog x 

344 n_factor` matrix of factor loadings and a `k_endog` vector 

345 of uniquenesses. 

346 

347 Returns 

348 ------- 

349 score : ndarray 

350 """ 

351 

352 if type(par) is np.ndarray: 

353 uniq, load = self._unpack(par) 

354 else: 

355 load, uniq = par[0], par[1] 

356 

357 # Center term of SMW 

358 loadu = load / uniq[:, None] 

359 c = np.dot(load.T, loadu) 

360 c.flat[::c.shape[0]+1] += 1 

361 d = np.linalg.solve(c, load.T) 

362 

363 # Precompute these terms 

364 lud = np.dot(loadu, d) 

365 cu = (self.corr / uniq) / uniq[:, None] 

366 r = np.dot(cu, load) 

367 lul = np.dot(lud.T, load) 

368 luz = np.dot(cu, lul) 

369 

370 # First term 

371 du = 2*np.sqrt(uniq) * (1/uniq - (d * load.T).sum(0) / uniq**2) 

372 dl = 2*(loadu - np.dot(lud, loadu)) 

373 

374 # Second term 

375 h = np.dot(lud, cu) 

376 f = np.dot(h, lud.T) 

377 du -= 2*np.sqrt(uniq) * (np.diag(cu) - 2*np.diag(h) + np.diag(f)) 

378 dl -= 2*r 

379 dl += 2*np.dot(lud, r) 

380 dl += 2*luz 

381 dl -= 2*np.dot(lud, luz) 

382 

383 # Cannot use _pack because we are working with the square root 

384 # uniquenesses directly. 

385 return -np.concatenate((du, dl.T.flat)) / (2*self.k_endog) 

386 

387 # Maximum likelihood factor analysis. 

388 def _fit_ml(self, start, em_iter, opt_method, opt): 

389 """estimate Factor model using Maximum Likelihood 

390 """ 

391 

392 # Starting values 

393 if start is None: 

394 load, uniq = self._fit_ml_em(em_iter) 

395 start = self._pack(load, uniq) 

396 elif len(start) == 2: 

397 if len(start[1]) != start[0].shape[0]: 

398 msg = "Starting values have incompatible dimensions" 

399 raise ValueError(msg) 

400 start = self._pack(start[0], start[1]) 

401 else: 

402 raise ValueError("Invalid starting values") 

403 

404 def nloglike(par): 

405 return -self.loglike(par) 

406 

407 def nscore(par): 

408 return -self.score(par) 

409 

410 # Do the optimization 

411 if opt is None: 

412 opt = _opt_defaults 

413 r = minimize(nloglike, start, jac=nscore, method=opt_method, 

414 options=opt) 

415 if not r.success: 

416 warnings.warn("Fitting did not converge") 

417 par = r.x 

418 uniq, load = self._unpack(par) 

419 

420 if uniq.min() < 1e-10: 

421 warnings.warn("Some uniquenesses are nearly zero") 

422 

423 # Rotate solution to satisfy IC3 of Bai and Li 

424 load = self._rotate(load, uniq) 

425 

426 self.uniqueness = uniq 

427 self.communality = 1 - uniq 

428 self.loadings = load 

429 self.mle_retvals = r 

430 

431 return FactorResults(self) 

432 

433 def _fit_ml_em(self, iter): 

434 """estimate Factor model using EM algorithm 

435 """ 

436 # Starting values 

437 np.random.seed(3427) 

438 load = 0.1*np.random.normal(size=(self.k_endog, self.n_factor)) 

439 uniq = 0.5 * np.ones(self.k_endog) 

440 

441 for k in range(iter): 

442 

443 loadu = load / uniq[:, None] 

444 

445 f = np.dot(load.T, loadu) 

446 f.flat[::f.shape[0]+1] += 1 

447 

448 r = np.linalg.solve(f, loadu.T) 

449 q = np.dot(loadu.T, load) 

450 h = np.dot(r, load) 

451 

452 c = load - np.dot(load, h) 

453 c /= uniq[:, None] 

454 

455 g = np.dot(q, r) 

456 e = np.dot(g, self.corr) 

457 d = np.dot(loadu.T, self.corr) - e 

458 

459 a = np.dot(d, c) 

460 a -= np.dot(load.T, c) 

461 a.flat[::a.shape[0]+1] += 1 

462 

463 b = np.dot(self.corr, c) 

464 

465 load = np.linalg.solve(a, b.T).T 

466 uniq = np.diag(self.corr) - (load * d.T).sum(1) 

467 

468 return load, uniq 

469 

470 def _rotate(self, load, uniq): 

471 """rotate loadings for MLE 

472 """ 

473 # Rotations used in ML estimation. 

474 load, s, _ = np.linalg.svd(load, 0) 

475 load *= s 

476 

477 if self.nobs is None: 

478 nobs = 1 

479 else: 

480 nobs = self.nobs 

481 

482 cm = np.dot(load.T, load / uniq[:, None]) / nobs 

483 _, f = np.linalg.eig(cm) 

484 load = np.dot(load, f) 

485 return load 

486 

487 

488class FactorResults(object): 

489 """ 

490 Factor results class 

491 

492 For result summary, scree/loading plots and factor rotations 

493 

494 Parameters 

495 ---------- 

496 factor : Factor 

497 Fitted Factor class 

498 

499 Attributes 

500 ---------- 

501 uniqueness: ndarray 

502 The uniqueness (variance of uncorrelated errors unique to 

503 each variable) 

504 communality: ndarray 

505 1 - uniqueness 

506 loadings : ndarray 

507 Each column is the loading vector for one factor 

508 loadings_no_rot : ndarray 

509 Unrotated loadings, not available under maximum likelihood 

510 analysis. 

511 eigenvalues : ndarray 

512 The eigenvalues for a factor analysis obtained using 

513 principal components; not available under ML estimation. 

514 n_comp : int 

515 Number of components (factors) 

516 nbs : int 

517 Number of observations 

518 fa_method : str 

519 The method used to obtain the decomposition, either 'pa' for 

520 'principal axes' or 'ml' for maximum likelihood. 

521 df : int 

522 Degrees of freedom of the factor model. 

523 

524 Notes 

525 ----- 

526 Under ML estimation, the default rotation (used for `loadings`) is 

527 condition IC3 of Bai and Li (2012). Under this rotation, the 

528 factor scores are iid and standardized. If `G` is the canonical 

529 loadings and `U` is the vector of uniquenesses, then the 

530 covariance matrix implied by the factor analysis is `GG' + 

531 diag(U)`. 

532 

533 Status: experimental, Some refactoring will be necessary when new 

534 features are added. 

535 """ 

536 def __init__(self, factor): 

537 self.model = factor 

538 self.endog_names = factor.endog_names 

539 self.loadings_no_rot = factor.loadings 

540 if hasattr(factor, "eigenvals"): 

541 self.eigenvals = factor.eigenvals 

542 

543 self.communality = factor.communality 

544 self.uniqueness = factor.uniqueness 

545 self.rotation_method = None 

546 self.fa_method = factor.method 

547 self.n_comp = factor.loadings.shape[1] 

548 self.nobs = factor.nobs 

549 self._factor = factor 

550 if hasattr(factor, "mle_retvals"): 

551 self.mle_retvals = factor.mle_retvals 

552 

553 p, k = self.loadings_no_rot.shape 

554 self.df = ((p - k)**2 - (p + k)) // 2 

555 

556 # no rotation, overwritten in `rotate` 

557 self.loadings = factor.loadings 

558 self.rotation_matrix = np.eye(self.n_comp) 

559 

560 

561 def __str__(self): 

562 return self.summary().__str__() 

563 

564 def rotate(self, method): 

565 """ 

566 Apply rotation, inplace modification of this Results instance 

567 

568 Parameters 

569 ---------- 

570 method : str 

571 Rotation to be applied. Allowed methods are varimax, 

572 quartimax, biquartimax, equamax, oblimin, parsimax, 

573 parsimony, biquartimin, promax. 

574 

575 Returns 

576 ------- 

577 None : nothing returned, modifications are inplace 

578 

579 

580 Notes 

581 ----- 

582 Warning: 'varimax', 'quartimax' and 'oblimin' are verified against R or 

583 Stata. Some rotation methods such as promax do not produce the same 

584 results as the R or Stata default functions. 

585 

586 See Also 

587 -------- 

588 factor_rotation : subpackage that implements rotation methods 

589 """ 

590 self.rotation_method = method 

591 if method not in ['varimax', 'quartimax', 'biquartimax', 

592 'equamax', 'oblimin', 'parsimax', 'parsimony', 

593 'biquartimin', 'promax']: 

594 raise ValueError('Unknown rotation method %s' % (method)) 

595 

596 if method in ['varimax', 'quartimax', 'biquartimax', 'equamax', 

597 'parsimax', 'parsimony', 'biquartimin']: 

598 self.loadings, T = rotate_factors(self.loadings_no_rot, method) 

599 elif method == 'oblimin': 

600 self.loadings, T = rotate_factors(self.loadings_no_rot, 

601 'quartimin') 

602 elif method == 'promax': 

603 self.loadings, T = promax(self.loadings_no_rot) 

604 else: 

605 raise ValueError('rotation method not recognized') 

606 

607 self.rotation_matrix = T 

608 

609 def _corr_factors(self): 

610 """correlation of factors implied by rotation 

611 

612 If the rotation is oblique, then the factors are correlated. 

613 

614 currently not cached 

615 

616 Returns 

617 ------- 

618 corr_f : ndarray 

619 correlation matrix of rotated factors, assuming initial factors are 

620 orthogonal 

621 """ 

622 T = self.rotation_matrix 

623 corr_f = T.T.dot(T) 

624 return corr_f 

625 

626 def factor_score_params(self, method='bartlett'): 

627 """ 

628 Compute factor scoring coefficient matrix 

629 

630 The coefficient matrix is not cached. 

631 

632 Parameters 

633 ---------- 

634 method : 'bartlett' or 'regression' 

635 Method to use for factor scoring. 

636 'regression' can be abbreviated to `reg` 

637 

638 Returns 

639 ------- 

640 coeff_matrix : ndarray 

641 matrix s to compute factors f from a standardized endog ys. 

642 ``f = ys dot s`` 

643 

644 Notes 

645 ----- 

646 The `regression` method follows the Stata definition. 

647 Method bartlett and regression are verified against Stats. 

648 Two unofficial methods, 'ols' and 'gls', produce similar factor scores 

649 but are not verified. 

650 

651 See Also 

652 -------- 

653 statsmodels.multivariate.factor.FactorResults.factor_scoring 

654 """ 

655 L = self.loadings 

656 T = self.rotation_matrix.T 

657 #TODO: check row versus column convention for T 

658 uni = 1 - self.communality #self.uniqueness 

659 

660 if method == 'bartlett': 

661 s_mat = np.linalg.inv(L.T.dot(L/(uni[:,None]))).dot((L.T / uni)).T 

662 elif method.startswith('reg'): 

663 corr = self.model.corr 

664 corr_f = self._corr_factors() 

665 # if orthogonal then corr_f is just eye 

666 s_mat = corr_f.dot(L.T.dot(np.linalg.inv(corr))).T 

667 elif method == 'ols': 

668 # not verified 

669 corr = self.model.corr 

670 corr_f = self._corr_factors() 

671 s_mat = corr_f.dot(np.linalg.pinv(L)).T 

672 elif method == 'gls': 

673 # not verified 

674 #s_mat = np.linalg.inv(1*np.eye(L.shape[1]) + L.T.dot(L/(uni[:,None]))) 

675 corr = self.model.corr 

676 corr_f = self._corr_factors() 

677 s_mat = np.linalg.inv(np.linalg.inv(corr_f) + L.T.dot(L/(uni[:,None]))) 

678 s_mat = s_mat.dot(L.T / uni).T 

679 else: 

680 raise ValueError('method not available, use "bartlett ' + 

681 'or "regression"') 

682 return s_mat 

683 

684 def factor_scoring(self, endog=None, method='bartlett', transform=True): 

685 """ 

686 factor scoring: compute factors for endog 

687 

688 If endog was not provided when creating the factor class, then 

689 a standarized endog needs to be provided here. 

690 

691 Parameters 

692 ---------- 

693 method : 'bartlett' or 'regression' 

694 Method to use for factor scoring. 

695 'regression' can be abbreviated to `reg` 

696 transform : bool 

697 If transform is true and endog is provided, then it will be 

698 standardized using mean and scale of original data, which has to 

699 be available in this case. 

700 If transform is False, then a provided endog will be used unchanged. 

701 The original endog in the Factor class will 

702 always be standardized if endog is None, independently of `transform`. 

703 

704 Returns 

705 ------- 

706 factor_score : ndarray 

707 estimated factors using scoring matrix s and standarized endog ys 

708 ``f = ys dot s`` 

709 

710 Notes 

711 ----- 

712 Status: transform option is experimental and might change. 

713 

714 See Also 

715 -------- 

716 statsmodels.multivariate.factor.FactorResults.factor_score_params 

717 """ 

718 

719 if transform is False and endog is not None: 

720 # no transformation in this case 

721 endog = np.asarray(endog) 

722 else: 

723 # we need to standardize with the original mean and scale 

724 if self.model.endog is not None: 

725 m = self.model.endog.mean(0) 

726 s = self.model.endog.std(ddof=1, axis=0) 

727 if endog is None: 

728 endog = self.model.endog 

729 else: 

730 endog = np.asarray(endog) 

731 else: 

732 raise ValueError('If transform is True, then `endog` needs ' + 

733 'to be available in the Factor instance.') 

734 

735 endog = (endog - m) / s 

736 

737 s_mat = self.factor_score_params(method=method) 

738 factors = endog.dot(s_mat) 

739 return factors 

740 

741 def summary(self): 

742 """Summary""" 

743 summ = summary2.Summary() 

744 summ.add_title('Factor analysis results') 

745 loadings_no_rot = pd.DataFrame( 

746 self.loadings_no_rot, 

747 columns=["factor %d" % (i) 

748 for i in range(self.loadings_no_rot.shape[1])], 

749 index=self.endog_names 

750 ) 

751 if hasattr(self, "eigenvals"): 

752 # eigenvals not available for ML method 

753 eigenvals = pd.DataFrame( 

754 [self.eigenvals], columns=self.endog_names, index=['']) 

755 summ.add_dict({'': 'Eigenvalues'}) 

756 summ.add_df(eigenvals) 

757 communality = pd.DataFrame([self.communality], 

758 columns=self.endog_names, index=['']) 

759 summ.add_dict({'': ''}) 

760 summ.add_dict({'': 'Communality'}) 

761 summ.add_df(communality) 

762 summ.add_dict({'': ''}) 

763 summ.add_dict({'': 'Pre-rotated loadings'}) 

764 summ.add_df(loadings_no_rot) 

765 summ.add_dict({'': ''}) 

766 if self.rotation_method is not None: 

767 loadings = pd.DataFrame( 

768 self.loadings, 

769 columns=["factor %d" % (i) 

770 for i in range(self.loadings.shape[1])], 

771 index=self.endog_names 

772 ) 

773 summ.add_dict({'': '%s rotated loadings' % (self.rotation_method)}) 

774 summ.add_df(loadings) 

775 return summ 

776 

777 def get_loadings_frame(self, style='display', sort_=True, threshold=0.3, 

778 highlight_max=True, color_max='yellow', 

779 decimals=None): 

780 """get loadings matrix as DataFrame or pandas Styler 

781 

782 Parameters 

783 ---------- 

784 style : 'display' (default), 'raw' or 'strings' 

785 Style to use for display 

786 

787 * 'raw' returns just a DataFrame of the loadings matrix, no options are 

788 applied 

789 * 'display' add sorting and styling as defined by other keywords 

790 * 'strings' returns a DataFrame with string elements with optional sorting 

791 and suppressing small loading coefficients. 

792 

793 sort_ : bool 

794 If True, then the rows of the DataFrame is sorted by contribution of each 

795 factor. applies if style is either 'display' or 'strings' 

796 threshold : float 

797 If the threshold is larger than zero, then loading coefficients are 

798 either colored white (if style is 'display') or replace by empty 

799 string (if style is 'strings'). 

800 highlight_max : bool 

801 This add a background color to the largest coefficient in each row. 

802 color_max : html color 

803 default is 'yellow'. color for background of row maximum 

804 decimals : None or int 

805 If None, then pandas default precision applies. Otherwise values are 

806 rounded to the specified decimals. If style is 'display', then the 

807 underlying dataframe is not changed. If style is 'strings', then 

808 values are rounded before conversion to strings. 

809 

810 Returns 

811 ------- 

812 loadings : DataFrame or pandas Styler instance 

813 The return is a pandas Styler instance, if style is 'display' and 

814 at least one of highlight_max, threshold or decimals is applied. 

815 Otherwise, the returned loadings is a DataFrame. 

816 

817 Examples 

818 -------- 

819 >>> mod = Factor(df, 3, smc=True) 

820 >>> res = mod.fit() 

821 >>> res.get_loadings_frame(style='display', decimals=3, threshold=0.2) 

822 

823 To get a sorted DataFrame, all styling options need to be turned off: 

824 

825 >>> df_sorted = res.get_loadings_frame(style='display', 

826 ... highlight_max=False, decimals=None, threshold=0) 

827 

828 Options except for highlighting are available for plain test or Latex 

829 usage: 

830 

831 >>> lds = res_u.get_loadings_frame(style='strings', decimals=3, 

832 ... threshold=0.3) 

833 >>> print(lds.to_latex()) 

834 """ 

835 

836 loadings_df = pd.DataFrame( 

837 self.loadings, 

838 columns=["factor %d" % (i) 

839 for i in range(self.loadings.shape[1])], 

840 index=self.endog_names 

841 ) 

842 

843 if style not in ['raw', 'display', 'strings']: 

844 msg = "style has to be one of 'raw', 'display', 'strings'" 

845 raise ValueError(msg) 

846 

847 if style == 'raw': 

848 return loadings_df 

849 

850 # add sorting and some formatting 

851 if sort_ is True: 

852 loadings_df2 = loadings_df.copy() 

853 n_f = len(loadings_df2) 

854 high = np.abs(loadings_df2.values).argmax(1) 

855 loadings_df2['high'] = high 

856 loadings_df2['largest'] = np.abs(loadings_df.values[np.arange(n_f), high]) 

857 loadings_df2.sort_values(by=['high', 'largest'], ascending=[True, False], inplace=True) 

858 loadings_df = loadings_df2.drop(['high', 'largest'], axis=1) 

859 

860 if style == 'display': 

861 sty = None 

862 if threshold > 0: 

863 def color_white_small(val): 

864 """ 

865 Takes a scalar and returns a string with 

866 the css property `'color: white'` for small values, black otherwise. 

867 

868 takes threshold from outer scope 

869 """ 

870 color = 'white' if np.abs(val) < threshold else 'black' 

871 return 'color: %s' % color 

872 

873 sty = loadings_df.style.applymap(color_white_small) 

874 

875 if highlight_max is True: 

876 def highlight_max(s): 

877 ''' 

878 highlight the maximum in a Series yellow. 

879 ''' 

880 s = np.abs(s) 

881 is_max = s == s.max() 

882 return ['background-color: '+ color_max if v else '' for v in is_max] 

883 

884 if sty is None: 

885 sty = loadings_df.style 

886 

887 sty = sty.apply(highlight_max, axis=1) 

888 

889 if decimals is not None: 

890 if sty is None: 

891 sty = loadings_df.style 

892 

893 sty.format("{:.%sf}" % decimals) 

894 

895 if sty is None: 

896 return loadings_df 

897 else: 

898 return sty 

899 

900 if style == 'strings': 

901 ld = loadings_df 

902 if decimals is not None: 

903 ld = ld.round(decimals) 

904 ld = ld.astype(str) 

905 if threshold > 0: 

906 ld[loadings_df.abs() < threshold] = '' 

907 return ld 

908 

909 def plot_scree(self, ncomp=None): 

910 """ 

911 Plot of the ordered eigenvalues and variance explained for the loadings 

912 

913 Parameters 

914 ---------- 

915 ncomp : int, optional 

916 Number of loadings to include in the plot. If None, will 

917 included the same as the number of maximum possible loadings 

918 

919 Returns 

920 ------- 

921 Figure 

922 Handle to the figure. 

923 """ 

924 _import_mpl() 

925 from .plots import plot_scree 

926 return plot_scree(self.eigenvals, self.n_comp, ncomp) 

927 

928 def plot_loadings(self, loading_pairs=None, plot_prerotated=False): 

929 """ 

930 Plot factor loadings in 2-d plots 

931 

932 Parameters 

933 ---------- 

934 loading_pairs : None or a list of tuples 

935 Specify plots. Each tuple (i, j) represent one figure, i and j is 

936 the loading number for x-axis and y-axis, respectively. If `None`, 

937 all combinations of the loadings will be plotted. 

938 plot_prerotated : True or False 

939 If True, the loadings before rotation applied will be plotted. If 

940 False, rotated loadings will be plotted. 

941 

942 Returns 

943 ------- 

944 figs : a list of figure handles 

945 """ 

946 _import_mpl() 

947 from .plots import plot_loadings 

948 

949 if self.rotation_method is None: 

950 plot_prerotated = True 

951 loadings = self.loadings_no_rot if plot_prerotated else self.loadings 

952 if plot_prerotated: 

953 title = 'Prerotated Factor Pattern' 

954 else: 

955 title = '%s Rotated Factor Pattern' % (self.rotation_method) 

956 var_explained = self.eigenvals / self.n_comp * 100 

957 

958 return plot_loadings(loadings, loading_pairs=loading_pairs, 

959 title=title, row_names=self.endog_names, 

960 percent_variance=var_explained) 

961 

962 @cache_readonly 

963 def fitted_cov(self): 

964 """ 

965 Returns the fitted covariance matrix. 

966 """ 

967 

968 c = np.dot(self.loadings, self.loadings.T) 

969 c.flat[::c.shape[0]+1] += self.uniqueness 

970 return c 

971 

972 @cache_readonly 

973 def uniq_stderr(self, kurt=0): 

974 """ 

975 The standard errors of the uniquenesses. 

976 

977 Parameters 

978 ---------- 

979 kurt: float 

980 Excess kurtosis 

981 

982 Notes 

983 ----- 

984 If excess kurtosis is known, provide as `kurt`. Standard 

985 errors are only available if the model was fit using maximum 

986 likelihood. If `endog` is not provided, `nobs` must be 

987 provided to obtain standard errors. 

988 

989 These are asymptotic standard errors. See Bai and Li (2012) 

990 for conditions under which the standard errors are valid. 

991 

992 The standard errors are only applicable to the original, 

993 unrotated maximum likelihood solution. 

994 """ 

995 

996 if self.fa_method.lower() != "ml": 

997 msg = "Standard errors only available under ML estimation" 

998 raise ValueError(msg) 

999 

1000 if self.nobs is None: 

1001 msg = "nobs is required to obtain standard errors." 

1002 raise ValueError(msg) 

1003 

1004 v = self.uniqueness**2 * (2 + kurt) 

1005 return np.sqrt(v / self.nobs) 

1006 

1007 @cache_readonly 

1008 def load_stderr(self): 

1009 """ 

1010 The standard errors of the loadings. 

1011 

1012 Standard errors are only available if the model was fit using 

1013 maximum likelihood. If `endog` is not provided, `nobs` must be 

1014 provided to obtain standard errors. 

1015 

1016 These are asymptotic standard errors. See Bai and Li (2012) 

1017 for conditions under which the standard errors are valid. 

1018 

1019 The standard errors are only applicable to the original, 

1020 unrotated maximum likelihood solution. 

1021 """ 

1022 

1023 if self.fa_method.lower() != "ml": 

1024 msg = "Standard errors only available under ML estimation" 

1025 raise ValueError(msg) 

1026 

1027 if self.nobs is None: 

1028 msg = "nobs is required to obtain standard errors." 

1029 raise ValueError(msg) 

1030 

1031 v = np.outer(self.uniqueness, np.ones(self.loadings.shape[1])) 

1032 return np.sqrt(v / self.nobs)