Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/multivariate/cancorr.py : 20%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
3"""Canonical correlation analysis
5author: Yichuan Liu
6"""
7import numpy as np
8from numpy.linalg import svd
9import scipy
10import pandas as pd
12from statsmodels.base.model import Model
13from statsmodels.iolib import summary2
14from .multivariate_ols import multivariate_stats
17class CanCorr(Model):
18 """
19 Canonical correlation analysis using singular value decomposition
21 For matrices exog=x and endog=y, find projections x_cancoef and y_cancoef
22 such that:
24 x1 = x * x_cancoef, x1' * x1 is identity matrix
25 y1 = y * y_cancoef, y1' * y1 is identity matrix
27 and the correlation between x1 and y1 is maximized.
29 Attributes
30 ----------
31 endog : ndarray
32 See Parameters.
33 exog : ndarray
34 See Parameters.
35 cancorr : ndarray
36 The canonical correlation values
37 y_cancoeff: ndarray
38 The canonical coefficients for endog
39 x_cancoeff: ndarray
40 The canonical coefficients for exog
42 References
43 ----------
44 .. [*] http://numerical.recipes/whp/notes/CanonCorrBySVD.pdf
45 .. [*] http://www.csun.edu/~ata20315/psy524/docs/Psy524%20Lecture%208%20CC.pdf
46 .. [*] http://www.mathematica-journal.com/2014/06/canonical-correlation-analysis/
47 """ # noqa:E501
48 def __init__(self, endog, exog, tolerance=1e-8, missing='none', hasconst=None, **kwargs):
49 super(CanCorr, self).__init__(endog, exog, missing=missing,
50 hasconst=hasconst, **kwargs)
51 self._fit(tolerance)
53 def _fit(self, tolerance=1e-8):
54 """Fit the model
56 A ValueError is raised if there are singular values smaller than the
57 tolerance. The treatment of singular arrays might change in future.
59 Parameters
60 ----------
61 tolerance : float
62 eigenvalue tolerance, values smaller than which is considered 0
63 """
64 nobs, k_yvar = self.endog.shape
65 nobs, k_xvar = self.exog.shape
66 k = np.min([k_yvar, k_xvar])
68 x = np.array(self.exog)
69 x = x - x.mean(0)
70 y = np.array(self.endog)
71 y = y - y.mean(0)
73 ux, sx, vx = svd(x, 0)
74 # vx_ds = vx.T divided by sx
75 vx_ds = vx.T
76 mask = sx > tolerance
77 if mask.sum() < len(mask):
78 raise ValueError('exog is collinear.')
79 vx_ds[:, mask] /= sx[mask]
80 uy, sy, vy = svd(y, 0)
81 # vy_ds = vy.T divided by sy
82 vy_ds = vy.T
83 mask = sy > tolerance
84 if mask.sum() < len(mask):
85 raise ValueError('endog is collinear.')
86 vy_ds[:, mask] /= sy[mask]
87 u, s, v = svd(ux.T.dot(uy), 0)
89 # Correct any roundoff
90 self.cancorr = np.array([max(0, min(s[i], 1)) for i in range(len(s))])
92 self.x_cancoef = vx_ds.dot(u[:, :k])
93 self.y_cancoef = vy_ds.dot(v.T[:, :k])
95 def corr_test(self):
96 """Approximate F test
97 Perform multivariate statistical tests of the hypothesis that
98 there is no canonical correlation between endog and exog.
99 For each canonical correlation, testing its significance based on
100 Wilks' lambda.
102 Returns
103 -------
104 CanCorrTestResults instance
105 """
106 nobs, k_yvar = self.endog.shape
107 nobs, k_xvar = self.exog.shape
108 eigenvals = np.power(self.cancorr, 2)
109 stats = pd.DataFrame(columns=['Canonical Correlation', "Wilks' lambda",
110 'Num DF','Den DF', 'F Value','Pr > F'],
111 index=list(range(len(eigenvals) - 1, -1, -1)))
112 prod = 1
113 for i in range(len(eigenvals) - 1, -1, -1):
114 prod *= 1 - eigenvals[i]
115 p = k_yvar - i
116 q = k_xvar - i
117 r = (nobs - k_yvar - 1) - (p - q + 1) / 2
118 u = (p * q - 2) / 4
119 df1 = p * q
120 if p ** 2 + q ** 2 - 5 > 0:
121 t = np.sqrt(((p * q) ** 2 - 4) / (p ** 2 + q ** 2 - 5))
122 else:
123 t = 1
124 df2 = r * t - 2 * u
125 lmd = np.power(prod, 1 / t)
126 F = (1 - lmd) / lmd * df2 / df1
127 stats.loc[i, 'Canonical Correlation'] = self.cancorr[i]
128 stats.loc[i, "Wilks' lambda"] = prod
129 stats.loc[i, 'Num DF'] = df1
130 stats.loc[i, 'Den DF'] = df2
131 stats.loc[i, 'F Value'] = F
132 pval = scipy.stats.f.sf(F, df1, df2)
133 stats.loc[i, 'Pr > F'] = pval
134 '''
135 # Wilk's Chi square test of each canonical correlation
136 df = (p - i + 1) * (q - i + 1)
137 chi2 = a * np.log(prod)
138 pval = stats.chi2.sf(chi2, df)
139 stats.loc[i, 'Canonical correlation'] = self.cancorr[i]
140 stats.loc[i, 'Chi-square'] = chi2
141 stats.loc[i, 'DF'] = df
142 stats.loc[i, 'Pr > ChiSq'] = pval
143 '''
144 ind = stats.index.values[::-1]
145 stats = stats.loc[ind, :]
147 # Multivariate tests (remember x has mean removed)
148 stats_mv = multivariate_stats(eigenvals,
149 k_yvar, k_xvar, nobs - k_xvar - 1)
150 return CanCorrTestResults(stats, stats_mv)
153class CanCorrTestResults(object):
154 """
155 Canonical correlation results class
157 Attributes
158 ----------
159 stats : DataFrame
160 Contain statistical tests results for each canonical correlation
161 stats_mv : DataFrame
162 Contain the multivariate statistical tests results
163 """
164 def __init__(self, stats, stats_mv):
165 self.stats = stats
166 self.stats_mv = stats_mv
168 def __str__(self):
169 return self.summary().__str__()
171 def summary(self):
172 summ = summary2.Summary()
173 summ.add_title('Cancorr results')
174 summ.add_df(self.stats)
175 summ.add_dict({'': ''})
176 summ.add_dict({'Multivariate Statistics and F Approximations': ''})
177 summ.add_df(self.stats_mv)
178 return summ