1"""
2Empirical likelihood inference on descriptive statistics
4This module conducts hypothesis tests and constructs confidence
5intervals for the mean, variance, skewness, kurtosis and correlation.
7If matplotlib is installed, this module can also generate multivariate
8confidence region plots as well as mean-variance contour plots.
10See _OptFuncts docstring for technical details and optimization variable
11definitions.
13General References:
14------------------
15Owen, A. (2001). "Empirical Likelihood." Chapman and Hall
17"""
import numpy as np
from scipy import optimize
from scipy.stats import chi2, skew, kurtosis
from statsmodels.base.optimizer import _fit_newton
import itertools
from statsmodels.graphics import utils


def DescStat(endog):
    """
    Returns an instance to conduct inference on descriptive statistics
    via empirical likelihood. See DescStatUV and DescStatMV for more
    information.

    Parameters
    ----------
    endog : ndarray
        Array of data

    Returns
    -------
    DescStatUV or DescStatMV instance
        If endog has one column (k=1), a univariate instance, DescStatUV,
        is returned. If endog has more than one column (k>1), a
        multivariate instance, DescStatMV, is returned.
40 """
41 if endog.ndim == 1:
42 endog = endog.reshape(len(endog), 1)
43 if endog.shape[1] == 1:
44 return DescStatUV(endog)
45 if endog.shape[1] > 1:
46 return DescStatMV(endog)


class _OptFuncts(object):
    """
    A class that holds functions that are optimized/solved.

    The general setup of the class is simple. Any method that starts with
    _opt_ creates a vector of estimating equations named est_vect such that
    np.dot(p, est_vect) = 0, where p is the weight on each
    observation as a 1 x n array and est_vect is n x k. Then _modif_newton is
    called to determine the optimal p by solving for the Lagrange multiplier
    (eta) in the profile likelihood maximization problem. In the presence
    of nuisance parameters, the _opt_ functions are optimized over to profile
    out the nuisance parameters.

    Any method starting with _ci_limits calculates the log likelihood
    ratio for a specific value of a parameter and then subtracts a
    pre-specified critical value. This is solved so that llr - crit = 0.
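
    Examples
    --------
    An illustrative check of the weight constraint for the mean case.
    This class is internal; the check below goes through the public
    DescStatUV.test_mean and is only meant to demonstrate
    np.dot(p, est_vect) = 0:

    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> x = np.random.standard_normal(50)
    >>> desc = sm.emplike.DescStat(x)
    >>> llr, pval, p = desc.test_mean(0, return_weights=True)
    >>> resid = np.dot(p, x - 0)  # ~0: the estimating equation constraint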
65 """

    def __init__(self, endog):
        pass

    def _log_star(self, eta, est_vect, weights, nobs):
        """
        Transforms the log of observation probabilities in terms of the
        Lagrange multiplier to the log 'star' of the probabilities.

        Parameters
        ----------
        eta : float
            Lagrange multiplier

        est_vect : ndarray (n,k)
            Estimating equations vector

        weights : ndarray (n,1)
            Observation weights

        nobs : int
            Number of observations

        Returns
        -------
        data_star : ndarray
            The weighted logstar of the estimating equations

        Notes
        -----
        This function is only a placeholder for _fit_newton.
        The function value is not used in optimization and the optimal value
        is disregarded when computing the log likelihood ratio.
        """
        data_star = np.log(weights) + (np.sum(weights) +
                                       np.dot(est_vect, eta))
        idx = data_star < 1. / nobs
        not_idx = ~idx
        nx = nobs * data_star[idx]
        data_star[idx] = np.log(1. / nobs) - 1.5 + nx * (2. - nx / 2)
        data_star[not_idx] = np.log(data_star[not_idx])
        return data_star

    def _hess(self, eta, est_vect, weights, nobs):
        """
        Calculates the hessian of a weighted empirical likelihood
        problem.

        Parameters
        ----------
        eta : ndarray, (1,m)
            Lagrange multiplier in the profile likelihood maximization

        est_vect : ndarray (n,k)
            Estimating equations vector

        weights : 1darray
            Observation weights

        nobs : int
            Number of observations

        Returns
        -------
        hess : m x m array
            Weighted hessian used in _modif_newton
        """
        #eta = np.squeeze(eta)
        data_star_doub_prime = np.sum(weights) + np.dot(est_vect, eta)
        idx = data_star_doub_prime < 1. / nobs
        not_idx = ~idx
        data_star_doub_prime[idx] = - nobs ** 2
        data_star_doub_prime[not_idx] = - (data_star_doub_prime[not_idx]) ** -2
        wtd_dsdp = weights * data_star_doub_prime
        return np.dot(est_vect.T, wtd_dsdp[:, None] * est_vect)

    def _grad(self, eta, est_vect, weights, nobs):
        """
        Calculates the gradient of a weighted empirical likelihood
        problem.

        Parameters
        ----------
        eta : ndarray, (1,m)
            Lagrange multiplier in the profile likelihood maximization

        est_vect : ndarray, (n,k)
            Estimating equations vector

        weights : 1darray
            Observation weights

        nobs : int
            Number of observations

        Returns
        -------
        gradient : ndarray (m,1)
            The gradient used in _modif_newton
        """
        #eta = np.squeeze(eta)
        data_star_prime = np.sum(weights) + np.dot(est_vect, eta)
        idx = data_star_prime < 1. / nobs
        not_idx = ~idx
        data_star_prime[idx] = nobs * (2 - nobs * data_star_prime[idx])
        data_star_prime[not_idx] = 1. / data_star_prime[not_idx]
        return np.dot(weights * data_star_prime, est_vect)

    def _modif_newton(self, eta, est_vect, weights):
        """
        Modified Newton's method for maximizing the log 'star' equation. This
        function calls _fit_newton to find the optimal values of eta.

        Parameters
        ----------
        eta : ndarray, (1,m)
            Lagrange multiplier in the profile likelihood maximization

        est_vect : ndarray, (n,k)
            Estimating equations vector

        weights : 1darray
            Observation weights

        Returns
        -------
        params : 1xm array
            Lagrange multiplier that maximizes the log-likelihood
        """
        nobs = len(est_vect)
        f = lambda x0: - np.sum(self._log_star(x0, est_vect, weights, nobs))
        grad = lambda x0: - self._grad(x0, est_vect, weights, nobs)
        hess = lambda x0: - self._hess(x0, est_vect, weights, nobs)
        kwds = {'tol': 1e-8}
        eta = eta.squeeze()
        res = _fit_newton(f, grad, eta, (), kwds, hess=hess, maxiter=50,
                          disp=0)
        return res[0]

    def _find_eta(self, eta):
        """
        Finding the root of sum((xi - mu0) / (1 + eta * (xi - mu0)))
        solves for eta when computing the empirical likelihood ratio
        for the univariate mean.

        Parameters
        ----------
        eta : float
            Lagrange multiplier in the empirical likelihood maximization

        Returns
        -------
        eq_val : float
            Value of the estimating equation at eta; the root of this
            function gives the optimal eta
        """
        return np.sum((self.endog - self.mu0) /
                      (1. + eta * (self.endog - self.mu0)))

    def _ci_limits_mu(self, mu):
        """
        Calculates the difference between the log likelihood ratio of mu
        and a pre-specified critical value.

        Parameters
        ----------
        mu : float
            Hypothesized value of the mean.

        Returns
        -------
        diff : float
            The difference between the log likelihood ratio at mu and a
            pre-specified critical value.
        """
        return self.test_mean(mu)[0] - self.r0

    def _find_gamma(self, gamma):
        """
        Finds gamma that satisfies
        -2 * sum(log(n * w(gamma))) - r0 = 0.

        Used for confidence intervals for the mean.

        Parameters
        ----------
        gamma : float
            Lagrange multiplier when computing confidence interval

        Returns
        -------
        diff : float
            The difference between the log-likelihood when the Lagrange
            multiplier is gamma and a pre-specified value
        """
        denom = np.sum((self.endog - gamma) ** -1)
        new_weights = (self.endog - gamma) ** -1 / denom
        return -2 * np.sum(np.log(self.nobs * new_weights)) - self.r0

    def _opt_var(self, nuisance_mu, pval=False):
        """
        This is the function to be optimized over a nuisance mean parameter
        to determine the likelihood ratio for the variance.

        Parameters
        ----------
        nuisance_mu : float
            Value of a nuisance mean parameter

        pval : bool
            If True, returns the p-value instead of the likelihood ratio.
            Used for contour plotting. Default is False

        Returns
        -------
        llr : float
            Log likelihood of a pre-specified variance holding the nuisance
            parameter constant
        """
        endog = self.endog
        nobs = self.nobs
        sig_data = ((endog - nuisance_mu) ** 2
                    - self.sig2_0)
        mu_data = (endog - nuisance_mu)
        est_vect = np.column_stack((mu_data, sig_data))
        eta_star = self._modif_newton(np.array([1. / nobs,
                                                1. / nobs]), est_vect,
                                      np.ones(nobs) * (1. / nobs))

        denom = 1 + np.dot(eta_star, est_vect.T)
        self.new_weights = 1. / nobs * 1. / denom
        llr = np.sum(np.log(nobs * self.new_weights))
        if pval:  # Used for contour plotting
            return chi2.sf(-2 * llr, 1)
        return -2 * llr

    def _ci_limits_var(self, var):
        """
        Used to determine the confidence intervals for the variance.
        It calls test_var and, when called by an optimizer,
        finds the value of sig2_0 where the log likelihood ratio equals the
        pre-specified critical value chi2.ppf(1 - significance level, 1).

        Parameters
        ----------
        var : float
            Hypothesized value of the variance

        Returns
        -------
        diff : float
            The difference between the log likelihood ratio at var and a
            pre-specified value.
        """
        return self.test_var(var)[0] - self.r0

    def _opt_skew(self, nuis_params):
        """
        Called by test_skew. This function is optimized over
        nuisance parameters mu and sigma.

        Parameters
        ----------
        nuis_params : 1darray
            An array with a nuisance mean and variance parameter

        Returns
        -------
        llr : float
            The log likelihood ratio of a pre-specified skewness holding
            the nuisance parameters constant.
        """
        endog = self.endog
        nobs = self.nobs
        mu_data = endog - nuis_params[0]
        sig_data = ((endog - nuis_params[0]) ** 2) - nuis_params[1]
        skew_data = ((((endog - nuis_params[0]) ** 3) /
                      (nuis_params[1] ** 1.5))) - self.skew0
        est_vect = np.column_stack((mu_data, sig_data, skew_data))
        eta_star = self._modif_newton(np.array([1. / nobs,
                                                1. / nobs,
                                                1. / nobs]), est_vect,
                                      np.ones(nobs) * (1. / nobs))
        denom = 1. + np.dot(eta_star, est_vect.T)
        self.new_weights = 1. / nobs * 1. / denom
        llr = np.sum(np.log(nobs * self.new_weights))
        return -2 * llr

    def _opt_kurt(self, nuis_params):
        """
        Called by test_kurt. This function is optimized over
        nuisance parameters mu and sigma.

        Parameters
        ----------
        nuis_params : 1darray
            An array with a nuisance mean and variance parameter

        Returns
        -------
        llr : float
            The log likelihood ratio of a pre-specified kurtosis holding the
            nuisance parameters constant
        """
        endog = self.endog
        nobs = self.nobs
        mu_data = endog - nuis_params[0]
        sig_data = ((endog - nuis_params[0]) ** 2) - nuis_params[1]
        kurt_data = (((((endog - nuis_params[0]) ** 4) /
                       (nuis_params[1] ** 2))) - 3) - self.kurt0
        est_vect = np.column_stack((mu_data, sig_data, kurt_data))
        eta_star = self._modif_newton(np.array([1. / nobs,
                                                1. / nobs,
                                                1. / nobs]), est_vect,
                                      np.ones(nobs) * (1. / nobs))
        denom = 1 + np.dot(eta_star, est_vect.T)
        self.new_weights = 1. / nobs * 1. / denom
        llr = np.sum(np.log(nobs * self.new_weights))
        return -2 * llr

    def _opt_skew_kurt(self, nuis_params):
        """
        Called by test_joint_skew_kurt. This function is optimized over
        nuisance parameters mu and sigma.

        Parameters
        ----------
        nuis_params : 1darray
            An array with a nuisance mean and variance parameter

        Returns
        -------
        llr : float
            The log likelihood ratio of a pre-specified skewness and
            kurtosis holding the nuisance parameters constant.
        """
        endog = self.endog
        nobs = self.nobs
        mu_data = endog - nuis_params[0]
        sig_data = ((endog - nuis_params[0]) ** 2) - nuis_params[1]
        skew_data = ((((endog - nuis_params[0]) ** 3) /
                      (nuis_params[1] ** 1.5))) - self.skew0
        kurt_data = (((((endog - nuis_params[0]) ** 4) /
                       (nuis_params[1] ** 2))) - 3) - self.kurt0
        est_vect = np.column_stack((mu_data, sig_data, skew_data, kurt_data))
        eta_star = self._modif_newton(np.array([1. / nobs,
                                                1. / nobs,
                                                1. / nobs,
                                                1. / nobs]), est_vect,
                                      np.ones(nobs) * (1. / nobs))
        denom = 1. + np.dot(eta_star, est_vect.T)
        self.new_weights = 1. / nobs * 1. / denom
        llr = np.sum(np.log(nobs * self.new_weights))
        return -2 * llr

    def _ci_limits_skew(self, skew):
        """
        Parameters
        ----------
        skew : float
            Hypothesized value of skewness

        Returns
        -------
        diff : float
            The difference between the log likelihood ratio at skew and a
            pre-specified value.
        """
        return self.test_skew(skew)[0] - self.r0

    def _ci_limits_kurt(self, kurt):
        """
        Parameters
        ----------
        kurt : float
            Hypothesized value of kurtosis

        Returns
        -------
        diff : float
            The difference between the log likelihood ratio at kurt and a
            pre-specified value.
        """
        return self.test_kurt(kurt)[0] - self.r0

    def _opt_correl(self, nuis_params, corr0, endog, nobs, x0, weights0):
        """
        Parameters
        ----------
        nuis_params : 1darray
            Array containing two nuisance means and two nuisance variances

        corr0 : float
            Hypothesized value of the correlation coefficient

        endog : ndarray
            Data to be analyzed

        nobs : int
            Number of observations

        x0 : ndarray
            Starting value for the Lagrange multiplier

        weights0 : ndarray
            Starting observation weights

        Returns
        -------
        llr : float
            The log-likelihood of the correlation coefficient holding nuisance
            parameters constant
        """
        mu1_data, mu2_data = (endog - nuis_params[::2]).T
        sig1_data = mu1_data ** 2 - nuis_params[1]
        sig2_data = mu2_data ** 2 - nuis_params[3]
        correl_data = ((mu1_data * mu2_data) - corr0 *
                       (nuis_params[1] * nuis_params[3]) ** .5)
        est_vect = np.column_stack((mu1_data, sig1_data,
                                    mu2_data, sig2_data, correl_data))
        eta_star = self._modif_newton(x0, est_vect, weights0)
        denom = 1. + np.dot(est_vect, eta_star)
        self.new_weights = 1. / nobs * 1. / denom
        llr = np.sum(np.log(nobs * self.new_weights))
        return -2 * llr

    def _ci_limits_corr(self, corr):
        """
        Parameters
        ----------
        corr : float
            Hypothesized value of the correlation coefficient

        Returns
        -------
        diff : float
            The difference between the log likelihood ratio at corr and a
            pre-specified value.
        """
        return self.test_corr(corr)[0] - self.r0


class DescStatUV(_OptFuncts):
    """
    A class to compute confidence intervals and hypothesis tests involving
    the mean, variance, kurtosis and skewness of a univariate random
    variable.

    Parameters
    ----------
    endog : 1darray
        Data to be analyzed

    Attributes
    ----------
    endog : 1darray
        Data to be analyzed

    nobs : float
        Number of observations
    """

    def __init__(self, endog):
        self.endog = np.squeeze(endog)
        self.nobs = endog.shape[0]

    def test_mean(self, mu0, return_weights=False):
        """
        Returns -2 x log-likelihood ratio, p-value and weights
        for a hypothesis test of the mean.

        Parameters
        ----------
        mu0 : float
            Mean value to be tested

        return_weights : bool
            If return_weights is True, the function returns
            the weights of the observations under the null hypothesis.
            Default is False

        Returns
        -------
        test_results : tuple
            The log-likelihood ratio and p-value of mu0
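
        Examples
        --------
        An illustrative run on simulated data (the exact statistics
        depend on the random draw):

        >>> import numpy as np
        >>> import statsmodels.api as sm
        >>> random_numbers = np.random.standard_normal(100)
        >>> el_analysis = sm.emplike.DescStat(random_numbers)
        >>> llr, pval = el_analysis.test_mean(0)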
508 """
509 self.mu0 = mu0
510 endog = self.endog
511 nobs = self.nobs
512 eta_min = (1. - (1. / nobs)) / (self.mu0 - max(endog))
513 eta_max = (1. - (1. / nobs)) / (self.mu0 - min(endog))
514 eta_star = optimize.brentq(self._find_eta, eta_min, eta_max)
515 new_weights = (1. / nobs) * 1. / (1. + eta_star * (endog - self.mu0))
516 llr = -2 * np.sum(np.log(nobs * new_weights))
517 if return_weights:
518 return llr, chi2.sf(llr, 1), new_weights
519 else:
520 return llr, chi2.sf(llr, 1)

    def ci_mean(self, sig=.05, method='gamma', epsilon=10 ** -8,
                gamma_low=-10 ** 10, gamma_high=10 ** 10):
        """
        Returns the confidence interval for the mean.

        Parameters
        ----------
        sig : float
            Significance level. Default is .05

        method : str
            Root finding method. Can be 'nested-brent' or
            'gamma'. Default is 'gamma'

            'gamma' tries to solve for the gamma parameter in the
            Lagrangian (see Owen pg 22) and then determines the weights.

            'nested-brent' uses Brent's method to find the confidence
            intervals but must maximize the likelihood ratio on every
            iteration.

            'gamma' is generally much faster. If the optimization does not
            converge, try expanding the gamma_high and gamma_low
            variables.

        gamma_low : float
            Lower bound for gamma when finding the lower limit.
            If the function raises the error 'f(a) and f(b) must have
            different signs', consider lowering gamma_low.

        gamma_high : float
            Upper bound for gamma when finding the upper limit.
            If the function raises the error 'f(a) and f(b) must have
            different signs', consider raising gamma_high.

        epsilon : float
            When using 'nested-brent', amount to decrease (increase)
            from the maximum (minimum) of the data when
            starting the search. This is to protect against the
            likelihood ratio being zero at the maximum (minimum)
            value of the data. If the data is very small in absolute value
            (< 1e-6), consider shrinking epsilon.

            When using 'gamma', amount to decrease (increase) the
            minimum (maximum) by to start the search for gamma.
            If the function raises the error 'f(a) and f(b) must have
            different signs', consider lowering epsilon.

        Returns
        -------
        Interval : tuple
            Confidence interval for the mean
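
        Examples
        --------
        A short illustration (interval endpoints vary with the draw):

        >>> import numpy as np
        >>> import statsmodels.api as sm
        >>> random_numbers = np.random.standard_normal(100)
        >>> el_analysis = sm.emplike.DescStat(random_numbers)
        >>> lower, upper = el_analysis.ci_mean()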
574 """
575 endog = self.endog
576 sig = 1 - sig
577 if method == 'nested-brent':
578 self.r0 = chi2.ppf(sig, 1)
579 middle = np.mean(endog)
580 epsilon_u = (max(endog) - np.mean(endog)) * epsilon
581 epsilon_l = (np.mean(endog) - min(endog)) * epsilon
582 ulim = optimize.brentq(self._ci_limits_mu, middle,
583 max(endog) - epsilon_u)
584 llim = optimize.brentq(self._ci_limits_mu, middle,
585 min(endog) + epsilon_l)
586 return llim, ulim
588 if method == 'gamma':
589 self.r0 = chi2.ppf(sig, 1)
590 gamma_star_l = optimize.brentq(self._find_gamma, gamma_low,
591 min(endog) - epsilon)
592 gamma_star_u = optimize.brentq(self._find_gamma, \
593 max(endog) + epsilon, gamma_high)
594 weights_low = ((endog - gamma_star_l) ** -1) / \
595 np.sum((endog - gamma_star_l) ** -1)
596 weights_high = ((endog - gamma_star_u) ** -1) / \
597 np.sum((endog - gamma_star_u) ** -1)
598 mu_low = np.sum(weights_low * endog)
599 mu_high = np.sum(weights_high * endog)
600 return mu_low, mu_high

    def test_var(self, sig2_0, return_weights=False):
        """
        Returns -2 x log-likelihood ratio and the p-value for the
        hypothesized variance.

        Parameters
        ----------
        sig2_0 : float
            Hypothesized variance to be tested

        return_weights : bool
            If True, returns the weights that maximize the
            likelihood of observing sig2_0. Default is False

        Returns
        -------
        test_results : tuple
            The log-likelihood ratio and the p-value of sig2_0

        Examples
        --------
        >>> import numpy as np
        >>> import statsmodels.api as sm
        >>> random_numbers = np.random.standard_normal(1000) * 100
        >>> el_analysis = sm.emplike.DescStat(random_numbers)
        >>> hyp_test = el_analysis.test_var(9500)
        """
        self.sig2_0 = sig2_0
        mu_max = max(self.endog)
        mu_min = min(self.endog)
        llr = optimize.fminbound(self._opt_var, mu_min, mu_max,
                                 full_output=1)[1]
        p_val = chi2.sf(llr, 1)
        if return_weights:
            return llr, p_val, self.new_weights.T
        else:
            return llr, p_val

    def ci_var(self, lower_bound=None, upper_bound=None, sig=.05):
        """
        Returns the confidence interval for the variance.

        Parameters
        ----------
        lower_bound : float
            The minimum value the lower confidence interval can
            take. The p-value from test_var(lower_bound) must be lower
            than 1 - significance level. Default is the .99 confidence
            limit assuming normality

        upper_bound : float
            The maximum value the upper confidence interval
            can take. The p-value from test_var(upper_bound) must be lower
            than 1 - significance level. Default is the .99 confidence
            limit assuming normality

        sig : float
            The significance level. Default is .05

        Returns
        -------
        Interval : tuple
            Confidence interval for the variance

        Examples
        --------
        >>> import numpy as np
        >>> import statsmodels.api as sm
        >>> random_numbers = np.random.standard_normal(100)
        >>> el_analysis = sm.emplike.DescStat(random_numbers)
        >>> el_analysis.ci_var()
        (0.7539322567470305, 1.229998852496268)
        >>> el_analysis.ci_var(.5, 2)
        (0.7539322567469926, 1.2299988524962664)

        Notes
        -----
        If the function raises the error 'f(a) and f(b) must have
        different signs', consider lowering lower_bound and raising
        upper_bound.
        """
        endog = self.endog
        if upper_bound is None:
            upper_bound = ((self.nobs - 1) * endog.var()) / \
                (chi2.ppf(.0001, self.nobs - 1))
        if lower_bound is None:
            lower_bound = ((self.nobs - 1) * endog.var()) / \
                (chi2.ppf(.9999, self.nobs - 1))
        self.r0 = chi2.ppf(1 - sig, 1)
        llim = optimize.brentq(self._ci_limits_var, lower_bound, endog.var())
        ulim = optimize.brentq(self._ci_limits_var, endog.var(), upper_bound)
        return llim, ulim

    def plot_contour(self, mu_low, mu_high, var_low, var_high, mu_step,
                     var_step,
                     levs=[.2, .1, .05, .01, .001]):
        """
        Returns a plot of the confidence region for a univariate
        mean and variance.

        Parameters
        ----------
        mu_low : float
            Lowest value of the mean to plot

        mu_high : float
            Highest value of the mean to plot

        var_low : float
            Lowest value of the variance to plot

        var_high : float
            Highest value of the variance to plot

        mu_step : float
            Increments to evaluate the mean

        var_step : float
            Increments to evaluate the variance

        levs : list
            Significance levels at which the contour lines are drawn.
            Default is [.2, .1, .05, .01, .001]

        Returns
        -------
        Figure
            The contour plot
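
        Examples
        --------
        A sketch of typical usage (requires matplotlib; the plotting
        ranges here are only illustrative):

        >>> import numpy as np
        >>> import statsmodels.api as sm
        >>> random_numbers = np.random.standard_normal(100)
        >>> el_analysis = sm.emplike.DescStat(random_numbers)
        >>> fig = el_analysis.plot_contour(-.5, .5, .5, 2, .05, .05)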
730 """
731 fig, ax = utils.create_mpl_ax()
732 ax.set_ylabel('Variance')
733 ax.set_xlabel('Mean')
734 mu_vect = list(np.arange(mu_low, mu_high, mu_step))
735 var_vect = list(np.arange(var_low, var_high, var_step))
736 z = []
737 for sig0 in var_vect:
738 self.sig2_0 = sig0
739 for mu0 in mu_vect:
740 z.append(self._opt_var(mu0, pval=True))
741 z = np.asarray(z).reshape(len(var_vect), len(mu_vect))
742 ax.contour(mu_vect, var_vect, z, levels=levs)
743 return fig

    def test_skew(self, skew0, return_weights=False):
        """
        Returns -2 x log-likelihood and the p-value for the hypothesized
        skewness.

        Parameters
        ----------
        skew0 : float
            Skewness value to be tested

        return_weights : bool
            If True, the function also returns the weights that
            maximize the likelihood ratio. Default is False.

        Returns
        -------
        test_results : tuple
            The log-likelihood ratio and p-value of skew0
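
        Examples
        --------
        Testing for zero skewness on simulated data (illustrative only):

        >>> import numpy as np
        >>> import statsmodels.api as sm
        >>> random_numbers = np.random.standard_normal(100)
        >>> el_analysis = sm.emplike.DescStat(random_numbers)
        >>> llr, pval = el_analysis.test_skew(0)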
763 """
764 self.skew0 = skew0
765 start_nuisance = np.array([self.endog.mean(),
766 self.endog.var()])
768 llr = optimize.fmin_powell(self._opt_skew, start_nuisance,
769 full_output=1, disp=0)[1]
770 p_val = chi2.sf(llr, 1)
771 if return_weights:
772 return llr, p_val, self.new_weights.T
773 return llr, p_val

    def test_kurt(self, kurt0, return_weights=False):
        """
        Returns -2 x log-likelihood and the p-value for the hypothesized
        kurtosis.

        Parameters
        ----------
        kurt0 : float
            Kurtosis value to be tested

        return_weights : bool
            If True, the function also returns the weights that
            maximize the likelihood ratio. Default is False.

        Returns
        -------
        test_results : tuple
            The log-likelihood ratio and p-value of kurt0
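
        Examples
        --------
        Testing for zero excess kurtosis (illustrative only):

        >>> import numpy as np
        >>> import statsmodels.api as sm
        >>> random_numbers = np.random.standard_normal(100)
        >>> el_analysis = sm.emplike.DescStat(random_numbers)
        >>> llr, pval = el_analysis.test_kurt(0)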
793 """
794 self.kurt0 = kurt0
795 start_nuisance = np.array([self.endog.mean(),
796 self.endog.var()])
798 llr = optimize.fmin_powell(self._opt_kurt, start_nuisance,
799 full_output=1, disp=0)[1]
800 p_val = chi2.sf(llr, 1)
801 if return_weights:
802 return llr, p_val, self.new_weights.T
803 return llr, p_val

    def test_joint_skew_kurt(self, skew0, kurt0, return_weights=False):
        """
        Returns -2 x log-likelihood and the p-value for the joint
        hypothesis test for skewness and kurtosis.

        Parameters
        ----------
        skew0 : float
            Skewness value to be tested

        kurt0 : float
            Kurtosis value to be tested

        return_weights : bool
            If True, the function also returns the weights that
            maximize the likelihood ratio. Default is False.

        Returns
        -------
        test_results : tuple
            The log-likelihood ratio and p-value of the joint hypothesis test.
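
        Examples
        --------
        Jointly testing zero skewness and zero excess kurtosis
        (illustrative only):

        >>> import numpy as np
        >>> import statsmodels.api as sm
        >>> random_numbers = np.random.standard_normal(100)
        >>> el_analysis = sm.emplike.DescStat(random_numbers)
        >>> llr, pval = el_analysis.test_joint_skew_kurt(0, 0)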
825 """
826 self.skew0 = skew0
827 self.kurt0 = kurt0
828 start_nuisance = np.array([self.endog.mean(),
829 self.endog.var()])
831 llr = optimize.fmin_powell(self._opt_skew_kurt, start_nuisance,
832 full_output=1, disp=0)[1]
833 p_val = chi2.sf(llr, 2)
834 if return_weights:
835 return llr, p_val, self.new_weights.T
836 return llr, p_val

    def ci_skew(self, sig=.05, upper_bound=None, lower_bound=None):
        """
        Returns the confidence interval for skewness.

        Parameters
        ----------
        sig : float
            The significance level. Default is .05

        upper_bound : float
            Maximum value of skewness the upper limit can be.
            Default is the .99 confidence limit assuming normality.

        lower_bound : float
            Minimum value of skewness the lower limit can be.
            Default is the .99 confidence limit assuming normality.

        Returns
        -------
        Interval : tuple
            Confidence interval for the skewness

        Notes
        -----
        If the function raises the error 'f(a) and f(b) must have
        different signs', consider expanding the lower and upper bounds.
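
        Examples
        --------
        Illustrative usage (the interval depends on the draw):

        >>> import numpy as np
        >>> import statsmodels.api as sm
        >>> random_numbers = np.random.standard_normal(100)
        >>> el_analysis = sm.emplike.DescStat(random_numbers)
        >>> lower, upper = el_analysis.ci_skew()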
864 """
865 nobs = self.nobs
866 endog = self.endog
867 if upper_bound is None:
868 upper_bound = skew(endog) + \
869 2.5 * ((6. * nobs * (nobs - 1.)) / \
870 ((nobs - 2.) * (nobs + 1.) * \
871 (nobs + 3.))) ** .5
872 if lower_bound is None:
873 lower_bound = skew(endog) - \
874 2.5 * ((6. * nobs * (nobs - 1.)) / \
875 ((nobs - 2.) * (nobs + 1.) * \
876 (nobs + 3.))) ** .5
877 self.r0 = chi2.ppf(1 - sig, 1)
878 llim = optimize.brentq(self._ci_limits_skew, lower_bound, skew(endog))
879 ulim = optimize.brentq(self._ci_limits_skew, skew(endog), upper_bound)
880 return llim, ulim

    def ci_kurt(self, sig=.05, upper_bound=None, lower_bound=None):
        """
        Returns the confidence interval for kurtosis.

        Parameters
        ----------
        sig : float
            The significance level. Default is .05

        upper_bound : float
            Maximum value of kurtosis the upper limit can be.
            Default is the .99 confidence limit assuming normality.

        lower_bound : float
            Minimum value of kurtosis the lower limit can be.
            Default is the .99 confidence limit assuming normality.

        Returns
        -------
        Interval : tuple
            Lower and upper confidence limit

        Notes
        -----
        For small n, upper_bound and lower_bound may have to be
        provided by the user. Consider using test_kurt to find
        values close to the desired significance level.

        If the function raises the error 'f(a) and f(b) must have
        different signs', consider expanding the bounds.
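
        Examples
        --------
        Illustrative usage (the interval depends on the draw):

        >>> import numpy as np
        >>> import statsmodels.api as sm
        >>> random_numbers = np.random.standard_normal(100)
        >>> el_analysis = sm.emplike.DescStat(random_numbers)
        >>> lower, upper = el_analysis.ci_kurt()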
913 """
914 endog = self.endog
915 nobs = self.nobs
916 if upper_bound is None:
917 upper_bound = kurtosis(endog) + \
918 (2.5 * (2. * ((6. * nobs * (nobs - 1.)) / \
919 ((nobs - 2.) * (nobs + 1.) * \
920 (nobs + 3.))) ** .5) * \
921 (((nobs ** 2.) - 1.) / ((nobs - 3.) *\
922 (nobs + 5.))) ** .5)
923 if lower_bound is None:
924 lower_bound = kurtosis(endog) - \
925 (2.5 * (2. * ((6. * nobs * (nobs - 1.)) / \
926 ((nobs - 2.) * (nobs + 1.) * \
927 (nobs + 3.))) ** .5) * \
928 (((nobs ** 2.) - 1.) / ((nobs - 3.) *\
929 (nobs + 5.))) ** .5)
930 self.r0 = chi2.ppf(1 - sig, 1)
931 llim = optimize.brentq(self._ci_limits_kurt, lower_bound, \
932 kurtosis(endog))
933 ulim = optimize.brentq(self._ci_limits_kurt, kurtosis(endog), \
934 upper_bound)
935 return llim, ulim


class DescStatMV(_OptFuncts):
    """
    A class for conducting inference on multivariate means and correlation.

    Parameters
    ----------
    endog : ndarray
        Data to be analyzed

    Attributes
    ----------
    endog : ndarray
        Data to be analyzed

    nobs : float
        Number of observations
    """

    def __init__(self, endog):
        self.endog = endog
        self.nobs = endog.shape[0]

    def mv_test_mean(self, mu_array, return_weights=False):
        """
        Returns -2 x log-likelihood and the p-value
        for a multivariate hypothesis test of the mean.

        Parameters
        ----------
        mu_array : 1d array
            Hypothesized values for the mean. Must have the same number of
            elements as columns in endog

        return_weights : bool
            If True, returns the weights that maximize the
            likelihood of mu_array. Default is False.

        Returns
        -------
        test_results : tuple
            The log-likelihood ratio and p-value for mu_array
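
        Examples
        --------
        A bivariate illustration (statistics vary with the draw):

        >>> import numpy as np
        >>> import statsmodels.api as sm
        >>> two_rvs = np.random.standard_normal((100, 2))
        >>> el_analysis = sm.emplike.DescStat(two_rvs)
        >>> llr, pval = el_analysis.mv_test_mean(np.array([0, 0]))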
979 """
980 endog = self.endog
981 nobs = self.nobs
982 if len(mu_array) != endog.shape[1]:
983 raise ValueError('mu_array must have the same number of '
984 'elements as the columns of the data.')
985 mu_array = mu_array.reshape(1, endog.shape[1])
986 means = np.ones((endog.shape[0], endog.shape[1]))
987 means = mu_array * means
988 est_vect = endog - means
989 start_vals = 1. / nobs * np.ones(endog.shape[1])
990 eta_star = self._modif_newton(start_vals, est_vect,
991 np.ones(nobs) * (1. / nobs))
992 denom = 1 + np.dot(eta_star, est_vect.T)
993 self.new_weights = 1 / nobs * 1 / denom
994 llr = -2 * np.sum(np.log(nobs * self.new_weights))
995 p_val = chi2.sf(llr, mu_array.shape[1])
996 if return_weights:
997 return llr, p_val, self.new_weights.T
998 else:
999 return llr, p_val

    def mv_mean_contour(self, mu1_low, mu1_upp, mu2_low, mu2_upp, step1, step2,
                        levs=(.001, .01, .05, .1, .2), var1_name=None,
                        var2_name=None, plot_dta=False):
        """
        Creates a confidence region plot for the mean of bivariate data.

        Parameters
        ----------
        mu1_low : float
            Minimum value of the mean for variable 1
        mu1_upp : float
            Maximum value of the mean for variable 1
        mu2_low : float
            Minimum value of the mean for variable 2
        mu2_upp : float
            Maximum value of the mean for variable 2
        step1 : float
            Increment of evaluations for variable 1
        step2 : float
            Increment of evaluations for variable 2
        levs : list
            Levels to be drawn on the contour plot.
            Default = (.001, .01, .05, .1, .2)
        var1_name : str
            Name of variable 1 to be plotted on the x-axis
        var2_name : str
            Name of variable 2 to be plotted on the y-axis
        plot_dta : bool
            If True, makes a scatter plot of the data on
            top of the contour plot. Default is False.

        Notes
        -----
        The smaller the step size, the more accurate the intervals
        will be.

        If the function reports that the optimization failed, consider
        narrowing the boundaries of the plot.

        Examples
        --------
        >>> import numpy as np
        >>> import statsmodels.api as sm
        >>> two_rvs = np.random.standard_normal((20, 2))
        >>> el_analysis = sm.emplike.DescStat(two_rvs)
        >>> contourp = el_analysis.mv_mean_contour(-2, 2, -2, 2, .1, .1)
        >>> contourp.show()
        """
        if self.endog.shape[1] != 2:
            raise ValueError('Data must contain exactly two variables')
        fig, ax = utils.create_mpl_ax()
        if var2_name is None:
            ax.set_ylabel('Variable 2')
        else:
            ax.set_ylabel(var2_name)
        if var1_name is None:
            ax.set_xlabel('Variable 1')
        else:
            ax.set_xlabel(var1_name)
        x = np.arange(mu1_low, mu1_upp, step1)
        y = np.arange(mu2_low, mu2_upp, step2)
        pairs = itertools.product(x, y)
        z = []
        for i in pairs:
            z.append(self.mv_test_mean(np.asarray(i))[0])
        X, Y = np.meshgrid(x, y)
        z = np.asarray(z)
        z = z.reshape(X.shape[1], Y.shape[0])
        ax.contour(x, y, z.T, levels=levs)
        if plot_dta:
            ax.plot(self.endog[:, 0], self.endog[:, 1], 'bo')
        return fig

    def test_corr(self, corr0, return_weights=False):
        """
        Returns -2 x log-likelihood ratio and the p-value for the
        correlation coefficient between 2 variables.

        Parameters
        ----------
        corr0 : float
            Hypothesized value to be tested

        return_weights : bool
            If True, returns the weights that maximize
            the log-likelihood at the hypothesized value. Default is False.

        Returns
        -------
        test_results : tuple
            The log-likelihood ratio and p-value of corr0
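
        Examples
        --------
        Testing zero correlation on simulated bivariate data
        (illustrative only):

        >>> import numpy as np
        >>> import statsmodels.api as sm
        >>> two_rvs = np.random.standard_normal((100, 2))
        >>> el_analysis = sm.emplike.DescStat(two_rvs)
        >>> llr, pval = el_analysis.test_corr(0)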
1086 """
1087 nobs = self.nobs
1088 endog = self.endog
1089 if endog.shape[1] != 2:
1090 raise NotImplementedError('Correlation matrix not yet implemented')
1091 nuis0 = np.array([endog[:, 0].mean(),
1092 endog[:, 0].var(),
1093 endog[:, 1].mean(),
1094 endog[:, 1].var()])
1096 x0 = np.zeros(5)
1097 weights0 = np.array([1. / nobs] * int(nobs))
1098 args = (corr0, endog, nobs, x0, weights0)
1099 llr = optimize.fmin(self._opt_correl, nuis0, args=args,
1100 full_output=1, disp=0)[1]
1101 p_val = chi2.sf(llr, 1)
1102 if return_weights:
1103 return llr, p_val, self.new_weights.T
1104 return llr, p_val

    def ci_corr(self, sig=.05, upper_bound=None, lower_bound=None):
        """
        Returns the confidence intervals for the correlation coefficient.

        Parameters
        ----------
        sig : float
            The significance level. Default is .05

        upper_bound : float
            Maximum value the upper confidence limit can be.
            Default is the 99% confidence limit assuming normality.

        lower_bound : float
            Minimum value the lower confidence limit can be.
            Default is the 99% confidence limit assuming normality.

        Returns
        -------
        Interval : tuple
            Confidence interval for the correlation
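
        Examples
        --------
        Illustrative usage (the interval depends on the draw):

        >>> import numpy as np
        >>> import statsmodels.api as sm
        >>> two_rvs = np.random.standard_normal((100, 2))
        >>> el_analysis = sm.emplike.DescStat(two_rvs)
        >>> lower, upper = el_analysis.ci_corr()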
1127 """
1128 endog = self.endog
1129 nobs = self.nobs
1130 self.r0 = chi2.ppf(1 - sig, 1)
1131 point_est = np.corrcoef(endog[:, 0], endog[:, 1])[0, 1]
1132 if upper_bound is None:
1133 upper_bound = min(.999, point_est + \
1134 2.5 * ((1. - point_est ** 2.) / \
1135 (nobs - 2.)) ** .5)
1137 if lower_bound is None:
1138 lower_bound = max(- .999, point_est - \
1139 2.5 * (np.sqrt((1. - point_est ** 2.) / \
1140 (nobs - 2.))))
1142 llim = optimize.brenth(self._ci_limits_corr, lower_bound, point_est)
1143 ulim = optimize.brenth(self._ci_limits_corr, point_est, upper_bound)
1144 return llim, ulim