Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3Created on Fri Dec 19 11:29:18 2014 

4 

5Author: Josef Perktold 

6License: BSD-3 

7 

8""" 

9 

10import numpy as np 

11from scipy import stats 

12 

13 

14# this is similar to ContrastResults after t_test, partially copied and adjusted 

class PredictionResults(object):
    """Results container for a predicted mean and its variance.

    Stores the prediction together with the variance of the predicted mean
    and provides standard errors, z- or t-tests, confidence intervals and a
    summary table based on them.

    Parameters
    ----------
    predicted_mean : ndarray
        Predicted values.
    var_pred_mean : ndarray
        Variance of ``predicted_mean``; its square root is used as the
        standard error.
    var_resid : optional
        Stored on the instance; not used by any implemented method.
    df : optional
        Degrees of freedom, passed to the distribution when ``dist`` is 't'.
    dist : {None, 'norm', 't'} or distribution instance
        Distribution used for p-values and interval quantiles. ``None`` and
        'norm' select ``scipy.stats.norm``, 't' selects ``scipy.stats.t``
        with ``df``. Any other object is stored as is and called without
        extra shape arguments.
    row_labels : optional
        Index used for the DataFrame returned by ``summary_frame``.
    linpred : optional
        Object with a ``conf_int(alpha=..., obs=False)`` method for the
        linear prediction; required by the 'endpoint' interval method when
        the link is not linear.
    link : optional
        Object with an ``inverse`` method. ``None`` is treated as a linear
        (identity) link by ``conf_int``.
    """

    def __init__(self, predicted_mean, var_pred_mean, var_resid=None,
                 df=None, dist=None, row_labels=None, linpred=None, link=None):
        # TODO: is var_resid used? drop from arguments?
        self.predicted_mean = predicted_mean
        self.var_pred_mean = var_pred_mean
        self.df = df
        self.var_resid = var_resid
        self.row_labels = row_labels
        self.linpred = linpred
        self.link = link

        if dist is None or dist == 'norm':
            self.dist = stats.norm
            self.dist_args = ()
        elif dist == 't':
            self.dist = stats.t
            self.dist_args = (self.df,)
        else:
            # user supplied distribution, used without shape arguments
            self.dist = dist
            self.dist_args = ()

    @property
    def se_obs(self):
        """Standard error for a new observation; not implemented."""
        # Formula that was left disabled behind the raise:
        #     np.sqrt(self.var_pred_mean + self.var_resid)
        raise NotImplementedError

    @property
    def se_mean(self):
        """Standard error of the predicted mean."""
        return np.sqrt(self.var_pred_mean)

    @property
    def tvalues(self):
        """Predicted mean divided by its standard error."""
        return self.predicted_mean / self.se_mean

    def t_test(self, value=0, alternative='two-sided'):
        '''z- or t-test for hypothesis that mean is equal to value

        Parameters
        ----------
        value : array_like
            value under the null hypothesis
        alternative : str
            'two-sided', 'larger', 'smaller'
            (the short forms '2-sided', '2s', 'l' and 's' are accepted too)

        Returns
        -------
        stat : ndarray
            test statistic
        pvalue : ndarray
            p-value of the hypothesis test, the distribution is given by
            the attribute of the instance, specified in `__init__`. Default
            if not specified is the normal distribution.

        Raises
        ------
        ValueError
            If `alternative` is not one of the recognized options.
        '''
        # assumes symmetric distribution
        stat = (self.predicted_mean - value) / self.se_mean

        if alternative in ['two-sided', '2-sided', '2s']:
            pvalue = self.dist.sf(np.abs(stat), *self.dist_args) * 2
        elif alternative in ['larger', 'l']:
            pvalue = self.dist.sf(stat, *self.dist_args)
        elif alternative in ['smaller', 's']:
            pvalue = self.dist.cdf(stat, *self.dist_args)
        else:
            raise ValueError('invalid alternative')
        return stat, pvalue

    def conf_int(self, method='endpoint', alpha=0.05, **kwds):
        """
        Confidence interval for the predicted mean.

        Parameters
        ----------
        method : {'endpoint', 'delta'}
            'endpoint' takes the confidence interval of the linear
            prediction (``self.linpred.conf_int``) and maps it through
            ``self.link.inverse``. 'delta' builds a symmetric interval
            around the predicted mean from ``se_mean``. If the link is
            linear (or no link was given) the symmetric interval is used
            for any `method`.
        alpha : float, optional
            The significance level for the confidence interval.
            ie., The default `alpha` = .05 returns a 95% confidence interval.
        kwds : extra keyword arguments
            currently ignored, only for compatibility, consistent signature

        Returns
        -------
        ci : ndarray
            The array has the lower and the upper limit of the confidence
            interval in the columns.

        Raises
        ------
        ValueError
            If the link is not linear and `method` is neither 'endpoint'
            nor 'delta'.
        """
        if self.link is None:
            # No transformation known: treat the prediction as linear
            # instead of failing on ``None.inverse``.
            is_linear = True
        else:
            # probe the inverse link on a small grid to detect an identity
            tmp = np.linspace(0, 1, 6)
            is_linear = (self.link.inverse(tmp) == tmp).all()

        if method == 'endpoint' and not is_linear:
            ci_linear = self.linpred.conf_int(alpha=alpha, obs=False)
            ci = self.link.inverse(ci_linear)
        elif method == 'delta' or is_linear:
            se = self.se_mean
            q = self.dist.ppf(1 - alpha / 2., *self.dist_args)
            lower = self.predicted_mean - q * se
            upper = self.predicted_mean + q * se
            ci = np.column_stack((lower, upper))
            # if we want to stack at a new last axis, for lower.ndim > 1
            # np.concatenate((lower[..., None], upper[..., None]), axis=-1)
        else:
            # previously fell through and failed with UnboundLocalError
            raise ValueError("method must be 'endpoint' or 'delta'")

        return ci

    def summary_frame(self, what='all', alpha=0.05):
        """Summary table of the prediction as a pandas DataFrame.

        Parameters
        ----------
        what : str
            Currently unused.
        alpha : float
            Significance level passed to ``conf_int``.

        Returns
        -------
        res : pandas.DataFrame
            Columns 'mean', 'mean_se', 'mean_ci_lower' and 'mean_ci_upper',
            indexed by ``row_labels``. The underlying ordered mapping is
            also stored in ``self.table``.
        """
        # TODO: finish and cleanup
        import pandas as pd
        from collections import OrderedDict
        ci_mean = self.conf_int(alpha=alpha)
        to_include = OrderedDict()
        to_include['mean'] = self.predicted_mean
        to_include['mean_se'] = self.se_mean
        to_include['mean_ci_lower'] = ci_mean[:, 0]
        to_include['mean_ci_upper'] = ci_mean[:, 1]

        self.table = to_include
        # pass the column order explicitly so it does not depend on dict
        # ordering
        res = pd.DataFrame(to_include, index=self.row_labels,
                           columns=list(to_include.keys()))
        return res

143 

144 

def get_prediction_glm(self, exog=None, transform=True, weights=None,
                       row_labels=None, linpred=None, link=None,
                       pred_kwds=None):
    """
    compute prediction results

    Parameters
    ----------
    self : results instance
        Fitted results; the function reads ``model``, ``params``,
        ``cov_params()``, ``scale``, ``cov_type``, ``cov_kwds``, ``use_t``
        and ``df_resid`` from it.
    exog : array_like, optional
        The values for which you want to predict. If None, the exog of the
        model is used.
    transform : bool, optional
        If the model was fit via a formula, do you want to pass
        exog through the formula. Default is True. E.g., if you fit
        a model y ~ log(x1) + log(x2), and transform is True, then
        you can pass a data structure that contains x1 and x2 in
        their original form. Otherwise, you'd need to log the data
        first.
    weights : array_like, optional
        Weights interpreted as in WLS, used for the variance of the predicted
        residual. Must be a scalar or 1-dim with one entry per row of exog.
    row_labels : optional
        Labels for the rows of the prediction. Defaults to the ``index`` of
        `exog` if it has one, or to the row labels of the model data when
        `exog` is None.
    linpred : object
        Prediction results for the linear predictor; its ``predicted_mean``
        is used to evaluate the derivative of the inverse link. It is also
        handed on to the returned ``PredictionResults``.
    link : optional
        Link instance handed on to the returned ``PredictionResults``.
    pred_kwds : dict, optional
        Extra keyword arguments for ``self.model.predict``. The key
        'linear' is always set to False. The dict passed in is not
        modified.

    Returns
    -------
    prediction_results : PredictionResults
        The prediction results instance contains prediction and prediction
        variance and can on demand calculate confidence intervals and summary
        tables for the prediction of the mean.

    Raises
    ------
    ValueError
        If `weights` has more than one element and is not 1-dim with
        length equal to the number of rows of exog.
    """

    # prepare exog and row_labels, based on base Results.predict
    if transform and hasattr(self.model, 'formula') and exog is not None:
        from patsy import dmatrix
        exog = dmatrix(self.model.data.design_info,
                       exog)

    if exog is not None:
        if row_labels is None:
            row_labels = getattr(exog, 'index', None)
            # e.g. a list has an `index` method, which is not a label set
            if callable(row_labels):
                row_labels = None

        exog = np.asarray(exog)
        if exog.ndim == 1 and (self.model.exog.ndim == 1 or
                               self.model.exog.shape[1] == 1):
            exog = exog[:, None]
        exog = np.atleast_2d(exog)  # needed in count model shape[1]
    else:
        exog = self.model.exog
        if weights is None:
            weights = getattr(self.model, 'weights', None)

        if row_labels is None:
            row_labels = getattr(self.model.data, 'row_labels', None)

    # need to handle other arrays, TODO: is delegating to model possible ?
    if weights is not None:
        weights = np.asarray(weights)
        # array weights need one entry per observation (row of exog)
        if (weights.size > 1 and
                (weights.ndim != 1 or weights.shape[0] != exog.shape[0])):
            raise ValueError('weights has wrong shape')

    # work on a copy: the default is None and a caller's dict should not
    # be changed as a side effect
    pred_kwds = {} if pred_kwds is None else dict(pred_kwds)
    pred_kwds['linear'] = False
    predicted_mean = self.model.predict(self.params, exog, **pred_kwds)

    covb = self.cov_params()

    # delta method: d mean / d linpred times the quadratic form x' V x,
    # computed row by row
    link_deriv = self.model.family.link.inverse_deriv(linpred.predicted_mean)
    var_pred_mean = link_deriv**2 * (exog * np.dot(covb, exog.T).T).sum(1)
    var_resid = self.scale  # self.mse_resid / weights

    # TODO: check that we have correct scale, Refactor scale #???
    # special case for now:
    if self.cov_type == 'fixed scale':
        var_resid = self.cov_kwds['scale']

    if weights is not None:
        # not in-place, so that a stored scale array is never modified
        var_resid = var_resid / weights

    dist = 't' if self.use_t else 'norm'
    return PredictionResults(predicted_mean, var_pred_mean, var_resid,
                             df=self.df_resid, dist=dist,
                             row_labels=row_labels, linpred=linpred, link=link)

235 

236 

def params_transform_univariate(params, cov_params, link=None, transform=None,
                                row_labels=None):
    """
    results for univariate, nonlinear, monotonically transformed parameters

    This provides transformed values, standard errors and confidence interval
    for transformations of parameters, for example in calculating rates with
    `exp(params)` in the case of Poisson or other models with exponential
    mean function.

    Parameters
    ----------
    params : array_like
        Parameter estimates. If it has a non-callable ``index`` attribute,
        that is used as default for `row_labels`.
    cov_params : array_like
        Covariance of the parameter estimates; only the diagonal is used.
    link : link instance, optional
        Object with ``inverse`` and ``inverse_deriv`` methods that defines
        the transformation. Defaults to the log link if neither `link` nor
        `transform` is given.
    transform : optional
        Currently not used. Passing it without `link` is not supported.
    row_labels : optional
        Labels handed on to the results instances.

    Returns
    -------
    res : PredictionResults
        Results for the transformed parameters, with the delta-method
        variance ``inverse_deriv(params)**2 * diag(cov_params)``. The
        results for the untransformed parameters are attached as
        ``linpred``.

    Raises
    ------
    NotImplementedError
        If `transform` is given without `link`.
    """

    from statsmodels.genmod.families import links
    if link is None:
        if transform is not None:
            # previously failed later with AttributeError on ``None``
            raise NotImplementedError('transform is not supported, '
                                      'use link instead')
        link = links.Log()

    if row_labels is None:
        row_labels = getattr(params, 'index', None)
        # a plain list has an `index` method, which is not a label set
        if callable(row_labels):
            row_labels = None

    params = np.asarray(params)
    var_params = np.diag(cov_params)

    predicted_mean = link.inverse(params)
    link_deriv = link.inverse_deriv(params)
    var_pred_mean = link_deriv**2 * var_params
    # TODO: do we want covariance also, or just var/se

    dist = stats.norm

    # TODO: need ci for linear prediction, method of `lin_pred
    linpred = PredictionResults(params, var_params, dist=dist,
                                row_labels=row_labels, link=links.identity())

    res = PredictionResults(predicted_mean, var_pred_mean, dist=dist,
                            row_labels=row_labels, linpred=linpred, link=link)

    return res