#!/usr/bin/env python

'''
Quantile regression model

Model parameters are estimated using iterated reweighted least squares. The
asymptotic covariance matrix is estimated using kernel density estimation.

Author: Vincent Arel-Bundock
License: BSD-3
Created: 2013-03-19

The original IRLS function was written for MATLAB by Shapour Mohammadi,
University of Tehran, 2008 (shmohammadi@gmail.com), with some lines based on
code written by James P. Lesage in Applied Econometrics Using MATLAB (1999),
pp. 73-74. Translated to Python with permission from the original author by
Christian Prinoth (christian at prinoth dot name).
'''

import numpy as np
import warnings
import scipy.stats as stats
from numpy.linalg import pinv
from scipy.stats import norm
from statsmodels.tools.decorators import cache_readonly
from statsmodels.regression.linear_model import (RegressionModel,
                                                 RegressionResults,
                                                 RegressionResultsWrapper)
from statsmodels.tools.sm_exceptions import (ConvergenceWarning,
                                             IterationLimitWarning)

class QuantReg(RegressionModel):
    '''Quantile Regression

    Estimate a quantile regression model using iterative reweighted least
    squares.

    Parameters
    ----------
    endog : array or dataframe
        endogenous/response variable
    exog : array or dataframe
        exogenous/explanatory variable(s)

    Notes
    -----
    The Least Absolute Deviation (LAD) estimator is a special case where the
    quantile is set to 0.5 (q argument of the fit method).

    The asymptotic covariance matrix is estimated following the procedure in
    Greene (2008, p. 407-408), using kernel density estimation (see the
    kernel and bandwidth arguments of the fit method).

    References
    ----------
    General:

    * Birkes, D. and Y. Dodge (1993). Alternative Methods of Regression, John Wiley and Sons.
    * Greene, W. H. (2008). Econometric Analysis. Sixth Edition. International Student Edition.
    * Koenker, R. (2005). Quantile Regression. New York: Cambridge University Press.
    * LeSage, J. P. (1999). Applied Econometrics Using MATLAB.

    Kernels (used by the fit method):

    * Greene (2008), Table 14.2

    Bandwidth selection (used by the fit method):

    * Bofinger, E. (1975). Estimation of a density function using order statistics. Australian Journal of Statistics 17: 1-17.
    * Chamberlain, G. (1994). Quantile regression, censoring, and the structure of wages. In Advances in Econometrics, Vol. 1: Sixth World Congress, ed. C. A. Sims, 171-209. Cambridge: Cambridge University Press.
    * Hall, P., and S. Sheather. (1988). On the distribution of the Studentized quantile. Journal of the Royal Statistical Society, Series B 50: 381-391.

    Keywords: Least Absolute Deviation (LAD) Regression, Quantile Regression,
    Regression, Robust Estimation.
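
    Examples
    --------
    A minimal usage sketch; the simulated data and names below are
    illustrative only:

    >>> import numpy as np
    >>> from statsmodels.regression.quantile_regression import QuantReg
    >>> x = np.random.uniform(0, 10, size=500)
    >>> y = 1.0 + 0.5 * x + (1 + 0.1 * x) * np.random.standard_normal(500)
    >>> exog = np.column_stack([np.ones_like(x), x])
    >>> res = QuantReg(y, exog).fit(q=0.9)  # 90th conditional percentile
    >>> res.params  # doctest: +SKIP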

    '''

    def __init__(self, endog, exog, **kwargs):
        super(QuantReg, self).__init__(endog, exog, **kwargs)

    def whiten(self, data):
        """
        QuantReg model whitener does nothing: returns data.
        """
        return data

    def fit(self, q=.5, vcov='robust', kernel='epa', bandwidth='hsheather',
            max_iter=1000, p_tol=1e-6, **kwargs):
        """
        Solve by Iteratively Reweighted Least Squares

        Parameters
        ----------
        q : float
            Quantile must be between 0 and 1
        vcov : str, method used to calculate the variance-covariance matrix
            of the parameters. Default is ``robust``:

            - robust : heteroskedasticity robust standard errors (as suggested
              in Greene 6th edition)
            - iid : iid errors (as in Stata 12)

        kernel : str, kernel to use in the kernel density estimation for the
            asymptotic covariance matrix:

            - epa: Epanechnikov
            - cos: Cosine
            - gau: Gaussian
            - par: Parzen
            - biw: Biweight

        bandwidth : str, Bandwidth selection method in kernel density
            estimation for asymptotic covariance estimate (full
            references in QuantReg docstring):

            - hsheather: Hall-Sheather (1988)
            - bofinger: Bofinger (1975)
            - chamberlain: Chamberlain (1994)
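
        Examples
        --------
        A rough sketch of typical calls; ``mod`` stands for any QuantReg
        instance and is not defined here:

        >>> res_lad = mod.fit(q=0.5)               # LAD / median regression
        >>> res_iid = mod.fit(q=0.25, vcov='iid')  # Stata-style iid vcov
        >>> res_gau = mod.fit(q=0.75, kernel='gau', bandwidth='bofinger')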

        """

        if q < 0 or q > 1:
            raise ValueError('q must be between 0 and 1')

        kern_names = ['biw', 'cos', 'epa', 'gau', 'par']
        if kernel not in kern_names:
            raise ValueError("kernel must be one of " + ', '.join(kern_names))
        else:
            kernel = kernels[kernel]

        if bandwidth == 'hsheather':
            bandwidth = hall_sheather
        elif bandwidth == 'bofinger':
            bandwidth = bofinger
        elif bandwidth == 'chamberlain':
            bandwidth = chamberlain
        else:
            raise ValueError("bandwidth must be in 'hsheather', 'bofinger', 'chamberlain'")

        endog = self.endog
        exog = self.exog
        nobs = self.nobs
        exog_rank = np.linalg.matrix_rank(self.exog)
        self.rank = exog_rank
        self.df_model = float(self.rank - self.k_constant)
        self.df_resid = self.nobs - self.rank
        n_iter = 0
        xstar = exog

        beta = np.ones(exog_rank)
        # TODO: better start, initial beta is used only for convergence check

        # Note the following does not work yet,
        # the iteration loop always starts with OLS as initial beta
        # if start_params is not None:
        #     if len(start_params) != rank:
        #         raise ValueError('start_params has wrong length')
        #     beta = start_params
        # else:
        #     # start with OLS
        #     beta = np.dot(np.linalg.pinv(exog), endog)

        diff = 10
        cycle = False

        history = dict(params=[], mse=[])
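        # A sketch of the IRLS logic: each pass below solves a weighted
        # least squares problem whose weights are the reciprocals of the
        # current absolute residuals, after scaling negative residuals by q
        # and positive ones by 1 - q. Up to a constant factor, a fixed point
        # of this iteration minimizes the check-function loss
        # sum(rho_q(e_i)) with rho_q(u) = u * (q - (u < 0)).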

        while n_iter < max_iter and diff > p_tol and not cycle:
            n_iter += 1
            beta0 = beta
            xtx = np.dot(xstar.T, exog)
            xty = np.dot(xstar.T, endog)
            beta = np.dot(pinv(xtx), xty)
            resid = endog - np.dot(exog, beta)

            mask = np.abs(resid) < .000001
            resid[mask] = ((resid[mask] >= 0) * 2 - 1) * .000001
            resid = np.where(resid < 0, q * resid, (1 - q) * resid)
            resid = np.abs(resid)
            xstar = exog / resid[:, np.newaxis]
            diff = np.max(np.abs(beta - beta0))
            history['params'].append(beta)
            history['mse'].append(np.mean(resid * resid))

            if (n_iter >= 300) and (n_iter % 100 == 0):
                # check for convergence cycle, should not happen
                for ii in range(2, 10):
                    if np.all(beta == history['params'][-ii]):
                        cycle = True
                        warnings.warn("Convergence cycle detected",
                                      ConvergenceWarning)
                        break

        if n_iter == max_iter:
            warnings.warn("Maximum number of iterations (" + str(max_iter) +
                          ") reached.", IterationLimitWarning)

        e = endog - np.dot(exog, beta)
        # Greene (2008, p. 407) writes that Stata 6 uses this bandwidth:
        # h = 0.9 * np.std(e) / (nobs**0.2)
        # Instead, we calculate bandwidth as in Stata 12
        iqre = stats.scoreatpercentile(e, 75) - stats.scoreatpercentile(e, 25)
        h = bandwidth(nobs, q)
        h = min(np.std(endog),
                iqre / 1.34) * (norm.ppf(q + h) - norm.ppf(q - h))

        fhat0 = 1. / (nobs * h) * np.sum(kernel(e / h))
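
        # fhat0 is a kernel density estimate of the density of the residuals
        # at zero; its reciprocal is the "sparsity" stored on the results
        # instance. It scales both forms of the asymptotic covariance below:
        # 'robust' builds the sandwich (X'X)^-1 X'DX (X'X)^-1 with D a
        # diagonal matrix depending on the sign of each residual, while
        # 'iid' uses q * (1 - q) / fhat0**2 * (X'X)^-1.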

        if vcov == 'robust':
            d = np.where(e > 0, (q/fhat0)**2, ((1-q)/fhat0)**2)
            xtxi = pinv(np.dot(exog.T, exog))
            xtdx = np.dot(exog.T * d[np.newaxis, :], exog)
            vcov = xtxi @ xtdx @ xtxi
        elif vcov == 'iid':
            vcov = (1. / fhat0)**2 * q * (1 - q) * pinv(np.dot(exog.T, exog))
        else:
            raise ValueError("vcov must be 'robust' or 'iid'")

        lfit = QuantRegResults(self, beta, normalized_cov_params=vcov)

        lfit.q = q
        lfit.iterations = n_iter
        lfit.sparsity = 1. / fhat0
        lfit.bandwidth = h
        lfit.history = history

        return RegressionResultsWrapper(lfit)


def _parzen(u):
    z = np.where(np.abs(u) <= .5, 4./3 - 8. * u**2 + 8. * np.abs(u)**3,
                 8. * (1 - np.abs(u))**3 / 3.)
    z[np.abs(u) > 1] = 0
    return z


kernels = {}
kernels['biw'] = lambda u: 15. / 16 * (1 - u**2)**2 * np.where(np.abs(u) <= 1, 1, 0)
kernels['cos'] = lambda u: np.where(np.abs(u) <= .5, 1 + np.cos(2 * np.pi * u), 0)
kernels['epa'] = lambda u: 3. / 4 * (1 - u**2) * np.where(np.abs(u) <= 1, 1, 0)
kernels['gau'] = lambda u: norm.pdf(u)
kernels['par'] = _parzen
#kernels['bet'] = lambda u: np.where(np.abs(u) <= 1, .75 * (1 - u) * (1 + u), 0)
#kernels['log'] = lambda u: logistic.pdf(u) * (1 - logistic.pdf(u))
#kernels['tri'] = lambda u: np.where(np.abs(u) <= 1, 1 - np.abs(u), 0)
#kernels['trw'] = lambda u: 35. / 32 * (1 - u**2)**3 * np.where(np.abs(u) <= 1, 1, 0)
#kernels['uni'] = lambda u: 1. / 2 * np.where(np.abs(u) <= 1, 1, 0)
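
# A quick sanity check (illustrative note, not part of the module): each of
# the active kernels above is a probability density on its support, e.g.
#
#     >>> from scipy.integrate import quad
#     >>> quad(lambda u: float(kernels['epa'](u)), -1, 1)[0]  # approx. 1.0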


def hall_sheather(n, q, alpha=.05):
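    """Hall-Sheather (1988) plug-in bandwidth for the sparsity estimate.

    n is the sample size, q the quantile of interest and alpha the
    significance level that enters the plug-in constant.
    """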

    z = norm.ppf(q)
    num = 1.5 * norm.pdf(z)**2.
    den = 2. * z**2. + 1.
    h = n**(-1. / 3) * norm.ppf(1. - alpha / 2.)**(2./3) * (num / den)**(1./3)
    return h


def bofinger(n, q):
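    """Bofinger (1975) bandwidth rule, of order n**(-1/5)."""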

    num = 9. / 2 * norm.pdf(2 * norm.ppf(q))**4
    den = (2 * norm.ppf(q)**2 + 1)**2
    h = n**(-1. / 5) * (num / den)**(1. / 5)
    return h


def chamberlain(n, q, alpha=.05):
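    """Chamberlain (1994) bandwidth rule.

    Equal to z_(1-alpha/2) * sqrt(q*(1-q)/n), the normal-approximation
    half-width for a proportion q in a sample of size n.
    """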

    return norm.ppf(1 - alpha / 2) * np.sqrt(q * (1 - q) / n)


class QuantRegResults(RegressionResults):
    '''Results instance for the QuantReg model'''

    @cache_readonly
    def prsquared(self):
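        """Pseudo R-squared of Koenker and Machado (1999).

        One minus the ratio of the check-function loss of the fitted model
        to that of a model containing only the unconditional q-th sample
        quantile.
        """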

        q = self.q
        endog = self.model.endog
        e = self.resid
        e = np.where(e < 0, (1 - q) * e, q * e)
        e = np.abs(e)
        ered = endog - stats.scoreatpercentile(endog, q * 100)
        ered = np.where(ered < 0, (1 - q) * ered, q * ered)
        ered = np.abs(ered)
        return 1 - np.sum(e) / np.sum(ered)

    #@cache_readonly
    def scale(self):
        return 1.

    @cache_readonly
    def bic(self):
        return np.nan

    @cache_readonly
    def aic(self):
        return np.nan

    @cache_readonly
    def llf(self):
        return np.nan

    @cache_readonly
    def rsquared(self):
        return np.nan

    @cache_readonly
    def rsquared_adj(self):
        return np.nan

    @cache_readonly
    def mse(self):
        return np.nan

    @cache_readonly
    def mse_model(self):
        return np.nan

    @cache_readonly
    def mse_total(self):
        return np.nan

    @cache_readonly
    def centered_tss(self):
        return np.nan

    @cache_readonly
    def uncentered_tss(self):
        return np.nan

    @cache_readonly
    def HC0_se(self):
        raise NotImplementedError

    @cache_readonly
    def HC1_se(self):
        raise NotImplementedError

    @cache_readonly
    def HC2_se(self):
        raise NotImplementedError

    @cache_readonly
    def HC3_se(self):
        raise NotImplementedError

    def summary(self, yname=None, xname=None, title=None, alpha=.05):
        """Summarize the Regression Results

        Parameters
        ----------
        yname : str, optional
            Default is `y`
        xname : list[str], optional
            Names for the exogenous variables. Default is `var_##` for ## in
            the number of regressors. Must match the number of parameters
            in the model
        title : str, optional
            Title for the top table. If not None, then this replaces the
            default title
        alpha : float
            significance level for the confidence intervals

        Returns
        -------
        smry : Summary instance
            this holds the summary tables and text, which can be printed or
            converted to various output formats.

        See Also
        --------
        statsmodels.iolib.summary.Summary : class to hold summary results
        """

        eigvals = self.eigenvals
        condno = self.condition_number

        top_left = [('Dep. Variable:', None),
                    ('Model:', None),
                    ('Method:', ['Least Squares']),
                    ('Date:', None),
                    ('Time:', None)
                    ]

        top_right = [('Pseudo R-squared:', ["%#8.4g" % self.prsquared]),
                     ('Bandwidth:', ["%#8.4g" % self.bandwidth]),
                     ('Sparsity:', ["%#8.4g" % self.sparsity]),
                     ('No. Observations:', None),
                     ('Df Residuals:', None),
                     ('Df Model:', None)
                     ]

        if title is None:
            title = self.model.__class__.__name__ + ' ' + "Regression Results"

        # create summary table instance
        from statsmodels.iolib.summary import Summary
        smry = Summary()
        smry.add_table_2cols(self, gleft=top_left, gright=top_right,
                             yname=yname, xname=xname, title=title)
        smry.add_table_params(self, yname=yname, xname=xname, alpha=alpha,
                              use_t=self.use_t)

        # add warnings/notes, added to text format only
        etext = []
        if eigvals[-1] < 1e-10:
            wstr = "The smallest eigenvalue is %6.3g. This might indicate "
            wstr += "that there are\n"
            wstr += "strong multicollinearity problems or that the design "
            wstr += "matrix is singular."
            wstr = wstr % eigvals[-1]
            etext.append(wstr)
        elif condno > 1000:  # TODO: what is recommended
            wstr = "The condition number is large, %6.3g. This might "
            wstr += "indicate that there are\n"
            wstr += "strong multicollinearity or other numerical "
            wstr += "problems."
            wstr = wstr % condno
            etext.append(wstr)

        if etext:
            smry.add_extra_txt(etext)

        return smry