1"""
3Accelerated Failure Time (AFT) Model with empirical likelihood inference.
5AFT regression analysis is applicable when the researcher has access
6to a randomly right censored dependent variable, a matrix of exogenous
7variables and an indicatior variable (delta) that takes a value of 0 if the
8observation is censored and 1 otherwise.
10AFT References
11--------------
13Stute, W. (1993). "Consistent Estimation Under Random Censorship when
14Covariables are Present." Journal of Multivariate Analysis.
15Vol. 45. Iss. 1. 89-103
17EL and AFT References
18---------------------
20Zhou, Kim And Bathke. "Empirical Likelihood Analysis for the Heteroskedastic
21Accelerated Failure Time Model." Manuscript:
22URL: www.ms.uky.edu/~mai/research/CasewiseEL20080724.pdf
24Zhou, M. (2005). Empirical Likelihood Ratio with Arbitrarily Censored/
25Truncated Data by EM Algorithm. Journal of Computational and Graphical
26Statistics. 14:3, 643-656.
29"""
import numpy as np
from statsmodels.regression.linear_model import OLS, WLS
from statsmodels.tools import add_constant
from scipy import optimize
from scipy.stats import chi2
from .descriptive import _OptFuncts
import warnings
from statsmodels.tools.sm_exceptions import IterationLimitWarning
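
# A minimal usage sketch, mirroring the doctest in ``AFTResults.test_beta``
# below (``sm.datasets.heart`` contains right-censored survival times):
#
#   >>> import numpy as np
#   >>> import statsmodels.api as sm
#   >>> data = sm.datasets.heart.load(as_pandas=False)
#   >>> model = sm.emplike.emplikeAFT(np.log10(data.endog), data.exog,
#   ...                               data.censors)
#   >>> fitted = model.fit()
#   >>> fitted.params()              # KM-weighted least squares estimates
#   >>> fitted.test_beta([0], [0])   # (llr, p-value) for H0: beta_0 = 0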


class OptAFT(_OptFuncts):
    """
    Provides optimization functions used in estimating and conducting
    inference in an AFT model.

    Methods
    -------

    _opt_wtd_nuis_regress:
        Function optimized over nuisance parameters to compute
        the profile likelihood

    _EM_test:
        Uses the modified EM algorithm of Zhou (2005) to maximize the
        likelihood of a parameter vector.
    """
    def __init__(self):
        pass

    def _opt_wtd_nuis_regress(self, test_vals):
        """
        A function that is optimized over nuisance parameters to conduct a
        hypothesis test for the parameters of interest.

        Parameters
        ----------

        test_vals : 1d array
            The regression coefficients of the model.  This includes the
            nuisance parameters and the parameters of interest.

        Returns
        -------

        llr : float
            -2 times the log likelihood of the nuisance parameters and the
            hypothesized value of the parameter(s) of interest.
        """
        test_params = test_vals.reshape(self.model.nvar, 1)
        est_vect = self.model.uncens_exog * (self.model.uncens_endog -
                                             np.dot(self.model.uncens_exog,
                                                    test_params))
        eta_star = self._modif_newton(np.zeros(self.model.nvar), est_vect,
                                      self.model._fit_weights)
        denom = np.sum(self.model._fit_weights) + np.dot(eta_star, est_vect.T)
        self.new_weights = self.model._fit_weights / denom
        return -1 * np.sum(np.log(self.new_weights))

    def _EM_test(self, nuisance_params, params=None, param_nums=None,
                 b0_vals=None, F=None, survidx=None, uncens_nobs=None,
                 numcensbelow=None, km=None, uncensored=None, censored=None,
                 maxiter=None, ftol=None):
        """
        Uses the EM algorithm to compute the maximum likelihood of a test.

        Parameters
        ----------

        nuisance_params : ndarray
            Vector of values to be used as nuisance parameters.

        maxiter : int
            Number of iterations in the EM algorithm for a parameter vector

        Returns
        -------
        -2 * log likelihood ratio at the hypothesized values and
        nuisance parameters

        Notes
        -----
        Optional parameters are provided by the test_beta function.
        """
        iters = 0
        params[param_nums] = b0_vals
        nuis_param_index = np.int_(np.delete(np.arange(self.model.nvar),
                                             param_nums))
        params[nuis_param_index] = nuisance_params
        to_test = params.reshape(self.model.nvar, 1)
        opt_res = np.inf
        diff = np.inf
        while iters < maxiter and diff > ftol:
            F = F.flatten()
            death = np.cumsum(F[::-1])
            survivalprob = death[::-1]
            surv_point_mat = np.dot(F.reshape(-1, 1),
                                    1. / survivalprob[survidx].reshape(1, -1))
            surv_point_mat = add_constant(surv_point_mat)
            summed_wts = np.cumsum(surv_point_mat, axis=1)
            wts = summed_wts[np.int_(np.arange(uncens_nobs)),
                             numcensbelow[uncensored]]
            # ^ E step
            # See Zhou 2005, section 3.
            self.model._fit_weights = wts
            new_opt_res = self._opt_wtd_nuis_regress(to_test)
            # ^ Uncensored weights' contribution to likelihood value.
            F = self.new_weights
            # ^ M step
            diff = np.abs(new_opt_res - opt_res)
            opt_res = new_opt_res
            iters = iters + 1
        death = np.cumsum(F.flatten()[::-1])
        survivalprob = death[::-1]
        llike = -opt_res + np.sum(np.log(survivalprob[survidx]))
        wtd_km = km.flatten() / np.sum(km)
        survivalmax = np.cumsum(wtd_km[::-1])[::-1]
        llikemax = np.sum(np.log(wtd_km[uncensored])) + \
            np.sum(np.log(survivalmax[censored]))
        if iters == maxiter:
            warnings.warn('The EM reached the maximum number of iterations',
                          IterationLimitWarning)
        return -2 * (llike - llikemax)

    def _ci_limits_beta(self, b0, param_num=None):
        """
        Returns the difference between the log likelihood ratio for a
        parameter and the critical value self.r0 (set by ci_beta).

        Parameters
        ----------
        b0 : float
            Value of a regression parameter

        param_num : int
            Parameter index of b0
        """
        return self.test_beta([b0], [param_num])[0] - self.r0


class emplikeAFT(object):
    """
    Class for estimating and conducting inference in an AFT model.

    Parameters
    ----------

    endog: nx1 array
        Response variables that are subject to random censoring

    exog: nxk array
        Matrix of covariates

    censors: nx1 array
        Array with entries 0 or 1.  0 indicates a response was
        censored.

    Attributes
    ----------

    nobs: float
        Number of observations

    endog: ndarray
        Endogenous (response) array

    exog: ndarray
        Exogenous variable matrix

    censors: ndarray
        Censors array, but with max(endog) set to uncensored

    nvar: float
        Number of exogenous variables

    uncens_nobs: float
        Number of uncensored observations

    uncens_endog: ndarray
        Uncensored response variables

    uncens_exog: ndarray
        Exogenous variables of the uncensored observations

    Methods
    -------

    params:
        Fits model parameters

    test_beta:
        Tests if beta = b0 for any vector b0.

    Notes
    -----

    The data are immediately sorted in order of increasing endogenous
    variables.

    The last observation is assumed to be uncensored, which makes
    estimation and inference possible.
    """
    def __init__(self, endog, exog, censors):
        self.nobs = np.shape(exog)[0]
        self.endog = endog.reshape(self.nobs, 1)
        self.exog = exog.reshape(self.nobs, -1)
        self.censors = censors.reshape(self.nobs, 1)
        self.nvar = self.exog.shape[1]
        idx = np.lexsort((-self.censors[:, 0], self.endog[:, 0]))
        self.endog = self.endog[idx]
        self.exog = self.exog[idx]
        self.censors = self.censors[idx]
        self.censors[-1] = 1  # Sort in init, not in function
        self.uncens_nobs = int(np.sum(self.censors))
        mask = self.censors.ravel().astype(bool)
        self.uncens_endog = self.endog[mask, :].reshape(-1, 1)
        self.uncens_exog = self.exog[mask, :]

    def _is_tied(self, endog, censors):
        """
        Indicates if an observation takes the same value as the next
        ordered observation.

        Parameters
        ----------
        endog: ndarray
            Model's endogenous variable
        censors: ndarray
            Array indicating censoring status

        Returns
        -------
        indic_ties: ndarray
            indic_ties[i] = 1 if endog[i] == endog[i+1] and
            censors[i] == censors[i+1]
        """
        nobs = int(self.nobs)
        endog_idx = endog[np.arange(nobs - 1)] == (
            endog[np.arange(nobs - 1) + 1])
        censors_idx = censors[np.arange(nobs - 1)] == (
            censors[np.arange(nobs - 1) + 1])
        indic_ties = endog_idx * censors_idx  # Both true
        return np.int_(indic_ties)

    def _km_w_ties(self, tie_indic, untied_km):
        """
        Computes the KM estimator value at each observation, taking into
        account ties in the data.

        Parameters
        ----------
        tie_indic: 1d array
            Indicates if the ith observation is tied with the (i+1)th
        untied_km: 1d array
            KM estimates at each observation assuming no ties.
        """
        # TODO: Vectorize, even though it is only 1 pass through for any
        # function call
        num_same = 1
        idx_nums = []
        for obs_num in np.arange(int(self.nobs - 1))[::-1]:
            if tie_indic[obs_num] == 1:
                idx_nums.append(obs_num)
                num_same = num_same + 1
                untied_km[obs_num] = untied_km[obs_num + 1]
            elif tie_indic[obs_num] == 0 and num_same > 1:
                idx_nums.append(max(idx_nums) + 1)
                idx_nums = np.asarray(idx_nums)
                untied_km[idx_nums] = untied_km[idx_nums]
                num_same = 1
                idx_nums = []
        return untied_km.reshape(self.nobs, 1)

    def _make_km(self, endog, censors):
        """
        Computes the Kaplan-Meier estimate for the weights in the AFT model.

        Parameters
        ----------
        endog: nx1 array
            Array of response variables
        censors: nx1 array
            Censor-indicating variable

        Returns
        -------
        Kaplan-Meier estimate for each observation

        Notes
        -----

        This function makes calls to _is_tied and _km_w_ties to handle ties
        in the data.  If a censored observation and an uncensored observation
        have the same value, it is assumed that the uncensored one occurred
        first.
        """
        nobs = self.nobs
        num = (nobs - (np.arange(nobs) + 1.))
        denom = ((nobs - (np.arange(nobs) + 1.) + 1.))
        km = (num / denom).reshape(nobs, 1)
        km = km ** np.abs(censors - 1.)
        km = np.cumprod(km)  # If no ties, this is kaplan-meier
        tied = self._is_tied(endog, censors)
        wtd_km = self._km_w_ties(tied, km)
        return (censors / wtd_km).reshape(nobs, 1)

    def fit(self):
        """
        Fits an AFT model and returns a results instance.

        Parameters
        ----------
        None

        Returns
        -------
        Results instance.

        Notes
        -----
        To avoid dividing by zero, max(endog) is assumed to be uncensored.
        """
        return AFTResults(self)

    def predict(self, params, endog=None):
        """
        Returns the linear prediction np.dot(endog, params), using
        self.endog when endog is None.
        """
        if endog is None:
            endog = self.endog
        return np.dot(endog, params)


class AFTResults(OptAFT):
    """
    Results class for an AFT model, returned by emplikeAFT.fit().
    """
    def __init__(self, model):
        self.model = model

    def params(self):
        """
        Fits an AFT model and returns parameters.

        Parameters
        ----------
        None

        Returns
        -------
        Fitted params

        Notes
        -----
        To avoid dividing by zero, max(endog) is assumed to be uncensored.
        """
        self.model.modif_censors = np.copy(self.model.censors)
        self.model.modif_censors[-1] = 1
        wts = self.model._make_km(self.model.endog, self.model.modif_censors)
        res = WLS(self.model.endog, self.model.exog, wts).fit()
        params = res.params
        return params

    def test_beta(self, b0_vals, param_nums, ftol=10 ** - 5, maxiter=30,
                  print_weights=1):
        """
        Returns the profile log likelihood ratio for regression
        parameters 'param_nums' at 'b0_vals', along with its p-value.

        Parameters
        ----------
        b0_vals: list
            The values of the parameters to be tested

        param_nums: list
            The indices of the parameters to be tested

        maxiter: int, optional
            How many iterations to use in the EM algorithm.  Default is 30

        ftol: float, optional
            The function tolerance for the EM optimization.
            Default is 10**-5

        print_weights: bool
            If true, returns the weights that maximize the profile
            log likelihood.  Default is 1 (True)

        Returns
        -------

        test_results : tuple
            The log-likelihood ratio and p-value of the test.

        Notes
        -----

        The function will warn if the EM reaches maxiter.  However, when
        optimizing over nuisance parameters, it is possible to reach the
        maximum number of inner iterations for a specific value of the
        nuisance parameters while the results of the function are still
        valid.  This usually occurs when the optimization over the nuisance
        parameters selects parameter values that yield a log-likelihood
        ratio close to infinity.

        Examples
        --------

        >>> import statsmodels.api as sm
        >>> import numpy as np

        # Test that the parameter is 0 in a one-regressor, no-intercept model
        >>> data = sm.datasets.heart.load(as_pandas=False)
        >>> y = np.log10(data.endog)
        >>> x = data.exog
        >>> cens = data.censors
        >>> model = sm.emplike.emplikeAFT(y, x, cens)
        >>> res = model.test_beta([0], [0])
        >>> res
        (1.4657739632606308, 0.22601365256959183)

        # Test that the slope is 0 in a model with an intercept
        >>> data = sm.datasets.heart.load(as_pandas=False)
        >>> y = np.log10(data.endog)
        >>> x = data.exog
        >>> cens = data.censors
        >>> model = sm.emplike.emplikeAFT(y, sm.add_constant(x), cens)
        >>> res = model.test_beta([0], [1])
        >>> res
        (4.623487775078047, 0.031537049752572731)
        """
        censors = self.model.censors
        endog = self.model.endog
        exog = self.model.exog
        uncensored = (censors == 1).flatten()
        censored = (censors == 0).flatten()
        uncens_endog = endog[uncensored]
        uncens_exog = exog[uncensored, :]
        reg_model = OLS(uncens_endog, uncens_exog).fit()
        llr, pval, new_weights = reg_model.el_test(
            b0_vals, param_nums, return_weights=True)  # Needs to be changed
        km = self.model._make_km(endog, censors).flatten()  # when merged
        uncens_nobs = self.model.uncens_nobs
        F = np.asarray(new_weights).reshape(uncens_nobs)
        # Step 0 ^
        params = self.params()
        survidx = np.where(censors == 0)
        survidx = survidx[0] - np.arange(len(survidx[0]))
        numcensbelow = np.int_(np.cumsum(1 - censors))
        if len(param_nums) == len(params):
            llr = self._EM_test([], F=F, params=params,
                                param_nums=param_nums,
                                b0_vals=b0_vals, survidx=survidx,
                                uncens_nobs=uncens_nobs,
                                numcensbelow=numcensbelow, km=km,
                                uncensored=uncensored, censored=censored,
                                ftol=ftol, maxiter=25)
            return llr, chi2.sf(llr, self.model.nvar)
        else:
            x0 = np.delete(params, param_nums)
            try:
                res = optimize.fmin(self._EM_test, x0,
                                    (params, param_nums, b0_vals, F, survidx,
                                     uncens_nobs, numcensbelow, km,
                                     uncensored, censored, maxiter, ftol),
                                    full_output=1, disp=0)
                llr = res[1]
                return llr, chi2.sf(llr, len(param_nums))
            except np.linalg.LinAlgError:
                return np.inf, 0

    def ci_beta(self, param_num, beta_high, beta_low, sig=.05):
        """
        Returns the confidence interval for a regression
        parameter in the AFT model.

        Parameters
        ----------

        param_num: int
            Parameter number of interest

        beta_high: float
            Upper bound for the confidence interval

        beta_low: float
            Lower bound for the confidence interval

        sig: float, optional
            Significance level.  Default is .05

        Notes
        -----
        If the function returns the error 'f(a) and f(b) must have
        different signs' (raised by scipy's brentq root-finder), consider
        widening the search area by adjusting beta_low and beta_high.

        Also note that this process is computationally intensive.  There
        are 4 levels of optimization/solving.  From outer to inner:

        1) Solving so that llr - critical value = 0
        2) Maximizing over the nuisance parameters
        3) Using the EM algorithm at each value of the nuisance parameters
        4) Using the _modif_newton optimizer at each iteration
           of the EM algorithm.

        Also, for very unlikely nuisance parameters, it is possible for
        the EM algorithm not to converge.  This is not an indicator
        that the solver did not find the correct solution.  It just means
        that, for a specific iteration of the nuisance parameters, the
        optimizer was unable to converge.

        If the user desires to verify the success of the optimization,
        it is recommended to test the limits using test_beta.
        """
        params = self.params()
        self.r0 = chi2.ppf(1 - sig, 1)
        ll = optimize.brentq(self._ci_limits_beta, beta_low,
                             params[param_num], (param_num,))
        ul = optimize.brentq(self._ci_limits_beta,
                             params[param_num], beta_high, (param_num,))
        return ll, ul
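        # Usage sketch (hypothetical bracketing values; beta_low and
        # beta_high must straddle the confidence limits for brentq to see
        # a sign change):
        #
        #   >>> fitted = model.fit()
        #   >>> lower, upper = fitted.ci_beta(0, beta_high=1., beta_low=-1.)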