Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/duration/hazard_regression.py : 8%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Implementation of proportional hazards regression models for duration
3data that may be censored ("Cox models").
5References
6----------
7T Therneau (1996). Extending the Cox model. Technical report.
8http://www.mayo.edu/research/documents/biostat-58pdf/DOC-10027288
10G Rodriguez (2005). Non-parametric estimation in survival models.
11http://data.princeton.edu/pop509/NonParametricSurvival.pdf
13B Gillespie (2006). Checking the assumptions in the Cox proportional
14hazards model.
15http://www.mwsug.org/proceedings/2006/stats/MWSUG-2006-SD08.pdf
16"""
17import numpy as np
19from statsmodels.base import model
20import statsmodels.base.model as base
21from statsmodels.tools.decorators import cache_readonly
22from statsmodels.compat.pandas import Appender
25_predict_docstring = """
26 Returns predicted values from the proportional hazards
27 regression model.
29 Parameters
30 ----------%(params_doc)s
31 exog : array_like
32 Data to use as `exog` in forming predictions. If not
33 provided, the `exog` values from the model used to fit the
34 data are used.%(cov_params_doc)s
35 endog : array_like
36 Duration (time) values at which the predictions are made.
37 Only used if pred_type is either 'cumhaz' or 'surv'. If
38 using model `exog`, defaults to model `endog` (time), but
39 may be provided explicitly to make predictions at
40 alternative times.
41 strata : array_like
42 A vector of stratum values used to form the predictions.
43 Not used (may be 'None') if pred_type is 'lhr' or 'hr'.
44 If `exog` is None, the model stratum values are used. If
45 `exog` is not None and pred_type is 'surv' or 'cumhaz',
46 stratum values must be provided (unless there is only one
47 stratum).
48 offset : array_like
49 Offset values used to create the predicted values.
50 pred_type : str
51 If 'lhr', returns log hazard ratios, if 'hr' returns
52 hazard ratios, if 'surv' returns the survival function, if
53 'cumhaz' returns the cumulative hazard function.
55 Returns
56 -------
57 A bunch containing two fields: `predicted_values` and
58 `standard_errors`.
60 Notes
61 -----
62 Standard errors are only returned when predicting the log
63 hazard ratio (pred_type is 'lhr').
65 Types `surv` and `cumhaz` require estimation of the cumulative
66 hazard function.
67"""
69_predict_params_doc = """
70 params : array_like
71 The proportional hazards model parameters."""
73_predict_cov_params_docstring = """
74 cov_params : array_like
75 The covariance matrix of the estimated `params` vector,
76 used to obtain prediction errors if pred_type='lhr',
77 otherwise optional."""
81class PHSurvivalTime(object):
83 def __init__(self, time, status, exog, strata=None, entry=None,
84 offset=None):
85 """
86 Represent a collection of survival times with possible
87 stratification and left truncation.
89 Parameters
90 ----------
91 time : array_like
92 The times at which either the event (failure) occurs or
93 the observation is censored.
94 status : array_like
95 Indicates whether the event (failure) occurs at `time`
96 (`status` is 1), or if `time` is a censoring time (`status`
97 is 0).
98 exog : array_like
99 The exogeneous (covariate) data matrix, cases are rows and
100 variables are columns.
101 strata : array_like
102 Grouping variable defining the strata. If None, all
103 observations are in a single stratum.
104 entry : array_like
105 Entry (left truncation) times. The observation is not
106 part of the risk set for times before the entry time. If
107 None, the entry time is treated as being zero, which
108 gives no left truncation. The entry time must be less
109 than or equal to `time`.
110 offset : array_like
111 An optional array of offsets
112 """
114 # Default strata
115 if strata is None:
116 strata = np.zeros(len(time), dtype=np.int32)
118 # Default entry times
119 if entry is None:
120 entry = np.zeros(len(time))
122 # Parameter validity checks.
123 n1, n2, n3, n4 = len(time), len(status), len(strata),\
124 len(entry)
125 nv = [n1, n2, n3, n4]
126 if max(nv) != min(nv):
127 raise ValueError("endog, status, strata, and " +
128 "entry must all have the same length")
129 if min(time) < 0:
130 raise ValueError("endog must be non-negative")
131 if min(entry) < 0:
132 raise ValueError("entry time must be non-negative")
134 # In Stata, this is entry >= time, in R it is >.
135 if np.any(entry > time):
136 raise ValueError("entry times may not occur " +
137 "after event or censoring times")
139 # Get the row indices for the cases in each stratum
140 stu = np.unique(strata)
141 #sth = {x: [] for x in stu} # needs >=2.7
142 sth = dict([(x, []) for x in stu])
143 for i,k in enumerate(strata):
144 sth[k].append(i)
145 stratum_rows = [np.asarray(sth[k], dtype=np.int32) for k in stu]
146 stratum_names = stu
148 # Remove strata with no events
149 ix = [i for i,ix in enumerate(stratum_rows) if status[ix].sum() > 0]
150 self.nstrat_orig = len(stratum_rows)
151 stratum_rows = [stratum_rows[i] for i in ix]
152 stratum_names = [stratum_names[i] for i in ix]
154 # The number of strata
155 nstrat = len(stratum_rows)
156 self.nstrat = nstrat
158 # Remove subjects whose entry time occurs after the last event
159 # in their stratum.
160 for stx,ix in enumerate(stratum_rows):
161 last_failure = max(time[ix][status[ix] == 1])
163 # Stata uses < here, R uses <=
164 ii = [i for i,t in enumerate(entry[ix]) if
165 t <= last_failure]
166 stratum_rows[stx] = stratum_rows[stx][ii]
168 # Remove subjects who are censored before the first event in
169 # their stratum.
170 for stx,ix in enumerate(stratum_rows):
171 first_failure = min(time[ix][status[ix] == 1])
173 ii = [i for i,t in enumerate(time[ix]) if
174 t >= first_failure]
175 stratum_rows[stx] = stratum_rows[stx][ii]
177 # Order by time within each stratum
178 for stx,ix in enumerate(stratum_rows):
179 ii = np.argsort(time[ix])
180 stratum_rows[stx] = stratum_rows[stx][ii]
182 if offset is not None:
183 self.offset_s = []
184 for stx in range(nstrat):
185 self.offset_s.append(offset[stratum_rows[stx]])
186 else:
187 self.offset_s = None
189 # Number of informative subjects
190 self.n_obs = sum([len(ix) for ix in stratum_rows])
192 # Split everything by stratum
193 self.time_s = []
194 self.exog_s = []
195 self.status_s = []
196 self.entry_s = []
197 for ix in stratum_rows:
198 self.time_s.append(time[ix])
199 self.exog_s.append(exog[ix,:])
200 self.status_s.append(status[ix])
201 self.entry_s.append(entry[ix])
203 self.stratum_rows = stratum_rows
204 self.stratum_names = stratum_names
206 # Precalculate some indices needed to fit Cox models.
207 # Distinct failure times within a stratum are always taken to
208 # be sorted in ascending order.
209 #
210 # ufailt_ix[stx][k] is a list of indices for subjects who fail
211 # at the k^th sorted unique failure time in stratum stx
212 #
213 # risk_enter[stx][k] is a list of indices for subjects who
214 # enter the risk set at the k^th sorted unique failure time in
215 # stratum stx
216 #
217 # risk_exit[stx][k] is a list of indices for subjects who exit
218 # the risk set at the k^th sorted unique failure time in
219 # stratum stx
220 self.ufailt_ix, self.risk_enter, self.risk_exit, self.ufailt =\
221 [], [], [], []
223 for stx in range(self.nstrat):
225 # All failure times
226 ift = np.flatnonzero(self.status_s[stx] == 1)
227 ft = self.time_s[stx][ift]
229 # Unique failure times
230 uft = np.unique(ft)
231 nuft = len(uft)
233 # Indices of cases that fail at each unique failure time
234 #uft_map = {x:i for i,x in enumerate(uft)} # requires >=2.7
235 uft_map = dict([(x, i) for i,x in enumerate(uft)]) # 2.6
236 uft_ix = [[] for k in range(nuft)]
237 for ix,ti in zip(ift,ft):
238 uft_ix[uft_map[ti]].append(ix)
240 # Indices of cases (failed or censored) that enter the
241 # risk set at each unique failure time.
242 risk_enter1 = [[] for k in range(nuft)]
243 for i,t in enumerate(self.time_s[stx]):
244 ix = np.searchsorted(uft, t, "right") - 1
245 if ix >= 0:
246 risk_enter1[ix].append(i)
248 # Indices of cases (failed or censored) that exit the
249 # risk set at each unique failure time.
250 risk_exit1 = [[] for k in range(nuft)]
251 for i,t in enumerate(self.entry_s[stx]):
252 ix = np.searchsorted(uft, t)
253 risk_exit1[ix].append(i)
255 self.ufailt.append(uft)
256 self.ufailt_ix.append([np.asarray(x, dtype=np.int32)
257 for x in uft_ix])
258 self.risk_enter.append([np.asarray(x, dtype=np.int32)
259 for x in risk_enter1])
260 self.risk_exit.append([np.asarray(x, dtype=np.int32)
261 for x in risk_exit1])
264class PHReg(model.LikelihoodModel):
265 """
266 Cox Proportional Hazards Regression Model
268 The Cox PH Model is for right censored data.
270 Parameters
271 ----------
272 endog : array_like
273 The observed times (event or censoring)
274 exog : 2D array_like
275 The covariates or exogeneous variables
276 status : array_like
277 The censoring status values; status=1 indicates that an
278 event occurred (e.g. failure or death), status=0 indicates
279 that the observation was right censored. If None, defaults
280 to status=1 for all cases.
281 entry : array_like
282 The entry times, if left truncation occurs
283 strata : array_like
284 Stratum labels. If None, all observations are taken to be
285 in a single stratum.
286 ties : str
287 The method used to handle tied times, must be either 'breslow'
288 or 'efron'.
289 offset : array_like
290 Array of offset values
291 missing : str
292 The method used to handle missing data
294 Notes
295 -----
296 Proportional hazards regression models should not include an
297 explicit or implicit intercept. The effect of an intercept is
298 not identified using the partial likelihood approach.
300 `endog`, `event`, `strata`, `entry`, and the first dimension
301 of `exog` all must have the same length
302 """
304 def __init__(self, endog, exog, status=None, entry=None,
305 strata=None, offset=None, ties='breslow',
306 missing='drop', **kwargs):
308 # Default is no censoring
309 if status is None:
310 status = np.ones(len(endog))
312 super(PHReg, self).__init__(endog, exog, status=status,
313 entry=entry, strata=strata,
314 offset=offset, missing=missing,
315 **kwargs)
317 # endog and exog are automatically converted, but these are
318 # not
319 if self.status is not None:
320 self.status = np.asarray(self.status)
321 if self.entry is not None:
322 self.entry = np.asarray(self.entry)
323 if self.strata is not None:
324 self.strata = np.asarray(self.strata)
325 if self.offset is not None:
326 self.offset = np.asarray(self.offset)
328 self.surv = PHSurvivalTime(self.endog, self.status,
329 self.exog, self.strata,
330 self.entry, self.offset)
331 self.nobs = len(self.endog)
332 self.groups = None
334 # TODO: not used?
335 self.missing = missing
337 self.df_resid = (np.float(self.exog.shape[0] -
338 np.linalg.matrix_rank(self.exog)))
339 self.df_model = np.float(np.linalg.matrix_rank(self.exog))
341 ties = ties.lower()
342 if ties not in ("efron", "breslow"):
343 raise ValueError("`ties` must be either `efron` or " +
344 "`breslow`")
346 self.ties = ties
348 @classmethod
349 def from_formula(cls, formula, data, status=None, entry=None,
350 strata=None, offset=None, subset=None,
351 ties='breslow', missing='drop', *args, **kwargs):
352 """
353 Create a proportional hazards regression model from a formula
354 and dataframe.
356 Parameters
357 ----------
358 formula : str or generic Formula object
359 The formula specifying the model
360 data : array_like
361 The data for the model. See Notes.
362 status : array_like
363 The censoring status values; status=1 indicates that an
364 event occurred (e.g. failure or death), status=0 indicates
365 that the observation was right censored. If None, defaults
366 to status=1 for all cases.
367 entry : array_like
368 The entry times, if left truncation occurs
369 strata : array_like
370 Stratum labels. If None, all observations are taken to be
371 in a single stratum.
372 offset : array_like
373 Array of offset values
374 subset : array_like
375 An array-like object of booleans, integers, or index
376 values that indicate the subset of df to use in the
377 model. Assumes df is a `pandas.DataFrame`
378 ties : str
379 The method used to handle tied times, must be either 'breslow'
380 or 'efron'.
381 missing : str
382 The method used to handle missing data
383 args : extra arguments
384 These are passed to the model
385 kwargs : extra keyword arguments
386 These are passed to the model with one exception. The
387 ``eval_env`` keyword is passed to patsy. It can be either a
388 :class:`patsy:patsy.EvalEnvironment` object or an integer
389 indicating the depth of the namespace to use. For example, the
390 default ``eval_env=0`` uses the calling namespace. If you wish
391 to use a "clean" environment set ``eval_env=-1``.
393 Returns
394 -------
395 model : PHReg model instance
396 """
398 # Allow array arguments to be passed by column name.
399 if isinstance(status, str):
400 status = data[status]
401 if isinstance(entry, str):
402 entry = data[entry]
403 if isinstance(strata, str):
404 strata = data[strata]
405 if isinstance(offset, str):
406 offset = data[offset]
408 import re
409 terms = re.split(r"[+\-~]", formula)
410 for term in terms:
411 term = term.strip()
412 if term in ("0", "1"):
413 import warnings
414 warnings.warn("PHReg formulas should not include any '0' or '1' terms")
416 mod = super(PHReg, cls).from_formula(formula, data,
417 status=status, entry=entry, strata=strata,
418 offset=offset, subset=subset, ties=ties,
419 missing=missing, drop_cols=["Intercept"], *args,
420 **kwargs)
422 return mod
424 def fit(self, groups=None, **args):
425 """
426 Fit a proportional hazards regression model.
428 Parameters
429 ----------
430 groups : array_like
431 Labels indicating groups of observations that may be
432 dependent. If present, the standard errors account for
433 this dependence. Does not affect fitted values.
435 Returns
436 -------
437 PHRegResults
438 Returns a results instance.
439 """
441 # TODO process for missing values
442 if groups is not None:
443 if len(groups) != len(self.endog):
444 msg = ("len(groups) = %d and len(endog) = %d differ" %
445 (len(groups), len(self.endog)))
446 raise ValueError(msg)
447 self.groups = np.asarray(groups)
448 else:
449 self.groups = None
451 if 'disp' not in args:
452 args['disp'] = False
453 fit_rslts = super(PHReg, self).fit(**args)
455 if self.groups is None:
456 cov_params = fit_rslts.cov_params()
457 else:
458 cov_params = self.robust_covariance(fit_rslts.params)
460 results = PHRegResults(self, fit_rslts.params, cov_params)
462 return results
464 def fit_regularized(self, method="elastic_net", alpha=0.,
465 start_params=None, refit=False, **kwargs):
466 r"""
467 Return a regularized fit to a linear regression model.
469 Parameters
470 ----------
471 method : {'elastic_net'}
472 Only the `elastic_net` approach is currently implemented.
473 alpha : scalar or array_like
474 The penalty weight. If a scalar, the same penalty weight
475 applies to all variables in the model. If a vector, it
476 must have the same length as `params`, and contains a
477 penalty weight for each coefficient.
478 start_params : array_like
479 Starting values for `params`.
480 refit : bool
481 If True, the model is refit using only the variables that
482 have non-zero coefficients in the regularized fit. The
483 refitted model is not regularized.
484 **kwargs
485 Additional keyword arguments used to fit the model.
487 Returns
488 -------
489 PHRegResults
490 Returns a results instance.
492 Notes
493 -----
494 The penalty is the ``elastic net`` penalty, which is a
495 combination of L1 and L2 penalties.
497 The function that is minimized is:
499 .. math::
501 -loglike/n + alpha*((1-L1\_wt)*|params|_2^2/2 + L1\_wt*|params|_1)
503 where :math:`|*|_1` and :math:`|*|_2` are the L1 and L2 norms.
505 Post-estimation results are based on the same data used to
506 select variables, hence may be subject to overfitting biases.
508 The elastic_net method uses the following keyword arguments:
510 maxiter : int
511 Maximum number of iterations
512 L1_wt : float
513 Must be in [0, 1]. The L1 penalty has weight L1_wt and the
514 L2 penalty has weight 1 - L1_wt.
515 cnvrg_tol : float
516 Convergence threshold for line searches
517 zero_tol : float
518 Coefficients below this threshold are treated as zero.
519 """
521 from statsmodels.base.elastic_net import fit_elasticnet
523 if method != "elastic_net":
524 raise ValueError("method for fit_regularied must be elastic_net")
526 defaults = {"maxiter" : 50, "L1_wt" : 1, "cnvrg_tol" : 1e-10,
527 "zero_tol" : 1e-10}
528 defaults.update(kwargs)
530 return fit_elasticnet(self, method=method,
531 alpha=alpha,
532 start_params=start_params,
533 refit=refit,
534 **defaults)
537 def loglike(self, params):
538 """
539 Returns the log partial likelihood function evaluated at
540 `params`.
541 """
543 if self.ties == "breslow":
544 return self.breslow_loglike(params)
545 elif self.ties == "efron":
546 return self.efron_loglike(params)
548 def score(self, params):
549 """
550 Returns the score function evaluated at `params`.
551 """
553 if self.ties == "breslow":
554 return self.breslow_gradient(params)
555 elif self.ties == "efron":
556 return self.efron_gradient(params)
558 def hessian(self, params):
559 """
560 Returns the Hessian matrix of the log partial likelihood
561 function evaluated at `params`.
562 """
564 if self.ties == "breslow":
565 return self.breslow_hessian(params)
566 else:
567 return self.efron_hessian(params)
569 def breslow_loglike(self, params):
570 """
571 Returns the value of the log partial likelihood function
572 evaluated at `params`, using the Breslow method to handle tied
573 times.
574 """
576 surv = self.surv
578 like = 0.
580 # Loop over strata
581 for stx in range(surv.nstrat):
583 uft_ix = surv.ufailt_ix[stx]
584 exog_s = surv.exog_s[stx]
585 nuft = len(uft_ix)
587 linpred = np.dot(exog_s, params)
588 if surv.offset_s is not None:
589 linpred += surv.offset_s[stx]
590 linpred -= linpred.max()
591 e_linpred = np.exp(linpred)
593 xp0 = 0.
595 # Iterate backward through the unique failure times.
596 for i in range(nuft)[::-1]:
598 # Update for new cases entering the risk set.
599 ix = surv.risk_enter[stx][i]
600 xp0 += e_linpred[ix].sum()
602 # Account for all cases that fail at this point.
603 ix = uft_ix[i]
604 like += (linpred[ix] - np.log(xp0)).sum()
606 # Update for cases leaving the risk set.
607 ix = surv.risk_exit[stx][i]
608 xp0 -= e_linpred[ix].sum()
610 return like
612 def efron_loglike(self, params):
613 """
614 Returns the value of the log partial likelihood function
615 evaluated at `params`, using the Efron method to handle tied
616 times.
617 """
619 surv = self.surv
621 like = 0.
623 # Loop over strata
624 for stx in range(surv.nstrat):
626 # exog and linear predictor for this stratum
627 exog_s = surv.exog_s[stx]
628 linpred = np.dot(exog_s, params)
629 if surv.offset_s is not None:
630 linpred += surv.offset_s[stx]
631 linpred -= linpred.max()
632 e_linpred = np.exp(linpred)
634 xp0 = 0.
636 # Iterate backward through the unique failure times.
637 uft_ix = surv.ufailt_ix[stx]
638 nuft = len(uft_ix)
639 for i in range(nuft)[::-1]:
641 # Update for new cases entering the risk set.
642 ix = surv.risk_enter[stx][i]
643 xp0 += e_linpred[ix].sum()
644 xp0f = e_linpred[uft_ix[i]].sum()
646 # Account for all cases that fail at this point.
647 ix = uft_ix[i]
648 like += linpred[ix].sum()
650 m = len(ix)
651 J = np.arange(m, dtype=np.float64) / m
652 like -= np.log(xp0 - J*xp0f).sum()
654 # Update for cases leaving the risk set.
655 ix = surv.risk_exit[stx][i]
656 xp0 -= e_linpred[ix].sum()
658 return like
660 def breslow_gradient(self, params):
661 """
662 Returns the gradient of the log partial likelihood, using the
663 Breslow method to handle tied times.
664 """
666 surv = self.surv
668 grad = 0.
670 # Loop over strata
671 for stx in range(surv.nstrat):
673 # Indices of subjects in the stratum
674 strat_ix = surv.stratum_rows[stx]
676 # Unique failure times in the stratum
677 uft_ix = surv.ufailt_ix[stx]
678 nuft = len(uft_ix)
680 # exog and linear predictor for the stratum
681 exog_s = surv.exog_s[stx]
682 linpred = np.dot(exog_s, params)
683 if surv.offset_s is not None:
684 linpred += surv.offset_s[stx]
685 linpred -= linpred.max()
686 e_linpred = np.exp(linpred)
688 xp0, xp1 = 0., 0.
690 # Iterate backward through the unique failure times.
691 for i in range(nuft)[::-1]:
693 # Update for new cases entering the risk set.
694 ix = surv.risk_enter[stx][i]
695 if len(ix) > 0:
696 v = exog_s[ix,:]
697 xp0 += e_linpred[ix].sum()
698 xp1 += (e_linpred[ix][:,None] * v).sum(0)
700 # Account for all cases that fail at this point.
701 ix = uft_ix[i]
702 grad += (exog_s[ix,:] - xp1 / xp0).sum(0)
704 # Update for cases leaving the risk set.
705 ix = surv.risk_exit[stx][i]
706 if len(ix) > 0:
707 v = exog_s[ix,:]
708 xp0 -= e_linpred[ix].sum()
709 xp1 -= (e_linpred[ix][:,None] * v).sum(0)
711 return grad
713 def efron_gradient(self, params):
714 """
715 Returns the gradient of the log partial likelihood evaluated
716 at `params`, using the Efron method to handle tied times.
717 """
719 surv = self.surv
721 grad = 0.
723 # Loop over strata
724 for stx in range(surv.nstrat):
726 # Indices of cases in the stratum
727 strat_ix = surv.stratum_rows[stx]
729 # exog and linear predictor of the stratum
730 exog_s = surv.exog_s[stx]
731 linpred = np.dot(exog_s, params)
732 if surv.offset_s is not None:
733 linpred += surv.offset_s[stx]
734 linpred -= linpred.max()
735 e_linpred = np.exp(linpred)
737 xp0, xp1 = 0., 0.
739 # Iterate backward through the unique failure times.
740 uft_ix = surv.ufailt_ix[stx]
741 nuft = len(uft_ix)
742 for i in range(nuft)[::-1]:
744 # Update for new cases entering the risk set.
745 ix = surv.risk_enter[stx][i]
746 if len(ix) > 0:
747 v = exog_s[ix,:]
748 xp0 += e_linpred[ix].sum()
749 xp1 += (e_linpred[ix][:,None] * v).sum(0)
750 ixf = uft_ix[i]
751 if len(ixf) > 0:
752 v = exog_s[ixf,:]
753 xp0f = e_linpred[ixf].sum()
754 xp1f = (e_linpred[ixf][:,None] * v).sum(0)
756 # Consider all cases that fail at this point.
757 grad += v.sum(0)
759 m = len(ixf)
760 J = np.arange(m, dtype=np.float64) / m
761 numer = xp1 - np.outer(J, xp1f)
762 denom = xp0 - np.outer(J, xp0f)
763 ratio = numer / denom
764 rsum = ratio.sum(0)
765 grad -= rsum
767 # Update for cases leaving the risk set.
768 ix = surv.risk_exit[stx][i]
769 if len(ix) > 0:
770 v = exog_s[ix,:]
771 xp0 -= e_linpred[ix].sum()
772 xp1 -= (e_linpred[ix][:,None] * v).sum(0)
774 return grad
776 def breslow_hessian(self, params):
777 """
778 Returns the Hessian of the log partial likelihood evaluated at
779 `params`, using the Breslow method to handle tied times.
780 """
782 surv = self.surv
784 hess = 0.
786 # Loop over strata
787 for stx in range(surv.nstrat):
789 uft_ix = surv.ufailt_ix[stx]
790 nuft = len(uft_ix)
792 exog_s = surv.exog_s[stx]
794 linpred = np.dot(exog_s, params)
795 if surv.offset_s is not None:
796 linpred += surv.offset_s[stx]
797 linpred -= linpred.max()
798 e_linpred = np.exp(linpred)
800 xp0, xp1, xp2 = 0., 0., 0.
802 # Iterate backward through the unique failure times.
803 for i in range(nuft)[::-1]:
805 # Update for new cases entering the risk set.
806 ix = surv.risk_enter[stx][i]
807 if len(ix) > 0:
808 xp0 += e_linpred[ix].sum()
809 v = exog_s[ix,:]
810 xp1 += (e_linpred[ix][:,None] * v).sum(0)
811 elx = e_linpred[ix]
812 xp2 += np.einsum("ij,ik,i->jk", v, v, elx)
814 # Account for all cases that fail at this point.
815 m = len(uft_ix[i])
816 hess += m*(xp2 / xp0 - np.outer(xp1, xp1) / xp0**2)
818 # Update for new cases entering the risk set.
819 ix = surv.risk_exit[stx][i]
820 if len(ix) > 0:
821 xp0 -= e_linpred[ix].sum()
822 v = exog_s[ix,:]
823 xp1 -= (e_linpred[ix][:,None] * v).sum(0)
824 elx = e_linpred[ix]
825 xp2 -= np.einsum("ij,ik,i->jk", v, v, elx)
826 return -hess
828 def efron_hessian(self, params):
829 """
830 Returns the Hessian matrix of the partial log-likelihood
831 evaluated at `params`, using the Efron method to handle tied
832 times.
833 """
835 surv = self.surv
837 hess = 0.
839 # Loop over strata
840 for stx in range(surv.nstrat):
842 exog_s = surv.exog_s[stx]
844 linpred = np.dot(exog_s, params)
845 if surv.offset_s is not None:
846 linpred += surv.offset_s[stx]
847 linpred -= linpred.max()
848 e_linpred = np.exp(linpred)
850 xp0, xp1, xp2 = 0., 0., 0.
852 # Iterate backward through the unique failure times.
853 uft_ix = surv.ufailt_ix[stx]
854 nuft = len(uft_ix)
855 for i in range(nuft)[::-1]:
857 # Update for new cases entering the risk set.
858 ix = surv.risk_enter[stx][i]
859 if len(ix) > 0:
860 xp0 += e_linpred[ix].sum()
861 v = exog_s[ix,:]
862 xp1 += (e_linpred[ix][:,None] * v).sum(0)
863 elx = e_linpred[ix]
864 xp2 += np.einsum("ij,ik,i->jk", v, v, elx)
866 ixf = uft_ix[i]
867 if len(ixf) > 0:
868 v = exog_s[ixf,:]
869 xp0f = e_linpred[ixf].sum()
870 xp1f = (e_linpred[ixf][:,None] * v).sum(0)
871 elx = e_linpred[ixf]
872 xp2f = np.einsum("ij,ik,i->jk", v, v, elx)
874 # Account for all cases that fail at this point.
875 m = len(uft_ix[i])
876 J = np.arange(m, dtype=np.float64) / m
877 c0 = xp0 - J*xp0f
878 hess += xp2 * np.sum(1 / c0)
879 hess -= xp2f * np.sum(J / c0)
880 mat = (xp1[None, :] - np.outer(J, xp1f)) / c0[:, None]
881 hess -= np.einsum("ij,ik->jk", mat, mat)
883 # Update for new cases entering the risk set.
884 ix = surv.risk_exit[stx][i]
885 if len(ix) > 0:
886 xp0 -= e_linpred[ix].sum()
887 v = exog_s[ix,:]
888 xp1 -= (e_linpred[ix][:,None] * v).sum(0)
889 elx = e_linpred[ix]
890 xp2 -= np.einsum("ij,ik,i->jk", v, v, elx)
892 return -hess
894 def robust_covariance(self, params):
895 """
896 Returns a covariance matrix for the proportional hazards model
897 regresion coefficient estimates that is robust to certain
898 forms of model misspecification.
900 Parameters
901 ----------
902 params : ndarray
903 The parameter vector at which the covariance matrix is
904 calculated.
906 Returns
907 -------
908 The robust covariance matrix as a square ndarray.
910 Notes
911 -----
912 This function uses the `groups` argument to determine groups
913 within which observations may be dependent. The covariance
914 matrix is calculated using the Huber-White "sandwich" approach.
915 """
917 if self.groups is None:
918 raise ValueError("`groups` must be specified to calculate the robust covariance matrix")
920 hess = self.hessian(params)
922 score_obs = self.score_residuals(params)
924 # Collapse
925 grads = {}
926 for i,g in enumerate(self.groups):
927 if g not in grads:
928 grads[g] = 0.
929 grads[g] += score_obs[i, :]
930 grads = np.asarray(list(grads.values()))
932 mat = grads[None, :, :]
933 mat = mat.T * mat
934 mat = mat.sum(1)
936 hess_inv = np.linalg.inv(hess)
937 cmat = np.dot(hess_inv, np.dot(mat, hess_inv))
939 return cmat
941 def score_residuals(self, params):
942 """
943 Returns the score residuals calculated at a given vector of
944 parameters.
946 Parameters
947 ----------
948 params : ndarray
949 The parameter vector at which the score residuals are
950 calculated.
952 Returns
953 -------
954 The score residuals, returned as a ndarray having the same
955 shape as `exog`.
957 Notes
958 -----
959 Observations in a stratum with no observed events have undefined
960 score residuals, and contain NaN in the returned matrix.
961 """
963 surv = self.surv
965 score_resid = np.zeros(self.exog.shape, dtype=np.float64)
967 # Use to set undefined values to NaN.
968 mask = np.zeros(self.exog.shape[0], dtype=np.int32)
970 w_avg = self.weighted_covariate_averages(params)
972 # Loop over strata
973 for stx in range(surv.nstrat):
975 uft_ix = surv.ufailt_ix[stx]
976 exog_s = surv.exog_s[stx]
977 nuft = len(uft_ix)
978 strat_ix = surv.stratum_rows[stx]
980 xp0 = 0.
982 linpred = np.dot(exog_s, params)
983 if surv.offset_s is not None:
984 linpred += surv.offset_s[stx]
985 linpred -= linpred.max()
986 e_linpred = np.exp(linpred)
988 at_risk_ix = set([])
990 # Iterate backward through the unique failure times.
991 for i in range(nuft)[::-1]:
993 # Update for new cases entering the risk set.
994 ix = surv.risk_enter[stx][i]
995 at_risk_ix |= set(ix)
996 xp0 += e_linpred[ix].sum()
998 atr_ix = list(at_risk_ix)
999 leverage = exog_s[atr_ix, :] - w_avg[stx][i, :]
1001 # Event indicators
1002 d = np.zeros(exog_s.shape[0])
1003 d[uft_ix[i]] = 1
1005 # The increment in the cumulative hazard
1006 dchaz = len(uft_ix[i]) / xp0
1008 # Piece of the martingale residual
1009 mrp = d[atr_ix] - e_linpred[atr_ix] * dchaz
1011 # Update the score residuals
1012 ii = strat_ix[atr_ix]
1013 score_resid[ii,:] += leverage * mrp[:, None]
1014 mask[ii] = 1
1016 # Update for cases leaving the risk set.
1017 ix = surv.risk_exit[stx][i]
1018 at_risk_ix -= set(ix)
1019 xp0 -= e_linpred[ix].sum()
1021 jj = np.flatnonzero(mask == 0)
1022 if len(jj) > 0:
1023 score_resid[jj, :] = np.nan
1025 return score_resid
1027 def weighted_covariate_averages(self, params):
1028 """
1029 Returns the hazard-weighted average of covariate values for
1030 subjects who are at-risk at a particular time.
1032 Parameters
1033 ----------
1034 params : ndarray
1035 Parameter vector
1037 Returns
1038 -------
1039 averages : list of ndarrays
1040 averages[stx][i,:] is a row vector containing the weighted
1041 average values (for all the covariates) of at-risk
1042 subjects a the i^th largest observed failure time in
1043 stratum `stx`, using the hazard multipliers as weights.
1045 Notes
1046 -----
1047 Used to calculate leverages and score residuals.
1048 """
1050 surv = self.surv
1052 averages = []
1053 xp0, xp1 = 0., 0.
1055 # Loop over strata
1056 for stx in range(surv.nstrat):
1058 uft_ix = surv.ufailt_ix[stx]
1059 exog_s = surv.exog_s[stx]
1060 nuft = len(uft_ix)
1062 average_s = np.zeros((len(uft_ix), exog_s.shape[1]),
1063 dtype=np.float64)
1065 linpred = np.dot(exog_s, params)
1066 if surv.offset_s is not None:
1067 linpred += surv.offset_s[stx]
1068 linpred -= linpred.max()
1069 e_linpred = np.exp(linpred)
1071 # Iterate backward through the unique failure times.
1072 for i in range(nuft)[::-1]:
1074 # Update for new cases entering the risk set.
1075 ix = surv.risk_enter[stx][i]
1076 xp0 += e_linpred[ix].sum()
1077 xp1 += np.dot(e_linpred[ix], exog_s[ix, :])
1079 average_s[i, :] = xp1 / xp0
1081 # Update for cases leaving the risk set.
1082 ix = surv.risk_exit[stx][i]
1083 xp0 -= e_linpred[ix].sum()
1084 xp1 -= np.dot(e_linpred[ix], exog_s[ix, :])
1086 averages.append(average_s)
1088 return averages
1090 def baseline_cumulative_hazard(self, params):
1091 """
1092 Estimate the baseline cumulative hazard and survival
1093 functions.
1095 Parameters
1096 ----------
1097 params : ndarray
1098 The model parameters.
1100 Returns
1101 -------
1102 A list of triples (time, hazard, survival) containing the time
1103 values and corresponding cumulative hazard and survival
1104 function values for each stratum.
1106 Notes
1107 -----
1108 Uses the Nelson-Aalen estimator.
1109 """
1111 # TODO: some disagreements with R, not the same algorithm but
1112 # hard to deduce what R is doing. Our results are reasonable.
1114 surv = self.surv
1115 rslt = []
1117 # Loop over strata
1118 for stx in range(surv.nstrat):
1120 uft = surv.ufailt[stx]
1121 uft_ix = surv.ufailt_ix[stx]
1122 exog_s = surv.exog_s[stx]
1123 nuft = len(uft_ix)
1125 linpred = np.dot(exog_s, params)
1126 if surv.offset_s is not None:
1127 linpred += surv.offset_s[stx]
1128 e_linpred = np.exp(linpred)
1130 xp0 = 0.
1131 h0 = np.zeros(nuft, dtype=np.float64)
1133 # Iterate backward through the unique failure times.
1134 for i in range(nuft)[::-1]:
1136 # Update for new cases entering the risk set.
1137 ix = surv.risk_enter[stx][i]
1138 xp0 += e_linpred[ix].sum()
1140 # Account for all cases that fail at this point.
1141 ix = uft_ix[i]
1142 h0[i] = len(ix) / xp0
1144 # Update for cases leaving the risk set.
1145 ix = surv.risk_exit[stx][i]
1146 xp0 -= e_linpred[ix].sum()
1148 cumhaz = np.cumsum(h0) - h0
1149 current_strata_surv = np.exp(-cumhaz)
1150 rslt.append([uft, cumhaz, current_strata_surv])
1152 return rslt
1154 def baseline_cumulative_hazard_function(self, params):
1155 """
1156 Returns a function that calculates the baseline cumulative
1157 hazard function for each stratum.
1159 Parameters
1160 ----------
1161 params : ndarray
1162 The model parameters.
1164 Returns
1165 -------
1166 A dict mapping stratum names to the estimated baseline
1167 cumulative hazard function.
1168 """
1170 from scipy.interpolate import interp1d
1171 surv = self.surv
1172 base = self.baseline_cumulative_hazard(params)
1174 cumhaz_f = {}
1175 for stx in range(surv.nstrat):
1176 time_h = base[stx][0]
1177 cumhaz = base[stx][1]
1178 time_h = np.r_[-np.inf, time_h, np.inf]
1179 cumhaz = np.r_[cumhaz[0], cumhaz, cumhaz[-1]]
1180 func = interp1d(time_h, cumhaz, kind='zero')
1181 cumhaz_f[self.surv.stratum_names[stx]] = func
1183 return cumhaz_f
1185 @Appender(_predict_docstring % {
1186 'params_doc': _predict_params_doc,
1187 'cov_params_doc': _predict_cov_params_docstring})
1188 def predict(self, params, exog=None, cov_params=None, endog=None,
1189 strata=None, offset=None, pred_type="lhr"):
1191 pred_type = pred_type.lower()
1192 if pred_type not in ["lhr", "hr", "surv", "cumhaz"]:
1193 msg = "Type %s not allowed for prediction" % pred_type
1194 raise ValueError(msg)
1196 class bunch:
1197 predicted_values = None
1198 standard_errors = None
1199 ret_val = bunch()
1201 # Do not do anything with offset here because we want to allow
1202 # different offsets to be specified even if exog is the model
1203 # exog.
1204 exog_provided = True
1205 if exog is None:
1206 exog = self.exog
1207 exog_provided = False
1209 lhr = np.dot(exog, params)
1210 if offset is not None:
1211 lhr += offset
1212 # Never use self.offset unless we are also using self.exog
1213 elif self.offset is not None and not exog_provided:
1214 lhr += self.offset
1216 # Handle lhr and hr prediction first, since they do not make
1217 # use of the hazard function.
1219 if pred_type == "lhr":
1220 ret_val.predicted_values = lhr
1221 if cov_params is not None:
1222 mat = np.dot(exog, cov_params)
1223 va = (mat * exog).sum(1)
1224 ret_val.standard_errors = np.sqrt(va)
1225 return ret_val
1227 hr = np.exp(lhr)
1229 if pred_type == "hr":
1230 ret_val.predicted_values = hr
1231 return ret_val
1233 # Makes sure endog is defined
1234 if endog is None and exog_provided:
1235 msg = "If `exog` is provided `endog` must be provided."
1236 raise ValueError(msg)
1237 # Use model endog if using model exog
1238 elif endog is None and not exog_provided:
1239 endog = self.endog
1241 # Make sure strata is defined
1242 if strata is None:
1243 if exog_provided and self.surv.nstrat > 1:
1244 raise ValueError("`strata` must be provided")
1245 if self.strata is None:
1246 strata = [self.surv.stratum_names[0],] * len(endog)
1247 else:
1248 strata = self.strata
1250 cumhaz = np.nan * np.ones(len(endog), dtype=np.float64)
1251 stv = np.unique(strata)
1252 bhaz = self.baseline_cumulative_hazard_function(params)
1253 for stx in stv:
1254 ix = np.flatnonzero(strata == stx)
1255 func = bhaz[stx]
1256 cumhaz[ix] = func(endog[ix]) * hr[ix]
1258 if pred_type == "cumhaz":
1259 ret_val.predicted_values = cumhaz
1261 elif pred_type == "surv":
1262 ret_val.predicted_values = np.exp(-cumhaz)
1264 return ret_val
1266 def get_distribution(self, params):
1267 """
1268 Returns a scipy distribution object corresponding to the
1269 distribution of uncensored endog (duration) values for each
1270 case.
1272 Parameters
1273 ----------
1274 params : array_like
1275 The proportional hazards model parameters.
1277 Returns
1278 -------
1279 A list of objects of type scipy.stats.distributions.rv_discrete
1281 Notes
1282 -----
1283 The distributions are obtained from a simple discrete estimate
1284 of the survivor function that puts all mass on the observed
1285 failure times within a stratum.
1286 """
1288 # TODO: this returns a Python list of rv_discrete objects, so
1289 # nothing can be vectorized. It appears that rv_discrete does
1290 # not allow vectorization.
1292 surv = self.surv
1293 bhaz = self.baseline_cumulative_hazard(params)
1295 # The arguments to rv_discrete_float, first obtained by
1296 # stratum
1297 pk, xk = [], []
1299 for stx in range(self.surv.nstrat):
1301 exog_s = surv.exog_s[stx]
1303 linpred = np.dot(exog_s, params)
1304 if surv.offset_s is not None:
1305 linpred += surv.offset_s[stx]
1306 e_linpred = np.exp(linpred)
1308 # The unique failure times for this stratum (the support
1309 # of the distribution).
1310 pts = bhaz[stx][0]
1312 # The individual cumulative hazards for everyone in this
1313 # stratum.
1314 ichaz = np.outer(e_linpred, bhaz[stx][1])
1316 # The individual survival functions.
1317 usurv = np.exp(-ichaz)
1318 z = np.zeros((usurv.shape[0], 1))
1319 usurv = np.concatenate((usurv, z), axis=1)
1321 # The individual survival probability masses.
1322 probs = -np.diff(usurv, 1)
1324 pk.append(probs)
1325 xk.append(np.outer(np.ones(probs.shape[0]), pts))
1327 # Pad to make all strata have the same shape
1328 mxc = max([x.shape[1] for x in xk])
1329 for k in range(self.surv.nstrat):
1330 if xk[k].shape[1] < mxc:
1331 xk1 = np.zeros((xk[k].shape[0], mxc))
1332 pk1 = np.zeros((pk[k].shape[0], mxc))
1333 xk1[:, 0:xk[k].shape[1]] = xk[k]
1334 pk1[:, 0:pk[k].shape[1]] = pk[k]
1335 xk[k], pk[k] = xk1, pk1
1337 # Put the support points and probabilities into single matrices
1338 xka = np.nan * np.ones((len(self.endog), mxc))
1339 pka = np.ones((len(self.endog), mxc), dtype=np.float64) / mxc
1340 for stx in range(self.surv.nstrat):
1341 ix = self.surv.stratum_rows[stx]
1342 xka[ix, :] = xk[stx]
1343 pka[ix, :] = pk[stx]
1345 dist = rv_discrete_float(xka, pka)
1347 return dist
1350class PHRegResults(base.LikelihoodModelResults):
1351 '''
1352 Class to contain results of fitting a Cox proportional hazards
1353 survival model.
1355 PHregResults inherits from statsmodels.LikelihoodModelResults
1357 Parameters
1358 ----------
1359 See statsmodels.LikelihoodModelResults
1361 Attributes
1362 ----------
1363 model : class instance
1364 PHreg model instance that called fit.
1365 normalized_cov_params : ndarray
1366 The sampling covariance matrix of the estimates
1367 params : ndarray
1368 The coefficients of the fitted model. Each coefficient is the
1369 log hazard ratio corresponding to a 1 unit difference in a
1370 single covariate while holding the other covariates fixed.
1371 bse : ndarray
1372 The standard errors of the fitted parameters.
1374 See Also
1375 --------
1376 statsmodels.LikelihoodModelResults
1377 '''
1379 def __init__(self, model, params, cov_params, scale=1., covariance_type="naive"):
1381 # There is no scale parameter, but we need it for
1382 # meta-procedures that work with results.
1384 self.covariance_type = covariance_type
1385 self.df_resid = model.df_resid
1386 self.df_model = model.df_model
1388 super(PHRegResults, self).__init__(model, params, scale=1.,
1389 normalized_cov_params=cov_params)
1391 @cache_readonly
1392 def standard_errors(self):
1393 """
1394 Returns the standard errors of the parameter estimates.
1395 """
1396 return np.sqrt(np.diag(self.cov_params()))
1398 @cache_readonly
1399 def bse(self):
1400 """
1401 Returns the standard errors of the parameter estimates.
1402 """
1403 return self.standard_errors
1405 def get_distribution(self):
1406 """
1407 Returns a scipy distribution object corresponding to the
1408 distribution of uncensored endog (duration) values for each
1409 case.
1411 Returns
1412 -------
1413 A list of objects of type scipy.stats.distributions.rv_discrete
1415 Notes
1416 -----
1417 The distributions are obtained from a simple discrete estimate
1418 of the survivor function that puts all mass on the observed
1419 failure times within a stratum.
1420 """
1422 return self.model.get_distribution(self.params)
1424 @Appender(_predict_docstring % {'params_doc': '', 'cov_params_doc': ''})
1425 def predict(self, endog=None, exog=None, strata=None,
1426 offset=None, transform=True, pred_type="lhr"):
1427 return super(PHRegResults, self).predict(exog=exog,
1428 transform=transform,
1429 cov_params=self.cov_params(),
1430 endog=endog,
1431 strata=strata,
1432 offset=offset,
1433 pred_type=pred_type)
1435 def _group_stats(self, groups):
1436 """
1437 Descriptive statistics of the groups.
1438 """
1439 gsizes = np.unique(groups, return_counts=True)
1440 gsizes = gsizes[1]
1441 return gsizes.min(), gsizes.max(), gsizes.mean(), len(gsizes)
1443 @cache_readonly
1444 def weighted_covariate_averages(self):
1445 """
1446 The average covariate values within the at-risk set at each
1447 event time point, weighted by hazard.
1448 """
1449 return self.model.weighted_covariate_averages(self.params)
1451 @cache_readonly
1452 def score_residuals(self):
1453 """
1454 A matrix containing the score residuals.
1455 """
1456 return self.model.score_residuals(self.params)
1458 @cache_readonly
1459 def baseline_cumulative_hazard(self):
1460 """
1461 A list (corresponding to the strata) containing the baseline
1462 cumulative hazard function evaluated at the event points.
1463 """
1464 return self.model.baseline_cumulative_hazard(self.params)
1466 @cache_readonly
1467 def baseline_cumulative_hazard_function(self):
1468 """
1469 A list (corresponding to the strata) containing function
1470 objects that calculate the cumulative hazard function.
1471 """
1472 return self.model.baseline_cumulative_hazard_function(self.params)
1474 @cache_readonly
1475 def schoenfeld_residuals(self):
1476 """
1477 A matrix containing the Schoenfeld residuals.
1479 Notes
1480 -----
1481 Schoenfeld residuals for censored observations are set to zero.
1482 """
1484 surv = self.model.surv
1485 w_avg = self.weighted_covariate_averages
1487 # Initialize at NaN since rows that belong to strata with no
1488 # events have undefined residuals.
1489 sch_resid = np.nan*np.ones(self.model.exog.shape, dtype=np.float64)
1491 # Loop over strata
1492 for stx in range(surv.nstrat):
1494 uft = surv.ufailt[stx]
1495 exog_s = surv.exog_s[stx]
1496 time_s = surv.time_s[stx]
1497 strat_ix = surv.stratum_rows[stx]
1499 ii = np.searchsorted(uft, time_s)
1501 # These subjects are censored after the last event in
1502 # their stratum, so have empty risk sets and undefined
1503 # residuals.
1504 jj = np.flatnonzero(ii < len(uft))
1506 sch_resid[strat_ix[jj], :] = exog_s[jj, :] - w_avg[stx][ii[jj], :]
1508 jj = np.flatnonzero(self.model.status == 0)
1509 sch_resid[jj, :] = np.nan
1511 return sch_resid
1513 @cache_readonly
1514 def martingale_residuals(self):
1515 """
1516 The martingale residuals.
1517 """
1519 surv = self.model.surv
1521 # Initialize at NaN since rows that belong to strata with no
1522 # events have undefined residuals.
1523 mart_resid = np.nan*np.ones(len(self.model.endog), dtype=np.float64)
1525 cumhaz_f_list = self.baseline_cumulative_hazard_function
1527 # Loop over strata
1528 for stx in range(surv.nstrat):
1530 cumhaz_f = cumhaz_f_list[stx]
1532 exog_s = surv.exog_s[stx]
1533 time_s = surv.time_s[stx]
1535 linpred = np.dot(exog_s, self.params)
1536 if surv.offset_s is not None:
1537 linpred += surv.offset_s[stx]
1538 e_linpred = np.exp(linpred)
1540 ii = surv.stratum_rows[stx]
1541 chaz = cumhaz_f(time_s)
1542 mart_resid[ii] = self.model.status[ii] - e_linpred * chaz
1544 return mart_resid
1546 def summary(self, yname=None, xname=None, title=None, alpha=.05):
1547 """
1548 Summarize the proportional hazards regression results.
1550 Parameters
1551 ----------
1552 yname : str, optional
1553 Default is `y`
1554 xname : list[str], optional
1555 Names for the exogenous variables, default is `x#` for ## in p the
1556 number of regressors. Must match the number of parameters in
1557 the model
1558 title : str, optional
1559 Title for the top table. If not None, then this replaces
1560 the default title
1561 alpha : float
1562 significance level for the confidence intervals
1564 Returns
1565 -------
1566 smry : Summary instance
1567 this holds the summary tables and text, which can be
1568 printed or converted to various output formats.
1570 See Also
1571 --------
1572 statsmodels.iolib.summary2.Summary : class to hold summary results
1573 """
1575 from statsmodels.iolib import summary2
1576 from collections import OrderedDict
1577 smry = summary2.Summary()
1578 float_format = "%8.3f"
1580 info = OrderedDict()
1581 info["Model:"] = "PH Reg"
1582 if yname is None:
1583 yname = self.model.endog_names
1584 info["Dependent variable:"] = yname
1585 info["Ties:"] = self.model.ties.capitalize()
1586 info["Sample size:"] = str(self.model.surv.n_obs)
1587 info["Num. events:"] = str(int(sum(self.model.status)))
1589 if self.model.groups is not None:
1590 mn, mx, avg, num = self._group_stats(self.model.groups)
1591 info["Num groups:"] = "%.0f" % num
1592 info["Min group size:"] = "%.0f" % mn
1593 info["Max group size:"] = "%.0f" % mx
1594 info["Avg group size:"] = "%.1f" % avg
1596 if self.model.strata is not None:
1597 mn, mx, avg, num = self._group_stats(self.model.strata)
1598 info["Num strata:"] = "%.0f" % num
1599 info["Min stratum size:"] = "%.0f" % mn
1600 info["Max stratum size:"] = "%.0f" % mx
1601 info["Avg stratum size:"] = "%.1f" % avg
1603 smry.add_dict(info, align='l', float_format=float_format)
1605 param = summary2.summary_params(self, alpha=alpha)
1606 param = param.rename(columns={"Coef.": "log HR",
1607 "Std.Err.": "log HR SE"})
1608 param.insert(2, "HR", np.exp(param["log HR"]))
1609 a = "[%.3f" % (alpha / 2)
1610 param.loc[:, a] = np.exp(param.loc[:, a])
1611 a = "%.3f]" % (1 - alpha / 2)
1612 param.loc[:, a] = np.exp(param.loc[:, a])
1613 if xname is not None:
1614 param.index = xname
1615 smry.add_df(param, float_format=float_format)
1616 smry.add_title(title=title, results=self)
1617 smry.add_text("Confidence intervals are for the hazard ratios")
1619 dstrat = self.model.surv.nstrat_orig - self.model.surv.nstrat
1620 if dstrat > 0:
1621 if dstrat == 1:
1622 smry.add_text("1 stratum dropped for having no events")
1623 else:
1624 smry.add_text("%d strata dropped for having no events" % dstrat)
1626 if self.model.entry is not None:
1627 n_entry = sum(self.model.entry != 0)
1628 if n_entry == 1:
1629 smry.add_text("1 observation has a positive entry time")
1630 else:
1631 smry.add_text("%d observations have positive entry times" % n_entry)
1633 if self.model.groups is not None:
1634 smry.add_text("Standard errors account for dependence within groups")
1636 if hasattr(self, "regularized"):
1637 smry.add_text("Standard errors do not account for the regularization")
1639 return smry
1642class rv_discrete_float(object):
1643 """
1644 A class representing a collection of discrete distributions.
1646 Parameters
1647 ----------
1648 xk : 2d array_like
1649 The support points, should be non-decreasing within each
1650 row.
1651 pk : 2d array_like
1652 The probabilities, should sum to one within each row.
1654 Notes
1655 -----
1656 Each row of `xk`, and the corresponding row of `pk` describe a
1657 discrete distribution.
1659 `xk` and `pk` should both be two-dimensional ndarrays. Each row
1660 of `pk` should sum to 1.
1662 This class is used as a substitute for scipy.distributions.
1663 rv_discrete, since that class does not allow non-integer support
1664 points, or vectorized operations.
1666 Only a limited number of methods are implemented here compared to
1667 the other scipy distribution classes.
1668 """
1670 def __init__(self, xk, pk):
1672 self.xk = xk
1673 self.pk = pk
1674 self.cpk = np.cumsum(self.pk, axis=1)
1676 def rvs(self):
1677 """
1678 Returns a random sample from the discrete distribution.
1680 A vector is returned containing a single draw from each row of
1681 `xk`, using the probabilities of the corresponding row of `pk`
1682 """
1684 n = self.xk.shape[0]
1685 u = np.random.uniform(size=n)
1687 ix = (self.cpk < u[:, None]).sum(1)
1688 ii = np.arange(n, dtype=np.int32)
1689 return self.xk[(ii,ix)]
1691 def mean(self):
1692 """
1693 Returns a vector containing the mean values of the discrete
1694 distributions.
1696 A vector is returned containing the mean value of each row of
1697 `xk`, using the probabilities in the corresponding row of
1698 `pk`.
1699 """
1701 return (self.xk * self.pk).sum(1)
1703 def var(self):
1704 """
1705 Returns a vector containing the variances of the discrete
1706 distributions.
1708 A vector is returned containing the variance for each row of
1709 `xk`, using the probabilities in the corresponding row of
1710 `pk`.
1711 """
1713 mn = self.mean()
1714 xkc = self.xk - mn[:, None]
1716 return (self.pk * (self.xk - xkc)**2).sum(1)
1718 def std(self):
1719 """
1720 Returns a vector containing the standard deviations of the
1721 discrete distributions.
1723 A vector is returned containing the standard deviation for
1724 each row of `xk`, using the probabilities in the corresponding
1725 row of `pk`.
1726 """
1728 return np.sqrt(self.var())