818 lines
27 KiB
Python
818 lines
27 KiB
Python
|
import numpy as np
|
||
|
import pandas as pd
|
||
|
from scipy.stats.distributions import chi2, norm
|
||
|
from statsmodels.graphics import utils
|
||
|
|
||
|
|
||
|
def _calc_survfunc_right(time, status, weights=None, entry=None, compress=True,
                         retall=True):
    """
    Calculate the survival function and its standard error for a single
    group.

    Parameters
    ----------
    time : array_like
        Event or censoring times.
    status : array_like
        1 if the event occurred at the corresponding `time`, 0 if the
        observation was censored there.
    weights : array_like, optional
        Frequency weights.  When provided, Tsiatis standard errors are
        used in place of Greenwood's formula.
    entry : array_like, optional
        Entry times for handling left truncation.
    compress : bool
        If True, only time points where at least one event occurred are
        retained.
    retall : bool
        If False, return (sp, utime, rtime, n, d) without standard
        errors.

    Returns
    -------
    sp : survival probabilities at each retained time
    se : standard errors of `sp` (only when `retall` is True)
    utime : the unique times
    rtime : rank (index into `utime`) of each observation's time
    n : risk set size just prior to each time
    d : (possibly weighted) number of events at each time
    """

    # Convert the unique times to ranks (0, 1, 2, ...)
    if entry is None:
        utime, rtime = np.unique(time, return_inverse=True)
    else:
        # Pool event and entry times so the rank grid covers both, then
        # keep only the ranks belonging to the event/censoring times.
        tx = np.concatenate((time, entry))
        utime, rtime = np.unique(tx, return_inverse=True)
        rtime = rtime[0:len(time)]

    # Number of deaths at each unique time.
    ml = len(utime)
    if weights is None:
        d = np.bincount(rtime, weights=status, minlength=ml)
    else:
        d = np.bincount(rtime, weights=status*weights, minlength=ml)

    # Size of risk set just prior to each event time.
    if weights is None:
        n = np.bincount(rtime, minlength=ml)
    else:
        n = np.bincount(rtime, weights=weights, minlength=ml)
    if entry is not None:
        # With left truncation the risk set at t is the number of
        # subjects who have entered strictly before t minus those who
        # have already exited.
        n = np.cumsum(n) - n
        rentry = np.searchsorted(utime, entry, side='left')
        if weights is None:
            n0 = np.bincount(rentry, minlength=ml)
        else:
            n0 = np.bincount(rentry, weights=weights, minlength=ml)
        n0 = np.cumsum(n0) - n0
        n = n0 - n
    else:
        # Without entry times, everyone whose time is >= t is at risk
        # at t; a reversed cumulative sum gives this in one pass.
        n = np.cumsum(n[::-1])[::-1]

    # Only retain times where an event occurred.
    if compress:
        ii = np.flatnonzero(d > 0)
        d = d[ii]
        n = n[ii]
        utime = utime[ii]

    # The survival function probabilities.  The cumulative product is
    # computed on the log scale for numerical stability; factors below
    # 1e-16 are clamped so the log stays finite, then those positions
    # are set exactly to zero afterwards.
    sp = 1 - d / n.astype(np.float64)
    ii = sp < 1e-16
    sp[ii] = 1e-16
    sp = np.log(sp)
    sp = np.cumsum(sp)
    sp = np.exp(sp)
    sp[ii] = 0

    if not retall:
        return sp, utime, rtime, n, d

    # Standard errors
    if weights is None:
        # Greenwood's formula
        denom = n * (n - d)
        denom = np.clip(denom, 1e-12, np.inf)
        se = d / denom.astype(np.float64)
        # The variance is undefined where everyone at risk dies or the
        # risk set is empty.
        se[(n == d) | (n == 0)] = np.nan
        se = np.cumsum(se)
        se = np.sqrt(se)
        locs = np.isfinite(se) | (sp != 0)
        se[locs] *= sp[locs]
        se[~locs] = np.nan
    else:
        # Tsiatis' (1981) formula
        se = d / (n * n).astype(np.float64)
        se = np.cumsum(se)
        se = np.sqrt(se)

    return sp, se, utime, rtime, n, d
|
||
|
|
||
|
|
||
|
def _calc_incidence_right(time, status, weights=None):
    """
    Calculate the cumulative incidence function and its standard error.

    Parameters
    ----------
    time : array_like
        Event or censoring times.
    status : array_like
        0 indicates censoring; a value k >= 1 indicates that the event
        of cause k occurred at the corresponding time.
    weights : array_like, optional
        Frequency weights.  When provided, standard errors are not
        calculated and None is returned in their place.

    Returns
    -------
    ip : list of ndarray
        ip[k-1] holds the estimated cumulative incidence function for
        cause k, evaluated at each unique time.
    se : list of ndarray or None
        Pointwise standard errors for the entries of `ip`.
    utime : ndarray
        The unique times.
    """

    # Calculate the all-cause survival function.
    status0 = (status >= 1).astype(np.float64)
    sp, utime, rtime, n, d = _calc_survfunc_right(time, status0, weights,
                                                  compress=False,
                                                  retall=False)

    # Number of distinct causes (causes are coded 1, 2, ..., ngrp).
    ngrp = int(status.max())

    # Number of cause-specific deaths at each unique time.
    d = []
    for k in range(ngrp):
        status0 = (status == k + 1).astype(np.float64)
        if weights is None:
            d0 = np.bincount(rtime, weights=status0, minlength=len(utime))
        else:
            d0 = np.bincount(rtime, weights=status0*weights,
                             minlength=len(utime))
        d.append(d0)

    # The cumulative incidence function probabilities.
    ip = []
    # All-cause survival just before each time, divided by the risk set
    # size; this is the factor multiplying each cause-specific count.
    sp0 = np.r_[1, sp[:-1]] / n
    for k in range(ngrp):
        ip0 = np.cumsum(sp0 * d[k])
        ip.append(ip0)

    # The standard error of the cumulative incidence function.
    if weights is not None:
        # No analytic standard error is computed in the weighted case.
        return ip, None, utime
    se = []
    da = sum(d)  # all-cause event count at each time
    for k in range(ngrp):

        # Variance built by accumulating three components; the first
        # involves the all-cause hazard increments.
        ra = da / (n * (n - da))
        v = ip[k]**2 * np.cumsum(ra)
        v -= 2 * ip[k] * np.cumsum(ip[k] * ra)
        v += np.cumsum(ip[k]**2 * ra)

        # Second component: cause-specific binomial variation.
        ra = (n - d[k]) * d[k] / n
        v += np.cumsum(sp0**2 * ra)

        # Third component: covariance between the incidence estimate
        # and the cause-specific increments.
        ra = sp0 * d[k] / n
        v -= 2 * ip[k] * np.cumsum(ra)
        v += 2 * np.cumsum(ip[k] * ra)

        se.append(np.sqrt(v))

    return ip, se, utime
|
||
|
|
||
|
|
||
|
def _checkargs(time, status, entry, freq_weights, exog):
|
||
|
|
||
|
if len(time) != len(status):
|
||
|
raise ValueError("time and status must have the same length")
|
||
|
|
||
|
if entry is not None and (len(entry) != len(time)):
|
||
|
msg = "entry times and event times must have the same length"
|
||
|
raise ValueError(msg)
|
||
|
|
||
|
if entry is not None and np.any(entry >= time):
|
||
|
msg = "Entry times must not occur on or after event times"
|
||
|
raise ValueError(msg)
|
||
|
|
||
|
if freq_weights is not None and (len(freq_weights) != len(time)):
|
||
|
raise ValueError("weights, time and status must have the same length")
|
||
|
|
||
|
if exog is not None and (exog.shape[0] != len(time)):
|
||
|
raise ValueError("the rows of exog should align with time")
|
||
|
|
||
|
|
||
|
class CumIncidenceRight:
    """
    Estimation and inference for a cumulative incidence function.

    If J = 1, 2, ... indicates the event type, the cumulative
    incidence function for cause j is:

    I(t, j) = P(T <= t and J=j)

    Only right censoring is supported. If frequency weights are provided,
    the point estimate is returned without a standard error.

    Parameters
    ----------
    time : array_like
        An array of times (censoring times or event times)
    status : array_like
        If status >= 1 indicates which event occurred at time t. If
        status = 0, the subject was censored at time t.
    title : str
        Optional title used for plots and summary output.
    freq_weights : array_like
        Optional frequency weights
    exog : array_like
        Optional, if present used to account for violation of
        independent censoring.
    bw_factor : float
        Band-width multiplier for kernel-based estimation. Only
        used if exog is provided.
    dimred : bool
        If True, proportional hazards regression models are used to
        reduce exog to two columns by predicting overall events and
        censoring in two separate models. If False, exog is used
        directly for calculating kernel weights without dimension
        reduction.

    Attributes
    ----------
    times : array_like
        The distinct times at which the incidence rates are estimated
    cinc : list of arrays
        cinc[k-1] contains the estimated cumulative incidence rates
        for outcome k=1,2,...
    cinc_se : list of arrays
        The standard errors for the values in `cinc`. Not available when
        exog and/or frequency weights are provided.

    Notes
    -----
    When exog is provided, a local estimate of the cumulative incidence
    rate around each point is provided, and these are averaged to
    produce an estimate of the marginal cumulative incidence
    functions. The procedure is analogous to that described in Zeng
    (2004) for estimation of the marginal survival function. The
    approach removes bias resulting from dependent censoring when the
    censoring becomes independent conditioned on the columns of exog.

    References
    ----------
    The Stata stcompet procedure:
        http://www.stata-journal.com/sjpdf.html?articlenum=st0059

    Dinse, G. E. and M. G. Larson. 1986. A note on semi-Markov models
    for partially censored data. Biometrika 73: 379-386.

    Marubini, E. and M. G. Valsecchi. 1995. Analysing Survival Data
    from Clinical Trials and Observational Studies. Chichester, UK:
    John Wiley & Sons.

    D. Zeng (2004). Estimating marginal survival function by
    adjusting for dependent censoring using many covariates. Annals
    of Statistics 32:4.
    https://arxiv.org/pdf/math/0409180.pdf
    """

    def __init__(self, time, status, title=None, freq_weights=None,
                 exog=None, bw_factor=1., dimred=True):

        # entry and exog are not passed to _checkargs here: left
        # truncation is not supported by this class, and exog is
        # validated by the kernel code path below.
        _checkargs(time, status, None, freq_weights, None)
        time = self.time = np.asarray(time)
        status = self.status = np.asarray(status)
        if freq_weights is not None:
            freq_weights = self.freq_weights = np.asarray(freq_weights)

        if exog is not None:
            # Kernel-based estimation to adjust for dependent censoring.
            from ._kernel_estimates import _kernel_cumincidence
            exog = self.exog = np.asarray(exog)
            nobs = exog.shape[0]
            # Rule-of-thumb bandwidth ~ n^(-1/3), scaled by bw_factor.
            kw = nobs**(-1/3.0) * bw_factor
            # Gaussian-type kernel summed across covariate columns.
            kfunc = lambda x: np.exp(-x**2 / kw**2).sum(1)
            x = _kernel_cumincidence(time, status, exog, kfunc, freq_weights,
                                     dimred)
            self.times = x[0]
            self.cinc = x[1]
            # NOTE: cinc_se and title are not set on this code path.
            return

        x = _calc_incidence_right(time, status, freq_weights)
        self.cinc = x[0]
        self.cinc_se = x[1]
        self.times = x[2]
        self.title = "" if not title else title
|
||
|
|
||
|
|
||
|
class SurvfuncRight:
    """
    Estimation and inference for a survival function.

    The survival function S(t) = P(T > t) is the probability that an
    event time T is greater than t.

    This class currently only supports right censoring.

    Parameters
    ----------
    time : array_like
        An array of times (censoring times or event times)
    status : array_like
        Status at the event time, status==1 is the 'event'
        (e.g. death, failure), meaning that the event
        occurs at the given value in `time`; status==0
        indicates that censoring has occurred, meaning that
        the event occurs after the given value in `time`.
    entry : array_like, optional
        An array of entry times for handling
        left truncation (the subject is not in the risk set on or
        before the entry time)
    title : str
        Optional title used for plots and summary output.
    freq_weights : array_like
        Optional frequency weights
    exog : array_like
        Optional, if present used to account for violation of
        independent censoring.
    bw_factor : float
        Band-width multiplier for kernel-based estimation. Only used
        if exog is provided.

    Attributes
    ----------
    surv_prob : array_like
        The estimated value of the survivor function at each time
        point in `surv_times`.
    surv_prob_se : array_like
        The standard errors for the values in `surv_prob`. Not available
        if exog is provided.
    surv_times : array_like
        The points where the survival function changes.
    n_risk : array_like
        The number of subjects at risk just before each time value in
        `surv_times`. Not available if exog is provided.
    n_events : array_like
        The number of events (e.g. deaths) that occur at each point
        in `surv_times`. Not available if exog is provided.

    Notes
    -----
    If exog is None, the standard Kaplan-Meier estimator is used. If
    exog is not None, a local estimate of the marginal survival
    function around each point is constructed, and these are then
    averaged. This procedure gives an estimate of the marginal
    survival function that accounts for dependent censoring as long as
    the censoring becomes independent when conditioning on the
    covariates in exog. See Zeng et al. (2004) for details.

    References
    ----------
    D. Zeng (2004). Estimating marginal survival function by
    adjusting for dependent censoring using many covariates. Annals
    of Statistics 32:4.
    https://arxiv.org/pdf/math/0409180.pdf
    """

    def __init__(self, time, status, entry=None, title=None,
                 freq_weights=None, exog=None, bw_factor=1.):

        _checkargs(time, status, entry, freq_weights, exog)
        time = self.time = np.asarray(time)
        status = self.status = np.asarray(status)
        if freq_weights is not None:
            freq_weights = self.freq_weights = np.asarray(freq_weights)

        if entry is not None:
            entry = self.entry = np.asarray(entry)

        if exog is not None:
            # Kernel-based estimation to adjust for dependent censoring.
            if entry is not None:
                raise ValueError("exog and entry cannot both be present")
            from ._kernel_estimates import _kernel_survfunc
            exog = self.exog = np.asarray(exog)
            nobs = exog.shape[0]
            # Rule-of-thumb bandwidth ~ n^(-1/3), scaled by bw_factor.
            kw = nobs**(-1/3.0) * bw_factor
            # Gaussian-type kernel summed across covariate columns.
            kfunc = lambda x: np.exp(-x**2 / kw**2).sum(1)
            x = _kernel_survfunc(time, status, exog, kfunc, freq_weights)
            self.surv_prob = x[0]
            self.surv_times = x[1]
            # NOTE: surv_prob_se, n_risk, n_events and title are not
            # set on this code path.
            return

        x = _calc_survfunc_right(time, status, weights=freq_weights,
                                 entry=entry)

        self.surv_prob = x[0]
        self.surv_prob_se = x[1]
        self.surv_times = x[2]
        self.n_risk = x[4]
        self.n_events = x[5]
        self.title = "" if not title else title

    def plot(self, ax=None):
        """
        Plot the survival function.

        Examples
        --------
        Change the line color:

        >>> import statsmodels.api as sm
        >>> data = sm.datasets.get_rdataset("flchain", "survival").data
        >>> df = data.loc[data.sex == "F", :]
        >>> sf = sm.SurvfuncRight(df["futime"], df["death"])
        >>> fig = sf.plot()
        >>> ax = fig.get_axes()[0]
        >>> li = ax.get_lines()
        >>> li[0].set_color('purple')
        >>> li[1].set_color('purple')

        Do not show the censoring points:

        >>> fig = sf.plot()
        >>> ax = fig.get_axes()[0]
        >>> li = ax.get_lines()
        >>> li[1].set_visible(False)
        """

        return plot_survfunc(self, ax)

    def quantile(self, p):
        """
        Estimated quantile of a survival distribution.

        Parameters
        ----------
        p : float
            The probability point at which the quantile
            is determined.

        Returns the estimated quantile, or NaN when the survival
        function never drops below 1 - p.
        """

        # SAS uses a strict inequality here.
        ii = np.flatnonzero(self.surv_prob < 1 - p)

        if len(ii) == 0:
            return np.nan

        return self.surv_times[ii[0]]

    def quantile_ci(self, p, alpha=0.05, method='cloglog'):
        """
        Returns a confidence interval for a survival quantile.

        Parameters
        ----------
        p : float
            The probability point for which a confidence interval is
            determined.
        alpha : float
            The confidence interval has nominal coverage probability
            1 - `alpha`.
        method : str
            Function to use for g-transformation, must be one of
            'cloglog', 'linear', 'log', 'logit', or 'asinsqrt'.

        Returns
        -------
        lb : float
            The lower confidence limit.
        ub : float
            The upper confidence limit.

        Notes
        -----
        The confidence interval is obtained by inverting Z-tests. The
        limits of the confidence interval will always be observed
        event times.

        References
        ----------
        The method is based on the approach used in SAS, documented here:

        http://support.sas.com/documentation/cdl/en/statug/68162/HTML/default/viewer.htm#statug_lifetest_details03.htm
        """

        # Two-sided critical value for the Z-test inversion.
        tr = norm.ppf(1 - alpha / 2)

        # Each transform g comes with its derivative, used to scale the
        # standard error by the delta method.
        method = method.lower()
        if method == "cloglog":
            g = lambda x: np.log(-np.log(x))
            gprime = lambda x: -1 / (x * np.log(x))
        elif method == "linear":
            g = lambda x: x
            gprime = lambda x: 1
        elif method == "log":
            g = np.log
            gprime = lambda x: 1 / x
        elif method == "logit":
            g = lambda x: np.log(x / (1 - x))
            gprime = lambda x: 1 / (x * (1 - x))
        elif method == "asinsqrt":
            g = lambda x: np.arcsin(np.sqrt(x))
            gprime = lambda x: 1 / (2 * np.sqrt(x) * np.sqrt(1 - x))
        else:
            raise ValueError("unknown method")

        # Z-statistic for testing S(t) = 1 - p at each observed time.
        r = g(self.surv_prob) - g(1 - p)
        r /= (gprime(self.surv_prob) * self.surv_prob_se)

        # The CI is the range of times where the test is not rejected.
        ii = np.flatnonzero(np.abs(r) <= tr)
        if len(ii) == 0:
            return np.nan, np.nan

        lb = self.surv_times[ii[0]]

        if ii[-1] == len(self.surv_times) - 1:
            ub = np.inf
        else:
            ub = self.surv_times[ii[-1] + 1]

        return lb, ub

    def summary(self):
        """
        Return a summary of the estimated survival function.

        The summary is a dataframe containing the unique event times,
        estimated survival function values, and related quantities.
        """

        df = pd.DataFrame(index=self.surv_times)
        df.index.name = "Time"
        df["Surv prob"] = self.surv_prob
        df["Surv prob SE"] = self.surv_prob_se
        df["num at risk"] = self.n_risk
        df["num events"] = self.n_events

        return df

    def simultaneous_cb(self, alpha=0.05, method="hw", transform="log"):
        """
        Returns a simultaneous confidence band for the survival function.

        Parameters
        ----------
        alpha : float
            `1 - alpha` is the desired simultaneous coverage
            probability for the confidence region. Currently alpha
            must be set to 0.05, giving 95% simultaneous intervals.
        method : str
            The method used to produce the simultaneous confidence
            band. Only the Hall-Wellner (hw) method is currently
            implemented.
        transform : str
            The used to produce the interval (note that the returned
            interval is on the survival probability scale regardless
            of which transform is used). Only `log` and `arcsin` are
            implemented.

        Returns
        -------
        lcb : array_like
            The lower confidence limits corresponding to the points
            in `surv_times`.
        ucb : array_like
            The upper confidence limits corresponding to the points
            in `surv_times`.
        """

        method = method.lower()
        if method != "hw":
            msg = "only the Hall-Wellner (hw) method is implemented"
            raise ValueError(msg)

        if alpha != 0.05:
            raise ValueError("alpha must be set to 0.05")

        transform = transform.lower()
        # Squared coefficient of variation of the survival estimate.
        s2 = self.surv_prob_se**2 / self.surv_prob**2
        nn = self.n_risk
        # 1.3581 is the Hall-Wellner critical value used for the 95%
        # band (alpha is restricted to 0.05 above).
        if transform == "log":
            denom = np.sqrt(nn) * np.log(self.surv_prob)
            theta = 1.3581 * (1 + nn * s2) / denom
            theta = np.exp(theta)
            lcb = self.surv_prob**(1/theta)
            ucb = self.surv_prob**theta
        elif transform == "arcsin":
            k = 1.3581
            k *= (1 + nn * s2) / (2 * np.sqrt(nn))
            k *= np.sqrt(self.surv_prob / (1 - self.surv_prob))
            f = np.arcsin(np.sqrt(self.surv_prob))
            # Clip so the band stays within [0, 1] after back-transform.
            v = np.clip(f - k, 0, np.inf)
            lcb = np.sin(v)**2
            v = np.clip(f + k, -np.inf, np.pi/2)
            ucb = np.sin(v)**2
        else:
            raise ValueError("Unknown transform")

        return lcb, ucb
|
||
|
|
||
|
|
||
|
def survdiff(time, status, group, weight_type=None, strata=None,
             entry=None, **kwargs):
    """
    Test for the equality of two survival distributions.

    Parameters
    ----------
    time : array_like
        The event or censoring times.
    status : array_like
        The censoring status variable, status=1 indicates that the
        event occurred, status=0 indicates that the observation was
        censored.
    group : array_like
        Indicators of the two groups
    weight_type : str
        The following weight types are implemented:
            None (default) : logrank test
            fh : Fleming-Harrington, weights by S^(fh_p),
                 requires exponent fh_p to be provided as keyword
                 argument; the weights are derived from S defined at
                 the previous event time, and the first weight is
                 always 1.
            gb : Gehan-Breslow, weights by the number at risk
            tw : Tarone-Ware, weights by the square root of the number
                 at risk
    strata : array_like
        Optional stratum indicators for a stratified test
    entry : array_like
        Entry times to handle left truncation. The subject is not in
        the risk set on or before the entry time.

    Returns
    -------
    chisq : The chi-square (1 degree of freedom) distributed test
        statistic value
    pvalue : The p-value for the chi^2 test
    """

    time = np.asarray(time)
    status = np.asarray(status)
    group = np.asarray(group)

    # The full set of group labels is passed to every stratum so that
    # the per-stratum statistics are conformable when accumulated.
    gr = np.unique(group)

    if strata is None:
        obs, var = _survdiff(time, status, group, weight_type, gr,
                             entry, **kwargs)
    else:
        strata = np.asarray(strata)
        if entry is not None:
            entry = np.asarray(entry)
        stu = np.unique(strata)
        obs, var = 0., 0.
        for st in stu:
            # could be more efficient?
            ii = (strata == st)
            # BUGFIX: entry must be restricted to the current stratum
            # along with time/status/group; previously the full-length
            # entry array was passed, corrupting the stratified risk
            # sets under left truncation.
            entry1 = None if entry is None else entry[ii]
            obs1, var1 = _survdiff(time[ii], status[ii], group[ii],
                                   weight_type, gr, entry1, **kwargs)
            obs += obs1
            var += var1

    # (O - E).T * V^(-1) * (O - E)
    chisq = obs.dot(np.linalg.solve(var, obs))
    pvalue = 1 - chi2.cdf(chisq, len(gr) - 1)

    return chisq, pvalue
|
||
|
|
||
|
|
||
|
def _survdiff(time, status, group, weight_type, gr, entry=None,
              **kwargs):
    """
    Compute the (weighted) logrank observed-minus-expected vector and
    its variance matrix for a single stratum.

    Parameters
    ----------
    time, status, group : arrays restricted to one stratum
    weight_type : None, 'gb', 'tw' or 'fh' (see `survdiff`)
    gr : the complete, sorted array of group labels (shared across
        strata so results from different strata are conformable)
    entry : optional entry times for left truncation
    kwargs : must contain 'fh_p' when weight_type is 'fh'

    Returns
    -------
    obs_vec : (len(gr) - 1,) observed-minus-expected event counts,
        one entry per non-reference group
    var_mat : the corresponding variance matrix
    """
    # logrank test for one stratum
    # calculations based on https://web.stanford.edu/~lutian/coursepdf/unit6.pdf
    # formula for variance better to take from https://web.stanford.edu/~lutian/coursepdf/survweek3.pdf

    # Get the unique times.
    if entry is None:
        utimes, rtimes = np.unique(time, return_inverse=True)
    else:
        # Pool event and entry times, then keep only the event ranks.
        utimes, rtimes = np.unique(np.concatenate((time, entry)),
                                   return_inverse=True)
        rtimes = rtimes[0:len(time)]

    # Split entry times by group if present (should use pandas groupby)
    tse = [(gr_i, None) for gr_i in gr]
    if entry is not None:
        for k, _ in enumerate(gr):
            ii = (group == gr[k])
            entry1 = entry[ii]
            tse[k] = (gr[k], entry1)

    # Event count and risk set size at each time point, per group and overall.
    # TODO: should use Pandas groupby
    nrisk, obsv = [], []
    ml = len(utimes)
    for g, entry0 in tse:

        # Membership mask for this group; used as 0/1 weights below.
        mk = (group == g)
        n = np.bincount(rtimes, weights=mk, minlength=ml)

        ob = np.bincount(rtimes, weights=status*mk, minlength=ml)
        obsv.append(ob)

        if entry is not None:
            # Risk set under left truncation: entered minus exited,
            # just before each unique time.
            n = np.cumsum(n) - n
            rentry = np.searchsorted(utimes, entry0, side='left')
            n0 = np.bincount(rentry, minlength=ml)
            n0 = np.cumsum(n0) - n0
            nr = n0 - n
        else:
            nr = np.cumsum(n[::-1])[::-1]

        nrisk.append(nr)

    obs = sum(obsv)
    nrisk_tot = sum(nrisk)
    # Times with at most one subject at risk contribute no information.
    ix = np.flatnonzero(nrisk_tot > 1)

    weights = None
    if weight_type is not None:
        weight_type = weight_type.lower()
        if weight_type == "gb":
            weights = nrisk_tot
        elif weight_type == "tw":
            weights = np.sqrt(nrisk_tot)
        elif weight_type == "fh":
            if "fh_p" not in kwargs:
                msg = "weight_type type 'fh' requires specification of fh_p"
                raise ValueError(msg)
            fh_p = kwargs["fh_p"]
            # Calculate the survivor function directly to avoid the
            # overhead of creating a SurvfuncRight object
            sp = 1 - obs / nrisk_tot.astype(np.float64)
            sp = np.log(sp)
            sp = np.cumsum(sp)
            sp = np.exp(sp)
            weights = sp**fh_p
            # The FH weight uses S at the previous event time, so shift
            # by one position; the first weight is defined to be 1.
            weights = np.roll(weights, 1)
            weights[0] = 1
        else:
            raise ValueError("weight_type not implemented")

    dfs = len(gr) - 1
    r = np.vstack(nrisk) / np.clip(nrisk_tot, 1e-10, np.inf)[None, :]  # each line is timeseries of r's. line per group

    # The variance of event counts in each group.
    groups_oe = []
    groups_var = []

    var_denom = nrisk_tot - 1
    var_denom = np.clip(var_denom, 1e-10, np.inf)

    # use the first group as a reference
    for g in range(1, dfs+1):
        # Difference between observed and expected number of events in the group #g
        oe = obsv[g] - r[g]*obs

        # build one row of the dfs x dfs variance matrix
        var_tensor_part = r[1:, :].T * (np.eye(1, dfs, g-1).ravel() - r[g, :, None])  # r*(1 - r) in multidim
        var_scalar_part = obs * (nrisk_tot - obs) / var_denom
        var = var_tensor_part * var_scalar_part[:, None]

        if weights is not None:
            oe = weights * oe
            var = (weights**2)[:, None] * var

        # sum over times and store
        groups_oe.append(oe[ix].sum())
        groups_var.append(var[ix].sum(axis=0))

    obs_vec = np.hstack(groups_oe)
    var_mat = np.vstack(groups_var)

    return obs_vec, var_mat
|
||
|
|
||
|
|
||
|
def plot_survfunc(survfuncs, ax=None):
    """
    Plot one or more survivor functions.

    Parameters
    ----------
    survfuncs : object or array_like
        A single SurvfuncRight object, or a list or SurvfuncRight
        objects that are plotted together.
    ax : AxesSubplot, optional
        An existing matplotlib axes to draw on; a new figure is
        created when omitted.

    Returns
    -------
    A figure instance on which the plot was drawn.

    Examples
    --------
    Add a legend:

    >>> import statsmodels.api as sm
    >>> from statsmodels.duration.survfunc import plot_survfunc
    >>> data = sm.datasets.get_rdataset("flchain", "survival").data
    >>> df = data.loc[data.sex == "F", :]
    >>> sf0 = sm.SurvfuncRight(df["futime"], df["death"])
    >>> sf1 = sm.SurvfuncRight(3.0 * df["futime"], df["death"])
    >>> fig = plot_survfunc([sf0, sf1])
    >>> ax = fig.get_axes()[0]
    >>> ax.set_position([0.1, 0.1, 0.64, 0.8])
    >>> ha, lb = ax.get_legend_handles_labels()
    >>> leg = fig.legend((ha[0], ha[1]), (lb[0], lb[1]), loc='center right')

    Change the line colors:

    >>> fig = plot_survfunc([sf0, sf1])
    >>> ax = fig.get_axes()[0]
    >>> ax.set_position([0.1, 0.1, 0.64, 0.8])
    >>> ha, lb = ax.get_legend_handles_labels()
    >>> ha[0].set_color('purple')
    >>> ha[1].set_color('orange')
    """

    fig, ax = utils.create_mpl_ax(ax)

    # If we have only a single survival function to plot, put it into
    # a list.  BUGFIX: previously this used a bare `except:` around an
    # `assert type(...) is SurvfuncRight`, which silently swallowed
    # unrelated errors, broke under `python -O` (asserts stripped), and
    # double-wrapped lists of SurvfuncRight subclasses.
    if isinstance(survfuncs, SurvfuncRight):
        survfuncs = [survfuncs]

    for gx, sf in enumerate(survfuncs):

        # The estimated survival function does not include a point at
        # time 0, include it here for plotting.
        surv_times = np.concatenate(([0], sf.surv_times))
        surv_prob = np.concatenate(([1], sf.surv_prob))

        # If the final times are censoring times they are not included
        # in the survival function so we add them here
        mxt = max(sf.time)
        if mxt > surv_times[-1]:
            surv_times = np.concatenate((surv_times, [mxt]))
            surv_prob = np.concatenate((surv_prob, [surv_prob[-1]]))

        label = getattr(sf, "title", "Group %d" % (gx + 1))

        li, = ax.step(surv_times, surv_prob, '-', label=label, lw=2,
                      where='post')

        # Plot the censored points.
        ii = np.flatnonzero(np.logical_not(sf.status))
        ti = np.unique(sf.time[ii])
        jj = np.searchsorted(surv_times, ti) - 1
        sp = surv_prob[jj]
        ax.plot(ti, sp, '+', ms=12, color=li.get_color(),
                label=label + " points")

    ax.set_ylim(0, 1.01)

    return fig
|