import numpy as np
from statsmodels.duration.hazard_regression import PHReg


def _local_log_survival(kwts, status, ie, floor):
    """
    Kernel-weighted product-limit estimate of a log survival function.

    Parameters
    ----------
    kwts : ndarray
        Kernel weights, one per observation, with the observations
        ordered by ascending time.  Assumed to be non-negative.
    status : ndarray
        Event indicators (1 for an event, 0 otherwise) in the same order
        as `kwts`.
    ie : ndarray
        Positions (in the time-sorted data) at which the cumulative log
        survival is read off.
    floor : float
        Lower bound applied to each conditional survival factor before
        taking logs, so that the log is always finite.

    Returns
    -------
    log_surv : ndarray
        The cumulative log survival evaluated at the positions `ie`.
    at_risk : ndarray
        The kernel-weighted size of the risk set at every observation
        (same length as `kwts`).
    """
    # Total kernel weight of the observations at this position or later.
    at_risk = np.cumsum(kwts[::-1])[::-1]
    events = kwts * status

    # 0 / 0 is expected whenever the kernel gives no weight to the
    # remaining risk set; those entries are set to zero just below, so
    # the floating point warnings carry no information.
    with np.errstate(divide="ignore", invalid="ignore"):
        hazard = events / at_risk
    tiny = 1e-15
    hazard[(at_risk < tiny) & (events < tiny)] = 0

    factors = np.clip(1 - hazard, floor, np.inf)
    log_surv = np.cumsum(np.log(factors))[ie]

    return log_surv, at_risk


def _kernel_cumincidence(time, status, exog, kfunc, freq_weights,
                         dimred=True):
    """
    Calculates cumulative incidence functions using kernels.

    Parameters
    ----------
    time : array_like
        The observed time values
    status : array_like
        The status values.  status == 0 indicates censoring,
        status == 1, 2, ... are the events.
    exog : array_like
        Covariates such that censoring becomes independent of
        outcome times conditioned on the covariate values.
    kfunc : function
        A kernel function.  It is called with a 2d array of covariate
        differences (one row per observation) and must return one
        weight per row.
    freq_weights : array_like
        Optional frequency weights, given in the same order as `time`.
        May be None.
    dimred : bool
        If True, two proportional hazards regression models are used to
        reduce exog to two (standardized) columns: one model treats
        ``status == 1`` as the event and the other treats every
        remaining observation as the event.  If False, exog is used
        directly for calculating kernel weights without dimension
        reduction.

    Returns
    -------
    utime : ndarray
        The sorted unique observed times.
    ip : list of ndarray
        ``ip[k]`` holds the estimated cumulative incidence probabilities
        for cause ``k + 1`` at each element of `utime`.  The list is
        empty if no events were observed.
    """
    time = np.asarray(time)
    status = np.asarray(status)
    exog = np.asarray(exog)

    # Reorder so time is ascending
    order = np.argsort(time)
    time = time[order]
    status = status[order]
    exog = exog[order, :]
    nobs = len(time)

    # The weights must follow the same permutation as the data,
    # otherwise freq_weights[i] below would refer to a different subject
    # than row i of the sorted arrays.
    if freq_weights is not None:
        freq_weights = np.asarray(freq_weights)
        freq_weights = freq_weights[order] / freq_weights.sum()

    # Convert the unique times to ranks (0, 1, 2, ...)
    utime, rtime = np.unique(time, return_inverse=True)
    n_utime = len(utime)

    # Last index where each unique time occurs.
    ie = np.searchsorted(time, utime, side='right') - 1

    ngrp = int(status.max())

    ip = []
    if ngrp < 1:
        # Only censored observations: there is no cause to estimate.
        return utime, ip

    # All-cause status
    statusa = (status >= 1).astype(np.float64)

    # Dimension reduction step.  The kernel weights are computed once
    # and shared by all causes, so the models are fit once as well.
    # NOTE(review): the models use status == 1 as the event rather than
    # the all-cause status; this is kept as is so results do not change.
    if dimred:
        status1 = (status == 1).astype(np.float64)
        sfe = PHReg(time, exog, status1).fit()
        fitval_e = sfe.predict().predicted_values
        sfc = PHReg(time, exog, 1 - status1).fit()
        fitval_c = sfc.predict().predicted_values
        exog2d = np.hstack((fitval_e[:, None], fitval_c[:, None]))
        exog2d -= exog2d.mean(0)
        exog2d /= exog2d.std(0)
    else:
        exog2d = exog

    # Quantities that are local to each observation but do not depend
    # on the cause: kernel weights, the local all-causes survival
    # function just before each unique time, and the effective number
    # at risk at each unique time.
    kd = []
    sp = []
    n_risk = []
    for i in range(nobs):
        kwts = kfunc(exog2d - exog2d[i, :])
        log_surv, at_risk = _local_log_survival(kwts, statusa, ie, 1e-10)
        sf = np.exp(log_surv)
        kd.append(kwts)
        sp.append(np.r_[1, sf[:-1]])
        n_risk.append(at_risk[ie])

    for k in range(ngrp):

        status0 = (status == k + 1).astype(np.float64)

        ip0 = 0
        for i in range(nobs):

            # Number of cause-specific deaths at each unique time.
            d0 = np.bincount(rtime, weights=status0 * kd[i],
                             minlength=n_utime)

            # The cumulative incidence function probabilities.  Entries
            # with an empty risk set divide by zero; they are replaced
            # by the carry-forward step below.
            with np.errstate(divide="ignore", invalid="ignore"):
                ip1 = np.cumsum(sp[i] * d0 / n_risk[i])

            # Carry forward once the effective sample size drops below
            # 1.  n_risk is non-increasing, so its reverse is sorted.
            n_ok = n_utime - np.searchsorted(n_risk[i][::-1], 1)
            if n_ok == 0:
                # Never a usable risk set: nothing to carry forward, so
                # the incidence stays at its starting value of zero
                # (index n_ok - 1 would wrap around to the last entry).
                ip1[:] = 0
            elif n_ok < n_utime:
                ip1[n_ok:] = ip1[n_ok - 1]

            if freq_weights is None:
                ip0 += ip1
            else:
                ip0 += freq_weights[i] * ip1

        if freq_weights is None:
            ip0 /= nobs

        ip.append(ip0)

    return utime, ip


def _kernel_survfunc(time, status, exog, kfunc, freq_weights):
    """
    Estimate the marginal survival function under dependent censoring.

    Parameters
    ----------
    time : array_like
        The observed times for each subject
    status : array_like
        The status for each subject (1 indicates event, 0 indicates
        censoring)
    exog : array_like
        Covariates such that censoring is independent conditional on
        exog
    kfunc : function
        Kernel function.  It is called with a 2d array of differences
        between the reduced covariates (one row per observation) and
        must return one weight per row.
    freq_weights : array_like
        Optional frequency weights, given in the same order as `time`.
        May be None.

    Returns
    -------
    probs : array_like
        The estimated survival probabilities
    times : array_like
        The times at which the survival probabilities are estimated

    References
    ----------
    Zeng, Donglin 2004. Estimating Marginal Survival Function by
    Adjusting for Dependent Censoring Using Many Covariates. The
    Annals of Statistics 32 (4): 1533-55.
    doi:10.1214/009053604000000508.
    https://arxiv.org/pdf/math/0409180.pdf
    """
    time = np.asarray(time)
    status = np.asarray(status)

    # Dimension reduction step
    sfe = PHReg(time, exog, status).fit()
    fitval_e = sfe.predict().predicted_values
    sfc = PHReg(time, exog, 1 - status).fit()
    fitval_c = sfc.predict().predicted_values
    exog2d = np.hstack((fitval_e[:, None], fitval_c[:, None]))

    n = len(time)

    # For consistency with standard KM, only compute the survival
    # function at the times of observed events.
    utime = np.unique(time[status == 1])

    # Reorder everything so time is ascending
    order = np.argsort(time)
    time = time[order]
    status = status[order]
    exog2d = exog2d[order, :]

    # Last index where each evaluation time occurs.
    ie = np.searchsorted(time, utime, side='right') - 1

    # The weights must follow the same permutation as the data,
    # otherwise freq_weights[i] below would refer to a different subject
    # than row i of the sorted arrays.
    if freq_weights is not None:
        freq_weights = np.asarray(freq_weights)
        freq_weights = freq_weights[order] / freq_weights.sum()

    sprob = 0.
    for i in range(n):

        # Local survival function around subject i.
        kwts = kfunc(exog2d - exog2d[i, :])
        log_surv, _ = _local_log_survival(kwts, status, ie, 1e-12)
        surv = np.exp(log_surv)

        if freq_weights is None:
            sprob += surv
        else:
            sprob += surv * freq_weights[i]

    if freq_weights is None:
        sprob /= n

    return sprob, utime