import numpy as np
import pandas as pd
from statsmodels.base.model import LikelihoodModelResults


def _as_data_fn(value, empty):
    """
    Normalize an optional argument into a function of the imputed data.

    Parameters
    ----------
    value : None, callable, or any other object
        If None, the returned function yields a fresh ``empty()`` on
        each call.  If callable, it is returned unchanged.  Any other
        object is treated as a constant that is returned on every call.
    empty : callable
        Zero-argument factory for the default value (e.g. ``list`` or
        ``dict``).

    Returns
    -------
    function
        A function taking the imputed dataset as its single argument.
    """
    if value is None:
        def default_fn(x):
            return empty()
        return default_fn

    if callable(value):
        return value

    def constant_fn(x):
        return value
    return constant_fn


class BayesGaussMI:
    """
    Bayesian Imputation using a Gaussian model.

    The approach is Bayesian.  The goal is to sample from the joint
    distribution of the mean vector, covariance matrix, and missing
    data values given the observed data values.  Conjugate priors for
    the population mean and covariance matrix are used.  Gibbs
    sampling is used to update the mean vector, covariance matrix, and
    missing data values in turn.  After burn-in, the imputed complete
    data sets from the Gibbs chain can be used in multiple imputation
    analyses (MI).

    Parameters
    ----------
    data : ndarray
        The array of data to be imputed.  Values in the array equal to
        NaN are imputed.
    mean_prior : ndarray, optional
        The covariance matrix of the Gaussian prior distribution for
        the mean vector.  If not provided, the identity matrix is used.
    cov_prior : ndarray, optional
        The center matrix for the inverse Wishart prior distribution
        for the covariance matrix.  If not provided, the identity
        matrix is used.
    cov_prior_df : positive float
        The degrees of freedom of the inverse Wishart prior
        distribution for the covariance matrix.  Defaults to 1.

    Attributes
    ----------
    data : ndarray or DataFrame
        The current (imputed) data set.  After `update_data` has run
        this is a DataFrame if a DataFrame was supplied.
    mask : ndarray of bool
        True where the input value was NaN.
    patterns : list of ndarray
        One index array per distinct missing data pattern, holding the
        row numbers that share that pattern.  Rows with no missing
        values are not listed.
    mean, cov : ndarray
        Current draws of the mean vector and covariance matrix.

    Notes
    -----
    The data are converted with ``np.require(data, requirements="W")``
    and imputed values are written into the resulting array in place.
    If `data` is already a writable ndarray no copy is made, so the
    caller's array is modified; pass a copy to avoid this.

    Examples
    --------
    A basic example with OLS.  Data is generated assuming 10% is
    missing at random.

    >>> import numpy as np
    >>> x = np.random.standard_normal((1000, 2))
    >>> x.flat[np.random.sample(2000) < 0.1] = np.nan

    The imputer is used with ``MI``.

    >>> import statsmodels.api as sm
    >>> def model_args_fn(x):
    ...     # Return endog, exog from x
    ...     return x[:, 0], x[:, 1:]
    >>> imp = sm.BayesGaussMI(x)
    >>> mi = sm.MI(imp, sm.OLS, model_args_fn)
    """

    def __init__(self, data, mean_prior=None, cov_prior=None,
                 cov_prior_df=1):

        self.exog_names = None
        if isinstance(data, pd.DataFrame):
            self.exog_names = data.columns

        data = np.require(data, requirements="W")
        self.data = data
        self._data = data
        self.mask = np.isnan(data)
        self.nobs = self.mask.shape[0]
        self.nvar = self.mask.shape[1]

        # Identify all distinct missing data patterns.  Rows are
        # grouped by the exact bytes of their mask row, so two
        # different patterns can never be merged.  (A floating point
        # hash such as sum(1 + log(1 + j)) collides, e.g. for missing
        # columns {0, 5} versus {1, 2}.)  The dict preserves the order
        # in which patterns first appear.
        rowmap = {}
        for i in np.flatnonzero(self.mask.any(axis=1)):
            rowmap.setdefault(self.mask[i].tobytes(), []).append(i)
        self.patterns = [np.asarray(v) for v in rowmap.values()]

        # Simple starting values for mean and covariance
        p = self._data.shape[1]
        self.cov = np.eye(p)
        mean = []
        for i in range(p):
            v = self._data[:, i]
            v = v[np.isfinite(v)]
            if len(v) == 0:
                msg = "Column %d has no observed values" % i
                raise ValueError(msg)
            mean.append(v.mean())
        self.mean = np.asarray(mean)

        # Default covariance matrix of the (Gaussian) mean prior
        if mean_prior is None:
            mean_prior = np.eye(p)
        self.mean_prior = mean_prior

        # Default center matrix of the (inverse Wishart) covariance prior
        if cov_prior is None:
            cov_prior = np.eye(p)
        self.cov_prior = cov_prior

        # Degrees of freedom for the (inverse Wishart) covariance prior
        self.cov_prior_df = cov_prior_df

    def update(self):
        """
        Cycle through all Gibbs updates.
        """

        self.update_data()

        # Need to update data first
        self.update_mean()
        self.update_cov()

    def update_data(self):
        """
        Gibbs update of the missing data values.

        For each missing data pattern, the missing entries are drawn
        from their Gaussian conditional distribution given the observed
        entries of the same row and the current mean and covariance.
        """

        for ix in self.patterns:

            # All rows in `ix` share the mask of the first row.
            i = ix[0]
            ix_miss = np.flatnonzero(self.mask[i, :])
            ix_obs = np.flatnonzero(~self.mask[i, :])

            mm = self.mean[ix_miss]
            mo = self.mean[ix_obs]

            voo = self.cov[ix_obs, :][:, ix_obs]
            vmm = self.cov[ix_miss, :][:, ix_miss]
            vmo = self.cov[ix_miss, :][:, ix_obs]

            # Conditional mean: mm + Vmo Voo^{-1} (x_obs - mo)
            r = self._data[ix, :][:, ix_obs] - mo
            cm = mm + np.dot(vmo, np.linalg.solve(voo, r.T)).T

            # Conditional covariance: Vmm - Vmo Voo^{-1} Vom
            cv = vmm - np.dot(vmo, np.linalg.solve(voo, vmo.T))

            cs = np.linalg.cholesky(cv)
            u = np.random.normal(size=(len(ix), len(ix_miss)))
            self._data[np.ix_(ix, ix_miss)] = cm + np.dot(u, cs.T)

        # Set the user-visible data set.
        if self.exog_names is not None:
            self.data = pd.DataFrame(
                self._data,
                columns=self.exog_names,
                copy=False)
        else:
            self.data = self._data

    def update_mean(self):
        """
        Gibbs update of the mean vector.

        Do not call until update_data has been called once.
        """
        # https://stats.stackexchange.com/questions/28744/multivariate-normal-posterior

        # Posterior covariance matrix of the mean
        cm = np.linalg.solve(self.cov / self.nobs + self.mean_prior,
                             self.mean_prior / self.nobs)
        cm = np.dot(self.cov, cm)

        # Posterior mean of the mean
        vm = np.linalg.solve(self.cov, self._data.sum(0))
        vm = np.dot(cm, vm)

        # Sample
        r = np.linalg.cholesky(cm)
        self.mean = vm + np.dot(r, np.random.normal(0, 1, self.nvar))

    def update_cov(self):
        """
        Gibbs update of the covariance matrix.

        Do not call until update_data has been called once.
        """
        # https://stats.stackexchange.com/questions/50844/estimating-the-covariance-posterior-distribution-of-a-multivariate-gaussian

        r = self._data - self.mean
        gr = np.dot(r.T, r)
        a = gr + self.cov_prior
        df = int(np.ceil(self.nobs + self.cov_prior_df))

        # Draw a Wishart matrix as a sum of `df` outer products of
        # Gaussian vectors with covariance inv(a), then invert it to
        # obtain the inverse Wishart draw.
        r = np.linalg.cholesky(np.linalg.inv(a))
        x = np.dot(np.random.normal(size=(df, self.nvar)), r.T)
        ma = np.dot(x.T, x)
        self.cov = np.linalg.inv(ma)


class MI:
    """
    MI performs multiple imputation using a provided imputer object.

    Parameters
    ----------
    imp : object
        An imputer class, such as BayesGaussMI.
    model : model class
        Any statsmodels model class.
    model_args_fn : function
        A function taking an imputed dataset as input and returning
        the sequence of positional arguments used to build the model
        (e.g. ``endog, exog``).  If the model is fit using a formula,
        it may instead return a DataFrame, which is passed as the data
        argument of `from_formula`.  Optional when a formula is used
        and the data are supplied through `model_kwds_fn`.
    model_kwds_fn : function, optional
        A function taking an imputed dataset as input and returning a
        dictionary of model keyword arguments.
    formula : str, optional
        If provided, the model is constructed using the
        `from_formula` class method, otherwise the `__init__` method
        is used.
    fit_args : list-like or function, optional
        Positional arguments to be passed to the fit method.  May
        also be a function taking an imputed dataset and returning
        the arguments.
    fit_kwds : dict-like or function, optional
        Keyword arguments to be passed to the fit method.  May also be
        a function taking an imputed dataset and returning the
        keyword arguments.
    xfunc : function mapping ndarray to ndarray
        A function that is applied to the complete data matrix
        prior to fitting the model
    burn : int
        Number of burn-in iterations
    nrep : int
        Number of imputed data sets to use in the analysis
    skip : int
        Number of Gibbs iterations to skip between successive
        multiple imputation fits.

    Notes
    -----
    The imputer object must have an 'update' method, and a 'data'
    attribute that contains the current imputed dataset.

    The burn-in iterations are run when the MI object is constructed.

    xfunc can be used to introduce domain constraints, e.g. when
    imputing binary data the imputed continuous values can be rounded
    to 0/1.
    """

    def __init__(self, imp, model, model_args_fn=None, model_kwds_fn=None,
                 formula=None, fit_args=None, fit_kwds=None, xfunc=None,
                 burn=100, nrep=20, skip=10):

        # The imputer
        self.imp = imp

        # The number of Gibbs iterations to skip between each imputed
        # data set that is used in the analysis.
        self.skip = skip

        # The model class
        self.model = model
        self.formula = formula

        if model_args_fn is None:
            def model_args_fn(x):
                return []
        self.model_args_fn = model_args_fn

        if model_kwds_fn is None:
            def model_kwds_fn(x):
                return {}
        self.model_kwds_fn = model_kwds_fn

        # fit_args / fit_kwds are stored as functions of the imputed
        # data; plain list-like / dict-like values are wrapped so that
        # both the documented and the functional forms work.
        self.fit_args = _as_data_fn(fit_args, list)
        self.fit_kwds = _as_data_fn(fit_kwds, dict)

        self.xfunc = xfunc
        self.nrep = nrep

        # Burn-in
        for _ in range(burn):
            imp.update()

    def fit(self, results_cb=None):
        """
        Impute datasets, fit models, and pool results.

        Parameters
        ----------
        results_cb : function, optional
            If provided, each results instance r is passed through
            `results_cb`, then appended to the `results` attribute of
            the MIResults object.  To save complete results, use
            `results_cb=lambda x: x`.  The default behavior is to
            save no results.

        Returns
        -------
        A MIResults object.

        Raises
        ------
        ValueError
            If `nrep` is less than 1, since there is then nothing to
            pool.
        """

        if self.nrep < 1:
            raise ValueError("nrep must be at least 1")

        par, cov = [], []
        all_results = []

        for _ in range(self.nrep):

            # Advance the Gibbs chain; the extra iteration produces
            # the data set that is actually analyzed.
            for _ in range(self.skip + 1):
                self.imp.update()

            da = self.imp.data

            if self.xfunc is not None:
                da = self.xfunc(da)

            model_args = self.model_args_fn(da)
            model_kwds = self.model_kwds_fn(da)

            if self.formula is None:
                model = self.model(*model_args, **model_kwds)
            else:
                # Star-unpacking a DataFrame would pass its column
                # labels, so treat it as the single data argument.
                if isinstance(model_args, pd.DataFrame):
                    model_args = (model_args,)
                model = self.model.from_formula(
                    self.formula, *model_args, **model_kwds)

            result = model.fit(*self.fit_args(da), **self.fit_kwds(da))

            if results_cb is not None:
                all_results.append(results_cb(result))

            par.append(np.asarray(result.params.copy()))
            cov.append(np.asarray(result.cov_params().copy()))

        params, cov_params, fmi = self._combine(par, cov)

        r = MIResults(self, model, params, cov_params)
        r.fmi = fmi

        r.results = all_results

        return r

    def _combine(self, par, cov):
        # Helper function to apply "Rubin's combining rule".
        # `par` is a list of parameter vectors and `cov` the matching
        # list of covariance matrices, one per imputed data set.
        # Returns the pooled estimate, its covariance matrix, and the
        # fraction of missing information for each parameter.

        par = np.asarray(par)

        # Number of imputations
        m = par.shape[0]

        # Point estimate
        params = par.mean(0)

        # Within-imputation covariance
        wcov = sum(cov) / len(cov)

        # Between-imputation covariance
        bcov = np.cov(par.T)
        bcov = np.atleast_2d(bcov)

        # Overall covariance
        covp = wcov + (1 + 1 / float(m)) * bcov

        # Fraction of missing information
        fmi = (1 + 1 / float(m)) * np.diag(bcov) / np.diag(covp)

        return params, covp, fmi


class MIResults(LikelihoodModelResults):
    """
    A results class for multiple imputation (MI).

    Parameters
    ----------
    mi : MI instance
        The MI object that produced the results
    model : instance of statsmodels model class
        This can be any instance from the multiple imputation runs.
        It is used to get class information, the specific parameter
        and data values are not used.
    params : array_like
        The overall multiple imputation parameter estimates.
    normalized_cov_params : array_like (2d)
        The overall variance covariance matrix of the estimates.

    Notes
    -----
    `MI.fit` attaches two further attributes after construction:
    `fmi` (fraction of missing information per parameter) and
    `results` (the values returned by the `results_cb` callback).
    """

    def __init__(self, mi, model, params, normalized_cov_params):

        super().__init__(model, params, normalized_cov_params)
        self.mi = mi
        self._model = model

    def summary(self, title=None, alpha=.05):
        """
        Summarize the results of running multiple imputation.

        Parameters
        ----------
        title : str, optional
            Title for the top table. If not None, then this replaces
            the default title
        alpha : float
            Significance level for the confidence intervals

        Returns
        -------
        smry : Summary instance
            This holds the summary tables and text, which can be
            printed or converted to various output formats.
        """

        from statsmodels.iolib import summary2

        smry = summary2.Summary()
        float_format = "%8.3f"

        info = {}
        info["Method:"] = "MI"
        info["Model:"] = self.mi.model.__name__
        info["Dependent variable:"] = self._model.endog_names
        info["Sample size:"] = "%d" % self.mi.imp.data.shape[0]
        info["Num. imputations"] = "%d" % self.mi.nrep

        smry.add_dict(info, align='l', float_format=float_format)

        param = summary2.summary_params(self, alpha=alpha)
        param["FMI"] = self.fmi

        smry.add_df(param, float_format=float_format)
        smry.add_title(title=title, results=self)

        return smry