""" Beta regression for modeling rates and proportions. References ---------- GrĂ¼n, Bettina, Ioannis Kosmidis, and Achim Zeileis. Extended beta regression in R: Shaken, stirred, mixed, and partitioned. No. 2011-22. Working Papers in Economics and Statistics, 2011. Smithson, Michael, and Jay Verkuilen. "A better lemon squeezer? Maximum-likelihood regression with beta-distributed dependent variables." Psychological methods 11.1 (2006): 54. """ import numpy as np from scipy.special import gammaln as lgamma import patsy import statsmodels.base.wrapper as wrap import statsmodels.regression.linear_model as lm from statsmodels.tools.decorators import cache_readonly from statsmodels.base.model import ( GenericLikelihoodModel, GenericLikelihoodModelResults, _LLRMixin) from statsmodels.genmod import families _init_example = """ Beta regression with default of logit-link for exog and log-link for precision. >>> mod = BetaModel(endog, exog) >>> rslt = mod.fit() >>> print(rslt.summary()) We can also specify a formula and a specific structure and use the identity-link for precision. >>> from sm.families.links import identity >>> Z = patsy.dmatrix('~ temp', dat, return_type='dataframe') >>> mod = BetaModel.from_formula('iyield ~ C(batch, Treatment(10)) + temp', ... dat, exog_precision=Z, ... link_precision=identity()) In the case of proportion-data, we may think that the precision depends on the number of measurements. E.g for sequence data, on the number of sequence reads covering a site: >>> Z = patsy.dmatrix('~ coverage', df) >>> formula = 'methylation ~ disease + age + gender + coverage' >>> mod = BetaModel.from_formula(formula, df, Z) >>> rslt = mod.fit() """ class BetaModel(GenericLikelihoodModel): __doc__ = """Beta Regression. The Model is parameterized by mean and precision. Both can depend on explanatory variables through link functions. Parameters ---------- endog : array_like 1d array of endogenous response variable. exog : array_like A nobs x k array where `nobs` is the number of observations and `k` is the number of regressors. An intercept is not included by default and should be added by the user (models specified using a formula include an intercept by default). See `statsmodels.tools.add_constant`. exog_precision : array_like 2d array of variables for the precision. link : link Any link in sm.families.links for mean, should have range in interval [0, 1]. Default is logit-link. link_precision : link Any link in sm.families.links for precision, should have range in positive line. Default is log-link. **kwds : extra keywords Keyword options that will be handled by super classes. Not all general keywords will be supported in this class. Notes ----- Status: experimental, new in 0.13. Core results are verified, but api can change and some extra results specific to Beta regression are missing. Examples -------- {example} See Also -------- :ref:`links` """.format(example=_init_example) def __init__(self, endog, exog, exog_precision=None, link=families.links.Logit(), link_precision=families.links.Log(), **kwds): etmp = np.array(endog) assert np.all((0 < etmp) & (etmp < 1)) if exog_precision is None: extra_names = ['precision'] exog_precision = np.ones((len(endog), 1), dtype='f') else: extra_names = ['precision-%s' % zc for zc in (exog_precision.columns if hasattr(exog_precision, 'columns') else range(1, exog_precision.shape[1] + 1))] kwds['extra_params_names'] = extra_names super().__init__(endog, exog, exog_precision=exog_precision, **kwds) self.link = link self.link_precision = link_precision # not needed, handled by super: # self.exog_precision = exog_precision # inherited df do not account for precision params self.nobs = self.endog.shape[0] self.k_extra = 1 self.df_model = self.nparams - 2 self.df_resid = self.nobs - self.nparams assert len(self.exog_precision) == len(self.endog) self.hess_type = "oim" if 'exog_precision' not in self._init_keys: self._init_keys.extend(['exog_precision']) self._init_keys.extend(['link', 'link_precision']) self._null_drop_keys = ['exog_precision'] del kwds['extra_params_names'] self._check_kwargs(kwds) self.results_class = BetaResults self.results_class_wrapper = BetaResultsWrapper @classmethod def from_formula(cls, formula, data, exog_precision_formula=None, *args, **kwargs): if exog_precision_formula is not None: if 'subset' in kwargs: d = data.ix[kwargs['subset']] Z = patsy.dmatrix(exog_precision_formula, d) else: Z = patsy.dmatrix(exog_precision_formula, data) kwargs['exog_precision'] = Z return super().from_formula(formula, data, *args, **kwargs) def _get_exogs(self): return (self.exog, self.exog_precision) def predict(self, params, exog=None, exog_precision=None, which="mean"): """Predict values for mean or precision Parameters ---------- params : array_like The model parameters. exog : array_like Array of predictor variables for mean. exog_precision : array_like Array of predictor variables for precision parameter. which : str - "mean" : mean, conditional expectation E(endog | exog) - "precision" : predicted precision - "linear" : linear predictor for the mean function - "linear-precision" : linear predictor for the precision parameter Returns ------- ndarray, predicted values """ # compatibility with old names and misspelling if which == "linpred": which = "linear" if which in ["linpred_precision", "linear_precision"]: which = "linear-precision" k_mean = self.exog.shape[1] if which in ["mean", "linear"]: if exog is None: exog = self.exog params_mean = params[:k_mean] # Zparams = params[k_mean:] linpred = np.dot(exog, params_mean) if which == "mean": mu = self.link.inverse(linpred) res = mu else: res = linpred elif which in ["precision", "linear-precision"]: if exog_precision is None: exog_precision = self.exog_precision params_prec = params[k_mean:] linpred_prec = np.dot(exog_precision, params_prec) if which == "precision": phi = self.link_precision.inverse(linpred_prec) res = phi else: res = linpred_prec elif which == "var": res = self._predict_var( params, exog=exog, exog_precision=exog_precision ) else: raise ValueError('which = %s is not available' % which) return res def _predict_precision(self, params, exog_precision=None): """Predict values for precision function for given exog_precision. Parameters ---------- params : array_like The model parameters. exog_precision : array_like Array of predictor variables for precision. Returns ------- Predicted precision. """ if exog_precision is None: exog_precision = self.exog_precision k_mean = self.exog.shape[1] params_precision = params[k_mean:] linpred_prec = np.dot(exog_precision, params_precision) phi = self.link_precision.inverse(linpred_prec) return phi def _predict_var(self, params, exog=None, exog_precision=None): """predict values for conditional variance V(endog | exog) Parameters ---------- params : array_like The model parameters. exog : array_like Array of predictor variables for mean. exog_precision : array_like Array of predictor variables for precision. Returns ------- Predicted conditional variance. """ mean = self.predict(params, exog=exog) precision = self._predict_precision(params, exog_precision=exog_precision) var_endog = mean * (1 - mean) / (1 + precision) return var_endog def loglikeobs(self, params): """ Loglikelihood for observations of the Beta regressionmodel. Parameters ---------- params : ndarray The parameters of the model, coefficients for linear predictors of the mean and of the precision function. Returns ------- loglike : ndarray The log likelihood for each observation of the model evaluated at `params`. """ return self._llobs(self.endog, self.exog, self.exog_precision, params) def _llobs(self, endog, exog, exog_precision, params): """ Loglikelihood for observations with data arguments. Parameters ---------- endog : ndarray 1d array of endogenous variable. exog : ndarray 2d array of explanatory variables. exog_precision : ndarray 2d array of explanatory variables for precision. params : ndarray The parameters of the model, coefficients for linear predictors of the mean and of the precision function. Returns ------- loglike : ndarray The log likelihood for each observation of the model evaluated at `params`. """ y, X, Z = endog, exog, exog_precision nz = Z.shape[1] params_mean = params[:-nz] params_prec = params[-nz:] linpred = np.dot(X, params_mean) linpred_prec = np.dot(Z, params_prec) mu = self.link.inverse(linpred) phi = self.link_precision.inverse(linpred_prec) eps_lb = 1e-200 alpha = np.clip(mu * phi, eps_lb, np.inf) beta = np.clip((1 - mu) * phi, eps_lb, np.inf) ll = (lgamma(phi) - lgamma(alpha) - lgamma(beta) + (mu * phi - 1) * np.log(y) + (((1 - mu) * phi) - 1) * np.log(1 - y)) return ll def score(self, params): """ Returns the score vector of the log-likelihood. http://www.tandfonline.com/doi/pdf/10.1080/00949650903389993 Parameters ---------- params : ndarray Parameter at which score is evaluated. Returns ------- score : ndarray First derivative of loglikelihood function. """ sf1, sf2 = self.score_factor(params) d1 = np.dot(sf1, self.exog) d2 = np.dot(sf2, self.exog_precision) return np.concatenate((d1, d2)) def _score_check(self, params): """Inherited score with finite differences Parameters ---------- params : ndarray Parameter at which score is evaluated. Returns ------- score based on numerical derivatives """ return super().score(params) def score_factor(self, params, endog=None): """Derivative of loglikelihood function w.r.t. linear predictors. This needs to be multiplied with the exog to obtain the score_obs. Parameters ---------- params : ndarray Parameter at which score is evaluated. Returns ------- score_factor : ndarray, 2-D A 2d weight vector used in the calculation of the score_obs. Notes ----- The score_obs can be obtained from score_factor ``sf`` using - d1 = sf[:, :1] * exog - d2 = sf[:, 1:2] * exog_precision """ from scipy import special digamma = special.psi y = self.endog if endog is None else endog X, Z = self.exog, self.exog_precision nz = Z.shape[1] Xparams = params[:-nz] Zparams = params[-nz:] # NO LINKS mu = self.link.inverse(np.dot(X, Xparams)) phi = self.link_precision.inverse(np.dot(Z, Zparams)) eps_lb = 1e-200 # lower bound for evaluating digamma, avoids -inf alpha = np.clip(mu * phi, eps_lb, np.inf) beta = np.clip((1 - mu) * phi, eps_lb, np.inf) ystar = np.log(y / (1. - y)) dig_beta = digamma(beta) mustar = digamma(alpha) - dig_beta yt = np.log(1 - y) mut = dig_beta - digamma(phi) t = 1. / self.link.deriv(mu) h = 1. / self.link_precision.deriv(phi) # sf1 = phi * t * (ystar - mustar) sf2 = h * (mu * (ystar - mustar) + yt - mut) return (sf1, sf2) def score_hessian_factor(self, params, return_hessian=False, observed=True): """Derivatives of loglikelihood function w.r.t. linear predictors. This calculates score and hessian factors at the same time, because there is a large overlap in calculations. Parameters ---------- params : ndarray Parameter at which score is evaluated. return_hessian : bool If False, then only score_factors are returned If True, the both score and hessian factors are returned observed : bool If True, then the observed Hessian is returned (default). If False, then the expected information matrix is returned. Returns ------- score_factor : ndarray, 2-D A 2d weight vector used in the calculation of the score_obs. (-jbb, -jbg, -jgg) : tuple A tuple with 3 hessian factors, corresponding to the upper triangle of the Hessian matrix. TODO: check why there are minus """ from scipy import special digamma = special.psi y, X, Z = self.endog, self.exog, self.exog_precision nz = Z.shape[1] Xparams = params[:-nz] Zparams = params[-nz:] # NO LINKS mu = self.link.inverse(np.dot(X, Xparams)) phi = self.link_precision.inverse(np.dot(Z, Zparams)) # We need to prevent mu = 0 and (1-mu) = 0 in digamma call eps_lb = 1e-200 # lower bound for evaluating digamma, avoids -inf alpha = np.clip(mu * phi, eps_lb, np.inf) beta = np.clip((1 - mu) * phi, eps_lb, np.inf) ystar = np.log(y / (1. - y)) dig_beta = digamma(beta) mustar = digamma(alpha) - dig_beta yt = np.log(1 - y) mut = dig_beta - digamma(phi) t = 1. / self.link.deriv(mu) h = 1. / self.link_precision.deriv(phi) ymu_star = (ystar - mustar) sf1 = phi * t * ymu_star sf2 = h * (mu * ymu_star + yt - mut) if return_hessian: trigamma = lambda x: special.polygamma(1, x) # noqa trig_beta = trigamma(beta) var_star = trigamma(alpha) + trig_beta var_t = trig_beta - trigamma(phi) c = - trig_beta s = self.link.deriv2(mu) q = self.link_precision.deriv2(phi) jbb = (phi * t) * var_star if observed: jbb += s * t**2 * ymu_star jbb *= t * phi jbg = phi * t * h * (mu * var_star + c) if observed: jbg -= ymu_star * t * h jgg = h**2 * (mu**2 * var_star + 2 * mu * c + var_t) if observed: jgg += (mu * ymu_star + yt - mut) * q * h**3 # **3 ? return (sf1, sf2), (-jbb, -jbg, -jgg) else: return (sf1, sf2) def score_obs(self, params): """ Score, first derivative of the loglikelihood for each observation. Parameters ---------- params : ndarray Parameter at which score is evaluated. Returns ------- score_obs : ndarray, 2d The first derivative of the loglikelihood function evaluated at params for each observation. """ sf1, sf2 = self.score_factor(params) # elementwise product for each row (observation) d1 = sf1[:, None] * self.exog d2 = sf2[:, None] * self.exog_precision return np.column_stack((d1, d2)) def hessian(self, params, observed=None): """Hessian, second derivative of loglikelihood function Parameters ---------- params : ndarray Parameter at which Hessian is evaluated. observed : bool If True, then the observed Hessian is returned (default). If False, then the expected information matrix is returned. Returns ------- hessian : ndarray Hessian, i.e. observed information, or expected information matrix. """ if self.hess_type == "eim": observed = False else: observed = True _, hf = self.score_hessian_factor(params, return_hessian=True, observed=observed) hf11, hf12, hf22 = hf # elementwise product for each row (observation) d11 = (self.exog.T * hf11).dot(self.exog) d12 = (self.exog.T * hf12).dot(self.exog_precision) d22 = (self.exog_precision.T * hf22).dot(self.exog_precision) return np.block([[d11, d12], [d12.T, d22]]) def hessian_factor(self, params, observed=True): """Derivatives of loglikelihood function w.r.t. linear predictors. """ _, hf = self.score_hessian_factor(params, return_hessian=True, observed=observed) return hf def _start_params(self, niter=2, return_intermediate=False): """find starting values Parameters ---------- niter : int Number of iterations of WLS approximation return_intermediate : bool If False (default), then only the preliminary parameter estimate will be returned. If True, then also the two results instances of the WLS estimate for mean parameters and for the precision parameters will be returned. Returns ------- sp : ndarray start parameters for the optimization res_m2 : results instance (optional) Results instance for the WLS regression of the mean function. res_p2 : results instance (optional) Results instance for the WLS regression of the precision function. Notes ----- This calculates a few iteration of weighted least squares. This is not a full scoring algorithm. """ # WLS of the mean equation uses the implied weights (inverse variance), # WLS for the precision equations uses weights that only take # account of the link transformation of the precision endog. from statsmodels.regression.linear_model import OLS, WLS res_m = OLS(self.link(self.endog), self.exog).fit() fitted = self.link.inverse(res_m.fittedvalues) resid = self.endog - fitted prec_i = fitted * (1 - fitted) / np.maximum(np.abs(resid), 1e-2)**2 - 1 res_p = OLS(self.link_precision(prec_i), self.exog_precision).fit() prec_fitted = self.link_precision.inverse(res_p.fittedvalues) # sp = np.concatenate((res_m.params, res_p.params)) for _ in range(niter): y_var_inv = (1 + prec_fitted) / (fitted * (1 - fitted)) # y_var = fitted * (1 - fitted) / (1 + prec_fitted) ylink_var_inv = y_var_inv / self.link.deriv(fitted)**2 res_m2 = WLS(self.link(self.endog), self.exog, weights=ylink_var_inv).fit() fitted = self.link.inverse(res_m2.fittedvalues) resid2 = self.endog - fitted prec_i2 = (fitted * (1 - fitted) / np.maximum(np.abs(resid2), 1e-2)**2 - 1) w_p = 1. / self.link_precision.deriv(prec_fitted)**2 res_p2 = WLS(self.link_precision(prec_i2), self.exog_precision, weights=w_p).fit() prec_fitted = self.link_precision.inverse(res_p2.fittedvalues) sp2 = np.concatenate((res_m2.params, res_p2.params)) if return_intermediate: return sp2, res_m2, res_p2 return sp2 def fit(self, start_params=None, maxiter=1000, disp=False, method='bfgs', **kwds): """ Fit the model by maximum likelihood. Parameters ---------- start_params : array-like A vector of starting values for the regression coefficients. If None, a default is chosen. maxiter : integer The maximum number of iterations disp : bool Show convergence stats. method : str The optimization method to use. kwds : Keyword arguments for the optimizer. Returns ------- BetaResults instance. """ if start_params is None: start_params = self._start_params() # # http://www.ime.usp.br/~sferrari/beta.pdf suggests starting phi # # on page 8 if "cov_type" in kwds: # this is a workaround because we cannot tell super to use eim if kwds["cov_type"].lower() == "eim": self.hess_type = "eim" del kwds["cov_type"] else: self.hess_type = "oim" res = super().fit(start_params=start_params, maxiter=maxiter, method=method, disp=disp, **kwds) if not isinstance(res, BetaResultsWrapper): # currently GenericLikelihoodModel doe not add wrapper res = BetaResultsWrapper(res) return res def _deriv_mean_dparams(self, params): """ Derivative of the expected endog with respect to the parameters. not verified yet Parameters ---------- params : ndarray parameter at which score is evaluated Returns ------- The value of the derivative of the expected endog with respect to the parameter vector. """ link = self.link lin_pred = self.predict(params, which="linear") idl = link.inverse_deriv(lin_pred) dmat = self.exog * idl[:, None] return np.column_stack((dmat, np.zeros(self.exog_precision.shape))) def _deriv_score_obs_dendog(self, params): """derivative of score_obs w.r.t. endog Parameters ---------- params : ndarray parameter at which score is evaluated Returns ------- derivative : ndarray_2d The derivative of the score_obs with respect to endog. """ from statsmodels.tools.numdiff import _approx_fprime_cs_scalar def f(y): if y.ndim == 2 and y.shape[1] == 1: y = y[:, 0] sf = self.score_factor(params, endog=y) return np.column_stack(sf) dsf = _approx_fprime_cs_scalar(self.endog[:, None], f) # deriv is 2d vector d1 = dsf[:, :1] * self.exog d2 = dsf[:, 1:2] * self.exog_precision return np.column_stack((d1, d2)) def get_distribution_params(self, params, exog=None, exog_precision=None): """ Return distribution parameters converted from model prediction. Parameters ---------- params : array_like The model parameters. exog : array_like Array of predictor variables for mean. exog_precision : array_like Array of predictor variables for mean. Returns ------- (alpha, beta) : tuple of ndarrays Parameters for the scipy distribution to evaluate predictive distribution. """ mean = self.predict(params, exog=exog) precision = self.predict(params, exog_precision=exog_precision, which="precision") return precision * mean, precision * (1 - mean) def get_distribution(self, params, exog=None, exog_precision=None): """ Return a instance of the predictive distribution. Parameters ---------- params : array_like The model parameters. exog : array_like Array of predictor variables for mean. exog_precision : array_like Array of predictor variables for mean. Returns ------- Instance of a scipy frozen distribution based on estimated parameters. See Also -------- predict Notes ----- This function delegates to the predict method to handle exog and exog_precision, which in turn makes any required transformations. Due to the behavior of ``scipy.stats.distributions objects``, the returned random number generator must be called with ``gen.rvs(n)`` where ``n`` is the number of observations in the data set used to fit the model. If any other value is used for ``n``, misleading results will be produced. """ from scipy import stats args = self.get_distribution_params(params, exog=exog, exog_precision=exog_precision) distr = stats.beta(*args) return distr class BetaResults(GenericLikelihoodModelResults, _LLRMixin): """Results class for Beta regression This class inherits from GenericLikelihoodModelResults and not all inherited methods might be appropriate in this case. """ # GenericLikeihoodmodel doesn't define fittedvalues, residuals and similar @cache_readonly def fittedvalues(self): """In-sample predicted mean, conditional expectation.""" return self.model.predict(self.params) @cache_readonly def fitted_precision(self): """In-sample predicted precision""" return self.model.predict(self.params, which="precision") @cache_readonly def resid(self): """Response residual""" return self.model.endog - self.fittedvalues @cache_readonly def resid_pearson(self): """Pearson standardize residual""" std = np.sqrt(self.model.predict(self.params, which="var")) return self.resid / std @cache_readonly def prsquared(self): """Cox-Snell Likelihood-Ratio pseudo-R-squared. 1 - exp((llnull - .llf) * (2 / nobs)) """ return self.pseudo_rsquared(kind="lr") def get_distribution_params(self, exog=None, exog_precision=None, transform=True): """ Return distribution parameters converted from model prediction. Parameters ---------- params : array_like The model parameters. exog : array_like Array of predictor variables for mean. transform : bool If transform is True and formulas have been used, then predictor ``exog`` is passed through the formula processing. Default is True. Returns ------- (alpha, beta) : tuple of ndarrays Parameters for the scipy distribution to evaluate predictive distribution. """ mean = self.predict(exog=exog, transform=transform) precision = self.predict(exog_precision=exog_precision, which="precision", transform=transform) return precision * mean, precision * (1 - mean) def get_distribution(self, exog=None, exog_precision=None, transform=True): """ Return a instance of the predictive distribution. Parameters ---------- exog : array_like Array of predictor variables for mean. exog_precision : array_like Array of predictor variables for mean. transform : bool If transform is True and formulas have been used, then predictor ``exog`` is passed through the formula processing. Default is True. Returns ------- Instance of a scipy frozen distribution based on estimated parameters. See Also -------- predict Notes ----- This function delegates to the predict method to handle exog and exog_precision, which in turn makes any required transformations. Due to the behavior of ``scipy.stats.distributions objects``, the returned random number generator must be called with ``gen.rvs(n)`` where ``n`` is the number of observations in the data set used to fit the model. If any other value is used for ``n``, misleading results will be produced. """ from scipy import stats args = self.get_distribution_params(exog=exog, exog_precision=exog_precision, transform=transform) args = (np.asarray(arg) for arg in args) distr = stats.beta(*args) return distr def get_influence(self): """ Get an instance of MLEInfluence with influence and outlier measures Returns ------- infl : MLEInfluence instance The instance has methods to calculate the main influence and outlier measures as attributes. See Also -------- statsmodels.stats.outliers_influence.MLEInfluence Notes ----- Support for mutli-link and multi-exog models is still experimental in MLEInfluence. Interface and some definitions might still change. Note: Difference to R betareg: Betareg has the same general leverage as this model. However, they use a linear approximation hat matrix to scale and studentize influence and residual statistics. MLEInfluence uses the generalized leverage as hat_matrix_diag. Additionally, MLEInfluence uses pearson residuals for residual analusis. References ---------- todo """ from statsmodels.stats.outliers_influence import MLEInfluence return MLEInfluence(self) def bootstrap(self, *args, **kwargs): raise NotImplementedError class BetaResultsWrapper(lm.RegressionResultsWrapper): pass wrap.populate_wrapper(BetaResultsWrapper, BetaResults)