"""
Elastic net regularization.

Routines for fitting regression models using elastic net
regularization.  The elastic net minimizes the objective function

-llf / nobs + alpha * ((1 - L1_wt) * sum(params**2) / 2
                       + L1_wt * sum(abs(params)))

The algorithm implemented here closely follows the implementation in
the R glmnet package, documented here:

http://cran.r-project.org/web/packages/glmnet/index.html

and here:

http://www.jstatsoft.org/v33/i01/paper

This routine should work for any regression model that implements
loglike, score, and hessian.
"""
import numpy as np

from statsmodels.base.model import Results
import statsmodels.base.wrapper as wrap
from statsmodels.tools.decorators import cache_readonly


def _gen_npfuncs(k, L1_wt, alpha, loglike_kwds, score_kwds, hess_kwds):
    """
    Negative penalized log-likelihood functions.

    Returns the negative penalized log-likelihood, its derivative, and
    its Hessian.  The penalty only includes the smooth (L2) term.

    All three functions have argument signature (x, model), where
    ``x`` is a point in the parameter space and ``model`` is an
    arbitrary statsmodels regression model.
    """

    def nploglike(params, model):
        nobs = model.nobs
        pen_llf = alpha[k] * (1 - L1_wt) * np.sum(params**2) / 2
        llf = model.loglike(np.r_[params], **loglike_kwds)
        return - llf / nobs + pen_llf

    def npscore(params, model):
        nobs = model.nobs
        pen_grad = alpha[k] * (1 - L1_wt) * params
        gr = -model.score(np.r_[params], **score_kwds)[0] / nobs
        return gr + pen_grad

    def nphess(params, model):
        nobs = model.nobs
        pen_hess = alpha[k] * (1 - L1_wt)
        h = -model.hessian(np.r_[params], **hess_kwds)[0, 0] / nobs + pen_hess
        return h

    return nploglike, npscore, nphess
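
# Note: for coordinate k, the three functions generated above describe the
# smooth part of the per-coordinate objective,
#
#     -loglike(x) / nobs + alpha[k] * (1 - L1_wt) * x**2 / 2,
#
# where x is the single free coefficient of a one-variable model in which the
# remaining coefficients are held fixed through an offset (see fit_elasticnet
# below).  The non-smooth term alpha[k] * L1_wt * |x| is handled separately
# in _opt_1d.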


def fit_elasticnet(model, method="coord_descent", maxiter=100, alpha=0.,
                   L1_wt=1., start_params=None, cnvrg_tol=1e-7,
                   zero_tol=1e-8, refit=False, check_step=True,
                   loglike_kwds=None, score_kwds=None, hess_kwds=None):
    """
    Return an elastic net regularized fit to a regression model.

    Parameters
    ----------
    model : model object
        A statsmodels object implementing ``loglike``, ``score``, and
        ``hessian``.
    method : {'coord_descent'}
        Only the coordinate descent algorithm is implemented.
    maxiter : int
        The maximum number of iteration cycles (an iteration cycle
        involves running coordinate descent on all variables).
    alpha : scalar or array_like
        The penalty weight.  If a scalar, the same penalty weight
        applies to all variables in the model.  If a vector, it must
        have the same length as `params`, and contains a penalty
        weight for each coefficient.
    L1_wt : scalar
        The fraction of the penalty given to the L1 penalty term.
        Must be between 0 and 1 (inclusive).  If 0, the fit is a
        ridge fit; if 1, it is a lasso fit.
    start_params : array_like
        Starting values for `params`.
    cnvrg_tol : scalar
        If `params` changes by less than this amount (in sup-norm)
        in one iteration cycle, the algorithm terminates with
        convergence.
    zero_tol : scalar
        Any estimated coefficient smaller than this value is
        replaced with zero.
    refit : bool
        If True, the model is refit using only the variables that
        have non-zero coefficients in the regularized fit.  The
        refitted model is not regularized.
    check_step : bool
        If True, confirm that the first step is an improvement and
        search further if it is not.
    loglike_kwds : dict-like or None
        Keyword arguments for the log-likelihood function.
    score_kwds : dict-like or None
        Keyword arguments for the score function.
    hess_kwds : dict-like or None
        Keyword arguments for the Hessian function.

    Returns
    -------
    Results
        A results object.

    Notes
    -----
    The ``elastic net`` penalty is a combination of L1 and L2
    penalties.

    The function that is minimized is:

    -loglike/n + alpha*((1-L1_wt)*|params|_2^2/2 + L1_wt*|params|_1)

    where |*|_1 and |*|_2 are the L1 and L2 norms.

    The computational approach used here is to obtain a quadratic
    approximation to the smooth part of the target function:

    -loglike/n + alpha*(1-L1_wt)*|params|_2^2/2

    then repeatedly optimize the L1 penalized version of this function
    along coordinate axes.
    """

    k_exog = model.exog.shape[1]

    loglike_kwds = {} if loglike_kwds is None else loglike_kwds
    score_kwds = {} if score_kwds is None else score_kwds
    hess_kwds = {} if hess_kwds is None else hess_kwds

    if np.isscalar(alpha):
        alpha = alpha * np.ones(k_exog)

    # Define starting params
    if start_params is None:
        params = np.zeros(k_exog)
    else:
        params = start_params.copy()

    btol = 1e-4
    params_zero = np.zeros(len(params), dtype=bool)

    init_args = model._get_init_kwds()
    # we do not need a copy of init_args because _get_init_kwds
    # provides a new dict
    init_args['hasconst'] = False
    model_offset = init_args.pop('offset', None)
    if 'exposure' in init_args and init_args['exposure'] is not None:
        if model_offset is None:
            model_offset = np.log(init_args.pop('exposure'))
        else:
            model_offset += np.log(init_args.pop('exposure'))

    fgh_list = [
        _gen_npfuncs(k, L1_wt, alpha, loglike_kwds, score_kwds, hess_kwds)
        for k in range(k_exog)]

    converged = False

    for itr in range(maxiter):

        # Sweep through the parameters
        params_save = params.copy()
        for k in range(k_exog):

            # Under the active set method, if a parameter becomes
            # zero we do not try to change it again.
            # TODO: give the user the option to switch this off
            if params_zero[k]:
                continue

            # Set the offset to account for the variables that are
            # being held fixed in the current coordinate
            # optimization.
            params0 = params.copy()
            params0[k] = 0
            offset = np.dot(model.exog, params0)
            if model_offset is not None:
                offset += model_offset

            # Create a one-variable model for optimization.
            model_1var = model.__class__(
                model.endog, model.exog[:, k], offset=offset, **init_args)

            # Do the one-dimensional optimization.
            func, grad, hess = fgh_list[k]
            params[k] = _opt_1d(
                func, grad, hess, model_1var, params[k], alpha[k]*L1_wt,
                tol=btol, check_step=check_step)

            # Update the active set
            if itr > 0 and np.abs(params[k]) < zero_tol:
                params_zero[k] = True
                params[k] = 0.

        # Check for convergence
        pchange = np.max(np.abs(params - params_save))
        if pchange < cnvrg_tol:
            converged = True
            break

    # Set approximate zero coefficients to be exactly zero
    params[np.abs(params) < zero_tol] = 0

    if not refit:
        results = RegularizedResults(model, params)
        results.converged = converged
        return RegularizedResultsWrapper(results)

    # Fit the reduced model to get standard errors and other
    # post-estimation results.
    ii = np.flatnonzero(params)
    cov = np.zeros((k_exog, k_exog))
    init_args = {k: getattr(model, k, None) for k in model._init_keys}
    if len(ii) > 0:
        model1 = model.__class__(
            model.endog, model.exog[:, ii], **init_args)
        rslt = model1.fit()
        params[ii] = rslt.params
        cov[np.ix_(ii, ii)] = rslt.normalized_cov_params
    else:
        # Hack: no variables were selected but we need to run fit in
        # order to get the correct results class.  So just fit a model
        # with one variable.
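        # The coefficient estimates of this placeholder fit are discarded;
        # it is only used to obtain the appropriate results class (and
        # scale attribute, if the model defines one) further below.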
        model1 = model.__class__(model.endog, model.exog[:, 0],
                                 **init_args)
        rslt = model1.fit(maxiter=0)

    # fit may return a results or a results wrapper
    if issubclass(rslt.__class__, wrap.ResultsWrapper):
        klass = rslt._results.__class__
    else:
        klass = rslt.__class__

    # Not all models have a scale
    if hasattr(rslt, 'scale'):
        scale = rslt.scale
    else:
        scale = 1.

    # The degrees of freedom should reflect the number of parameters
    # in the refit model, not including the zeros that are displayed
    # to indicate which variables were dropped.  See issue #1723 for
    # discussion about setting df parameters in model and results
    # classes.
    p, q = model.df_model, model.df_resid
    model.df_model = len(ii)
    model.df_resid = model.nobs - model.df_model

    # Assuming a standard signature for creating results classes.
    refit = klass(model, params, cov, scale=scale)
    refit.regularized = True
    refit.converged = converged
    refit.method = method
    refit.fit_history = {'iteration': itr + 1}

    # Restore df in model class, see issue #1723 for discussion.
    model.df_model, model.df_resid = p, q

    return refit


def _opt_1d(func, grad, hess, model, start, L1_wt, tol, check_step=True):
    """
    One-dimensional helper for elastic net.

    Parameters
    ----------
    func : function
        A smooth function of a single variable to be optimized
        with an L1 penalty.
    grad : function
        The gradient of `func`.
    hess : function
        The Hessian of `func`.
    model : statsmodels model
        The model being fit.
    start : real
        A starting value for the function argument.
    L1_wt : non-negative real
        The weight for the L1 penalty function.
    tol : non-negative real
        A convergence threshold.
    check_step : bool
        If True, check that the first step is an improvement and use
        bisection if it is not.  If False, return after the first
        step regardless.

    Notes
    -----
    ``func``, ``grad``, and ``hess`` have argument signature (x,
    model), where ``x`` is a point in the parameter space and
    ``model`` is the model being fit.

    If the log-likelihood for the model is exactly quadratic, the
    global minimum is returned in one step.  Otherwise numerical
    bisection is used.

    Returns
    -------
    The argmin of the objective function.
    """

    # Overview:
    # We want to minimize L(x) + L1_wt*abs(x), where L() is a smooth
    # loss function that includes the log-likelihood and L2 penalty.
    # This is a 1-dimensional optimization.  If L(x) is exactly
    # quadratic we can solve for the argmin exactly.  Otherwise we
    # approximate L(x) with a quadratic function Q(x) and try to use
    # the minimizer of Q(x) + L1_wt*abs(x).  But if this yields an
    # uphill step for the actual target function L(x) + L1_wt*abs(x),
    # then we fall back to an expensive line search.  The line search
    # is never needed for OLS.

    x = start
    f = func(x, model)
    b = grad(x, model)
    c = hess(x, model)
    d = b - c*x

    # The optimum is achieved by hard thresholding to zero
    if L1_wt > np.abs(d):
        return 0.

    # x + h is the minimizer of the Q(x) + L1_wt*abs(x)
    if d >= 0:
        h = (L1_wt - b) / c
    elif d < 0:
        h = -(L1_wt + b) / c
    else:
        return np.nan
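
    # Equivalently, x + h is the soft-thresholding update
    # soft(c*x - b, L1_wt) / c, where soft(z, t) = sign(z)*max(|z| - t, 0);
    # together with the hard-thresholding check above, this is the usual
    # coordinate descent step for a quadratic model of the loss.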

    # If the new point is not uphill for the target function, take it
    # and return.  This check is a bit expensive and unnecessary for
    # OLS.
    if not check_step:
        return x + h

    f1 = func(x + h, model) + L1_wt*np.abs(x + h)
    if f1 <= f + L1_wt*np.abs(x) + 1e-10:
        return x + h

    # Fallback for models where the loss is not quadratic
    from scipy.optimize import brent
    x_opt = brent(func, args=(model,), brack=(x-1, x+1), tol=tol)
    return x_opt


class RegularizedResults(Results):
    """
    Results for models estimated using regularization

    Parameters
    ----------
    model : Model
        The model instance used to estimate the parameters.
    params : ndarray
        The estimated (regularized) parameters.
    """
    def __init__(self, model, params):
        super().__init__(model, params)

    @cache_readonly
    def fittedvalues(self):
        """
        The predicted values from the model at the estimated parameters.
        """
        return self.model.predict(self.params)


class RegularizedResultsWrapper(wrap.ResultsWrapper):
    _attrs = {
        'params': 'columns',
        'resid': 'rows',
        'fittedvalues': 'rows',
    }
    _wrap_attrs = _attrs
wrap.populate_wrapper(RegularizedResultsWrapper,  # noqa:E305
                      RegularizedResults)
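

# A minimal usage sketch (not part of the library API): the usual entry
# point is a model's ``fit_regularized`` method, which dispatches to
# ``fit_elasticnet`` for ``method="elastic_net"`` in models such as OLS and
# GLM.  The synthetic data, penalty weight, and L1 fraction below are
# arbitrary illustration choices.
if __name__ == "__main__":
    import statsmodels.api as sm

    rng = np.random.default_rng(0)
    exog = sm.add_constant(rng.standard_normal((200, 5)))
    beta = np.r_[1.0, 2.0, 0.0, 0.0, -1.5, 0.0]
    endog = exog @ beta + rng.standard_normal(200)

    # Mix of L1 and L2 penalties; coefficients with little support in the
    # data are shrunk toward (or exactly to) zero.
    rslt = sm.OLS(endog, exog).fit_regularized(
        method="elastic_net", alpha=0.1, L1_wt=0.9)
    print(rslt.params)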