AIM-PIbd-32-Kurbanova-A-A/aimenv/Lib/site-packages/statsmodels/sandbox/predict_functional.py

"""
A predict-like function that constructs means and pointwise or
simultaneous confidence bands for the function f(x) = E[Y | X*=x,
X1=x1, ...], where X* is the focus variable and X1, X2, ... are
non-focus variables.  This is especially useful when conducting a
functional regression in which the role of x is modeled with b-splines
or other basis functions.
"""
import pandas as pd
import patsy
import numpy as np
import warnings

from statsmodels.tools.sm_exceptions import ValueWarning
from statsmodels.compat.pandas import Appender

_predict_functional_doc =\
    """
    Predictions and contrasts of a fitted model as a function of a given covariate.

    The value of the focus variable varies along a sequence of its
    quantiles, calculated from the data used to fit the model.  The
    other variables are held constant either at given values, or at
    values obtained by applying given summary functions to the data
    used to fit the model.  Optionally, a second specification of the
    non-focus variables is provided and the contrast between the two
    specifications is returned.

    Parameters
    ----------
    result : statsmodels result object
        A results object for the fitted model.
    focus_var : str
        The name of the 'focus variable'.
    summaries : dict-like
        A map from names of non-focus variables to summary functions.
        Each summary function is applied to the data used to fit the
        model, to obtain a value at which the variable is held fixed.
    values : dict-like
        Values at which a given non-focus variable is held fixed.
    summaries2 : dict-like
        A second set of summary functions used to define a contrast.
    values2 : dict-like
        A second set of fixed values used to define a contrast.
    alpha : float
        `1 - alpha` is the coverage probability.
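    ci_method : str
        The method for constructing the confidence band, one of
        'pointwise', 'scheffe', and 'simultaneous'.
    linear : bool
        If True, the predicted values and the confidence band are
        returned on the scale of the linear predictor.  If False, the
        inverse link function of the model family is applied to both.
        Must be True when a contrast is computed.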
    num_points : int
        The number of equally-spaced quantile points where the
        prediction is made.
    exog : array_like
        Explicitly provide points to cover with the confidence band.
    exog2 : array_like
        Explicitly provide points to contrast to `exog` in a functional
        confidence band.
    kwargs :
        Arguments passed to the `predict` method.

    Returns
    -------
    pred : array_like
        The predicted mean values.
    cb : array_like
        An array with two columns, containing respectively the lower
        and upper limits of a confidence band.
    fvals : array_like
        The values of the focus variable at which the prediction is
        made.

    Notes
    -----
    All variables in the model except for the focus variable should be
    included as a key in either `summaries` or `values` (unless `exog`
    is provided).

    If `summaries2` and `values2` are not provided, the returned value
    contains predicted conditional means for the outcome as the focus
    variable varies, with the other variables fixed as specified.

    If `summaries2` and/or `values2` is provided, two sets of
    predicted conditional means are calculated, and the returned value
    is the contrast between them.

    If `exog` is provided, then the rows should contain a sequence of
    values approximating a continuous path through the domain of the
    covariates.  For example, if Z(s) is the covariate expressed as a
    function of s, then the rows of exog may approximate Z(g(s)) for
    some continuous function g.  If `exog` is provided then neither of
    the summaries or values arguments should be provided.  If `exog2`
    is also provided, then the returned value is a contrast between
    the functions defined by `exog` and `exog2`.

    Examples
    --------
    Fit a model using a formula in which the predictors are age
    (modeled with splines), ethnicity (which is categorical), gender,
    and income.  Then we obtain the fitted mean values as a function
    of age for females with mean income and the most common
    ethnicity.

    >>> model = sm.OLS.from_formula('y ~ bs(age, df=4) + C(ethnicity) + gender + income', data)
    >>> result = model.fit()
    >>> mode = lambda x : x.value_counts().idxmax()
    >>> summaries = {'income': np.mean, 'ethnicity': mode}
    >>> values = {'gender': 'female'}
    >>> pr, cb, x = predict_functional(result, 'age', summaries, values)

    Fit a model using arrays.  Plot the means as a function of x3,
    holding x1 fixed at its mean value in the data used to fit the
    model, and holding x2 fixed at 1.

    >>> model = sm.OLS(y ,x)
    >>> result = model.fit()
    >>> summaries = {'x1': np.mean}
    >>> values = {'x2': 1}
    >>> pr, cb, x = predict_functional(result, 'x3', summaries, values)

    Fit a model using a formula and construct a contrast comparing the
    female and male predicted mean functions.

    >>> model = sm.OLS.from_formula('y ~ bs(age, df=4) + gender', data)
    >>> result = model.fit()
    >>> values = {'gender': 'female'}
    >>> values2 = {'gender': 'male'}
    >>> pr, cb, x = predict_functional(result, 'age', values=values, values2=values2)
    """


def _make_exog_from_formula(result, focus_var, summaries, values, num_points):
    """
    Create data frames for exploring a fitted model as a function of one variable.

    This works for models fit with a formula.

    Parameters
    ----------
    result : statsmodels result object
        Results of a model fit with a formula.  The data frame
        `result.model.data.frame` and the design information
        `result.model.data.design_info` are used.
    focus_var : str
        Name of the column that is varied along its percentiles.
    summaries : dict-like or None
        Map from column names to functions.  Each function is applied
        to the corresponding column of the model data frame and the
        result fills that column.
    values : dict-like or None
        Map from column names to the value assigned to that column.
    num_points : int
        The number of percentile points, i.e. the number of rows created.

    Returns
    -------
    dexog : data frame
        The data frame `fexog` processed through the model formula.
    fexog : data frame
        A data frame in which the focus variable varies and the other
        variables are fixed at specified or computed values.
    fvals : ndarray
        The values of the focus variable, one per row of `fexog`.

    Raises
    ------
    ValueError
        If the focus variable has object dtype.
    """

    model = result.model
    exog = model.data.frame

    if summaries is None:
        summaries = {}
    if values is None:
        values = {}

    # Compare dtypes by equality; identity only holds for interned
    # dtype instances.
    if exog[focus_var].dtype == np.dtype('O'):
        raise ValueError('focus variable may not have object type')

    colnames = list(summaries.keys()) + list(values.keys()) + [focus_var]
    dtypes = [exog[x].dtype for x in colnames]

    # Check for variables whose values are not set either through
    # `values` or `summaries`.  Since the model data frame can contain
    # extra variables not referenced in the formula RHS, this may not
    # be a problem, so just warn.  There is no obvious way to extract
    # from a formula all the variable names that it references.
    varl = set(exog.columns.tolist()) - {model.endog_names}
    unmatched = list(varl - set(colnames))
    if len(unmatched) > 0:
        warnings.warn("%s in data frame but not in summaries or values."
                      % ", ".join(["'%s'" % x for x in unmatched]),
                      ValueWarning)

    # Pre-allocate every column with the dtype it has in the model
    # data frame; the actual values are filled in below.
    ix = range(num_points)
    fexog = pd.DataFrame(index=ix, columns=colnames)
    for d, x in zip(dtypes, colnames):
        fexog[x] = pd.Series(index=ix, dtype=d)

    # The values of the 'focus variable' are a sequence of percentiles
    pctls = np.linspace(0, 100, num_points).tolist()
    fvals = np.percentile(exog[focus_var], pctls)
    fvals = np.asarray(fvals)
    fexog.loc[:, focus_var] = fvals

    # The values of the other variables may be given by summary functions...
    for ky in summaries.keys():
        fexog.loc[:, ky] = summaries[ky](exog.loc[:, ky])

    # or they may be provided as given values.
    for ky in values.keys():
        fexog[ky] = values[ky]

    # Run the raw frame through the formula of the fitted model.
    dexog = patsy.dmatrix(model.data.design_info, fexog,
                          return_type='dataframe')
    return dexog, fexog, fvals


def _make_exog_from_arrays(result, focus_var, summaries, values, num_points):
    """
    Build an array for exploring a fitted model as a function of one variable.

    This works for models fit without a formula.

    Parameters
    ----------
    result : statsmodels result object
        Results whose `model.exog` and `model.exog_names` are used.
    focus_var : str
        Name of the column that is varied along its percentiles.
    summaries : dict-like or None
        Map from column names to functions applied to the model column.
    values : dict-like or None
        Map from column names to the value assigned to that column.
    num_points : int
        The number of percentile points, i.e. the number of rows created.

    Returns
    -------
    exog : ndarray
        An array in which the focus variable varies and the other variables
        are fixed at specified or computed values.  Columns that are not
        set by any argument remain zero.
    fvals : ndarray
        The values of the focus variable, one per row of `exog`.
    """

    fitted = result.model
    data = fitted.exog
    names = fitted.exog_names

    summaries = {} if summaries is None else summaries
    values = {} if values is None else values

    design = np.zeros((num_points, data.shape[1]))

    # Warn about model columns that no argument assigns a value to.
    covered = list(values.keys()) + list(summaries.keys()) + [focus_var]
    missing = list(set(names) - set(covered))
    if missing:
        quoted = ", ".join(["'%s'" % nm for nm in missing])
        warnings.warn("%s in model but not in `summaries` or `values`."
                      % quoted,
                      ValueWarning)

    # The focus column runs through equally spaced percentiles of the data.
    grid = np.linspace(0, 100, num_points).tolist()
    focus_col = names.index(focus_var)
    fvals = np.percentile(data[:, focus_col], grid)
    design[:, focus_col] = fvals

    # Columns held at a summary of the observed data ...
    for name in summaries.keys():
        col = names.index(name)
        summarize = summaries[name]
        design[:, col] = summarize(data[:, col])

    # ... and columns held at explicitly given values.
    for name in values.keys():
        col = names.index(name)
        design[:, col] = values[name]

    return design, fvals


def _make_exog(result, focus_var, summaries, values, num_points):
    """
    Dispatch to the formula or array variant of the exog builder.

    Returns
    -------
    dexog, fexog, fvals
        The three values produced by `_make_exog_from_formula`.  For a
        model without a data frame, the array produced by
        `_make_exog_from_arrays` is returned as both `dexog` and `fexog`.
    """
    # A model fit with a formula carries the original data frame.
    fitted_with_formula = hasattr(result.model.data, "frame")

    if not fitted_with_formula:
        design, fvals = _make_exog_from_arrays(
            result, focus_var, summaries, values, num_points)
        return design, design, fvals

    dexog, fexog, fvals = _make_exog_from_formula(
        result, focus_var, summaries, values, num_points)
    return dexog, fexog, fvals


def _check_args(values, summaries, values2, summaries2):
    """
    Normalize the four specification arguments and validate them.

    Any argument that is None is replaced by an empty dict.

    Returns
    -------
    values, summaries, values2, summaries2
        The (possibly replaced) arguments, in that order.

    Raises
    ------
    ValueError
        If a name is a key of both `summaries` and `values`, or of both
        `summaries2` and `values2`.
    """
    values = {} if values is None else values
    values2 = {} if values2 is None else values2
    summaries = {} if summaries is None else summaries
    summaries2 = {} if summaries2 is None else summaries2

    # A variable may be fixed by a value or by a summary, but not both.
    pairs = ((summaries, values), (summaries2, values2))
    for summ, vals in pairs:
        shared = list(set(vals.keys()) & set(summ.keys()))
        if shared:
            raise ValueError("One or more variable names are contained in both `summaries` and `values`:" +
                             ", ".join(shared))

    return values, summaries, values2, summaries2


@Appender(_predict_functional_doc)
def predict_functional(result, focus_var, summaries=None, values=None,
                       summaries2=None, values2=None, alpha=0.05,
                       ci_method="pointwise", linear=True, num_points=10,
                       exog=None, exog2=None, **kwargs):
    # NOTE: the user-facing docstring is attached by the `Appender`
    # decorator, so implementation notes are kept in comments.
    #
    # Raises ValueError for an unknown `ci_method`, for a contrast
    # requested with `linear=False`, for `exog2` given without `exog`,
    # and for `exog` combined with any summaries/values argument.

    if ci_method not in ("pointwise", "scheffe", "simultaneous"):
        raise ValueError('confidence band method must be one of '
                         '`pointwise`, `scheffe`, and `simultaneous`.')

    if exog2 is not None and exog is None:
        raise ValueError("`exog2` can only be provided together with `exog`")

    # Any second specification, including `exog2`, requests a contrast.
    contrast = any(x is not None for x in (values2, summaries2, exog2))

    if contrast and not linear:
        raise ValueError("`linear` must be True for computing contrasts")

    model = result.model
    if exog is not None:

        if any(x is not None for x in [summaries, summaries2, values, values2]):
            raise ValueError("if `exog` is provided then do not "
                             "provide `summaries` or `values`")

        fexog = exog
        dexog = patsy.dmatrix(model.data.design_info,
                              fexog, return_type='dataframe')
        fvals = exog[focus_var]

        if exog2 is not None:
            # The second design must be built from `exog2`, not `exog`.
            # NOTE(review): the subtraction below presumably aligns on the
            # index, so `exog` and `exog2` should share one -- confirm.
            fexog2 = exog2
            dexog2 = patsy.dmatrix(model.data.design_info,
                                   fexog2, return_type='dataframe')

    else:

        values, summaries, values2, summaries2 = _check_args(values,
                                                             summaries,
                                                             values2,
                                                             summaries2)

        dexog, fexog, fvals = _make_exog(result, focus_var, summaries,
                                         values, num_points)

        # An empty second specification defines no contrast, so only
        # build (and later use) the second design when it has content.
        contrast = len(summaries2) + len(values2) > 0
        if contrast:
            dexog2, fexog2, _ = _make_exog(result, focus_var, summaries2,
                                           values2, num_points)

    # Imported here rather than at module level, as in the original code.
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod.generalized_estimating_equations import GEE
    if isinstance(result.model, (GLM, GEE)):
        # Work on the linear predictor; the inverse link is applied at
        # the end if `linear` is False.
        kwargs_pred = kwargs.copy()
        kwargs_pred.update({"which": "linear"})
    else:
        kwargs_pred = kwargs

    pred = result.predict(exog=fexog, **kwargs_pred)
    if contrast:
        pred2 = result.predict(exog=fexog2, **kwargs_pred)
        pred = pred - pred2
        dexog = dexog - dexog2

    if ci_method == 'pointwise':

        t_test = result.t_test(dexog)
        cb = t_test.conf_int(alpha=alpha)

    elif ci_method == 'scheffe':

        t_test = result.t_test(dexog)
        sd = t_test.sd
        # One row per design row; `num_points` is not the row count when
        # `exog` is supplied by the caller.
        cb = np.zeros((dexog.shape[0], 2))

        # Scheffe's method: the half-width is sd * sqrt(df1 * F), where F
        # is the (1 - alpha) quantile of the F(df1, df2) distribution.
        from scipy.stats.distributions import f as fdist
        df1 = result.model.exog.shape[1]
        df2 = result.model.exog.shape[0] - df1
        qf = fdist.ppf(1 - alpha, df1, df2)
        fx = sd * np.sqrt(df1 * qf)
        cb[:, 0] = pred - fx
        cb[:, 1] = pred + fx

    elif ci_method == 'simultaneous':

        sigma, c = _glm_basic_scr(result, dexog, alpha)
        cb = np.zeros((dexog.shape[0], 2))
        cb[:, 0] = pred - c*sigma
        cb[:, 1] = pred + c*sigma

    if not linear:
        # May need to support other models with link-like functions.
        link = result.family.link
        pred = link.inverse(pred)
        cb = link.inverse(cb)

    return pred, cb, fvals


def _glm_basic_scr(result, exog, alpha):
    """
    The basic SCR from (Sun et al. Annals of Statistics 2000).

    Computes the quantities needed for a simultaneous confidence
    region (SCR).

    Parameters
    ----------
    result : results instance
        The fitted GLM results instance
    exog : array_like
        The exog values spanning the interval
    alpha : float
        `1 - alpha` is the coverage probability.

    Returns
    -------
    sigma : ndarray
        The standard deviation of the linear predictor at each row of
        `exog`.
    c : float
        The multiplier of `sigma` that defines the half-width of the band.

    Raises
    ------
    ValueError
        If the root finder reports that it did not converge.

    Notes
    -----
    The rows of `exog` should be a sequence of covariate values
    obtained by taking one 'free variable' x and varying it over an
    interval.  The matrix `exog` is thus the basis functions and any
    other covariates evaluated as x varies.
    """
    from scipy.optimize import brentq
    from scipy.stats.distributions import norm

    nobs = result.model.exog.shape[0]

    # Invert the parameter covariance instead of recomputing the Hessian.
    param_cov = result.cov_params()
    hessian = np.linalg.inv(param_cov)

    # Proposition 3.1 of Sun et al.: upper Cholesky triangle of the
    # Hessian scaled by the sample size.
    upper = np.linalg.cholesky(hessian / nobs).T

    # Variance, then SD, of the linear predictor at each row of exog.
    lin_var = (np.dot(exog, param_cov) * exog).sum(1)
    sigma = np.asarray(np.sqrt(lin_var))

    # kappa_0 (formula 42 from Sun et al): total length of the steps
    # between consecutive normalized rows.
    path = np.linalg.solve(upper.T, exog.T).T
    path /= np.sqrt(nobs)
    path /= sigma[:, None]
    steps = np.diff(path, 1, axis=0)
    step_lengths = np.sqrt((steps**2).sum(1))
    kappa_0 = step_lengths.sum()

    # The root of this function is the multiplier for the confidence
    # band, see Sun et al. equation 35.
    def excess(c):
        return kappa_0 * np.exp(-c**2/2) / np.pi + 2*(1 - norm.cdf(c)) - alpha

    c, root_info = brentq(excess, 1, 10, full_output=True)
    if not root_info.converged:
        raise ValueError("Root finding error in basic SCR")

    return sigma, c