"""Correlation plot functions."""
from statsmodels.compat.pandas import deprecate_kwarg

import calendar

import numpy as np
import pandas as pd

from statsmodels.graphics import utils
from statsmodels.tools.validation import array_like
from statsmodels.tsa.stattools import acf, pacf, ccf


def _prepare_data_corr_plot(x, lags, zero):
    """
    Normalize the ``lags`` argument of the correlation plots.

    Parameters
    ----------
    x : array_like
        Data whose number of observations (``len(x)``) determines the
        default number of lags.
    lags : {None, int, array_like}
        If None, lags run up to ``min(ceil(10 * log10(nobs)), nobs // 2)``.
        If a scalar, lags run up to and including ``lags``. Otherwise the
        values are used as given (cast to int).
    zero : bool
        Whether generated lag ranges start at 0 (True) or at 1 (False).
        Ignored when ``lags`` is an array.

    Returns
    -------
    lags : ndarray
        Integer lags to plot.
    nlags : int
        Largest lag, i.e. the number of lags that must be computed.
    irregular : bool
        True when the computed correlations have to be indexed by ``lags``
        before plotting (lag 0 dropped or user-supplied lag array).
    """
    zero = bool(zero)
    irregular = not zero
    first_lag = int(not zero)
    if lags is None:
        # GH 4663 - use a sensible default value
        nobs = len(x)
        lim = min(int(np.ceil(10 * np.log10(nobs))), nobs // 2)
        lags = np.arange(first_lag, lim + 1)
    elif np.isscalar(lags):
        lags = np.arange(first_lag, int(lags) + 1)  # +1 for zero lag
    else:
        irregular = True
        lags = np.asanyarray(lags).astype(int)
    nlags = lags.max(0)

    return lags, nlags, irregular


def _plot_corr(
    ax,
    title,
    acf_x,
    confint,
    lags,
    irregular,
    use_vlines,
    vlines_kwargs,
    auto_ylims=False,
    skip_lag0_confint=True,
    **kwargs,
):
    """
    Draw correlations and, optionally, their confidence band on ``ax``.

    Parameters
    ----------
    ax : Axes
        Matplotlib axes to draw on.
    title : str
        Title set on ``ax``.
    acf_x : ndarray
        Correlation values.
    confint : {ndarray, None}
        Two-column array of lower and upper confidence limits, or None
        when no band should be drawn.
    lags : ndarray
        Horizontal positions. When ``irregular`` is True they are also used
        to index ``acf_x`` and ``confint``.
    irregular : bool
        If True, ``acf_x`` and ``confint`` are subset with ``lags``.
    use_vlines : bool
        If True, vertical lines from 0 to each correlation and a horizontal
        line (``ax.axhline(**kwargs)``) are drawn.
    vlines_kwargs : dict
        Keyword arguments passed to ``ax.vlines``.
    auto_ylims : bool, optional
        If True, the y-limits are derived from the plotted values instead
        of the fixed range (-1, 1).
    skip_lag0_confint : bool, optional
        If True and the first lag is 0, the band is not drawn at lag 0.
    **kwargs
        Passed to ``ax.axhline`` (when ``use_vlines``) and ``ax.plot``.
    """
    if irregular:
        acf_x = acf_x[lags]
        if confint is not None:
            confint = confint[lags]

    if use_vlines:
        ax.vlines(lags, [0], acf_x, **vlines_kwargs)
        ax.axhline(**kwargs)

    kwargs.setdefault("marker", "o")
    kwargs.setdefault("markersize", 5)
    if "ls" not in kwargs:
        # gh-2369
        kwargs.setdefault("linestyle", "None")
    ax.margins(0.05)
    ax.plot(lags, acf_x, **kwargs)
    ax.set_title(title)

    ax.set_ylim(-1, 1)
    if auto_ylims:
        lower, upper = min(acf_x), max(acf_x)
        if confint is not None:
            # The band is drawn centered on zero, see fill_between below.
            lower = np.minimum(lower, min(confint[:, 0] - acf_x))
            upper = np.maximum(upper, max(confint[:, 1] - acf_x))
        else:
            # No band requested (alpha=None): scale to the correlations
            # only, but keep the zero line inside the visible range.
            lower = min(lower, 0)
            upper = max(upper, 0)
        ax.set_ylim(1.25 * lower, 1.25 * upper)

    if confint is not None:
        if skip_lag0_confint and lags[0] == 0:
            lags = lags[1:]
            confint = confint[1:]
            acf_x = acf_x[1:]
        # astype returns a copy, so the caller's lags are not modified.
        lags = lags.astype(float)
        # Widen the band by half a lag on both ends so that the outermost
        # markers are fully covered.
        lags[np.argmin(lags)] -= 0.5
        lags[np.argmax(lags)] += 0.5
        # Subtracting acf_x centers the band on zero.
        ax.fill_between(
            lags, confint[:, 0] - acf_x, confint[:, 1] - acf_x, alpha=0.25
        )


@deprecate_kwarg("unbiased", "adjusted")
def plot_acf(
    x,
    ax=None,
    lags=None,
    *,
    alpha=0.05,
    use_vlines=True,
    adjusted=False,
    fft=False,
    missing="none",
    title="Autocorrelation",
    zero=True,
    auto_ylims=False,
    bartlett_confint=True,
    vlines_kwargs=None,
    **kwargs,
):
    """
    Plot the autocorrelation function

    Plots lags on the horizontal and the correlations on vertical axis.

    Parameters
    ----------
    x : array_like
        Array of time-series values
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure
        being created.
    lags : {int, array_like}, optional
        An int or array of lag values, used on horizontal axis. When lags
        is an int, lags ``0, 1, ..., lags`` are used (starting at 1 if
        ``zero`` is False). If not provided, lags up to
        ``min(ceil(10 * log10(nobs)), nobs // 2)`` are used.
    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level
        are returned. For instance if alpha=.05, 95 % confidence intervals
        are returned where the standard deviation is computed according to
        Bartlett's formula. If None, no confidence intervals are plotted.
    use_vlines : bool, optional
        If True, vertical lines and markers are plotted.
        If False, only markers are plotted.  The default marker is 'o'; it
        can be overridden with a ``marker`` kwarg.
    adjusted : bool
        If True, then denominators for autocovariance are n-k, otherwise n
    fft : bool, optional
        If True, computes the ACF via FFT.
    missing : str, optional
        A string in ['none', 'raise', 'conservative', 'drop'] specifying
        how the NaNs are to be treated.
    title : str, optional
        Title to place on plot.  Default is 'Autocorrelation'
    zero : bool, optional
        Flag indicating whether to include the 0-lag autocorrelation.
        Default is True.
    auto_ylims : bool, optional
        If True, adjusts automatically the y-axis limits to ACF values.
    bartlett_confint : bool, default True
        Confidence intervals for ACF values are generally placed at 2
        standard errors around r_k. The formula used for standard error
        depends upon the situation. If the autocorrelations are being used
        to test for randomness of residuals as part of the ARIMA routine,
        the standard errors are determined assuming the residuals are white
        noise. The approximate formula for any lag is that standard error
        of each r_k = 1/sqrt(N). See section 9.4 of [1] for more details on
        the 1/sqrt(N) result. For more elementary discussion, see section
        5.3.2 in [2].
        For the ACF of raw data, the standard error at a lag k is
        found as if the right model was an MA(k-1). This allows the
        possible interpretation that if all autocorrelations past a
        certain lag are within the limits, the model might be an MA of
        order defined by the last significant autocorrelation. In this
        case, a moving average model is assumed for the data and the
        standard errors for the confidence intervals should be
        generated using Bartlett's formula. For more details on
        Bartlett formula result, see section 7.2 in [1].
    vlines_kwargs : dict, optional
        Optional dictionary of keyword arguments that are passed to vlines.
    **kwargs : kwargs, optional
        Optional keyword arguments that are directly passed on to the
        Matplotlib ``plot`` and ``axhline`` functions.

    Returns
    -------
    Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    matplotlib.pyplot.xcorr
    matplotlib.pyplot.acorr

    Notes
    -----
    Adapted from matplotlib's `xcorr`.

    Data are plotted as ``plot(lags, corr, **kwargs)``

    kwargs is used to pass matplotlib optional arguments to both the line
    tracing the autocorrelations and for the horizontal line at 0. These
    options must be valid for a Line2D object.

    vlines_kwargs is used to pass additional optional arguments to the
    vertical lines connecting each autocorrelation to the axis.  These
    options must be valid for a LineCollection object.

    References
    ----------
    [1] Brockwell and Davis, 1987. Time Series Theory and Methods
    [2] Brockwell and Davis, 2010. Introduction to Time Series and
    Forecasting, 2nd edition.

    Examples
    --------
    >>> import pandas as pd
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm

    >>> dta = sm.datasets.sunspots.load_pandas().data
    >>> dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700', '2008'))
    >>> del dta["YEAR"]
    >>> sm.graphics.tsa.plot_acf(dta.values.squeeze(), lags=40)
    >>> plt.show()

    .. plot:: plots/graphics_tsa_plot_acf.py
    """
    fig, ax = utils.create_mpl_ax(ax)

    lags, nlags, irregular = _prepare_data_corr_plot(x, lags, zero)
    vlines_kwargs = {} if vlines_kwargs is None else vlines_kwargs

    confint = None
    # acf has different return type based on alpha
    acf_x = acf(
        x,
        nlags=nlags,
        alpha=alpha,
        fft=fft,
        bartlett_confint=bartlett_confint,
        adjusted=adjusted,
        missing=missing,
    )
    if alpha is not None:
        acf_x, confint = acf_x[:2]

    _plot_corr(
        ax,
        title,
        acf_x,
        confint,
        lags,
        irregular,
        use_vlines,
        vlines_kwargs,
        auto_ylims=auto_ylims,
        **kwargs,
    )

    return fig


def plot_pacf(
    x,
    ax=None,
    lags=None,
    alpha=0.05,
    method="ywm",
    use_vlines=True,
    title="Partial Autocorrelation",
    zero=True,
    vlines_kwargs=None,
    **kwargs,
):
    """
    Plot the partial autocorrelation function

    Parameters
    ----------
    x : array_like
        Array of time-series values
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure
        being created.
    lags : {int, array_like}, optional
        An int or array of lag values, used on horizontal axis. When lags
        is an int, lags ``0, 1, ..., lags`` are used (starting at 1 if
        ``zero`` is False). If not provided, lags up to
        ``min(ceil(10 * log10(nobs)), nobs // 2)`` are used.
    alpha : float, optional
        If a number is given, the confidence intervals for the given level
        are returned. For instance if alpha=.05, 95 % confidence intervals
        are returned where the standard deviation is computed according to
        1/sqrt(len(x))
    method : str
        Specifies which method for the calculations to use:

        - "ywm" or "ywmle" : Yule-Walker without adjustment. Default.
        - "yw" or "ywadjusted" : Yule-Walker with sample-size adjustment in
          denominator for acovf.
        - "ols" : regression of time series on lags of it and on constant.
        - "ols-inefficient" : regression of time series on lags using a
          single common sample to estimate all pacf coefficients.
        - "ols-adjusted" : regression of time series on lags with a bias
          adjustment.
        - "ld" or "ldadjusted" : Levinson-Durbin recursion with bias
          correction.
        - "ldb" or "ldbiased" : Levinson-Durbin recursion without bias
          correction.

    use_vlines : bool, optional
        If True, vertical lines and markers are plotted.
        If False, only markers are plotted.  The default marker is 'o'; it
        can be overridden with a ``marker`` kwarg.
    title : str, optional
        Title to place on plot.  Default is 'Partial Autocorrelation'
    zero : bool, optional
        Flag indicating whether to include the 0-lag autocorrelation.
        Default is True.
    vlines_kwargs : dict, optional
        Optional dictionary of keyword arguments that are passed to vlines.
    **kwargs : kwargs, optional
        Optional keyword arguments that are directly passed on to the
        Matplotlib ``plot`` and ``axhline`` functions.

    Returns
    -------
    Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    matplotlib.pyplot.xcorr
    matplotlib.pyplot.acorr

    Notes
    -----
    Plots lags on the horizontal and the correlations on vertical axis.
    Adapted from matplotlib's `xcorr`.

    Data are plotted as ``plot(lags, corr, **kwargs)``

    kwargs is used to pass matplotlib optional arguments to both the line
    tracing the autocorrelations and for the horizontal line at 0. These
    options must be valid for a Line2D object.

    vlines_kwargs is used to pass additional optional arguments to the
    vertical lines connecting each autocorrelation to the axis.  These
    options must be valid for a LineCollection object.

    Examples
    --------
    >>> import pandas as pd
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm

    >>> dta = sm.datasets.sunspots.load_pandas().data
    >>> dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700', '2008'))
    >>> del dta["YEAR"]
    >>> sm.graphics.tsa.plot_pacf(dta.values.squeeze(), lags=40, method="ywm")
    >>> plt.show()

    .. plot:: plots/graphics_tsa_plot_pacf.py
    """
    fig, ax = utils.create_mpl_ax(ax)
    vlines_kwargs = {} if vlines_kwargs is None else vlines_kwargs
    lags, nlags, irregular = _prepare_data_corr_plot(x, lags, zero)

    confint = None
    # pacf has different return type based on alpha
    if alpha is None:
        acf_x = pacf(x, nlags=nlags, alpha=alpha, method=method)
    else:
        acf_x, confint = pacf(x, nlags=nlags, alpha=alpha, method=method)

    _plot_corr(
        ax,
        title,
        acf_x,
        confint,
        lags,
        irregular,
        use_vlines,
        vlines_kwargs,
        **kwargs,
    )

    return fig


def plot_ccf(
    x,
    y,
    *,
    ax=None,
    lags=None,
    negative_lags=False,
    alpha=0.05,
    use_vlines=True,
    adjusted=False,
    fft=False,
    title="Cross-correlation",
    auto_ylims=False,
    vlines_kwargs=None,
    **kwargs,
):
    """
    Plot the cross-correlation function

    Correlations between ``x`` and the lags of ``y`` are calculated.

    The lags are shown on the horizontal axis and the correlations
    on the vertical axis.

    Parameters
    ----------
    x, y : array_like
        Arrays of time-series values.
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in, otherwise a new figure
        with one subplot is created.
    lags : {int, array_like}, optional
        An int or array of lag values, used on the horizontal axis. When
        lags is an int, lags ``0, 1, ..., lags`` are used. If not provided,
        lags up to ``min(ceil(10 * log10(nobs)), nobs // 2)`` are used.
    negative_lags: bool, optional
        If True, negative lags are shown on the horizontal axis.
    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level
        are plotted, e.g. if alpha=.05, 95 % confidence intervals are shown.
        If None, confidence intervals are not shown on the plot.
    use_vlines : bool, optional
        If True, shows vertical lines and markers for the correlation
        values. If False, only shows markers.  The default marker is 'o';
        it can be overridden with a ``marker`` kwarg.
    adjusted : bool
        If True, then denominators for cross-correlations are n-k,
        otherwise n.
    fft : bool, optional
        If True, computes the CCF via FFT.
    title : str, optional
        Title to place on plot. Default is 'Cross-correlation'.
    auto_ylims : bool, optional
        If True, adjusts automatically the vertical axis limits to CCF
        values.
    vlines_kwargs : dict, optional
        Optional dictionary of keyword arguments that are passed to vlines.
    **kwargs : kwargs, optional
        Optional keyword arguments that are directly passed on to the
        Matplotlib ``plot`` and ``axhline`` functions.

    Returns
    -------
    Figure
        The figure where the plot is drawn. This is either an existing
        figure if the `ax` argument is provided, or a newly created figure
        if `ax` is None.

    See Also
    --------
    statsmodels.graphics.tsaplots.plot_acf

    Examples
    --------
    >>> import pandas as pd
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm

    >>> dta = sm.datasets.macrodata.load_pandas().data
    >>> diffed = dta.diff().dropna()
    >>> sm.graphics.tsa.plot_ccf(diffed["unemp"], diffed["infl"])
    >>> plt.show()
    """
    fig, ax = utils.create_mpl_ax(ax)

    lags, nlags, irregular = _prepare_data_corr_plot(x, lags, True)
    vlines_kwargs = {} if vlines_kwargs is None else vlines_kwargs

    ccf_res = ccf(
        x, y, adjusted=adjusted, fft=fft, alpha=alpha, nlags=nlags + 1
    )
    # ccf has different return type based on alpha
    if alpha is not None:
        ccf_xy, confint = ccf_res
    else:
        ccf_xy = ccf_res
        confint = None

    if irregular:
        # Select the requested lags here, while they are still
        # non-negative. Doing it in _plot_corr after the sign flip below
        # would index the arrays from the end and plot the wrong values.
        ccf_xy = ccf_xy[lags]
        if confint is not None:
            confint = confint[lags]
        irregular = False

    if negative_lags:
        lags = -lags

    _plot_corr(
        ax,
        title,
        ccf_xy,
        confint,
        lags,
        irregular,
        use_vlines,
        vlines_kwargs,
        auto_ylims=auto_ylims,
        skip_lag0_confint=False,
        **kwargs,
    )

    return fig


def plot_accf_grid(
    x,
    *,
    varnames=None,
    fig=None,
    lags=None,
    negative_lags=True,
    alpha=0.05,
    use_vlines=True,
    adjusted=False,
    fft=False,
    missing="none",
    zero=True,
    auto_ylims=False,
    bartlett_confint=False,
    vlines_kwargs=None,
    **kwargs,
):
    """
    Plot auto/cross-correlation grid

    Plots lags on the horizontal axis and the correlations
    on the vertical axis of each graph.

    Parameters
    ----------
    x : array_like
        2D array of time-series values: rows are observations,
        columns are variables.
    varnames: sequence of str, optional
        Variable names to use in plot titles. If ``x`` is a pandas dataframe
        and ``varnames`` is provided, it overrides the column names
        of the dataframe. If ``varnames`` is not provided and ``x`` is not
        a dataframe, variable names ``x[0]``, ``x[1]``, etc. are generated.
    fig : Matplotlib figure instance, optional
        If given, this figure is used to plot in, otherwise a new figure
        is created.
    lags : {int, array_like}, optional
        An int or array of lag values, used on horizontal axes. When lags
        is an int, lags ``0, 1, ..., lags`` are used (autocorrelation plots
        start at 1 if ``zero`` is False). If not provided, lags up to
        ``min(ceil(10 * log10(nobs)), nobs // 2)`` are used.
    negative_lags: bool, optional
        If True, negative lags are shown on the horizontal axes of plots
        below the main diagonal.
    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level
        are plotted, e.g. if alpha=.05, 95 % confidence intervals are shown.
        If None, confidence intervals are not shown on the plot.
    use_vlines : bool, optional
        If True, shows vertical lines and markers for the correlation
        values. If False, only shows markers.  The default marker is 'o';
        it can be overridden with a ``marker`` kwarg.
    adjusted : bool
        If True, then denominators for correlations are n-k, otherwise n.
    fft : bool, optional
        If True, computes the ACF via FFT.
    missing : str, optional
        A string in ['none', 'raise', 'conservative', 'drop'] specifying
        how NaNs are to be treated.
    zero : bool, optional
        Flag indicating whether to include the 0-lag autocorrelations
        (which are always equal to 1). Default is True.
    auto_ylims : bool, optional
        If True, adjusts automatically the vertical axis limits
        to correlation values.
    bartlett_confint : bool, default False
        If True, use Bartlett's formula to calculate confidence intervals
        in auto-correlation plots. See the description of ``plot_acf`` for
        details. This argument does not affect cross-correlation plots.
    vlines_kwargs : dict, optional
        Optional dictionary of keyword arguments that are passed to vlines.
    **kwargs : kwargs, optional
        Optional keyword arguments that are directly passed on to the
        Matplotlib ``plot`` and ``axhline`` functions.

    Returns
    -------
    Figure
        If `fig` is None, the created figure.  Otherwise, `fig` is returned.
        Plots on the grid show the cross-correlation of the row variable
        with the lags of the column variable.

    See Also
    --------
    statsmodels.graphics.tsaplots

    Examples
    --------
    >>> import pandas as pd
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm

    >>> dta = sm.datasets.macrodata.load_pandas().data
    >>> diffed = dta.diff().dropna()
    >>> sm.graphics.tsa.plot_accf_grid(diffed[["unemp", "infl"]])
    >>> plt.show()
    """
    from statsmodels.tools.data import _is_using_pandas

    # Called for validation only; the converted array is not used.
    array_like(x, "x", ndim=2)

    use_pandas = _is_using_pandas(x, None)
    if not use_pandas:
        # Convert before reading the shape so that nested sequences work.
        x = np.asarray(x)
    m = x.shape[1]

    fig = utils.create_mpl_fig(fig)
    gs = fig.add_gridspec(m, m)

    if use_pandas:
        varnames = varnames or list(x.columns)

        def get_var(i):
            return x.iloc[:, i]

    else:
        varnames = varnames or [f'x[{i}]' for i in range(m)]

        def get_var(i):
            return x[:, i]

    for i in range(m):
        for j in range(m):
            ax = fig.add_subplot(gs[i, j])
            if i == j:
                # Diagonal: autocorrelation of variable i.
                plot_acf(
                    get_var(i),
                    ax=ax,
                    title=f'ACF({varnames[i]})',
                    lags=lags,
                    alpha=alpha,
                    use_vlines=use_vlines,
                    adjusted=adjusted,
                    fft=fft,
                    missing=missing,
                    zero=zero,
                    auto_ylims=auto_ylims,
                    bartlett_confint=bartlett_confint,
                    vlines_kwargs=vlines_kwargs,
                    **kwargs,
                )
            else:
                # Off-diagonal: row variable against lags of the column
                # variable; negative lags only below the main diagonal.
                plot_ccf(
                    get_var(i),
                    get_var(j),
                    ax=ax,
                    title=f'CCF({varnames[i]}, {varnames[j]})',
                    lags=lags,
                    negative_lags=negative_lags and i > j,
                    alpha=alpha,
                    use_vlines=use_vlines,
                    adjusted=adjusted,
                    fft=fft,
                    auto_ylims=auto_ylims,
                    vlines_kwargs=vlines_kwargs,
                    **kwargs,
                )

    return fig


def seasonal_plot(grouped_x, xticklabels, ylabel=None, ax=None):
    """
    Consider using one of month_plot or quarter_plot unless you need
    irregular plotting.

    Parameters
    ----------
    grouped_x : iterable of DataFrames
        Should be a GroupBy object (or similar pair of group_names and groups
        as DataFrames) with a DatetimeIndex or PeriodIndex
    xticklabels : list of str
        List of season labels, one for each group.
    ylabel : str
        Label for y axis
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure
        being created.

    Returns
    -------
    Figure
        The figure returned by ``utils.create_mpl_ax(ax)``.
    """
    fig, ax = utils.create_mpl_ax(ax)
    start = 0
    ticks = []
    for _season, df in grouped_x:
        # sort_index returns a new, sorted object (it is not in-place), so
        # the result must be assigned and the group itself is not modified.
        df = df.sort_index()
        nobs = len(df)
        x_plot = np.arange(start, start + nobs)
        ticks.append(x_plot.mean())
        ax.plot(x_plot, df.values, "k")
        # Horizontal red line at the mean of the season.
        ax.hlines(
            df.values.mean(), x_plot[0], x_plot[-1], colors="r", linewidth=3
        )
        start += nobs

    ax.set_xticks(ticks)
    ax.set_xticklabels(xticklabels)
    ax.set_ylabel(ylabel)
    ax.margins(0.1, 0.05)
    return fig


def month_plot(x, dates=None, ylabel=None, ax=None):
    """
    Seasonal plot of monthly data.

    Parameters
    ----------
    x : array_like
        Seasonal data to plot. If dates is None, x must be a pandas object
        with a PeriodIndex or DatetimeIndex with a monthly frequency.
    dates : array_like, optional
        If `x` is not a pandas object, then dates must be supplied.
    ylabel : str, optional
        The label for the y-axis.
    ax : Axes, optional
        Existing axes instance.

    Returns
    -------
    Figure
       If `ax` is provided, the Figure instance attached to `ax`. Otherwise
       a new Figure instance.

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> import pandas as pd

    >>> dta = sm.datasets.elnino.load_pandas().data
    >>> dta['YEAR'] = dta.YEAR.astype(int).astype(str)
    >>> dta = dta.set_index('YEAR').T.unstack()
    >>> dates = pd.to_datetime(list(map(lambda x: '-'.join(x) + '-1',
    ...                                 dta.index.values)))
    >>> dta.index = pd.DatetimeIndex(dates, freq='MS')
    >>> fig = sm.graphics.tsa.month_plot(dta)

    .. plot:: plots/graphics_tsa_month_plot.py
    """

    if dates is None:
        from statsmodels.tools.data import _check_period_index

        _check_period_index(x, freq="M")
    else:
        x = pd.Series(x, index=pd.PeriodIndex(dates, freq="M"))

    # there's no zero month
    xticklabels = list(calendar.month_abbr)[1:]
    return seasonal_plot(
        x.groupby(lambda y: y.month), xticklabels, ylabel=ylabel, ax=ax
    )


def quarter_plot(x, dates=None, ylabel=None, ax=None):
    """
    Seasonal plot of quarterly data

    Parameters
    ----------
    x : array_like
        Seasonal data to plot. If dates is None, x must be a pandas object
        with a PeriodIndex or DatetimeIndex with a quarterly frequency.
    dates : array_like, optional
        If `x` is not a pandas object, then dates must be supplied.
    ylabel : str, optional
        The label for the y-axis.
    ax : matplotlib.axes, optional
        Existing axes instance.

    Returns
    -------
    Figure
       If `ax` is provided, the Figure instance attached to `ax`. Otherwise
       a new Figure instance.

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> import pandas as pd

    >>> dta = sm.datasets.elnino.load_pandas().data
    >>> dta['YEAR'] = dta.YEAR.astype(int).astype(str)
    >>> dta = dta.set_index('YEAR').T.unstack()
    >>> dates = pd.to_datetime(list(map(lambda x: '-'.join(x) + '-1',
    ...                                 dta.index.values)))
    >>> dta.index = dates.to_period('Q')
    >>> fig = sm.graphics.tsa.quarter_plot(dta)

    .. plot:: plots/graphics_tsa_quarter_plot.py
    """

    if dates is None:
        from statsmodels.tools.data import _check_period_index

        _check_period_index(x, freq="Q")
    else:
        x = pd.Series(x, index=pd.PeriodIndex(dates, freq="Q"))

    xticklabels = ["q1", "q2", "q3", "q4"]
    return seasonal_plot(
        x.groupby(lambda y: y.quarter), xticklabels, ylabel=ylabel, ax=ax
    )


def plot_predict(
    result,
    start=None,
    end=None,
    dynamic=False,
    alpha=0.05,
    ax=None,
    **predict_kwargs,
):
    """
    Plot predictions and, optionally, their confidence interval

    Parameters
    ----------
    result : Result
        Any model result supporting ``get_prediction``.
    start : int, str, or datetime, optional
        Zero-indexed observation number at which to start forecasting,
        i.e., the first forecast is start. Can also be a date string to
        parse or a datetime type. Default is the zeroth observation.
    end : int, str, or datetime, optional
        Zero-indexed observation number at which to end forecasting, i.e.,
        the last forecast is end. Can also be a date string to
        parse or a datetime type. However, if the dates index does not
        have a fixed frequency, end must be an integer index if you
        want out of sample prediction. Default is the last observation in
        the sample.
    dynamic : bool, int, str, or datetime, optional
        Integer offset relative to `start` at which to begin dynamic
        prediction. Can also be an absolute date string to parse or a
        datetime type (these are not interpreted as offsets).
        Prior to this observation, true endogenous values will be used for
        prediction; starting with this observation and continuing through
        the end of prediction, forecasted endogenous values will be used
        instead.
    alpha : {float, None}
        The tail probability not covered by the confidence interval. Must
        be in (0, 1). Confidence interval is constructed assuming normally
        distributed shocks. If None, figure will not show the confidence
        interval.
    ax : AxesSubplot
        matplotlib Axes instance to use
    **predict_kwargs
        Any additional keyword arguments to pass to ``result.get_prediction``.

    Returns
    -------
    Figure
        matplotlib Figure containing the prediction plot
    """
    from statsmodels.graphics.utils import _import_mpl, create_mpl_ax

    _ = _import_mpl()
    fig, ax = create_mpl_ax(ax)
    from statsmodels.tsa.base.prediction import PredictionResults

    # use predict so you set dates
    pred: PredictionResults = result.get_prediction(
        start=start, end=end, dynamic=dynamic, **predict_kwargs
    )
    mean = pred.predicted_mean
    if isinstance(mean, (pd.Series, pd.DataFrame)):
        # Pandas results carry their own index for the horizontal axis.
        x = mean.index
        mean.plot(ax=ax, label="forecast")
    else:
        x = np.arange(mean.shape[0])
        ax.plot(x, mean, label="forecast")

    if alpha is not None:
        label = f"{1-alpha:.0%} confidence interval"
        ci = pred.conf_int(alpha)
        conf_int = np.asarray(ci)

        ax.fill_between(
            x,
            conf_int[:, 0],
            conf_int[:, 1],
            color="gray",
            alpha=0.5,
            label=label,
        )

    ax.legend(loc="best")

    return fig