362 lines
11 KiB
Python
362 lines
11 KiB
Python
|
"""
|
||
|
Seasonal Decomposition by Moving Averages
|
||
|
"""
|
||
|
import numpy as np
|
||
|
import pandas as pd
|
||
|
from pandas.core.nanops import nanmean as pd_nanmean
|
||
|
|
||
|
from statsmodels.tools.validation import PandasWrapper, array_like
|
||
|
from statsmodels.tsa.stl._stl import STL
|
||
|
from statsmodels.tsa.filters.filtertools import convolution_filter
|
||
|
from statsmodels.tsa.stl.mstl import MSTL
|
||
|
from statsmodels.tsa.tsatools import freq_to_period
|
||
|
|
||
|
__all__ = [
|
||
|
"STL",
|
||
|
"seasonal_decompose",
|
||
|
"seasonal_mean",
|
||
|
"DecomposeResult",
|
||
|
"MSTL",
|
||
|
]
|
||
|
|
||
|
|
||
|
def _extrapolate_trend(trend, npoints):
|
||
|
"""
|
||
|
Replace nan values on trend's end-points with least-squares extrapolated
|
||
|
values with regression considering npoints closest defined points.
|
||
|
"""
|
||
|
front = next(
|
||
|
i for i, vals in enumerate(trend) if not np.any(np.isnan(vals))
|
||
|
)
|
||
|
back = (
|
||
|
trend.shape[0]
|
||
|
- 1
|
||
|
- next(
|
||
|
i
|
||
|
for i, vals in enumerate(trend[::-1])
|
||
|
if not np.any(np.isnan(vals))
|
||
|
)
|
||
|
)
|
||
|
front_last = min(front + npoints, back)
|
||
|
back_first = max(front, back - npoints)
|
||
|
|
||
|
k, n = np.linalg.lstsq(
|
||
|
np.c_[np.arange(front, front_last), np.ones(front_last - front)],
|
||
|
trend[front:front_last],
|
||
|
rcond=-1,
|
||
|
)[0]
|
||
|
extra = (np.arange(0, front) * np.c_[k] + np.c_[n]).T
|
||
|
if trend.ndim == 1:
|
||
|
extra = extra.squeeze()
|
||
|
trend[:front] = extra
|
||
|
|
||
|
k, n = np.linalg.lstsq(
|
||
|
np.c_[np.arange(back_first, back), np.ones(back - back_first)],
|
||
|
trend[back_first:back],
|
||
|
rcond=-1,
|
||
|
)[0]
|
||
|
extra = (np.arange(back + 1, trend.shape[0]) * np.c_[k] + np.c_[n]).T
|
||
|
if trend.ndim == 1:
|
||
|
extra = extra.squeeze()
|
||
|
trend[back + 1 :] = extra
|
||
|
|
||
|
return trend
|
||
|
|
||
|
|
||
|
def seasonal_mean(x, period):
|
||
|
"""
|
||
|
Return means for each period in x. period is an int that gives the
|
||
|
number of periods per cycle. E.g., 12 for monthly. NaNs are ignored
|
||
|
in the mean.
|
||
|
"""
|
||
|
return np.array([pd_nanmean(x[i::period], axis=0) for i in range(period)])
|
||
|
|
||
|
|
||
|
def seasonal_decompose(
|
||
|
x,
|
||
|
model="additive",
|
||
|
filt=None,
|
||
|
period=None,
|
||
|
two_sided=True,
|
||
|
extrapolate_trend=0,
|
||
|
):
|
||
|
"""
|
||
|
Seasonal decomposition using moving averages.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
x : array_like
|
||
|
Time series. If 2d, individual series are in columns. x must contain 2
|
||
|
complete cycles.
|
||
|
model : {"additive", "multiplicative"}, optional
|
||
|
Type of seasonal component. Abbreviations are accepted.
|
||
|
filt : array_like, optional
|
||
|
The filter coefficients for filtering out the seasonal component.
|
||
|
The concrete moving average method used in filtering is determined by
|
||
|
two_sided.
|
||
|
period : int, optional
|
||
|
Period of the series (e.g., 1 for annual, 4 for quarterly, etc). Must
|
||
|
be used if x is not a pandas object or if the index of x does not have
|
||
|
a frequency. Overrides default periodicity of x if x is a pandas
|
||
|
object with a timeseries index.
|
||
|
two_sided : bool, optional
|
||
|
The moving average method used in filtering.
|
||
|
If True (default), a centered moving average is computed using the
|
||
|
filt. If False, the filter coefficients are for past values only.
|
||
|
extrapolate_trend : int or 'freq', optional
|
||
|
If set to > 0, the trend resulting from the convolution is
|
||
|
linear least-squares extrapolated on both ends (or the single one
|
||
|
if two_sided is False) considering this many (+1) closest points.
|
||
|
If set to 'freq', use `freq` closest points. Setting this parameter
|
||
|
results in no NaN values in trend or resid components.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
DecomposeResult
|
||
|
A object with seasonal, trend, and resid attributes.
|
||
|
|
||
|
See Also
|
||
|
--------
|
||
|
statsmodels.tsa.filters.bk_filter.bkfilter
|
||
|
Baxter-King filter.
|
||
|
statsmodels.tsa.filters.cf_filter.cffilter
|
||
|
Christiano-Fitzgerald asymmetric, random walk filter.
|
||
|
statsmodels.tsa.filters.hp_filter.hpfilter
|
||
|
Hodrick-Prescott filter.
|
||
|
statsmodels.tsa.filters.convolution_filter
|
||
|
Linear filtering via convolution.
|
||
|
statsmodels.tsa.seasonal.STL
|
||
|
Season-Trend decomposition using LOESS.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
This is a naive decomposition. More sophisticated methods should
|
||
|
be preferred.
|
||
|
|
||
|
The additive model is Y[t] = T[t] + S[t] + e[t]
|
||
|
|
||
|
The multiplicative model is Y[t] = T[t] * S[t] * e[t]
|
||
|
|
||
|
The results are obtained by first estimating the trend by applying
|
||
|
a convolution filter to the data. The trend is then removed from the
|
||
|
series and the average of this de-trended series for each period is
|
||
|
the returned seasonal component.
|
||
|
"""
|
||
|
pfreq = period
|
||
|
pw = PandasWrapper(x)
|
||
|
if period is None:
|
||
|
pfreq = getattr(getattr(x, "index", None), "inferred_freq", None)
|
||
|
|
||
|
x = array_like(x, "x", maxdim=2)
|
||
|
nobs = len(x)
|
||
|
|
||
|
if not np.all(np.isfinite(x)):
|
||
|
raise ValueError("This function does not handle missing values")
|
||
|
if model.startswith("m"):
|
||
|
if np.any(x <= 0):
|
||
|
raise ValueError(
|
||
|
"Multiplicative seasonality is not appropriate "
|
||
|
"for zero and negative values"
|
||
|
)
|
||
|
|
||
|
if period is None:
|
||
|
if pfreq is not None:
|
||
|
pfreq = freq_to_period(pfreq)
|
||
|
period = pfreq
|
||
|
else:
|
||
|
raise ValueError(
|
||
|
"You must specify a period or x must be a pandas object with "
|
||
|
"a PeriodIndex or a DatetimeIndex with a freq not set to None"
|
||
|
)
|
||
|
if x.shape[0] < 2 * pfreq:
|
||
|
raise ValueError(
|
||
|
f"x must have 2 complete cycles requires {2 * pfreq} "
|
||
|
f"observations. x only has {x.shape[0]} observation(s)"
|
||
|
)
|
||
|
|
||
|
if filt is None:
|
||
|
if period % 2 == 0: # split weights at ends
|
||
|
filt = np.array([0.5] + [1] * (period - 1) + [0.5]) / period
|
||
|
else:
|
||
|
filt = np.repeat(1.0 / period, period)
|
||
|
|
||
|
nsides = int(two_sided) + 1
|
||
|
trend = convolution_filter(x, filt, nsides)
|
||
|
|
||
|
if extrapolate_trend == "freq":
|
||
|
extrapolate_trend = period - 1
|
||
|
|
||
|
if extrapolate_trend > 0:
|
||
|
trend = _extrapolate_trend(trend, extrapolate_trend + 1)
|
||
|
|
||
|
if model.startswith("m"):
|
||
|
detrended = x / trend
|
||
|
else:
|
||
|
detrended = x - trend
|
||
|
|
||
|
period_averages = seasonal_mean(detrended, period)
|
||
|
|
||
|
if model.startswith("m"):
|
||
|
period_averages /= np.mean(period_averages, axis=0)
|
||
|
else:
|
||
|
period_averages -= np.mean(period_averages, axis=0)
|
||
|
|
||
|
seasonal = np.tile(period_averages.T, nobs // period + 1).T[:nobs]
|
||
|
|
||
|
if model.startswith("m"):
|
||
|
resid = x / seasonal / trend
|
||
|
else:
|
||
|
resid = detrended - seasonal
|
||
|
|
||
|
results = []
|
||
|
for s, name in zip(
|
||
|
(seasonal, trend, resid, x), ("seasonal", "trend", "resid", None)
|
||
|
):
|
||
|
results.append(pw.wrap(s.squeeze(), columns=name))
|
||
|
return DecomposeResult(
|
||
|
seasonal=results[0],
|
||
|
trend=results[1],
|
||
|
resid=results[2],
|
||
|
observed=results[3],
|
||
|
)
|
||
|
|
||
|
|
||
|
class DecomposeResult:
|
||
|
"""
|
||
|
Results class for seasonal decompositions
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
observed : array_like
|
||
|
The data series that has been decomposed.
|
||
|
seasonal : array_like
|
||
|
The seasonal component of the data series.
|
||
|
trend : array_like
|
||
|
The trend component of the data series.
|
||
|
resid : array_like
|
||
|
The residual component of the data series.
|
||
|
weights : array_like, optional
|
||
|
The weights used to reduce outlier influence.
|
||
|
"""
|
||
|
|
||
|
def __init__(self, observed, seasonal, trend, resid, weights=None):
|
||
|
self._seasonal = seasonal
|
||
|
self._trend = trend
|
||
|
if weights is None:
|
||
|
weights = np.ones_like(observed)
|
||
|
if isinstance(observed, pd.Series):
|
||
|
weights = pd.Series(
|
||
|
weights, index=observed.index, name="weights"
|
||
|
)
|
||
|
self._weights = weights
|
||
|
self._resid = resid
|
||
|
self._observed = observed
|
||
|
|
||
|
@property
|
||
|
def observed(self):
|
||
|
"""Observed data"""
|
||
|
return self._observed
|
||
|
|
||
|
@property
|
||
|
def seasonal(self):
|
||
|
"""The estimated seasonal component"""
|
||
|
return self._seasonal
|
||
|
|
||
|
@property
|
||
|
def trend(self):
|
||
|
"""The estimated trend component"""
|
||
|
return self._trend
|
||
|
|
||
|
@property
|
||
|
def resid(self):
|
||
|
"""The estimated residuals"""
|
||
|
return self._resid
|
||
|
|
||
|
@property
|
||
|
def weights(self):
|
||
|
"""The weights used in the robust estimation"""
|
||
|
return self._weights
|
||
|
|
||
|
@property
|
||
|
def nobs(self):
|
||
|
"""Number of observations"""
|
||
|
return self._observed.shape
|
||
|
|
||
|
def plot(
|
||
|
self,
|
||
|
observed=True,
|
||
|
seasonal=True,
|
||
|
trend=True,
|
||
|
resid=True,
|
||
|
weights=False,
|
||
|
):
|
||
|
"""
|
||
|
Plot estimated components
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
observed : bool
|
||
|
Include the observed series in the plot
|
||
|
seasonal : bool
|
||
|
Include the seasonal component in the plot
|
||
|
trend : bool
|
||
|
Include the trend component in the plot
|
||
|
resid : bool
|
||
|
Include the residual in the plot
|
||
|
weights : bool
|
||
|
Include the weights in the plot (if any)
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
matplotlib.figure.Figure
|
||
|
The figure instance that containing the plot.
|
||
|
"""
|
||
|
from pandas.plotting import register_matplotlib_converters
|
||
|
|
||
|
from statsmodels.graphics.utils import _import_mpl
|
||
|
|
||
|
plt = _import_mpl()
|
||
|
register_matplotlib_converters()
|
||
|
series = [(self._observed, "Observed")] if observed else []
|
||
|
series += [(self.trend, "trend")] if trend else []
|
||
|
|
||
|
if self.seasonal.ndim == 1:
|
||
|
series += [(self.seasonal, "seasonal")] if seasonal else []
|
||
|
elif self.seasonal.ndim > 1:
|
||
|
if isinstance(self.seasonal, pd.DataFrame):
|
||
|
for col in self.seasonal.columns:
|
||
|
series += (
|
||
|
[(self.seasonal[col], "seasonal")] if seasonal else []
|
||
|
)
|
||
|
else:
|
||
|
for i in range(self.seasonal.shape[1]):
|
||
|
series += (
|
||
|
[(self.seasonal[:, i], "seasonal")] if seasonal else []
|
||
|
)
|
||
|
|
||
|
series += [(self.resid, "residual")] if resid else []
|
||
|
series += [(self.weights, "weights")] if weights else []
|
||
|
|
||
|
if isinstance(self._observed, (pd.DataFrame, pd.Series)):
|
||
|
nobs = self._observed.shape[0]
|
||
|
xlim = self._observed.index[0], self._observed.index[nobs - 1]
|
||
|
else:
|
||
|
xlim = (0, self._observed.shape[0] - 1)
|
||
|
|
||
|
fig, axs = plt.subplots(len(series), 1, sharex=True)
|
||
|
for i, (ax, (series, def_name)) in enumerate(zip(axs, series)):
|
||
|
if def_name != "residual":
|
||
|
ax.plot(series)
|
||
|
else:
|
||
|
ax.plot(series, marker="o", linestyle="none")
|
||
|
ax.plot(xlim, (0, 0), color="#000000", zorder=-3)
|
||
|
name = getattr(series, "name", def_name)
|
||
|
if def_name != "Observed":
|
||
|
name = name.capitalize()
|
||
|
title = ax.set_title if i == 0 and observed else ax.set_ylabel
|
||
|
title(name)
|
||
|
ax.set_xlim(xlim)
|
||
|
|
||
|
fig.tight_layout()
|
||
|
return fig
|