677 lines
24 KiB
Python
677 lines
24 KiB
Python
"""
|
|
Base tools for handling various kinds of data structures, attaching metadata to
|
|
results, and doing data cleaning
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from statsmodels.compat.python import lmap
|
|
|
|
from functools import reduce
|
|
|
|
import numpy as np
|
|
from pandas import DataFrame, Series, isnull, MultiIndex
|
|
|
|
import statsmodels.tools.data as data_util
|
|
from statsmodels.tools.decorators import cache_readonly, cache_writable
|
|
from statsmodels.tools.sm_exceptions import MissingDataError
|
|
|
|
|
|
def _asarray_2dcolumns(x):
|
|
if np.asarray(x).ndim > 1 and np.asarray(x).squeeze().ndim == 1:
|
|
return
|
|
|
|
|
|
def _asarray_2d_null_rows(x):
|
|
"""
|
|
Makes sure input is an array and is 2d. Makes sure output is 2d. True
|
|
indicates a null in the rows of 2d x.
|
|
"""
|
|
#Have to have the asarrays because isnull does not account for array_like
|
|
#input
|
|
x = np.asarray(x)
|
|
if x.ndim == 1:
|
|
x = x[:, None]
|
|
return np.any(isnull(x), axis=1)[:, None]
|
|
|
|
|
|
def _nan_rows(*arrs):
|
|
"""
|
|
Returns a boolean array which is True where any of the rows in any
|
|
of the _2d_ arrays in arrs are NaNs. Inputs can be any mixture of Series,
|
|
DataFrames or array_like.
|
|
"""
|
|
if len(arrs) == 1:
|
|
arrs += ([[False]],)
|
|
|
|
def _nan_row_maybe_two_inputs(x, y):
|
|
# check for dtype bc dataframe has dtypes
|
|
x_is_boolean_array = hasattr(x, 'dtype') and x.dtype == bool and x
|
|
return np.logical_or(_asarray_2d_null_rows(x),
|
|
(x_is_boolean_array | _asarray_2d_null_rows(y)))
|
|
return reduce(_nan_row_maybe_two_inputs, arrs).squeeze()
|
|
|
|
|
|
class ModelData:
|
|
"""
|
|
Class responsible for handling input data and extracting metadata into the
|
|
appropriate form
|
|
"""
|
|
_param_names = None
|
|
_cov_names = None
|
|
|
|
def __init__(self, endog, exog=None, missing='none', hasconst=None,
|
|
**kwargs):
|
|
if data_util._is_recarray(endog) or data_util._is_recarray(exog):
|
|
from statsmodels.tools.sm_exceptions import recarray_exception
|
|
raise NotImplementedError(recarray_exception)
|
|
if 'design_info' in kwargs:
|
|
self.design_info = kwargs.pop('design_info')
|
|
if 'formula' in kwargs:
|
|
self.formula = kwargs.pop('formula')
|
|
if missing != 'none':
|
|
arrays, nan_idx = self.handle_missing(endog, exog, missing,
|
|
**kwargs)
|
|
self.missing_row_idx = nan_idx
|
|
self.__dict__.update(arrays) # attach all the data arrays
|
|
self.orig_endog = self.endog
|
|
self.orig_exog = self.exog
|
|
self.endog, self.exog = self._convert_endog_exog(self.endog,
|
|
self.exog)
|
|
else:
|
|
self.__dict__.update(kwargs) # attach the extra arrays anyway
|
|
self.orig_endog = endog
|
|
self.orig_exog = exog
|
|
self.endog, self.exog = self._convert_endog_exog(endog, exog)
|
|
|
|
self.const_idx = None
|
|
self.k_constant = 0
|
|
self._handle_constant(hasconst)
|
|
self._check_integrity()
|
|
self._cache = {}
|
|
|
|
def __getstate__(self):
|
|
from copy import copy
|
|
d = copy(self.__dict__)
|
|
if "design_info" in d:
|
|
del d["design_info"]
|
|
d["restore_design_info"] = True
|
|
return d
|
|
|
|
def __setstate__(self, d):
|
|
if "restore_design_info" in d:
|
|
# NOTE: there may be a more performant way to do this
|
|
from patsy import dmatrices, PatsyError
|
|
exc = []
|
|
try:
|
|
data = d['frame']
|
|
except KeyError:
|
|
data = d['orig_endog'].join(d['orig_exog'])
|
|
|
|
for depth in [2, 3, 1, 0, 4]: # sequence is a guess where to likely find it
|
|
try:
|
|
_, design = dmatrices(d['formula'], data, eval_env=depth,
|
|
return_type='dataframe')
|
|
break
|
|
except (NameError, PatsyError) as e:
|
|
exc.append(e) # why do I need a reference from outside except block
|
|
pass
|
|
else:
|
|
raise exc[-1]
|
|
|
|
self.design_info = design.design_info
|
|
del d["restore_design_info"]
|
|
self.__dict__.update(d)
|
|
|
|
def _handle_constant(self, hasconst):
|
|
if hasconst is False or self.exog is None:
|
|
self.k_constant = 0
|
|
self.const_idx = None
|
|
else:
|
|
# detect where the constant is
|
|
check_implicit = False
|
|
exog_max = np.max(self.exog, axis=0)
|
|
if not np.isfinite(exog_max).all():
|
|
raise MissingDataError('exog contains inf or nans')
|
|
exog_min = np.min(self.exog, axis=0)
|
|
const_idx = np.where(exog_max == exog_min)[0].squeeze()
|
|
self.k_constant = const_idx.size
|
|
|
|
if self.k_constant == 1:
|
|
if self.exog[:, const_idx].mean() != 0:
|
|
self.const_idx = int(const_idx)
|
|
else:
|
|
# we only have a zero column and no other constant
|
|
check_implicit = True
|
|
elif self.k_constant > 1:
|
|
# we have more than one constant column
|
|
# look for ones
|
|
values = [] # keep values if we need != 0
|
|
for idx in const_idx:
|
|
value = self.exog[:, idx].mean()
|
|
if value == 1:
|
|
self.k_constant = 1
|
|
self.const_idx = int(idx)
|
|
break
|
|
values.append(value)
|
|
else:
|
|
# we did not break, no column of ones
|
|
pos = (np.array(values) != 0)
|
|
if pos.any():
|
|
# take the first nonzero column
|
|
self.k_constant = 1
|
|
self.const_idx = int(const_idx[pos.argmax()])
|
|
else:
|
|
# only zero columns
|
|
check_implicit = True
|
|
elif self.k_constant == 0:
|
|
check_implicit = True
|
|
else:
|
|
# should not be here
|
|
pass
|
|
|
|
if check_implicit and not hasconst:
|
|
# look for implicit constant
|
|
# Compute rank of augmented matrix
|
|
augmented_exog = np.column_stack(
|
|
(np.ones(self.exog.shape[0]), self.exog))
|
|
rank_augm = np.linalg.matrix_rank(augmented_exog)
|
|
rank_orig = np.linalg.matrix_rank(self.exog)
|
|
self.k_constant = int(rank_orig == rank_augm)
|
|
self.const_idx = None
|
|
elif hasconst:
|
|
# Ensure k_constant is 1 any time hasconst is True
|
|
# even if one is not found
|
|
self.k_constant = 1
|
|
|
|
@classmethod
|
|
def _drop_nans(cls, x, nan_mask):
|
|
return x[nan_mask]
|
|
|
|
@classmethod
|
|
def _drop_nans_2d(cls, x, nan_mask):
|
|
return x[nan_mask][:, nan_mask]
|
|
|
|
@classmethod
|
|
def handle_missing(cls, endog, exog, missing, **kwargs):
|
|
"""
|
|
This returns a dictionary with keys endog, exog and the keys of
|
|
kwargs. It preserves Nones.
|
|
"""
|
|
none_array_names = []
|
|
|
|
# patsy's already dropped NaNs in y/X
|
|
missing_idx = kwargs.pop('missing_idx', None)
|
|
|
|
if missing_idx is not None:
|
|
# y, X already handled by patsy. add back in later.
|
|
combined = ()
|
|
combined_names = []
|
|
if exog is None:
|
|
none_array_names += ['exog']
|
|
elif exog is not None:
|
|
combined = (endog, exog)
|
|
combined_names = ['endog', 'exog']
|
|
else:
|
|
combined = (endog,)
|
|
combined_names = ['endog']
|
|
none_array_names += ['exog']
|
|
|
|
# deal with other arrays
|
|
combined_2d = ()
|
|
combined_2d_names = []
|
|
if len(kwargs):
|
|
for key, value_array in kwargs.items():
|
|
if value_array is None or np.ndim(value_array) == 0:
|
|
none_array_names += [key]
|
|
continue
|
|
# grab 1d arrays
|
|
if value_array.ndim == 1:
|
|
combined += (np.asarray(value_array),)
|
|
combined_names += [key]
|
|
elif value_array.squeeze().ndim == 1:
|
|
combined += (np.asarray(value_array),)
|
|
combined_names += [key]
|
|
|
|
# grab 2d arrays that are _assumed_ to be symmetric
|
|
elif value_array.ndim == 2:
|
|
combined_2d += (np.asarray(value_array),)
|
|
combined_2d_names += [key]
|
|
else:
|
|
raise ValueError("Arrays with more than 2 dimensions "
|
|
"are not yet handled")
|
|
|
|
if missing_idx is not None:
|
|
nan_mask = missing_idx
|
|
updated_row_mask = None
|
|
if combined: # there were extra arrays not handled by patsy
|
|
combined_nans = _nan_rows(*combined)
|
|
if combined_nans.shape[0] != nan_mask.shape[0]:
|
|
raise ValueError("Shape mismatch between endog/exog "
|
|
"and extra arrays given to model.")
|
|
# for going back and updated endog/exog
|
|
updated_row_mask = combined_nans[~nan_mask]
|
|
nan_mask |= combined_nans # for updating extra arrays only
|
|
if combined_2d:
|
|
combined_2d_nans = _nan_rows(combined_2d)
|
|
if combined_2d_nans.shape[0] != nan_mask.shape[0]:
|
|
raise ValueError("Shape mismatch between endog/exog "
|
|
"and extra 2d arrays given to model.")
|
|
if updated_row_mask is not None:
|
|
updated_row_mask |= combined_2d_nans[~nan_mask]
|
|
else:
|
|
updated_row_mask = combined_2d_nans[~nan_mask]
|
|
nan_mask |= combined_2d_nans
|
|
|
|
else:
|
|
nan_mask = _nan_rows(*combined)
|
|
if combined_2d:
|
|
nan_mask = _nan_rows(*(nan_mask[:, None],) + combined_2d)
|
|
|
|
if not np.any(nan_mask): # no missing do not do anything
|
|
combined = dict(zip(combined_names, combined))
|
|
if combined_2d:
|
|
combined.update(dict(zip(combined_2d_names, combined_2d)))
|
|
if none_array_names:
|
|
combined.update({k: kwargs.get(k, None)
|
|
for k in none_array_names})
|
|
|
|
if missing_idx is not None:
|
|
combined.update({'endog': endog})
|
|
if exog is not None:
|
|
combined.update({'exog': exog})
|
|
|
|
return combined, []
|
|
|
|
elif missing == 'raise':
|
|
raise MissingDataError("NaNs were encountered in the data")
|
|
|
|
elif missing == 'drop':
|
|
nan_mask = ~nan_mask
|
|
drop_nans = lambda x: cls._drop_nans(x, nan_mask)
|
|
drop_nans_2d = lambda x: cls._drop_nans_2d(x, nan_mask)
|
|
combined = dict(zip(combined_names, lmap(drop_nans, combined)))
|
|
|
|
if missing_idx is not None:
|
|
if updated_row_mask is not None:
|
|
updated_row_mask = ~updated_row_mask
|
|
# update endog/exog with this new information
|
|
endog = cls._drop_nans(endog, updated_row_mask)
|
|
if exog is not None:
|
|
exog = cls._drop_nans(exog, updated_row_mask)
|
|
|
|
combined.update({'endog': endog})
|
|
if exog is not None:
|
|
combined.update({'exog': exog})
|
|
|
|
if combined_2d:
|
|
combined.update(dict(zip(combined_2d_names,
|
|
lmap(drop_nans_2d, combined_2d))))
|
|
if none_array_names:
|
|
combined.update({k: kwargs.get(k, None)
|
|
for k in none_array_names})
|
|
|
|
return combined, np.where(~nan_mask)[0].tolist()
|
|
else:
|
|
raise ValueError("missing option %s not understood" % missing)
|
|
|
|
def _convert_endog_exog(self, endog, exog):
|
|
|
|
# for consistent outputs if endog is (n,1)
|
|
yarr = self._get_yarr(endog)
|
|
xarr = None
|
|
if exog is not None:
|
|
xarr = self._get_xarr(exog)
|
|
if xarr.ndim == 1:
|
|
xarr = xarr[:, None]
|
|
if xarr.ndim != 2:
|
|
raise ValueError("exog is not 1d or 2d")
|
|
|
|
return yarr, xarr
|
|
|
|
@cache_writable()
|
|
def ynames(self):
|
|
endog = self.orig_endog
|
|
ynames = self._get_names(endog)
|
|
if not ynames:
|
|
ynames = _make_endog_names(self.endog)
|
|
|
|
if len(ynames) == 1:
|
|
return ynames[0]
|
|
else:
|
|
return list(ynames)
|
|
|
|
@cache_writable()
|
|
def xnames(self) -> list[str] | None:
|
|
exog = self.orig_exog
|
|
if exog is not None:
|
|
xnames = self._get_names(exog)
|
|
if not xnames:
|
|
xnames = _make_exog_names(self.exog)
|
|
return list(xnames)
|
|
return None
|
|
|
|
@property
|
|
def param_names(self):
|
|
# for handling names of 'extra' parameters in summary, etc.
|
|
return self._param_names or self.xnames
|
|
|
|
@param_names.setter
|
|
def param_names(self, values):
|
|
self._param_names = values
|
|
|
|
@property
|
|
def cov_names(self):
|
|
"""
|
|
Labels for covariance matrices
|
|
|
|
In multidimensional models, each dimension of a covariance matrix
|
|
differs from the number of param_names.
|
|
|
|
If not set, returns param_names
|
|
"""
|
|
# for handling names of covariance names in multidimensional models
|
|
if self._cov_names is not None:
|
|
return self._cov_names
|
|
return self.param_names
|
|
|
|
@cov_names.setter
|
|
def cov_names(self, value):
|
|
# for handling names of covariance names in multidimensional models
|
|
self._cov_names = value
|
|
|
|
@cache_readonly
|
|
def row_labels(self):
|
|
exog = self.orig_exog
|
|
if exog is not None:
|
|
row_labels = self._get_row_labels(exog)
|
|
else:
|
|
endog = self.orig_endog
|
|
row_labels = self._get_row_labels(endog)
|
|
return row_labels
|
|
|
|
def _get_row_labels(self, arr):
|
|
return None
|
|
|
|
def _get_names(self, arr):
|
|
if isinstance(arr, DataFrame):
|
|
if isinstance(arr.columns, MultiIndex):
|
|
# Flatten MultiIndexes into "simple" column names
|
|
return ['_'.join(level for level in c if level)
|
|
for c in arr.columns]
|
|
else:
|
|
return list(arr.columns)
|
|
elif isinstance(arr, Series):
|
|
if arr.name:
|
|
return [arr.name]
|
|
else:
|
|
return
|
|
else:
|
|
try:
|
|
return arr.dtype.names
|
|
except AttributeError:
|
|
pass
|
|
|
|
return None
|
|
|
|
def _get_yarr(self, endog):
|
|
if data_util._is_structured_ndarray(endog):
|
|
endog = data_util.struct_to_ndarray(endog)
|
|
endog = np.asarray(endog)
|
|
if len(endog) == 1: # never squeeze to a scalar
|
|
if endog.ndim == 1:
|
|
return endog
|
|
elif endog.ndim > 1:
|
|
return np.asarray([endog.squeeze()])
|
|
|
|
return endog.squeeze()
|
|
|
|
def _get_xarr(self, exog):
|
|
if data_util._is_structured_ndarray(exog):
|
|
exog = data_util.struct_to_ndarray(exog)
|
|
return np.asarray(exog)
|
|
|
|
def _check_integrity(self):
|
|
if self.exog is not None:
|
|
if len(self.exog) != len(self.endog):
|
|
raise ValueError("endog and exog matrices are different sizes")
|
|
|
|
def wrap_output(self, obj, how='columns', names=None):
|
|
if how == 'columns':
|
|
return self.attach_columns(obj)
|
|
elif how == 'rows':
|
|
return self.attach_rows(obj)
|
|
elif how == 'cov':
|
|
return self.attach_cov(obj)
|
|
elif how == 'dates':
|
|
return self.attach_dates(obj)
|
|
elif how == 'columns_eq':
|
|
return self.attach_columns_eq(obj)
|
|
elif how == 'cov_eq':
|
|
return self.attach_cov_eq(obj)
|
|
elif how == 'generic_columns':
|
|
return self.attach_generic_columns(obj, names)
|
|
elif how == 'generic_columns_2d':
|
|
return self.attach_generic_columns_2d(obj, names)
|
|
elif how == 'ynames':
|
|
return self.attach_ynames(obj)
|
|
elif how == 'multivariate_confint':
|
|
return self.attach_mv_confint(obj)
|
|
else:
|
|
return obj
|
|
|
|
def attach_columns(self, result):
|
|
return result
|
|
|
|
def attach_columns_eq(self, result):
|
|
return result
|
|
|
|
def attach_cov(self, result):
|
|
return result
|
|
|
|
def attach_cov_eq(self, result):
|
|
return result
|
|
|
|
def attach_rows(self, result):
|
|
return result
|
|
|
|
def attach_dates(self, result):
|
|
return result
|
|
|
|
def attach_mv_confint(self, result):
|
|
return result
|
|
|
|
def attach_generic_columns(self, result, *args, **kwargs):
|
|
return result
|
|
|
|
def attach_generic_columns_2d(self, result, *args, **kwargs):
|
|
return result
|
|
|
|
def attach_ynames(self, result):
|
|
return result
|
|
|
|
|
|
class PatsyData(ModelData):
|
|
def _get_names(self, arr):
|
|
return arr.design_info.column_names
|
|
|
|
|
|
class PandasData(ModelData):
|
|
"""
|
|
Data handling class which knows how to reattach pandas metadata to model
|
|
results
|
|
"""
|
|
|
|
def _convert_endog_exog(self, endog, exog=None):
|
|
#TODO: remove this when we handle dtype systematically
|
|
endog = np.asarray(endog)
|
|
exog = exog if exog is None else np.asarray(exog)
|
|
if endog.dtype == object or exog is not None and exog.dtype == object:
|
|
raise ValueError("Pandas data cast to numpy dtype of object. "
|
|
"Check input data with np.asarray(data).")
|
|
return super()._convert_endog_exog(endog, exog)
|
|
|
|
@classmethod
|
|
def _drop_nans(cls, x, nan_mask):
|
|
if isinstance(x, (Series, DataFrame)):
|
|
return x.loc[nan_mask]
|
|
else: # extra arguments could be plain ndarrays
|
|
return super()._drop_nans(x, nan_mask)
|
|
|
|
@classmethod
|
|
def _drop_nans_2d(cls, x, nan_mask):
|
|
if isinstance(x, (Series, DataFrame)):
|
|
return x.loc[nan_mask].loc[:, nan_mask]
|
|
else: # extra arguments could be plain ndarrays
|
|
return super()._drop_nans_2d(x, nan_mask)
|
|
|
|
def _check_integrity(self):
|
|
endog, exog = self.orig_endog, self.orig_exog
|
|
# exog can be None and we could be upcasting one or the other
|
|
if (exog is not None and
|
|
(hasattr(endog, 'index') and hasattr(exog, 'index')) and
|
|
not self.orig_endog.index.equals(self.orig_exog.index)):
|
|
raise ValueError("The indices for endog and exog are not aligned")
|
|
super()._check_integrity()
|
|
|
|
def _get_row_labels(self, arr):
|
|
try:
|
|
return arr.index
|
|
except AttributeError:
|
|
# if we've gotten here it's because endog is pandas and
|
|
# exog is not, so just return the row labels from endog
|
|
return self.orig_endog.index
|
|
|
|
def attach_generic_columns(self, result, names):
|
|
# get the attribute to use
|
|
column_names = getattr(self, names, None)
|
|
return Series(result, index=column_names)
|
|
|
|
def attach_generic_columns_2d(self, result, rownames, colnames=None):
|
|
colnames = colnames or rownames
|
|
rownames = getattr(self, rownames, None)
|
|
colnames = getattr(self, colnames, None)
|
|
return DataFrame(result, index=rownames, columns=colnames)
|
|
|
|
def attach_columns(self, result):
|
|
# this can either be a 1d array or a scalar
|
|
# do not squeeze because it might be a 2d row array
|
|
# if it needs a squeeze, the bug is elsewhere
|
|
if result.ndim <= 1:
|
|
return Series(result, index=self.param_names)
|
|
else: # for e.g., confidence intervals
|
|
return DataFrame(result, index=self.param_names)
|
|
|
|
def attach_columns_eq(self, result):
|
|
return DataFrame(result, index=self.xnames, columns=self.ynames)
|
|
|
|
def attach_cov(self, result):
|
|
return DataFrame(result, index=self.cov_names, columns=self.cov_names)
|
|
|
|
def attach_cov_eq(self, result):
|
|
return DataFrame(result, index=self.ynames, columns=self.ynames)
|
|
|
|
def attach_rows(self, result):
|
|
# assumes if len(row_labels) > len(result) it's bc it was truncated
|
|
# at the front, for AR lags, for example
|
|
squeezed = result.squeeze()
|
|
k_endog = np.array(self.ynames, ndmin=1).shape[0]
|
|
if k_endog > 1 and squeezed.shape == (k_endog,):
|
|
squeezed = squeezed[None, :]
|
|
# May be zero-dim, for example in the case of forecast one step in tsa
|
|
if squeezed.ndim < 2:
|
|
out = Series(squeezed)
|
|
else:
|
|
out = DataFrame(result)
|
|
out.columns = self.ynames
|
|
out.index = self.row_labels[-len(result):]
|
|
return out
|
|
|
|
def attach_dates(self, result):
|
|
squeezed = result.squeeze()
|
|
k_endog = np.array(self.ynames, ndmin=1).shape[0]
|
|
if k_endog > 1 and squeezed.shape == (k_endog,):
|
|
squeezed = np.asarray(squeezed)[None, :]
|
|
# May be zero-dim, for example in the case of forecast one step in tsa
|
|
if squeezed.ndim < 2:
|
|
return Series(squeezed, index=self.predict_dates)
|
|
else:
|
|
return DataFrame(np.asarray(result),
|
|
index=self.predict_dates,
|
|
columns=self.ynames)
|
|
|
|
def attach_mv_confint(self, result):
|
|
return DataFrame(result.reshape((-1, 2)),
|
|
index=self.cov_names,
|
|
columns=['lower', 'upper'])
|
|
|
|
def attach_ynames(self, result):
|
|
squeezed = result.squeeze()
|
|
# May be zero-dim, for example in the case of forecast one step in tsa
|
|
if squeezed.ndim < 2:
|
|
return Series(squeezed, name=self.ynames)
|
|
else:
|
|
return DataFrame(result, columns=self.ynames)
|
|
|
|
|
|
def _make_endog_names(endog):
|
|
if endog.ndim == 1 or endog.shape[1] == 1:
|
|
ynames = ['y']
|
|
else: # for VAR
|
|
ynames = ['y%d' % (i+1) for i in range(endog.shape[1])]
|
|
|
|
return ynames
|
|
|
|
|
|
def _make_exog_names(exog):
|
|
exog_var = exog.var(0)
|
|
if (exog_var == 0).any():
|
|
# assumes one constant in first or last position
|
|
# avoid exception if more than one constant
|
|
const_idx = exog_var.argmin()
|
|
exog_names = ['x%d' % i for i in range(1, exog.shape[1])]
|
|
exog_names.insert(const_idx, 'const')
|
|
else:
|
|
exog_names = ['x%d' % i for i in range(1, exog.shape[1]+1)]
|
|
|
|
return exog_names
|
|
|
|
|
|
def handle_missing(endog, exog=None, missing='none', **kwargs):
|
|
klass = handle_data_class_factory(endog, exog)
|
|
if missing == 'none':
|
|
ret_dict = dict(endog=endog, exog=exog)
|
|
ret_dict.update(kwargs)
|
|
return ret_dict, None
|
|
return klass.handle_missing(endog, exog, missing=missing, **kwargs)
|
|
|
|
|
|
def handle_data_class_factory(endog, exog):
|
|
"""
|
|
Given inputs
|
|
"""
|
|
if data_util._is_using_ndarray_type(endog, exog):
|
|
klass = ModelData
|
|
elif data_util._is_using_pandas(endog, exog):
|
|
klass = PandasData
|
|
elif data_util._is_using_patsy(endog, exog):
|
|
klass = PatsyData
|
|
# keep this check last
|
|
elif data_util._is_using_ndarray(endog, exog):
|
|
klass = ModelData
|
|
else:
|
|
raise ValueError('unrecognized data structures: %s / %s' %
|
|
(type(endog), type(exog)))
|
|
return klass
|
|
|
|
|
|
def handle_data(endog, exog, missing='none', hasconst=None, **kwargs):
|
|
# deal with lists and tuples up-front
|
|
if isinstance(endog, (list, tuple)):
|
|
endog = np.asarray(endog)
|
|
if isinstance(exog, (list, tuple)):
|
|
exog = np.asarray(exog)
|
|
|
|
klass = handle_data_class_factory(endog, exog)
|
|
return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
|
|
**kwargs)
|