187 lines
6.7 KiB
Python
187 lines
6.7 KiB
Python
"""
|
|
Created on Sun Nov 14 08:21:41 2010
|
|
|
|
Author: josef-pktd
|
|
License: BSD (3-clause)
|
|
"""
|
|
|
|
import numpy as np
|
|
import statsmodels.api as sm
|
|
from statsmodels.sandbox.tools import pca
|
|
from statsmodels.sandbox.tools.cross_val import LeaveOneOut
|
|
|
|
#converting example Principal Component Regression to a class
|
|
#from sandbox/example_pca_regression.py
|
|
|
|
|
|
class FactorModelUnivariate:
    '''Principal component regression with a univariate dependent variable.

    The explanatory variables are reduced with principal component analysis
    (PCA) and the dependent variable is regressed with OLS on the leading
    factors, optionally including a constant.

    Parameters
    ----------
    endog : array_like
        Dependent variable, converted with ``np.asarray``.
    exog : array_like
        2d array of explanatory variables from which the factors are
        extracted.

    Todo:
    check treatment of const, make it optional ?
    add hasconst (0 or 1), needed when selecting nfact+hasconst
    options are arguments in calc_factors, should be more public instead
    cross-validation is slow for large number of observations
    '''

    def __init__(self, endog, exog):
        #do this in a superclass?
        self.endog = np.asarray(endog)
        self.exog = np.asarray(exog)

    def calc_factors(self, x=None, keepdim=0, addconst=True):
        '''get factor decomposition of exogenous variables

        This uses principal component analysis to obtain the factors. The
        number of factors kept is the maximum that will be considered in
        the regression.

        Parameters
        ----------
        x : array_like or None
            Data to decompose; defaults to ``self.exog``.
        keepdim : int
            Passed through to ``pca``; 0 keeps all components.
        addconst : bool
            If true, a constant column is prepended to the factors and
            ``self.hasconst`` is set to 1, otherwise to 0.

        Notes
        -----
        Attaches ``exog_reduced``, ``factors``, ``hasconst``, ``evals``
        and ``evecs`` to the instance.
        '''
        if x is None:
            x = self.exog
        else:
            x = np.asarray(x)
        xred, fact, evals, evecs = pca(x, keepdim=keepdim, normalize=1)
        self.exog_reduced = xred
        if addconst:
            self.factors = sm.add_constant(fact, prepend=True)
            self.hasconst = 1  # needs to be int
        else:
            self.factors = fact
            self.hasconst = 0  # needs to be int
        self.evals = evals
        self.evecs = evecs

    def fit_fixed_nfact(self, nfact):
        '''OLS regression of endog on the first `nfact` factors.

        If a constant was added by ``calc_factors`` it is included in
        addition to the ``nfact`` factors.

        Returns
        -------
        results : OLS results instance
        '''
        # BUG fix: the attribute created by calc_factors is `factors`,
        # not `factors_wconst`; the old check never matched and the
        # factor decomposition was recomputed on every call.
        if not hasattr(self, 'factors'):
            self.calc_factors()
        # include the constant column only when one was actually added
        # (`nfact + self.hasconst` equals the old `nfact + 1` in the
        # default addconst=True case)
        return sm.OLS(self.endog,
                      self.factors[:, :nfact + self.hasconst]).fit()

    def fit_find_nfact(self, maxfact=None, skip_crossval=True, cv_iter=None):
        '''estimate the model and selection criteria for up to maxfact factors

        The selection criteria that are calculated are AIC, BIC, and R2_adj. and
        additionally cross-validation prediction error sum of squares if `skip_crossval`
        is false. Cross-validation is not used by default because it can be
        time consuming to calculate.

        By default the cross-validation method is Leave-one-out on the full dataset.
        A different cross-validation sample can be specified as an argument to
        cv_iter.

        Results are attached in `results_find_nfact`

        Parameters
        ----------
        maxfact : int or None
            Maximum number of factors to consider; defaults to all factors
            computed by ``calc_factors`` (temporarily capped at 10).
        skip_crossval : bool
            If true (default), the cross-validation column is ``nan``.
        cv_iter : iterable or None
            Cross-validation iterator yielding (in-sample, out-of-sample)
            index arrays; defaults to leave-one-out.
        '''
        #print 'OLS on Factors'
        if not hasattr(self, 'factors'):
            self.calc_factors()

        hasconst = self.hasconst
        if maxfact is None:
            maxfact = self.factors.shape[1] - hasconst

        if (maxfact + hasconst) < 1:
            raise ValueError('nothing to do, number of factors (incl. constant) should ' +
                             'be at least 1')

        #temporary safety
        maxfact = min(maxfact, 10)

        y0 = self.endog
        results = []
        #xred, fact, eva, eve = pca(x0, keepdim=0, normalize=1)
        for k in range(1, maxfact + hasconst):  # k includes now the constant
            #xred, fact, eva, eve = pca(x0, keepdim=k, normalize=1)
            # this is faster and same result
            fact = self.factors[:, :k]
            res = sm.OLS(y0, fact).fit()

            if not skip_crossval:
                if cv_iter is None:
                    cv_iter = LeaveOneOut(len(y0))
                prederr2 = 0.
                for inidx, outidx in cv_iter:
                    res_l1o = sm.OLS(y0[inidx], fact[inidx, :]).fit()
                    #print data.endog[outidx], res.model.predict(data.exog[outidx,:]),
                    prederr2 += (y0[outidx] -
                                 res_l1o.model.predict(res_l1o.params, fact[outidx, :]))**2.
            else:
                prederr2 = np.nan

            results.append([k, res.aic, res.bic, res.rsquared_adj, prederr2])

        self.results_find_nfact = results = np.array(results)
        # index (into `results` rows) of the best k by AIC, BIC, R2_adj
        # and leave-one-out prediction error, in that order
        self.best_nfact = np.r_[(np.argmin(results[:, 1:3], 0), np.argmax(results[:, 3], 0),
                                 np.argmin(results[:, -1], 0))]

    def summary_find_nfact(self):
        '''provides a summary for the selection of the number of factors

        Returns
        -------
        sumstr : str
            summary of the results for selecting the number of factors

        '''
        if not hasattr(self, 'results_find_nfact'):
            self.fit_find_nfact()

        results = self.results_find_nfact
        sumstr = ''
        sumstr += '\n' + 'Best result for k, by AIC, BIC, R2_adj, L1O'
        sumstr += '\n' + ' '*19 + '%5d %4d %6d %5d' % tuple(self.best_nfact)

        from statsmodels.iolib.table import SimpleTable

        headers = 'k, AIC, BIC, R2_adj, L1O'.split(', ')
        numformat = ['%6d'] + ['%10.3f']*4  #'%10.4f'
        txt_fmt1 = dict(data_fmts=numformat)
        tabl = SimpleTable(results, headers, None, txt_fmt=txt_fmt1)

        sumstr += '\n' + "PCA regression on simulated data,"
        sumstr += '\n' + "DGP: 2 factors and 4 explanatory variables"
        sumstr += '\n' + tabl.__str__()
        sumstr += '\n' + "Notes: k is number of components of PCA,"
        sumstr += '\n' + "       constant is added additionally"
        sumstr += '\n' + "       k=0 means regression on constant only"
        sumstr += '\n' + "       L1O: sum of squared prediction errors for leave-one-out"
        return sumstr
|
|
|
|
|
|
if __name__ == '__main__':

    examples = [1]
    if 1 in examples:
        # simulate data: 2 latent factors plus a constant column drive
        # 4 observed explanatory variables and the dependent variable
        nobs = 500
        f0 = np.c_[np.random.normal(size=(nobs, 2)), np.ones((nobs, 1))]
        # factor-to-x loading matrix; earlier alternatives were dead
        # assignments (immediately overwritten) and are kept as comments:
        # f2xcoef = np.c_[np.repeat(np.eye(2), 2, 0), np.arange(4)[::-1]].T
        # f2xcoef = np.array([[ 1.,  1.,  0.,  0.],
        #                     [ 0.,  0.,  1.,  1.],
        #                     [ 3.,  2.,  1.,  0.]])
        f2xcoef = np.array([[ 0.1,  3.,  1.,  0. ],
                            [ 0.,   0.,  1.5, 0.1],
                            [ 3.,   2.,  1.,  0. ]])
        x0 = np.dot(f0, f2xcoef)
        x0 += 0.1 * np.random.normal(size=x0.shape)
        ytrue = np.dot(f0, [1., 1., 1.])
        y0 = ytrue + 0.1 * np.random.normal(size=ytrue.shape)

        mod = FactorModelUnivariate(y0, x0)
        # without cross-validation (fast), then with it (slower)
        print(mod.summary_find_nfact())
        print("with cross validation - slower")
        mod.fit_find_nfact(maxfact=None, skip_crossval=False, cv_iter=None)
        print(mod.summary_find_nfact())