122 lines
3.9 KiB
Python
122 lines
3.9 KiB
Python
|
'''calculating anova and verifying with NIST test data
|
||
|
|
||
|
compares my implementations, stats.f_oneway and anova using statsmodels.OLS
|
||
|
'''
|
||
|
from statsmodels.compat.python import lmap
|
||
|
import os
|
||
|
import numpy as np
|
||
|
from scipy import stats
|
||
|
from statsmodels.tools.tools import add_constant
|
||
|
from statsmodels.regression.linear_model import OLS
|
||
|
from .try_ols_anova import data2dummy
|
||
|
|
||
|
filenameli = ['SiRstv.dat', 'SmLs01.dat', 'SmLs02.dat', 'SmLs03.dat', 'AtmWtAg.dat',
|
||
|
'SmLs04.dat', 'SmLs05.dat', 'SmLs06.dat', 'SmLs07.dat', 'SmLs08.dat',
|
||
|
'SmLs09.dat']
|
||
|
##filename = 'SmLs03.dat' #'SiRstv.dat' #'SmLs09.dat'#, 'AtmWtAg.dat' #'SmLs07.dat'
|
||
|
|
||
|
|
||
|
##path = __file__
|
||
|
##print(locals().keys()
|
||
|
###print(path
|
||
|
|
||
|
|
||
|
def getnist(filename):
|
||
|
here = os.path.dirname(__file__)
|
||
|
fname = os.path.abspath(os.path.join(here, 'data', filename))
|
||
|
with open(fname, encoding="utf-8") as fd:
|
||
|
content = fd.read().split('\n')
|
||
|
|
||
|
data = [line.split() for line in content[60:]]
|
||
|
certified = [line.split() for line in content[40:48] if line]
|
||
|
dataf = np.loadtxt(fname, skiprows=60)
|
||
|
y,x = dataf.T
|
||
|
y = y.astype(int)
|
||
|
caty = np.unique(y)
|
||
|
f = float(certified[0][-1])
|
||
|
R2 = float(certified[2][-1])
|
||
|
resstd = float(certified[4][-1])
|
||
|
dfbn = int(certified[0][-4])
|
||
|
dfwn = int(certified[1][-3]) # dfbn->dfwn is this correct
|
||
|
prob = stats.f.sf(f,dfbn,dfwn)
|
||
|
return y, x, np.array([f, prob, R2, resstd]), certified, caty
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
def anova_oneway(y, x, seq=0):
|
||
|
# new version to match NIST
|
||
|
# no generalization or checking of arguments, tested only for 1d
|
||
|
yrvs = y[:,np.newaxis] #- min(y)
|
||
|
#subracting mean increases numerical accuracy for NIST test data sets
|
||
|
xrvs = x[:,np.newaxis] - x.mean() #for 1d#- 1e12 trick for 'SmLs09.dat'
|
||
|
|
||
|
from .try_catdata import groupsstats_dummy
|
||
|
meang, varg, xdevmeangr, countg = groupsstats_dummy(yrvs[:, :1],
|
||
|
xrvs[:, :1])
|
||
|
# TODO: the following does not work as replacement
|
||
|
# from .try_catdata import groupsstats_dummy, groupstatsbin
|
||
|
# gcount, gmean , meanarr, withinvar, withinvararr = groupstatsbin(y, x)
|
||
|
sswn = np.dot(xdevmeangr.T,xdevmeangr)
|
||
|
ssbn = np.dot((meang-xrvs.mean())**2, countg.T)
|
||
|
nobs = yrvs.shape[0]
|
||
|
ncat = meang.shape[1]
|
||
|
dfbn = ncat - 1
|
||
|
dfwn = nobs - ncat
|
||
|
msb = ssbn/float(dfbn)
|
||
|
msw = sswn/float(dfwn)
|
||
|
f = msb/msw
|
||
|
prob = stats.f.sf(f,dfbn,dfwn)
|
||
|
R2 = (ssbn/(sswn+ssbn)) #R-squared
|
||
|
resstd = np.sqrt(msw) #residual standard deviation
|
||
|
#print(f, prob
|
||
|
|
||
|
def _fix2scalar(z): # return number
|
||
|
if np.shape(z) == (1, 1):
|
||
|
return z[0, 0]
|
||
|
else:
|
||
|
return z
|
||
|
f, prob, R2, resstd = lmap(_fix2scalar, (f, prob, R2, resstd))
|
||
|
return f, prob, R2, resstd
|
||
|
|
||
|
|
||
|
def anova_ols(y, x):
|
||
|
X = add_constant(data2dummy(x), prepend=False)
|
||
|
res = OLS(y, X).fit()
|
||
|
return res.fvalue, res.f_pvalue, res.rsquared, np.sqrt(res.mse_resid)
|
||
|
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
|
||
|
print('\n using new ANOVA anova_oneway')
|
||
|
print('f, prob, R2, resstd')
|
||
|
for fn in filenameli:
|
||
|
print(fn)
|
||
|
y, x, cert, certified, caty = getnist(fn)
|
||
|
res = anova_oneway(y, x)
|
||
|
# TODO: figure out why these results are less accurate/precise
|
||
|
# than others
|
||
|
rtol = {
|
||
|
"SmLs08.dat": .027,
|
||
|
"SmLs07.dat": 1.7e-3,
|
||
|
"SmLs09.dat": 1e-4
|
||
|
}.get(fn, 1e-7)
|
||
|
np.testing.assert_allclose(np.array(res), cert, rtol=rtol)
|
||
|
|
||
|
print('\n using stats ANOVA f_oneway')
|
||
|
for fn in filenameli:
|
||
|
print(fn)
|
||
|
y, x, cert, certified, caty = getnist(fn)
|
||
|
xlist = [x[y==ii] for ii in caty]
|
||
|
res = stats.f_oneway(*xlist)
|
||
|
print(np.array(res) - cert[:2])
|
||
|
|
||
|
print('\n using statsmodels.OLS')
|
||
|
print('f, prob, R2, resstd')
|
||
|
for fn in filenameli[:]:
|
||
|
print(fn)
|
||
|
y, x, cert, certified, caty = getnist(fn)
|
||
|
res = anova_ols(x, y)
|
||
|
print(np.array(res) - cert)
|