'''Tools for multivariate analysis


Author : Josef Perktold

License : BSD-3


TODO:

- names of functions, currently just "working titles"

'''


import numpy as np

from statsmodels.tools.tools import Bunch

def partial_project(endog, exog):
    '''helper function to get linear projection or partialling out of variables

    endog variables are projected on exog variables

    Parameters
    ----------
    endog : ndarray
        array of variables where the effect of exog is partialled out.
    exog : ndarray
        array of variables on which the endog variables are projected.

    Returns
    -------
    res : instance of Bunch with

        - params : OLS parameter estimates from projection of endog on exog
        - fittedvalues : predicted values of endog given exog
        - resid : residual of the regression, values of endog with effect of
          exog partialled out

    Notes
    -----
    This is no-frills, mainly for internal calculations; no error checking or
    array conversion is performed, at least for now.

    '''
    x1, x2 = endog, exog
    params = np.linalg.pinv(x2).dot(x1)
    predicted = x2.dot(params)
    residual = x1 - predicted
    res = Bunch(params=params,
                fittedvalues=predicted,
                resid=residual)

    return res
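
# Minimal usage sketch, added for illustration; not part of the original
# module. The helper name `_example_partial_project` and the simulated data
# are hypothetical. It partials a constant and a linear trend out of two
# noisy series and checks that the residuals are orthogonal to exog.
def _example_partial_project():
    np.random.seed(12345)
    nobs = 100
    trend = np.column_stack((np.ones(nobs), np.arange(nobs, dtype=float)))
    endog = 1 + 0.5 * np.arange(nobs)[:, None] + np.random.randn(nobs, 2)
    pp = partial_project(endog, trend)
    # exog.T @ resid should be numerically zero after partialling out
    print('max |exog.T @ resid|:', np.max(np.abs(trend.T.dot(pp.resid))))
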
def cancorr(x1, x2, demean=True, standardize=False):
    '''canonical correlation coefficients between 2 arrays

    Parameters
    ----------
    x1, x2 : ndarrays, 2-D
        two 2-dimensional data arrays, observations in rows, variables in columns
    demean : bool
        If demean is true, then the mean is subtracted from each variable.
    standardize : bool
        If standardize is true, then each variable is demeaned and divided by
        its standard deviation. Rescaling does not change the canonical
        correlation coefficients.

    Returns
    -------
    ccorr : ndarray, 1d
        canonical correlation coefficients, sorted from largest to smallest.
        Note that these are the square roots of the eigenvalues.

    Notes
    -----
    This is a helper function for other statistical functions. It only
    calculates the canonical correlation coefficients and does not do a full
    canonical correlation analysis.

    The canonical correlation coefficients are calculated with the generalized
    matrix inverse, and no exception is raised if one of the data arrays has
    less than full column rank.

    See Also
    --------
    cc_ranktest
    cc_stats
    CCA not yet

    '''
    #x, y = x1, x2
    if demean or standardize:
        x1 = (x1 - x1.mean(0))
        x2 = (x2 - x2.mean(0))

        if standardize:
            #std does not make a difference to canonical correlation coefficients
            x1 /= x1.std(0)
            x2 /= x2.std(0)

    t1 = np.linalg.pinv(x1).dot(x2)
    t2 = np.linalg.pinv(x2).dot(x1)
    m = t1.dot(t2)
    cc = np.sqrt(np.linalg.eigvals(m))
    cc.sort()
    return cc[::-1]
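
# Minimal usage sketch, added for illustration; not part of the original
# module. `_example_cancorr` and the simulated data are hypothetical. Two
# blocks of variables share one common factor, so the largest canonical
# correlation is well above zero; rescaling leaves the coefficients unchanged.
def _example_cancorr():
    np.random.seed(12345)
    nobs = 500
    common = np.random.randn(nobs, 1)
    x1 = common + 0.5 * np.random.randn(nobs, 3)
    x2 = common + 0.5 * np.random.randn(nobs, 2)
    print('cancorr:', cancorr(x1, x2))
    # standardize=True rescales the data but not the coefficients
    print('standardized:', cancorr(x1, x2, standardize=True))
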
def cc_ranktest(x1, x2, demean=True, fullrank=False):
    '''rank tests based on smallest canonical correlation coefficients

    Anderson canonical correlations test (LM test) and
    Cragg-Donald test (Wald test).
    Assumes homoskedasticity and independent observations; overrejects if
    there is heteroskedasticity or autocorrelation.

    The null hypothesis is that the rank is k - 1, the alternative hypothesis
    is that the rank is at least k.

    Parameters
    ----------
    x1, x2 : ndarrays, 2-D
        two 2-dimensional data arrays, observations in rows, variables in columns
    demean : bool
        If demean is true, then the mean is subtracted from each variable.
    fullrank : bool
        If true, then only the test that the matrix has full rank is returned.
        If false, the tests for all possible ranks are returned. However, the
        p-values are not corrected for the multiplicity of tests.

    Returns
    -------
    value : float or ndarray
        value of the LM test statistic (statistics for all possible ranks if
        fullrank is false)
    p-value : float or ndarray
        p-value for the test of the null hypothesis that the smallest
        canonical correlation coefficient is zero, based on the chi-square
        distribution
    df : int or ndarray
        degrees of freedom for the chi-square distribution in the hypothesis
        test
    ccorr : ndarray, 1d
        All canonical correlation coefficients sorted from largest to smallest.
    w_value : float or ndarray
        value of the Wald test statistic
    w_p-value : float or ndarray
        p-value for the Wald test statistic, also based on the chi-square
        distribution

    Notes
    -----
    Degrees of freedom for the distribution of the test statistic are based on
    the number of columns of x1 and x2 and not on their matrix rank.
    (I'm not sure yet what the interpretation of the test is if x1 or x2 are of
    reduced rank.)

    See Also
    --------
    cancorr
    cc_stats

    '''

    from scipy import stats

    nobs1, k1 = x1.shape
    nobs2, k2 = x2.shape

    cc = cancorr(x1, x2, demean=demean)
    cc2 = cc * cc
    if fullrank:
        df = np.abs(k1 - k2) + 1
        value = nobs1 * cc2[-1]
        w_value = nobs1 * (cc2[-1] / (1. - cc2[-1]))
        return value, stats.chi2.sf(value, df), df, cc, w_value, stats.chi2.sf(w_value, df)
    else:
        r = np.arange(min(k1, k2))[::-1]
        df = (k1 - r) * (k2 - r)
        values = nobs1 * cc2[::-1].cumsum()
        w_values = nobs1 * (cc2 / (1. - cc2))[::-1].cumsum()
        return values, stats.chi2.sf(values, df), df, cc, w_values, stats.chi2.sf(w_values, df)
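
# Minimal usage sketch, added for illustration; not part of the original
# module. `_example_cc_ranktest` and the simulated data are hypothetical.
# With a single common factor, the full-rank test illustrates the LM and
# Wald statistics and their chi-square p-values.
def _example_cc_ranktest():
    np.random.seed(12345)
    nobs = 500
    common = np.random.randn(nobs, 1)
    x1 = common + 0.5 * np.random.randn(nobs, 3)
    x2 = common + 0.5 * np.random.randn(nobs, 2)
    value, pval, df, cc, w_value, w_pval = cc_ranktest(x1, x2, fullrank=True)
    print('LM statistic:', value, 'p-value:', pval, 'df:', df)
    print('Wald statistic:', w_value, 'p-value:', w_pval)
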
def cc_stats(x1, x2, demean=True):
    '''MANOVA statistics based on canonical correlation coefficients

    Calculates Pillai's Trace, Wilk's Lambda, Hotelling's Trace and
    Roy's Largest Root.

    Parameters
    ----------
    x1, x2 : ndarrays, 2-D
        two 2-dimensional data arrays, observations in rows, variables in columns
    demean : bool
        If demean is true, then the mean is subtracted from each variable.

    Returns
    -------
    res : dict
        Dictionary containing the test statistics.

    Notes
    -----

    same as `canon` in Stata

    missing: F-statistics and p-values

    TODO: should return a results class instead
    produces nans sometimes, singular, perfect correlation of x1, x2 ?

    '''

    nobs1, k1 = x1.shape  # endogenous ?
    nobs2, k2 = x2.shape
    cc = cancorr(x1, x2, demean=demean)
    cc2 = cc**2
    lam = (cc2 / (1 - cc2))  # what if max cc2 is 1 ?
    # Problem: ccr might not care if x1 or x2 are reduced rank,
    # but df will depend on rank
    df_model = k1 * k2  # df_hypothesis (we do not include mean in x1, x2)
    df_resid = k1 * (nobs1 - k2 - demean)
    s = min(df_model, k1)
    m = 0.5 * (df_model - k1)
    n = 0.5 * (df_resid - k1 - 1)

    df1 = k1 * df_model
    df2 = k2

    pt_value = cc2.sum()  # Pillai's Trace
    wl_value = np.prod(1 / (1 + lam))  # Wilk's Lambda
    ht_value = lam.sum()  # Hotelling's Trace
    rm_value = lam.max()  # Roy's Largest Root
    #from scipy import stats
    # what's the distribution, the test statistic ?
    res = {}
    res['canonical correlation coefficient'] = cc
    res['eigenvalues'] = lam
    res["Pillai's Trace"] = pt_value
    res["Wilk's Lambda"] = wl_value
    res["Hotelling's Trace"] = ht_value
    res["Roy's Largest Root"] = rm_value
    res['df_resid'] = df_resid
    res['df_m'] = m
    return res
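
# Minimal usage sketch, added for illustration; not part of the original
# module. `_example_cc_stats` and the __main__ block below are hypothetical
# additions; the block simply runs all of the sketches above.
def _example_cc_stats():
    np.random.seed(12345)
    nobs = 500
    common = np.random.randn(nobs, 1)
    x1 = common + 0.5 * np.random.randn(nobs, 3)
    x2 = common + 0.5 * np.random.randn(nobs, 2)
    for name, stat in cc_stats(x1, x2).items():
        print(name, ':', stat)


if __name__ == '__main__':
    _example_partial_project()
    _example_cancorr()
    _example_cc_ranktest()
    _example_cc_stats()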