'''Tools for multivariate analysis


Author : Josef Perktold
License : BSD-3



TODO:

- names of functions, currently just "working titles"

'''

import numpy as np

from statsmodels.tools.tools import Bunch


def partial_project(endog, exog):
    '''helper function to get linear projection or partialling out of variables

    endog variables are projected on exog variables

    Parameters
    ----------
    endog : ndarray
        array of variables where the effect of exog is partialled out.
    exog : ndarray
        array of variables on which the endog variables are projected.

    Returns
    -------
    res : instance of Bunch with

        - params : OLS parameter estimates from projection of endog on exog
        - fittedvalues : predicted values of endog given exog
        - resid : residual of the regression, values of endog with effect of
          exog partialled out

    Notes
    -----
    This is no-frills mainly for internal calculations, no error checking or
    array conversion is performed, at least for now.

    '''
    x1, x2 = endog, exog
    # least squares via the generalized inverse, works for reduced rank exog
    params = np.linalg.pinv(x2).dot(x1)
    predicted = x2.dot(params)
    residual = x1 - predicted
    res = Bunch(params=params, fittedvalues=predicted, resid=residual)

    return res


def cancorr(x1, x2, demean=True, standardize=False):
    '''canonical correlation coefficient between 2 arrays

    Parameters
    ----------
    x1, x2 : ndarrays, 2_D
        two 2-dimensional data arrays, observations in rows, variables in
        columns
    demean : bool
        If demean is true, then the mean is subtracted from each variable
    standardize : bool
        If standardize is true, then each variable is demeaned and divided by
        its standard deviation. This implies demeaning even if ``demean`` is
        false. Rescaling does not change the canonical correlation
        coefficients.

    Returns
    -------
    ccorr : ndarray, 1d
        canonical correlation coefficients, sorted from largest to smallest.
        Note, that these are the square root of the eigenvalues.
        The array has as many entries as ``x1`` has columns.

    Notes
    -----
    This is a helper function for other statistical functions. It only
    calculates the canonical correlation coefficients and does not do a full
    canonical correlation analysis.

    The canonical correlation coefficient is calculated with the generalized
    matrix inverse and does not raise an exception if one of the data arrays
    have less than full column rank.

    See Also
    --------
    cc_ranktest
    cc_stats
    CCA not yet

    '''
    if demean or standardize:
        # subtraction creates new arrays, the caller's data is not modified
        x1 = (x1 - x1.mean(0))
        x2 = (x2 - x2.mean(0))

    if standardize:
        # std does not make a difference to canonical correlation coefficients
        x1 /= x1.std(0)
        x2 /= x2.std(0)

    t1 = np.linalg.pinv(x1).dot(x2)
    t2 = np.linalg.pinv(x2).dot(x1)
    m = t1.dot(t2)  # square, dimension is number of columns of x1
    cc = np.sqrt(np.linalg.eigvals(m))
    cc.sort()
    return cc[::-1]


def cc_ranktest(x1, x2, demean=True, fullrank=False):
    '''rank tests based on smallest canonical correlation coefficients

    Anderson canonical correlations test (LM test) and
    Cragg-Donald test (Wald test)

    Assumes homoskedasticity and independent observations, overrejects if
    there is heteroscedasticity or autocorrelation.

    The Null Hypothesis is that the rank is k - 1, the alternative hypothesis
    is that the rank is at least k.

    Parameters
    ----------
    x1, x2 : ndarrays, 2_D
        two 2-dimensional data arrays, observations in rows, variables in
        columns
    demean : bool
        If demean is true, then the mean is subtracted from each variable.
    fullrank : bool
        If true, then only the test that the matrix has full rank is returned.
        If false, the tests for all possible ranks are returned. However, the
        p-values are not corrected for the multiplicity of tests.

    Returns
    -------
    value : float or ndarray
        value of the LM (Anderson) test statistic
    p-value : float or ndarray
        p-value for the LM test. Null Hypothesis is that the smallest
        canonical correlation coefficient is zero. Based on chi-square
        distribution
    df : int or ndarray
        degrees of freedom for the chi-square distribution in the hypothesis
        test
    ccorr : ndarray, 1d
        All canonical correlation coefficients sorted from largest to
        smallest.
    w_value : float or ndarray
        value of the Wald (Cragg-Donald) test statistic
    w_pvalue : float or ndarray
        p-value for the Wald test, based on the same chi-square distribution

    Notes
    -----
    Degrees of freedom for the distribution of the test statistic are based on
    number of columns of x1 and x2 and not on their matrix rank.
    (I'm not sure yet what the interpretation of the test is if x1 or x2 are
    of reduced rank.)

    See Also
    --------
    cancorr
    cc_stats

    '''
    from scipy import stats

    nobs1, k1 = x1.shape
    _, k2 = x2.shape  # unpacking also enforces that x2 is 2-D

    # cancorr returns one value per column of its first argument. If x1 has
    # more columns than x2, the extra values are structurally zero and would
    # be picked up as "smallest" coefficient and break the alignment with df.
    # The nonzero canonical correlations do not depend on the order of the
    # arrays, so we put the array with fewer columns first.
    if k1 > k2:
        cc = cancorr(x2, x1, demean=demean)
    else:
        cc = cancorr(x1, x2, demean=demean)
    cc2 = cc * cc

    if fullrank:
        df = np.abs(k1 - k2) + 1
        value = nobs1 * cc2[-1]
        w_value = nobs1 * (cc2[-1] / (1. - cc2[-1]))
        return (value, stats.chi2.sf(value, df), df, cc, w_value,
                stats.chi2.sf(w_value, df))
    else:
        # r is the rank under the null, starting with the largest rank which
        # corresponds to the smallest canonical correlation
        r = np.arange(min(k1, k2))[::-1]
        df = (k1 - r) * (k2 - r)
        values = nobs1 * cc2[::-1].cumsum()
        w_values = nobs1 * (cc2 / (1. - cc2))[::-1].cumsum()
        return (values, stats.chi2.sf(values, df), df, cc, w_values,
                stats.chi2.sf(w_values, df))


def cc_stats(x1, x2, demean=True):
    '''MANOVA statistics based on canonical correlation coefficient

    Calculates Pillai's Trace, Wilk's Lambda, Hotelling's Trace and
    Roy's Largest Root.

    Parameters
    ----------
    x1, x2 : ndarrays, 2_D
        two 2-dimensional data arrays, observations in rows, variables in
        columns
    demean : bool
        If demean is true, then the mean is subtracted from each variable.

    Returns
    -------
    res : dict
        Dictionary containing the test statistics, the canonical correlation
        coefficients, the eigenvalues and the entries ``df_resid`` and
        ``df_m``.

    Notes
    -----

    same as `canon` in Stata

    missing: F-statistics and p-values

    TODO: should return a results class instead
    produces nans sometimes, singular, perfect correlation of x1, x2 ?

    '''
    nobs1, k1 = x1.shape  # endogenous ?
    _, k2 = x2.shape  # unpacking also enforces that x2 is 2-D
    cc = cancorr(x1, x2, demean=demean)
    cc2 = cc**2
    lam = (cc2 / (1 - cc2))  # what if max cc2 is 1 ?
    # Problem: ccr might not care if x1 or x2 are reduced rank,
    # but df will depend on rank
    df_model = k1 * k2  # df_hypothesis (we do not include mean in x1, x2)
    df_resid = k1 * (nobs1 - k2 - demean)
    m = 0.5 * (df_model - k1)

    pt_value = cc2.sum()  # Pillai's trace
    # np.prod, the alias np.product was removed in numpy 2.0
    wl_value = np.prod(1 / (1 + lam))  # Wilk's Lambda
    ht_value = lam.sum()  # Hotelling's Trace
    rm_value = lam.max()  # Roy's largest root
    # from scipy import stats
    # what's the distribution, the test statistic ?
    res = {}
    res['canonical correlation coefficient'] = cc
    res['eigenvalues'] = lam
    res["Pillai's Trace"] = pt_value
    res["Wilk's Lambda"] = wl_value
    res["Hotelling's Trace"] = ht_value
    res["Roy's Largest Root"] = rm_value
    res['df_resid'] = df_resid
    res['df_m'] = m
    return res