'''Multivariate dependence and divergence measures

The standard correlation coefficient measures only linear dependence between
random variables.
Kendall's tau measures any monotonic relationship, whether linear or not.
Mutual information measures any kind of dependence, but it does not
distinguish between positive and negative relationships.

mutualinfo_kde and mutualinfo_binned follow Khan et al. 2007. Both report a
normalized value sqrt(1 - exp(-2 * mi)) that maps mutual information to the
interval [0, 1]; a small illustration of this transformation is sketched
after the imports below.

References
----------
Shiraj Khan, Sharba Bandyopadhyay, Auroop R. Ganguly, Sunil Saigal,
David J. Erickson, III, Vladimir Protopopescu, and George Ostrouchov,
Relative performance of mutual information estimation methods for
quantifying the dependence among short and noisy data,
Phys. Rev. E 76, 026209 (2007)
http://pre.aps.org/abstract/PRE/v76/i2/e026209
'''

import numpy as np
from scipy import stats
from scipy.stats import gaussian_kde

import statsmodels.sandbox.infotheo as infotheo

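# Added illustration (not part of the original module): all estimators below
# report a "normed" value sqrt(1 - exp(-2 * mi)).  For a bivariate normal with
# correlation rho the mutual information is -0.5 * log(1 - rho**2), so this
# transformation maps the mutual information back to abs(rho).  The helper
# name is hypothetical.
def _normed_mi_gaussian(rho=0.6):
    mi = -0.5 * np.log(1 - rho**2)            # theoretical MI of a bivariate normal
    mi_normed = np.sqrt(1. - np.exp(-2 * mi))
    return mi, mi_normed                      # mi_normed equals abs(rho)
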
def mutualinfo_kde(y, x, normed=True):
    '''mutual information of two random variables estimated with kde

    If normed is True, the result is transformed to sqrt(1 - exp(-2 * mi)),
    which lies in [0, 1] and equals the absolute value of the correlation
    coefficient for bivariate normal data. A small usage sketch follows the
    function definition.
    '''
    nobs = len(x)
    if not len(y) == nobs:
        raise ValueError('both data arrays need to have the same size')
    x = np.asarray(x, float)
    y = np.asarray(y, float)
    yx = np.vstack((y, x))
    # kernel density estimates of the marginals and of the joint distribution,
    # evaluated at the observed points
    kde_x = gaussian_kde(x)(x)
    kde_y = gaussian_kde(y)(y)
    kde_yx = gaussian_kde(yx)(yx)

    # pointwise log-density ratio log f(y, x) - log f(y) - log f(x), averaged
    # over the observations (plug-in estimate of the mutual information)
    mi_obs = np.log(kde_yx) - np.log(kde_x) - np.log(kde_y)
    mi = mi_obs.sum() / nobs
    if normed:
        mi_normed = np.sqrt(1. - np.exp(-2 * mi))
        return mi_normed
    else:
        return mi

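# Added usage sketch (hypothetical helper, not part of the original module):
# for jointly normal data with correlation rho, the normed KDE estimate should
# be roughly close to abs(rho); the seed and sample size are arbitrary.
def _example_mutualinfo_kde(rho=0.6, nobs=1000, seed=12345):
    rng = np.random.RandomState(seed)
    x = rng.randn(nobs)
    y = rho * x + np.sqrt(1 - rho**2) * rng.randn(nobs)
    return mutualinfo_kde(y, x)   # normed estimate, roughly comparable to rho
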
def mutualinfo_kde_2sample(y, x, normed=True):
    '''divergence between the distributions of two samples estimated with kde

    This averages the log-density ratio of the two samples' kernel density
    estimates evaluated at the observations in x, i.e. a plug-in estimate of
    the Kullback-Leibler divergence of x relative to y. It is not symmetric
    in the two samples. A small usage sketch follows the function definition.
    '''
    nobs = len(x)
    x = np.asarray(x, float)
    y = np.asarray(y, float)
    #yx = np.vstack((y, x))
    # density of each sample, both evaluated at the points of the x sample
    kde_x = gaussian_kde(x.T)(x.T)
    kde_y = gaussian_kde(y.T)(x.T)
    #kde_yx = gaussian_kde(yx)(yx)

    mi_obs = np.log(kde_x) - np.log(kde_y)
    if len(mi_obs) != nobs:
        raise ValueError("Wrong number of observations")
    mi = mi_obs.mean()
    if normed:
        mi_normed = np.sqrt(1. - np.exp(-2 * mi))
        return mi_normed
    else:
        return mi

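# Added usage sketch (hypothetical helper, not part of the original module):
# for two samples from the same distribution the divergence estimate should be
# close to zero, and it grows as the distributions move apart.
def _example_kde_2sample(nobs=500, seed=12345):
    rng = np.random.RandomState(seed)
    x = rng.randn(nobs)
    y_same = rng.randn(nobs)            # same distribution as x
    y_shifted = rng.randn(nobs) + 2.    # clearly different distribution
    return (mutualinfo_kde_2sample(y_same, x, normed=False),
            mutualinfo_kde_2sample(y_shifted, x, normed=False))
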
def mutualinfo_binned(y, x, bins, normed=True):
    '''mutual information of two random variables estimated from binned data

    Notes
    -----
    bins='auto' selects the number of bins so that approximately 5
    observations are expected to be in each bin under the assumption of
    independence. This follows roughly the description in Khan et al. 2007.
    A small sketch of this rule follows the function definition.

    If normed is True, the binned frequencies and the pointwise contributions
    to the mutual information are returned in addition to the normalized
    estimate.
    '''
    nobs = len(x)
    if not len(y) == nobs:
        raise ValueError('both data arrays need to have the same size')
    x = np.asarray(x, float)
    y = np.asarray(y, float)
    #yx = np.vstack((y, x))

##    fyx, binsy, binsx = np.histogram2d(y, x, bins=bins)
##    fx, binsx_ = np.histogram(x, bins=binsx)
##    fy, binsy_ = np.histogram(y, bins=binsy)

    if isinstance(bins, str) and bins == 'auto':
        ys = np.sort(y)
        xs = np.sort(x)
        #quantiles = np.array([0, 0.25, 0.4, 0.6, 0.75, 1])
        # number of quantile edges chosen so that roughly 5 observations per
        # two-dimensional cell are expected under independence
        qbin_sqr = np.sqrt(5. / nobs)
        quantiles = np.linspace(0, 1, int(round(1. / qbin_sqr)))
        quantile_index = ((nobs - 1) * quantiles).astype(int)
        # move edges so that they do not coincide with an observation
        shift = 1e-6 * np.ones(quantiles.shape)
        shift[0] -= 2 * 1e-6
        binsy = ys[quantile_index] + shift
        binsx = xs[quantile_index] + shift

    elif np.size(bins) == 1:
        # same number of equal-width bins for both variables
        binsy = bins
        binsx = bins
    elif len(bins) == 2:
        # separate bin specification (count or edges) for each variable
        binsy, binsx = bins
##        if np.size(bins[0]) == 1:
##            binsx = bins[0]
##        if np.size(bins[1]) == 1:
##            binsx = bins[1]

    fx, binsx = np.histogram(x, bins=binsx)
    fy, binsy = np.histogram(y, bins=binsy)
    fyx, binsy, binsx = np.histogram2d(y, x, bins=(binsy, binsx))

    # relative frequencies as estimates of the marginal and joint probabilities
    pyx = fyx * 1. / nobs
    px = fx * 1. / nobs
    py = fy * 1. / nobs

    # pointwise contributions to the mutual information; the small constant
    # avoids log(0) for empty cells, which contribute zero because pyx is 0
    mi_obs = pyx * (np.log(pyx + 1e-10) - np.log(py)[:, None] - np.log(px))
    mi = mi_obs.sum()

    if normed:
        mi_normed = np.sqrt(1. - np.exp(-2 * mi))
        return mi_normed, (pyx, py, px, binsy, binsx), mi_obs
    else:
        return mi

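# Added sketch of the 'auto' rule (hypothetical helper, not part of the
# original module): with k quantile bins per margin, each of the k**2 cells
# has an expected count of nobs / k**2 under independence, so about 5 per
# cell requires k ~ sqrt(nobs / 5).  mutualinfo_binned above uses
# round(sqrt(nobs / 5)) quantile edges, e.g. nobs = 200 gives about 6 edges
# (5 bins) per margin.
def _auto_bin_count(nobs, target_per_cell=5):
    return max(2, int(np.round(np.sqrt(nobs / float(target_per_cell)))))
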
if __name__ == '__main__':
    import statsmodels.api as sm

    funtype = ['linear', 'quadratic'][1]
    nobs = 200
    sig = 2  # 5.
    #x = np.linspace(-3, 3, nobs) + np.random.randn(nobs)
    x = np.sort(3 * np.random.randn(nobs))
    exog = sm.add_constant(x, prepend=True)
    #y = 0 + np.log(1 + x**2) + sig * np.random.randn(nobs)
    if funtype == 'quadratic':
        y = 0 + x**2 + sig * np.random.randn(nobs)
    if funtype == 'linear':
        y = 0 + x + sig * np.random.randn(nobs)

    print('correlation')
    print(np.corrcoef(y, x)[0, 1])
    print('pearsonr', stats.pearsonr(y, x))
    print('spearmanr', stats.spearmanr(y, x))
    print('kendalltau', stats.kendalltau(y, x))

    pxy, binsx, binsy = np.histogram2d(x, y, bins=5)
    px, binsx_ = np.histogram(x, bins=binsx)
    py, binsy_ = np.histogram(y, bins=binsy)
    print('mutualinfo', infotheo.mutualinfo(px * 1. / nobs, py * 1. / nobs,
                                            1e-15 + pxy * 1. / nobs, logbase=np.e))

    print('mutualinfo_kde normed', mutualinfo_kde(y, x))
    print('mutualinfo_kde       ', mutualinfo_kde(y, x, normed=False))
    mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = \
               mutualinfo_binned(y, x, 5, normed=True)
    print('mutualinfo_binned normed', mi_normed)
    print('mutualinfo_binned       ', mi_obs.sum())

    mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = \
               mutualinfo_binned(y, x, 'auto', normed=True)
    print('auto')
    print('mutualinfo_binned normed', mi_normed)
    print('mutualinfo_binned       ', mi_obs.sum())

    ys = np.sort(y)
    xs = np.sort(x)
    by = ys[((nobs - 1) * np.array([0, 0.25, 0.4, 0.6, 0.75, 1])).astype(int)]
    bx = xs[((nobs - 1) * np.array([0, 0.25, 0.4, 0.6, 0.75, 1])).astype(int)]
    mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = \
               mutualinfo_binned(y, x, (by, bx), normed=True)
    print('quantiles')
    print('mutualinfo_binned normed', mi_normed)
    print('mutualinfo_binned       ', mi_obs.sum())

    doplot = 1  # False
    if doplot:
        import matplotlib.pyplot as plt
        plt.plot(x, y, 'o')
        olsres = sm.OLS(y, exog).fit()
        plt.plot(x, olsres.fittedvalues)
        plt.show()