242 lines
7.5 KiB
Python
242 lines
7.5 KiB
Python
import numpy as np
|
|
from numpy.testing import (assert_equal, assert_array_equal,
|
|
assert_array_almost_equal, assert_approx_equal,
|
|
assert_allclose)
|
|
import pytest
|
|
from pytest import raises as assert_raises
|
|
from scipy.special import xlogy
|
|
from scipy.stats.contingency import (margins, expected_freq,
|
|
chi2_contingency, association)
|
|
|
|
|
|
def test_margins():
|
|
a = np.array([1])
|
|
m = margins(a)
|
|
assert_equal(len(m), 1)
|
|
m0 = m[0]
|
|
assert_array_equal(m0, np.array([1]))
|
|
|
|
a = np.array([[1]])
|
|
m0, m1 = margins(a)
|
|
expected0 = np.array([[1]])
|
|
expected1 = np.array([[1]])
|
|
assert_array_equal(m0, expected0)
|
|
assert_array_equal(m1, expected1)
|
|
|
|
a = np.arange(12).reshape(2, 6)
|
|
m0, m1 = margins(a)
|
|
expected0 = np.array([[15], [51]])
|
|
expected1 = np.array([[6, 8, 10, 12, 14, 16]])
|
|
assert_array_equal(m0, expected0)
|
|
assert_array_equal(m1, expected1)
|
|
|
|
a = np.arange(24).reshape(2, 3, 4)
|
|
m0, m1, m2 = margins(a)
|
|
expected0 = np.array([[[66]], [[210]]])
|
|
expected1 = np.array([[[60], [92], [124]]])
|
|
expected2 = np.array([[[60, 66, 72, 78]]])
|
|
assert_array_equal(m0, expected0)
|
|
assert_array_equal(m1, expected1)
|
|
assert_array_equal(m2, expected2)
|
|
|
|
|
|
def test_expected_freq():
|
|
assert_array_equal(expected_freq([1]), np.array([1.0]))
|
|
|
|
observed = np.array([[[2, 0], [0, 2]], [[0, 2], [2, 0]], [[1, 1], [1, 1]]])
|
|
e = expected_freq(observed)
|
|
assert_array_equal(e, np.ones_like(observed))
|
|
|
|
observed = np.array([[10, 10, 20], [20, 20, 20]])
|
|
e = expected_freq(observed)
|
|
correct = np.array([[12., 12., 16.], [18., 18., 24.]])
|
|
assert_array_almost_equal(e, correct)
|
|
|
|
|
|
def test_chi2_contingency_trivial():
|
|
# Some very simple tests for chi2_contingency.
|
|
|
|
# A trivial case
|
|
obs = np.array([[1, 2], [1, 2]])
|
|
chi2, p, dof, expected = chi2_contingency(obs, correction=False)
|
|
assert_equal(chi2, 0.0)
|
|
assert_equal(p, 1.0)
|
|
assert_equal(dof, 1)
|
|
assert_array_equal(obs, expected)
|
|
|
|
# A *really* trivial case: 1-D data.
|
|
obs = np.array([1, 2, 3])
|
|
chi2, p, dof, expected = chi2_contingency(obs, correction=False)
|
|
assert_equal(chi2, 0.0)
|
|
assert_equal(p, 1.0)
|
|
assert_equal(dof, 0)
|
|
assert_array_equal(obs, expected)
|
|
|
|
|
|
def test_chi2_contingency_R():
|
|
# Some test cases that were computed independently, using R.
|
|
|
|
# Rcode = \
|
|
# """
|
|
# # Data vector.
|
|
# data <- c(
|
|
# 12, 34, 23, 4, 47, 11,
|
|
# 35, 31, 11, 34, 10, 18,
|
|
# 12, 32, 9, 18, 13, 19,
|
|
# 12, 12, 14, 9, 33, 25
|
|
# )
|
|
#
|
|
# # Create factor tags:r=rows, c=columns, t=tiers
|
|
# r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4")))
|
|
# c <- factor(gl(3, 1, 2*3*4, labels=c("c1", "c2", "c3")))
|
|
# t <- factor(gl(2, 3, 2*3*4, labels=c("t1", "t2")))
|
|
#
|
|
# # 3-way Chi squared test of independence
|
|
# s = summary(xtabs(data~r+c+t))
|
|
# print(s)
|
|
# """
|
|
# Routput = \
|
|
# """
|
|
# Call: xtabs(formula = data ~ r + c + t)
|
|
# Number of cases in table: 478
|
|
# Number of factors: 3
|
|
# Test for independence of all factors:
|
|
# Chisq = 102.17, df = 17, p-value = 3.514e-14
|
|
# """
|
|
obs = np.array(
|
|
[[[12, 34, 23],
|
|
[35, 31, 11],
|
|
[12, 32, 9],
|
|
[12, 12, 14]],
|
|
[[4, 47, 11],
|
|
[34, 10, 18],
|
|
[18, 13, 19],
|
|
[9, 33, 25]]])
|
|
chi2, p, dof, expected = chi2_contingency(obs)
|
|
assert_approx_equal(chi2, 102.17, significant=5)
|
|
assert_approx_equal(p, 3.514e-14, significant=4)
|
|
assert_equal(dof, 17)
|
|
|
|
# Rcode = \
|
|
# """
|
|
# # Data vector.
|
|
# data <- c(
|
|
# #
|
|
# 12, 17,
|
|
# 11, 16,
|
|
# #
|
|
# 11, 12,
|
|
# 15, 16,
|
|
# #
|
|
# 23, 15,
|
|
# 30, 22,
|
|
# #
|
|
# 14, 17,
|
|
# 15, 16
|
|
# )
|
|
#
|
|
# # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
|
|
# r <- factor(gl(2, 2, 2*2*2*2, labels=c("r1", "r2")))
|
|
# c <- factor(gl(2, 1, 2*2*2*2, labels=c("c1", "c2")))
|
|
# d <- factor(gl(2, 4, 2*2*2*2, labels=c("d1", "d2")))
|
|
# t <- factor(gl(2, 8, 2*2*2*2, labels=c("t1", "t2")))
|
|
#
|
|
# # 4-way Chi squared test of independence
|
|
# s = summary(xtabs(data~r+c+d+t))
|
|
# print(s)
|
|
# """
|
|
# Routput = \
|
|
# """
|
|
# Call: xtabs(formula = data ~ r + c + d + t)
|
|
# Number of cases in table: 262
|
|
# Number of factors: 4
|
|
# Test for independence of all factors:
|
|
# Chisq = 8.758, df = 11, p-value = 0.6442
|
|
# """
|
|
obs = np.array(
|
|
[[[[12, 17],
|
|
[11, 16]],
|
|
[[11, 12],
|
|
[15, 16]]],
|
|
[[[23, 15],
|
|
[30, 22]],
|
|
[[14, 17],
|
|
[15, 16]]]])
|
|
chi2, p, dof, expected = chi2_contingency(obs)
|
|
assert_approx_equal(chi2, 8.758, significant=4)
|
|
assert_approx_equal(p, 0.6442, significant=4)
|
|
assert_equal(dof, 11)
|
|
|
|
|
|
def test_chi2_contingency_g():
|
|
c = np.array([[15, 60], [15, 90]])
|
|
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood',
|
|
correction=False)
|
|
assert_allclose(g, 2*xlogy(c, c/e).sum())
|
|
|
|
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood',
|
|
correction=True)
|
|
c_corr = c + np.array([[-0.5, 0.5], [0.5, -0.5]])
|
|
assert_allclose(g, 2*xlogy(c_corr, c_corr/e).sum())
|
|
|
|
c = np.array([[10, 12, 10], [12, 10, 10]])
|
|
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood')
|
|
assert_allclose(g, 2*xlogy(c, c/e).sum())
|
|
|
|
|
|
def test_chi2_contingency_bad_args():
|
|
# Test that "bad" inputs raise a ValueError.
|
|
|
|
# Negative value in the array of observed frequencies.
|
|
obs = np.array([[-1, 10], [1, 2]])
|
|
assert_raises(ValueError, chi2_contingency, obs)
|
|
|
|
# The zeros in this will result in zeros in the array
|
|
# of expected frequencies.
|
|
obs = np.array([[0, 1], [0, 1]])
|
|
assert_raises(ValueError, chi2_contingency, obs)
|
|
|
|
# A degenerate case: `observed` has size 0.
|
|
obs = np.empty((0, 8))
|
|
assert_raises(ValueError, chi2_contingency, obs)
|
|
|
|
|
|
def test_chi2_contingency_yates_gh13875():
|
|
# Magnitude of Yates' continuity correction should not exceed difference
|
|
# between expected and observed value of the statistic; see gh-13875
|
|
observed = np.array([[1573, 3], [4, 0]])
|
|
p = chi2_contingency(observed)[1]
|
|
assert_allclose(p, 1, rtol=1e-12)
|
|
|
|
|
|
@pytest.mark.parametrize("correction", [False, True])
|
|
def test_result(correction):
|
|
obs = np.array([[1, 2], [1, 2]])
|
|
res = chi2_contingency(obs, correction=correction)
|
|
assert_equal((res.statistic, res.pvalue, res.dof, res.expected_freq), res)
|
|
|
|
|
|
def test_bad_association_args():
|
|
# Invalid Test Statistic
|
|
assert_raises(ValueError, association, [[1, 2], [3, 4]], "X")
|
|
# Invalid array shape
|
|
assert_raises(ValueError, association, [[[1, 2]], [[3, 4]]], "cramer")
|
|
# chi2_contingency exception
|
|
assert_raises(ValueError, association, [[-1, 10], [1, 2]], 'cramer')
|
|
# Invalid Array Item Data Type
|
|
assert_raises(ValueError, association,
|
|
np.array([[1, 2], ["dd", 4]], dtype=object), 'cramer')
|
|
|
|
|
|
@pytest.mark.parametrize('stat, expected',
|
|
[('cramer', 0.09222412010290792),
|
|
('tschuprow', 0.0775509319944633),
|
|
('pearson', 0.12932925727138758)])
|
|
def test_assoc(stat, expected):
|
|
# 2d Array
|
|
obs1 = np.array([[12, 13, 14, 15, 16],
|
|
[17, 16, 18, 19, 11],
|
|
[9, 15, 14, 12, 11]])
|
|
a = association(observed=obs1, method=stat)
|
|
assert_allclose(a, expected)
|