AIM-PIbd-32-Kurbanova-A-A/aimenv/Lib/site-packages/patsy/test_build.py

743 lines
30 KiB
Python
Raw Normal View History

2024-10-02 22:15:59 +04:00
# This file is part of Patsy
# Copyright (C) 2012-2013 Nathaniel Smith <njs@pobox.com>
# See file LICENSE.txt for license information.
# There are a number of unit tests in build.py, but this file contains more
# thorough tests of the overall design matrix building system. (These are
# still not exhaustive end-to-end tests, though -- for that see
# test_highlevel.py.)
from __future__ import print_function
import six
import numpy as np
import pytest
from patsy import PatsyError
from patsy.util import (atleast_2d_column_default,
have_pandas, have_pandas_categorical)
from patsy.desc import Term, INTERCEPT
from patsy.build import *
from patsy.categorical import C
from patsy.user_util import balanced, LookupFactor
from patsy.design_info import DesignMatrix, DesignInfo
if have_pandas:
import pandas
def assert_full_rank(m):
    """Assert that the columns of ``m`` are linearly independent.

    ``m`` is first normalised with ``atleast_2d_column_default``. A matrix
    with zero columns is accepted immediately (returning True). Otherwise
    the number of singular values greater than 1e-10 must equal the number
    of columns, or an AssertionError is raised.
    """
    m = atleast_2d_column_default(m)
    if m.shape[1] == 0:
        return True
    # Only the singular values are needed for the rank, so don't ask numpy
    # to compute (and then discard) the U and V factors.
    s = np.linalg.svd(m, compute_uv=False)
    rank = np.sum(s > 1e-10)
    assert rank == m.shape[1]
def test_assert_full_rank():
    """Sanity-check the assert_full_rank helper on known matrices."""
    # These have linearly independent columns and must pass silently.
    for full_rank in (np.eye(10), [[1, 0], [1, 0], [1, 0], [1, 1]]):
        assert_full_rank(full_rank)

    # Each of these must trip the assertion inside assert_full_rank.
    rank_deficient = [
        [[1, 0], [2, 0]],
        [[1, 2], [2, 4]],
        [[1, 2, 3], [1, 10, 100]],
        # col1 + col2 = col3
        [[1, 2, 3], [1, 5, 6], [1, 6, 7]],
    ]
    for matrix in rank_deficient:
        with pytest.raises(AssertionError):
            assert_full_rank(matrix)
def make_termlist(*entries):
    """Return a list containing one Term per positional argument.

    Each entry is iterated and every item is wrapped in a LookupFactor; the
    resulting factors make up that entry's Term. Note that a plain string
    entry is iterated character by character, and an empty entry yields a
    Term with no factors.
    """
    return [Term([LookupFactor(factor_name) for factor_name in names])
            for names in entries]
def check_design_matrix(mm, expected_rank, termlist, column_names=None):
    """Assert basic invariants of a built design matrix.

    Checks that ``mm`` has full column rank, that its design_info carries
    exactly the terms in ``termlist`` (compared as sets), that it is
    two-dimensional with ``expected_rank`` columns and, when
    ``column_names`` is given, that the column names match.
    """
    assert_full_rank(mm)
    info = mm.design_info
    assert set(info.terms) == set(termlist)
    if column_names is not None:
        assert info.column_names == column_names
    n_dims, n_columns = mm.ndim, mm.shape[1]
    assert n_dims == 2
    assert n_columns == expected_rank
def make_matrix(data, expected_rank, entries, column_names=None):
    """Build and validate a single design matrix from ``data``.

    ``entries`` is expanded into a term list with make_termlist, a design
    info is created from a one-shot iterator over ``data``, and the matrix
    is built from the same ``data``. The result is checked against its
    design info and with check_design_matrix before being returned.
    """
    termlist = make_termlist(*entries)

    def iter_maker():
        # The "incremental" data source yields the whole data set once.
        yield data

    design_infos = design_matrix_builders([termlist], iter_maker, eval_env=0)
    built = build_design_matrices(design_infos, data)
    info, matrix = design_infos[0], built[0]

    # The matrix must carry the very design info it was built from.
    assert info.term_slices == matrix.design_info.term_slices
    assert info.column_names == matrix.design_info.column_names
    assert matrix.design_info is info

    check_design_matrix(matrix, expected_rank, termlist,
                        column_names=column_names)
    return matrix
def test_simple():
    """Build small matrices from balanced(a=2, b=2) data and check values."""
    data = balanced(a=2, b=2)
    x1 = np.linspace(0, 1, len(data["a"]))
    x2 = x1 ** 2
    data["x1"] = x1
    data["x2"] = x2

    # Single categorical term
    mat = make_matrix(data, 2, [["a"]], column_names=["a[a1]", "a[a2]"])
    assert np.allclose(mat, [[1, 0], [1, 0], [0, 1], [0, 1]])

    # Empty term (intercept) plus a categorical term
    mat = make_matrix(data, 2, [[], ["a"]],
                      column_names=["Intercept", "a[T.a2]"])
    assert np.allclose(mat, [[1, 0], [1, 0], [1, 1], [1, 1]])

    # A lone two-way categorical interaction
    names = ["a[a1]:b[b1]", "a[a2]:b[b1]", "a[a1]:b[b2]", "a[a2]:b[b2]"]
    expected = [[1, 0, 0, 0],
                [0, 0, 1, 0],
                [0, 1, 0, 0],
                [0, 0, 0, 1]]
    mat = make_matrix(data, 4, [["a", "b"]], column_names=names)
    assert np.allclose(mat, expected)

    # Intercept, both main effects and their interaction (a before b)
    names = ["Intercept", "a[T.a2]", "b[T.b2]", "a[T.a2]:b[T.b2]"]
    expected = [[1, 0, 0, 0],
                [1, 0, 1, 0],
                [1, 1, 0, 0],
                [1, 1, 1, 1]]
    mat = make_matrix(data, 4, [[], ["a"], ["b"], ["a", "b"]],
                      column_names=names)
    assert np.allclose(mat, expected)

    # Same model with the factor order swapped (b before a)
    names = ["Intercept", "b[T.b2]", "a[T.a2]", "b[T.b2]:a[T.a2]"]
    expected = [[1, 0, 0, 0],
                [1, 1, 0, 0],
                [1, 0, 1, 0],
                [1, 1, 1, 1]]
    mat = make_matrix(data, 4, [[], ["b"], ["a"], ["b", "a"]],
                      column_names=names)
    assert np.allclose(mat, expected)

    # Categorical term, numeric term and their interaction
    names = ["a[a1]", "a[a2]", "x1", "a[T.a2]:x1"]
    expected = [[1, 0, x1[0], 0],
                [1, 0, x1[1], 0],
                [0, 1, x1[2], x1[2]],
                [0, 1, x1[3], x1[3]]]
    mat = make_matrix(data, 4, [["a"], ["x1"], ["a", "x1"]],
                      column_names=names)
    assert np.allclose(mat, expected)

    # Purely numeric terms
    mat = make_matrix(data, 3, [["x1"], ["x2"], ["x2", "x1"]],
                      column_names=["x1", "x2", "x2:x1"])
    assert np.allclose(mat, np.column_stack((x1, x2, x1 * x2)))
def test_R_bugs():
    """Check column counts for models that R is noted to get wrong."""
    data = balanced(a=2, b=2, c=2)
    n_rows = len(data["a"])
    data["x"] = np.linspace(0, 1, n_rows)

    # For "1 + a:b", R produces a design matrix with too many columns (5
    # instead of 4), because it can't tell that there is a redundancy between
    # the two terms.
    intercept_and_ab = [[], ["a", "b"]]
    make_matrix(data, 4, intercept_and_ab)

    # For "0 + a:x + a:b", R produces a design matrix with too few columns (4
    # instead of 6), because it thinks that there is a redundancy which
    # doesn't exist.
    ax_and_ab = [["a", "x"], ["a", "b"]]
    make_matrix(data, 6, ax_and_ab)

    # This can be compared with "0 + a:c + a:b", where the redundancy does
    # exist. Confusingly, adding another categorical factor increases the
    # baseline dimensionality to 8, and then the redundancy reduces it to 6
    # again, so the result is the same as before but for different reasons. (R
    # does get this one right, but we might as well test it.)
    ac_and_ab = [["a", "c"], ["a", "b"]]
    make_matrix(data, 6, ac_and_ab)
def test_redundancy_thoroughly():
    """Check the built matrix rank for every combination of terms.

    To make sure there aren't any lurking bugs analogous to the ones that R
    has (see test_R_bugs), the correct matrix rank is checked for every
    possible combination of 2 categorical and 2 numerical factors.
    """
    data = balanced(a=2, b=2, repeat=5)
    data["x1"] = np.linspace(0, 1, len(data["a"]))
    data["x2"] = data["x1"] ** 2

    def all_subsets(items):
        # Yield every subset of `items` as a sorted tuple.
        if items:
            head, tail = items[0], items[1:]
            for subset in all_subsets(tail):
                yield tuple(sorted(subset))
                yield tuple(sorted((head,) + subset))
        else:
            yield tuple()

    all_terms = list(all_subsets(("a", "b", "x1", "x2")))
    all_termlist_templates = list(all_subsets(all_terms))
    print(len(all_termlist_templates))

    # eliminate some of the symmetric versions to speed things up
    redundant = [[("b",), ("a",)],
                 [("x2",), ("x1",)],
                 [("b", "x2"), ("a", "x1")],
                 [("a", "b", "x2"), ("a", "b", "x1")],
                 [("b", "x1", "x2"), ("a", "x1", "x2")]]

    count = 0
    import time
    start = time.time()
    for termlist_template in all_termlist_templates:
        termlist_set = set(termlist_template)
        # Skip templates that contain a dispreferred term without its
        # preferred counterpart.
        if any(dispreferred in termlist_set
               and preferred not in termlist_set
               for dispreferred, preferred in redundant):
            continue

        expanded_terms = set()
        for term_template in termlist_template:
            numeric = tuple(t for t in term_template if t.startswith("x"))
            rest = [t for t in term_template if not t.startswith("x")]
            expanded_terms.update(frozenset(subset_rest + numeric)
                                  for subset_rest in all_subsets(rest))

        # Because our categorical variables have 2 levels, each expanded
        # term corresponds to 1 unique dimension of variation
        expected_rank = len(expanded_terms)
        if termlist_template in [(), ((),)]:
            # No data dependence, should fail
            with pytest.raises(PatsyError):
                make_matrix(data, expected_rank, termlist_template)
        else:
            make_matrix(data, expected_rank, termlist_template)

        count += 1
        if count % 100 == 0:
            print("Completed:", count)
    elapsed = time.time() - start
    print("Took %0.2f seconds" % (elapsed,))


test_redundancy_thoroughly.slow = 1
def test_data_types():
    """Check that the same data in different containers builds the same matrix.

    The data is supplied as a plain dict, as structured arrays and recarrays
    with bytes ("S2") and unicode ("U2") string fields and, when pandas is
    available, as DataFrames built from bytes and unicode strings. Every
    variant must produce the same column names and values.
    """
    basic_dict = {"a": ["a1", "a2", "a1", "a2"],
                  "x": [1, 2, 3, 4]}
    # On Python 2, this is identical to basic_dict:
    basic_dict_bytes = dict(basic_dict)
    basic_dict_bytes["a"] = [s.encode("ascii") for s in basic_dict_bytes["a"]]
    # On Python 3, this is identical to basic_dict:
    basic_dict_unicode = dict(basic_dict)
    basic_dict_unicode["a"] = [six.text_type(s) for s in basic_dict_unicode["a"]]

    structured_array_bytes = np.array(list(zip(basic_dict["a"],
                                               basic_dict["x"])),
                                      dtype=[("a", "S2"), ("x", int)])
    structured_array_unicode = np.array(list(zip(basic_dict["a"],
                                                 basic_dict["x"])),
                                        dtype=[("a", "U2"), ("x", int)])
    recarray_bytes = structured_array_bytes.view(np.recarray)
    recarray_unicode = structured_array_unicode.view(np.recarray)

    datas = [basic_dict, structured_array_bytes, structured_array_unicode,
             recarray_bytes, recarray_unicode]
    if have_pandas:
        df_bytes = pandas.DataFrame(basic_dict_bytes)
        datas.append(df_bytes)
        df_unicode = pandas.DataFrame(basic_dict_unicode)
        datas.append(df_unicode)

    for data in datas:
        m = make_matrix(data, 4, [["a"], ["a", "x"]],
                        column_names=["a[a1]", "a[a2]", "a[a1]:x", "a[a2]:x"])
        assert np.allclose(m, [[1, 0, 1, 0],
                               [0, 1, 0, 2],
                               [1, 0, 3, 0],
                               [0, 1, 0, 4]])
def test_build_design_matrices_dtype():
    """Check the dtype of built matrices, by default and when requested."""
    data = {"x": [1, 2, 3]}

    def iter_maker():
        yield data

    (builder,) = design_matrix_builders([make_termlist("x")], iter_maker, 0)

    # Without an explicit dtype the result is float64.
    default_mat = build_design_matrices([builder], data)[0]
    assert default_mat.dtype == np.dtype(np.float64)

    # An explicit dtype is honoured; float128 is only tried where numpy
    # provides it.
    requested = [np.float32]
    if hasattr(np, "float128"):
        requested.append(np.float128)
    for dtype in requested:
        (mat,) = build_design_matrices([builder], data, dtype=dtype)
        assert mat.dtype == np.dtype(dtype)
def test_return_type():
    """Check handling of the return_type argument of build_design_matrices."""
    data = {"x": [1, 2, 3]}

    def iter_maker():
        yield data

    (builder,) = design_matrix_builders([make_termlist("x")], iter_maker, 0)

    # Check explicitly passing return_type="matrix" works
    (mat,) = build_design_matrices([builder], data, return_type="matrix")
    assert isinstance(mat, DesignMatrix)

    # Check that nonsense is detected
    with pytest.raises(PatsyError):
        build_design_matrices([builder], data, return_type="asdfsadf")
def test_NA_action():
    """Check the NA_action argument of build_design_matrices."""
    initial_data = {"x": [1, 2, 3], "c": ["c1", "c2", "c1"]}

    def iter_maker():
        yield initial_data

    builder = design_matrix_builders([make_termlist("x", "c")],
                                     iter_maker, 0)[0]

    def data_with_missing():
        # Fresh input for each call: a NaN in "x" (row 1) and a None in
        # "c" (row 2), leaving only row 0 complete.
        return {"x": [10.0, np.nan, 20.0],
                "c": np.asarray(["c1", "c2", None], dtype=object)}

    # By default drops rows containing either NaN or None;
    # NA_action="a string" also accepted:
    for extra_kwargs in ({}, {"NA_action": "drop"}):
        mat = build_design_matrices([builder], data_with_missing(),
                                    **extra_kwargs)[0]
        assert mat.shape == (1, 3)
        assert np.array_equal(mat, [[1.0, 0.0, 10.0]])

    # And objects
    from patsy.missing import NAAction
    # allows NaN's to pass through
    passthrough = NAAction(NA_types=[])
    nan_data = {"x": [10.0, np.nan],
                "c": np.asarray(["c1", "c2"], dtype=object)}
    mat = build_design_matrices([builder], nan_data,
                                NA_action=passthrough)[0]
    assert mat.shape == (2, 3)
    # According to this (and only this) function, NaN == NaN.
    np.testing.assert_array_equal(mat,
                                  [[1.0, 0.0, 10.0], [0.0, 1.0, np.nan]])

    # NA_action="raise"
    with pytest.raises(PatsyError):
        build_design_matrices([builder], data_with_missing(),
                              NA_action="raise")
def test_NA_drop_preserves_levels():
    """Check that dropping NA rows does not drop categorical levels.

    Even if all instances of some level are dropped, we still include it in
    the output matrix (as an all-zeros column).
    """
    data = {"x": [1.0, np.nan, 3.0], "c": ["c1", "c2", "c3"]}

    def iter_maker():
        yield data

    (design_info,) = design_matrix_builders([make_termlist("x", "c")],
                                            iter_maker, 0)
    expected_names = ["c[c1]", "c[c2]", "c[c3]", "x"]
    assert design_info.column_names == expected_names

    (mat,) = build_design_matrices([design_info], data)
    # The row with NaN in "x" is gone, but the c[c2] column remains.
    expected = [[1.0, 0.0, 0.0, 1.0],
                [0.0, 0.0, 1.0, 3.0]]
    assert mat.shape == (2, 4)
    assert np.array_equal(mat, expected)
def test_return_type_pandas():
    """Check pandas input and output handling in build_design_matrices.

    Returns immediately, without checking anything, when pandas is not
    available.
    """
    if not have_pandas:
        return

    data = pandas.DataFrame({"x": [1, 2, 3],
                             "y": [4, 5, 6],
                             "a": ["a1", "a2", "a1"]},
                            index=[10, 20, 30])

    def iter_maker():
        yield data

    (int_builder,) = design_matrix_builders([make_termlist([])],
                                            iter_maker, 0)
    y_builder, x_builder = design_matrix_builders(
        [make_termlist("y"), make_termlist("x")], iter_maker, eval_env=0)
    x_a_builder = design_matrix_builders(
        [make_termlist("x", "a")], iter_maker, eval_env=0)[0]
    x_y_builder = design_matrix_builders(
        [make_termlist("x", "y")], iter_maker, eval_env=0)[0]

    # Index compatibility is always checked for pandas input, regardless of
    # whether we're producing pandas output
    reversed_a = {"x": data["x"], "a": data["a"][::-1]}
    with pytest.raises(PatsyError):
        build_design_matrices([x_a_builder], reversed_a)
    reversed_y = {"x": data["x"], "y": data["y"][::-1]}
    with pytest.raises(PatsyError):
        build_design_matrices([y_builder, x_builder], reversed_y)

    # And we also check consistency between data.index and value indexes
    # Creating a mismatch between these is a bit tricky. We want a data object
    # such that isinstance(data, DataFrame), but data["x"].index !=
    # data.index.
    class CheatingDataFrame(pandas.DataFrame):
        def __getitem__(self, key):
            if key == "x":
                return pandas.DataFrame.__getitem__(self, key)[::-1]
            return pandas.DataFrame.__getitem__(self, key)

    cheating = CheatingDataFrame(data)
    with pytest.raises(PatsyError):
        build_design_matrices([x_builder], cheating)

    # A mix of pandas input and unindexed input is fine
    mixed_input = {"x": data["x"], "y": [40, 50, 60]}
    mat = build_design_matrices([x_y_builder], mixed_input)[0]
    assert np.allclose(mat, [[1, 40], [2, 50], [3, 60]])

    def check_y_x_frames(y_frame, x_frame, y_values):
        # Both results must be DataFrames carrying the input index, a
        # single correctly named column and matching design_info.
        expectations = ((y_frame, "y", y_values),
                        (x_frame, "x", [[1], [2], [3]]))
        for frame, name, values in expectations:
            assert isinstance(frame, pandas.DataFrame)
            assert np.array_equal(frame, values)
            assert np.array_equal(frame.index, [10, 20, 30])
            assert np.array_equal(frame.columns, [name])
            assert frame.design_info.column_names == [name]
            assert frame.design_info.term_names == [name]

    # with return_type="dataframe", we get out DataFrames with nice indices
    # and nice column names and design_info
    y_df, x_df = build_design_matrices([y_builder, x_builder], data,
                                       return_type="dataframe")
    check_y_x_frames(y_df, x_df, [[4], [5], [6]])

    # Same with mix of pandas and unindexed info, even if in different
    # matrices
    y_df, x_df = build_design_matrices([y_builder, x_builder],
                                       {"y": [7, 8, 9], "x": data["x"]},
                                       return_type="dataframe")
    check_y_x_frames(y_df, x_df, [[7], [8], [9]])

    # Check categorical works for carrying index too
    x_a_df = build_design_matrices([x_a_builder],
                                   {"x": [-1, -2, -3], "a": data["a"]},
                                   return_type="dataframe")[0]
    assert isinstance(x_a_df, pandas.DataFrame)
    assert np.array_equal(x_a_df, [[1, 0, -1], [0, 1, -2], [1, 0, -3]])
    assert np.array_equal(x_a_df.index, [10, 20, 30])

    # And if we have no indexed input, then we let pandas make up an index as
    # per its usual rules:
    x_y_df = build_design_matrices([x_y_builder],
                                   {"y": [7, 8, 9], "x": [10, 11, 12]},
                                   return_type="dataframe")[0]
    assert isinstance(x_y_df, pandas.DataFrame)
    assert np.array_equal(x_y_df, [[10, 7], [11, 8], [12, 9]])
    assert np.array_equal(x_y_df.index, [0, 1, 2])

    # If 'data' is a DataFrame, then that suffices, even if no factors are
    # available.
    int_df = build_design_matrices([int_builder], data,
                                   return_type="dataframe")[0]
    assert isinstance(int_df, pandas.DataFrame)
    assert np.array_equal(int_df, [[1], [1], [1]])
    assert int_df.index.equals(pandas.Index([10, 20, 30]))

    # Temporarily pretend pandas is missing; the flag is always restored.
    import patsy.build
    had_pandas = patsy.build.have_pandas
    try:
        patsy.build.have_pandas = False
        # return_type="dataframe" gives a nice error if pandas is not available
        with pytest.raises(PatsyError):
            build_design_matrices([x_builder], {"x": [1, 2, 3]},
                                  return_type="dataframe")
    finally:
        patsy.build.have_pandas = had_pandas

    # Only the last row has neither NaN nor None, so it alone remains.
    na_input = {"x": [1.0, np.nan, 3.0],
                "a": np.asarray([None, "a2", "a1"], dtype=object)}
    (surviving_df,) = build_design_matrices([x_a_builder], na_input,
                                            NA_action="drop",
                                            return_type="dataframe")
    assert surviving_df.index.equals(pandas.Index([2]))
def test_data_mismatch():
    """Check that inconsistent data for one factor raises PatsyError."""
    test_cases_twoway = [
        # Data type mismatch
        ([1, 2, 3], [True, False, True]),
        (C(["a", "b", "c"], levels=["c", "b", "a"]),
         C(["a", "b", "c"], levels=["a", "b", "c"])),
        # column number mismatches
        ([[1], [2], [3]], [[1, 1], [2, 2], [3, 3]]),
        ([[1, 1, 1], [2, 2, 2], [3, 3, 3]], [[1, 1], [2, 2], [3, 3]]),
    ]
    test_cases_oneway = [
        ([1, 2, 3], ["a", "b", "c"]),
        ([1, 2, 3], C(["a", "b", "c"])),
        ([True, False, True], C(["a", "b", "c"])),
        ([True, False, True], ["a", "b", "c"]),
    ]
    setup_predict_only = [
        # This is not an error if both are fed in during make_builders, but it
        # is an error to pass one to make_builders and the other to
        # make_matrices.
        (["a", "b", "c"], ["a", "b", "d"]),
    ]
    termlist = make_termlist(["x"])

    def t_incremental(data1, data2):
        # Both chunks are seen during setup; a PatsyError must be raised
        # somewhere between setup and the two builds.
        def iter_maker():
            yield {"x": data1}
            yield {"x": data2}

        raised = False
        try:
            builders = design_matrix_builders([termlist], iter_maker, 0)
            build_design_matrices(builders, {"x": data1})
            build_design_matrices(builders, {"x": data2})
        except PatsyError:
            raised = True
        if not raised:
            raise AssertionError

    def t_setup_predict(data1, data2):
        # Set up on data1 only; building from data2 must then fail.
        def iter_maker():
            yield {"x": data1}

        builders = design_matrix_builders([termlist], iter_maker, 0)
        with pytest.raises(PatsyError):
            build_design_matrices(builders, {"x": data2})

    for first, second in test_cases_twoway:
        for check in (t_incremental, t_setup_predict):
            check(first, second)
            check(second, first)

    for first, second in test_cases_oneway:
        t_incremental(first, second)
        t_setup_predict(first, second)

    for first, second in setup_predict_only:
        t_setup_predict(first, second)
        t_setup_predict(second, first)

    # Factors with different numbers of rows
    with pytest.raises(PatsyError):
        make_matrix({"x": [1, 2, 3], "y": [1, 2, 3, 4]},
                    2, [["x"], ["y"]])
def test_data_independent_builder():
    """Check matrices whose terms do not depend on the data at all.

    Covers a builder with no terms ("null") and one with only the empty
    term (intercept): alone with dict data they must raise PatsyError,
    while a DataFrame or a data-dependent sibling matrix supplies the
    number of rows.
    """
    data = {"x": [1, 2, 3]}

    def iter_maker():
        yield data

    # Trying to build a matrix that doesn't depend on the data at all is an
    # error, if:
    # - the index argument is not given
    # - the data is not a DataFrame
    # - there are no other matrices
    null_builder = design_matrix_builders([make_termlist()], iter_maker, 0)[0]
    pytest.raises(PatsyError, build_design_matrices, [null_builder], data)

    intercept_builder = design_matrix_builders([make_termlist([])],
                                               iter_maker,
                                               eval_env=0)[0]
    pytest.raises(PatsyError, build_design_matrices, [intercept_builder], data)

    pytest.raises(PatsyError,
                  build_design_matrices,
                  [null_builder, intercept_builder], data)

    # If data is a DataFrame, it sets the number of rows.
    if have_pandas:
        int_m, null_m = build_design_matrices([intercept_builder,
                                               null_builder],
                                              pandas.DataFrame(data))
        assert np.allclose(int_m, [[1], [1], [1]])
        assert null_m.shape == (3, 0)

    # If there are other matrices that do depend on the data, we make the
    # data-independent matrices have the same number of rows.
    x_termlist = make_termlist(["x"])

    builders = design_matrix_builders([x_termlist, make_termlist()],
                                      iter_maker,
                                      eval_env=0)
    x_m, null_m = build_design_matrices(builders, data)
    assert np.allclose(x_m, [[1], [2], [3]])
    assert null_m.shape == (3, 0)

    builders = design_matrix_builders([x_termlist, make_termlist([])],
                                      iter_maker,
                                      eval_env=0)
    # Built once only: the second matrix here is the intercept matrix.
    x_m, intercept_m = build_design_matrices(builders, data)
    assert np.allclose(x_m, [[1], [2], [3]])
    assert np.allclose(intercept_m, [[1], [1], [1]])
def test_same_factor_in_two_matrices():
    """Check that one factor can be shared by two matrices built together."""
    data = {"x": [1, 2, 3], "a": ["a1", "a2", "a1"]}

    def iter_maker():
        yield data

    x_terms = make_termlist(["x"])
    x_by_a_terms = make_termlist(["x", "a"])
    builders = design_matrix_builders([x_terms, x_by_a_terms],
                                      iter_maker, eval_env=0)
    x_matrix, x_by_a_matrix = build_design_matrices(builders, data)

    check_design_matrix(x_matrix, 1, x_terms, column_names=["x"])
    assert np.allclose(x_matrix, [[1], [2], [3]])

    check_design_matrix(x_by_a_matrix, 2, x_by_a_terms,
                        column_names=["x:a[a1]", "x:a[a2]"])
    assert np.allclose(x_by_a_matrix, [[1, 0], [0, 2], [3, 0]])
def test_eval_env_type_builder():
    """Check that a string eval_env is rejected with TypeError."""
    data = {"x": [1, 2, 3]}

    def iter_maker():
        yield data

    termlists = [make_termlist("x")]
    with pytest.raises(TypeError):
        design_matrix_builders(termlists, iter_maker, "foo")
def test_categorical():
    """Check that categorical data representations are interchangeable.

    Every pairing of setup data and build data is run; the test passes as
    long as none of them raises.
    """
    datas = [
        {"a": ["a1", "a2", "a1"]},
        {"a": C(["a2", "a1", "a2"])},
    ]
    if have_pandas_categorical:
        datas.append({"a": pandas.Categorical(["a1", "a2", "a2"])})

    def t(data1, data2):
        # Set up the builder from data1, then build a matrix from data2.
        def iter_maker():
            yield data1

        builders = design_matrix_builders([make_termlist(["a"])],
                                          iter_maker,
                                          eval_env=0)
        build_design_matrices(builders, data2)

    pairs = [(setup_data, build_data)
             for setup_data in datas
             for build_data in datas]
    for setup_data, build_data in pairs:
        t(setup_data, build_data)
def test_contrast():
    """Check matrices built with default, Sum and custom contrast codings."""
    from patsy.contrasts import ContrastMatrix, Sum

    values = ["a1", "a3", "a1", "a2"]

    # Expected values shared by the no-intercept and intercept variants.
    # Output from R
    sum_expected = [[1, 1, 0],
                    [1, -1, -1],
                    [1, 1, 0],
                    [1, 0, 1]]
    # Output from R
    sum_omit0_expected = [[1, -1, -1],
                          [1, 0, 1],
                          [1, -1, -1],
                          [1, 1, 0]]
    custom_coding = [[7, 12],
                     [2, 13],
                     [8, -1]]
    custom_expected = [[7, 12],
                       [8, -1],
                       [7, 12],
                       [2, 13]]

    # No intercept in model, full-rank coding of 'a'
    no_intercept = [["a"]]
    mat = make_matrix({"a": C(values)}, 3, no_intercept,
                      column_names=["a[a1]", "a[a2]", "a[a3]"])
    assert np.allclose(mat, [[1, 0, 0],
                             [0, 0, 1],
                             [1, 0, 0],
                             [0, 1, 0]])

    # Both the Sum class and a Sum instance are accepted.
    for contrast in (Sum, Sum()):
        mat = make_matrix({"a": C(values, contrast)}, 3, no_intercept,
                          column_names=["a[mean]", "a[S.a1]", "a[S.a2]"])
        assert np.allclose(mat, sum_expected)

    mat = make_matrix({"a": C(values, Sum(omit=0))}, 3, no_intercept,
                      column_names=["a[mean]", "a[S.a2]", "a[S.a3]"])
    assert np.allclose(mat, sum_omit0_expected)

    # Intercept in model, non-full-rank coding of 'a'
    with_intercept = [[], ["a"]]
    mat = make_matrix({"a": C(values)}, 3, with_intercept,
                      column_names=["Intercept", "a[T.a2]", "a[T.a3]"])
    assert np.allclose(mat, [[1, 0, 0],
                             [1, 0, 1],
                             [1, 0, 0],
                             [1, 1, 0]])

    for contrast in (Sum, Sum()):
        mat = make_matrix({"a": C(values, contrast)}, 3, with_intercept,
                          column_names=["Intercept", "a[S.a1]", "a[S.a2]"])
        assert np.allclose(mat, sum_expected)

    mat = make_matrix({"a": C(values, Sum(omit=0))}, 3, with_intercept,
                      column_names=["Intercept", "a[S.a2]", "a[S.a3]"])
    assert np.allclose(mat, sum_omit0_expected)

    # Weird ad hoc less-than-full-rank coding of 'a'
    mat = make_matrix({"a": C(values, [[7, 12],
                                       [2, 13],
                                       [8, -1]])},
                      2, no_intercept,
                      column_names=["a[custom0]", "a[custom1]"])
    assert np.allclose(mat, custom_expected)

    # The same coding wrapped in a ContrastMatrix with explicit suffixes
    named_coding = ContrastMatrix(custom_coding, ["[foo]", "[bar]"])
    mat = make_matrix({"a": C(values, named_coding)},
                      2, no_intercept,
                      column_names=["a[foo]", "a[bar]"])
    assert np.allclose(mat, custom_expected)
def test_DesignInfo_subset():
    """Check DesignInfo.subset against column slices of the full matrix.

    For each combination of:
      formula, term names, term objects, mixed term name and term objects
    check that results match subset of full build
    and that removed variables don't hurt
    """
    all_data = {"x": [1, 2],
                "y": [[3.1, 3.2],
                      [4.1, 4.2]],
                "z": [5, 6]}
    all_terms = make_termlist("x", "y", "z")

    def iter_maker():
        yield all_data

    (all_builder,) = design_matrix_builders([all_terms], iter_maker, 0)
    (full_matrix,) = build_design_matrices([all_builder], all_data)

    def check_subset(which_terms, variables, columns):
        # Build from the subset design info using only `variables`, and
        # compare with the given columns of the full matrix.
        sub_design_info = all_builder.subset(which_terms)
        sub_data = dict((name, all_data[name]) for name in variables)
        sub_matrix = build_design_matrices([sub_design_info], sub_data)[0]
        expected = full_matrix[:, columns]
        if not isinstance(which_terms, six.string_types):
            assert len(which_terms) == len(sub_design_info.terms)
        assert np.array_equal(sub_matrix, expected)

    x_term, y_term, z_term = all_terms[0], all_terms[1], all_terms[2]

    # All three terms, in the original order
    xyz = ["x", "y", "z"]
    everything = slice(None)
    check_subset("~ 0 + x + y + z", xyz, everything)
    check_subset(["x", "y", "z"], xyz, everything)
    # Compatibility: six.PY2 wasn't added until 1.4.0, but six.PY3 exists in
    # all versions.
    if not six.PY3:
        check_subset([unicode("x"), unicode("y"), unicode("z")],
                     xyz, everything)
    check_subset(all_terms, xyz, everything)
    check_subset([x_term, "y", z_term], xyz, everything)

    # x and z only, in the original order
    xz = ["x", "z"]
    check_subset("~ 0 + x + z", xz, [0, 3])
    check_subset(["x", "z"], xz, [0, 3])
    # Compatibility: six.PY2 wasn't added until 1.4.0, but six.PY3 exists in
    # all versions.
    if not six.PY3:
        check_subset([unicode("x"), unicode("z")], xz, [0, 3])
    check_subset([x_term, z_term], xz, [0, 3])
    check_subset([x_term, "z"], xz, [0, 3])

    # x and z only, in reversed order
    check_subset("~ 0 + z + x", xz, [3, 0])
    check_subset(["z", "x"], xz, [3, 0])
    check_subset([six.text_type("z"), six.text_type("x")], xz, [3, 0])
    check_subset([z_term, x_term], xz, [3, 0])
    check_subset([z_term, "x"], xz, [3, 0])

    # y alone, which occupies two columns
    check_subset("~ 0 + y", ["y"], [1, 2])
    check_subset(["y"], ["y"], [1, 2])
    check_subset([six.text_type("y")], ["y"], [1, 2])
    check_subset([y_term], ["y"], [1, 2])

    # Formula can't have a LHS
    with pytest.raises(PatsyError):
        all_builder.subset("a ~ a")
    # Term must exist
    with pytest.raises(KeyError):
        all_builder.subset("~ asdf")
    with pytest.raises(KeyError):
        all_builder.subset(["asdf"])
    unknown_terms = [Term(["asdf"])]
    with pytest.raises(KeyError):
        all_builder.subset(unknown_terms)

    # Also check for a minimal DesignInfo (column names only)
    min_di = DesignInfo(["a", "b", "c"])
    min_di_subset = min_di.subset(["c", "a"])
    assert min_di_subset.column_names == ["c", "a"]
    assert min_di_subset.terms is None