# This file is part of Patsy # Copyright (C) 2012-2013 Nathaniel Smith # See file LICENSE.txt for license information. # There are a number of unit tests in build.py, but this file contains more # thorough tests of the overall design matrix building system. (These are # still not exhaustive end-to-end tests, though -- for that see # test_highlevel.py.) from __future__ import print_function import six import numpy as np import pytest from patsy import PatsyError from patsy.util import (atleast_2d_column_default, have_pandas, have_pandas_categorical) from patsy.desc import Term, INTERCEPT from patsy.build import * from patsy.categorical import C from patsy.user_util import balanced, LookupFactor from patsy.design_info import DesignMatrix, DesignInfo if have_pandas: import pandas def assert_full_rank(m): m = atleast_2d_column_default(m) if m.shape[1] == 0: return True u, s, v = np.linalg.svd(m) rank = np.sum(s > 1e-10) assert rank == m.shape[1] def test_assert_full_rank(): assert_full_rank(np.eye(10)) assert_full_rank([[1, 0], [1, 0], [1, 0], [1, 1]]) pytest.raises(AssertionError, assert_full_rank, [[1, 0], [2, 0]]) pytest.raises(AssertionError, assert_full_rank, [[1, 2], [2, 4]]) pytest.raises(AssertionError, assert_full_rank, [[1, 2, 3], [1, 10, 100]]) # col1 + col2 = col3 pytest.raises(AssertionError, assert_full_rank, [[1, 2, 3], [1, 5, 6], [1, 6, 7]]) def make_termlist(*entries): terms = [] for entry in entries: terms.append(Term([LookupFactor(name) for name in entry])) return terms def check_design_matrix(mm, expected_rank, termlist, column_names=None): assert_full_rank(mm) assert set(mm.design_info.terms) == set(termlist) if column_names is not None: assert mm.design_info.column_names == column_names assert mm.ndim == 2 assert mm.shape[1] == expected_rank def make_matrix(data, expected_rank, entries, column_names=None): termlist = make_termlist(*entries) def iter_maker(): yield data design_infos = design_matrix_builders([termlist], iter_maker, eval_env=0) matrices = build_design_matrices(design_infos, data) matrix = matrices[0] assert (design_infos[0].term_slices == matrix.design_info.term_slices) assert (design_infos[0].column_names == matrix.design_info.column_names) assert matrix.design_info is design_infos[0] check_design_matrix(matrix, expected_rank, termlist, column_names=column_names) return matrix def test_simple(): data = balanced(a=2, b=2) x1 = data["x1"] = np.linspace(0, 1, len(data["a"])) x2 = data["x2"] = data["x1"] ** 2 m = make_matrix(data, 2, [["a"]], column_names=["a[a1]", "a[a2]"]) assert np.allclose(m, [[1, 0], [1, 0], [0, 1], [0, 1]]) m = make_matrix(data, 2, [[], ["a"]], column_names=["Intercept", "a[T.a2]"]) assert np.allclose(m, [[1, 0], [1, 0], [1, 1], [1, 1]]) m = make_matrix(data, 4, [["a", "b"]], column_names=["a[a1]:b[b1]", "a[a2]:b[b1]", "a[a1]:b[b2]", "a[a2]:b[b2]"]) assert np.allclose(m, [[1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]) m = make_matrix(data, 4, [[], ["a"], ["b"], ["a", "b"]], column_names=["Intercept", "a[T.a2]", "b[T.b2]", "a[T.a2]:b[T.b2]"]) assert np.allclose(m, [[1, 0, 0, 0], [1, 0, 1, 0], [1, 1, 0, 0], [1, 1, 1, 1]]) m = make_matrix(data, 4, [[], ["b"], ["a"], ["b", "a"]], column_names=["Intercept", "b[T.b2]", "a[T.a2]", "b[T.b2]:a[T.a2]"]) assert np.allclose(m, [[1, 0, 0, 0], [1, 1, 0, 0], [1, 0, 1, 0], [1, 1, 1, 1]]) m = make_matrix(data, 4, [["a"], ["x1"], ["a", "x1"]], column_names=["a[a1]", "a[a2]", "x1", "a[T.a2]:x1"]) assert np.allclose(m, [[1, 0, x1[0], 0], [1, 0, x1[1], 0], [0, 1, x1[2], x1[2]], [0, 1, x1[3], x1[3]]]) m = make_matrix(data, 3, [["x1"], ["x2"], ["x2", "x1"]], column_names=["x1", "x2", "x2:x1"]) assert np.allclose(m, np.column_stack((x1, x2, x1 * x2))) def test_R_bugs(): data = balanced(a=2, b=2, c=2) data["x"] = np.linspace(0, 1, len(data["a"])) # For "1 + a:b", R produces a design matrix with too many columns (5 # instead of 4), because it can't tell that there is a redundancy between # the two terms. make_matrix(data, 4, [[], ["a", "b"]]) # For "0 + a:x + a:b", R produces a design matrix with too few columns (4 # instead of 6), because it thinks that there is a redundancy which # doesn't exist. make_matrix(data, 6, [["a", "x"], ["a", "b"]]) # This can be compared with "0 + a:c + a:b", where the redundancy does # exist. Confusingly, adding another categorical factor increases the # baseline dimensionality to 8, and then the redundancy reduces it to 6 # again, so the result is the same as before but for different reasons. (R # does get this one right, but we might as well test it.) make_matrix(data, 6, [["a", "c"], ["a", "b"]]) def test_redundancy_thoroughly(): # To make sure there aren't any lurking bugs analogous to the ones that R # has (see above), we check that we get the correct matrix rank for every # possible combination of 2 categorical and 2 numerical factors. data = balanced(a=2, b=2, repeat=5) data["x1"] = np.linspace(0, 1, len(data["a"])) data["x2"] = data["x1"] ** 2 def all_subsets(l): if not l: yield tuple() else: obj = l[0] for subset in all_subsets(l[1:]): yield tuple(sorted(subset)) yield tuple(sorted((obj,) + subset)) all_terms = list(all_subsets(("a", "b", "x1", "x2"))) all_termlist_templates = list(all_subsets(all_terms)) print(len(all_termlist_templates)) # eliminate some of the symmetric versions to speed things up redundant = [[("b",), ("a",)], [("x2",), ("x1",)], [("b", "x2"), ("a", "x1")], [("a", "b", "x2"), ("a", "b", "x1")], [("b", "x1", "x2"), ("a", "x1", "x2")]] count = 0 import time start = time.time() for termlist_template in all_termlist_templates: termlist_set = set(termlist_template) for dispreferred, preferred in redundant: if dispreferred in termlist_set and preferred not in termlist_set: break else: expanded_terms = set() for term_template in termlist_template: numeric = tuple([t for t in term_template if t.startswith("x")]) rest = [t for t in term_template if not t.startswith("x")] for subset_rest in all_subsets(rest): expanded_terms.add(frozenset(subset_rest + numeric)) # Because our categorical variables have 2 levels, each expanded # term corresponds to 1 unique dimension of variation expected_rank = len(expanded_terms) if termlist_template in [(), ((),)]: # No data dependence, should fail pytest.raises(PatsyError, make_matrix, data, expected_rank, termlist_template) else: make_matrix(data, expected_rank, termlist_template) count += 1 if count % 100 == 0: print("Completed:", count) print("Took %0.2f seconds" % (time.time() - start,)) test_redundancy_thoroughly.slow = 1 def test_data_types(): basic_dict = {"a": ["a1", "a2", "a1", "a2"], "x": [1, 2, 3, 4]} # On Python 2, this is identical to basic_dict: basic_dict_bytes = dict(basic_dict) basic_dict_bytes["a"] = [s.encode("ascii") for s in basic_dict_bytes["a"]] # On Python 3, this is identical to basic_dict: basic_dict_unicode = {"a": ["a1", "a2", "a1", "a2"], "x": [1, 2, 3, 4]} basic_dict_unicode = dict(basic_dict) basic_dict_unicode["a"] = [six.text_type(s) for s in basic_dict_unicode["a"]] structured_array_bytes = np.array(list(zip(basic_dict["a"], basic_dict["x"])), dtype=[("a", "S2"), ("x", int)]) structured_array_unicode = np.array(list(zip(basic_dict["a"], basic_dict["x"])), dtype=[("a", "U2"), ("x", int)]) recarray_bytes = structured_array_bytes.view(np.recarray) recarray_unicode = structured_array_unicode.view(np.recarray) datas = [basic_dict, structured_array_bytes, structured_array_unicode, recarray_bytes, recarray_unicode] if have_pandas: df_bytes = pandas.DataFrame(basic_dict_bytes) datas.append(df_bytes) df_unicode = pandas.DataFrame(basic_dict_unicode) datas.append(df_unicode) for data in datas: m = make_matrix(data, 4, [["a"], ["a", "x"]], column_names=["a[a1]", "a[a2]", "a[a1]:x", "a[a2]:x"]) assert np.allclose(m, [[1, 0, 1, 0], [0, 1, 0, 2], [1, 0, 3, 0], [0, 1, 0, 4]]) def test_build_design_matrices_dtype(): data = {"x": [1, 2, 3]} def iter_maker(): yield data builder = design_matrix_builders([make_termlist("x")], iter_maker, 0)[0] mat = build_design_matrices([builder], data)[0] assert mat.dtype == np.dtype(np.float64) mat = build_design_matrices([builder], data, dtype=np.float32)[0] assert mat.dtype == np.dtype(np.float32) if hasattr(np, "float128"): mat = build_design_matrices([builder], data, dtype=np.float128)[0] assert mat.dtype == np.dtype(np.float128) def test_return_type(): data = {"x": [1, 2, 3]} def iter_maker(): yield data builder = design_matrix_builders([make_termlist("x")], iter_maker, 0)[0] # Check explicitly passing return_type="matrix" works mat = build_design_matrices([builder], data, return_type="matrix")[0] assert isinstance(mat, DesignMatrix) # Check that nonsense is detected pytest.raises(PatsyError, build_design_matrices, [builder], data, return_type="asdfsadf") def test_NA_action(): initial_data = {"x": [1, 2, 3], "c": ["c1", "c2", "c1"]} def iter_maker(): yield initial_data builder = design_matrix_builders([make_termlist("x", "c")], iter_maker, 0)[0] # By default drops rows containing either NaN or None mat = build_design_matrices([builder], {"x": [10.0, np.nan, 20.0], "c": np.asarray(["c1", "c2", None], dtype=object)})[0] assert mat.shape == (1, 3) assert np.array_equal(mat, [[1.0, 0.0, 10.0]]) # NA_action="a string" also accepted: mat = build_design_matrices([builder], {"x": [10.0, np.nan, 20.0], "c": np.asarray(["c1", "c2", None], dtype=object)}, NA_action="drop")[0] assert mat.shape == (1, 3) assert np.array_equal(mat, [[1.0, 0.0, 10.0]]) # And objects from patsy.missing import NAAction # allows NaN's to pass through NA_action = NAAction(NA_types=[]) mat = build_design_matrices([builder], {"x": [10.0, np.nan], "c": np.asarray(["c1", "c2"], dtype=object)}, NA_action=NA_action)[0] assert mat.shape == (2, 3) # According to this (and only this) function, NaN == NaN. np.testing.assert_array_equal(mat, [[1.0, 0.0, 10.0], [0.0, 1.0, np.nan]]) # NA_action="raise" pytest.raises(PatsyError, build_design_matrices, [builder], {"x": [10.0, np.nan, 20.0], "c": np.asarray(["c1", "c2", None], dtype=object)}, NA_action="raise") def test_NA_drop_preserves_levels(): # Even if all instances of some level are dropped, we still include it in # the output matrix (as an all-zeros column) data = {"x": [1.0, np.nan, 3.0], "c": ["c1", "c2", "c3"]} def iter_maker(): yield data design_info = design_matrix_builders([make_termlist("x", "c")], iter_maker, 0)[0] assert design_info.column_names == ["c[c1]", "c[c2]", "c[c3]", "x"] mat, = build_design_matrices([design_info], data) assert mat.shape == (2, 4) assert np.array_equal(mat, [[1.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 3.0]]) def test_return_type_pandas(): if not have_pandas: return data = pandas.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6], "a": ["a1", "a2", "a1"]}, index=[10, 20, 30]) def iter_maker(): yield data int_builder, = design_matrix_builders([make_termlist([])], iter_maker, 0) (y_builder, x_builder) = design_matrix_builders([make_termlist("y"), make_termlist("x")], iter_maker, eval_env=0) (x_a_builder,) = design_matrix_builders([make_termlist("x", "a")], iter_maker, eval_env=0) (x_y_builder,) = design_matrix_builders([make_termlist("x", "y")], iter_maker, eval_env=0) # Index compatibility is always checked for pandas input, regardless of # whether we're producing pandas output pytest.raises(PatsyError, build_design_matrices, [x_a_builder], {"x": data["x"], "a": data["a"][::-1]}) pytest.raises(PatsyError, build_design_matrices, [y_builder, x_builder], {"x": data["x"], "y": data["y"][::-1]}) # And we also check consistency between data.index and value indexes # Creating a mismatch between these is a bit tricky. We want a data object # such that isinstance(data, DataFrame), but data["x"].index != # data.index. class CheatingDataFrame(pandas.DataFrame): def __getitem__(self, key): if key == "x": return pandas.DataFrame.__getitem__(self, key)[::-1] else: return pandas.DataFrame.__getitem__(self, key) pytest.raises(PatsyError, build_design_matrices, [x_builder], CheatingDataFrame(data)) # A mix of pandas input and unindexed input is fine (mat,) = build_design_matrices([x_y_builder], {"x": data["x"], "y": [40, 50, 60]}) assert np.allclose(mat, [[1, 40], [2, 50], [3, 60]]) # with return_type="dataframe", we get out DataFrames with nice indices # and nice column names and design_info y_df, x_df = build_design_matrices([y_builder, x_builder], data, return_type="dataframe") assert isinstance(y_df, pandas.DataFrame) assert isinstance(x_df, pandas.DataFrame) assert np.array_equal(y_df, [[4], [5], [6]]) assert np.array_equal(x_df, [[1], [2], [3]]) assert np.array_equal(y_df.index, [10, 20, 30]) assert np.array_equal(x_df.index, [10, 20, 30]) assert np.array_equal(y_df.columns, ["y"]) assert np.array_equal(x_df.columns, ["x"]) assert y_df.design_info.column_names == ["y"] assert x_df.design_info.column_names == ["x"] assert y_df.design_info.term_names == ["y"] assert x_df.design_info.term_names == ["x"] # Same with mix of pandas and unindexed info, even if in different # matrices y_df, x_df = build_design_matrices([y_builder, x_builder], {"y": [7, 8, 9], "x": data["x"]}, return_type="dataframe") assert isinstance(y_df, pandas.DataFrame) assert isinstance(x_df, pandas.DataFrame) assert np.array_equal(y_df, [[7], [8], [9]]) assert np.array_equal(x_df, [[1], [2], [3]]) assert np.array_equal(y_df.index, [10, 20, 30]) assert np.array_equal(x_df.index, [10, 20, 30]) assert np.array_equal(y_df.columns, ["y"]) assert np.array_equal(x_df.columns, ["x"]) assert y_df.design_info.column_names == ["y"] assert x_df.design_info.column_names == ["x"] assert y_df.design_info.term_names == ["y"] assert x_df.design_info.term_names == ["x"] # Check categorical works for carrying index too (x_a_df,) = build_design_matrices([x_a_builder], {"x": [-1, -2, -3], "a": data["a"]}, return_type="dataframe") assert isinstance(x_a_df, pandas.DataFrame) assert np.array_equal(x_a_df, [[1, 0, -1], [0, 1, -2], [1, 0, -3]]) assert np.array_equal(x_a_df.index, [10, 20, 30]) # And if we have no indexed input, then we let pandas make up an index as # per its usual rules: (x_y_df,) = build_design_matrices([x_y_builder], {"y": [7, 8, 9], "x": [10, 11, 12]}, return_type="dataframe") assert isinstance(x_y_df, pandas.DataFrame) assert np.array_equal(x_y_df, [[10, 7], [11, 8], [12, 9]]) assert np.array_equal(x_y_df.index, [0, 1, 2]) # If 'data' is a DataFrame, then that suffices, even if no factors are # available. (int_df,) = build_design_matrices([int_builder], data, return_type="dataframe") assert isinstance(int_df, pandas.DataFrame) assert np.array_equal(int_df, [[1], [1], [1]]) assert int_df.index.equals(pandas.Index([10, 20, 30])) import patsy.build had_pandas = patsy.build.have_pandas try: patsy.build.have_pandas = False # return_type="dataframe" gives a nice error if pandas is not available pytest.raises(PatsyError, build_design_matrices, [x_builder], {"x": [1, 2, 3]}, return_type="dataframe") finally: patsy.build.have_pandas = had_pandas x_df, = build_design_matrices([x_a_builder], {"x": [1.0, np.nan, 3.0], "a": np.asarray([None, "a2", "a1"], dtype=object)}, NA_action="drop", return_type="dataframe") assert x_df.index.equals(pandas.Index([2])) def test_data_mismatch(): test_cases_twoway = [ # Data type mismatch ([1, 2, 3], [True, False, True]), (C(["a", "b", "c"], levels=["c", "b", "a"]), C(["a", "b", "c"], levels=["a", "b", "c"])), # column number mismatches ([[1], [2], [3]], [[1, 1], [2, 2], [3, 3]]), ([[1, 1, 1], [2, 2, 2], [3, 3, 3]], [[1, 1], [2, 2], [3, 3]]), ] test_cases_oneway = [ ([1, 2, 3], ["a", "b", "c"]), ([1, 2, 3], C(["a", "b", "c"])), ([True, False, True], C(["a", "b", "c"])), ([True, False, True], ["a", "b", "c"]), ] setup_predict_only = [ # This is not an error if both are fed in during make_builders, but it # is an error to pass one to make_builders and the other to # make_matrices. (["a", "b", "c"], ["a", "b", "d"]), ] termlist = make_termlist(["x"]) def t_incremental(data1, data2): def iter_maker(): yield {"x": data1} yield {"x": data2} try: builders = design_matrix_builders([termlist], iter_maker, 0) build_design_matrices(builders, {"x": data1}) build_design_matrices(builders, {"x": data2}) except PatsyError: pass else: raise AssertionError def t_setup_predict(data1, data2): def iter_maker(): yield {"x": data1} builders = design_matrix_builders([termlist], iter_maker, 0) pytest.raises(PatsyError, build_design_matrices, builders, {"x": data2}) for (a, b) in test_cases_twoway: t_incremental(a, b) t_incremental(b, a) t_setup_predict(a, b) t_setup_predict(b, a) for (a, b) in test_cases_oneway: t_incremental(a, b) t_setup_predict(a, b) for (a, b) in setup_predict_only: t_setup_predict(a, b) t_setup_predict(b, a) pytest.raises(PatsyError, make_matrix, {"x": [1, 2, 3], "y": [1, 2, 3, 4]}, 2, [["x"], ["y"]]) def test_data_independent_builder(): data = {"x": [1, 2, 3]} def iter_maker(): yield data # Trying to build a matrix that doesn't depend on the data at all is an # error, if: # - the index argument is not given # - the data is not a DataFrame # - there are no other matrices null_builder = design_matrix_builders([make_termlist()], iter_maker, 0)[0] pytest.raises(PatsyError, build_design_matrices, [null_builder], data) intercept_builder = design_matrix_builders([make_termlist([])], iter_maker, eval_env=0)[0] pytest.raises(PatsyError, build_design_matrices, [intercept_builder], data) pytest.raises(PatsyError, build_design_matrices, [null_builder, intercept_builder], data) # If data is a DataFrame, it sets the number of rows. if have_pandas: int_m, null_m = build_design_matrices([intercept_builder, null_builder], pandas.DataFrame(data)) assert np.allclose(int_m, [[1], [1], [1]]) assert null_m.shape == (3, 0) # If there are other matrices that do depend on the data, we make the # data-independent matrices have the same number of rows. x_termlist = make_termlist(["x"]) builders = design_matrix_builders([x_termlist, make_termlist()], iter_maker, eval_env=0) x_m, null_m = build_design_matrices(builders, data) assert np.allclose(x_m, [[1], [2], [3]]) assert null_m.shape == (3, 0) builders = design_matrix_builders([x_termlist, make_termlist([])], iter_maker, eval_env=0) x_m, null_m = build_design_matrices(builders, data) x_m, intercept_m = build_design_matrices(builders, data) assert np.allclose(x_m, [[1], [2], [3]]) assert np.allclose(intercept_m, [[1], [1], [1]]) def test_same_factor_in_two_matrices(): data = {"x": [1, 2, 3], "a": ["a1", "a2", "a1"]} def iter_maker(): yield data t1 = make_termlist(["x"]) t2 = make_termlist(["x", "a"]) builders = design_matrix_builders([t1, t2], iter_maker, eval_env=0) m1, m2 = build_design_matrices(builders, data) check_design_matrix(m1, 1, t1, column_names=["x"]) assert np.allclose(m1, [[1], [2], [3]]) check_design_matrix(m2, 2, t2, column_names=["x:a[a1]", "x:a[a2]"]) assert np.allclose(m2, [[1, 0], [0, 2], [3, 0]]) def test_eval_env_type_builder(): data = {"x": [1, 2, 3]} def iter_maker(): yield data pytest.raises(TypeError, design_matrix_builders, [make_termlist("x")], iter_maker, "foo") def test_categorical(): data_strings = {"a": ["a1", "a2", "a1"]} data_categ = {"a": C(["a2", "a1", "a2"])} datas = [data_strings, data_categ] if have_pandas_categorical: data_pandas = {"a": pandas.Categorical(["a1", "a2", "a2"])} datas.append(data_pandas) def t(data1, data2): def iter_maker(): yield data1 builders = design_matrix_builders([make_termlist(["a"])], iter_maker, eval_env=0) build_design_matrices(builders, data2) for data1 in datas: for data2 in datas: t(data1, data2) def test_contrast(): from patsy.contrasts import ContrastMatrix, Sum values = ["a1", "a3", "a1", "a2"] # No intercept in model, full-rank coding of 'a' m = make_matrix({"a": C(values)}, 3, [["a"]], column_names=["a[a1]", "a[a2]", "a[a3]"]) assert np.allclose(m, [[1, 0, 0], [0, 0, 1], [1, 0, 0], [0, 1, 0]]) for s in (Sum, Sum()): m = make_matrix({"a": C(values, s)}, 3, [["a"]], column_names=["a[mean]", "a[S.a1]", "a[S.a2]"]) # Output from R assert np.allclose(m, [[1, 1, 0], [1,-1, -1], [1, 1, 0], [1, 0, 1]]) m = make_matrix({"a": C(values, Sum(omit=0))}, 3, [["a"]], column_names=["a[mean]", "a[S.a2]", "a[S.a3]"]) # Output from R assert np.allclose(m, [[1, -1, -1], [1, 0, 1], [1, -1, -1], [1, 1, 0]]) # Intercept in model, non-full-rank coding of 'a' m = make_matrix({"a": C(values)}, 3, [[], ["a"]], column_names=["Intercept", "a[T.a2]", "a[T.a3]"]) assert np.allclose(m, [[1, 0, 0], [1, 0, 1], [1, 0, 0], [1, 1, 0]]) for s in (Sum, Sum()): m = make_matrix({"a": C(values, s)}, 3, [[], ["a"]], column_names=["Intercept", "a[S.a1]", "a[S.a2]"]) # Output from R assert np.allclose(m, [[1, 1, 0], [1,-1, -1], [1, 1, 0], [1, 0, 1]]) m = make_matrix({"a": C(values, Sum(omit=0))}, 3, [[], ["a"]], column_names=["Intercept", "a[S.a2]", "a[S.a3]"]) # Output from R assert np.allclose(m, [[1, -1, -1], [1, 0, 1], [1, -1, -1], [1, 1, 0]]) # Weird ad hoc less-than-full-rank coding of 'a' m = make_matrix({"a": C(values, [[7, 12], [2, 13], [8, -1]])}, 2, [["a"]], column_names=["a[custom0]", "a[custom1]"]) assert np.allclose(m, [[7, 12], [8, -1], [7, 12], [2, 13]]) m = make_matrix({"a": C(values, ContrastMatrix([[7, 12], [2, 13], [8, -1]], ["[foo]", "[bar]"]))}, 2, [["a"]], column_names=["a[foo]", "a[bar]"]) assert np.allclose(m, [[7, 12], [8, -1], [7, 12], [2, 13]]) def test_DesignInfo_subset(): # For each combination of: # formula, term names, term objects, mixed term name and term objects # check that results match subset of full build # and that removed variables don't hurt all_data = {"x": [1, 2], "y": [[3.1, 3.2], [4.1, 4.2]], "z": [5, 6]} all_terms = make_termlist("x", "y", "z") def iter_maker(): yield all_data all_builder = design_matrix_builders([all_terms], iter_maker, 0)[0] full_matrix = build_design_matrices([all_builder], all_data)[0] def t(which_terms, variables, columns): sub_design_info = all_builder.subset(which_terms) sub_data = {} for variable in variables: sub_data[variable] = all_data[variable] sub_matrix = build_design_matrices([sub_design_info], sub_data)[0] sub_full_matrix = full_matrix[:, columns] if not isinstance(which_terms, six.string_types): assert len(which_terms) == len(sub_design_info.terms) assert np.array_equal(sub_matrix, sub_full_matrix) t("~ 0 + x + y + z", ["x", "y", "z"], slice(None)) t(["x", "y", "z"], ["x", "y", "z"], slice(None)) # Compatibility: six.PY2 wasn't added until 1.4.0, but six.PY3 exists in # all versions. if not six.PY3: t([unicode("x"), unicode("y"), unicode("z")], ["x", "y", "z"], slice(None)) t(all_terms, ["x", "y", "z"], slice(None)) t([all_terms[0], "y", all_terms[2]], ["x", "y", "z"], slice(None)) t("~ 0 + x + z", ["x", "z"], [0, 3]) t(["x", "z"], ["x", "z"], [0, 3]) # Compatibility: six.PY2 wasn't added until 1.4.0, but six.PY3 exists in # all versions. if not six.PY3: t([unicode("x"), unicode("z")], ["x", "z"], [0, 3]) t([all_terms[0], all_terms[2]], ["x", "z"], [0, 3]) t([all_terms[0], "z"], ["x", "z"], [0, 3]) t("~ 0 + z + x", ["x", "z"], [3, 0]) t(["z", "x"], ["x", "z"], [3, 0]) t([six.text_type("z"), six.text_type("x")], ["x", "z"], [3, 0]) t([all_terms[2], all_terms[0]], ["x", "z"], [3, 0]) t([all_terms[2], "x"], ["x", "z"], [3, 0]) t("~ 0 + y", ["y"], [1, 2]) t(["y"], ["y"], [1, 2]) t([six.text_type("y")], ["y"], [1, 2]) t([all_terms[1]], ["y"], [1, 2]) # Formula can't have a LHS pytest.raises(PatsyError, all_builder.subset, "a ~ a") # Term must exist pytest.raises(KeyError, all_builder.subset, "~ asdf") pytest.raises(KeyError, all_builder.subset, ["asdf"]) pytest.raises(KeyError, all_builder.subset, [Term(["asdf"])]) # Also check for a minimal DesignInfo (column names only) min_di = DesignInfo(["a", "b", "c"]) min_di_subset = min_di.subset(["c", "a"]) assert min_di_subset.column_names == ["c", "a"] assert min_di_subset.terms is None