246 lines
8.9 KiB
Python
246 lines
8.9 KiB
Python
# This file is part of Patsy
|
|
# Copyright (C) 2012 Nathaniel Smith <njs@pobox.com>
|
|
# See file LICENSE.txt for license information.
|
|
|
|
# Miscellaneous utilities that are useful to users (as compared to
|
|
# patsy.util, which is misc. utilities useful for implementing patsy).
|
|
|
|
# These are made available in the patsy.* namespace
|
|
__all__ = ["balanced", "demo_data", "LookupFactor"]
|
|
|
|
import itertools
|
|
import numpy as np
|
|
from patsy import PatsyError
|
|
from patsy.categorical import C
|
|
from patsy.util import no_pickling, assert_no_pickling
|
|
|
|
def balanced(**kwargs):
    """balanced(factor_name=num_levels, [factor_name=num_levels, ..., repeat=1])

    Create simple balanced factorial designs for testing.

    Given some factor names and the number of desired levels for each,
    generates a balanced factorial design in the form of a data
    dictionary. For example:

    .. ipython::

       In [1]: balanced(a=2, b=3)
       Out[1]:
       {'a': ['a1', 'a1', 'a1', 'a2', 'a2', 'a2'],
        'b': ['b1', 'b2', 'b3', 'b1', 'b2', 'b3']}

    By default it produces exactly one instance of each combination of levels,
    but if you want multiple replicates this can be accomplished via the
    `repeat` argument:

    .. ipython::

       In [2]: balanced(a=2, b=2, repeat=2)
       Out[2]:
       {'a': ['a1', 'a1', 'a2', 'a2', 'a1', 'a1', 'a2', 'a2'],
        'b': ['b1', 'b2', 'b1', 'b2', 'b1', 'b2', 'b1', 'b2']}

    Factors are laid out in sorted name order, so the alphabetically last
    factor varies fastest. If any factor is given zero levels the design has
    no rows, and every factor maps to an empty list.
    """
    repeat = kwargs.pop("repeat", 1)
    names = sorted(kwargs)
    # Level labels are the factor name followed by a 1-based index.
    levels = [
        ["%s%s" % (name, i) for i in range(1, kwargs[name] + 1)]
        for name in names
    ]
    # Each row is one combination of levels, one entry per factor.
    rows = list(itertools.product(*levels))
    data = {}
    # Pull each column out by position rather than "unzipping" the rows:
    # an empty design then still yields one (empty) column per factor
    # instead of silently dropping every key.
    for col, name in enumerate(names):
        data[name] = [row[col] for row in rows] * repeat
    return data
|
|
|
|
def test_balanced():
    """Check balanced() output with the default and an explicit ``repeat``."""
    # One replicate of the 2 x 3 design: "a" varies slowest, "b" fastest.
    a_column = ["a1"] * 3 + ["a2"] * 3
    b_column = ["b1", "b2", "b3"] * 2

    once = balanced(a=2, b=3)
    assert once["a"] == a_column
    assert once["b"] == b_column

    # repeat=2 appends a second copy of the whole design.
    twice = balanced(a=2, b=3, repeat=2)
    assert twice["a"] == a_column + a_column
    assert twice["b"] == b_column + b_column
|
|
|
|
def demo_data(*names, **kwargs):
    """demo_data(*names, nlevels=2, min_rows=5)

    Create simple categorical/numerical demo data.

    Pass in a set of variable names, and this function will return a simple
    data set using those variable names.

    Names whose first letter falls in the range "a" through "n" will be made
    categorical (with `nlevels` levels). Those that start with a "p" through
    "z" are numerical. Any other name (including an empty one) is rejected
    with a :class:`PatsyError`.

    We attempt to produce a balanced design on the categorical variables,
    repeating as necessary to generate at least `min_rows` data
    points. Categorical variables are returned as a list of strings.

    Numerical data is generated by sampling from a normal distribution. A
    fixed random seed is used, so that identical calls to demo_data() will
    produce identical results. Numerical data is returned in a numpy array.

    Example:

    .. ipython::

       In [1]: patsy.demo_data("a", "b", "x", "y")
       Out[1]:
       {'a': ['a1', 'a1', 'a2', 'a2', 'a1', 'a1', 'a2', 'a2'],
        'b': ['b1', 'b2', 'b1', 'b2', 'b1', 'b2', 'b1', 'b2'],
        'x': array([ 1.76405235, 0.40015721, 0.97873798, 2.2408932 ,
                     1.86755799, -0.97727788, 0.95008842, -0.15135721]),
        'y': array([-0.10321885, 0.4105985 , 0.14404357, 1.45427351,
                     0.76103773, 0.12167502, 0.44386323, 0.33367433])}

    :raises TypeError: if any keyword other than `nlevels` or `min_rows`
      is passed.
    :raises PatsyError: if a name does not start with one of the accepted
      letters.
    """
    nlevels = kwargs.pop("nlevels", 2)
    min_rows = kwargs.pop("min_rows", 5)
    if kwargs:
        raise TypeError("unexpected keyword arguments %r" % (kwargs,))
    numerical = set()
    categorical = {}
    for name in names:
        # Slice instead of indexing so that an empty name falls through to
        # the PatsyError below rather than raising a bare IndexError. The
        # explicit truth test is required because "" is "in" every string.
        initial = name[:1]
        if initial and initial in "abcdefghijklmn":
            categorical[name] = nlevels
        elif initial and initial in "pqrstuvwxyz":
            numerical.add(name)
        else:
            raise PatsyError("bad name %r" % (name,))
    # Rows in one full replicate of the categorical design.
    balanced_design_size = np.prod(list(categorical.values()), dtype=int)
    # Smallest whole number of replicates that reaches min_rows.
    repeat = int(np.ceil(min_rows * 1.0 / balanced_design_size))
    num_rows = repeat * balanced_design_size
    data = balanced(repeat=repeat, **categorical)
    # Fixed seed, and names visited in sorted order, so repeated calls
    # produce the same numerical columns.
    r = np.random.RandomState(0)
    for name in sorted(numerical):
        data[name] = r.normal(size=num_rows)
    return data
|
|
|
|
def test_demo_data():
    """Exercise demo_data(): returned keys, row counts, and rejected input."""
    mixed = demo_data("a", "b", "x")
    assert sorted(mixed.keys()) == ["a", "b", "x"]
    assert mixed["a"] == ["a1", "a1", "a2", "a2", "a1", "a1", "a2", "a2"]
    assert mixed["b"] == ["b1", "b2", "b1", "b2", "b1", "b2", "b1", "b2"]
    x_column = mixed["x"]
    assert x_column.dtype == np.dtype(float)
    assert x_column.shape == (8,)

    # Purely numerical request: falls back to the default min_rows of 5.
    numeric_only = demo_data("x", "y")
    assert sorted(numeric_only.keys()) == ["x", "y"]
    assert len(numeric_only["x"]) == len(numeric_only["y"]) == 5

    # Row count is min_rows rounded up to a whole number of replicates.
    row_count_cases = [
        (("x",), {"min_rows": 10}, 10),
        (("a", "b", "x"), {"min_rows": 10}, 12),
        (("a", "b", "x"), {"min_rows": 10, "nlevels": 3}, 18),
    ]
    for var_names, options, expected_rows in row_count_cases:
        assert len(demo_data(*var_names, **options)["x"]) == expected_rows

    import pytest

    # A name with an unsupported first character is rejected...
    with pytest.raises(PatsyError):
        demo_data("a", "b", "__123")
    # ...and so is an unknown keyword argument.
    with pytest.raises(TypeError):
        demo_data("a", "b", asdfasdf=123)
|
|
|
|
class LookupFactor(object):
    """A minimal factor that fetches a named entry from the supplied data.

    Handy for building formulas programmatically, and as a small worked
    example of the factor protocol. For details see
    :ref:`expert-model-specification`.

    Example::

      dmatrix(ModelDesc([], [Term([LookupFactor("x")])]), {"x": [1, 2, 3]})

    :arg varname: The name of this variable; used as a lookup key in the
      passed in data dictionary/DataFrame/whatever.
    :arg force_categorical: If True, then treat this factor as
      categorical. (Equivalent to using :func:`C` in a regular formula, but
      of course you can't do that with a :class:`LookupFactor`.)
    :arg contrast: If given, the contrast to use; see :func:`C`. (Requires
      ``force_categorical=True``.)
    :arg levels: If given, the categorical levels; see :func:`C`. (Requires
      ``force_categorical=True``.)
    :arg origin: Either ``None``, or the :class:`Origin` of this factor for use
      in error reporting.

    .. versionadded:: 0.2.0
       The ``force_categorical`` and related arguments.
    """

    def __init__(self, varname, force_categorical=False, contrast=None,
                 levels=None, origin=None):
        self._varname = varname
        self._force_categorical = force_categorical
        self._contrast = contrast
        self._levels = levels
        self.origin = origin
        if force_categorical:
            return
        # contrast= and levels= are only accepted for categorical factors.
        if contrast is not None:
            raise ValueError("contrast= requires force_categorical=True")
        if levels is not None:
            raise ValueError("levels= requires force_categorical=True")

    def name(self):
        """Return the variable name this factor looks up."""
        return self._varname

    def __repr__(self):
        return "{0}({1!r})".format(self.__class__.__name__, self._varname)

    def __eq__(self, other):
        # Equal only to another LookupFactor with the same configuration;
        # origin is deliberately not part of the comparison.
        if not isinstance(other, LookupFactor):
            return False
        return (self._varname == other._varname
                and self._force_categorical == other._force_categorical
                and self._contrast == other._contrast
                and self._levels == other._levels)

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        # Hash over the same fields that __eq__ compares.
        identity = (LookupFactor, self._varname, self._force_categorical,
                    self._contrast, self._levels)
        return hash(identity)

    def memorize_passes_needed(self, state, eval_env):
        """Report that no memorization passes are required."""
        return 0

    def memorize_chunk(self, state, which_pass, data):  # pragma: no cover
        # Never expected to run: memorize_passes_needed() returns 0.
        assert False

    def memorize_finish(self, state, which_pass):  # pragma: no cover
        # Never expected to run: memorize_passes_needed() returns 0.
        assert False

    def eval(self, memorize_state, data):
        """Look up this factor's variable in ``data``.

        When ``force_categorical`` was set, the value is passed through
        :func:`C` together with the stored contrast and levels.
        """
        looked_up = data[self._varname]
        if not self._force_categorical:
            return looked_up
        return C(looked_up, contrast=self._contrast, levels=self._levels)

    __getstate__ = no_pickling
|
|
|
|
def test_LookupFactor():
    """Exercise LookupFactor: identity, lookup, categorical mode, errors."""
    factor_a = LookupFactor("a")
    same_as_a = LookupFactor("a")
    factor_b = LookupFactor("b")

    assert factor_a.name() == "a"
    # Equality and hashing are driven by the variable name.
    assert factor_a == same_as_a
    assert factor_a != factor_b
    assert hash(factor_a) == hash(same_as_a)
    assert hash(factor_a) != hash(factor_b)
    # eval() is a plain lookup when force_categorical is off.
    assert factor_a.eval({}, {"a": 1}) == 1
    assert factor_a.eval({}, {"a": 2}) == 2
    assert repr(factor_a) == "LookupFactor('a')"
    assert factor_a.origin is None
    with_origin = LookupFactor("b", origin="asdf")
    assert with_origin.origin == "asdf"

    # In categorical mode, eval() returns the result of C(...).
    categorical = LookupFactor("c", force_categorical=True,
                               contrast="CONTRAST", levels=(1, 2))
    boxed = categorical.eval({}, {"c": [1, 1, 2]})
    assert boxed.data == [1, 1, 2]
    assert boxed.contrast == "CONTRAST"
    assert boxed.levels == (1, 2)

    import pytest

    # contrast= / levels= without force_categorical=True must be rejected.
    with pytest.raises(ValueError):
        LookupFactor("nc", contrast="CONTRAST")
    with pytest.raises(ValueError):
        LookupFactor("nc", levels=(1, 2))

    assert_no_pickling(LookupFactor("a"))
|