# This file is part of Patsy
# Copyright (C) 2011-2013 Nathaniel Smith <njs@pobox.com>
# See file LICENSE.txt for license information.
# Some generic utilities.
|
|
|
|
__all__ = ["atleast_2d_column_default", "uniqueify_list",
|
|
"widest_float", "widest_complex", "wide_dtype_for", "widen",
|
|
"repr_pretty_delegate", "repr_pretty_impl",
|
|
"SortAnythingKey", "safe_scalar_isnan", "safe_isnan",
|
|
"iterable",
|
|
"have_pandas",
|
|
"have_pandas_categorical",
|
|
"have_pandas_categorical_dtype",
|
|
"pandas_Categorical_from_codes",
|
|
"pandas_Categorical_categories",
|
|
"pandas_Categorical_codes",
|
|
"safe_is_pandas_categorical_dtype",
|
|
"safe_is_pandas_categorical",
|
|
"safe_issubdtype",
|
|
"no_pickling",
|
|
"assert_no_pickling",
|
|
"safe_string_eq",
|
|
]
|
|
|
|
import sys
|
|
import numpy as np
|
|
import six
|
|
from six.moves import cStringIO as StringIO
|
|
from .compat import optional_dep_ok
|
|
|
|
# pandas is an optional dependency: record whether it is importable so the
# rest of the module can branch on `have_pandas`.
try:
    import pandas
except ImportError:
    have_pandas = False
else:
    have_pandas = True

# Pandas versions < 0.9.0 don't have Categorical
# Can drop this guard whenever we drop support for such older versions of
# pandas.
have_pandas_categorical = (have_pandas and hasattr(pandas, "Categorical"))
# Locate a "is this dtype categorical?" predicate for whatever pandas
# version is installed; None means categorical dtypes are unsupported.
if not have_pandas:
    _pandas_is_categorical_dtype = None
else:
    if hasattr(pandas, "CategoricalDtype"):  # pandas >= 0.25
        # Newer pandas deprecated/removed is_categorical_dtype, so check the
        # dtype (or the object's .dtype) with isinstance instead.
        _pandas_is_categorical_dtype = lambda x: isinstance(getattr(x, "dtype", x), pandas.CategoricalDtype)
    elif hasattr(pandas, "api"):  # pandas >= 0.19
        _pandas_is_categorical_dtype = getattr(pandas.api.types, "is_categorical_dtype", None)
    else:  # pandas <=0.18
        _pandas_is_categorical_dtype = getattr(pandas.core.common,
                                               "is_categorical_dtype", None)
have_pandas_categorical_dtype = _pandas_is_categorical_dtype is not None
|
|
|
|
|
|
# Passes through Series and DataFrames, call np.asarray() on everything else
|
|
def asarray_or_pandas(a, copy=False, dtype=None, subok=False):
    """Convert `a` to an ndarray, but pass pandas Series/DataFrame through.

    Pandas objects are re-constructed through their own constructor so that
    `copy` and `dtype` are honored while index information is preserved.

    :arg copy: if True, always copy; if False, copy only when required
        (e.g. for a dtype conversion).
    :arg dtype: target dtype, or None to keep the existing dtype.
    :arg subok: if True, ndarray subclasses are passed through unchanged
        (ignored for pandas objects).
    """
    if have_pandas:
        if isinstance(a, (pandas.Series, pandas.DataFrame)):
            # The .name attribute on Series is discarded when passing through
            # the constructor:
            #   https://github.com/pydata/pandas/issues/1578
            extra_args = {}
            if hasattr(a, "name"):
                extra_args["name"] = a.name
            return a.__class__(a, copy=copy, dtype=dtype, **extra_args)
    # NumPy 2.0 changed the meaning of np.array(..., copy=False) from
    # "copy only if needed" to "never copy, raise if a copy is required".
    # Spell out the old copy-if-needed semantics explicitly so this works
    # on both NumPy 1.x and 2.x.
    if copy:
        return np.array(a, copy=True, dtype=dtype, subok=subok)
    elif subok:
        return np.asanyarray(a, dtype=dtype)
    else:
        return np.asarray(a, dtype=dtype)
|
|
|
|
|
|
def test_asarray_or_pandas():
    """Tests for asarray_or_pandas: plain arrays, copy/view semantics, dtype
    conversion, and pandas Series/DataFrame pass-through."""
    import warnings
    assert type(asarray_or_pandas([1, 2, 3])) is np.ndarray
    # np.matrix is deprecated; silence its PendingDeprecationWarning while
    # checking the subok flag.
    with warnings.catch_warnings() as w:
        warnings.filterwarnings('ignore', 'the matrix subclass',
                                PendingDeprecationWarning)
        assert type(asarray_or_pandas(np.matrix([[1, 2, 3]]))) is np.ndarray
        assert type(asarray_or_pandas(
            np.matrix([[1, 2, 3]]), subok=True)) is np.matrix
    # catch_warnings() without record=True yields None.
    assert w is None
    a = np.array([1, 2, 3])
    # copy=False (default) should pass the array straight through...
    assert asarray_or_pandas(a) is a
    # ...while copy=True must return an independent copy.
    a_copy = asarray_or_pandas(a, copy=True)
    assert np.array_equal(a, a_copy)
    a_copy[0] = 100
    assert not np.array_equal(a, a_copy)
    assert np.allclose(asarray_or_pandas([1, 2, 3], dtype=float),
                       [1.0, 2.0, 3.0])
    assert asarray_or_pandas([1, 2, 3], dtype=float).dtype == np.dtype(float)
    # Same dtype: should give a view, so writes propagate back.
    a_view = asarray_or_pandas(a, dtype=a.dtype)
    a_view[0] = 99
    assert a[0] == 99
    global have_pandas
    if have_pandas:
        # Series: name/index are preserved; view vs copy semantics as above.
        s = pandas.Series([1, 2, 3], name="A", index=[10, 20, 30])
        s_view1 = asarray_or_pandas(s)
        assert s_view1.name == "A"
        assert np.array_equal(s_view1.index, [10, 20, 30])
        s_view1[10] = 101
        assert s[10] == 101
        s_copy = asarray_or_pandas(s, copy=True)
        assert s_copy.name == "A"
        assert np.array_equal(s_copy.index, [10, 20, 30])
        assert np.array_equal(s_copy, s)
        s_copy[10] = 100
        assert not np.array_equal(s_copy, s)
        assert asarray_or_pandas(s, dtype=float).dtype == np.dtype(float)
        s_view2 = asarray_or_pandas(s, dtype=s.dtype)
        assert s_view2.name == "A"
        assert np.array_equal(s_view2.index, [10, 20, 30])
        s_view2[10] = 99
        assert s[10] == 99

        # DataFrame: columns/index preserved; view vs copy semantics.
        df = pandas.DataFrame([[1, 2, 3]],
                              columns=["A", "B", "C"],
                              index=[10])
        df_view1 = asarray_or_pandas(df)
        df_view1.loc[10, "A"] = 101
        assert np.array_equal(df_view1.columns, ["A", "B", "C"])
        assert np.array_equal(df_view1.index, [10])
        assert df.loc[10, "A"] == 101
        df_copy = asarray_or_pandas(df, copy=True)
        assert np.array_equal(df_copy, df)
        assert np.array_equal(df_copy.columns, ["A", "B", "C"])
        assert np.array_equal(df_copy.index, [10])
        df_copy.loc[10, "A"] = 100
        assert not np.array_equal(df_copy, df)
        df_converted = asarray_or_pandas(df, dtype=float)
        assert df_converted["A"].dtype == np.dtype(float)
        assert np.allclose(df_converted, df)
        assert np.array_equal(df_converted.columns, ["A", "B", "C"])
        assert np.array_equal(df_converted.index, [10])
        df_view2 = asarray_or_pandas(df, dtype=df["A"].dtype)
        assert np.array_equal(df_view2.columns, ["A", "B", "C"])
        assert np.array_equal(df_view2.index, [10])
        # This actually makes a copy, not a view, because of a pandas bug:
        #   https://github.com/pydata/pandas/issues/1572
        assert np.array_equal(df, df_view2)
        # df_view2[0][0] = 99
        # assert df[0][0] == 99

        # With have_pandas forced off, pandas objects must degrade to
        # plain ndarrays.
        had_pandas = have_pandas
        try:
            have_pandas = False
            assert (type(asarray_or_pandas(pandas.Series([1, 2, 3])))
                    is np.ndarray)
            assert (type(asarray_or_pandas(pandas.DataFrame([[1, 2, 3]])))
                    is np.ndarray)
        finally:
            have_pandas = had_pandas
|
|
|
|
# Like np.atleast_2d, but this converts lower-dimensional arrays into columns,
|
|
# instead of rows. It also converts ndarray subclasses into basic ndarrays,
|
|
# which makes it easier to guarantee correctness. However, there are many
|
|
# places in the code where we want to preserve pandas indexing information if
|
|
# present, so there is also an option
|
|
def atleast_2d_column_default(a, preserve_pandas=False):
    """Like np.atleast_2d, but promotes 1-d input to a *column* (n, 1)
    rather than a row.

    ndarray subclasses are downgraded to plain ndarrays, which makes
    correctness easier to guarantee. With preserve_pandas=True, pandas
    objects keep their indexing information instead: a Series becomes a
    one-column DataFrame, and a DataFrame is returned unchanged.
    """
    if preserve_pandas and have_pandas:
        if isinstance(a, pandas.Series):
            return pandas.DataFrame(a)
        elif isinstance(a, pandas.DataFrame):
            return a
        # anything else falls through to the plain-ndarray path
    result = np.atleast_1d(np.asarray(a))
    if result.ndim <= 1:
        # reshape 0-d/1-d data into a single column
        result = result.reshape((-1, 1))
    assert result.ndim >= 2
    return result
|
|
|
|
|
|
def test_atleast_2d_column_default():
    """Tests for atleast_2d_column_default: column-major promotion, subclass
    stripping, and the preserve_pandas flag."""
    import warnings
    assert np.all(atleast_2d_column_default([1, 2, 3]) == [[1], [2], [3]])

    assert atleast_2d_column_default(1).shape == (1, 1)
    assert atleast_2d_column_default([1]).shape == (1, 1)
    assert atleast_2d_column_default([[1]]).shape == (1, 1)
    assert atleast_2d_column_default([[[1]]]).shape == (1, 1, 1)

    assert atleast_2d_column_default([1, 2, 3]).shape == (3, 1)
    assert atleast_2d_column_default([[1], [2], [3]]).shape == (3, 1)

    # np.matrix is deprecated; the subclass must be stripped regardless.
    with warnings.catch_warnings() as w:
        warnings.filterwarnings('ignore', 'the matrix subclass',
                                PendingDeprecationWarning)
        assert type(atleast_2d_column_default(np.matrix(1))) == np.ndarray
    # catch_warnings() without record=True yields None.
    assert w is None

    global have_pandas
    if have_pandas:
        # Without preserve_pandas, pandas objects degrade to ndarrays...
        assert (type(atleast_2d_column_default(pandas.Series([1, 2])))
                == np.ndarray)
        assert (type(atleast_2d_column_default(pandas.DataFrame([[1], [2]])))
                == np.ndarray)
        # ...with preserve_pandas, they keep their pandas-ness.
        assert (type(atleast_2d_column_default(pandas.Series([1, 2]),
                                               preserve_pandas=True))
                == pandas.DataFrame)
        assert (type(atleast_2d_column_default(pandas.DataFrame([[1], [2]]),
                                               preserve_pandas=True))
                == pandas.DataFrame)
        s = pandas.Series([10, 11, 12], name="hi", index=["a", "b", "c"])
        df = atleast_2d_column_default(s, preserve_pandas=True)
        assert isinstance(df, pandas.DataFrame)
        assert np.all(df.columns == ["hi"])
        assert np.all(df.index == ["a", "b", "c"])
    # preserve_pandas must not affect non-pandas input.
    with warnings.catch_warnings() as w:
        warnings.filterwarnings('ignore', 'the matrix subclass',
                                PendingDeprecationWarning)
        assert (type(atleast_2d_column_default(np.matrix(1),
                                               preserve_pandas=True))
                == np.ndarray)
        assert w is None
    assert (type(atleast_2d_column_default([1, 2, 3], preserve_pandas=True))
            == np.ndarray)

    if have_pandas:
        # With have_pandas forced off, preserve_pandas is a no-op.
        had_pandas = have_pandas
        try:
            have_pandas = False
            assert (type(atleast_2d_column_default(pandas.Series([1, 2]),
                                                   preserve_pandas=True))
                    == np.ndarray)
            assert (type(atleast_2d_column_default(pandas.DataFrame([[1], [2]]),
                                                   preserve_pandas=True))
                    == np.ndarray)
        finally:
            have_pandas = had_pandas
|
|
|
|
# A version of .reshape() that knows how to down-convert a 1-column
|
|
# pandas.DataFrame into a pandas.Series. Useful for code that wants to be
|
|
# agnostic between 1d and 2d data, with the pattern:
|
|
# new_a = atleast_2d_column_default(a, preserve_pandas=True)
|
|
# # do stuff to new_a, which can assume it's always 2 dimensional
|
|
# return pandas_friendly_reshape(new_a, a.shape)
|
|
def pandas_friendly_reshape(a, new_shape):
    """A .reshape() that can also collapse a 1-column DataFrame to a Series.

    Intended for code that wants to be agnostic between 1-d and 2-d data:
        new_a = atleast_2d_column_default(a, preserve_pandas=True)
        # ... operate on new_a, which is always 2-dimensional ...
        return pandas_friendly_reshape(new_a, a.shape)
    """
    if not have_pandas or not isinstance(a, pandas.DataFrame):
        # Not a DataFrame: delegate to the object's own reshape.
        return a.reshape(new_shape)
    # For DataFrames only two reshapes make sense: the no-op, and
    # single-column DataFrame -> Series.
    if new_shape == a.shape:
        return a
    if len(new_shape) == 1 and a.shape[1] == 1:
        if new_shape[0] != a.shape[0]:
            raise ValueError("arrays have incompatible sizes")
        return a[a.columns[0]]
    raise ValueError("cannot reshape a DataFrame with shape %s to shape %s"
                     % (a.shape, new_shape))
|
|
|
|
def test_pandas_friendly_reshape():
    """Tests for pandas_friendly_reshape: plain ndarray delegation, DataFrame
    no-op/squeeze cases, and error cases."""
    import pytest
    global have_pandas
    # Plain ndarrays just get a normal reshape.
    assert np.allclose(pandas_friendly_reshape(np.arange(10).reshape(5, 2),
                                               (2, 5)),
                       np.arange(10).reshape(2, 5))
    if have_pandas:
        df = pandas.DataFrame({"x": [1, 2, 3]}, index=["a", "b", "c"])
        # Same-shape reshape is a no-op that preserves index and columns.
        noop = pandas_friendly_reshape(df, (3, 1))
        assert isinstance(noop, pandas.DataFrame)
        assert np.array_equal(noop.index, ["a", "b", "c"])
        assert np.array_equal(noop.columns, ["x"])
        # (n, 1) -> (n,) collapses to a Series named after the column.
        squozen = pandas_friendly_reshape(df, (3,))
        assert isinstance(squozen, pandas.Series)
        assert np.array_equal(squozen.index, ["a", "b", "c"])
        assert squozen.name == "x"

        # Any other target shape is rejected.
        pytest.raises(ValueError, pandas_friendly_reshape, df, (4,))
        pytest.raises(ValueError, pandas_friendly_reshape, df, (1, 3))
        pytest.raises(ValueError, pandas_friendly_reshape, df, (3, 3))

        had_pandas = have_pandas
        try:
            have_pandas = False
            # this will try to do a reshape directly, and DataFrames *have* no
            # reshape method
            pytest.raises(AttributeError, pandas_friendly_reshape, df, (3,))
        finally:
            have_pandas = had_pandas
|
|
|
|
def uniqueify_list(seq):
    """Return seq's items as a list with duplicates dropped, keeping the
    first occurrence of each item (items must be hashable)."""
    seen = set()
    unique = []
    for item in seq:
        if item in seen:
            continue
        seen.add(item)
        unique.append(item)
    return unique
|
|
|
|
def test_to_uniqueify_list():
    """uniqueify_list drops duplicates while keeping first-seen order."""
    cases = [
        ([1, 2, 3], [1, 2, 3]),
        ([1, 3, 3, 2, 3, 1], [1, 3, 2]),
        ([3, 2, 1, 4, 1, 2, 3], [3, 2, 1, 4]),
    ]
    for raw, expected in cases:
        assert uniqueify_list(raw) == expected
|
|
|
|
# Pick the widest float/complex dtypes this numpy build provides. Extended
# precision is platform-dependent: x86-64 typically has float128/complex256,
# 32-bit x86 has float96/complex192, and some platforms only have
# float64/complex128.
for float_type in ("float128", "float96", "float64"):
    if hasattr(np, float_type):
        widest_float = getattr(np, float_type)
        break
else:  # pragma: no cover
    assert False
# NB: the complex dtype paired with float96 is complex192 (2 x 96 bits);
# "complex196" was a typo and never exists, which made float96 platforms
# silently fall back to complex128.
for complex_type in ("complex256", "complex192", "complex128"):
    if hasattr(np, complex_type):
        widest_complex = getattr(np, complex_type)
        break
else:  # pragma: no cover
    assert False
|
|
|
|
def wide_dtype_for(arr):
    """Return the widest dtype in the same numeric family as arr's dtype.

    Integer and float data widen to `widest_float`; complex data widens to
    `widest_complex`. Raises ValueError for non-numeric dtypes.
    """
    dt = np.asarray(arr).dtype
    if safe_issubdtype(dt, np.integer) or safe_issubdtype(dt, np.floating):
        return widest_float
    if safe_issubdtype(dt, np.complexfloating):
        return widest_complex
    raise ValueError("cannot widen a non-numeric type %r" % (dt,))
|
|
|
|
def widen(arr):
    """Return `arr` as an ndarray widened to its numeric family's widest
    available dtype (see wide_dtype_for)."""
    wide = wide_dtype_for(arr)
    return np.asarray(arr, dtype=wide)
|
|
|
|
def test_wide_dtype_for_and_widen():
    """widen() maps int/float input to widest_float, complex input to
    widest_complex, and rejects non-numeric data."""
    assert np.allclose(widen([1, 2, 3]), [1, 2, 3])
    assert widen([1, 2, 3]).dtype == widest_float
    assert np.allclose(widen([1.0, 2.0, 3.0]), [1, 2, 3])
    assert widen([1.0, 2.0, 3.0]).dtype == widest_float
    assert np.allclose(widen([1+0j, 2, 3]), [1, 2, 3])
    assert widen([1+0j, 2, 3]).dtype == widest_complex
    import pytest
    pytest.raises(ValueError, widen, ["hi"])
|
|
|
|
class PushbackAdapter(object):
    """Iterator wrapper supporting push-back of items onto the stream.

    Pushed-back items are yielded LIFO (most recently pushed first) before
    the underlying iterator is consulted again. Also provides non-consuming
    peek() and has_more() queries.
    """

    def __init__(self, it):
        self._it = it
        self._pushed = []

    def __iter__(self):
        return self

    def push_back(self, obj):
        # Stack discipline: the most recently pushed item comes out first.
        self._pushed.append(obj)

    def next(self):
        if self._pushed:
            return self._pushed.pop()
        # May raise StopIteration once the underlying iterator is exhausted.
        # (next() is what six.advance_iterator resolves to.)
        return next(self._it)
    __next__ = next

    def peek(self):
        """Return the next item without consuming it; ValueError if empty."""
        try:
            obj = next(self)
        except StopIteration:
            raise ValueError("no more data")
        self.push_back(obj)
        return obj

    def has_more(self):
        """Return True iff at least one more item is available."""
        try:
            self.peek()
        except ValueError:
            return False
        return True
|
|
|
|
def test_PushbackAdapter():
    """Exercise interleaved iteration, push_back, peek, and has_more."""
    it = PushbackAdapter(iter([1, 2, 3, 4]))
    assert it.has_more()
    assert next(it) == 1
    # A pushed-back item is returned before the underlying stream resumes.
    it.push_back(0)
    assert next(it) == 0
    assert next(it) == 2
    # peek() does not consume.
    assert it.peek() == 3
    # Multiple pushes come back LIFO.
    it.push_back(10)
    assert it.peek() == 10
    it.push_back(20)
    assert it.peek() == 20
    assert it.has_more()
    assert list(it) == [20, 10, 3, 4]
    assert not it.has_more()
|
|
|
|
# The IPython pretty-printer gives very nice output that is difficult to get
|
|
# otherwise, e.g., look how much more readable this is than if it were all
|
|
# smooshed onto one line:
|
|
#
|
|
# ModelDesc(input_code='y ~ x*asdf',
|
|
# lhs_terms=[Term([EvalFactor('y')])],
|
|
# rhs_terms=[Term([]),
|
|
# Term([EvalFactor('x')]),
|
|
# Term([EvalFactor('asdf')]),
|
|
# Term([EvalFactor('x'), EvalFactor('asdf')])],
|
|
# )
|
|
#
|
|
# But, we don't want to assume it always exists; nor do we want to be
|
|
# re-writing every repr function twice, once for regular repr and once for
|
|
# the pretty printer. So, here's an ugly fallback implementation that can be
|
|
# used unconditionally to implement __repr__ in terms of _pretty_repr_.
|
|
#
|
|
# Pretty printer docs:
|
|
# http://ipython.org/ipython-doc/dev/api/generated/IPython.lib.pretty.html
|
|
|
|
class _MiniPPrinter(object):
|
|
def __init__(self):
|
|
self._out = StringIO()
|
|
self.indentation = 0
|
|
|
|
def text(self, text):
|
|
self._out.write(text)
|
|
|
|
def breakable(self, sep=" "):
|
|
self._out.write(sep)
|
|
|
|
def begin_group(self, _, text):
|
|
self.text(text)
|
|
|
|
def end_group(self, _, text):
|
|
self.text(text)
|
|
|
|
def pretty(self, obj):
|
|
if hasattr(obj, "_repr_pretty_"):
|
|
obj._repr_pretty_(self, False)
|
|
else:
|
|
self.text(repr(obj))
|
|
|
|
def getvalue(self):
|
|
return self._out.getvalue()
|
|
|
|
def _mini_pretty(obj):
    """Render `obj` to a string using the fallback mini pretty-printer."""
    pp = _MiniPPrinter()
    pp.pretty(obj)
    return pp.getvalue()
|
|
|
|
def repr_pretty_delegate(obj):
    """repr() `obj` via its _repr_pretty_ hook.

    Uses IPython's pretty-printer when IPython is *already* imported, and
    our one-line fallback otherwise. We deliberately never import IPython
    ourselves: importing it is slow (~4s cold-cache as measured Nov. 2012),
    it can spawn child processes you didn't ask for, and it trips pandas's
    "are we running under IPython?" test-suite hack (see patsy bug #12).
    Most commonly this path is hit when repr() is called directly inside an
    IPython session, e.g. when printing a container IPython has no special
    pretty-printing code for.
    """
    use_ipython = optional_dep_ok and "IPython" in sys.modules
    if use_ipython:
        from IPython.lib.pretty import pretty
        return pretty(obj)
    return _mini_pretty(obj)
|
|
|
|
def repr_pretty_impl(p, obj, args, kwargs=()):
    """Render a constructor-style repr of `obj` onto pretty-printer `p`.

    Produces output like ``ClassName(arg1, arg2, key=value)``.

    :arg p: an IPython pretty-printer (or _MiniPPrinter) instance.
    :arg obj: the object whose class name heads the output.
    :arg args: sequence of positional-argument values to render.
    :arg kwargs: sequence of (label, value) pairs rendered as keyword
        arguments. (Default changed from a mutable ``[]`` to ``()`` — it is
        only iterated, so behavior is identical, and an immutable default
        avoids the shared-mutable-default pitfall.)
    """
    name = obj.__class__.__name__
    p.begin_group(len(name) + 1, "%s(" % (name,))
    # Mutable cell so the closure can record whether an item has been
    # emitted yet (this file still supports Python 2, which has no
    # 'nonlocal').
    started = [False]
    def new_item():
        # Separate every item after the first with ", ".
        if started[0]:
            p.text(",")
            p.breakable()
        started[0] = True
    for arg in args:
        new_item()
        p.pretty(arg)
    for label, value in kwargs:
        new_item()
        p.begin_group(len(label) + 1, "%s=" % (label,))
        p.pretty(value)
        p.end_group(len(label) + 1, "")
    p.end_group(len(name) + 1, ")")
|
|
|
|
def test_repr_pretty():
    """repr_pretty_delegate falls back to plain repr; repr_pretty_impl emits
    constructor-style output with positional and keyword arguments."""
    assert repr_pretty_delegate("asdf") == "'asdf'"
    printer = _MiniPPrinter()
    class MyClass(object):
        pass
    repr_pretty_impl(printer, MyClass(),
                     ["a", 1], [("foo", "bar"), ("asdf", "asdf")])
    assert printer.getvalue() == "MyClass('a', 1, foo='bar', asdf='asdf')"
|
|
|
|
# In Python 3, objects of different types are not generally comparable, so a
|
|
# list of heterogeneous types cannot be sorted. This implements a Python 2
|
|
# style comparison for arbitrary types. (It works on Python 2 too, but just
|
|
# gives you the built-in ordering.) To understand why this is tricky, consider
|
|
# this example:
|
|
# a = 1 # type 'int'
|
|
# b = 1.5 # type 'float'
|
|
# class gggg:
|
|
# pass
|
|
# c = gggg()
|
|
# sorted([a, b, c])
|
|
# The fallback ordering sorts by class name, so according to the fallback
|
|
# ordering, we have b < c < a. But, of course, a and b are comparable (even
|
|
# though they're of different types), so we also have a < b. This is
|
|
# inconsistent. There is no general solution to this problem (which I guess is
|
|
# why Python 3 stopped trying), but the worst offender is all the different
|
|
# "numeric" classes (int, float, complex, decimal, rational...), so as a
|
|
# special-case, we sort all numeric objects to the start of the list.
|
|
# (In Python 2, there is also a similar special case for str and unicode, but
|
|
# we don't have to worry about that for Python 3.)
|
|
class SortAnythingKey(object):
    """A sort key giving a Python-2-style total order over mixed types.

    Native ``<`` is used whenever the two objects support it; otherwise all
    numeric objects sort before everything else, equal objects tie, and any
    remaining pairs are ordered by (class name, id) as an arbitrary but
    consistent tie-break. (On Python 2 this reduces to the built-in order.)
    """

    def __init__(self, obj):
        self.obj = obj

    def _python_lt(self, other_obj):
        # Probe the native comparison. On Py3 an unsupported comparison
        # raises TypeError, which we map to NotImplemented. (On Py2, `<`
        # essentially never raises -- except complex-vs-numeric, sigh -- so
        # this is just `<`.)
        try:
            return self.obj < other_obj
        except TypeError:
            return NotImplemented

    def __lt__(self, other):
        assert isinstance(other, SortAnythingKey)
        native = self._python_lt(other.obj)
        if native is not NotImplemented:
            return native
        # Native comparison unavailable; apply the fallback rules in order.
        # Rule 1: anything comparable to 0 (i.e. numeric) sorts first.
        if self._python_lt(0) is not NotImplemented:
            return True
        if other._python_lt(0) is not NotImplemented:
            return False
        # Rule 2: stay consistent with ==, which may be defined even for
        # otherwise-unorderable objects.
        if self.obj == other.obj:
            return False
        # Rule 3: arbitrary but stable tie-break by class name, then id.
        mine = (self.obj.__class__.__name__, id(self.obj))
        theirs = (other.obj.__class__.__name__, id(other.obj))
        return mine < theirs
|
|
|
|
def test_SortAnythingKey():
    """SortAnythingKey orders homogeneous values natively, puts numbers
    first in mixed lists, and tie-breaks unorderable objects by class name."""
    assert sorted([20, 10, 0, 15], key=SortAnythingKey) == [0, 10, 15, 20]
    assert sorted([10, -1.5], key=SortAnythingKey) == [-1.5, 10]
    assert sorted([10, "a", 20.5, "b"], key=SortAnythingKey) == [10, 20.5, "a", "b"]
    class a(object):
        pass
    class b(object):
        pass
    class z(object):
        pass
    a_obj = a()
    b_obj = b()
    z_obj = z()
    o_obj = object()
    # numbers first, then instances ordered by their class names
    assert (sorted([z_obj, a_obj, 1, b_obj, o_obj], key=SortAnythingKey)
            == [1, a_obj, b_obj, o_obj, z_obj])
|
|
|
|
# NaN checking functions that work on arbitrary objects, on old Python
|
|
# versions (math.isnan is only in 2.6+), etc.
|
|
def safe_scalar_isnan(x):
    """Return True iff `x` is a scalar NaN; False for anything else.

    Unlike raw np.isnan, this never raises: values that cannot be coerced
    to float (strings, None, containers, multi-element arrays, ...) simply
    yield False.
    """
    try:
        value = float(x)
    except (TypeError, ValueError, NotImplementedError):
        return False
    return np.isnan(value)

# Elementwise version: applies safe_scalar_isnan across arbitrary
# (object) arrays, always producing a bool array.
safe_isnan = np.vectorize(safe_scalar_isnan, otypes=[bool])
|
|
|
|
def test_safe_scalar_isnan():
    """safe_scalar_isnan: True only for scalar NaN, False (never an error)
    for non-numeric and non-scalar inputs."""
    assert not safe_scalar_isnan(True)
    assert not safe_scalar_isnan(None)
    assert not safe_scalar_isnan("sadf")
    assert not safe_scalar_isnan((1, 2, 3))
    assert not safe_scalar_isnan(np.asarray([1, 2, 3]))
    # single-element list is still not a scalar
    assert not safe_scalar_isnan([np.nan])
    assert safe_scalar_isnan(np.nan)
    assert safe_scalar_isnan(np.float32(np.nan))
    assert safe_scalar_isnan(float(np.nan))
|
|
|
|
def test_safe_isnan():
    """safe_isnan: vectorized NaN detection over heterogeneous inputs,
    including 0-d (scalar) results."""
    assert np.array_equal(safe_isnan([1, True, None, np.nan, "asdf"]),
                          [False, False, False, True, False])
    assert safe_isnan(np.nan).ndim == 0
    assert safe_isnan(np.nan)
    assert not safe_isnan(None)
    # raw isnan raises a *different* error for strings than for objects:
    assert not safe_isnan("asdf")
|
|
|
|
def iterable(obj):
    """Return True iff iter(obj) succeeds (i.e. obj is iterable)."""
    try:
        iter(obj)
        return True
    except Exception:
        # iter() normally raises TypeError, but be maximally forgiving of
        # misbehaving __iter__ implementations.
        return False
|
|
|
|
def test_iterable():
    """iterable() accepts strings/containers and rejects non-iterables."""
    for yes in ("asdf", [], {"a": 1}):
        assert iterable(yes)
    for no in (1, iterable):
        assert not iterable(no)
|
|
|
|
##### Handling Pandas's categorical stuff is horrible and hateful
|
|
|
|
# Basically they decided that they didn't like how numpy does things, so their
|
|
# categorical stuff is *kinda* like how numpy would do it (e.g. they have a
|
|
# special ".dtype" attribute to mark categorical data), so by default you'll
|
|
# find yourself using the same code paths to handle pandas categorical data
|
|
# and other non-categorical data. BUT, all the idioms for detecting
|
|
# categorical data blow up with errors if you try them with real numpy dtypes,
|
|
# and all numpy's idioms for detecting non-categorical types blow up with
|
|
# errors if you try them with pandas categorical stuff. So basically they have
|
|
# just poisoned all code that touches dtypes; the old numpy stuff is unsafe,
|
|
# and you must use special code like below.
|
|
#
|
|
# Also there are hoops to jump through to handle both the old style
|
|
# (Categorical objects) and new-style (Series with dtype="category").
|
|
|
|
# Needed to support pandas < 0.15
|
|
def pandas_Categorical_from_codes(codes, categories):
    """Build a pandas.Categorical from integer codes plus category labels,
    papering over API differences in pandas < 0.15.

    `codes` uses -1 to mean missing/NaN, per pandas convention.
    """
    assert have_pandas_categorical

    # Old versions of pandas sometimes fail to coerce this to an array and
    # just return it directly from .labels (?!).
    codes = np.asarray(codes)
    if hasattr(pandas.Categorical, "from_codes"):
        return pandas.Categorical.from_codes(codes, categories)
    else:
        # pre-0.15 pandas: the plain constructor took (codes, levels)
        return pandas.Categorical(codes, categories)
|
|
|
|
def test_pandas_Categorical_from_codes():
    """pandas_Categorical_from_codes maps codes to labels and -1 to NaN."""
    if not have_pandas_categorical:
        return
    c = pandas_Categorical_from_codes([1, 1, 0, -1], ["a", "b"])
    assert np.all(np.asarray(c)[:-1] == ["b", "b", "a"])
    # code -1 means missing, which converts to NaN
    assert np.isnan(np.asarray(c)[-1])
|
|
|
|
# Needed to support pandas < 0.15
|
|
def pandas_Categorical_categories(cat):
    """Return the category labels of a Categorical (or categorical Series),
    across old and new pandas APIs."""
    # In 0.15+, a categorical Series exposes a Categorical-like object via
    # its .cat accessor; unwrap it first.
    if hasattr(cat, "cat"):
        cat = cat.cat
    # 0.15+ calls them "categories"; older pandas called them "levels".
    if hasattr(cat, "categories"):
        return cat.categories
    return cat.levels
|
|
|
|
# Needed to support pandas < 0.15
|
|
def pandas_Categorical_codes(cat):
    """Return the integer codes of a Categorical (or categorical Series),
    across old and new pandas APIs."""
    # In 0.15+, a categorical Series exposes its Categorical via the .cat
    # accessor; unwrap it first.
    if hasattr(cat, "cat"):
        cat = cat.cat
    # 0.15+ calls them "codes"; older pandas called them "labels".
    if hasattr(cat, "codes"):
        return cat.codes
    return cat.labels
|
|
|
|
def test_pandas_Categorical_accessors():
    """The categories/codes accessors work on both raw Categorical objects
    and categorical Series."""
    if not have_pandas_categorical:
        return
    c = pandas_Categorical_from_codes([1, 1, 0, -1], ["a", "b"])
    assert np.all(pandas_Categorical_categories(c) == ["a", "b"])
    assert np.all(pandas_Categorical_codes(c) == [1, 1, 0, -1])

    if have_pandas_categorical_dtype:
        # Same answers when wrapped in a Series (goes through .cat).
        s = pandas.Series(c)
        assert np.all(pandas_Categorical_categories(s) == ["a", "b"])
        assert np.all(pandas_Categorical_codes(s) == [1, 1, 0, -1])
|
|
|
|
# Needed to support pandas >= 0.15 (!)
|
|
def safe_is_pandas_categorical_dtype(dt):
    """Return True iff `dt` is a pandas categorical dtype.

    Safe to call even when pandas is missing or too old to support
    categorical dtypes (returns False in that case).
    """
    if have_pandas_categorical_dtype:
        return _pandas_is_categorical_dtype(dt)
    return False
|
|
|
|
# Needed to support pandas >= 0.15 (!)
|
|
def safe_is_pandas_categorical(data):
    """Return True iff `data` is pandas categorical data — either a raw
    Categorical object or something (e.g. a Series) carrying a categorical
    dtype. Safe when pandas is missing entirely."""
    if not have_pandas_categorical:
        return False
    if isinstance(data, pandas.Categorical):
        return True
    # Series and similar carry categoricalness on their dtype.
    if hasattr(data, "dtype"):
        return safe_is_pandas_categorical_dtype(data.dtype)
    return False
|
|
|
|
def test_safe_is_pandas_categorical():
    """Detects Categorical objects and categorical Series; plain ndarrays
    are not categorical."""
    assert not safe_is_pandas_categorical(np.arange(10))

    if have_pandas_categorical:
        c_obj = pandas.Categorical(["a", "b"])
        assert safe_is_pandas_categorical(c_obj)

    if have_pandas_categorical_dtype:
        s_obj = pandas.Series(["a", "b"], dtype="category")
        assert safe_is_pandas_categorical(s_obj)
|
|
|
|
# Needed to support pandas >= 0.15 (!)
|
|
# Calling np.issubdtype on a pandas categorical will blow up -- the officially
|
|
# recommended solution is to replace every piece of code like
|
|
# np.issubdtype(foo.dtype, bool)
|
|
# with code like
|
|
# isinstance(foo.dtype, np.dtype) and np.issubdtype(foo.dtype, bool)
|
|
# or
|
|
# not pandas.is_categorical_dtype(foo.dtype) and issubdtype(foo.dtype, bool)
|
|
# We do the latter (with extra hoops) because the isinstance check is not
|
|
# safe. See
|
|
# https://github.com/pydata/pandas/issues/9581
|
|
# https://github.com/pydata/pandas/issues/9581#issuecomment-77099564
|
|
def safe_issubdtype(dt1, dt2):
    """np.issubdtype, except that a pandas categorical `dt1` yields False
    instead of blowing up.

    The obvious isinstance-on-np.dtype guard is not safe; see
    https://github.com/pydata/pandas/issues/9581 (and the comment thread
    at .../issues/9581#issuecomment-77099564) for the gory details.
    """
    if safe_is_pandas_categorical_dtype(dt1):
        # categorical data is never a numpy numeric subtype
        return False
    return np.issubdtype(dt1, dt2)
|
|
|
|
def test_safe_issubdtype():
    """safe_issubdtype matches np.issubdtype on real dtypes, and returns
    False (instead of erroring) on pandas categorical data."""
    assert safe_issubdtype(int, np.integer)
    assert safe_issubdtype(np.dtype(float), np.floating)
    assert not safe_issubdtype(int, np.floating)
    assert not safe_issubdtype(np.dtype(float), np.integer)

    if have_pandas_categorical_dtype:
        # would raise inside raw np.issubdtype
        bad_dtype = pandas.Series(["a", "b"], dtype="category")
        assert not safe_issubdtype(bad_dtype, np.integer)
|
|
|
|
def no_pickling(*args, **kwargs):
    """Stand-in pickling hook (e.g. __getstate__) that always refuses.

    Assign this on classes whose pickling support hasn't been designed yet,
    so users get a clear error rather than silently-broken pickles.
    """
    msg = ("Sorry, pickling not yet supported. "
           "See https://github.com/pydata/patsy/issues/26 if you want to "
           "help.")
    raise NotImplementedError(msg)
|
|
|
|
def assert_no_pickling(obj):
    """Assert (for use in tests) that pickling `obj` raises
    NotImplementedError, i.e. that it uses the no_pickling stub."""
    import pickle
    import pytest
    with pytest.raises(NotImplementedError):
        pickle.dumps(obj)
|
|
|
|
# Use like:
|
|
# if safe_string_eq(constraints, "center"):
|
|
# ...
|
|
# where 'constraints' might be a string or an array. (If it's an array, then
|
|
# we can't use == becaues it might broadcast and ugh.)
|
|
def safe_string_eq(obj, value):
    """Compare obj == value, but only when obj actually is a string.

    Use like:
        if safe_string_eq(constraints, "center"):
            ...
    where `constraints` might be a string or an array — a bare == could
    broadcast over the array, which is not what we want.
    """
    return isinstance(obj, six.string_types) and obj == value
|
|
|
|
def test_safe_string_eq():
    """safe_string_eq compares strings normally and returns False (no
    broadcasting) for arrays."""
    assert safe_string_eq("foo", "foo")
    assert not safe_string_eq("foo", "bar")

    if not six.PY3:
        # on Python 2, unicode strings also count as strings
        assert safe_string_eq(unicode("foo"), "foo")

    assert not safe_string_eq(np.empty((2, 2)), "foo")
|