182 lines
6.7 KiB
Python
182 lines
6.7 KiB
Python
# This file is part of Patsy
|
|
# Copyright (C) 2011 Nathaniel Smith <njs@pobox.com>
|
|
# See file LICENSE.txt for license information.
|
|
|
|
# Stateful transform protocol:
|
|
# def __init__(self):
|
|
# pass
|
|
# def memorize_chunk(self, input_data):
|
|
# return None
|
|
# def memorize_finish(self):
|
|
# return None
|
|
# def transform(self, input_data):
|
|
# return output_data
|
|
|
|
# BETTER WAY: always run the first row of data through the builder alone, and
|
|
# check that it gives the same output row as when running the whole block of
|
|
# data through at once. This gives us the same information, but it's robust
|
|
# against people writing their own centering functions.
|
|
|
|
# QUESTION: right now we refuse to even fit a model that contains a
|
|
# my_transform(x)-style function. Maybe we should allow it to be fit (with a
|
|
# warning), and only disallow making predictions with it? Need to revisit this
|
|
# question once it's clearer what exactly our public API will look like,
|
|
# because right now I'm not sure how to tell whether we are being called for
|
|
# fitting versus being called for prediction.
|
|
|
|
from functools import wraps
|
|
import numpy as np
|
|
from patsy.util import (atleast_2d_column_default,
|
|
asarray_or_pandas, pandas_friendly_reshape,
|
|
wide_dtype_for, safe_issubdtype,
|
|
no_pickling, assert_no_pickling)
|
|
|
|
# These are made available in the patsy.* namespace
|
|
__all__ = ["stateful_transform",
|
|
"center", "standardize", "scale",
|
|
]
|
|
|
|
def stateful_transform(class_):
|
|
"""Create a stateful transform callable object from a class that fulfills
|
|
the :ref:`stateful transform protocol <stateful-transform-protocol>`.
|
|
"""
|
|
@wraps(class_)
|
|
def stateful_transform_wrapper(*args, **kwargs):
|
|
transform = class_()
|
|
transform.memorize_chunk(*args, **kwargs)
|
|
transform.memorize_finish()
|
|
return transform.transform(*args, **kwargs)
|
|
stateful_transform_wrapper.__patsy_stateful_transform__ = class_
|
|
return stateful_transform_wrapper
|
|
|
|
# class NonIncrementalStatefulTransform(object):
|
|
# def __init__(self):
|
|
# self._data = []
|
|
#
|
|
# def memorize_chunk(self, input_data, *args, **kwargs):
|
|
# self._data.append(input_data)
|
|
# self._args = _args
|
|
# self._kwargs = kwargs
|
|
#
|
|
# def memorize_finish(self):
|
|
# all_data = np.vstack(self._data)
|
|
# args = self._args
|
|
# kwargs = self._kwargs
|
|
# del self._data
|
|
# del self._args
|
|
# del self._kwargs
|
|
# self.memorize_all(all_data, *args, **kwargs)
|
|
#
|
|
# def memorize_all(self, input_data, *args, **kwargs):
|
|
# raise NotImplementedError
|
|
#
|
|
# def transform(self, input_data, *args, **kwargs):
|
|
# raise NotImplementedError
|
|
#
|
|
# class QuantileEstimatingTransform(NonIncrementalStatefulTransform):
|
|
# def memorize_all(self, input_data, *args, **kwargs):
|
|
|
|
class Center(object):
|
|
"""center(x)
|
|
|
|
A stateful transform that centers input data, i.e., subtracts the mean.
|
|
|
|
If input has multiple columns, centers each column separately.
|
|
|
|
Equivalent to ``standardize(x, rescale=False)``
|
|
"""
|
|
def __init__(self):
|
|
self._sum = None
|
|
self._count = 0
|
|
|
|
def memorize_chunk(self, x):
|
|
x = atleast_2d_column_default(x)
|
|
self._count += x.shape[0]
|
|
this_total = np.sum(x, 0, dtype=wide_dtype_for(x))
|
|
# This is to handle potentially multi-column x's:
|
|
if self._sum is None:
|
|
self._sum = this_total
|
|
else:
|
|
self._sum += this_total
|
|
|
|
def memorize_finish(self):
|
|
pass
|
|
|
|
def transform(self, x):
|
|
x = asarray_or_pandas(x)
|
|
# This doesn't copy data unless our input is a DataFrame that has
|
|
# heterogeneous types. And in that case we're going to be munging the
|
|
# types anyway, so copying isn't a big deal.
|
|
x_arr = np.asarray(x)
|
|
if safe_issubdtype(x_arr.dtype, np.integer):
|
|
dt = float
|
|
else:
|
|
dt = x_arr.dtype
|
|
mean_val = np.asarray(self._sum / self._count, dtype=dt)
|
|
centered = atleast_2d_column_default(x, preserve_pandas=True) - mean_val
|
|
return pandas_friendly_reshape(centered, x.shape)
|
|
|
|
__getstate__ = no_pickling
|
|
|
|
center = stateful_transform(Center)
|
|
|
|
# See:
|
|
# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
|
|
# or page 232 of Knuth vol. 3 (3rd ed.).
|
|
class Standardize(object):
|
|
"""standardize(x, center=True, rescale=True, ddof=0)
|
|
|
|
A stateful transform that standardizes input data, i.e. it subtracts the
|
|
mean and divides by the sample standard deviation.
|
|
|
|
Either centering or rescaling or both can be disabled by use of keyword
|
|
arguments. The `ddof` argument controls the delta degrees of freedom when
|
|
computing the standard deviation (cf. :func:`numpy.std`). The default of
|
|
``ddof=0`` produces the maximum likelihood estimate; use ``ddof=1`` if you
|
|
prefer the square root of the unbiased estimate of the variance.
|
|
|
|
If input has multiple columns, standardizes each column separately.
|
|
|
|
.. note:: This function computes the mean and standard deviation using a
|
|
memory-efficient online algorithm, making it suitable for use with
|
|
large incrementally processed data-sets.
|
|
"""
|
|
def __init__(self):
|
|
self.current_n = 0
|
|
self.current_mean = None
|
|
self.current_M2 = None
|
|
|
|
def memorize_chunk(self, x, center=True, rescale=True, ddof=0):
|
|
x = atleast_2d_column_default(x)
|
|
if self.current_mean is None:
|
|
self.current_mean = np.zeros(x.shape[1], dtype=wide_dtype_for(x))
|
|
self.current_M2 = np.zeros(x.shape[1], dtype=wide_dtype_for(x))
|
|
# XX this can surely be vectorized but I am feeling lazy:
|
|
for i in range(x.shape[0]):
|
|
self.current_n += 1
|
|
delta = x[i, :] - self.current_mean
|
|
self.current_mean += delta / self.current_n
|
|
self.current_M2 += delta * (x[i, :] - self.current_mean)
|
|
|
|
def memorize_finish(self):
|
|
pass
|
|
|
|
def transform(self, x, center=True, rescale=True, ddof=0):
|
|
# XX: this forces all inputs to double-precision real, even if the
|
|
# input is single- or extended-precision or complex. But I got all
|
|
# tangled up in knots trying to do that without breaking something
|
|
# else (e.g. by requiring an extra copy).
|
|
x = asarray_or_pandas(x, copy=True, dtype=float)
|
|
x_2d = atleast_2d_column_default(x, preserve_pandas=True)
|
|
if center:
|
|
x_2d -= self.current_mean
|
|
if rescale:
|
|
x_2d /= np.sqrt(self.current_M2 / (self.current_n - ddof))
|
|
return pandas_friendly_reshape(x_2d, x.shape)
|
|
|
|
__getstate__ = no_pickling
|
|
|
|
standardize = stateful_transform(Standardize)
|
|
# R compatibility:
|
|
scale = standardize
|