233 lines
8.3 KiB
Python
233 lines
8.3 KiB
Python
from __future__ import annotations
|
|
from dataclasses import dataclass
|
|
from typing import ClassVar
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
from pandas import DataFrame
|
|
|
|
from seaborn._core.groupby import GroupBy
|
|
from seaborn._core.scales import Scale
|
|
from seaborn._stats.base import Stat
|
|
|
|
from typing import TYPE_CHECKING
|
|
if TYPE_CHECKING:
|
|
from numpy.typing import ArrayLike
|
|
|
|
|
|
@dataclass
class Count(Stat):
    """
    Count the observations within each group.

    See Also
    --------
    Hist : A more fully-featured transform including binning and/or normalization.

    Examples
    --------
    .. include:: ../docstrings/objects.Count.rst

    """
    group_by_orient: ClassVar[bool] = True

    def __call__(
        self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale],
    ) -> DataFrame:
        """Aggregate `data` with `len` and return the per-group counts.

        The count is written to the coordinate variable opposite to `orient`
        (``"y"`` when `orient` is ``"x"`` and vice versa). `scales` is accepted
        as part of the call signature but is not used here.
        """
        # The counted value lands on whichever coordinate is not the orient axis;
        # any other orient value raises KeyError from the lookup.
        count_var = {"x": "y", "y": "x"}[orient]

        # Copy the orient column into the value column so that `len` has a
        # column to aggregate over.
        seeded = data.assign(**{count_var: data[orient]})
        counted = groupby.agg(seeded, {count_var: len})

        # Discard rows that are missing either coordinate, then renumber.
        complete = counted.dropna(subset=["x", "y"])
        return complete.reset_index(drop=True)
|
|
|
|
|
|
@dataclass
class Hist(Stat):
    """
    Bin observations, count them, and optionally normalize or cumulate.

    Parameters
    ----------
    stat : str
        Aggregate statistic to compute in each bin:

        - `count`: the number of observations
        - `density`: normalize so that the total area of the histogram equals 1
        - `percent`: normalize so that bar heights sum to 100
        - `probability` or `proportion`: normalize so that bar heights sum to 1
        - `frequency`: divide the number of observations by the bin width

    bins : str, int, or ArrayLike
        Generic parameter that can be the name of a reference rule, the number
        of bins, or the bin breaks. Passed to :func:`numpy.histogram_bin_edges`.
    binwidth : float
        Width of each bin; overrides `bins` but can be used with `binrange`.
        Note that if `binwidth` does not evenly divide the bin range, the actual
        bin width used will be only approximately equal to the parameter value.
    binrange : (min, max)
        Lowest and highest value for bin edges; can be used with either
        `bins` (when a number) or `binwidth`. Defaults to data extremes.
    common_norm : bool or list of variables
        When not `False`, the normalization is applied across groups. Use
        `True` to normalize across all groups, or pass variable name(s) that
        define normalization groups.
    common_bins : bool or list of variables
        When not `False`, the same bins are used for all groups. Use `True` to
        share bins across all groups, or pass variable name(s) to share within.
    cumulative : bool
        If True, cumulate the bin values.
    discrete : bool
        If True, set `binwidth` and `binrange` so that bins have unit width and
        are centered on integer values

    Notes
    -----
    The choice of bins for computing and plotting a histogram can exert
    substantial influence on the insights that one is able to draw from the
    visualization. If the bins are too large, they may erase important features.
    On the other hand, bins that are too small may be dominated by random
    variability, obscuring the shape of the true underlying distribution. The
    default bin size is determined using a reference rule that depends on the
    sample size and variance. This works well in many cases, (i.e., with
    "well-behaved" data) but it fails in others. It is always a good idea to try
    different bin sizes to be sure that you are not missing something important.
    This function allows you to specify bins in several different ways, such as
    by setting the total number of bins to use, the width of each bin, or the
    specific locations where the bins should break.

    Examples
    --------
    .. include:: ../docstrings/objects.Hist.rst

    """
    stat: str = "count"
    bins: str | int | ArrayLike = "auto"
    binwidth: float | None = None
    binrange: tuple[float, float] | None = None
    common_norm: bool | list[str] = True
    common_bins: bool | list[str] = True
    cumulative: bool = False
    discrete: bool = False

    def __post_init__(self):
        """Validate that `stat` names one of the supported statistics."""
        stat_options = [
            "count", "density", "percent", "probability", "proportion", "frequency"
        ]
        self._check_param_one_of("stat", stat_options)

    def _define_bin_edges(self, vals, weight, bins, binwidth, binrange, discrete):
        """Inner function that takes bin parameters as arguments.

        Returns the array of bin edges computed from the finite entries of
        `vals` (a pandas Series). When `weight` is given it must have one entry
        per entry of `vals`; it is filtered with the same mask so the two stay
        aligned.
        """
        # Treat infinities like missing values and drop both.
        vals = vals.replace(-np.inf, np.nan).replace(np.inf, np.nan)
        finite = vals.notna().to_numpy()
        vals = vals[finite]
        if weight is not None:
            # numpy.histogram_bin_edges requires weights to have the same shape
            # as the values, so drop the weights of the discarded observations.
            weight = np.asarray(weight)[finite]

        if binrange is None:
            start, stop = vals.min(), vals.max()
        else:
            start, stop = binrange

        if discrete:
            # Unit-width bins centered on the integers from start through stop
            bin_edges = np.arange(start - .5, stop + 1.5)
        else:
            if binwidth is not None:
                bins = int(round((stop - start) / binwidth))
                if binwidth > 0:
                    # A width larger than twice the range (or constant data)
                    # rounds to zero bins, which numpy rejects; use one bin.
                    bins = max(bins, 1)
            bin_edges = np.histogram_bin_edges(vals, bins, binrange, weight)

        # TODO warning or cap on too many bins?

        return bin_edges

    def _define_bin_params(self, data, orient, scale_type):
        """Given data, return numpy.histogram parameters to define bins."""
        vals = data[orient]
        weights = data.get("weight", None)

        # TODO We'll want this for ordinal / discrete scales too
        # (Do we need discrete as a parameter or just infer from scale?)
        discrete = self.discrete or scale_type == "nominal"

        bin_edges = self._define_bin_edges(
            vals, weights, self.bins, self.binwidth, self.binrange, discrete,
        )

        if isinstance(self.bins, (str, int)):
            # Pass a bin count plus range rather than the explicit edges
            n_bins = len(bin_edges) - 1
            bin_range = bin_edges.min(), bin_edges.max()
            bin_kws = dict(bins=n_bins, range=bin_range)
        else:
            bin_kws = dict(bins=bin_edges)

        return bin_kws

    def _get_bins_and_eval(self, data, orient, groupby, scale_type):
        """Define bins from `data`, then evaluate the histogram per group."""
        bin_kws = self._define_bin_params(data, orient, scale_type)
        return groupby.apply(data, self._eval, orient, bin_kws)

    def _eval(self, data, orient, bin_kws):
        """Compute the histogram of `data[orient]` with the given bin parameters.

        Returns a DataFrame holding the bin centers (in the `orient` column),
        the histogram values (`count`), and the bin widths (`space`).
        """
        vals = data[orient]
        weights = data.get("weight", None)

        # Density normalization is delegated to numpy; in that case the
        # "count" column holds density values rather than raw counts.
        density = self.stat == "density"
        hist, edges = np.histogram(vals, **bin_kws, weights=weights, density=density)

        width = np.diff(edges)
        center = edges[:-1] + width / 2

        return pd.DataFrame({orient: center, "count": hist, "space": width})

    def _normalize(self, data):
        """Convert the `count` column to `self.stat` and optionally cumulate.

        The result is `data` with a column named after `self.stat` assigned.
        """
        hist = data["count"]
        if self.stat in ("probability", "proportion"):
            hist = hist.astype(float) / hist.sum()
        elif self.stat == "percent":
            hist = hist.astype(float) / hist.sum() * 100
        elif self.stat == "frequency":
            hist = hist.astype(float) / data["space"]

        if self.cumulative:
            if self.stat in ["density", "frequency"]:
                # These are per-unit-width values; weight by bin width so the
                # running total accumulates area instead of bar heights.
                hist = (hist * data["space"]).cumsum()
            else:
                hist = hist.cumsum()

        return data.assign(**{self.stat: hist})

    def __call__(
        self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale],
    ) -> DataFrame:
        """Bin `data` along `orient`, normalize, and return the histogram frame.

        The computed statistic is also assigned to the coordinate variable
        opposite to `orient`.
        """
        # The scale type is identified by the lowercased scale class name.
        scale_type = scales[orient].__class__.__name__.lower()
        grouping_vars = [str(v) for v in data if v in groupby.order]

        if not grouping_vars or self.common_bins is True:
            # One set of bins, defined from all of the data
            bin_kws = self._define_bin_params(data, orient, scale_type)
            data = groupby.apply(data, self._eval, orient, bin_kws)
        else:
            if self.common_bins is False:
                bin_groupby = GroupBy(grouping_vars)
            else:
                bin_groupby = GroupBy(self.common_bins)
                self._check_grouping_vars("common_bins", grouping_vars)

            # Bins are defined separately within each subset from bin_groupby
            data = bin_groupby.apply(
                data, self._get_bins_and_eval, orient, groupby, scale_type,
            )

        if not grouping_vars or self.common_norm is True:
            data = self._normalize(data)
        else:
            if self.common_norm is False:
                norm_groupby = GroupBy(grouping_vars)
            else:
                norm_groupby = GroupBy(self.common_norm)
                self._check_grouping_vars("common_norm", grouping_vars)
            data = norm_groupby.apply(data, self._normalize)

        other = {"x": "y", "y": "x"}[orient]
        return data.assign(**{other: data[self.stat]})
|