# AIM-PIbd-32-Kurbanova-A-A/aimenv/Lib/site-packages/seaborn/_stats/counting.py
from __future__ import annotations
from dataclasses import dataclass
from typing import ClassVar

import numpy as np
import pandas as pd
from pandas import DataFrame

from seaborn._core.groupby import GroupBy
from seaborn._core.scales import Scale
from seaborn._stats.base import Stat

from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from numpy.typing import ArrayLike


@dataclass
class Count(Stat):
    """
    Count distinct observations within groups.

    See Also
    --------
    Hist : A more fully-featured transform including binning and/or normalization.

    Examples
    --------
    .. include:: ../docstrings/objects.Count.rst

    """
    group_by_orient: ClassVar[bool] = True

    def __call__(
        self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale],
    ) -> DataFrame:

        # Copy the orient-axis values onto the other axis and aggregate them
        # with `len`, so each group's size lands on the value axis.
        var = {"x": "y", "y": "x"}[orient]
        res = (
            groupby
            .agg(data.assign(**{var: data[orient]}), {var: len})
            .dropna(subset=["x", "y"])
            .reset_index(drop=True)
        )
        return res
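

# A minimal usage sketch (assuming the bundled "tips" example dataset): Count
# is normally driven through the seaborn objects interface rather than called
# directly.
#
#     import seaborn.objects as so
#     from seaborn import load_dataset
#
#     tips = load_dataset("tips")
#     so.Plot(tips, x="day").add(so.Bar(), so.Count())
#
# Here the plot machinery calls Count.__call__ with the data grouped along the
# orient axis ("x"), and the group sizes are written to the other axis ("y").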


@dataclass
class Hist(Stat):
    """
    Bin observations, count them, and optionally normalize or cumulate.

    Parameters
    ----------
    stat : str
        Aggregate statistic to compute in each bin:

        - `count`: the number of observations
        - `density`: normalize so that the total area of the histogram equals 1
        - `percent`: normalize so that bar heights sum to 100
        - `probability` or `proportion`: normalize so that bar heights sum to 1
        - `frequency`: divide the number of observations by the bin width

    bins : str, int, or ArrayLike
        Generic parameter that can be the name of a reference rule, the number
        of bins, or the bin breaks. Passed to :func:`numpy.histogram_bin_edges`.
    binwidth : float
        Width of each bin; overrides `bins` but can be used with `binrange`.
        Note that if `binwidth` does not evenly divide the bin range, the actual
        bin width used will be only approximately equal to the parameter value.
    binrange : (min, max)
        Lowest and highest value for bin edges; can be used with either
        `bins` (when a number) or `binwidth`. Defaults to data extremes.
    common_norm : bool or list of variables
        When not `False`, the normalization is applied across groups. Use
        `True` to normalize across all groups, or pass variable name(s) that
        define normalization groups.
    common_bins : bool or list of variables
        When not `False`, the same bins are used for all groups. Use `True` to
        share bins across all groups, or pass variable name(s) to share within.
    cumulative : bool
        If True, cumulate the bin values.
    discrete : bool
        If True, set `binwidth` and `binrange` so that bins have unit width and
        are centered on integer values.

    Notes
    -----
    The choice of bins for computing and plotting a histogram can exert
    substantial influence on the insights that one is able to draw from the
    visualization. If the bins are too large, they may erase important features.
    On the other hand, bins that are too small may be dominated by random
    variability, obscuring the shape of the true underlying distribution. The
    default bin size is determined using a reference rule that depends on the
    sample size and variance. This works well in many cases (i.e., with
    "well-behaved" data), but it fails in others. It is always a good idea to
    try different bin sizes to be sure that you are not missing something
    important. This class allows you to specify bins in several different ways,
    such as by setting the total number of bins to use, the width of each bin,
    or the specific locations where the bins should break.

    Examples
    --------
    .. include:: ../docstrings/objects.Hist.rst

    """
    stat: str = "count"
    bins: str | int | ArrayLike = "auto"
    binwidth: float | None = None
    binrange: tuple[float, float] | None = None
    common_norm: bool | list[str] = True
    common_bins: bool | list[str] = True
    cumulative: bool = False
    discrete: bool = False
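
    # A minimal configuration sketch (assuming the bundled "penguins" example
    # dataset): Hist is constructed with the fields above and then invoked by
    # the plot machinery.
    #
    #     import seaborn.objects as so
    #     from seaborn import load_dataset
    #
    #     penguins = load_dataset("penguins")
    #     (
    #         so.Plot(penguins, x="flipper_length_mm")
    #         .add(so.Bars(), so.Hist(stat="density", binwidth=5))
    #     )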

    def __post_init__(self):

        stat_options = [
            "count", "density", "percent", "probability", "proportion", "frequency"
        ]
        self._check_param_one_of("stat", stat_options)

    def _define_bin_edges(self, vals, weight, bins, binwidth, binrange, discrete):
        """Inner function that takes bin parameters as arguments."""
        vals = vals.replace(-np.inf, np.nan).replace(np.inf, np.nan).dropna()

        if binrange is None:
            start, stop = vals.min(), vals.max()
        else:
            start, stop = binrange

        if discrete:
            bin_edges = np.arange(start - .5, stop + 1.5)
        else:
            if binwidth is not None:
                bins = int(round((stop - start) / binwidth))
            bin_edges = np.histogram_bin_edges(vals, bins, binrange, weight)

        # TODO warning or cap on too many bins?
        return bin_edges
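
    # Worked example of the branches above (assumed toy values): with
    # discrete=True and data spanning 0..3, np.arange(-0.5, 4.5) gives edges
    # [-0.5, 0.5, 1.5, 2.5, 3.5], i.e. four unit-width bins centered on the
    # integers. With binwidth=3 over a (0, 10) range, bins = int(round(10 / 3))
    # == 3, so the realized width is ~3.33; this is the approximation the
    # `binwidth` docstring warns about.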

    def _define_bin_params(self, data, orient, scale_type):
        """Given data, return numpy.histogram parameters to define bins."""
        vals = data[orient]
        weights = data.get("weight", None)

        # TODO We'll want this for ordinal / discrete scales too
        # (Do we need discrete as a parameter or just infer from scale?)
        discrete = self.discrete or scale_type == "nominal"

        bin_edges = self._define_bin_edges(
            vals, weights, self.bins, self.binwidth, self.binrange, discrete,
        )

        if isinstance(self.bins, (str, int)):
            n_bins = len(bin_edges) - 1
            bin_range = bin_edges.min(), bin_edges.max()
            bin_kws = dict(bins=n_bins, range=bin_range)
        else:
            bin_kws = dict(bins=bin_edges)

        return bin_kws
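
    # For example (assumed values): a rule or count such as bins="auto" yields
    # a spec like {"bins": 10, "range": (0.0, 1.0)}, while explicit breaks pass
    # through as {"bins": array_of_edges}. Both forms are valid `bins`/`range`
    # arguments to numpy.histogram.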

    def _get_bins_and_eval(self, data, orient, groupby, scale_type):

        bin_kws = self._define_bin_params(data, orient, scale_type)
        return groupby.apply(data, self._eval, orient, bin_kws)

    def _eval(self, data, orient, bin_kws):

        vals = data[orient]
        weights = data.get("weight", None)

        density = self.stat == "density"
        hist, edges = np.histogram(vals, **bin_kws, weights=weights, density=density)

        width = np.diff(edges)
        center = edges[:-1] + width / 2

        return pd.DataFrame({orient: center, "count": hist, "space": width})
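
    # A small numeric sketch of this transform (assumed toy input):
    #
    #     vals = [1, 2, 2, 3], bin_kws = dict(bins=3, range=(1, 3))
    #     np.histogram -> hist = [1, 2, 1], edges = [1.0, 1.667, 2.333, 3.0]
    #     width = [0.667, 0.667, 0.667], center = [1.333, 2.0, 2.667]
    #
    # len(edges) == len(hist) + 1, so the centers and widths computed from the
    # edges line up one-to-one with the counts.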

    def _normalize(self, data):

        hist = data["count"]
        if self.stat == "probability" or self.stat == "proportion":
            hist = hist.astype(float) / hist.sum()
        elif self.stat == "percent":
            hist = hist.astype(float) / hist.sum() * 100
        elif self.stat == "frequency":
            hist = hist.astype(float) / data["space"]

        if self.cumulative:
            if self.stat in ["density", "frequency"]:
                hist = (hist * data["space"]).cumsum()
            else:
                hist = hist.cumsum()

        return data.assign(**{self.stat: hist})
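
    # Worked example (assumed toy values): with count = [2, 3, 5] and
    # unit-width bins (space = [1, 1, 1]):
    #
    #     proportion -> [0.2, 0.3, 0.5]     (sums to 1)
    #     percent    -> [20.0, 30.0, 50.0]  (sums to 100)
    #     frequency  -> [2.0, 3.0, 5.0]     (count / bin width)
    #
    # With cumulative=True and stat="density", the running sum of hist * space
    # reaches 1.0 in the last bin, giving a CDF-like curve.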

    def __call__(
        self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale],
    ) -> DataFrame:

        scale_type = scales[orient].__class__.__name__.lower()
        grouping_vars = [str(v) for v in data if v in groupby.order]
        if not grouping_vars or self.common_bins is True:
            bin_kws = self._define_bin_params(data, orient, scale_type)
            data = groupby.apply(data, self._eval, orient, bin_kws)
        else:
            if self.common_bins is False:
                bin_groupby = GroupBy(grouping_vars)
            else:
                bin_groupby = GroupBy(self.common_bins)
                self._check_grouping_vars("common_bins", grouping_vars)
            data = bin_groupby.apply(
                data, self._get_bins_and_eval, orient, groupby, scale_type,
            )

        if not grouping_vars or self.common_norm is True:
            data = self._normalize(data)
        else:
            if self.common_norm is False:
                norm_groupby = GroupBy(grouping_vars)
            else:
                norm_groupby = GroupBy(self.common_norm)
                self._check_grouping_vars("common_norm", grouping_vars)
            data = norm_groupby.apply(data, self._normalize)

        other = {"x": "y", "y": "x"}[orient]
        return data.assign(**{other: data[self.stat]})
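

# A usage sketch of the grouping options (assuming the bundled "penguins"
# dataset): with common_norm=False, normalization is applied within each
# grouping, so every species' bars sum to 100 on their own.
#
#     import seaborn.objects as so
#     from seaborn import load_dataset
#
#     penguins = load_dataset("penguins")
#     (
#         so.Plot(penguins, x="flipper_length_mm", color="species")
#         .add(so.Bars(), so.Hist(stat="percent", common_norm=False))
#     )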