2532 lines
85 KiB
Python
2532 lines
85 KiB
Python
"""Plotting functions for visualizing distributions."""
|
|
from numbers import Number
|
|
from functools import partial
|
|
import math
|
|
import textwrap
|
|
import warnings
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
import matplotlib as mpl
|
|
import matplotlib.pyplot as plt
|
|
import matplotlib.transforms as tx
|
|
from matplotlib.cbook import normalize_kwargs
|
|
from matplotlib.colors import to_rgba
|
|
from matplotlib.collections import LineCollection
|
|
|
|
from ._base import VectorPlotter
|
|
|
|
# We have moved univariate histogram computation over to the new Hist class,
|
|
# but still use the older Histogram for bivariate computation.
|
|
from ._statistics import ECDF, Histogram, KDE
|
|
from ._stats.counting import Hist
|
|
|
|
from .axisgrid import (
|
|
FacetGrid,
|
|
_facet_docs,
|
|
)
|
|
from .utils import (
|
|
remove_na,
|
|
_get_transform_functions,
|
|
_kde_support,
|
|
_check_argument,
|
|
_assign_default_kwargs,
|
|
_default_color,
|
|
)
|
|
from .palettes import color_palette
|
|
from .external import husl
|
|
from .external.kde import gaussian_kde
|
|
from ._docstrings import (
|
|
DocstringComponents,
|
|
_core_docs,
|
|
)
|
|
|
|
|
|
__all__ = ["displot", "histplot", "kdeplot", "ecdfplot", "rugplot", "distplot"]
|
|
|
|
# ==================================================================================== #
|
|
# Module documentation
|
|
# ==================================================================================== #
|
|
|
|
# Numpydoc-style parameter descriptions shared by the distribution plotting
# functions. They are wrapped in ``DocstringComponents`` when ``_param_docs``
# is assembled below.
# NOTE(review): the doubled braces in ``multiple`` look like ``str.format``
# escapes for a later interpolation step -- confirm where these are rendered.
_dist_params = dict(

    # How subsets created by semantic mapping are combined
    multiple="""
multiple : {{"layer", "stack", "fill"}}
    Method for drawing multiple elements when semantic mapping creates subsets.
    Only relevant with univariate data.
    """,
    # Log scaling of the data axis/axes
    log_scale="""
log_scale : bool or number, or pair of bools or numbers
    Set axis scale(s) to log. A single value sets the data axis for any numeric
    axes in the plot. A pair of values sets each axis independently.
    Numeric values are interpreted as the desired base (default 10).
    When `None` or `False`, seaborn defers to the existing Axes scale.
    """,
    # Legend toggle
    legend="""
legend : bool
    If False, suppress the legend for semantic variables.
    """,
    # Colorbar options for bivariate plots
    cbar="""
cbar : bool
    If True, add a colorbar to annotate the color mapping in a bivariate plot.
    Note: Does not currently support plots with a ``hue`` variable well.
    """,
    cbar_ax="""
cbar_ax : :class:`matplotlib.axes.Axes`
    Pre-existing axes for the colorbar.
    """,
    cbar_kws="""
cbar_kws : dict
    Additional parameters passed to :meth:`matplotlib.figure.Figure.colorbar`.
    """,
)
|
|
|
|
# Bundle every documentation fragment used by this module into one nested
# component object: the shared core/facet/distribution descriptions plus the
# parameter docs scraped from each estimator's ``__init__`` signature.
_param_docs = DocstringComponents.from_nested_components(
    core=_core_docs["params"],
    facets=DocstringComponents(_facet_docs),
    dist=DocstringComponents(_dist_params),
    **{
        name: DocstringComponents.from_function_params(estimator.__init__)
        for name, estimator in (
            ("kde", KDE),
            ("hist", Histogram),
            ("ecdf", ECDF),
        )
    },
)
|
|
|
|
|
|
# ==================================================================================== #
|
|
# Internal API
|
|
# ==================================================================================== #
|
|
|
|
|
|
class _DistributionPlotter(VectorPlotter):
|
|
|
|
wide_structure = {"x": "@values", "hue": "@columns"}
|
|
flat_structure = {"x": "@values"}
|
|
|
|
def __init__(
    self,
    data=None,
    variables=None,
):
    """Initialize the plotter by delegating to ``VectorPlotter.__init__``.

    Parameters
    ----------
    data : object, optional
        Input data, forwarded unchanged to the parent initializer.
    variables : dict, optional
        Variable assignments, forwarded to the parent initializer. ``None``
        (the default) is replaced by a new empty dict.
    """
    # A literal ``{}`` default is created once and shared by every call;
    # build a fresh dict per call so no state can leak between plotters.
    if variables is None:
        variables = {}

    super().__init__(data=data, variables=variables)
|
|
|
|
@property
def univariate(self):
    """Return True unless both x and y have been assigned."""
    # TODO this may belong in the core plotter. If it moves, keep it clear
    # that "univariate" is only about the x/y variables: other semantics can
    # still be present. There is no settled name for the x/y pair yet.
    return not {"x", "y"}.issubset(self.variables)
|
|
|
|
@property
def data_variable(self):
    """Return whichever of "x" or "y" is assigned in a univariate plot.

    Raises
    ------
    AttributeError
        If the plot is not univariate.
    """
    # TODO could also live in the core plotter, ideally under a better name.
    if self.univariate:
        assigned = {"x", "y"} & set(self.variables)
        return assigned.pop()
    raise AttributeError("This is not a univariate plot")
|
|
|
|
@property
def has_xy_data(self):
    """Return True if at least one of x or y is defined."""
    # TODO same open question as the other x/y helpers: this may belong in core.
    return not {"x", "y"}.isdisjoint(self.variables)
|
|
|
|
def _add_legend(
    self,
    ax_obj, artist, fill, element, multiple, alpha, artist_kws, legend_kws,
):
    """Create one proxy artist per hue level and show them in a legend.

    Parameters
    ----------
    ax_obj : object
        Legend target. A :class:`matplotlib.axes.Axes` receives a regular
        ``legend`` call; any other object is expected to offer ``add_legend``.
    artist : callable
        Factory invoked with the resolved keyword arguments for each handle.
    fill, element, multiple, alpha
        Passed through to :meth:`_artist_kws` to style each handle.
    artist_kws : dict
        Base keyword arguments for the handles.
    legend_kws : dict
        Extra keyword arguments for the legend call.
    """
    # TODO numeric hue mappings are not treated the way relational plots do
    handles, labels = [], []
    for level in self._hue_map.levels:
        handle_kws = self._artist_kws(
            artist_kws, fill, element, multiple, self._hue_map(level), alpha
        )

        # ``color`` is only injected to cooperate with bar()'s property
        # cycle; when properties are set directly on a proxy artist it gets
        # in the way of ``facecolor``, so discard it.
        if "facecolor" in handle_kws:
            handle_kws.pop("color", None)

        handles.append(artist(**handle_kws))
        labels.append(level)

    title = self.variables["hue"]

    if isinstance(ax_obj, mpl.axes.Axes):
        ax_obj.legend(handles, labels, title=title, **legend_kws)
        return

    # Otherwise, i.e. a FacetGrid. TODO make this better
    ax_obj.add_legend(
        dict(zip(labels, handles)),
        title=title,
        label_order=self.var_levels["hue"],
        **legend_kws
    )
|
|
|
|
def _artist_kws(self, kws, fill, element, multiple, color, alpha):
    """Return a copy of ``kws`` with colors set for a filled or unfilled artist.

    Filled artists get ``facecolor``/``edgecolor`` defaults (user values win);
    unfilled bars get a transparent face and a colored edge; every other
    unfilled artist just gets ``color``.
    """
    styled = kws.copy()
    is_bars = element == "bars"

    if not fill:
        rgba = to_rgba(color, alpha)
        if is_bars:
            styled["facecolor"] = "none"
            styled["edgecolor"] = rgba
        else:
            styled["color"] = rgba
        return styled

    styled = normalize_kwargs(styled, mpl.collections.PolyCollection)
    styled.setdefault("facecolor", to_rgba(color, alpha))

    if is_bars:
        # Make bar() interface with property cycle correctly
        # https://github.com/matplotlib/matplotlib/issues/19385
        styled["color"] = "none"

    # Stacked/filled areas and bars use the patch edge color from rcParams;
    # layered outlines take the opaque version of the fill color.
    if is_bars or multiple in ("stack", "fill"):
        default_edge = mpl.rcParams["patch.edgecolor"]
    else:
        default_edge = to_rgba(color, 1)
    styled.setdefault("edgecolor", default_edge)

    return styled
|
|
|
|
def _quantile_to_level(self, data, quantile):
|
|
"""Return data levels corresponding to quantile cuts of mass."""
|
|
isoprop = np.asarray(quantile)
|
|
values = np.ravel(data)
|
|
sorted_values = np.sort(values)[::-1]
|
|
normalized_values = np.cumsum(sorted_values) / values.sum()
|
|
idx = np.searchsorted(normalized_values, 1 - isoprop)
|
|
levels = np.take(sorted_values, idx, mode="clip")
|
|
return levels
|
|
|
|
def _cmap_from_color(self, color):
    """Build a sequential colormap seeded by a single color.

    The seed's hue is held fixed in HUSL space while saturation follows a
    cosine taper and lightness ramps linearly from 35 to 80; the resulting
    list of colors is reversed before it is wrapped in a ``ListedColormap``.
    """
    # Broadly useful, but kept on this class as a sign that it has not been
    # thought through carefully enough to promote elsewhere.
    n_colors = 256

    red, green, blue = to_rgba(color)[:3]
    hue, sat, _ = husl.rgb_to_husl(red, green, blue)

    taper = np.linspace(-1, 1, int(1.15 * n_colors))[:n_colors]
    ramp = np.column_stack([
        np.full(n_colors, hue),
        sat * np.cos(taper),
        np.linspace(35, 80, n_colors),
    ])

    rgb = np.clip([husl.husl_to_rgb(*hsl) for hsl in ramp], 0, 1)
    return mpl.colors.ListedColormap(rgb[::-1])
|
|
|
|
def _default_discrete(self):
|
|
"""Find default values for discrete hist estimation based on variable type."""
|
|
if self.univariate:
|
|
discrete = self.var_types[self.data_variable] == "categorical"
|
|
else:
|
|
discrete_x = self.var_types["x"] == "categorical"
|
|
discrete_y = self.var_types["y"] == "categorical"
|
|
discrete = discrete_x, discrete_y
|
|
return discrete
|
|
|
|
def _resolve_multiple(self, curves, multiple):
|
|
"""Modify the density data structure to handle multiple densities."""
|
|
|
|
# Default baselines have all densities starting at 0
|
|
baselines = {k: np.zeros_like(v) for k, v in curves.items()}
|
|
|
|
# TODO we should have some central clearinghouse for checking if any
|
|
# "grouping" (terminnology?) semantics have been assigned
|
|
if "hue" not in self.variables:
|
|
return curves, baselines
|
|
|
|
if multiple in ("stack", "fill"):
|
|
|
|
# Setting stack or fill means that the curves share a
|
|
# support grid / set of bin edges, so we can make a dataframe
|
|
# Reverse the column order to plot from top to bottom
|
|
curves = pd.DataFrame(curves).iloc[:, ::-1]
|
|
|
|
# Find column groups that are nested within col/row variables
|
|
column_groups = {}
|
|
for i, keyd in enumerate(map(dict, curves.columns)):
|
|
facet_key = keyd.get("col", None), keyd.get("row", None)
|
|
column_groups.setdefault(facet_key, [])
|
|
column_groups[facet_key].append(i)
|
|
|
|
baselines = curves.copy()
|
|
|
|
for col_idxs in column_groups.values():
|
|
cols = curves.columns[col_idxs]
|
|
|
|
norm_constant = curves[cols].sum(axis="columns")
|
|
|
|
# Take the cumulative sum to stack
|
|
curves[cols] = curves[cols].cumsum(axis="columns")
|
|
|
|
# Normalize by row sum to fill
|
|
if multiple == "fill":
|
|
curves[cols] = curves[cols].div(norm_constant, axis="index")
|
|
|
|
# Define where each segment starts
|
|
baselines[cols] = curves[cols].shift(1, axis=1).fillna(0)
|
|
|
|
if multiple == "dodge":
|
|
|
|
# Account for the unique semantic (non-faceting) levels
|
|
# This will require rethiniking if we add other semantics!
|
|
hue_levels = self.var_levels["hue"]
|
|
n = len(hue_levels)
|
|
f_fwd, f_inv = self._get_scale_transforms(self.data_variable)
|
|
for key in curves:
|
|
|
|
level = dict(key)["hue"]
|
|
hist = curves[key].reset_index(name="heights")
|
|
level_idx = hue_levels.index(level)
|
|
|
|
a = f_fwd(hist["edges"])
|
|
b = f_fwd(hist["edges"] + hist["widths"])
|
|
w = (b - a) / n
|
|
new_min = f_inv(a + level_idx * w)
|
|
new_max = f_inv(a + (level_idx + 1) * w)
|
|
hist["widths"] = new_max - new_min
|
|
hist["edges"] = new_min
|
|
|
|
curves[key] = hist.set_index(["edges", "widths"])["heights"]
|
|
|
|
return curves, baselines
|
|
|
|
# -------------------------------------------------------------------------------- #
|
|
# Computation
|
|
# -------------------------------------------------------------------------------- #
|
|
|
|
def _compute_univariate_density(
    self,
    data_variable,
    common_norm,
    common_grid,
    estimate_kws,
    warn_singular=True,
):
    """Estimate a kernel density for every hue/facet subset.

    Parameters
    ----------
    data_variable : str
        Name of the column holding the observations.
    common_norm : bool
        Scale each density by its subset's share of the total weight (or
        row count). Ignored when there are no variables besides x/y.
    common_grid : bool
        Define the estimator's support once from all observations.
    estimate_kws : dict
        Keyword arguments used to construct the ``KDE`` estimator.
    warn_singular : bool
        Emit a ``UserWarning`` for subsets whose density cannot be estimated.

    Returns
    -------
    dict
        Maps ``tuple(sub_vars.items())`` to a Series of density values indexed
        by the support. Subsets that could not be estimated are omitted.
    """
    kde = KDE(**estimate_kws)

    has_subsets = bool(set(self.variables) - {"x", "y"})
    if not has_subsets:
        # A single group has nothing to be normalized against
        common_norm = False
    elif common_grid:
        pooled = self.comp_data.dropna()
        kde.define_support(pooled[data_variable])

    # Denominator used when normalizing across subsets
    complete_rows = self.plot_data.dropna()
    if common_norm and "weights" in complete_rows:
        total_weight = complete_rows["weights"].sum()
    else:
        total_weight = len(complete_rows)

    results = {}

    for sub_vars, sub_data in self.iter_data("hue", from_comp_data=True):

        values = sub_data[data_variable]

        # Weights for this subset, and the subset's share of the total
        weights = sub_data["weights"] if "weights" in self.variables else None
        subset_weight = len(sub_data) if weights is None else weights.sum()

        # Degenerate input is detected up front where possible, but the
        # estimator can still fail numerically, so both routes set the flag.
        variance = np.nan_to_num(values.var())
        singular = len(values) < 2 or math.isclose(variance, 0)
        if not singular:
            try:
                density, support = kde(values, weights=weights)
            except np.linalg.LinAlgError:
                singular = True

        if singular:
            if warn_singular:
                msg = (
                    "Dataset has 0 variance; skipping density estimate. "
                    "Pass `warn_singular=False` to disable this warning."
                )
                warnings.warn(msg, UserWarning, stacklevel=4)
            continue

        # Map the support points back through the inverse scale transform
        _, f_inv = self._get_scale_transforms(self.data_variable)
        support = f_inv(support)

        # Scale so that the subsets jointly integrate to 1
        if common_norm:
            density *= subset_weight / total_weight

        results[tuple(sub_vars.items())] = pd.Series(density, index=support)

    return results
|
|
|
|
# -------------------------------------------------------------------------------- #
|
|
# Plotting
|
|
# -------------------------------------------------------------------------------- #
|
|
|
|
def plot_univariate_histogram(
    self,
    multiple,
    element,
    fill,
    common_norm,
    common_bins,
    shrink,
    kde,
    kde_kws,
    color,
    legend,
    line_kws,
    estimate_kws,
    **plot_kws,
):
    """Compute and draw a univariate histogram, optionally with KDE curves.

    Parameters
    ----------
    multiple : {"layer", "stack", "fill", "dodge"}
        How subsets sharing an axes are combined (see ``_resolve_multiple``).
    element : {"bars", "step", "poly"}
        Visual representation of the histogram.
    fill : bool
        Whether the artists are filled.
    common_norm : bool
        Scale each subset by its share of the total weight. Forced off when
        the statistic is "count" or when only x/y are assigned.
    common_bins : bool
        Derive one set of bin parameters from all of the data.
    shrink : number
        Factor applied to every bin width; each bar stays centered in its bin.
    kde : bool
        Also compute a density curve per subset and scale it to the histogram.
    kde_kws : dict or None
        Arguments for the density computation. ``cut`` defaults to 0 and
        ``cumulative`` is always taken from ``estimate_kws``.
    color : color
        Color used when there is no hue variable.
    legend : bool
        Whether to add a legend for the hue variable.
    line_kws : dict or None
        Arguments for drawing the density curve; ``color`` is overwritten.
    estimate_kws : dict
        Arguments used to construct the ``Hist`` estimator. The "stat" key is
        always read; "bins", "binwidth" and "discrete" are read when weights
        are assigned and "cumulative" when ``kde`` is true.
    **plot_kws
        Arguments for the histogram artists; ``alpha`` is consumed here.
    """
    # -- Default keyword dicts
    kde_kws = {} if kde_kws is None else kde_kws.copy()
    line_kws = {} if line_kws is None else line_kws.copy()
    estimate_kws = {} if estimate_kws is None else estimate_kws.copy()

    # -- Input checking
    _check_argument("multiple", ["layer", "stack", "fill", "dodge"], multiple)
    _check_argument("element", ["bars", "step", "poly"], element)

    auto_bins_with_weights = (
        "weights" in self.variables
        and estimate_kws["bins"] == "auto"
        and estimate_kws["binwidth"] is None
        and not estimate_kws["discrete"]
    )
    if auto_bins_with_weights:
        msg = (
            "`bins` cannot be 'auto' when using weights. "
            "Setting `bins=10`, but you will likely want to adjust."
        )
        warnings.warn(msg, UserWarning)
        estimate_kws["bins"] = 10

    # Simplify downstream code if we are not normalizing
    if estimate_kws["stat"] == "count":
        common_norm = False

    orient = self.data_variable

    # Now initialize the Histogram estimator
    estimator = Hist(**estimate_kws)
    histograms = {}

    # Do pre-compute housekeeping related to multiple groups
    all_data = self.comp_data.dropna()
    all_weights = all_data.get("weights", None)

    multiple_histograms = set(self.variables) - {"x", "y"}
    if multiple_histograms:
        if common_bins:
            bin_kws = estimator._define_bin_params(all_data, orient, None)
    else:
        common_norm = False

    if common_norm and all_weights is not None:
        whole_weight = all_weights.sum()
    else:
        whole_weight = len(all_data)

    # Estimate the smoothed kernel densities, for use later
    if kde:
        # TODO alternatively, clip at min/max bins?
        kde_kws.setdefault("cut", 0)
        kde_kws["cumulative"] = estimate_kws["cumulative"]
        densities = self._compute_univariate_density(
            self.data_variable,
            common_norm,
            common_bins,
            kde_kws,
            warn_singular=False,
        )

    # First pass through the data to compute the histograms
    for sub_vars, sub_data in self.iter_data("hue", from_comp_data=True):

        # Prepare the relevant data
        key = tuple(sub_vars.items())

        if "weights" in self.variables:
            # The estimator is handed the column under the name "weight"
            sub_data["weight"] = sub_data.pop("weights")
            part_weight = sub_data["weight"].sum()
        else:
            part_weight = len(sub_data)

        # Do the histogram computation
        if not (multiple_histograms and common_bins):
            bin_kws = estimator._define_bin_params(sub_data, orient, None)
        res = estimator._normalize(estimator._eval(sub_data, orient, bin_kws))
        heights = res[estimator.stat].to_numpy()
        widths = res["space"].to_numpy()
        # The estimator reports bin centers; convert to left edges
        edges = res[orient].to_numpy() - widths / 2

        # Rescale the smoothed curve to match the histogram
        if kde and key in densities:
            if estimator.cumulative:
                hist_norm = heights.max()
            else:
                hist_norm = (heights * widths).sum()
            densities[key] *= hist_norm

        # Convert edges back to original units for plotting
        ax = self._get_axes(sub_vars)
        _, inv = _get_transform_functions(ax, self.data_variable)
        widths = inv(edges + widths) - inv(edges)
        edges = inv(edges)

        # Pack the histogram data and metadata together
        edges = edges + (1 - shrink) / 2 * widths
        widths *= shrink
        index = pd.MultiIndex.from_arrays([
            pd.Index(edges, name="edges"),
            pd.Index(widths, name="widths"),
        ])
        hist = pd.Series(heights, index=index, name="heights")

        # Apply scaling to normalize across groups
        if common_norm:
            hist *= part_weight / whole_weight

        # Store the finalized histogram data for future plotting
        histograms[key] = hist

    # Modify the histogram and density data to resolve multiple groups
    histograms, baselines = self._resolve_multiple(histograms, multiple)
    if kde:
        densities, _ = self._resolve_multiple(
            densities, None if multiple == "dodge" else multiple
        )

    # Set autoscaling-related meta
    sticky_stat = (0, 1) if multiple == "fill" else (0, np.inf)
    sticky_data = []
    if multiple == "fill":
        # Filled plots should not have any margins
        if isinstance(histograms, pd.DataFrame):
            bin_vals = histograms.index.to_frame()
        elif histograms:
            # Without a hue variable the histograms are still a plain dict of
            # Series, which has no ``.index``; gather the bins of every entry.
            # The index is reset so that the ``idxmax`` lookup below is unique
            # even when facets share bins.
            bin_vals = pd.concat(
                [h.index.to_frame() for h in histograms.values()]
            ).reset_index(drop=True)
        else:
            bin_vals = None
        if bin_vals is not None:
            edges = bin_vals["edges"]
            widths = bin_vals["widths"]
            sticky_data = (
                edges.min(),
                edges.max() + widths.loc[edges.idxmax()]
            )

    # --- Handle default visual attributes

    # Note: default linewidth is determined after plotting

    # Default alpha should depend on other parameters
    if fill:
        # Note: will need to account for other grouping semantics if added
        if "hue" in self.variables and multiple == "layer":
            default_alpha = .5 if element == "bars" else .25
        elif kde:
            default_alpha = .5
        else:
            default_alpha = .75
    else:
        default_alpha = 1
    alpha = plot_kws.pop("alpha", default_alpha)  # TODO make parameter?

    hist_artists = []

    # Go back through the dataset and draw the plots
    for sub_vars, _ in self.iter_data("hue", reverse=True):

        key = tuple(sub_vars.items())
        hist = histograms[key].rename("heights").reset_index()
        bottom = np.asarray(baselines[key])

        ax = self._get_axes(sub_vars)

        # Define the matplotlib attributes that depend on semantic mapping
        if "hue" in self.variables:
            sub_color = self._hue_map(sub_vars["hue"])
        else:
            sub_color = color

        artist_kws = self._artist_kws(
            plot_kws, fill, element, multiple, sub_color, alpha
        )

        if element == "bars":

            # Use matplotlib bar plotting

            plot_func = ax.bar if self.data_variable == "x" else ax.barh
            artists = plot_func(
                hist["edges"],
                hist["heights"] - bottom,
                hist["widths"],
                bottom,
                align="edge",
                **artist_kws,
            )

            for bar in artists:
                if self.data_variable == "x":
                    bar.sticky_edges.x[:] = sticky_data
                    bar.sticky_edges.y[:] = sticky_stat
                else:
                    bar.sticky_edges.x[:] = sticky_stat
                    bar.sticky_edges.y[:] = sticky_data

            hist_artists.extend(artists)

        else:

            # Use either fill_between or plot to draw hull of histogram
            if element == "step":

                # Repeat the last bin so the final step spans its full width
                final = hist.iloc[-1]
                x = np.append(hist["edges"], final["edges"] + final["widths"])
                y = np.append(hist["heights"], final["heights"])
                b = np.append(bottom, bottom[-1])

                if self.data_variable == "x":
                    step = "post"
                    drawstyle = "steps-post"
                else:
                    step = "post"  # fillbetweenx handles mapping internally
                    drawstyle = "steps-pre"

            elif element == "poly":

                # Connect the bin midpoints
                x = hist["edges"] + hist["widths"] / 2
                y = hist["heights"]
                b = bottom

                step = None
                drawstyle = None

            if self.data_variable == "x":
                if fill:
                    artist = ax.fill_between(x, b, y, step=step, **artist_kws)
                else:
                    artist, = ax.plot(x, y, drawstyle=drawstyle, **artist_kws)
                artist.sticky_edges.x[:] = sticky_data
                artist.sticky_edges.y[:] = sticky_stat
            else:
                if fill:
                    artist = ax.fill_betweenx(x, b, y, step=step, **artist_kws)
                else:
                    artist, = ax.plot(y, x, drawstyle=drawstyle, **artist_kws)
                artist.sticky_edges.x[:] = sticky_stat
                artist.sticky_edges.y[:] = sticky_data

            hist_artists.append(artist)

        if kde:

            # Add in the density curves

            try:
                density = densities[key]
            except KeyError:
                # No density was estimated for this subset
                continue
            support = density.index

            if "x" in self.variables:
                line_args = support, density
                sticky_x, sticky_y = None, (0, np.inf)
            else:
                line_args = density, support
                sticky_x, sticky_y = (0, np.inf), None

            line_kws["color"] = to_rgba(sub_color, 1)
            line, = ax.plot(
                *line_args, **line_kws,
            )

            if sticky_x is not None:
                line.sticky_edges.x[:] = sticky_x
            if sticky_y is not None:
                line.sticky_edges.y[:] = sticky_y

    if element == "bars" and "linewidth" not in plot_kws:

        # Now we handle linewidth, which depends on the scaling of the plot

        # We will base everything on the minimum bin width
        hist_metadata = pd.concat([
            # Use .items for generality over dict or df
            h.index.to_frame() for _, h in histograms.items()
        ]).reset_index(drop=True)
        thin_bar_idx = hist_metadata["widths"].idxmin()
        binwidth = hist_metadata.loc[thin_bar_idx, "widths"]
        left_edge = hist_metadata.loc[thin_bar_idx, "edges"]

        # Set initial value
        default_linewidth = math.inf

        # Loop through subsets based only on facet variables
        for sub_vars, _ in self.iter_data():

            ax = self._get_axes(sub_vars)

            # Needed in some cases to get valid transforms.
            # Innocuous in other cases?
            ax.autoscale_view()

            # Convert binwidth from data coordinates to points
            pts_x, pts_y = 72 / ax.figure.dpi * abs(
                ax.transData.transform([left_edge + binwidth] * 2)
                - ax.transData.transform([left_edge] * 2)
            )
            if self.data_variable == "x":
                binwidth_points = pts_x
            else:
                binwidth_points = pts_y

            # The relative size of the lines depends on the appearance
            # This is a provisional value and may need more tweaking
            default_linewidth = min(.1 * binwidth_points, default_linewidth)

        # Set the attributes
        for bar in hist_artists:

            # Don't let the lines get too thick
            max_linewidth = bar.get_linewidth()
            if not fill:
                max_linewidth *= 1.5

            linewidth = min(default_linewidth, max_linewidth)

            # If not filling, don't let lines disappear
            if not fill:
                min_linewidth = .5
                linewidth = max(linewidth, min_linewidth)

            bar.set_linewidth(linewidth)

    # --- Finalize the plot ----

    # Axis labels
    ax = self.ax if self.ax is not None else self.facets.axes.flat[0]
    default_x = default_y = ""
    if self.data_variable == "x":
        default_y = estimator.stat.capitalize()
    if self.data_variable == "y":
        default_x = estimator.stat.capitalize()
    self._add_axis_labels(ax, default_x, default_y)

    # Legend for semantic variables
    if "hue" in self.variables and legend:

        if fill or element == "bars":
            artist = partial(mpl.patches.Patch)
        else:
            artist = partial(mpl.lines.Line2D, [], [])

        ax_obj = self.ax if self.ax is not None else self.facets
        self._add_legend(
            ax_obj, artist, fill, element, multiple, alpha, plot_kws, {},
        )
|
|
|
|
def plot_bivariate_histogram(
    self,
    common_bins, common_norm,
    thresh, pthresh, pmax,
    color, legend,
    cbar, cbar_ax, cbar_kws,
    estimate_kws,
    **plot_kws,
):
    """Compute and draw a bivariate histogram as one pcolormesh per subset.

    Parameters
    ----------
    common_bins : bool
        With grouping variables, define the bin parameters once from all data.
    common_norm : bool
        Scale each subset's heights by its share of the rows (skipped for the
        "count" statistic). Forced off when only x/y are assigned.
    thresh : number or None
        Cells with a height at or below this value are masked.
    pthresh : number or None
        Proportion converted to a ``thresh`` level via ``_quantile_to_level``.
    pmax : number or None
        Proportion converted to the colormap's ``vmax`` the same way.
    color : color or None
        Seed for the default colormap; "C0" when None.
    legend : bool
        Whether to add a legend for the hue variable.
    cbar : bool
        Whether to add a colorbar for each mesh.
    cbar_ax : Axes or None
        Passed to ``Figure.colorbar`` as the colorbar axes.
    cbar_kws : dict or None
        Extra keyword arguments for ``Figure.colorbar``.
    estimate_kws : dict
        Keyword arguments used to construct the ``Histogram`` estimator.
    **plot_kws
        Keyword arguments for ``Axes.pcolormesh``; ``vmin`` defaults to 0.
    """
    # Default keyword dicts
    cbar_kws = {} if cbar_kws is None else cbar_kws.copy()

    # Now initialize the Histogram estimator
    estimator = Histogram(**estimate_kws)

    # Do pre-compute housekeeping related to multiple groups
    if set(self.variables) - {"x", "y"}:
        all_data = self.comp_data.dropna()
        if common_bins:
            estimator.define_bin_params(
                all_data["x"],
                all_data["y"],
                all_data.get("weights", None),
            )
    else:
        # ``all_data`` is only defined in the branch above; turning off
        # common_norm here keeps the later normalization from reading it.
        common_norm = False

    # -- Determine colormap threshold and norm based on the full data

    full_heights = []
    for _, sub_data in self.iter_data(from_comp_data=True):
        sub_heights, _ = estimator(
            sub_data["x"], sub_data["y"], sub_data.get("weights", None)
        )
        full_heights.append(sub_heights)

    # One shared color scale when there are no subsets or they share a norm
    common_color_norm = not set(self.variables) - {"x", "y"} or common_norm

    if pthresh is not None and common_color_norm:
        thresh = self._quantile_to_level(full_heights, pthresh)

    plot_kws.setdefault("vmin", 0)
    if common_color_norm:
        if pmax is not None:
            vmax = self._quantile_to_level(full_heights, pmax)
        else:
            vmax = plot_kws.pop("vmax", max(map(np.max, full_heights)))
    else:
        # Resolved per subset inside the loop (or left to matplotlib)
        vmax = None

    # Get a default color
    # (We won't follow the color cycle here, as multiple plots are unlikely)
    if color is None:
        color = "C0"

    # --- Loop over data (subsets) and draw the histograms
    for sub_vars, sub_data in self.iter_data("hue", from_comp_data=True):

        if sub_data.empty:
            continue

        # Do the histogram computation
        heights, (x_edges, y_edges) = estimator(
            sub_data["x"],
            sub_data["y"],
            weights=sub_data.get("weights", None),
        )

        # Get the axes for this plot
        ax = self._get_axes(sub_vars)

        # Invert the scale for the edges
        _, inv_x = _get_transform_functions(ax, "x")
        _, inv_y = _get_transform_functions(ax, "y")
        x_edges = inv_x(x_edges)
        y_edges = inv_y(y_edges)

        # Apply scaling to normalize across groups
        if estimator.stat != "count" and common_norm:
            heights *= len(sub_data) / len(all_data)

        # Define the specific kwargs for this artist
        artist_kws = plot_kws.copy()
        if "hue" in self.variables:
            # Each hue level gets its own sequential colormap
            color = self._hue_map(sub_vars["hue"])
            cmap = self._cmap_from_color(color)
            artist_kws["cmap"] = cmap
        else:
            # Accept a colormap name, a colormap object, or fall back to one
            # seeded by ``color``
            cmap = artist_kws.pop("cmap", None)
            if isinstance(cmap, str):
                cmap = color_palette(cmap, as_cmap=True)
            elif cmap is None:
                cmap = self._cmap_from_color(color)
        artist_kws["cmap"] = cmap

        # Set the upper norm on the colormap
        if not common_color_norm and pmax is not None:
            vmax = self._quantile_to_level(heights, pmax)
        if vmax is not None:
            artist_kws["vmax"] = vmax

        # Make cells at or below the threshold transparent
        # NOTE(review): this branch tests ``pthresh`` for truthiness while the
        # shared-norm branch above tests ``is not None``, so ``pthresh=0`` is
        # only honored there -- confirm whether that difference is intended.
        if not common_color_norm and pthresh:
            thresh = self._quantile_to_level(heights, pthresh)
        if thresh is not None:
            heights = np.ma.masked_less_equal(heights, thresh)

        # pcolormesh is going to turn the grid off, but we want to keep it
        # I'm not sure if there's a better way to get the grid state
        x_grid = any([l.get_visible() for l in ax.xaxis.get_gridlines()])
        y_grid = any([l.get_visible() for l in ax.yaxis.get_gridlines()])

        mesh = ax.pcolormesh(
            x_edges,
            y_edges,
            heights.T,
            **artist_kws,
        )

        # pcolormesh sets sticky edges, but we only want them if not thresholding
        if thresh is not None:
            mesh.sticky_edges.x[:] = []
            mesh.sticky_edges.y[:] = []

        # Add an optional colorbar
        # Note, we want to improve this. When hue is used, it will stack
        # multiple colorbars with redundant ticks in an ugly way.
        # But it's going to take some work to have multiple colorbars that
        # share ticks nicely.
        if cbar:
            ax.figure.colorbar(mesh, cbar_ax, ax, **cbar_kws)

        # Reset the grid state
        if x_grid:
            ax.grid(True, axis="x")
        if y_grid:
            ax.grid(True, axis="y")

    # --- Finalize the plot

    ax = self.ax if self.ax is not None else self.facets.axes.flat[0]
    self._add_axis_labels(ax)

    if "hue" in self.variables and legend:

        # TODO if possible, I would like to move the contour
        # intensity information into the legend too and label the
        # iso proportions rather than the raw density values

        artist_kws = {}
        artist = partial(mpl.patches.Patch)
        ax_obj = self.ax if self.ax is not None else self.facets
        self._add_legend(
            ax_obj, artist, True, False, "layer", 1, artist_kws, {},
        )
|
|
|
|
def plot_univariate_density(
    self,
    multiple,
    common_norm,
    common_grid,
    warn_singular,
    fill,
    color,
    legend,
    estimate_kws,
    **plot_kws,
):
    """Compute and draw a univariate kernel density estimate per subset.

    Parameters
    ----------
    multiple : {"layer", "stack", "fill"}
        How subsets sharing an axes are combined (see ``_resolve_multiple``).
    common_norm : bool
        Passed to ``_compute_univariate_density``.
    common_grid : bool
        Share the evaluation grid across subsets. Forced on when there are
        grouping variables and ``multiple`` is "stack" or "fill".
    warn_singular : bool
        Warn about subsets whose density cannot be estimated.
    fill : bool or None
        Draw filled areas instead of lines. ``None`` means fill only when
        ``multiple`` is "stack" or "fill".
    color : color
        Color used when there is no hue variable.
    legend : bool
        Whether to add a legend for the hue variable.
    estimate_kws : dict
        Keyword arguments for the density estimator.
    **plot_kws
        Keyword arguments for the artists; ``alpha`` is consumed here.
    """
    # Handle conditional defaults
    if fill is None:
        fill = multiple in ("stack", "fill")

    # Preprocess the matplotlib keyword dictionaries
    if fill:
        artist = mpl.collections.PolyCollection
    else:
        artist = mpl.lines.Line2D
    plot_kws = normalize_kwargs(plot_kws, artist)

    # Input checking
    _check_argument("multiple", ["layer", "stack", "fill"], multiple)

    # Always share the evaluation grid when stacking
    subsets = bool(set(self.variables) - {"x", "y"})
    if subsets and multiple in ("stack", "fill"):
        common_grid = True

    # Do the computation
    densities = self._compute_univariate_density(
        self.data_variable,
        common_norm,
        common_grid,
        estimate_kws,
        warn_singular,
    )

    # Adjust densities based on the `multiple` rule
    densities, baselines = self._resolve_multiple(densities, multiple)

    # Control the interaction with autoscaling by defining sticky_edges
    # i.e. we don't want autoscale margins below the density curve
    sticky_density = (0, 1) if multiple == "fill" else (0, np.inf)

    sticky_support = []
    if multiple == "fill":
        # Filled plots should not have any margins
        if isinstance(densities, pd.DataFrame):
            sticky_support = densities.index.min(), densities.index.max()
        elif densities:
            # Without a hue variable the densities are still a plain dict of
            # Series, which has no ``.index``; span the supports of all entries.
            supports = [density.index for density in densities.values()]
            sticky_support = (
                min(support.min() for support in supports),
                max(support.max() for support in supports),
            )

    if fill:
        if multiple == "layer":
            default_alpha = .25
        else:
            default_alpha = .75
    else:
        default_alpha = 1
    alpha = plot_kws.pop("alpha", default_alpha)  # TODO make parameter?

    # Now iterate through the subsets and draw the densities
    # We go backwards so stacked densities read from top-to-bottom
    for sub_vars, _ in self.iter_data("hue", reverse=True):

        # Extract the support grid and density curve for this level
        key = tuple(sub_vars.items())
        try:
            density = densities[key]
        except KeyError:
            # No density was estimated for this subset
            continue
        support = density.index
        fill_from = baselines[key]

        ax = self._get_axes(sub_vars)

        if "hue" in self.variables:
            sub_color = self._hue_map(sub_vars["hue"])
        else:
            sub_color = color

        artist_kws = self._artist_kws(
            plot_kws, fill, False, multiple, sub_color, alpha
        )

        # Either plot a curve with observation values on the x axis
        if "x" in self.variables:

            if fill:
                artist = ax.fill_between(support, fill_from, density, **artist_kws)

            else:
                artist, = ax.plot(support, density, **artist_kws)

            artist.sticky_edges.x[:] = sticky_support
            artist.sticky_edges.y[:] = sticky_density

        # Or plot a curve with observation values on the y axis
        else:
            if fill:
                artist = ax.fill_betweenx(support, fill_from, density, **artist_kws)
            else:
                artist, = ax.plot(density, support, **artist_kws)

            artist.sticky_edges.x[:] = sticky_density
            artist.sticky_edges.y[:] = sticky_support

    # --- Finalize the plot ----

    ax = self.ax if self.ax is not None else self.facets.axes.flat[0]
    default_x = default_y = ""
    if self.data_variable == "x":
        default_y = "Density"
    if self.data_variable == "y":
        default_x = "Density"
    self._add_axis_labels(ax, default_x, default_y)

    if "hue" in self.variables and legend:

        if fill:
            artist = partial(mpl.patches.Patch)
        else:
            artist = partial(mpl.lines.Line2D, [], [])

        ax_obj = self.ax if self.ax is not None else self.facets
        self._add_legend(
            ax_obj, artist, fill, False, multiple, alpha, plot_kws, {},
        )
|
|
|
|
def plot_bivariate_density(
    self,
    common_norm,
    fill,
    levels,
    thresh,
    color,
    legend,
    cbar,
    warn_singular,
    cbar_ax,
    cbar_kws,
    estimate_kws,
    **contour_kws,
):
    """Estimate a bivariate KDE for each hue level and draw it as contours.

    Parameters
    ----------
    common_norm : bool
        If true, scale each subset's density by its share of the rows.
        Forced off when only ``x`` and ``y`` are assigned.
    fill : bool
        Draw with ``Axes.contourf`` when true, otherwise ``Axes.contour``.
    levels : number or sequence of numbers
        Number of iso-proportion levels, or the levels themselves.
    thresh : number or None
        Lowest iso-proportion level when ``levels`` is a number (None -> 0).
    color : color
        Base color when there is no hue mapping and no ``cmap``/``colors``.
    legend : bool
        Add a legend when a hue variable is assigned.
    cbar : bool
        Add a colorbar for each contour set that is drawn.
    warn_singular : bool
        Warn when the density of a subset cannot be estimated.
    cbar_ax : Axes or None
        Passed to ``Figure.colorbar`` as the axes to draw the colorbar in.
    cbar_kws : dict or None
        Extra keyword arguments for ``Figure.colorbar``.
    estimate_kws : dict
        Keyword arguments used to construct the ``KDE`` estimator.
    **contour_kws
        Extra keyword arguments for the contour function.

    Raises
    ------
    ValueError
        If a vector of ``levels`` has a value outside [0, 1].

    """
    contour_kws = contour_kws.copy()
    kde = KDE(**estimate_kws)

    # Nothing to normalize against when only x and y are assigned
    if not (set(self.variables) - {"x", "y"}):
        common_norm = False

    n_total = len(self.plot_data.dropna())

    # First pass: estimate one density (and its support grid) per subset
    estimated, grids = {}, {}
    for sub_vars, sub_data in self.iter_data("hue", from_comp_data=True):

        xy = sub_data[["x", "y"]]
        smallest_variance = xy.var().fillna(0).min()
        weights = sub_data["weights"] if "weights" in self.variables else None

        # A zero-variance check does not catch every case where the
        # estimator fails, so a LinAlgError is also treated as singular
        degenerate = math.isclose(smallest_variance, 0)
        if not degenerate:
            try:
                density, support = kde(xy["x"], xy["y"], weights=weights)
            except np.linalg.LinAlgError:
                degenerate = True

        if degenerate:
            if warn_singular:
                msg = (
                    "KDE cannot be estimated (0 variance or perfect covariance). "
                    "Pass `warn_singular=False` to disable this warning."
                )
                warnings.warn(msg, UserWarning, stacklevel=3)
            continue

        # Map the support grid back through the inverse axis transforms
        ax = self._get_axes(sub_vars)
        inv_x = _get_transform_functions(ax, "x")[1]
        inv_y = _get_transform_functions(ax, "y")[1]

        # Scale so that the integral over all subsets is 1
        if common_norm:
            density *= len(sub_data) / n_total

        level_key = tuple(sub_vars.items())
        estimated[level_key] = density
        grids[level_key] = inv_x(support[0]), inv_y(support[1])

    # Build the grid of iso-proportion levels
    if thresh is None:
        thresh = 0
    if isinstance(levels, Number):
        levels = np.linspace(thresh, 1, levels)
    elif min(levels) < 0 or max(levels) > 1:
        raise ValueError("levels must be in [0, 1]")

    # Convert iso-proportions into iso-densities
    if common_norm:
        shared_levels = self._quantile_to_level(
            list(estimated.values()), levels,
        )
        draw_levels = dict.fromkeys(estimated, shared_levels)
    else:
        draw_levels = {
            level_key: self._quantile_to_level(density, levels)
            for level_key, density in estimated.items()
        }

    # Decide how the contours are colored
    if "hue" in self.variables:
        for param in ("cmap", "colors"):
            if param in contour_kws:
                msg = f"{param} parameter ignored when using hue mapping."
                warnings.warn(msg, UserWarning)
                contour_kws.pop(param)
    else:

        # Fall back on a coloring derived from ``color``
        if not (set(contour_kws) & {"cmap", "colors"}):
            if fill:
                contour_kws["cmap"] = self._cmap_from_color(color)
            else:
                contour_kws["colors"] = [color]

        # Resolve named colormaps through the internal palette lookup
        cmap = contour_kws.pop("cmap", None)
        if isinstance(cmap, str):
            cmap = color_palette(cmap, as_cmap=True)
        if cmap is not None:
            contour_kws["cmap"] = cmap

    # The label kwarg is unused by the contour functions (but warns)
    contour_kws.pop("label", None)
    colorbar_kws = {} if cbar_kws is None else cbar_kws

    # Second pass: draw each estimated density
    for sub_vars, _ in self.iter_data("hue"):

        if "hue" in sub_vars:
            color = self._hue_map(sub_vars["hue"])
            if fill:
                contour_kws["cmap"] = self._cmap_from_color(color)
            else:
                contour_kws["colors"] = [color]

        ax = self._get_axes(sub_vars)

        # TODO could add a pcolormesh based option as well
        # Which would look something like element="raster"
        draw_contours = ax.contourf if fill else ax.contour

        level_key = tuple(sub_vars.items())
        if level_key not in estimated:
            continue
        xx, yy = grids[level_key]

        cset = draw_contours(
            xx, yy, estimated[level_key],
            levels=draw_levels[level_key],
            **contour_kws,
        )

        # Note: the colorbar shows iso densities, not iso proportions
        # See more notes in histplot about how this could be improved
        if cbar:
            ax.figure.colorbar(cset, cbar_ax, ax, **colorbar_kws)

    # --- Finalize the plot
    ax = self.ax if self.ax is not None else self.facets.axes.flat[0]
    self._add_axis_labels(ax)

    if "hue" in self.variables and legend:

        # TODO if possible, I would like to move the contour
        # intensity information into the legend too and label the
        # iso proportions rather than the raw density values

        if fill:
            legend_artist = partial(mpl.patches.Patch)
        else:
            legend_artist = partial(mpl.lines.Line2D, [], [])

        legend_target = self.ax if self.ax is not None else self.facets
        self._add_legend(
            legend_target, legend_artist, fill, False, "layer", 1, {}, {},
        )
|
|
|
|
def plot_univariate_ecdf(self, estimate_kws, legend, **plot_kws):
    """Compute and draw one ECDF step curve per hue level.

    Parameters
    ----------
    estimate_kws : dict
        Keyword arguments used to construct the ``ECDF`` estimator.
    legend : bool
        Add a legend when a hue variable is assigned.
    **plot_kws
        Extra keyword arguments for ``Axes.plot``. ``drawstyle`` is always
        overridden so the steps point the right way for the data variable.

    """
    estimator = ECDF(**estimate_kws)

    # Set the draw style to step the right way for the data variable
    drawstyles = dict(x="steps-post", y="steps-pre")
    plot_kws["drawstyle"] = drawstyles[self.data_variable]

    # Loop through the subsets, transform and plot the data
    for sub_vars, sub_data in self.iter_data(
        "hue", reverse=True, from_comp_data=True,
    ):

        # Compute the ECDF
        if sub_data.empty:
            continue

        observations = sub_data[self.data_variable]
        weights = sub_data.get("weights", None)
        stat, vals = estimator(observations, weights=weights)

        # Assign attributes based on semantic mapping
        artist_kws = plot_kws.copy()
        if "hue" in self.variables:
            artist_kws["color"] = self._hue_map(sub_vars["hue"])

        # Return the data variable to the linear domain
        ax = self._get_axes(sub_vars)
        _, inv = _get_transform_functions(ax, self.data_variable)
        vals = inv(vals)

        # Manually set the minimum value on a "log" scale
        # (relies on ``inv`` being a bound method of the transform object)
        if isinstance(inv.__self__, mpl.scale.LogTransform):
            vals[0] = -np.inf

        # Work out the orientation of the plot
        if self.data_variable == "x":
            plot_args = vals, stat
            stat_variable = "y"
        else:
            plot_args = stat, vals
            stat_variable = "x"

        # The sticky edge must sit exactly on the top of the drawn curve,
        # otherwise the axis margin is not suppressed. A count curve tops
        # out at its own largest value (which can differ from the number
        # of observations when weights are used) and a percent curve at
        # 100 rather than 1.
        if estimator.stat == "count":
            top_edge = np.max(stat)
        elif estimator.stat == "percent":
            top_edge = 100
        else:
            top_edge = 1

        # Draw the line for this subset
        artist, = ax.plot(*plot_args, **artist_kws)
        sticky_edges = getattr(artist.sticky_edges, stat_variable)
        sticky_edges[:] = 0, top_edge

    # --- Finalize the plot ----
    ax = self.ax if self.ax is not None else self.facets.axes.flat[0]
    stat = estimator.stat.capitalize()
    default_x = default_y = ""
    if self.data_variable == "x":
        default_y = stat
    if self.data_variable == "y":
        default_x = stat
    self._add_axis_labels(ax, default_x, default_y)

    if "hue" in self.variables and legend:
        artist = partial(mpl.lines.Line2D, [], [])
        alpha = plot_kws.get("alpha", 1)
        ax_obj = self.ax if self.ax is not None else self.facets
        self._add_legend(
            ax_obj, artist, False, False, None, alpha, plot_kws, {},
        )
|
|
|
|
def plot_rug(self, height, expand_margins, legend, **kws):
    """Draw rug ticks for every assigned positional variable.

    Parameters
    ----------
    height : number
        Tick length handed to ``_plot_single_rug``; twice this value is
        added to the orthogonal axis margin when ``expand_margins`` is set.
    expand_margins : bool
        Grow the axes margins to make room for the ticks.
    legend : bool
        Add a legend when a hue variable is assigned.
    **kws
        Extra keyword arguments for the ``LineCollection`` of ticks.

    """
    for sub_vars, sub_data, in self.iter_data(from_comp_data=True):

        ax = self._get_axes(sub_vars)
        kws.setdefault("linewidth", 1)

        if expand_margins:
            extra = height * 2
            xmarg, ymarg = ax.margins()
            # Ticks along x eat into the y margin and vice versa
            if "x" in self.variables:
                ymarg = ymarg + extra
            if "y" in self.variables:
                xmarg = xmarg + extra
            ax.margins(x=xmarg, y=ymarg)

        # Colors come from the hue mapping, so drop any explicit request
        if "hue" in self.variables:
            for color_key in ("c", "color"):
                kws.pop(color_key, None)

        for axis_name in ("x", "y"):
            if axis_name in self.variables:
                self._plot_single_rug(sub_data, axis_name, height, ax, kws)

    # --- Finalize the plot
    self._add_axis_labels(ax)
    if not ("hue" in self.variables and legend):
        return

    # TODO ideally i'd like the legend artist to look like a rug
    self._add_legend(
        ax, partial(mpl.lines.Line2D, [], []), False, False, None, 1, {}, {},
    )
|
|
|
|
def _plot_single_rug(self, sub_data, var, height, ax, kws):
    """Add the rug ticks for one variable as a single line collection.

    Each observation becomes a segment running from 0 to ``height`` in axes
    coordinates, positioned at the observation's value in data coordinates
    along ``var``.
    """
    raw = sub_data[var]
    n_ticks = len(raw)

    # Return data to linear domain
    inverse = _get_transform_functions(ax, var)[1]
    positions = inverse(raw)

    # One collection is always added; hue only varies its colors
    colors = None
    if "hue" in self.variables:
        colors = self._hue_map(sub_data["hue"])

    # Lay out (x, y) endpoints: two rows per tick
    if var == "x":
        trans = tx.blended_transform_factory(ax.transData, ax.transAxes)
        columns = [np.repeat(positions, 2), np.tile([0, height], n_ticks)]
        xy_pairs = np.column_stack(columns)
    elif var == "y":
        trans = tx.blended_transform_factory(ax.transAxes, ax.transData)
        columns = [np.tile([0, height], n_ticks), np.repeat(positions, 2)]
        xy_pairs = np.column_stack(columns)

    # Draw the lines on the plot
    segments = xy_pairs.reshape([n_ticks, 2, 2])
    rug = LineCollection(segments, transform=trans, colors=colors, **kws)
    ax.add_collection(rug)

    # Only rescale the axis that carries the data
    ax.autoscale_view(scalex=var == "x", scaley=var == "y")
|
|
|
|
|
|
# ==================================================================================== #
|
|
# External API
|
|
# ==================================================================================== #
|
|
|
|
def histplot(
    data=None, *,
    # Vector variables
    x=None, y=None, hue=None, weights=None,
    # Histogram computation parameters
    stat="count", bins="auto", binwidth=None, binrange=None,
    discrete=None, cumulative=False, common_bins=True, common_norm=True,
    # Histogram appearance parameters
    multiple="layer", element="bars", fill=True, shrink=1,
    # Histogram smoothing with a kernel density estimate
    kde=False, kde_kws=None, line_kws=None,
    # Bivariate histogram parameters
    thresh=0, pthresh=None, pmax=None, cbar=False, cbar_ax=None, cbar_kws=None,
    # Hue mapping parameters
    palette=None, hue_order=None, hue_norm=None, color=None,
    # Axes information
    log_scale=None, legend=True, ax=None,
    # Other appearance keywords
    **kwargs,
):
    # NOTE: the public docstring is attached after the definition through
    # ``histplot.__doc__``.

    plotter = _DistributionPlotter(
        data=data,
        variables=dict(x=x, y=y, hue=hue, weights=weights),
    )
    plotter.map_hue(palette=palette, order=hue_order, norm=hue_norm)

    if ax is None:
        ax = plt.gca()
    plotter._attach(ax, log_scale=log_scale)

    # Pick the default color from the matplotlib method that will draw the
    # artists. Note, bivariate plots won't cycle
    if plotter.univariate:
        if not fill:
            method = ax.plot
        elif element == "bars":
            method = ax.bar
        else:
            method = ax.fill_between
        color = _default_color(method, hue, color, kwargs)

    if not plotter.has_xy_data:
        return ax

    # Default to discrete bins for categorical variables
    if discrete is None:
        discrete = plotter._default_discrete()

    estimate_kws = dict(
        stat=stat,
        bins=bins,
        binwidth=binwidth,
        binrange=binrange,
        discrete=discrete,
        cumulative=cumulative,
    )

    # Arguments understood by both the univariate and bivariate plotters
    shared_kws = dict(
        common_bins=common_bins,
        common_norm=common_norm,
        color=color,
        legend=legend,
        estimate_kws=estimate_kws,
    )

    if plotter.univariate:
        plotter.plot_univariate_histogram(
            multiple=multiple,
            element=element,
            fill=fill,
            shrink=shrink,
            kde=kde,
            kde_kws=kde_kws,
            line_kws=line_kws,
            **shared_kws,
            **kwargs,
        )
    else:
        plotter.plot_bivariate_histogram(
            thresh=thresh,
            pthresh=pthresh,
            pmax=pmax,
            cbar=cbar,
            cbar_ax=cbar_ax,
            cbar_kws=cbar_kws,
            **shared_kws,
            **kwargs,
        )

    return ax
|
|
|
|
|
|
histplot.__doc__ = """\
|
|
Plot univariate or bivariate histograms to show distributions of datasets.
|
|
|
|
A histogram is a classic visualization tool that represents the distribution
|
|
of one or more variables by counting the number of observations that fall within
|
|
discrete bins.
|
|
|
|
This function can normalize the statistic computed within each bin to estimate
|
|
frequency, density or probability mass, and it can add a smooth curve obtained
|
|
using a kernel density estimate, similar to :func:`kdeplot`.
|
|
|
|
More information is provided in the :ref:`user guide <tutorial_hist>`.
|
|
|
|
Parameters
|
|
----------
|
|
{params.core.data}
|
|
{params.core.xy}
|
|
{params.core.hue}
|
|
weights : vector or key in ``data``
|
|
If provided, weight the contribution of the corresponding data points
|
|
towards the count in each bin by these factors.
|
|
{params.hist.stat}
|
|
{params.hist.bins}
|
|
{params.hist.binwidth}
|
|
{params.hist.binrange}
|
|
discrete : bool
|
|
If True, default to ``binwidth=1`` and draw the bars so that they are
|
|
centered on their corresponding data points. This avoids "gaps" that may
|
|
otherwise appear when using discrete (integer) data.
|
|
cumulative : bool
|
|
If True, plot the cumulative counts as bins increase.
|
|
common_bins : bool
|
|
If True, use the same bins when semantic variables produce multiple
|
|
plots. If using a reference rule to determine the bins, it will be computed
|
|
with the full dataset.
|
|
common_norm : bool
|
|
If True and using a normalized statistic, the normalization will apply over
|
|
the full dataset. Otherwise, normalize each histogram independently.
|
|
multiple : {{"layer", "dodge", "stack", "fill"}}
|
|
Approach to resolving multiple elements when semantic mapping creates subsets.
|
|
Only relevant with univariate data.
|
|
element : {{"bars", "step", "poly"}}
|
|
Visual representation of the histogram statistic.
|
|
Only relevant with univariate data.
|
|
fill : bool
|
|
If True, fill in the space under the histogram.
|
|
Only relevant with univariate data.
|
|
shrink : number
|
|
Scale the width of each bar relative to the binwidth by this factor.
|
|
Only relevant with univariate data.
|
|
kde : bool
|
|
If True, compute a kernel density estimate to smooth the distribution
|
|
and show on the plot as (one or more) line(s).
|
|
Only relevant with univariate data.
|
|
kde_kws : dict
|
|
Parameters that control the KDE computation, as in :func:`kdeplot`.
|
|
line_kws : dict
|
|
Parameters that control the KDE visualization, passed to
|
|
:meth:`matplotlib.axes.Axes.plot`.
|
|
thresh : number or None
|
|
Cells with a statistic less than or equal to this value will be transparent.
|
|
Only relevant with bivariate data.
|
|
pthresh : number or None
|
|
Like ``thresh``, but a value in [0, 1] such that cells with aggregate counts
|
|
(or other statistics, when used) up to this proportion of the total will be
|
|
transparent.
|
|
pmax : number or None
|
|
A value in [0, 1] that sets the saturation point for the colormap at a value
|
|
such that cells below constitute this proportion of the total count (or
|
|
other statistic, when used).
|
|
{params.dist.cbar}
|
|
{params.dist.cbar_ax}
|
|
{params.dist.cbar_kws}
|
|
{params.core.palette}
|
|
{params.core.hue_order}
|
|
{params.core.hue_norm}
|
|
{params.core.color}
|
|
{params.dist.log_scale}
|
|
{params.dist.legend}
|
|
{params.core.ax}
|
|
kwargs
|
|
Other keyword arguments are passed to one of the following matplotlib
|
|
functions:
|
|
|
|
- :meth:`matplotlib.axes.Axes.bar` (univariate, element="bars")
|
|
- :meth:`matplotlib.axes.Axes.fill_between` (univariate, other element, fill=True)
|
|
- :meth:`matplotlib.axes.Axes.plot` (univariate, other element, fill=False)
|
|
- :meth:`matplotlib.axes.Axes.pcolormesh` (bivariate)
|
|
|
|
Returns
|
|
-------
|
|
{returns.ax}
|
|
|
|
See Also
|
|
--------
|
|
{seealso.displot}
|
|
{seealso.kdeplot}
|
|
{seealso.rugplot}
|
|
{seealso.ecdfplot}
|
|
{seealso.jointplot}
|
|
|
|
Notes
|
|
-----
|
|
|
|
The choice of bins for computing and plotting a histogram can exert
|
|
substantial influence on the insights that one is able to draw from the
|
|
visualization. If the bins are too large, they may erase important features.
|
|
On the other hand, bins that are too small may be dominated by random
|
|
variability, obscuring the shape of the true underlying distribution. The
|
|
default bin size is determined using a reference rule that depends on the
|
|
sample size and variance. This works well in many cases, (i.e., with
|
|
"well-behaved" data) but it fails in others. It is always a good idea to try
|
|
different bin sizes to be sure that you are not missing something important.
|
|
This function allows you to specify bins in several different ways, such as
|
|
by setting the total number of bins to use, the width of each bin, or the
|
|
specific locations where the bins should break.
|
|
|
|
Examples
|
|
--------
|
|
|
|
.. include:: ../docstrings/histplot.rst
|
|
|
|
""".format(
|
|
params=_param_docs,
|
|
returns=_core_docs["returns"],
|
|
seealso=_core_docs["seealso"],
|
|
)
|
|
|
|
|
|
def kdeplot(
    data=None, *, x=None, y=None, hue=None, weights=None,
    palette=None, hue_order=None, hue_norm=None, color=None, fill=None,
    multiple="layer", common_norm=True, common_grid=False, cumulative=False,
    bw_method="scott", bw_adjust=1, warn_singular=True, log_scale=None,
    levels=10, thresh=.05, gridsize=200, cut=3, clip=None,
    legend=True, cbar=False, cbar_ax=None, cbar_kws=None, ax=None,
    **kwargs,
):
    # NOTE: the public docstring is attached after the definition through
    # ``kdeplot.__doc__``.

    # --- Start with backwards compatibility for versions < 0.11.0 ----------------

    # Handle (past) deprecation of `data2`
    if "data2" in kwargs:
        msg = "`data2` has been removed (replaced by `y`); please update your code."
        raise TypeError(msg)

    # Handle deprecation of `vertical`
    vertical = kwargs.pop("vertical", None)
    if vertical is not None:
        if vertical:
            action_taken = "assigning data to `y`."
            # Move whichever input currently holds the data onto `y`
            if x is None:
                data, y = y, data
            else:
                x, y = y, x
        else:
            action_taken = "assigning data to `x`."
        msg = textwrap.dedent(f"""\n
        The `vertical` parameter is deprecated; {action_taken}
        This will become an error in seaborn v0.14.0; please update your code.
        """)
        warnings.warn(msg, UserWarning, stacklevel=2)

    # Handle deprecation of `bw`
    bw = kwargs.pop("bw", None)
    if bw is not None:
        msg = textwrap.dedent(f"""\n
        The `bw` parameter is deprecated in favor of `bw_method` and `bw_adjust`.
        Setting `bw_method={bw}`, but please see the docs for the new parameters
        and update your code. This will become an error in seaborn v0.14.0.
        """)
        warnings.warn(msg, UserWarning, stacklevel=2)
        bw_method = bw

    # Handle deprecation of `kernel`
    if kwargs.pop("kernel", None) is not None:
        msg = textwrap.dedent("""\n
        Support for alternate kernels has been removed; using Gaussian kernel.
        This will become an error in seaborn v0.14.0; please update your code.
        """)
        warnings.warn(msg, UserWarning, stacklevel=2)

    # Handle deprecation of shade_lowest
    shade_lowest = kwargs.pop("shade_lowest", None)
    if shade_lowest is not None:
        if shade_lowest:
            thresh = 0
        msg = textwrap.dedent(f"""\n
        `shade_lowest` has been replaced by `thresh`; setting `thresh={thresh}`.
        This will become an error in seaborn v0.14.0; please update your code.
        """)
        warnings.warn(msg, UserWarning, stacklevel=2)

    # Handle "soft" deprecation of shade. `shade` is not really the right
    # terminology here, but unlike some of the other deprecated parameters it
    # is probably very commonly used and much harder to remove. This is
    # therefore going to be a longer process where, first, `fill` will be
    # introduced and be used throughout the documentation. In 0.12, when
    # kwarg-only enforcement hits, we can remove the shade/shade_lowest out of
    # the function signature all together and pull them out of the kwargs.
    # Then we can actually fire a FutureWarning, and eventually remove.
    shade = kwargs.pop("shade", None)
    if shade is not None:
        fill = shade
        msg = textwrap.dedent(f"""\n
        `shade` is now deprecated in favor of `fill`; setting `fill={shade}`.
        This will become an error in seaborn v0.14.0; please update your code.
        """)
        warnings.warn(msg, FutureWarning, stacklevel=2)

    # Handle `n_levels`
    # This was never in the formal API but it was processed, and appeared in an
    # example. We can treat as an alias for `levels` now and deprecate later.
    levels = kwargs.pop("n_levels", levels)

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

    p = _DistributionPlotter(
        data=data,
        variables=dict(x=x, y=y, hue=hue, weights=weights),
    )

    p.map_hue(palette=palette, order=hue_order, norm=hue_norm)

    if ax is None:
        ax = plt.gca()

    p._attach(ax, allowed_types=["numeric", "datetime"], log_scale=log_scale)

    # Pick the default color from the matplotlib method that will draw the curve
    method = ax.fill_between if fill else ax.plot
    color = _default_color(method, hue, color, kwargs)

    if not p.has_xy_data:
        return ax

    # Pack the kwargs for statistics.KDE
    estimate_kws = dict(
        bw_method=bw_method,
        bw_adjust=bw_adjust,
        gridsize=gridsize,
        cut=cut,
        clip=clip,
        cumulative=cumulative,
    )

    if p.univariate:

        p.plot_univariate_density(
            multiple=multiple,
            common_norm=common_norm,
            common_grid=common_grid,
            fill=fill,
            color=color,
            legend=legend,
            warn_singular=warn_singular,
            estimate_kws=estimate_kws,
            **kwargs,
        )

    else:

        p.plot_bivariate_density(
            common_norm=common_norm,
            fill=fill,
            levels=levels,
            thresh=thresh,
            legend=legend,
            color=color,
            warn_singular=warn_singular,
            cbar=cbar,
            cbar_ax=cbar_ax,
            cbar_kws=cbar_kws,
            estimate_kws=estimate_kws,
            **kwargs,
        )

    return ax
|
|
|
|
|
|
kdeplot.__doc__ = """\
|
|
Plot univariate or bivariate distributions using kernel density estimation.
|
|
|
|
A kernel density estimate (KDE) plot is a method for visualizing the
|
|
distribution of observations in a dataset, analogous to a histogram. KDE
|
|
represents the data using a continuous probability density curve in one or
|
|
more dimensions.
|
|
|
|
The approach is explained further in the :ref:`user guide <tutorial_kde>`.
|
|
|
|
Relative to a histogram, KDE can produce a plot that is less cluttered and
|
|
more interpretable, especially when drawing multiple distributions. But it
|
|
has the potential to introduce distortions if the underlying distribution is
|
|
bounded or not smooth. Like a histogram, the quality of the representation
|
|
also depends on the selection of good smoothing parameters.
|
|
|
|
Parameters
|
|
----------
|
|
{params.core.data}
|
|
{params.core.xy}
|
|
{params.core.hue}
|
|
weights : vector or key in ``data``
|
|
If provided, weight the kernel density estimation using these values.
|
|
{params.core.palette}
|
|
{params.core.hue_order}
|
|
{params.core.hue_norm}
|
|
{params.core.color}
|
|
fill : bool or None
|
|
If True, fill in the area under univariate density curves or between
|
|
bivariate contours. If None, the default depends on ``multiple``.
|
|
{params.dist.multiple}
|
|
common_norm : bool
|
|
If True, scale each conditional density by the number of observations
|
|
such that the total area under all densities sums to 1. Otherwise,
|
|
normalize each density independently.
|
|
common_grid : bool
|
|
If True, use the same evaluation grid for each kernel density estimate.
|
|
Only relevant with univariate data.
|
|
{params.kde.cumulative}
|
|
{params.kde.bw_method}
|
|
{params.kde.bw_adjust}
|
|
warn_singular : bool
|
|
If True, issue a warning when trying to estimate the density of data
|
|
with zero variance.
|
|
{params.dist.log_scale}
|
|
levels : int or vector
|
|
Number of contour levels or values to draw contours at. A vector argument
|
|
must have increasing values in [0, 1]. Levels correspond to iso-proportions
|
|
of the density: e.g., 20% of the probability mass will lie below the
|
|
contour drawn for 0.2. Only relevant with bivariate data.
|
|
thresh : number in [0, 1]
|
|
Lowest iso-proportion level at which to draw a contour line. Ignored when
|
|
``levels`` is a vector. Only relevant with bivariate data.
|
|
gridsize : int
|
|
Number of points on each dimension of the evaluation grid.
|
|
{params.kde.cut}
|
|
{params.kde.clip}
|
|
{params.dist.legend}
|
|
{params.dist.cbar}
|
|
{params.dist.cbar_ax}
|
|
{params.dist.cbar_kws}
|
|
{params.core.ax}
|
|
kwargs
|
|
Other keyword arguments are passed to one of the following matplotlib
|
|
functions:
|
|
|
|
- :meth:`matplotlib.axes.Axes.plot` (univariate, ``fill=False``),
|
|
- :meth:`matplotlib.axes.Axes.fill_between` (univariate, ``fill=True``),
|
|
- :meth:`matplotlib.axes.Axes.contour` (bivariate, ``fill=False``),
|
|
- :meth:`matplotlib.axes.Axes.contourf` (bivariate, ``fill=True``).
|
|
|
|
Returns
|
|
-------
|
|
{returns.ax}
|
|
|
|
See Also
|
|
--------
|
|
{seealso.displot}
|
|
{seealso.histplot}
|
|
{seealso.ecdfplot}
|
|
{seealso.jointplot}
|
|
{seealso.violinplot}
|
|
|
|
Notes
|
|
-----
|
|
|
|
The *bandwidth*, or standard deviation of the smoothing kernel, is an
|
|
important parameter. Misspecification of the bandwidth can produce a
|
|
distorted representation of the data. Much like the choice of bin width in a
|
|
histogram, an over-smoothed curve can erase true features of a
|
|
distribution, while an under-smoothed curve can create false features out of
|
|
random variability. The rule-of-thumb that sets the default bandwidth works
|
|
best when the true distribution is smooth, unimodal, and roughly bell-shaped.
|
|
It is always a good idea to check the default behavior by using ``bw_adjust``
|
|
to increase or decrease the amount of smoothing.
|
|
|
|
Because the smoothing algorithm uses a Gaussian kernel, the estimated density
|
|
curve can extend to values that do not make sense for a particular dataset.
|
|
For example, the curve may be drawn over negative values when smoothing data
|
|
that are naturally positive. The ``cut`` and ``clip`` parameters can be used
|
|
to control the extent of the curve, but datasets that have many observations
|
|
close to a natural boundary may be better served by a different visualization
|
|
method.
|
|
|
|
Similar considerations apply when a dataset is naturally discrete or "spiky"
|
|
(containing many repeated observations of the same value). Kernel density
|
|
estimation will always produce a smooth curve, which would be misleading
|
|
in these situations.
|
|
|
|
The units on the density axis are a common source of confusion. While kernel
|
|
density estimation produces a probability distribution, the height of the curve
|
|
at each point gives a density, not a probability. A probability can be obtained
|
|
only by integrating the density across a range. The curve is normalized so
|
|
that the integral over all possible values is 1, meaning that the scale of
|
|
the density axis depends on the data values.
|
|
|
|
Examples
|
|
--------
|
|
|
|
.. include:: ../docstrings/kdeplot.rst
|
|
|
|
""".format(
|
|
params=_param_docs,
|
|
returns=_core_docs["returns"],
|
|
seealso=_core_docs["seealso"],
|
|
)
|
|
|
|
|
|
def ecdfplot(
    data=None, *,
    # Vector variables
    x=None, y=None, hue=None, weights=None,
    # Computation parameters
    stat="proportion", complementary=False,
    # Hue mapping parameters
    palette=None, hue_order=None, hue_norm=None,
    # Axes information
    log_scale=None, legend=True, ax=None,
    # Other appearance keywords
    **kwargs,
):
    # NOTE: the public docstring is attached after the definition through
    # ``ecdfplot.__doc__``.

    plotter = _DistributionPlotter(
        data=data,
        variables=dict(x=x, y=y, hue=hue, weights=weights),
    )
    plotter.map_hue(palette=palette, order=hue_order, norm=hue_norm)

    # We could support other semantics (size, style) here fairly easily
    # But it would make distplot a bit more complicated.
    # It's always possible to add features like that later, so I am going to defer.
    # It will be even easier to wait until after there is a more general/abstract
    # way to go from semantic specs to artist attributes.

    ax = plt.gca() if ax is None else ax
    plotter._attach(ax, log_scale=log_scale)

    # Both spellings are removed from kwargs; `color` wins over `c`
    short_form = kwargs.pop("c", None)
    requested_color = kwargs.pop("color", short_form)
    kwargs["color"] = _default_color(ax.plot, hue, requested_color, kwargs)

    if not plotter.has_xy_data:
        return ax

    # We could add this one day, but it's of dubious value
    if not plotter.univariate:
        raise NotImplementedError("Bivariate ECDF plots are not implemented")

    plotter.plot_univariate_ecdf(
        estimate_kws=dict(stat=stat, complementary=complementary),
        legend=legend,
        **kwargs,
    )

    return ax
|
|
|
|
|
|
ecdfplot.__doc__ = """\
|
|
Plot empirical cumulative distribution functions.
|
|
|
|
An ECDF represents the proportion or count of observations falling below each
|
|
unique value in a dataset. Compared to a histogram or density plot, it has the
|
|
advantage that each observation is visualized directly, meaning that there are
|
|
no binning or smoothing parameters that need to be adjusted. It also aids direct
|
|
comparisons between multiple distributions. A downside is that the relationship
|
|
between the appearance of the plot and the basic properties of the distribution
|
|
(such as its central tendency, variance, and the presence of any bimodality)
|
|
may not be as intuitive.
|
|
|
|
More information is provided in the :ref:`user guide <tutorial_ecdf>`.
|
|
|
|
Parameters
|
|
----------
|
|
{params.core.data}
|
|
{params.core.xy}
|
|
{params.core.hue}
|
|
weights : vector or key in ``data``
|
|
If provided, weight the contribution of the corresponding data points
|
|
towards the cumulative distribution using these values.
|
|
{params.ecdf.stat}
|
|
{params.ecdf.complementary}
|
|
{params.core.palette}
|
|
{params.core.hue_order}
|
|
{params.core.hue_norm}
|
|
{params.dist.log_scale}
|
|
{params.dist.legend}
|
|
{params.core.ax}
|
|
kwargs
|
|
Other keyword arguments are passed to :meth:`matplotlib.axes.Axes.plot`.
|
|
|
|
Returns
|
|
-------
|
|
{returns.ax}
|
|
|
|
See Also
|
|
--------
|
|
{seealso.displot}
|
|
{seealso.histplot}
|
|
{seealso.kdeplot}
|
|
{seealso.rugplot}
|
|
|
|
Examples
|
|
--------
|
|
|
|
.. include:: ../docstrings/ecdfplot.rst
|
|
|
|
""".format(
|
|
params=_param_docs,
|
|
returns=_core_docs["returns"],
|
|
seealso=_core_docs["seealso"],
|
|
)
|
|
|
|
|
|
def rugplot(
    data=None, *, x=None, y=None, hue=None, height=.025, expand_margins=True,
    palette=None, hue_order=None, hue_norm=None, legend=True, ax=None, **kwargs
):
    # NOTE: the user-facing docstring is assigned to ``rugplot.__doc__`` right
    # after this definition, so only implementation notes live in the body.

    # Ideas for future extension:
    # - a ``multiple=`` option that shifts the rugs of different hue levels
    #   orthogonally to the data axis (unclear whether that counts as
    #   stacking or dodging);
    # - a ``style`` semantic, possibly by drawing the rug with scatterplot;
    # - a histogram/density flavoured rug, because alpha blending stops
    #   being useful once the number of observations gets large.

    # --- Backwards compatibility for call signatures from versions < 0.11.0 ---

    legacy_data = kwargs.pop("a", None)
    axis = kwargs.pop("axis", None)

    # Old spelling of the data argument.
    if legacy_data is not None:
        data = legacy_data
        a_warning = textwrap.dedent("""\n
        The `a` parameter has been replaced; use `x`, `y`, and/or `data` instead.
        Please update your code; This will become an error in seaborn v0.14.0.
        """)
        warnings.warn(a_warning, UserWarning, stacklevel=2)

    # Old way of choosing the data axis: move ``data`` onto the named variable.
    if axis is not None:
        if axis == "x":
            x = data
        elif axis == "y":
            y = data
        data = None
        axis_warning = textwrap.dedent(f"""\n
        The `axis` parameter has been deprecated; use the `{axis}` parameter instead.
        Please update your code; this will become an error in seaborn v0.14.0.
        """)
        warnings.warn(axis_warning, UserWarning, stacklevel=2)

    # Old orientation flag: a truthy value moves the observations onto ``y``.
    vertical = kwargs.pop("vertical", None)
    if vertical is not None:
        if not vertical:
            action_taken = "assigning data to `x`."
        else:
            action_taken = "assigning data to `y`."
            if x is not None:
                x, y = y, x
            else:
                data, y = y, data
        vertical_warning = textwrap.dedent(f"""\n
        The `vertical` parameter is deprecated; {action_taken}
        This will become an error in seaborn v0.14.0; please update your code.
        """)
        warnings.warn(vertical_warning, UserWarning, stacklevel=2)

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

    plotter = _DistributionPlotter(
        data=data,
        variables={"x": x, "y": y, "hue": hue},
    )
    plotter.map_hue(palette=palette, order=hue_order, norm=hue_norm)

    target_ax = plt.gca() if ax is None else ax
    plotter._attach(target_ax)

    # Both ``color`` and its short alias ``c`` are consumed; ``color`` wins.
    short_alias = kwargs.pop("c", None)
    requested_color = kwargs.pop("color", short_alias)
    kwargs["color"] = _default_color(target_ax.plot, hue, requested_color, kwargs)

    # Nothing to draw when neither x nor y data was supplied.
    if plotter.has_xy_data:
        plotter.plot_rug(height, expand_margins, legend, **kwargs)

    return target_ax
|
|
|
|
|
|
# Attach the public docstring to ``rugplot`` after its definition. The
# ``{params...}`` and ``{returns...}`` placeholders are filled in by
# ``str.format`` from ``_param_docs`` and ``_core_docs["returns"]``.
rugplot.__doc__ = """\
Plot marginal distributions by drawing ticks along the x and y axes.

This function is intended to complement other plots by showing the location
of individual observations in an unobtrusive way.

Parameters
----------
{params.core.data}
{params.core.xy}
{params.core.hue}
height : float
    Proportion of axes extent covered by each rug element. Can be negative.
expand_margins : bool
    If True, increase the axes margins by the height of the rug to avoid
    overlap with other elements.
{params.core.palette}
{params.core.hue_order}
{params.core.hue_norm}
legend : bool
    If False, do not add a legend for semantic variables.
{params.core.ax}
kwargs
    Other keyword arguments are passed to
    :meth:`matplotlib.collections.LineCollection`

Returns
-------
{returns.ax}

Examples
--------

.. include:: ../docstrings/rugplot.rst

""".format(
    params=_param_docs,
    returns=_core_docs["returns"],
)
|
|
|
|
|
|
def displot(
    data=None, *,
    # Vector variables
    x=None, y=None, hue=None, row=None, col=None, weights=None,
    # Other plot parameters
    kind="hist", rug=False, rug_kws=None, log_scale=None, legend=True,
    # Hue-mapping parameters
    palette=None, hue_order=None, hue_norm=None, color=None,
    # Faceting parameters
    col_wrap=None, row_order=None, col_order=None,
    height=5, aspect=1, facet_kws=None,
    **kwargs,
):
    # NOTE: the user-facing docstring is assigned to ``displot.__doc__`` right
    # after this definition. In short: build a FacetGrid from the faceting
    # variables, draw a histogram / KDE / ECDF (selected by ``kind``) on it,
    # optionally add a rug, and return the grid.

    p = _DistributionPlotter(
        data=data,
        variables=dict(x=x, y=y, hue=hue, weights=weights, row=row, col=col),
    )

    p.map_hue(palette=palette, order=hue_order, norm=hue_norm)

    _check_argument("kind", ["hist", "kde", "ecdf"], kind)

    # --- Initialize the FacetGrid object

    # Check for attempt to plot onto specific axes and warn
    if "ax" in kwargs:
        msg = (
            "`displot` is a figure-level function and does not accept "
            "the ax= parameter. You may wish to try {}plot.".format(kind)
        )
        warnings.warn(msg, UserWarning)
        kwargs.pop("ax")

    for var in ["row", "col"]:
        # Handle faceting variables that lack name information
        if var in p.variables and p.variables[var] is None:
            p.variables[var] = f"_{var}_"

    # Adapt the plot_data dataframe for use with FacetGrid
    grid_data = p.plot_data.rename(columns=p.variables)
    grid_data = grid_data.loc[:, ~grid_data.columns.duplicated()]

    col_name = p.variables.get("col")
    row_name = p.variables.get("row")

    if facet_kws is None:
        facet_kws = {}

    g = FacetGrid(
        data=grid_data, row=row_name, col=col_name,
        col_wrap=col_wrap, row_order=row_order,
        col_order=col_order, height=height,
        aspect=aspect,
        **facet_kws,
    )

    # Now attach the axes object to the plotter object
    if kind == "kde":
        allowed_types = ["numeric", "datetime"]
    else:
        allowed_types = None
    p._attach(g, allowed_types=allowed_types, log_scale=log_scale)

    # Check for a specification that lacks x/y data and return early
    if not p.has_xy_data:
        return g

    if color is None and hue is None:
        color = "C0"
    # XXX else warn if hue is not None?

    # ``kwargs`` is this call's own dict, so adding to it is safe.
    kwargs["legend"] = legend

    # --- Draw the plots

    if kind == "hist":

        hist_kws = kwargs.copy()

        # Extract the parameters that will go directly to Histogram
        estimate_defaults = {}
        _assign_default_kwargs(estimate_defaults, Histogram.__init__, histplot)

        estimate_kws = {}
        for key, default_val in estimate_defaults.items():
            estimate_kws[key] = hist_kws.pop(key, default_val)

        # Handle derivative defaults
        if estimate_kws["discrete"] is None:
            estimate_kws["discrete"] = p._default_discrete()

        hist_kws["estimate_kws"] = estimate_kws

        # Unlike the other kinds, an explicit ``color`` in kwargs is kept here.
        hist_kws.setdefault("color", color)

        if p.univariate:

            _assign_default_kwargs(hist_kws, p.plot_univariate_histogram, histplot)
            p.plot_univariate_histogram(**hist_kws)

        else:

            _assign_default_kwargs(hist_kws, p.plot_bivariate_histogram, histplot)
            p.plot_bivariate_histogram(**hist_kws)

    elif kind == "kde":

        kde_kws = kwargs.copy()

        # Extract the parameters that will go directly to KDE
        estimate_defaults = {}
        _assign_default_kwargs(estimate_defaults, KDE.__init__, kdeplot)

        estimate_kws = {}
        for key, default_val in estimate_defaults.items():
            estimate_kws[key] = kde_kws.pop(key, default_val)

        kde_kws["estimate_kws"] = estimate_kws
        kde_kws["color"] = color

        if p.univariate:

            _assign_default_kwargs(kde_kws, p.plot_univariate_density, kdeplot)
            p.plot_univariate_density(**kde_kws)

        else:

            _assign_default_kwargs(kde_kws, p.plot_bivariate_density, kdeplot)
            p.plot_bivariate_density(**kde_kws)

    elif kind == "ecdf":

        ecdf_kws = kwargs.copy()

        # Extract the parameters that will go directly to the estimator
        estimate_kws = {}
        estimate_defaults = {}
        _assign_default_kwargs(estimate_defaults, ECDF.__init__, ecdfplot)
        for key, default_val in estimate_defaults.items():
            estimate_kws[key] = ecdf_kws.pop(key, default_val)

        ecdf_kws["estimate_kws"] = estimate_kws
        ecdf_kws["color"] = color

        if p.univariate:

            _assign_default_kwargs(ecdf_kws, p.plot_univariate_ecdf, ecdfplot)
            p.plot_univariate_ecdf(**ecdf_kws)

        else:

            raise NotImplementedError("Bivariate ECDF plots are not implemented")

    # All plot kinds can include a rug
    if rug:
        # TODO with expand_margins=True, each facet expands margins... annoying!
        # Work on a shallow copy: the entries set below (defaults, ``legend``,
        # ``color``) must not leak into the dict object the caller passed in.
        rug_kws = {} if rug_kws is None else dict(rug_kws)
        _assign_default_kwargs(rug_kws, p.plot_rug, rugplot)
        # The main plot already owns the legend; never draw a second one.
        rug_kws["legend"] = False
        if color is not None:
            rug_kws["color"] = color
        p.plot_rug(**rug_kws)

    # Call FacetGrid annotation methods
    # Note that the legend is currently set inside the plotting method
    g.set_axis_labels(
        x_var=p.variables.get("x", g.axes.flat[0].get_xlabel()),
        y_var=p.variables.get("y", g.axes.flat[0].get_ylabel()),
    )
    g.set_titles()
    g.tight_layout()

    # Expose the data on the grid: when ``data`` was given together with x/y,
    # join the original columns with any grid columns that ``data`` lacks;
    # otherwise publish the plotter's frame with unnamed variables as ``_name_``.
    if data is not None and (x is not None or y is not None):
        if not isinstance(data, pd.DataFrame):
            data = pd.DataFrame(data)
        g.data = pd.merge(
            data,
            g.data[g.data.columns.difference(data.columns)],
            left_index=True,
            right_index=True,
        )
    else:
        wide_cols = {
            k: f"_{k}_" if v is None else v for k, v in p.variables.items()
        }
        g.data = p.plot_data.rename(columns=wide_cols)

    return g
|
|
|
|
|
|
# Attach the public docstring to ``displot`` after its definition. The
# ``{params...}``, ``{returns...}`` and ``{seealso...}`` placeholders are filled
# in by ``str.format`` from ``_param_docs`` and ``_core_docs``; literal braces in
# the text are therefore written doubled (``{{ ... }}``).
displot.__doc__ = """\
Figure-level interface for drawing distribution plots onto a FacetGrid.

This function provides access to several approaches for visualizing the
univariate or bivariate distribution of data, including subsets of data
defined by semantic mapping and faceting across multiple subplots. The
``kind`` parameter selects the approach to use:

- :func:`histplot` (with ``kind="hist"``; the default)
- :func:`kdeplot` (with ``kind="kde"``)
- :func:`ecdfplot` (with ``kind="ecdf"``; univariate-only)

Additionally, a :func:`rugplot` can be added to any kind of plot to show
individual observations.

Extra keyword arguments are passed to the underlying function, so you should
refer to the documentation for each to understand the complete set of options
for making plots with this interface.

See the :doc:`distribution plots tutorial <../tutorial/distributions>` for a more
in-depth discussion of the relative strengths and weaknesses of each approach.
The distinction between figure-level and axes-level functions is explained
further in the :doc:`user guide <../tutorial/function_overview>`.

Parameters
----------
{params.core.data}
{params.core.xy}
{params.core.hue}
{params.facets.rowcol}
weights : vector or key in ``data``
    Observation weights used for computing the distribution function.
kind : {{"hist", "kde", "ecdf"}}
    Approach for visualizing the data. Selects the underlying plotting function
    and determines the additional set of valid parameters.
rug : bool
    If True, show each observation with marginal ticks (as in :func:`rugplot`).
rug_kws : dict
    Parameters to control the appearance of the rug plot.
{params.dist.log_scale}
{params.dist.legend}
{params.core.palette}
{params.core.hue_order}
{params.core.hue_norm}
{params.core.color}
{params.facets.col_wrap}
{params.facets.rowcol_order}
{params.facets.height}
{params.facets.aspect}
{params.facets.facet_kws}
kwargs
    Other keyword arguments are documented with the relevant axes-level function:

    - :func:`histplot` (with ``kind="hist"``)
    - :func:`kdeplot` (with ``kind="kde"``)
    - :func:`ecdfplot` (with ``kind="ecdf"``)

Returns
-------
{returns.facetgrid}

See Also
--------
{seealso.histplot}
{seealso.kdeplot}
{seealso.rugplot}
{seealso.ecdfplot}
{seealso.jointplot}

Examples
--------

See the API documentation for the axes-level functions for more details
about the breadth of options available for each plot kind.

.. include:: ../docstrings/displot.rst

""".format(
    params=_param_docs,
    returns=_core_docs["returns"],
    seealso=_core_docs["seealso"],
)
|
|
|
|
|
|
# =========================================================================== #
|
|
# DEPRECATED FUNCTIONS LIVE BELOW HERE
|
|
# =========================================================================== #
|
|
|
|
|
|
def _freedman_diaconis_bins(a):
    """Calculate number of hist bins using Freedman-Diaconis rule.

    Parameters
    ----------
    a : array-like
        Observations to be binned. NaN entries are ignored when computing the
        quartiles and the data range, but still count towards ``len(a)``.

    Returns
    -------
    int
        Suggested number of bins; 1 when fewer than two observations are given.

    """
    # From https://stats.stackexchange.com/questions/798/
    a = np.asarray(a)
    if len(a) < 2:
        return 1
    iqr = np.subtract.reduce(np.nanpercentile(a, [75, 25]))
    h = 2 * iqr / (len(a) ** (1 / 3))
    # fall back to sqrt(n) bins if iqr is 0
    if h == 0:
        return int(np.sqrt(a.size))
    # Use NaN-aware extrema so the range matches the NaN-aware quartiles above;
    # plain max()/min() would return NaN and make the int() conversion fail.
    return int(np.ceil((np.nanmax(a) - np.nanmin(a)) / h))
|
|
|
|
|
|
def distplot(a=None, bins=None, hist=True, kde=True, rug=False, fit=None,
             hist_kws=None, kde_kws=None, rug_kws=None, fit_kws=None,
             color=None, vertical=False, norm_hist=False, axlabel=None,
             label=None, ax=None, x=None):
    """
    DEPRECATED

    This function has been deprecated and will be removed in seaborn v0.14.0.
    It has been replaced by :func:`histplot` and :func:`displot`, two functions
    with a modern API and many more capabilities.

    For a guide to updating, please see this notebook:

    https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

    """

    # Point users at the closest modern replacement for what they asked for.
    if kde and not hist:
        axes_level_suggestion = (
            "`kdeplot` (an axes-level function for kernel density plots)"
        )
    else:
        axes_level_suggestion = (
            "`histplot` (an axes-level function for histograms)"
        )

    msg = textwrap.dedent(f"""

    `distplot` is a deprecated function and will be removed in seaborn v0.14.0.

    Please adapt your code to use either `displot` (a figure-level function with
    similar flexibility) or {axes_level_suggestion}.

    For a guide to updating your code to use the new functions, please see
    https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
    """)
    warnings.warn(msg, UserWarning, stacklevel=2)

    if ax is None:
        ax = plt.gca()

    # Intelligently label the support axis
    label_ax = bool(axlabel)
    if axlabel is None and hasattr(a, "name"):
        axlabel = a.name
        if axlabel is not None:
            label_ax = True

    # Support new-style API
    if x is not None:
        a = x

    # Make a a 1-d float array
    a = np.asarray(a, float)
    if a.ndim > 1:
        a = a.squeeze()

    # Drop null values from array
    a = remove_na(a)

    # Decide if the hist is normed
    norm_hist = norm_hist or kde or (fit is not None)

    # Handle dictionary defaults; copies keep the caller's dicts untouched
    hist_kws = {} if hist_kws is None else hist_kws.copy()
    kde_kws = {} if kde_kws is None else kde_kws.copy()
    rug_kws = {} if rug_kws is None else rug_kws.copy()
    fit_kws = {} if fit_kws is None else fit_kws.copy()

    # Get the color from the current color cycle
    if color is None:
        # Draw and immediately remove a throwaway line to read its color.
        if vertical:
            line, = ax.plot(0, a.mean())
        else:
            line, = ax.plot(a.mean(), 0)
        color = line.get_color()
        line.remove()

    # Plug the label into the right kwarg dictionary
    if label is not None:
        if hist:
            hist_kws["label"] = label
        elif kde:
            kde_kws["label"] = label
        elif rug:
            rug_kws["label"] = label
        elif fit:
            fit_kws["label"] = label

    # The per-element dicts are private copies that are not read again after
    # their draw call, so each color is simply popped and passed explicitly.
    # (Comparing the popped color against ``color`` is avoided on purpose:
    # ``!=`` on array-like colors has no single truth value.)

    if hist:
        if bins is None:
            bins = min(_freedman_diaconis_bins(a), 50)
        hist_kws.setdefault("alpha", 0.4)
        hist_kws.setdefault("density", norm_hist)

        orientation = "horizontal" if vertical else "vertical"
        hist_color = hist_kws.pop("color", color)
        ax.hist(a, bins, orientation=orientation,
                color=hist_color, **hist_kws)

    # Data axis for the axes-level helpers below.
    axis = "y" if vertical else "x"

    if kde:
        kde_color = kde_kws.pop("color", color)
        kdeplot(**{axis: a}, ax=ax, color=kde_color, **kde_kws)

    if rug:
        rug_color = rug_kws.pop("color", color)
        rugplot(**{axis: a}, ax=ax, color=rug_color, **rug_kws)

    if fit is not None:
        fit_color = fit_kws.pop("color", "#282828")
        gridsize = fit_kws.pop("gridsize", 200)
        cut = fit_kws.pop("cut", 3)
        clip = fit_kws.pop("clip", (-np.inf, np.inf))
        # Bandwidth is only used to size the evaluation grid for the fit.
        bw = gaussian_kde(a).scotts_factor() * a.std(ddof=1)
        support = _kde_support(a, bw, gridsize, cut, clip)
        params = fit.fit(a)
        density = fit.pdf(support, *params)
        if vertical:
            ax.plot(density, support, color=fit_color, **fit_kws)
        else:
            ax.plot(support, density, color=fit_color, **fit_kws)

    if label_ax:
        if vertical:
            ax.set_ylabel(axlabel)
        else:
            ax.set_xlabel(axlabel)

    return ax
|