583 lines
18 KiB
Python
583 lines
18 KiB
Python
|
"""
|
||
|
Implementation of Regression on Order Statistics for imputing left-
|
||
|
censored (non-detect) data.
|
||
|
|
||
|
Method described in *Nondetects and Data Analysis* by Dennis R.
|
||
|
Helsel (John Wiley, 2005) to estimate the left-censored (non-detect)
|
||
|
values of a dataset.
|
||
|
|
||
|
Author: Paul M. Hobson
|
||
|
Company: Geosyntec Consultants (Portland, OR)
|
||
|
Date: 2016-06-14
|
||
|
|
||
|
"""
|
||
|
|
||
|
import warnings
|
||
|
|
||
|
import numpy as np
|
||
|
import pandas as pd
|
||
|
from scipy import stats
|
||
|
|
||
|
|
||
|
def _ros_sort(df, observations, censorship, warn=False):
    """
    Order the rows of a dataframe the way the ROS procedure expects.

    Left-censored rows are placed first, followed by the uncensored
    rows; each group is sorted ascending by the observed value. Censored
    observations larger than the maximum uncensored observation are
    removed from the result.

    Parameters
    ----------
    df : DataFrame

    observations : str
        Name of the column in the dataframe that contains observed
        values. Censored values should be set to the detection (upper)
        limit.

    censorship : str
        Name of the column in the dataframe that indicates that a
        observation is left-censored. (i.e., True -> censored,
        False -> uncensored)

    warn : bool (default is False)
        When True, a warning is issued if censored observations are
        dropped for exceeding the maximum uncensored observation.

    Returns
    -------
    sorted_df : DataFrame
        The sorted dataframe with all columns dropped except the
        observation and censorship columns, and with a fresh
        0-based index.
    """

    is_censored = df[censorship]

    # split the data by censorship status and sort each part
    censored = df[is_censored].sort_values(observations, axis=0)
    uncensored = df[~is_censored].sort_values(observations, axis=0)

    # censored values above the largest detected value are discarded
    max_uncensored = uncensored[observations].max()
    if censored[observations].max() > max_uncensored:
        censored = censored[censored[observations] <= max_uncensored]

        if warn:
            warnings.warn("Dropping censored observations greater than "
                          "the max uncensored observation.")

    ordered = pd.concat([censored, uncensored], axis=0)
    return ordered[[observations, censorship]].reset_index(drop=True)
|
||
|
|
||
|
|
||
|
def cohn_numbers(df, observations, censorship):
    r"""
    Computes the Cohn numbers for the detection limits in the dataset.

    The Cohn Numbers are:

        - :math:`A_j =` the number of uncensored obs above the jth
          threshold.
        - :math:`B_j =` the number of observations (cen & uncen) below
          the jth threshold.
        - :math:`C_j =` the number of censored observations at the jth
          threshold.
        - :math:`\mathrm{PE}_j =` the probability of exceeding the jth
          threshold
        - :math:`\mathrm{DL}_j =` the unique, sorted detection limits
        - :math:`\mathrm{DL}_{j+1} = \mathrm{DL}_j` shifted down a
          single index (row)

    Parameters
    ----------
    df : DataFrame

    observations : str
        Name of the column in the dataframe that contains observed
        values. Censored values should be set to the detection (upper)
        limit.

    censorship : str
        Name of the column in the dataframe that indicates that a
        observation is left-censored. (i.e., True -> censored,
        False -> uncensored)

    Returns
    -------
    cohn : DataFrame
        Columns are 'lower_dl', 'upper_dl', 'nuncen_above',
        'nobs_below', 'ncen_equal' and 'prob_exceedance'. There is one
        row per detection limit followed by one extra row that is NaN
        everywhere except for a 'prob_exceedance' of 0. When the data
        contain no censored observations the dataframe has no rows.
    """

    is_censored = df[censorship]

    # unique, sorted detection limits
    limits = pd.unique(df.loc[is_censored, observations])
    limits.sort()

    # nothing is censored: hand back an empty table with the right columns
    if limits.shape[0] == 0:
        dl_cols = ['lower_dl', 'upper_dl', 'nuncen_above',
                   'nobs_below', 'ncen_equal', 'prob_exceedance']
        return pd.DataFrame(np.empty((0, len(dl_cols))), columns=dl_cols)

    # an observation smaller than every detection limit acts as an
    # additional (lowest) threshold
    smallest = df[observations].min()
    if smallest < limits.min():
        limits = np.hstack([smallest, limits])

    values = df[observations]
    is_uncensored = ~df[censorship]
    censored_values = values[is_censored]

    def count_uncensored_within(row):
        """ A: uncensored obs in [lower_dl, upper_dl). """
        at_or_above = values >= row['lower_dl']
        under_next = values < row['upper_dl']
        return df[at_or_above & under_next & is_uncensored].shape[0]

    def count_below(row):
        """ B: censored obs <= lower_dl plus uncensored obs < lower_dl. """
        n_censored = df[(values <= row['lower_dl']) & is_censored].shape[0]
        n_uncensored = df[(values < row['lower_dl']) & is_uncensored].shape[0]
        return n_censored + n_uncensored

    def count_censored_at(row):
        """ C: censored obs exactly equal to lower_dl. """
        return (censored_values == row['lower_dl']).sum()

    def upper_limits(table):
        """ Each row's upper limit is the next row's lower limit; the
        last row is unbounded. """
        if table.shape[0] > 1:
            return table['lower_dl'].shift(-1).fillna(value=np.inf)
        return [np.inf]

    def exceedance_probabilities(n_above, n_below):
        """ Probability of exceedance, accumulated from the last row
        (fixed at 0) back to the first. """
        above = np.asarray(n_above)
        below = np.asarray(n_below)
        n_rows = len(above)
        pe = np.empty(n_rows, dtype='float64')
        pe[-1] = 0.0
        for j in reversed(range(n_rows - 1)):
            pe[j] = pe[j + 1] + (1 - pe[j + 1]) * above[j] / (above[j] + below[j])
        return pe

    cohn = pd.DataFrame(limits, columns=['lower_dl'])
    cohn['upper_dl'] = upper_limits(cohn)
    cohn['nuncen_above'] = cohn.apply(count_uncensored_within, axis=1)
    cohn['nobs_below'] = cohn.apply(count_below, axis=1)
    cohn['ncen_equal'] = cohn.apply(count_censored_at, axis=1)

    # append the extra trailing row before computing the probabilities
    cohn = cohn.reindex(range(limits.shape[0] + 1))
    cohn['prob_exceedance'] = exceedance_probabilities(
        cohn['nuncen_above'], cohn['nobs_below']
    )

    return cohn
|
||
|
|
||
|
|
||
|
def _detection_limit_index(obs, cohn):
    """
    Find the row of `cohn` whose detection limit applies to `obs`.

    The applicable row is the last one whose 'lower_dl' is less than or
    equal to the observation.

    Parameters
    ----------
    obs : float
        A single observation from the larger dataset.

    cohn : DataFrame
        DataFrame of Cohn numbers.

    Returns
    -------
    det_limit_index : int
        The index of the corresponding detection limit in `cohn`.
        Always 0 when `cohn` has no rows.

    See Also
    --------
    cohn_numbers
    """

    # without any detection limits every observation maps to 0
    if cohn.shape[0] == 0:
        return 0

    # positions of all limits at or below the observation; the last
    # one is the closest from below
    at_or_below = np.nonzero(np.asarray(cohn['lower_dl'] <= obs))[0]
    return at_or_below[-1]
|
||
|
|
||
|
|
||
|
def _ros_group_rank(df, dl_idx, censorship):
    """
    Ranks each observation within the data groups.

    In this case, the groups are defined by the record's detection
    limit index and censorship status. Within each group the rows are
    numbered 1, 2, 3, ... in the order they appear in `df`.

    Parameters
    ----------
    df : DataFrame

    dl_idx : str
        Name of the column in the dataframe the index of the
        observations' corresponding detection limit in the `cohn`
        dataframe.

    censorship : str
        Name of the column in the dataframe that indicates that a
        observation is left-censored. (i.e., True -> censored,
        False -> uncensored)

    Returns
    -------
    ranks : Series
        Ranks for the dataset, named 'rank' and sharing the index of
        `df`. The input dataframe is not modified.
    """

    # only the grouping columns are needed, so copy just those rather
    # than the entire frame
    ranks = df[[dl_idx, censorship]].copy()
    ranks['rank'] = 1

    # a running sum of ones inside each group is that row's rank; the
    # built-in groupby cumsum avoids calling a Python lambda per group
    return ranks.groupby(by=[dl_idx, censorship])['rank'].cumsum()
|
||
|
|
||
|
|
||
|
def _ros_plot_pos(row, censorship, cohn):
    """
    ROS-specific plotting positions.

    Computes the plotting position for an observation based on its rank,
    censorship status, and detection limit index.

    Parameters
    ----------
    row : {Series, dict}
        Full observation (row) from a censored dataset. Requires a
        'rank', 'det_limit_index', and `censorship` entry.

    censorship : str
        Name of the column in the dataframe that indicates that a
        observation is left-censored. (i.e., True -> censored,
        False -> uncensored)

    cohn : DataFrame
        DataFrame of Cohn numbers. The rows at positions
        ``det_limit_index`` and ``det_limit_index + 1`` are both read.

    Returns
    -------
    plotting_position : float

    See Also
    --------
    cohn_numbers
    """

    position = row['det_limit_index']
    rank = row['rank']
    is_censored = row[censorship]

    # Cohn numbers of this observation's detection limit and of the
    # next one up
    this_dl = cohn.iloc[position]
    next_dl = cohn.iloc[position + 1]
    pe_here = this_dl['prob_exceedance']

    if is_censored:
        # censored values are spread over the non-exceedance probability
        return (1 - pe_here) * rank / (this_dl['ncen_equal'] + 1)

    # uncensored values are spread over the probability mass between
    # this detection limit and the next
    pe_span = pe_here - next_dl['prob_exceedance']
    return (1 - pe_here) + pe_span * rank / (this_dl['nuncen_above'] + 1)
|
||
|
|
||
|
|
||
|
def _norm_plot_pos(observations):
    """
    Computes standard normal (Gaussian) plotting positions using scipy.

    The theoretical quantiles produced by `scipy.stats.probplot` are
    mapped back to probabilities with the standard normal CDF.

    Parameters
    ----------
    observations : array_like
        Sequence of observed quantities.

    Returns
    -------
    plotting_position : array of floats
    """
    # probplot returns (theoretical quantiles, ordered data); only the
    # quantiles are needed here
    quantiles = stats.probplot(observations, fit=False)[0]
    return stats.norm.cdf(quantiles)
|
||
|
|
||
|
|
||
|
def plotting_positions(df, censorship, cohn):
    """
    Compute the plotting positions for the observations.

    The ROS-specific plotting positions are based on the observations'
    rank, censorship status, and corresponding detection limit.

    Parameters
    ----------
    df : DataFrame
        Each row is passed to `_ros_plot_pos`, which reads the
        'det_limit_index', 'rank' and `censorship` entries.

    censorship : str
        Name of the column in the dataframe that indicates that a
        observation is left-censored. (i.e., True -> censored,
        False -> uncensored)

    cohn : DataFrame
        DataFrame of Cohn numbers.

    Returns
    -------
    plotting_position : Series of float
        One value per row of `df`, sharing its index.

    See Also
    --------
    cohn_numbers
    """

    positions = df.apply(lambda row: _ros_plot_pos(row, censorship, cohn), axis=1)

    # the plotting positions of the censored (ND) rows are put into
    # ascending order and written back onto those same rows
    is_censored = df[censorship]
    censored_sorted = np.sort(np.asarray(positions[is_censored]))
    positions.loc[is_censored.index[is_censored]] = censored_sorted

    return positions
|
||
|
|
||
|
|
||
|
def _impute(df, observations, censorship, transform_in, transform_out):
    """
    Executes the basic regression on order stat (ROS) procedure.

    Uses ROS to impute censored from the best-fit line of a
    probability plot of the uncensored values.

    Parameters
    ----------
    df : DataFrame
        Must contain a 'Zprelim' column in addition to the
        `observations` and `censorship` columns. It is not modified.
    observations : str
        Name of the column in the dataframe that contains observed
        values. Censored values should be set to the detection (upper)
        limit.
    censorship : str
        Name of the column in the dataframe that indicates that a
        observation is left-censored. (i.e., True -> censored,
        False -> uncensored)
    transform_in, transform_out : callable
        Transformations to be applied to the data prior to fitting
        the line and after estimated values from that line. Typically,
        `np.log` and `np.exp` are used, respectively.

    Returns
    -------
    estimated : DataFrame
        A new dataframe with two new columns: "estimated" and "final".
        The "estimated" column contains the values inferred from the
        best-fit line for the censored rows (NaN for uncensored rows).
        The "final" column contains the estimated values only where
        the original observations were censored, and the original
        observations everywhere else.
    """

    # detect/non-detect selectors
    censored_mask = df[censorship]
    uncensored_mask = ~censored_mask

    # fit a line to the transformed detected data vs. 'Zprelim'
    fit_params = stats.linregress(
        df['Zprelim'][uncensored_mask],
        transform_in(df[observations][uncensored_mask])
    )

    # pull out the slope and intercept for use later
    slope, intercept = fit_params[:2]

    # work on a copy so the caller's dataframe is left untouched, as
    # the documented "new dataframe" return value promises
    imputed = df.copy()

    # model the censored data based on the best-fit line; index
    # alignment leaves NaN in the uncensored rows
    imputed['estimated'] = transform_out(
        slope * df['Zprelim'][censored_mask] + intercept
    )
    imputed['final'] = np.where(
        censored_mask, imputed['estimated'], df[observations]
    )

    return imputed
|
||
|
|
||
|
|
||
|
def _do_ros(df, observations, censorship, transform_in, transform_out):
    """
    DataFrame-centric function to impute censored values with ROS.

    Prepares a dataframe for, and then estimates the values of a
    censored dataset using Regression on Order Statistics.

    Parameters
    ----------
    df : DataFrame

    observations : str
        Name of the column in the dataframe that contains observed
        values. Censored values should be set to the detection (upper)
        limit.

    censorship : str
        Name of the column in the dataframe that indicates that a
        observation is left-censored. (i.e., True -> censored,
        False -> uncensored)

    transform_in, transform_out : callable
        Transformations to be applied to the data prior to fitting
        the line and after estimated values from that line. Typically,
        `np.log` and `np.exp` are used, respectively.

    Returns
    -------
    estimated : DataFrame
        The sorted observations together with the intermediate
        'det_limit_index', 'rank', 'plot_pos' and 'Zprelim' columns and
        the columns added by `_impute` ("estimated" and "final").
    """

    # Cohn numbers come from the data as given; the working frame is
    # the sorted version of the same data
    cohn = cohn_numbers(df, observations=observations, censorship=censorship)
    modeled = _ros_sort(df, observations=observations, censorship=censorship)

    # each step below only depends on the columns added before it
    dl_index = modeled[observations].apply(_detection_limit_index, args=(cohn,))
    modeled['det_limit_index'] = dl_index

    group_rank = _ros_group_rank(modeled, 'det_limit_index', censorship)
    modeled['rank'] = group_rank

    plot_pos = plotting_positions(modeled, censorship, cohn)
    modeled['plot_pos'] = plot_pos

    # standard normal quantiles of the plotting positions
    modeled['Zprelim'] = stats.norm.ppf(modeled['plot_pos'])

    return _impute(modeled, observations, censorship, transform_in, transform_out)
|
||
|
|
||
|
|
||
|
def impute_ros(observations, censorship, df=None, min_uncensored=2,
               max_fraction_censored=0.8, substitution_fraction=0.5,
               transform_in=np.log, transform_out=np.exp,
               as_array=True):
    """
    Impute censored dataset using Regression on Order Statistics (ROS).

    Method described in *Nondetects and Data Analysis* by Dennis R.
    Helsel (John Wiley, 2005) to estimate the left-censored (non-detect)
    values of a dataset. When there is insufficient non-censored data,
    simple substitution is used.

    Parameters
    ----------
    observations : str or array-like
        Label of the column or the float array of censored observations

    censorship : str or array-like
        Label of the column or the bool array of the censorship
        status of the observations.

          * True if censored,
          * False if uncensored

    df : DataFrame, optional
        If `observations` and `censorship` are labels, this is the
        DataFrame that contains those columns. When omitted, the two
        arrays are assembled into a dataframe with the columns 'obs'
        and 'cen'.

    min_uncensored : int (default is 2)
        The minimum number of uncensored values required before ROS
        can be used to impute the censored observations. When this
        criterion is not met, simple substitution is used instead.

    max_fraction_censored : float (default is 0.8)
        The maximum fraction of censored data below which ROS can be
        used to impute the censored observations. When this fraction is
        exceeded, simple substitution is used instead.

    substitution_fraction : float (default is 0.5)
        The fraction of the detection limit to be used during simple
        substitution of the censored values.

    transform_in : callable (default is np.log)
        Transformation to be applied to the values prior to fitting a
        line to the plotting positions vs. uncensored values.

    transform_out : callable (default is np.exp)
        Transformation to be applied to the imputed censored values
        estimated from the previously computed best-fit line.

    as_array : bool (default is True)
        When True, a numpy array of the imputed observations is
        returned. Otherwise, a modified copy of the original dataframe
        with all of the intermediate calculations is returned.

    Returns
    -------
    imputed : {ndarray, DataFrame}
        The final observations where the censored values have either been
        imputed through ROS or substituted as a fraction of the
        detection limit.

    Notes
    -----
    This function requires pandas 0.14 or more recent.
    """

    # process arrays into a dataframe, if necessary
    if df is None:
        df = pd.DataFrame({'obs': observations, 'cen': censorship})
        observations = 'obs'
        censorship = 'cen'

    # basic counts/metrics of the dataset
    N_observations = df.shape[0]
    N_censored = df[censorship].astype(int).sum()
    N_uncensored = N_observations - N_censored

    # an empty dataset has nothing censored; skipping the division
    # avoids a divide-by-zero warning and a NaN fraction
    if N_observations > 0:
        fraction_censored = N_censored / N_observations
    else:
        fraction_censored = 0.0

    # nothing to impute if there are no censored values: the final
    # values are simply the observations
    if N_censored == 0:
        output = df[[observations, censorship]].copy()
        output.loc[:, 'final'] = df[observations]

    # substitute w/ fraction of the DLs if there's insufficient
    # uncensored data
    elif (N_uncensored < min_uncensored) or (fraction_censored > max_fraction_censored):
        output = df[[observations, censorship]].copy()
        output.loc[:, 'final'] = df[observations]
        output.loc[df[censorship], 'final'] *= substitution_fraction

    # normal ROS stuff
    else:
        output = _do_ros(df, observations, censorship, transform_in, transform_out)

    # convert to an array if necessary
    if as_array:
        output = output['final'].values

    return output
|