577 lines
21 KiB
Python
577 lines
21 KiB
Python
# TODO Non-Linear Regressions can be used
|
|
# TODO Further decomposition of the two_fold parameters i.e.
|
|
# the delta method for further two_fold detail
|
|
"""
|
|
Author: Austin Adams
|
|
|
|
This class implements Oaxaca-Blinder Decomposition. It returns
|
|
a OaxacaResults Class:
|
|
|
|
OaxacaBlinder:
|
|
Two-Fold (two_fold)
|
|
Three-Fold (three_fold)
|
|
|
|
OaxacaResults:
|
|
Table Summary (summary)
|
|
|
|
Oaxaca-Blinder is a statistical method that is used to explain
|
|
the differences between two mean values. The idea is to show
|
|
from two mean values what can be explained by the data and
|
|
what cannot by using OLS regression frameworks.
|
|
|
|
"The original use by Oaxaca's was to explain the wage
|
|
differential between two different groups of workers,
|
|
but the method has since been applied to numerous other
|
|
topics." (Wikipedia)
|
|
|
|
The model is designed to accept two endogenous response variables
|
|
and two exogenous explanitory variables. They are then fit using
|
|
the specific type of decomposition that you want.
|
|
|
|
The method was famously used in Card and Krueger's paper
|
|
"School Quality and Black-White Relative Earnings: A Direct Assessment" (1992)
|
|
|
|
General reference for Oaxaca-Blinder:
|
|
|
|
B. Jann "The Blinder-Oaxaca decomposition for linear
|
|
regression models," The Stata Journal, 2008.
|
|
|
|
Econometrics references for regression models:
|
|
|
|
E. M. Kitagawa "Components of a Difference Between Two Rates"
|
|
Journal of the American Statistical Association, 1955.
|
|
|
|
A. S. Blinder "Wage Discrimination: Reduced Form and Structural
|
|
Estimates," The Journal of Human Resources, 1973.
|
|
"""
|
|
from textwrap import dedent
|
|
|
|
import numpy as np
|
|
|
|
from statsmodels.regression.linear_model import OLS
|
|
from statsmodels.tools.tools import add_constant
|
|
|
|
|
|
class OaxacaBlinder:
|
|
"""
|
|
Class to perform Oaxaca-Blinder Decomposition.
|
|
|
|
Parameters
|
|
----------
|
|
endog : array_like
|
|
The endogenous variable or the dependent variable that you are trying
|
|
to explain.
|
|
exog : array_like
|
|
The exogenous variable(s) or the independent variable(s) that you are
|
|
using to explain the endogenous variable.
|
|
bifurcate : {int, str}
|
|
The column of the exogenous variable(s) on which to split. This would
|
|
generally be the group that you wish to explain the two means for.
|
|
Int of the column for a NumPy array or int/string for the name of
|
|
the column in Pandas.
|
|
hasconst : bool, optional
|
|
Indicates whether the two exogenous variables include a user-supplied
|
|
constant. If True, a constant is assumed. If False, a constant is added
|
|
at the start. If nothing is supplied, then True is assumed.
|
|
swap : bool, optional
|
|
Imitates the STATA Oaxaca command by allowing users to choose to swap
|
|
groups. Unlike STATA, this is assumed to be True instead of False
|
|
cov_type : str, optional
|
|
See regression.linear_model.RegressionResults for a description of the
|
|
available covariance estimators
|
|
cov_kwds : dict, optional
|
|
See linear_model.RegressionResults.get_robustcov_results for a
|
|
description required keywords for alternative covariance estimators
|
|
|
|
Notes
|
|
-----
|
|
Please check if your data includes at constant. This will still run, but
|
|
will return incorrect values if set incorrectly.
|
|
|
|
You can access the models by using their code as an attribute, e.g.,
|
|
_t_model for the total model, _f_model for the first model, _s_model for
|
|
the second model.
|
|
|
|
Examples
|
|
--------
|
|
>>> import numpy as np
|
|
>>> import statsmodels.api as sm
|
|
>>> data = sm.datasets.ccards.load()
|
|
|
|
'3' is the column of which we want to explain or which indicates
|
|
the two groups. In this case, it is if you rent.
|
|
|
|
>>> model = sm.OaxacaBlinder(df.endog, df.exog, 3, hasconst = False)
|
|
>>> model.two_fold().summary()
|
|
Oaxaca-Blinder Two-fold Effects
|
|
Unexplained Effect: 27.94091
|
|
Explained Effect: 130.80954
|
|
Gap: 158.75044
|
|
|
|
>>> model.three_fold().summary()
|
|
Oaxaca-Blinder Three-fold Effects
|
|
Endowments Effect: 321.74824
|
|
Coefficient Effect: 75.45371
|
|
Interaction Effect: -238.45151
|
|
Gap: 158.75044
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
endog,
|
|
exog,
|
|
bifurcate,
|
|
hasconst=True,
|
|
swap=True,
|
|
cov_type="nonrobust",
|
|
cov_kwds=None,
|
|
):
|
|
if str(type(exog)).find("pandas") != -1:
|
|
bifurcate = exog.columns.get_loc(bifurcate)
|
|
endog, exog = np.array(endog), np.array(exog)
|
|
|
|
self.two_fold_type = None
|
|
self.bifurcate = bifurcate
|
|
self.cov_type = cov_type
|
|
self.cov_kwds = cov_kwds
|
|
self.neumark = np.delete(exog, bifurcate, axis=1)
|
|
self.exog = exog
|
|
self.hasconst = hasconst
|
|
bi_col = exog[:, bifurcate]
|
|
endog = np.column_stack((bi_col, endog))
|
|
bi = np.unique(bi_col)
|
|
self.bi_col = bi_col
|
|
|
|
# split the data along the bifurcate axis, the issue is you need to
|
|
# delete it after you fit the model for the total model.
|
|
exog_f = exog[np.where(exog[:, bifurcate] == bi[0])]
|
|
exog_s = exog[np.where(exog[:, bifurcate] == bi[1])]
|
|
endog_f = endog[np.where(endog[:, 0] == bi[0])]
|
|
endog_s = endog[np.where(endog[:, 0] == bi[1])]
|
|
exog_f = np.delete(exog_f, bifurcate, axis=1)
|
|
exog_s = np.delete(exog_s, bifurcate, axis=1)
|
|
endog_f = endog_f[:, 1]
|
|
endog_s = endog_s[:, 1]
|
|
self.endog = endog[:, 1]
|
|
|
|
self.len_f, self.len_s = len(endog_f), len(endog_s)
|
|
self.gap = endog_f.mean() - endog_s.mean()
|
|
|
|
if swap and self.gap < 0:
|
|
endog_f, endog_s = endog_s, endog_f
|
|
exog_f, exog_s = exog_s, exog_f
|
|
self.gap = endog_f.mean() - endog_s.mean()
|
|
bi[0], bi[1] = bi[1], bi[0]
|
|
|
|
self.bi = bi
|
|
|
|
if hasconst is False:
|
|
exog_f = add_constant(exog_f, prepend=False)
|
|
exog_s = add_constant(exog_s, prepend=False)
|
|
self.exog = add_constant(self.exog, prepend=False)
|
|
self.neumark = add_constant(self.neumark, prepend=False)
|
|
|
|
self.exog_f_mean = np.mean(exog_f, axis=0)
|
|
self.exog_s_mean = np.mean(exog_s, axis=0)
|
|
|
|
self._f_model = OLS(endog_f, exog_f).fit(
|
|
cov_type=cov_type, cov_kwds=cov_kwds
|
|
)
|
|
self._s_model = OLS(endog_s, exog_s).fit(
|
|
cov_type=cov_type, cov_kwds=cov_kwds
|
|
)
|
|
|
|
def variance(self, decomp_type, n=5000, conf=0.99):
|
|
"""
|
|
A helper function to calculate the variance/std. Used to keep
|
|
the decomposition functions cleaner
|
|
"""
|
|
if self.submitted_n is not None:
|
|
n = self.submitted_n
|
|
if self.submitted_conf is not None:
|
|
conf = self.submitted_conf
|
|
if self.submitted_weight is not None:
|
|
submitted_weight = [
|
|
self.submitted_weight,
|
|
1 - self.submitted_weight,
|
|
]
|
|
bi = self.bi
|
|
bifurcate = self.bifurcate
|
|
endow_eff_list = []
|
|
coef_eff_list = []
|
|
int_eff_list = []
|
|
exp_eff_list = []
|
|
unexp_eff_list = []
|
|
for _ in range(0, n):
|
|
endog = np.column_stack((self.bi_col, self.endog))
|
|
exog = self.exog
|
|
amount = len(endog)
|
|
|
|
samples = np.random.randint(0, high=amount, size=amount)
|
|
endog = endog[samples]
|
|
exog = exog[samples]
|
|
neumark = np.delete(exog, bifurcate, axis=1)
|
|
|
|
exog_f = exog[np.where(exog[:, bifurcate] == bi[0])]
|
|
exog_s = exog[np.where(exog[:, bifurcate] == bi[1])]
|
|
endog_f = endog[np.where(endog[:, 0] == bi[0])]
|
|
endog_s = endog[np.where(endog[:, 0] == bi[1])]
|
|
exog_f = np.delete(exog_f, bifurcate, axis=1)
|
|
exog_s = np.delete(exog_s, bifurcate, axis=1)
|
|
endog_f = endog_f[:, 1]
|
|
endog_s = endog_s[:, 1]
|
|
endog = endog[:, 1]
|
|
|
|
two_fold_type = self.two_fold_type
|
|
|
|
if self.hasconst is False:
|
|
exog_f = add_constant(exog_f, prepend=False)
|
|
exog_s = add_constant(exog_s, prepend=False)
|
|
exog = add_constant(exog, prepend=False)
|
|
neumark = add_constant(neumark, prepend=False)
|
|
|
|
_f_model = OLS(endog_f, exog_f).fit(
|
|
cov_type=self.cov_type, cov_kwds=self.cov_kwds
|
|
)
|
|
_s_model = OLS(endog_s, exog_s).fit(
|
|
cov_type=self.cov_type, cov_kwds=self.cov_kwds
|
|
)
|
|
exog_f_mean = np.mean(exog_f, axis=0)
|
|
exog_s_mean = np.mean(exog_s, axis=0)
|
|
|
|
if decomp_type == 3:
|
|
endow_eff = (exog_f_mean - exog_s_mean) @ _s_model.params
|
|
coef_eff = exog_s_mean @ (_f_model.params - _s_model.params)
|
|
int_eff = (exog_f_mean - exog_s_mean) @ (
|
|
_f_model.params - _s_model.params
|
|
)
|
|
|
|
endow_eff_list.append(endow_eff)
|
|
coef_eff_list.append(coef_eff)
|
|
int_eff_list.append(int_eff)
|
|
|
|
elif decomp_type == 2:
|
|
len_f = len(exog_f)
|
|
len_s = len(exog_s)
|
|
|
|
if two_fold_type == "cotton":
|
|
t_params = (len_f / (len_f + len_s) * _f_model.params) + (
|
|
len_s / (len_f + len_s) * _s_model.params
|
|
)
|
|
|
|
elif two_fold_type == "reimers":
|
|
t_params = 0.5 * (_f_model.params + _s_model.params)
|
|
|
|
elif two_fold_type == "self_submitted":
|
|
t_params = (
|
|
submitted_weight[0] * _f_model.params
|
|
+ submitted_weight[1] * _s_model.params
|
|
)
|
|
|
|
elif two_fold_type == "nuemark":
|
|
_t_model = OLS(endog, neumark).fit(
|
|
cov_type=self.cov_type, cov_kwds=self.cov_kwds
|
|
)
|
|
t_params = _t_model.params
|
|
|
|
else:
|
|
_t_model = OLS(endog, exog).fit(
|
|
cov_type=self.cov_type, cov_kwds=self.cov_kwds
|
|
)
|
|
t_params = np.delete(_t_model.params, bifurcate)
|
|
|
|
unexplained = (exog_f_mean @ (_f_model.params - t_params)) + (
|
|
exog_s_mean @ (t_params - _s_model.params)
|
|
)
|
|
|
|
explained = (exog_f_mean - exog_s_mean) @ t_params
|
|
|
|
unexp_eff_list.append(unexplained)
|
|
exp_eff_list.append(explained)
|
|
|
|
high, low = int(n * conf), int(n * (1 - conf))
|
|
if decomp_type == 3:
|
|
return [
|
|
np.std(np.sort(endow_eff_list)[low:high]),
|
|
np.std(np.sort(coef_eff_list)[low:high]),
|
|
np.std(np.sort(int_eff_list)[low:high]),
|
|
]
|
|
elif decomp_type == 2:
|
|
return [
|
|
np.std(np.sort(unexp_eff_list)[low:high]),
|
|
np.std(np.sort(exp_eff_list)[low:high]),
|
|
]
|
|
|
|
def three_fold(self, std=False, n=None, conf=None):
|
|
"""
|
|
Calculates the three-fold Oaxaca Blinder Decompositions
|
|
|
|
Parameters
|
|
----------
|
|
std: boolean, optional
|
|
If true, bootstrapped standard errors will be calculated.
|
|
n: int, optional
|
|
A amount of iterations to calculate the bootstrapped
|
|
standard errors. This defaults to 5000.
|
|
conf: float, optional
|
|
This is the confidence required for the standard error
|
|
calculation. Defaults to .99, but could be anything less
|
|
than or equal to one. One is heavy discouraged, due to the
|
|
extreme outliers inflating the variance.
|
|
|
|
Returns
|
|
-------
|
|
OaxacaResults
|
|
A results container for the three-fold decomposition.
|
|
"""
|
|
self.submitted_n = n
|
|
self.submitted_conf = conf
|
|
self.submitted_weight = None
|
|
std_val = None
|
|
self.endow_eff = (
|
|
self.exog_f_mean - self.exog_s_mean
|
|
) @ self._s_model.params
|
|
self.coef_eff = self.exog_s_mean @ (
|
|
self._f_model.params - self._s_model.params
|
|
)
|
|
self.int_eff = (self.exog_f_mean - self.exog_s_mean) @ (
|
|
self._f_model.params - self._s_model.params
|
|
)
|
|
|
|
if std is True:
|
|
std_val = self.variance(3)
|
|
|
|
return OaxacaResults(
|
|
(self.endow_eff, self.coef_eff, self.int_eff, self.gap),
|
|
3,
|
|
std_val=std_val,
|
|
)
|
|
|
|
def two_fold(
|
|
self,
|
|
std=False,
|
|
two_fold_type="pooled",
|
|
submitted_weight=None,
|
|
n=None,
|
|
conf=None,
|
|
):
|
|
"""
|
|
Calculates the two-fold or pooled Oaxaca Blinder Decompositions
|
|
|
|
Methods
|
|
-------
|
|
std: boolean, optional
|
|
If true, bootstrapped standard errors will be calculated.
|
|
|
|
two_fold_type: string, optional
|
|
This method allows for the specific calculation of the
|
|
non-discriminatory model. There are four different types
|
|
available at this time. pooled, cotton, reimers, self_submitted.
|
|
Pooled is assumed and if a non-viable parameter is given,
|
|
pooled will be ran.
|
|
|
|
pooled - This type assumes that the pooled model's parameters
|
|
(a normal regression) is the non-discriminatory model.
|
|
This includes the indicator variable. This is generally
|
|
the best idea. If you have economic justification for
|
|
using others, then use others.
|
|
|
|
nuemark - This is similar to the pooled type, but the regression
|
|
is not done including the indicator variable.
|
|
|
|
cotton - This type uses the adjusted in Cotton (1988), which
|
|
accounts for the undervaluation of one group causing the
|
|
overevalution of another. It uses the sample size weights for
|
|
a linear combination of the two model parameters
|
|
|
|
reimers - This type uses a linear combination of the two
|
|
models with both parameters being 50% of the
|
|
non-discriminatory model.
|
|
|
|
self_submitted - This allows the user to submit their
|
|
own weights. Please be sure to put the weight of the larger mean
|
|
group only. This should be submitted in the
|
|
submitted_weights variable.
|
|
|
|
submitted_weight: int/float, required only for self_submitted,
|
|
This is the submitted weight for the larger mean. If the
|
|
weight for the larger mean is p, then the weight for the
|
|
other mean is 1-p. Only submit the first value.
|
|
|
|
n: int, optional
|
|
A amount of iterations to calculate the bootstrapped
|
|
standard errors. This defaults to 5000.
|
|
conf: float, optional
|
|
This is the confidence required for the standard error
|
|
calculation. Defaults to .99, but could be anything less
|
|
than or equal to one. One is heavy discouraged, due to the
|
|
extreme outliers inflating the variance.
|
|
|
|
Returns
|
|
-------
|
|
OaxacaResults
|
|
A results container for the two-fold decomposition.
|
|
"""
|
|
self.submitted_n = n
|
|
self.submitted_conf = conf
|
|
std_val = None
|
|
self.two_fold_type = two_fold_type
|
|
self.submitted_weight = submitted_weight
|
|
|
|
if two_fold_type == "cotton":
|
|
self.t_params = (
|
|
self.len_f / (self.len_f + self.len_s) * self._f_model.params
|
|
) + (self.len_s / (self.len_f + self.len_s) * self._s_model.params)
|
|
|
|
elif two_fold_type == "reimers":
|
|
self.t_params = 0.5 * (self._f_model.params + self._s_model.params)
|
|
|
|
elif two_fold_type == "self_submitted":
|
|
if submitted_weight is None:
|
|
raise ValueError("Please submit weights")
|
|
submitted_weight = [submitted_weight, 1 - submitted_weight]
|
|
self.t_params = (
|
|
submitted_weight[0] * self._f_model.params
|
|
+ submitted_weight[1] * self._s_model.params
|
|
)
|
|
|
|
elif two_fold_type == "nuemark":
|
|
self._t_model = OLS(self.endog, self.neumark).fit(
|
|
cov_type=self.cov_type, cov_kwds=self.cov_kwds
|
|
)
|
|
self.t_params = self._t_model.params
|
|
|
|
else:
|
|
self._t_model = OLS(self.endog, self.exog).fit(
|
|
cov_type=self.cov_type, cov_kwds=self.cov_kwds
|
|
)
|
|
self.t_params = np.delete(self._t_model.params, self.bifurcate)
|
|
|
|
self.unexplained = (
|
|
self.exog_f_mean @ (self._f_model.params - self.t_params)
|
|
) + (self.exog_s_mean @ (self.t_params - self._s_model.params))
|
|
self.explained = (self.exog_f_mean - self.exog_s_mean) @ self.t_params
|
|
|
|
if std is True:
|
|
std_val = self.variance(2)
|
|
|
|
return OaxacaResults(
|
|
(self.unexplained, self.explained, self.gap), 2, std_val=std_val
|
|
)
|
|
|
|
|
|
class OaxacaResults:
|
|
"""
|
|
This class summarizes the fit of the OaxacaBlinder model.
|
|
|
|
Use .summary() to get a table of the fitted values or
|
|
use .params to receive a list of the values
|
|
use .std to receive a list of the standard errors
|
|
|
|
If a two-fold model was fitted, this will return
|
|
unexplained effect, explained effect, and the
|
|
mean gap. The list will always be of the following order
|
|
and type. If standard error was asked for, then standard error
|
|
calculations will also be included for each variable after each
|
|
calculated effect.
|
|
|
|
unexplained : float
|
|
This is the effect that cannot be explained by the data at hand.
|
|
This does not mean it cannot be explained with more.
|
|
explained : float
|
|
This is the effect that can be explained using the data.
|
|
gap : float
|
|
This is the gap in the mean differences of the two groups.
|
|
|
|
If a three-fold model was fitted, this will
|
|
return characteristic effect, coefficient effect
|
|
interaction effect, and the mean gap. The list will
|
|
be of the following order and type. If standard error was asked
|
|
for, then standard error calculations will also be included for
|
|
each variable after each calculated effect.
|
|
|
|
endowment effect : float
|
|
This is the effect due to the group differences in
|
|
predictors
|
|
coefficient effect : float
|
|
This is the effect due to differences of the coefficients
|
|
of the two groups
|
|
interaction effect : float
|
|
This is the effect due to differences in both effects
|
|
existing at the same time between the two groups.
|
|
gap : float
|
|
This is the gap in the mean differences of the two groups.
|
|
|
|
Attributes
|
|
----------
|
|
params
|
|
A list of all values for the fitted models.
|
|
std
|
|
A list of standard error calculations.
|
|
"""
|
|
|
|
def __init__(self, results, model_type, std_val=None):
|
|
self.params = results
|
|
self.std = std_val
|
|
self.model_type = model_type
|
|
|
|
def summary(self):
|
|
"""
|
|
Print a summary table with the Oaxaca-Blinder effects
|
|
"""
|
|
if self.model_type == 2:
|
|
if self.std is None:
|
|
print(
|
|
dedent(
|
|
f"""\
|
|
Oaxaca-Blinder Two-fold Effects
|
|
Unexplained Effect: {self.params[0]:.5f}
|
|
Explained Effect: {self.params[1]:.5f}
|
|
Gap: {self.params[2]:.5f}"""
|
|
)
|
|
)
|
|
else:
|
|
print(
|
|
dedent(
|
|
"""\
|
|
Oaxaca-Blinder Two-fold Effects
|
|
Unexplained Effect: {:.5f}
|
|
Unexplained Standard Error: {:.5f}
|
|
Explained Effect: {:.5f}
|
|
Explained Standard Error: {:.5f}
|
|
Gap: {:.5f}""".format(
|
|
self.params[0],
|
|
self.std[0],
|
|
self.params[1],
|
|
self.std[1],
|
|
self.params[2],
|
|
)
|
|
)
|
|
)
|
|
if self.model_type == 3:
|
|
if self.std is None:
|
|
print(
|
|
dedent(
|
|
f"""\
|
|
Oaxaca-Blinder Three-fold Effects
|
|
Endowment Effect: {self.params[0]:.5f}
|
|
Coefficient Effect: {self.params[1]:.5f}
|
|
Interaction Effect: {self.params[2]:.5f}
|
|
Gap: {self.params[3]:.5f}"""
|
|
)
|
|
)
|
|
else:
|
|
print(
|
|
dedent(
|
|
f"""\
|
|
Oaxaca-Blinder Three-fold Effects
|
|
Endowment Effect: {self.params[0]:.5f}
|
|
Endowment Standard Error: {self.std[0]:.5f}
|
|
Coefficient Effect: {self.params[1]:.5f}
|
|
Coefficient Standard Error: {self.std[1]:.5f}
|
|
Interaction Effect: {self.params[2]:.5f}
|
|
Interaction Standard Error: {self.std[2]:.5f}
|
|
Gap: {self.params[3]:.5f}"""
|
|
)
|
|
)
|