Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/stats/oaxaca.py : 22%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# TODO Variance can be calculated for the three_fold
# TODO Group Size Effects can be accounted for
# TODO Non-Linear Oaxaca-Blinder can be used
"""
Author: Austin Adams

This class implements Oaxaca-Blinder Decomposition. It returns
an OaxacaResults Class:

OaxacaBlinder:
Two-Fold/Pooled (two_fold)
Three-Fold (three_fold)

OaxacaResults:
Table Summary (summary)

Oaxaca-Blinder is a statistical method that is used to explain
the differences between two mean values. The idea is to show
from two mean values what can be explained by the data and
what cannot by using OLS regression frameworks.

"The original use by Oaxaca's was to explain the wage
differential between two different groups of workers,
but the method has since been applied to numerous other
topics." (Wikipedia)

The model is designed to accept two endogenous response variables
and two exogenous explanatory variables. They are then fit using
the specific type of decomposition that you want.

The method was famously used in Card and Krueger's paper
"School Quality and Black-White Relative Earnings: A Direct Assessment" (1992)

General reference for Oaxaca-Blinder:

B. Jann "The Blinder-Oaxaca decomposition for linear
regression models," The Stata Journal, 2008.

Econometrics references for regression models:

E. M. Kitagawa "Components of a Difference Between Two Rates"
Journal of the American Statistical Association, 1955.

A. S. Blinder "Wage Discrimination: Reduced Form and Structural
Estimates," The Journal of Human Resources, 1973.
"""
47from statsmodels.regression.linear_model import OLS
48from statsmodels.tools.tools import add_constant
49import numpy as np
50from textwrap import dedent
class OaxacaBlinder(object):
    """
    Class to perform Oaxaca-Blinder Decomposition.

    Parameters
    ----------
    endog : array_like
        The endogenous variable or the dependent variable that you are trying
        to explain.
    exog : array_like
        The exogenous variable(s) or the independent variable(s) that you are
        using to explain the endogenous variable.
    bifurcate : {int, str}
        The column of the exogenous variable(s) on which to split. This would
        generally be the group that you wish to explain the two means for.
        Int of the column for a NumPy array or int/string for the name of
        the column in Pandas.
    hasconst : bool, optional
        Indicates whether the two exogenous variables include a user-supplied
        constant. If True, a constant is assumed. If False, a constant is
        appended as the last column. If nothing is supplied, then True is
        assumed.
    swap : bool, optional
        Imitates the STATA Oaxaca command by allowing users to choose to swap
        groups. When True and the first group's mean is lower than the
        second's, the two groups are exchanged so that the gap is
        non-negative. Unlike STATA, this is assumed to be True instead of
        False.
    cov_type : str, optional
        See regression.linear_model.RegressionResults for a description of the
        available covariance estimators.
    cov_kwds : dict, optional
        See linear_model.RegressionResults.get_robustcov_results for a
        description of the required keywords for alternative covariance
        estimators.

    Raises
    ------
    ValueError
        If the ``bifurcate`` column holds fewer than two distinct values.

    Notes
    -----
    Please check if your data includes a constant. This will still run, but
    will return incorrect values if set incorrectly.

    The two groups are the two smallest distinct values of the ``bifurcate``
    column. If that column holds more than two distinct values, rows of the
    remaining values are ignored by the group models but still enter the
    total model.

    You can access the models by using their code as an attribute, e.g.,
    _t_model for the total model, _f_model for the first model, _s_model for
    the second model.

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.ccards.load()

    '3' is the column which we want to explain or which indicates
    the two groups. In this case, it is if you rent.

    >>> model = sm.OaxacaBlinder(data.endog, data.exog, 3, hasconst=False)
    >>> model.two_fold().summary()
    Oaxaca-Blinder Two-fold Effects

    Unexplained Effect: 27.94091
    Explained Effect: 130.80954
    Gap: 158.75044
    >>> model.three_fold().summary()
    Oaxaca-Blinder Three-fold Effects

    Characteristic Effect: 321.74824
    Coefficient Effect: 75.45371
    Interaction Effect: -238.45151
    Gap: 158.75044
    """

    def __init__(self, endog, exog, bifurcate, hasconst=True,
                 swap=True, cov_type='nonrobust', cov_kwds=None):
        # A labelled (pandas-like) design matrix: translate the column label
        # into a positional index before the labels are dropped below.
        if hasattr(exog, 'columns'):
            bifurcate = exog.columns.get_loc(bifurcate)
        endog, exog = np.asarray(endog), np.asarray(exog)
        if endog.ndim > 1:
            # Only the first response column is used.
            endog = endog[:, 0]

        group_col = exog[:, bifurcate]
        groups = np.unique(group_col)
        if groups.size < 2:
            raise ValueError(
                "The bifurcate column must contain at least two distinct "
                "values to form two groups; found {}.".format(groups.size))

        # Split the rows by group. The group column stays in ``exog`` for the
        # total model but is removed from the per-group design matrices,
        # where it would be constant.
        in_first = group_col == groups[0]
        in_second = group_col == groups[1]
        exog_f = np.delete(exog[in_first], bifurcate, axis=1)
        exog_s = np.delete(exog[in_second], bifurcate, axis=1)
        endog_f = endog[in_first]
        endog_s = endog[in_second]

        self.gap = endog_f.mean() - endog_s.mean()

        # Optionally reorder the groups so the reported gap is non-negative.
        if swap and self.gap < 0:
            endog_f, endog_s = endog_s, endog_f
            exog_f, exog_s = exog_s, exog_f
            self.gap = endog_f.mean() - endog_s.mean()

        # ``None`` means "nothing supplied", which is documented as True.
        if hasconst is not None and not hasconst:
            exog_f = add_constant(exog_f, prepend=False)
            exog_s = add_constant(exog_s, prepend=False)
            exog = add_constant(exog, prepend=False)

        self._t_model = OLS(endog, exog).fit(
            cov_type=cov_type,
            cov_kwds=cov_kwds)
        self._f_model = OLS(endog_f, exog_f).fit(
            cov_type=cov_type,
            cov_kwds=cov_kwds)
        self._s_model = OLS(endog_s, exog_s).fit(
            cov_type=cov_type,
            cov_kwds=cov_kwds)

        self.exog_f_mean = np.mean(exog_f, axis=0)
        self.exog_s_mean = np.mean(exog_s, axis=0)
        # Drop the group-indicator coefficient so the total-model parameters
        # line up with the per-group design matrices.
        self.t_params = np.delete(self._t_model.params, bifurcate)

    def three_fold(self):
        """
        Calculates the three-fold Oaxaca Blinder Decompositions

        Also stores the components on the instance as ``char_eff``,
        ``coef_eff`` and ``int_eff``.

        Returns
        -------
        OaxacaResults
            A results container for the three-fold decomposition, holding
            (characteristic effect, coefficient effect, interaction effect,
            gap).
        """
        mean_diff = self.exog_f_mean - self.exog_s_mean
        param_diff = self._f_model.params - self._s_model.params

        self.char_eff = mean_diff @ self._s_model.params
        self.coef_eff = self.exog_s_mean @ param_diff
        self.int_eff = mean_diff @ param_diff

        return OaxacaResults(
            (self.char_eff, self.coef_eff,
             self.int_eff, self.gap), 3)

    def two_fold(self):
        """
        Calculates the two-fold or pooled Oaxaca Blinder Decompositions

        Also stores the components on the instance as ``unexplained`` and
        ``explained``.

        Returns
        -------
        OaxacaResults
            A results container for the two-fold decomposition, holding
            (unexplained effect, explained effect, gap).
        """
        self.unexplained = ((self.exog_f_mean
                             @ (self._f_model.params - self.t_params))
                            + (self.exog_s_mean
                               @ (self.t_params - self._s_model.params)))
        self.explained = (self.exog_f_mean - self.exog_s_mean) @ self.t_params

        return OaxacaResults((self.unexplained, self.explained, self.gap), 2)
class OaxacaResults:
    """
    This class summarizes the fit of the OaxacaBlinder model.

    Use .summary() to print a table of the fitted values or
    use .params to receive a list of the values

    If a two-fold model was fitted, this will return
    unexplained effect, explained effect, and the
    mean gap. The list will be of the following order
    and type.

    unexplained : float
        This is the effect that cannot be explained by the data at hand.
        This does not mean it cannot be explained with more.
    explained: float
        This is the effect that can be explained using the data.
    gap: float
        This is the gap in the mean differences of the two groups.

    If a three-fold model was fitted, this will
    return characteristic effect, coefficient effect
    interaction effect, and the mean gap. The list will
    be of the following order and type.

    characteristic effect : float
        This is the effect due to the group differences in
        predictors
    coefficient effect: float
        This is the effect due to differences of the coefficients
        of the two groups
    interaction effect: float
        This is the effect due to differences in both effects
        existing at the same time between the two groups.
    gap: float
        This is the gap in the mean differences of the two groups.

    Attributes
    ----------
    params
        A list of all values for the fitted models.
    model_type
        2 for a two-fold decomposition, 3 for a three-fold decomposition.
    """

    def __init__(self, results, model_type):
        self.params = results
        self.model_type = model_type

    def summary(self):
        """
        Print a summary table with the Oaxaca-Blinder effects

        Nothing is printed when ``model_type`` is neither 2 nor 3.
        """
        if self.model_type == 2:
            title = "Oaxaca-Blinder Two-fold Effects"
            labels = ("Unexplained Effect", "Explained Effect", "Gap")
        elif self.model_type == 3:
            title = "Oaxaca-Blinder Three-fold Effects"
            labels = ("Characteristic Effect", "Coefficient Effect",
                      "Interaction Effect", "Gap")
        else:
            return

        # Index params explicitly so a too-short result sequence still
        # raises instead of silently printing a truncated table.
        rows = ["{}: {:.5f}".format(label, self.params[position])
                for position, label in enumerate(labels)]
        # Title, one blank line, then one "label: value" row per effect.
        print("\n".join([title, ""] + rows))