102 lines
3.4 KiB
Python
102 lines
3.4 KiB
Python
|
"""Star98 Educational Testing dataset."""
|
||
|
from statsmodels.datasets import utils as du
|
||
|
|
||
|
__docformat__ = 'restructuredtext'

# Module-level metadata strings describing the dataset.
COPYRIGHT = """Used with express permission from the original author,
who retains all rights."""
TITLE = "Star98 Educational Dataset"
SOURCE = """
Jeff Gill's `Generalized Linear Models: A Unified Approach`

http://jgill.wustl.edu/research/books.html
"""
DESCRSHORT = """Math scores for 303 students with 10 explanatory factors"""

DESCRLONG = """
This data is on the California education policy and outcomes (STAR program
results for 1998). The data measured standardized testing by the California
Department of Education that required evaluation of 2nd - 11th grade students
by the Stanford 9 test on a variety of subjects. This dataset is at
the level of the unified school district and consists of 303 cases. The
binary response variable represents the number of 9th graders scoring
over the national median value on the mathematics exam.

The data used in this example is only a subset of the original source.
"""
|
||
|
|
||
|
# reST note: the leading "::" opens a literal block, so everything below it
# inside the string must stay indented for the markup to render.
NOTE = """::

    Number of Observations - 303 (counties in California).

    Number of Variables - 13 and 8 interaction terms.

    Definition of variables names::

        NABOVE   - Total number of students above the national median for the
                   math section.
        NBELOW   - Total number of students below the national median for the
                   math section.
        LOWINC   - Percentage of low income students
        PERASIAN - Percentage of Asian students
        PERBLACK - Percentage of black students
        PERHISP  - Percentage of Hispanic students
        PERMINTE - Percentage of minority teachers
        AVYRSEXP - Sum of teachers' years in educational service divided by the
                   number of teachers.
        AVSALK   - Total salary budget including benefits divided by the number
                   of full-time teachers (in thousands)
        PERSPENK - Per-pupil spending (in thousands)
        PTRATIO  - Pupil-teacher ratio.
        PCTAF    - Percentage of students taking UC/CSU prep courses
        PCTCHRT  - Percentage of charter schools
        PCTYRRND - Percentage of year-round schools

        The below variables are interaction terms of the variables defined
        above.

        PERMINTE_AVYRSEXP
        PERMINTE_AVSAL
        AVYRSEXP_AVSAL
        PERSPEN_PTRATIO
        PERSPEN_PCTAF
        PTRATIO_PCTAF
        PERMINTE_AVYRSEXP_AVSAL
        PERSPEN_PTRATIO_PCTAF
"""
|
||
|
|
||
|
|
||
|
|
||
|
def load():
    """
    Load the star98 data and return a Dataset class instance.

    Thin wrapper that delegates to ``load_pandas()``.

    Returns
    -------
    Dataset
        The object returned by ``load_pandas()``, exposing the data through
        its 'endog' and 'exog' attributes.
    """
    return load_pandas()
|
||
|
|
||
|
|
||
|
def load_pandas():
    """
    Load the star98 data and hand it to ``du.process_pandas``.

    Returns
    -------
    Dataset
        Result of ``du.process_pandas`` with the two count columns
        'NABOVE' and 'NBELOW' selected as the endogenous variables.
        NOTE(review): presumably the package's Dataset container - confirm
        against the ``du`` utilities module.
    """
    data = _get_data()
    return du.process_pandas(data, endog_idx=['NABOVE', 'NBELOW'])
|
||
|
|
||
|
|
||
|
def _get_data():
    """
    Read ``star98.csv`` and return it with named, re-derived count columns.

    The file is loaded with ``du.load_csv`` relative to this module, its 22
    columns are renamed, and the first two columns are rewritten so that
    'NABOVE' holds the successes and 'NBELOW' the failures.

    Returns
    -------
    data
        The object returned by ``du.load_csv`` (presumably a pandas
        DataFrame - confirm), with columns renamed and the first two
        columns overwritten.
    """
    data = du.load_csv(__file__, 'star98.csv')
    names = [
        "NABOVE", "NBELOW", "LOWINC", "PERASIAN", "PERBLACK", "PERHISP",
        "PERMINTE", "AVYRSEXP", "AVSALK", "PERSPENK", "PTRATIO", "PCTAF",
        "PCTCHRT", "PCTYRRND", "PERMINTE_AVYRSEXP", "PERMINTE_AVSAL",
        "AVYRSEXP_AVSAL", "PERSPEN_PTRATIO", "PERSPEN_PCTAF", "PTRATIO_PCTAF",
        "PERMINTE_AVYRSEXP_AVSAL", "PERSPEN_PTRATIO_PCTAF",
    ]
    data.columns = names

    # Snapshot both raw columns before either one is overwritten.
    raw_nabove = data['NABOVE'].copy()
    raw_nbelow = data['NBELOW'].copy()

    # NOTE(review): the raw second column is used as the success count and
    # the difference of the two raw columns as the failure count, which
    # suggests the raw first column is a total - confirm against the CSV.
    data['NABOVE'] = raw_nbelow  # successes
    data['NBELOW'] = raw_nabove - raw_nbelow  # now failures

    return data
|