324 lines
14 KiB
Python
324 lines
14 KiB
Python
"""Analyze a set of multiple variables with a linear models
|
|
|
|
multiOLS:
|
|
take a model and test it on a series of variables defined over a
|
|
pandas dataset, returning a summary for each variable
|
|
|
|
multigroup:
|
|
take a boolean vector and the definition of several groups of variables
|
|
and test if the group has a fraction of true values higher than the
|
|
rest. It allows to test if the variables in the group are significantly
|
|
more significant than outside the group.
|
|
"""
|
|
from patsy import dmatrix
|
|
import pandas as pd
|
|
from statsmodels.api import OLS
|
|
from statsmodels.api import stats
|
|
import numpy as np
|
|
import logging
|
|
|
|
def _model2dataframe(model_endog, model_exog, model_type=OLS, **kwargs):
|
|
"""return a series containing the summary of a linear model
|
|
|
|
All the exceding parameters will be redirected to the linear model
|
|
"""
|
|
# create the linear model and perform the fit
|
|
model_result = model_type(model_endog, model_exog, **kwargs).fit()
|
|
# keeps track of some global statistics
|
|
statistics = pd.Series({'r2': model_result.rsquared,
|
|
'adj_r2': model_result.rsquared_adj})
|
|
# put them togher with the result for each term
|
|
result_df = pd.DataFrame({'params': model_result.params,
|
|
'pvals': model_result.pvalues,
|
|
'std': model_result.bse,
|
|
'statistics': statistics})
|
|
# add the complexive results for f-value and the total p-value
|
|
fisher_df = pd.DataFrame({'params': {'_f_test': model_result.fvalue},
|
|
'pvals': {'_f_test': model_result.f_pvalue}})
|
|
# merge them and unstack to obtain a hierarchically indexed series
|
|
res_series = pd.concat([result_df, fisher_df]).unstack()
|
|
return res_series.dropna()
|
|
|
|
|
|
def multiOLS(model, dataframe, column_list=None, method='fdr_bh',
|
|
alpha=0.05, subset=None, model_type=OLS, **kwargs):
|
|
"""apply a linear model to several endogenous variables on a dataframe
|
|
|
|
Take a linear model definition via formula and a dataframe that will be
|
|
the environment of the model, and apply the linear model to a subset
|
|
(or all) of the columns of the dataframe. It will return a dataframe
|
|
with part of the information from the linear model summary.
|
|
|
|
Parameters
|
|
----------
|
|
model : str
|
|
formula description of the model
|
|
dataframe : pandas.dataframe
|
|
dataframe where the model will be evaluated
|
|
column_list : list[str], optional
|
|
Names of the columns to analyze with the model.
|
|
If None (Default) it will perform the function on all the
|
|
eligible columns (numerical type and not in the model definition)
|
|
model_type : model class, optional
|
|
The type of model to be used. The default is the linear model.
|
|
Can be any linear model (OLS, WLS, GLS, etc..)
|
|
method : str, optional
|
|
the method used to perform the pvalue correction for multiple testing.
|
|
default is the Benjamini/Hochberg, other available methods are:
|
|
|
|
`bonferroni` : one-step correction
|
|
`sidak` : on-step correction
|
|
`holm-sidak` :
|
|
`holm` :
|
|
`simes-hochberg` :
|
|
`hommel` :
|
|
`fdr_bh` : Benjamini/Hochberg
|
|
`fdr_by` : Benjamini/Yekutieli
|
|
|
|
alpha : float, optional
|
|
the significance level used for the pvalue correction (default 0.05)
|
|
subset : bool array
|
|
the selected rows to be used in the regression
|
|
|
|
all the other parameters will be directed to the model creation.
|
|
|
|
Returns
|
|
-------
|
|
summary : pandas.DataFrame
|
|
a dataframe containing an extract from the summary of the model
|
|
obtained for each columns. It will give the model complexive f test
|
|
result and p-value, and the regression value and standard deviarion
|
|
for each of the regressors. The DataFrame has a hierachical column
|
|
structure, divided as:
|
|
|
|
- params: contains the parameters resulting from the models. Has
|
|
an additional column named _f_test containing the result of the
|
|
F test.
|
|
- pval: the pvalue results of the models. Has the _f_test column
|
|
for the significativity of the whole test.
|
|
- adj_pval: the corrected pvalues via the multitest function.
|
|
- std: uncertainties of the model parameters
|
|
- statistics: contains the r squared statistics and the adjusted
|
|
r squared.
|
|
|
|
Notes
|
|
-----
|
|
The main application of this function is on system biology to perform
|
|
a linear model testing of a lot of different parameters, like the
|
|
different genetic expression of several genes.
|
|
|
|
See Also
|
|
--------
|
|
statsmodels.stats.multitest
|
|
contains several functions to perform the multiple p-value correction
|
|
|
|
Examples
|
|
--------
|
|
Using the longley data as dataframe example
|
|
|
|
>>> import statsmodels.api as sm
|
|
>>> data = sm.datasets.longley.load_pandas()
|
|
>>> df = data.exog
|
|
>>> df['TOTEMP'] = data.endog
|
|
|
|
This will perform the specified linear model on all the
|
|
other columns of the dataframe
|
|
>>> multiOLS('GNP + 1', df)
|
|
|
|
This select only a certain subset of the columns
|
|
>>> multiOLS('GNP + 0', df, ['GNPDEFL', 'TOTEMP', 'POP'])
|
|
|
|
It is possible to specify a trasformation also on the target column,
|
|
conforming to the patsy formula specification
|
|
>>> multiOLS('GNP + 0', df, ['I(GNPDEFL**2)', 'center(TOTEMP)'])
|
|
|
|
It is possible to specify the subset of the dataframe
|
|
on which perform the analysis
|
|
>> multiOLS('GNP + 1', df, subset=df.GNPDEFL > 90)
|
|
|
|
Even a single column name can be given without enclosing it in a list
|
|
>>> multiOLS('GNP + 0', df, 'GNPDEFL')
|
|
"""
|
|
# data normalization
|
|
# if None take all the numerical columns that are not present in the model
|
|
# it's not waterproof but is a good enough criterion for everyday use
|
|
if column_list is None:
|
|
column_list = [name for name in dataframe.columns
|
|
if dataframe[name].dtype != object and name not in model]
|
|
# if it's a single string transform it in a single element list
|
|
if isinstance(column_list, str):
|
|
column_list = [column_list]
|
|
if subset is not None:
|
|
dataframe = dataframe.loc[subset]
|
|
# perform each model and retrieve the statistics
|
|
col_results = {}
|
|
# as the model will use always the same endogenous variables
|
|
# we can create them once and reuse
|
|
model_exog = dmatrix(model, data=dataframe, return_type="dataframe")
|
|
for col_name in column_list:
|
|
# it will try to interpret the column name as a valid dataframe
|
|
# index as it can be several times faster. If it fails it
|
|
# interpret it as a patsy formula (for example for centering)
|
|
try:
|
|
model_endog = dataframe[col_name]
|
|
except KeyError:
|
|
model_endog = dmatrix(col_name + ' + 0', data=dataframe)
|
|
# retrieve the result and store them
|
|
res = _model2dataframe(model_endog, model_exog, model_type, **kwargs)
|
|
col_results[col_name] = res
|
|
# mangle them togheter and sort by complexive p-value
|
|
summary = pd.DataFrame(col_results)
|
|
# order by the p-value: the most useful model first!
|
|
summary = summary.T.sort_values([('pvals', '_f_test')])
|
|
summary.index.name = 'endogenous vars'
|
|
# implementing the pvalue correction method
|
|
smt = stats.multipletests
|
|
for (key1, key2) in summary:
|
|
if key1 != 'pvals':
|
|
continue
|
|
p_values = summary[key1, key2]
|
|
corrected = smt(p_values, method=method, alpha=alpha)[1]
|
|
# extend the dataframe of results with the column
|
|
# of the corrected p_values
|
|
summary['adj_' + key1, key2] = corrected
|
|
return summary
|
|
|
|
|
|
def _test_group(pvalues, group_name, group, exact=True):
|
|
"""test if the objects in the group are different from the general set.
|
|
|
|
The test is performed on the pvalues set (ad a pandas series) over
|
|
the group specified via a fisher exact test.
|
|
"""
|
|
from scipy.stats import fisher_exact, chi2_contingency
|
|
|
|
totals = 1.0 * len(pvalues)
|
|
total_significant = 1.0 * np.sum(pvalues)
|
|
cross_index = [c for c in group if c in pvalues.index]
|
|
missing = [c for c in group if c not in pvalues.index]
|
|
if missing:
|
|
s = ('the test is not well defined if the group '
|
|
'has elements not presents in the significativity '
|
|
'array. group name: {}, missing elements: {}')
|
|
logging.warning(s.format(group_name, missing))
|
|
# how many are significant and not in the group
|
|
group_total = 1.0 * len(cross_index)
|
|
group_sign = 1.0 * len([c for c in cross_index if pvalues[c]])
|
|
group_nonsign = 1.0 * (group_total - group_sign)
|
|
# how many are significant and not outside the group
|
|
extern_sign = 1.0 * (total_significant - group_sign)
|
|
extern_nonsign = 1.0 * (totals - total_significant - group_nonsign)
|
|
# make the fisher test or the chi squared
|
|
test = fisher_exact if exact else chi2_contingency
|
|
table = [[extern_nonsign, extern_sign], [group_nonsign, group_sign]]
|
|
pvalue = test(np.array(table))[1]
|
|
# is the group more represented or less?
|
|
part = group_sign, group_nonsign, extern_sign, extern_nonsign
|
|
#increase = (group_sign / group_total) > (total_significant / totals)
|
|
increase = np.log((totals * group_sign)
|
|
/ (total_significant * group_total))
|
|
return pvalue, increase, part
|
|
|
|
|
|
def multigroup(pvals, groups, exact=True, keep_all=True, alpha=0.05):
|
|
"""Test if the given groups are different from the total partition.
|
|
|
|
Given a boolean array test if each group has a proportion of positives
|
|
different than the complexive proportion.
|
|
The test can be done as an exact Fisher test or approximated as a
|
|
Chi squared test for more speed.
|
|
|
|
Parameters
|
|
----------
|
|
pvals : pandas series of boolean
|
|
the significativity of the variables under analysis
|
|
groups : dict of list
|
|
the name of each category of variables under exam.
|
|
each one is a list of the variables included
|
|
exact : bool, optional
|
|
If True (default) use the fisher exact test, otherwise
|
|
use the chi squared test for contingencies tables.
|
|
For high number of elements in the array the fisher test can
|
|
be significantly slower than the chi squared.
|
|
keep_all : bool, optional
|
|
if False it will drop those groups where the fraction
|
|
of positive is below the expected result. If True (default)
|
|
it will keep all the significant results.
|
|
alpha : float, optional
|
|
the significativity level for the pvalue correction
|
|
on the whole set of groups (not inside the groups themselves).
|
|
|
|
Returns
|
|
-------
|
|
result_df: pandas dataframe
|
|
for each group returns:
|
|
|
|
pvals - the fisher p value of the test
|
|
adj_pvals - the adjusted pvals
|
|
increase - the log of the odd ratio between the
|
|
internal significant ratio versus the external one
|
|
_in_sign - significative elements inside the group
|
|
_in_non - non significative elements inside the group
|
|
_out_sign - significative elements outside the group
|
|
_out_non - non significative elements outside the group
|
|
|
|
Notes
|
|
-----
|
|
This test allow to see if a category of variables is generally better
|
|
suited to be described for the model. For example to see if a predictor
|
|
gives more information on demographic or economical parameters,
|
|
by creating two groups containing the endogenous variables of each
|
|
category.
|
|
|
|
This function is conceived for medical dataset with a lot of variables
|
|
that can be easily grouped into functional groups. This is because
|
|
The significativity of a group require a rather large number of
|
|
composing elements.
|
|
|
|
Examples
|
|
--------
|
|
A toy example on a real dataset, the Guerry dataset from R
|
|
>>> url = "https://raw.githubusercontent.com/vincentarelbundock/"
|
|
>>> url = url + "Rdatasets/csv/HistData/Guerry.csv"
|
|
>>> df = pd.read_csv(url, index_col='dept')
|
|
|
|
evaluate the relationship between the various paramenters whith the Wealth
|
|
>>> pvals = multiOLS('Wealth', df)['adj_pvals', '_f_test']
|
|
|
|
define the groups
|
|
>>> groups = {}
|
|
>>> groups['crime'] = ['Crime_prop', 'Infanticide',
|
|
... 'Crime_parents', 'Desertion', 'Crime_pers']
|
|
>>> groups['religion'] = ['Donation_clergy', 'Clergy', 'Donations']
|
|
>>> groups['wealth'] = ['Commerce', 'Lottery', 'Instruction', 'Literacy']
|
|
|
|
do the analysis of the significativity
|
|
>>> multigroup(pvals < 0.05, groups)
|
|
"""
|
|
pvals = pd.Series(pvals)
|
|
if not (set(pvals.unique()) <= {False, True}):
|
|
raise ValueError("the series should be binary")
|
|
if hasattr(pvals.index, 'is_unique') and not pvals.index.is_unique:
|
|
raise ValueError("series with duplicated index is not accepted")
|
|
results = {'pvals': {},
|
|
'increase': {},
|
|
'_in_sign': {},
|
|
'_in_non': {},
|
|
'_out_sign': {},
|
|
'_out_non': {}}
|
|
for group_name, group_list in groups.items():
|
|
res = _test_group(pvals, group_name, group_list, exact)
|
|
results['pvals'][group_name] = res[0]
|
|
results['increase'][group_name] = res[1]
|
|
results['_in_sign'][group_name] = res[2][0]
|
|
results['_in_non'][group_name] = res[2][1]
|
|
results['_out_sign'][group_name] = res[2][2]
|
|
results['_out_non'][group_name] = res[2][3]
|
|
result_df = pd.DataFrame(results).sort_values('pvals')
|
|
if not keep_all:
|
|
result_df = result_df[result_df.increase]
|
|
smt = stats.multipletests
|
|
corrected = smt(result_df['pvals'], method='fdr_bh', alpha=alpha)[1]
|
|
result_df['adj_pvals'] = corrected
|
|
return result_df
|