'''conditional logit and nested conditional logit

nested conditional logit is supposed to be the random utility version
(RU2 and maybe RU1)

References:
-----------
currently based on:
Greene, Econometric Analysis, 5th edition and draft (?)
Hess, Florian, 2002, Structural Choice analysis with nested logit models,
    The Stata Journal 2(3) pp 227-252

not yet used:
Silberhorn, Nadja, Yasemin Boztug, Lutz Hildebrandt, 2008, Estimation with the
    nested logit model: specifications and software particularities,
    OR Spectrum
Koppelman, Frank S., and Chandra Bhat with technical support from Vaneet Sethi,
    Sriram Subramanian, Vincent Bernardin and Jian Zhang, 2006,
    A Self Instructing Course in Mode Choice Modeling: Multinomial and
    Nested Logit Models

Author: josef-pktd
License: BSD (simplified)
'''
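#model sketch (added for orientation, following Hess 2002 and Greene; ignores
#branch-level variables): each leaf alternative j in branch b has utility
#    V_j = X_j * beta + Z * gamma_j
#the inclusive value of branch b is IV_b = log(sum_j exp(V_j / tau_b)), and
#    P(j) = P(j | b) * P(b)
#    with  P(j | b) = exp(V_j / tau_b) / sum_k exp(V_k / tau_b)
#    and   P(b)     = exp(tau_b * IV_b) / sum_c exp(tau_c * IV_c)
#tau_b is the dissimilarity (log-sum) coefficient; tau_b = 1 for all branches
#collapses the model to the ordinary conditional logit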
import numpy as np
import numpy.lib.recfunctions as recf
from scipy import optimize


class TryCLogit:
    '''
    Conditional Logit, data handling test

    Parameters
    ----------
    endog : array (nobs, nchoices)
        dummy encoding of realized choices
    exog_bychoices : list of arrays
        explanatory variables, one array of exog for each choice. Variables
        with common coefficients have to be first in each array
    ncommon : int
        number of explanatory variables with common coefficients

    Notes
    -----
    Utility for choice j is given by

        V_j = X_j * beta + Z * gamma_j

    where X_j contains generic variables (terminology Hess) that have the
    same coefficient across choices, and Z contains variables, like
    individual-specific variables, that have different coefficients across
    choices.

    If there are choice specific constants, then they should be contained in
    Z. For identification, the constant of one choice should be dropped.
    '''

    def __init__(self, endog, exog_bychoices, ncommon):
        self.endog = endog
        self.exog_bychoices = exog_bychoices
        self.ncommon = ncommon
        self.nobs, self.nchoices = endog.shape
        self.nchoices = len(exog_bychoices)

        #TODO: rename beta to params and include inclusive values for nested CL
        betaind = [exog_bychoices[ii].shape[1] - ncommon
                   for ii in range(self.nchoices)]
        zi = np.r_[[ncommon], ncommon + np.array(betaind).cumsum()]
        z = np.arange(zi[-1])   #indices into the full parameter vector
        beta_indices = [np.r_[np.arange(ncommon), z[zi[ii]:zi[ii+1]]]
                        for ii in range(len(zi)-1)]
        self.beta_indices = beta_indices

        #for testing only
        beta = np.arange(zi[-1])
        betaidx_bychoices = [beta[idx] for idx in beta_indices]

    def xbetas(self, params):
        '''these are the V_i
        '''
        res = np.empty((self.nobs, self.nchoices))
        for choiceind in range(self.nchoices):
            res[:, choiceind] = np.dot(self.exog_bychoices[choiceind],
                                       params[self.beta_indices[choiceind]])
        return res

    def loglike(self, params):
        #normalization ?
        xb = self.xbetas(params)
        expxb = np.exp(xb)
        sumexpxb = expxb.sum(1)
        probs = expxb / sumexpxb[:, None]   #we only really need the chosen column
        loglike = (self.endog * np.log(probs)).sum(1)
        #is this the same: YES
        #self.logliketest = (self.endog * xb).sum(1) - np.log(sumexpxb)
        #if self.endog were an index then xb[self.endog]
        return -loglike.sum()   #return sum for now, not per observation
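    #note (added): because endog is a one-hot dummy over choices,
    #(endog * np.log(probs)).sum(1) picks out log P(chosen alternative), which
    #equals (endog * xb).sum(1) - np.log(sumexpxb), the logliketest line above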

    def fit(self, start_params=None):
        if start_params is None:
            #TODO: better start_params; zeros match the 6 params of the example
            start_params = np.zeros(6)
        return optimize.fmin(self.loglike, start_params, maxfun=10000)
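
#illustrative check (added; not part of the original estimation script):
#TryCLogit on synthetic data, mirroring the shapes used below (4 choices,
#ncommon=2, 6 free parameters); all names here are made up for the example
if 0:   #set to 1 to run the synthetic check
    np.random.seed(12345)
    nobs_ = 50
    #one exog array per choice; the first 2 columns carry common coefficients
    exog_bc_ = [np.random.randn(nobs_, k) for k in (4, 3, 3, 2)]
    choice_ = np.random.randint(4, size=nobs_)
    endog_ = (choice_[:, None] == np.arange(4)).astype(float)
    print(TryCLogit(endog_, exog_bc_, 2).fit())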


class TryNCLogit:
    '''
    Nested Conditional Logit (RUNMNL), data handling test

    unfinished, does not do anything yet
    '''

    def __init__(self, endog, exog_bychoices, ncommon):
        self.endog = endog
        self.exog_bychoices = exog_bychoices
        self.ncommon = ncommon
        self.nobs, self.nchoices = endog.shape
        self.nchoices = len(exog_bychoices)

        #TODO: rename beta to params and include inclusive values for nested CL
        betaind = [exog_bychoices[ii].shape[1] - ncommon
                   for ii in range(self.nchoices)]
        zi = np.r_[[ncommon], ncommon + np.array(betaind).cumsum()]
        z = np.arange(zi[-1])   #indices into the full parameter vector
        beta_indices = [np.r_[np.arange(ncommon), z[zi[ii]:zi[ii+1]]]
                        for ii in range(len(zi)-1)]
        self.beta_indices = beta_indices

        #for testing only
        beta = np.arange(zi[-1])
        betaidx_bychoices = [beta[idx] for idx in beta_indices]

    def xbetas(self, params):
        '''these are the V_i
        '''
        res = np.empty((self.nobs, self.nchoices))
        for choiceind in range(self.nchoices):
            res[:, choiceind] = np.dot(self.exog_bychoices[choiceind],
                                       params[self.beta_indices[choiceind]])
        return res

    def loglike_leafbranch(self, params, tau):
        #normalization ?
        #check/change naming for tau
        xb = self.xbetas(params)
        expxb = np.exp(xb / tau)
        sumexpxb = expxb.sum(1)
        logsumexpxb = np.log(sumexpxb)   #the inclusive value of this branch
        #loglike = (self.endog * xb).sum(1) - logsumexpxb
        probs = expxb / sumexpxb[:, None]
        return probs, logsumexpxb
        #if self.endog were an index then xb[self.endog]
        #return -loglike.sum() #return sum for now not for each observation
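    #note (added): np.exp(xb/tau) can overflow for large utilities; a
    #numerically safer variant of the inclusive value subtracts the row
    #maximum first (the usual log-sum-exp trick), e.g.
    #    m = (xb/tau).max(1)[:, None]
    #    logsumexpxb = m.ravel() + np.log(np.exp(xb/tau - m).sum(1))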

    def loglike_branch(self, params, tau, branches):
        #not yet sure how to keep track of branches during walking of tree;
        #branches: list of leaf/branch specifications, passed in explicitly
        ivs = []
        for b in branches:
            #placeholder: b is not used yet, every branch gets the same leaves
            probs, iv = self.loglike_leafbranch(params, tau)
            ivs.append(iv)
        #ivs = np.array(ivs)   #note ivs is (nobs, nbranchchoices)
        ivs = np.column_stack(ivs)   # this way ?
        exptiv = np.exp(tau * ivs)
        sumexptiv = exptiv.sum(1)
        logsumexptiv = np.log(sumexptiv)   #the inclusive value one level up
        probs = exptiv / sumexptiv[:, None]
        return probs, logsumexptiv


####### obsolete version kept for trying out attaching data;
####### the new version is in treewalkerclass.py, copy it here to replace this
####### (copying would disconnect the bzr history)

testxb = 0   #global to class, switches RU2NMNL between data and string mock


class RU2NMNL:
    '''Nested Multinomial Logit with Random Utility 2 parameterization
    '''

    def __init__(self, endog, exog, tree, paramsind):
        self.endog = endog
        self.datadict = exog
        self.tree = tree
        self.paramsind = paramsind

        self.branchsum = ''
        self.probs = {}

    def calc_prob(self, tree, keys=None):
        '''walking a tree bottom-up based on dictionary
        '''
        endog = self.endog
        datadict = self.datadict
        paramsind = self.paramsind
        branchsum = self.branchsum

        if isinstance(tree, tuple):   #tuples are branches, strings are leaves
            name, subtree = tree
            print(name, datadict[name])
            print('subtree', subtree)
            keys = []
            if testxb:
                branchsum = datadict[name]
            else:
                branchsum = name   #0
            for b in subtree:
                print(b)
                #branchsum += branch2(b)
                branchsum = branchsum + self.calc_prob(b, keys)
            print('branchsum', branchsum, keys)
            for k in keys:
                self.probs[k] = self.probs[k] + ['*' + name + '-prob']

        else:
            keys.append(tree)
            self.probs[tree] = [tree + '-prob' +
                                '(%s)' % ', '.join(self.paramsind[tree])]
            if testxb:
                leavessum = sum(datadict[bi] for bi in tree)
                print('final branch with', tree, ''.join(tree), leavessum)
                return leavessum   #sum(xb[tree])
            else:
                return ''.join(tree)   #sum(tree)

        print('working on branch', tree, branchsum)
        return branchsum

#Trying out ways to handle data
#------------------------------
#travel data from Greene
dta = np.genfromtxt('TableF23-2.txt', skip_header=1,
                    names='Mode Ttme Invc Invt GC Hinc PSize'.split())
endog = dta['Mode'].reshape(-1,4).copy() #I do not want a view
nobs, nchoices = endog.shape
datafloat = dta.view(float).reshape(-1,7)
exog = datafloat[:,1:].reshape(-1,6*nchoices).copy() #I do not want a view
print(endog.sum(0))
varnames = dta.dtype.names
print(varnames[1:])
modes = ['Air', 'Train', 'Bus', 'Car']
print(exog.mean(0).reshape(nchoices, -1)) # Greene Table 23.23
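#note (added): the raw file is in long format, one row per (individual, mode)
#pair in the fixed order Air, Train, Bus, Car; the reshape(-1, 4) above turns
#the stacked Mode dummies into one row of length nchoices per individual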
#try dummy encoding for individual-specific variables
exog_choice_names = ['GC', 'Ttme']
exog_choice = np.column_stack([dta[name] for name in exog_choice_names])
exog_choice = exog_choice.reshape(-1,len(exog_choice_names)*nchoices)
exog_choice = np.c_[endog, exog_choice] # add constant dummy
exog_individual = dta['Hinc'][:,None]
#exog2 = np.c_[exog_choice, exog_individual*endog]
# we can also overwrite and select in original datafloat
# e.g. Hinc*endog(choice)
choice_index = np.arange(dta.shape[0]) % nchoices
hinca = dta['Hinc']*(choice_index==0)
dta2 = recf.append_fields(dta, ['Hinca'], [hinca], usemask=False)
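#note (added): with nchoices rows per individual, choice_index cycles
#0, 1, 2, 3, 0, 1, ... down the stacked data, so (choice_index == k) selects
#exactly the rows belonging to choice k; e.g. np.arange(8) % 4 gives
#array([0, 1, 2, 3, 0, 1, 2, 3])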
#another version
xi = []
for ii in range(4):
    xi.append(datafloat[choice_index == ii])
#one more
dta1 = recf.append_fields(dta, ['Const'], [np.ones(dta.shape[0])],
                          usemask=False)
xivar = [['GC', 'Ttme', 'Const', 'Hinc'],
         ['GC', 'Ttme', 'Const'],
         ['GC', 'Ttme', 'Const'],
         ['GC', 'Ttme']]   #need to drop one constant for identification
xi = []
for ii in range(4):
    xi.append(dta1[xivar[ii]][choice_index == ii])
#multi-field selection does not reorder columns (numpy keeps the original
#field order); bug reported by Skipper, I think
ncommon = 2
betaind = [len(xi[ii].dtype.names)-ncommon for ii in range(4)]
zi = np.r_[[ncommon], ncommon + np.array(betaind).cumsum()]
z = np.arange(7)   #7 >= total number of parameters (6 in this example)
betaindices = [np.r_[np.array([0, 1]), z[zi[ii]:zi[ii+1]]]
               for ii in range(len(zi)-1)]
beta = np.arange(7)
betai = [beta[idx] for idx in betaindices]
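#note (added): for this dataset betaind = [2, 1, 1, 0] and zi = [2, 4, 5, 6, 6],
#so betaindices = [[0, 1, 2, 3], [0, 1, 4], [0, 1, 5], [0, 1]]: parameters
#0 and 1 (GC, Ttme) are common to all modes, the rest are the choice-specific
#constants plus the Air-specific Hinc coefficient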
#examples for TryCLogit
#----------------------
#get exogs as float
xifloat = [xx.view(float).reshape(nobs,-1) for xx in xi]
clogit = TryCLogit(endog, xifloat, 2)
debug = 0
if debug:
    res = optimize.fmin(clogit.loglike, np.ones(6))
#estimated parameters from Greene:
tab2324 = [-0.15501, -0.09612, 0.01329, 5.2074, 3.8690, 3.1632]
if debug:
    res2 = optimize.fmin(clogit.loglike, tab2324)

res3 = optimize.fmin(clogit.loglike, np.zeros(6), maxfun=10000)
#this has same numbers as Greene table 23.24, but different sequence
#coefficient on GC is exactly 10% of Greene's
#TODO: get better starting values
'''
Optimization terminated successfully.
         Current function value: 199.128369
         Iterations: 957
         Function evaluations: 1456
array([-0.0961246 , -0.0155019 ,  0.01328757,  5.20741244,  3.86905293,
        3.16319074])
'''
res3corr = res3[[1, 0, 2, 3, 4, 5]]
res3corr[0] *= 10
print(res3corr - tab2324) # diff 1e-5 to 1e-6
#199.128369 - 199.1284   #llf agrees up to the print precision in Greene
print(clogit.fit())
tree0 = ('top',
         [('Fly', ['Air']),
          ('Ground', ['Train', 'Car', 'Bus'])
          ])

datadict = dict(zip(['Air', 'Train', 'Bus', 'Car'],
                    [xifloat[i] for i in range(4)]))

#for testing only: mock entries so each choice just stands in for its own name
datadict = dict(zip(['Air', 'Train', 'Bus', 'Car'],
                    ['Airdata', 'Traindata', 'Busdata', 'Cardata']))

datadict.update({'top' :   [],
                 'Fly' :   [],
                 'Ground': []})

paramsind = {'top' :   [],
             'Fly' :   [],
             'Ground': [],
             'Air' :   ['GC', 'Ttme', 'ConstA', 'Hinc'],
             'Train' : ['GC', 'Ttme', 'ConstT'],
             'Bus' :   ['GC', 'Ttme', 'ConstB'],
             'Car' :   ['GC', 'Ttme']
             }
modru = RU2NMNL(endog, datadict, tree0, paramsind)
print(modru.calc_prob(modru.tree))
print('\nmodru.probs')
print(modru.probs)
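
#note (added): with testxb == 0 each leaf entry of modru.probs is its own
#symbolic prob term times the prob of its immediate parent branch, e.g.
#    'Air': ['Air-prob(GC, Ttme, ConstA, Hinc)', '*Fly-prob']
#keys is reset inside every branch call, so higher branches ('top') are not
#propagated down to the leaves; the rewrite in treewalkerclass.py is meant to
#handle the full branch path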