AIM-PIbd-32-Kurbanova-A-A/aimenv/Lib/site-packages/statsmodels/sandbox/regression/treewalkerclass.py
2024-10-02 22:15:59 +04:00

620 lines
21 KiB
Python

'''
Formulas
--------
This follows mostly Greene notation (in slides)
partially ignoring factors tau or mu for now, ADDED
(if all tau==1, then runmnl==clogit)
leaf k probability :
Prob(k|j) = exp(b_k * X_k / mu_j) / sum_{i in L(j)} exp(b_i * X_i / mu_j)
branch j probabilities :
Prob(j) = exp(b_j * X_j + mu_j * IV_j) / sum_{i in NB(j)} exp(b_i * X_i + mu_i * IV_i)
inclusive value of branch j :
IV_j = log( sum_{i in L(j)} exp(b_i * X_i / mu_j) )
this is the log of the denominator of the leaf probabilities
L(j) : leaves at branch j, where k is child of j
NB(j) : set of j and its siblings
Design
------
* splitting calculation transmission between returns and changes to
instance.probs
- probability for each leaf is in instance.probs
- inclusive values and contribution of exog on branch level need to be
added separately. handed up the tree through returns
* question: should params array be accessed directly through
`self.recursionparams[self.parinddict[name]]` or should the dictionary
return the values of the params, e.g. `self.params_node_dict[name]`.
The second would be easier for fixing tau=1 for degenerate branches.
The easiest might be to do the latter only for the taus and default to 1 if
the key ('tau_'+branchname) is not found. I also need to exclude tau for
degenerate branches from params, but then I cannot change them from the
outside for testing and experimentation. (?)
* SAS manual describes restrictions on tau (though their model is a bit
different), e.g. equal tau across sibling branches, fixed tau. They also
allow linear and non-linear (? not sure) restriction on params, the
regression coefficients. Related to previous issue, callback without access
to the underlying array, where params_node_dict returns the actual params
value would provide more flexibility to impose different kinds of restrictions.
bugs/problems
-------------
* singleton branches return zero to `top`, not a value
I'm not sure what they are supposed to return, given the split between returns
and instance.probs DONE
* Why does 'Air' (singleton branch) get probability exactly 0.5 ? DONE
TODO
----
* add tau, normalization for nested logit, currently tau is 1 (clogit)
taus also needs to become part of params MOSTLY DONE
* add effect of branch level explanatory variables DONE
* write a generic multinomial logit that takes arbitrary probabilities, this
would be the same for MNL, clogit and runmnl,
delegate calculation of probabilities
* test on actual data,
- tau=1 replicate clogit numbers,
- transport example from Greene tests 1-level tree and degenerate sub-trees
- test example for multi-level trees ???
* starting values: Greene mentions that the starting values for the nested
version come from the (non-nested) MNL version. SPSS uses constant equal
(? check transformation) to sample frequencies and zeros for slope
coefficient as starting values for (non-nested) MNL
* associated test statistics
- (I do not think I will fight with the gradient or hessian of the log-like.)
- basic MLE statistics can be generic
- tests specific to the model (?)
* nice printouts since I'm currently collecting a lot of information in the tree
recursion and everything has names
The only parts that are really necessary to get a functional nested logit are
adding the taus (DONE) and the MLE wrapper class. The rest are enhancements.
I added fake tau, one fixed tau for all branches. (OBSOLETE)
It's not clear where the tau for leaf should be added either at
original assignment of self.probs, or as part of the one-step-down
probability correction in the bottom branches. The second would be
cleaner (would make treatment of leaves and branches more symmetric,
but requires that initial assignment in the leaf only does
initialization. e.g self.probs = 1. ???
DONE added taus
still todo:
- tau for degenerate branches are not identified, set to 1 for MLE
- rename parinddict to paramsinddict
Author: Josef Perktold
License : BSD (3-clause)
'''
from statsmodels.compat.python import lrange
from pprint import pprint
import numpy as np
def randintw(w, size=1):
    '''generate integer random variables given probabilities

    useful because it can be used as index into any array or sequence type

    Parameters
    ----------
    w : 1d array_like
        sequence of non-negative weights, probabilities. The weights are
        normalized to add to one.
    size : int or tuple of ints
        shape of output array

    Returns
    -------
    rvs : array of shape given by size
        random variables each distributed according to the same discrete
        distribution defined by (normalized) w. Every value is a valid
        index into ``w``.

    Raises
    ------
    ValueError
        If ``w`` is empty or its total is not strictly positive (this also
        covers a NaN total), since no distribution can be defined then.

    Examples
    --------
    >>> np.random.seed(0)
    >>> randintw([0.4, 0.4, 0.2], size=(2,6))
    array([[1, 1, 1, 1, 1, 1],
           [1, 2, 2, 0, 1, 1]])

    >>> np.bincount(randintw([0.6, 0.4, 0.0], size=3000))/3000.
    array([ 0.59566667,  0.40433333])

    '''
    #from Charles Harris, numpy mailing list
    cdf = np.cumsum(w)
    if cdf.size == 0 or not cdf[-1] > 0:
        raise ValueError('w must contain at least one weight and have a '
                         'positive sum')
    # Normalize by the last cumulative value rather than by np.sum(w): the two
    # can differ by rounding, and only this form guarantees cdf[-1] == 1.0 so
    # that a draw from [0, 1) can never map to the out-of-range index len(w).
    cdf = cdf / cdf[-1]
    draws = np.random.random(np.prod(size))
    # side='right' selects i with cdf[i-1] <= u < cdf[i]; a draw of exactly 0
    # therefore skips leading zero-weight entries instead of selecting them.
    rvs = cdf.searchsorted(draws, side='right').reshape(size)
    return rvs
def getbranches(tree):
    '''
    collect the names of all branches of a tree in depth-first order

    Parameters
    ----------
    tree : tuple or leaf name
        a branch is a tuple ``(branch_name, [subtree1, ..., subtreek])`` as
        defined for RU2NMNL; anything that is not a tuple is treated as a
        leaf.

    Returns
    -------
    branch : list
        list of all branch names, each branch listed before the branches
        nested below it. Empty if `tree` is a leaf.

    '''
    # a leaf contributes no branch names
    if not isinstance(tree, tuple):
        return []

    branchname, children = tree
    nested = [found for child in children for found in getbranches(child)]
    return [branchname] + nested
def getnodes(tree):
    '''
    collect branch names, leaf names and degenerate branch names of a tree

    Parameters
    ----------
    tree : tuple or leaf name
        a branch is a tuple ``(branch_name, [subtree1, ..., subtreek])`` as
        defined for RU2NMNL; anything that is not a tuple is treated as a
        leaf.

    Returns
    -------
    branch : list
        list of all branch names, depth-first
    leaves : list
        list of all leaves names, in the order they appear in the tree
    degenerate : list
        names of the branches that have exactly one direct subtree

    '''
    # a bare leaf: no branches, itself as the only leaf, nothing degenerate
    if not isinstance(tree, tuple):
        return [], [tree], []

    branchname, children = tree
    branches = [branchname]
    leaves = []
    degenerate = [branchname] if len(children) == 1 else []

    collected = (branches, leaves, degenerate)
    for child in children:
        # merge the three lists returned for the child into the running ones
        for target, found in zip(collected, getnodes(child)):
            target.extend(found)
    return branches, leaves, degenerate
# Module-level switch read by RU2NMNL.calc_prob; it selects what the tree walk
# accumulates and returns:
#   2 : numeric mode - branch sums start at 0, leaves return log(leafprob)
#       and branches return branchxb + tau * log(branchsum)
#   1 : branch sums start from datadict[name] and leaves return their data
#       as an array
#   0 : branch sums start from the branch name and leaves return their name
#       as a string
# NOTE(review): modes 0 and 1 look like leftovers from early debugging of the
# recursion - confirm before relying on them.
testxb = 2 #global to class to return strings instead of numbers
class RU2NMNL:
    '''Nested Multinomial Logit with Random Utility 2 parameterization


    Parameters
    ----------
    endog : ndarray
        not used in this part
    exog : dict_like
        dictionary access to data where keys correspond to branch and leaf
        names. The values are the data arrays for the exog in that node.
    tree : nested tuples and lists
        each branch, tree or subtree, is defined by a tuple
        (branch_name, [subtree1, subtree2, ..., subtreek])
        Bottom branches have as subtrees the list of leaf names.
    paramsind : dictionary
        dictionary that maps branch and leaf names to the names of parameters,
        the coefficients for exogs)

    Methods
    -------
    get_probs
    calc_prob

    Attributes
    ----------
    branches : list
        branch names in the order returned by getnodes(tree)
    leaves : list
        leaf names in the order returned by getnodes(tree)
    branches_degenerate : list
        names of branches with a single subtree
    paramsnames : list
        sorted unique coefficient names followed by one 'tau_<branch>' entry
        per branch
    paramsidx : dict
        maps each name in paramsnames to its position in the parameter array
    parinddict : dict
        maps branch and leaf names to the list of parameter array indices of
        their coefficients
    recursionparams : ndarray
        parameter array used by calc_prob; initialized to zeros for the
        coefficients and ones for the taus

    Notes
    -----
    endog needs to be encoded so it is consistent with self.leaves, which
    defines the columns for the probability array. The ordering in leaves is
    determined by the ordering of the tree.
    In the dummy encoding of endog, the columns of endog need to have the
    same order as self.leaves. In the integer encoding, the integer for a
    choice has to correspond to the index in self.leaves.
    (This could be made more robust, by handling the endog encoding internally
    by leaf names, if endog is defined as categorical variable with
    associated category level names.)

    '''

    def __init__(self, endog, exog, tree, paramsind):
        self.endog = endog
        self.datadict = exog
        self.tree = tree
        self.paramsind = paramsind

        self.branchsum = ''
        self.probs = {}
        self.probstxt = {}
        self.branchleaves = {}
        self.branchvalues = {}  #just to keep track of returns by branches
        self.branchsums = {}
        self.bprobs = {}
        self.branches, self.leaves, self.branches_degenerate = getnodes(tree)
        self.nbranches = len(self.branches)

        #copied over but not quite sure yet
        #unique, parameter array names,
        #sorted alphabetically, order is/should be only internal
        self.paramsnames = (sorted({i for j in paramsind.values()
                                    for i in j}) +
                            ['tau_%s' % bname for bname in self.branches])

        self.nparams = len(self.paramsnames)

        #mapping coefficient names to indices to unique/parameter array
        self.paramsidx = {name: idx for (idx, name) in
                          enumerate(self.paramsnames)}

        #mapping branch and leaf names to index in parameter array
        self.parinddict = {k: [self.paramsidx[j] for j in v]
                           for k, v in self.paramsind.items()}

        #default parameters: zero coefficients, all taus equal to one
        self.recursionparams = np.zeros(len(self.paramsnames))
        self.recursionparams[-self.nbranches:] = 1  #values for tau's

    def get_probs(self, params):
        '''
        obtain the probability array given an array of parameters

        This is the function that can be called by loglike or other methods
        that need the probabilities as function of the params.

        Parameters
        ----------
        params : 1d array_like, (nparams,)
            coefficients and tau that parameterize the model, ordered as in
            paramsnames. The required length can be obtained by nparams.
            (and will depend on the number of degenerate leaves - not yet)

        Returns
        -------
        probs : ndarray
            one entry per choice, stacked along the first axis in the order
            given by attribute leaves. See note in docstring of class.
            calc_prob sums the linear predictor of a leaf over all elements
            of its data, so with scalar taus each entry is a scalar.

        '''
        #asarray so that plain sequences can be indexed with index lists;
        #an ndarray is passed through unchanged
        self.recursionparams = np.asarray(params)
        self.calc_prob(self.tree)
        #ordering follows self.leaves, i.e. the sequence of leaves in tree
        #TODO: need a check/assert that this sequence is the same as the
        #      encoding in endog
        return np.array([self.probs[leaf] for leaf in self.leaves])

    def calc_prob(self, tree, parent=None):
        '''walking a tree bottom-up based on dictionary

        Leaf probabilities are stored in self.probs; the inclusive values and
        the contribution of branch level exog are handed up the tree through
        the return values.

        Parameters
        ----------
        tree : tuple or leaf name
            (sub)tree to evaluate, a branch tuple (name, [subtrees]) or a
            leaf name
        parent : None or str
            name of the branch that contains `tree`, None for the root. A
            leaf needs a parent because it uses the tau of its parent.

        Returns
        -------
        value
            With the module-level testxb == 2: for a leaf the log of its
            unnormalized weight exp(sum(x * b) / tau_parent); for a branch
            ``branchxb + tau * log(branchsum)``, where tau is 1 for the
            branch named 'top'. For testxb < 2 the raw branch sum, leaf data
            or leaf name is returned instead.

        Notes
        -----
        Updates self.probs, self.probstxt, self.branchleaves,
        self.branchvalues, self.branchsums and self.bprobs as a side effect.

        '''
        #DEBUG is a module-level flag that is only assigned when the module is
        #run as a script; fall back to "off" so that an imported module does
        #not fail with a NameError
        debug = bool(globals().get('DEBUG', False))
        datadict = self.datadict

        if isinstance(tree, tuple):  #assumes leaves are int for choice index

            name, subtree = tree
            self.branchleaves[name] = []  #register branch in dictionary

            tau = self.recursionparams[self.paramsidx['tau_' + name]]
            if debug:
                print('----------- starting next branch-----------')
                print(name, datadict[name], 'tau=', tau)
                print('subtree', subtree)
            branchvalue = []
            if testxb == 2:
                branchsum = 0
            elif testxb == 1:
                branchsum = datadict[name]
            else:
                branchsum = name
            for b in subtree:
                if debug:
                    print(b)
                bv = self.calc_prob(b, name)
                bv = np.exp(bv / tau)  #this should not be here, when adding branch data
                branchvalue.append(bv)
                branchsum = branchsum + bv
            self.branchvalues[name] = branchvalue  #keep track what was returned

            if debug:
                print('----------- returning to branch-----------')
                print(name)
                print('branchsum in branch', name, branchsum)

            #hand the leaves collected below this branch up to its parent
            if parent is not None:
                if debug:
                    print('parent', parent)
                self.branchleaves[parent].extend(self.branchleaves[name])

            #walk one level down again to add branch probs to instance.probs
            self.bprobs[name] = []
            for bidx, b in enumerate(subtree):
                if debug:
                    print('repr(b)', repr(b), bidx)
                if not isinstance(b, tuple):
                    #b is a leaf, this implies name is a bottom branch,
                    #possible to add special things here
                    self.bprobs[name].append(self.probs[b])
                    #TODO: need tau possibly here
                    self.probs[b] = self.probs[b] / branchsum
                    if debug:
                        print('*********** branchsum at bottom branch', branchsum)
                else:
                    bname = b[0]
                    #consistency check of the accumulated sum
                    branchsum2 = sum(self.branchvalues[name])
                    assert np.abs(branchsum - branchsum2).sum() < 1e-8
                    bprob = branchvalue[bidx] / branchsum
                    self.bprobs[name].append(bprob)

                    for k in self.branchleaves[bname]:
                        if debug:
                            print('branchprob', bname, k, bprob, branchsum)
                        #temporary hack with maximum to avoid zeros
                        self.probs[k] = self.probs[k] * np.maximum(bprob, 1e-4)

            if debug:
                print('working on branch', tree, branchsum)
            if testxb < 2:
                return branchsum

            #this is the relevant part
            self.branchsums[name] = branchsum
            if np.size(self.datadict[name]) > 0:
                branchxb = np.sum(self.datadict[name] *
                                  self.recursionparams[self.parinddict[name]])
            else:
                branchxb = 0
            if not name == 'top':
                tau = self.recursionparams[self.paramsidx['tau_' + name]]
            else:
                tau = 1
            #which tau: name or parent???
            #log(branchsum) is the inclusive value of this branch
            return branchxb + tau * np.log(branchsum)

        else:
            tau = self.recursionparams[self.paramsidx['tau_' + parent]]
            if debug:
                print('parent', parent)
            self.branchleaves[parent].append(tree)  # register leave with parent
            self.probstxt[tree] = [tree + '-prob' +
                                   '(%s)' % ', '.join(self.paramsind[tree])]
            #this is not yet a prob, not normalized to 1, it is exp(x*b)
            leafprob = np.exp(np.sum(self.datadict[tree] *
                                     self.recursionparams[self.parinddict[tree]])
                              / tau)  # fake tau for now, wrong spot ???
            #it seems I get the same answer with and without tau here
            self.probs[tree] = leafprob  #= 1 #try initialization only
            #TODO: where should I add tau in the leaves

            if testxb == 2:
                return np.log(leafprob)
            elif testxb == 1:
                leavessum = np.array(datadict[tree])  # sum((datadict[bi] for bi in datadict[tree]))
                if debug:
                    print('final branch with', tree, ''.join(tree), leavessum)  #sum(tree)
                return leavessum  #sum(xb[tree])
            elif testxb == 0:
                return ''.join(tree)  #sum(tree)
if __name__ == '__main__':
    # Demo script: builds two small nested-logit trees with mock data and
    # prints the values and probabilities computed by RU2NMNL.

    # RU2NMNL.calc_prob consults the module-level DEBUG flag; 0 keeps the
    # step-by-step tracing of the recursion switched off.
    DEBUG = 0
    endog = 5  # dummy place holder

    ############## Example similar to Greene

    #get pickled data
    #endog3, xifloat3 = pickle.load(open('xifloat2.pickle','rb'))

    tree0 = ('top',
             [('Fly', ['Air']),
              ('Ground', ['Train', 'Car', 'Bus'])
              ])

    # this is with real data from Greene's clogit example
    # datadict = dict(zip(['Air', 'Train', 'Bus', 'Car'],
    #                     [xifloat[i]for i in range(4)]))

    #for testing only (mock that returns its own name)
    datadict = dict(zip(['Air', 'Train', 'Bus', 'Car'],
                        ['Airdata', 'Traindata', 'Busdata', 'Cardata']))

    # numeric mock data is needed whenever the walk does arithmetic
    if testxb:
        datadict = dict(zip(['Air', 'Train', 'Bus', 'Car'],
                            np.arange(4)))

    # branches carry no explanatory variables in this example
    datadict.update({'top': [],
                     'Fly': [],
                     'Ground': []})

    paramsind = {'top': [],
                 'Fly': [],
                 'Ground': [],
                 'Air': ['GC', 'Ttme', 'ConstA', 'Hinc'],
                 'Train': ['GC', 'Ttme', 'ConstT'],
                 'Bus': ['GC', 'Ttme', 'ConstB'],
                 'Car': ['GC', 'Ttme']
                 }

    modru = RU2NMNL(endog, datadict, tree0, paramsind)
    # the last entry of paramsnames is the tau of the last branch
    modru.recursionparams[-1] = 2
    modru.recursionparams[1] = 1

    print('Example 1')
    print('---------\n')
    print(modru.calc_prob(modru.tree))

    print('Tree')
    pprint(modru.tree)
    print('\nmodru.probs')
    pprint(modru.probs)

    ############## example with many layers

    tree2 = ('top',
             [('B1', ['a', 'b']),
              ('B2',
               [('B21', ['c', 'd']),
                ('B22', ['e', 'f', 'g'])
                ]
               ),
              ('B3', ['h'])])

    #Note: dict looses ordering
    paramsind2 = {
        'B1': [],
        'a': ['consta', 'p'],
        'b': ['constb', 'p'],
        'B2': ['const2', 'x2'],
        'B21': [],
        'c': ['constc', 'p', 'time'],
        'd': ['constd', 'p', 'time'],
        'B22': ['x22'],
        'e': ['conste', 'p', 'hince'],
        'f': ['constf', 'p', 'hincf'],
        'g': ['p', 'hincg'],
        'B3': [],
        'h': ['consth', 'p', 'h'],
        'top': []}

    # leaves 'a'..'h' get the mock data 0..7
    datadict2 = dict(zip('abcdefgh', lrange(8)))
    datadict2.update({'top': 1000, 'B1': 100, 'B2': 200, 'B21': 21, 'B22': 22, 'B3': 300})
    # >>> pprint(datadict2)
    # {'B1': 100,
    #  'B2': 200,
    #  'B21': 21,
    #  'B22': 22,
    #  'B3': 300,
    #  'a': 0,
    #  'b': 1,
    #  'c': 2,
    #  'd': 3,
    #  'e': 4,
    #  'f': 5,
    #  'g': 6,
    #  'h': 7,
    #  'top': 1000}

    modru2 = RU2NMNL(endog, datadict2, tree2, paramsind2)
    modru2.recursionparams[-3] = 2
    modru2.recursionparams[3] = 1
    print('\n\nExample 2')
    print('---------\n')
    print(modru2.calc_prob(modru2.tree))
    print('Tree')
    pprint(modru2.tree)
    # label fixed: the values printed here belong to modru2, not modru
    print('\nmodru2.probs')
    pprint(modru2.probs)

    print('sum of probs', sum(modru2.probs.values()))
    print('branchvalues')
    print(modru2.branchvalues)
    print(modru.branchvalues)

    print('branch probabilities')
    print(modru.bprobs)

    print('degenerate branches')
    print(modru.branches_degenerate)

    # Output recorded by the original author (may be stale relative to the
    # current calc_prob):
    # >>> modru.bprobs
    # {'Fly': [], 'top': [0.0016714179077931082, 0.99832858209220687], 'Ground': []}
    # >>> modru2.bprobs
    # {'top': [0.25000000000000006, 0.62499999999999989, 0.12500000000000003], 'B22': [], 'B21': [], 'B1': [], 'B2': [0.40000000000000008, 0.59999999999999998], 'B3': []}

    # 6 coefficients + 3 taus, ordered as modru.paramsnames
    params1 = np.array([0., 1., 0., 0., 0., 0., 1., 1., 2.])
    print(modru.get_probs(params1))

    # 16 coefficients + 6 taus, ordered as modru2.paramsnames
    params2 = np.array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
                        0., 0., 0., 0., 0., 1., 1., 1., 2., 1., 1.])
    # NOTE(review): the original comment here said "raises IndexError"; the
    # length of params2 matches the 22 names built from paramsind2 and tree2,
    # so that remark is possibly outdated - confirm by running.
    print(modru2.get_probs(params2))