678 lines
23 KiB
Python
678 lines
23 KiB
Python
'''estimate distribution parameters by various methods
|
|
method of moments or matching quantiles, and Maximum Likelihood estimation
|
|
based on binned data and Maximum Product-of-Spacings
|
|
|
|
Warning: I'm still finding cut-and-paste and refactoring errors, e.g.
|
|
hardcoded variables from outer scope in functions
|
|
some results do not seem to make sense for Pareto case,
|
|
looks better now after correcting some name errors
|
|
|
|
initially loosely based on a paper and blog for quantile matching
|
|
by John D. Cook
|
|
formula for gamma quantile (ppf) matching by him (from paper)
|
|
http://www.codeproject.com/KB/recipes/ParameterPercentile.aspx
|
|
http://www.johndcook.com/blog/2010/01/31/parameters-from-percentiles/
|
|
this is what I actually used (in parts):
|
|
http://www.bepress.com/mdandersonbiostat/paper55/
|
|
|
|
quantile based estimator
|
|
^^^^^^^^^^^^^^^^^^^^^^^^
|
|
only special cases for number or parameters so far
|
|
Is there a literature for GMM estimation of distribution parameters? check
|
|
found one: Wu/Perloff 2007
|
|
|
|
|
|
binned estimator
|
|
^^^^^^^^^^^^^^^^
|
|
* I added this also
|
|
* use it for chisquare tests with estimation distribution parameters
|
|
* move this to distribution_extras (next to gof tests powerdiscrepancy and
|
|
continuous) or add to distribution_patch
|
|
|
|
|
|
example: t-distribution
|
|
* works with quantiles if they contain tail quantiles
|
|
* results with momentcondquant do not look as good as mle estimate
|
|
|
|
TODOs
|
|
* rearange and make sure I do not use module globals (as I did initially) DONE
|
|
make two version exactly identified method of moments with fsolve
|
|
and GMM (?) version with fmin
|
|
and maybe the special cases of JD Cook
|
|
update: maybe exact (MM) version is not so interesting compared to GMM
|
|
* add semifrozen version of moment and quantile based estimators,
|
|
e.g. for beta (both loc and scale fixed), or gamma (loc fixed)
|
|
* add beta example to the semifrozen MLE, fitfr, code
|
|
-> added method of moment estimator to _fitstart for beta
|
|
* start a list of how well different estimators, especially current mle work
|
|
for the different distributions
|
|
* need general GMM code (with optimal weights ?), looks like a good example
|
|
for it
|
|
* get example for binned data estimation, mailing list a while ago
|
|
* any idea when these are better than mle ?
|
|
* check language: I use quantile to mean the value of the random variable, not
|
|
quantile between 0 and 1.
|
|
* for GMM: move moment conditions to separate function, so that they can be
|
|
used for further analysis, e.g. covariance matrix of parameter estimates
|
|
* question: Are GMM properties different for matching quantiles with cdf or
|
|
ppf? Estimate should be the same, but derivatives of moment conditions
|
|
differ.
|
|
* add maximum spacings estimator, Wikipedia, Per Brodtkorb -> basic version Done
|
|
* add parameter estimation based on empirical characteristic function
|
|
(Carrasco/Florens), especially for stable distribution
|
|
* provide a model class based on estimating all distributions, and collect
|
|
all distribution specific information
|
|
|
|
|
|
References
|
|
----------
|
|
|
|
Ximing Wu, Jeffrey M. Perloff, GMM estimation of a maximum entropy
|
|
distribution with interval data, Journal of Econometrics, Volume 138,
|
|
Issue 2, 'Information and Entropy Econometrics' - A Volume in Honor of
|
|
Arnold Zellner, June 2007, Pages 532-546, ISSN 0304-4076,
|
|
DOI: 10.1016/j.jeconom.2006.05.008.
|
|
http://www.sciencedirect.com/science/article/B6VC0-4K606TK-4/2/78bc07c6245546374490f777a6bdbbcc
|
|
http://escholarship.org/uc/item/7jf5w1ht (working paper)
|
|
|
|
Johnson, Kotz, Balakrishnan: Volume 2
|
|
|
|
|
|
Author : josef-pktd
|
|
License : BSD
|
|
created : 2010-04-20
|
|
|
|
changes:
|
|
added Maximum Product-of-Spacings 2010-05-12
|
|
|
|
'''
|
|
|
|
import numpy as np
|
|
from scipy import stats, optimize, special
|
|
|
|
cache = {} #module global storage for temp results, not used
|
|
|
|
|
|
# the next two use distfn from module scope - not anymore
|
|
def gammamomentcond(distfn, params, mom2, quantile=None):
|
|
'''estimate distribution parameters based method of moments (mean,
|
|
variance) for distributions with 1 shape parameter and fixed loc=0.
|
|
|
|
Returns
|
|
-------
|
|
cond : function
|
|
|
|
Notes
|
|
-----
|
|
first test version, quantile argument not used
|
|
|
|
'''
|
|
def cond(params):
|
|
alpha, scale = params
|
|
mom2s = distfn.stats(alpha, 0.,scale)
|
|
#quantil
|
|
return np.array(mom2)-mom2s
|
|
return cond
|
|
|
|
def gammamomentcond2(distfn, params, mom2, quantile=None):
|
|
'''estimate distribution parameters based method of moments (mean,
|
|
variance) for distributions with 1 shape parameter and fixed loc=0.
|
|
|
|
Returns
|
|
-------
|
|
difference : ndarray
|
|
difference between theoretical and empirical moments
|
|
|
|
Notes
|
|
-----
|
|
first test version, quantile argument not used
|
|
|
|
The only difference to previous function is return type.
|
|
|
|
'''
|
|
alpha, scale = params
|
|
mom2s = distfn.stats(alpha, 0.,scale)
|
|
return np.array(mom2)-mom2s
|
|
|
|
|
|
|
|
######### fsolve does not move in small samples, fmin not very accurate
|
|
def momentcondunbound(distfn, params, mom2, quantile=None):
|
|
'''moment conditions for estimating distribution parameters using method
|
|
of moments, uses mean, variance and one quantile for distributions
|
|
with 1 shape parameter.
|
|
|
|
Returns
|
|
-------
|
|
difference : ndarray
|
|
difference between theoretical and empirical moments and quantiles
|
|
|
|
'''
|
|
shape, loc, scale = params
|
|
mom2diff = np.array(distfn.stats(shape, loc,scale)) - mom2
|
|
if quantile is not None:
|
|
pq, xq = quantile
|
|
#ppfdiff = distfn.ppf(pq, alpha)
|
|
cdfdiff = distfn.cdf(xq, shape, loc, scale) - pq
|
|
return np.concatenate([mom2diff, cdfdiff[:1]])
|
|
return mom2diff
|
|
|
|
|
|
###### loc scale only
|
|
def momentcondunboundls(distfn, params, mom2, quantile=None, shape=None):
|
|
'''moment conditions for estimating loc and scale of a distribution
|
|
with method of moments using either 2 quantiles or 2 moments (not both).
|
|
|
|
Returns
|
|
-------
|
|
difference : ndarray
|
|
difference between theoretical and empirical moments or quantiles
|
|
|
|
'''
|
|
loc, scale = params
|
|
mom2diff = np.array(distfn.stats(shape, loc, scale)) - mom2
|
|
if quantile is not None:
|
|
pq, xq = quantile
|
|
#ppfdiff = distfn.ppf(pq, alpha)
|
|
cdfdiff = distfn.cdf(xq, shape, loc, scale) - pq
|
|
#return np.concatenate([mom2diff, cdfdiff[:1]])
|
|
return cdfdiff
|
|
return mom2diff
|
|
|
|
|
|
|
|
######### try quantile GMM with identity weight matrix
|
|
#(just a guess that's what it is
|
|
|
|
def momentcondquant(distfn, params, mom2, quantile=None, shape=None):
|
|
'''moment conditions for estimating distribution parameters by matching
|
|
quantiles, defines as many moment conditions as quantiles.
|
|
|
|
Returns
|
|
-------
|
|
difference : ndarray
|
|
difference between theoretical and empirical quantiles
|
|
|
|
Notes
|
|
-----
|
|
This can be used for method of moments or for generalized method of
|
|
moments.
|
|
|
|
'''
|
|
#this check looks redundant/unused know
|
|
if len(params) == 2:
|
|
loc, scale = params
|
|
elif len(params) == 3:
|
|
shape, loc, scale = params
|
|
else:
|
|
#raise NotImplementedError
|
|
pass #see whether this might work, seems to work for beta with 2 shape args
|
|
|
|
#mom2diff = np.array(distfn.stats(*params)) - mom2
|
|
#if not quantile is None:
|
|
pq, xq = quantile
|
|
#ppfdiff = distfn.ppf(pq, alpha)
|
|
cdfdiff = distfn.cdf(xq, *params) - pq
|
|
#return np.concatenate([mom2diff, cdfdiff[:1]])
|
|
return cdfdiff
|
|
#return mom2diff
|
|
|
|
def fitquantilesgmm(distfn, x, start=None, pquant=None, frozen=None):
|
|
if pquant is None:
|
|
pquant = np.array([0.01, 0.05,0.1,0.4,0.6,0.9,0.95,0.99])
|
|
if start is None:
|
|
if hasattr(distfn, '_fitstart'):
|
|
start = distfn._fitstart(x)
|
|
else:
|
|
start = [1]*distfn.numargs + [0.,1.]
|
|
#TODO: vectorize this:
|
|
xqs = [stats.scoreatpercentile(x, p) for p in pquant*100]
|
|
mom2s = None
|
|
parest = optimize.fmin(lambda params:np.sum(
|
|
momentcondquant(distfn, params, mom2s,(pquant,xqs), shape=None)**2), start)
|
|
return parest
|
|
|
|
|
|
|
|
def fitbinned(distfn, freq, binedges, start, fixed=None):
|
|
'''estimate parameters of distribution function for binned data using MLE
|
|
|
|
Parameters
|
|
----------
|
|
distfn : distribution instance
|
|
needs to have cdf method, as in scipy.stats
|
|
freq : ndarray, 1d
|
|
frequency count, e.g. obtained by histogram
|
|
binedges : ndarray, 1d
|
|
binedges including lower and upper bound
|
|
start : tuple or array_like ?
|
|
starting values, needs to have correct length
|
|
|
|
Returns
|
|
-------
|
|
paramest : ndarray
|
|
estimated parameters
|
|
|
|
Notes
|
|
-----
|
|
todo: add fixed parameter option
|
|
|
|
added factorial
|
|
|
|
'''
|
|
if fixed is not None:
|
|
raise NotImplementedError
|
|
nobs = np.sum(freq)
|
|
lnnobsfact = special.gammaln(nobs+1)
|
|
|
|
def nloglike(params):
|
|
'''negative loglikelihood function of binned data
|
|
|
|
corresponds to multinomial
|
|
'''
|
|
prob = np.diff(distfn.cdf(binedges, *params))
|
|
return -(lnnobsfact + np.sum(freq*np.log(prob)- special.gammaln(freq+1)))
|
|
return optimize.fmin(nloglike, start)
|
|
|
|
|
|
def fitbinnedgmm(distfn, freq, binedges, start, fixed=None, weightsoptimal=True):
|
|
'''estimate parameters of distribution function for binned data using GMM
|
|
|
|
Parameters
|
|
----------
|
|
distfn : distribution instance
|
|
needs to have cdf method, as in scipy.stats
|
|
freq : ndarray, 1d
|
|
frequency count, e.g. obtained by histogram
|
|
binedges : ndarray, 1d
|
|
binedges including lower and upper bound
|
|
start : tuple or array_like ?
|
|
starting values, needs to have correct length
|
|
fixed : None
|
|
not used yet
|
|
weightsoptimal : bool
|
|
If true, then the optimal weighting matrix for GMM is used. If false,
|
|
then the identity matrix is used
|
|
|
|
Returns
|
|
-------
|
|
paramest : ndarray
|
|
estimated parameters
|
|
|
|
Notes
|
|
-----
|
|
todo: add fixed parameter option
|
|
|
|
added factorial
|
|
|
|
'''
|
|
if fixed is not None:
|
|
raise NotImplementedError
|
|
nobs = np.sum(freq)
|
|
if weightsoptimal:
|
|
weights = freq/float(nobs)
|
|
else:
|
|
weights = np.ones(len(freq))
|
|
freqnormed = freq/float(nobs)
|
|
# skip turning weights into matrix diag(freq/float(nobs))
|
|
|
|
def gmmobjective(params):
|
|
'''negative loglikelihood function of binned data
|
|
|
|
corresponds to multinomial
|
|
'''
|
|
prob = np.diff(distfn.cdf(binedges, *params))
|
|
momcond = freqnormed - prob
|
|
return np.dot(momcond*weights, momcond)
|
|
return optimize.fmin(gmmobjective, start)
|
|
|
|
#Addition from try_maxproductspacings:
|
|
"""Estimating Parameters of Log-Normal Distribution with Maximum
|
|
Likelihood and Maximum Product-of-Spacings
|
|
|
|
MPS definiton from JKB page 233
|
|
|
|
Created on Tue May 11 13:52:50 2010
|
|
Author: josef-pktd
|
|
License: BSD
|
|
"""
|
|
|
|
def hess_ndt(fun, pars, args, options):
|
|
import numdifftools as ndt
|
|
if not ('stepMax' in options or 'stepFix' in options):
|
|
options['stepMax'] = 1e-5
|
|
f = lambda params: fun(params, *args)
|
|
h = ndt.Hessian(f, **options)
|
|
return h(pars), h
|
|
|
|
def logmps(params, xsorted, dist):
|
|
'''calculate negative log of Product-of-Spacings
|
|
|
|
Parameters
|
|
----------
|
|
params : array_like, tuple ?
|
|
parameters of the distribution funciton
|
|
xsorted : array_like
|
|
data that is already sorted
|
|
dist : instance of a distribution class
|
|
only cdf method is used
|
|
|
|
Returns
|
|
-------
|
|
mps : float
|
|
negative log of Product-of-Spacings
|
|
|
|
|
|
Notes
|
|
-----
|
|
MPS definiton from JKB page 233
|
|
'''
|
|
xcdf = np.r_[0., dist.cdf(xsorted, *params), 1.]
|
|
D = np.diff(xcdf)
|
|
return -np.log(D).mean()
|
|
|
|
def getstartparams(dist, data):
|
|
'''get starting values for estimation of distribution parameters
|
|
|
|
Parameters
|
|
----------
|
|
dist : distribution instance
|
|
the distribution instance needs to have either a method fitstart
|
|
or an attribute numargs
|
|
data : ndarray
|
|
data for which preliminary estimator or starting value for
|
|
parameter estimation is desired
|
|
|
|
Returns
|
|
-------
|
|
x0 : ndarray
|
|
preliminary estimate or starting value for the parameters of
|
|
the distribution given the data, including loc and scale
|
|
|
|
'''
|
|
if hasattr(dist, 'fitstart'):
|
|
#x0 = getattr(dist, 'fitstart')(data)
|
|
x0 = dist.fitstart(data)
|
|
else:
|
|
if np.isfinite(dist.a):
|
|
x0 = np.r_[[1.]*dist.numargs, (data.min()-1), 1.]
|
|
else:
|
|
x0 = np.r_[[1.]*dist.numargs, (data.mean()-1), 1.]
|
|
return x0
|
|
|
|
def fit_mps(dist, data, x0=None):
|
|
'''Estimate distribution parameters with Maximum Product-of-Spacings
|
|
|
|
Parameters
|
|
----------
|
|
params : array_like, tuple ?
|
|
parameters of the distribution funciton
|
|
xsorted : array_like
|
|
data that is already sorted
|
|
dist : instance of a distribution class
|
|
only cdf method is used
|
|
|
|
Returns
|
|
-------
|
|
x : ndarray
|
|
estimates for the parameters of the distribution given the data,
|
|
including loc and scale
|
|
|
|
|
|
'''
|
|
xsorted = np.sort(data)
|
|
if x0 is None:
|
|
x0 = getstartparams(dist, xsorted)
|
|
args = (xsorted, dist)
|
|
print(x0)
|
|
#print(args)
|
|
return optimize.fmin(logmps, x0, args=args)
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
#Example: gamma - distribution
|
|
#-----------------------------
|
|
|
|
print('\n\nExample: gamma Distribution')
|
|
print( '---------------------------')
|
|
|
|
alpha = 2
|
|
xq = [0.5, 4]
|
|
pq = [0.1, 0.9]
|
|
print(stats.gamma.ppf(pq, alpha))
|
|
xq = stats.gamma.ppf(pq, alpha)
|
|
print(np.diff(stats.gamma.ppf(pq, np.linspace(0.01,4,10)[:,None])*xq[::-1]))
|
|
#optimize.bisect(lambda alpha: np.diff((stats.gamma.ppf(pq, alpha)*xq[::-1])))
|
|
print(optimize.fsolve(lambda alpha: np.diff(stats.gamma.ppf(pq, alpha)*xq[::-1]), 3.))
|
|
|
|
distfn = stats.gamma
|
|
mcond = gammamomentcond(distfn, [5.,10], mom2=stats.gamma.stats(alpha, 0.,1.), quantile=None)
|
|
print(optimize.fsolve(mcond, [1.,2.]))
|
|
mom2 = stats.gamma.stats(alpha, 0.,1.)
|
|
print(optimize.fsolve(lambda params:gammamomentcond2(distfn, params, mom2), [1.,2.]))
|
|
|
|
grvs = stats.gamma.rvs(alpha, 0.,2., size=1000)
|
|
mom2 = np.array([grvs.mean(), grvs.var()])
|
|
alphaestq = optimize.fsolve(lambda params:gammamomentcond2(distfn, params, mom2), [1.,3.])
|
|
print(alphaestq)
|
|
print('scale = ', xq/stats.gamma.ppf(pq, alphaestq))
|
|
|
|
|
|
#Example beta - distribution
|
|
#---------------------------
|
|
|
|
#Warning: this example had cut-and-paste errors
|
|
|
|
print('\n\nExample: beta Distribution')
|
|
print( '--------------------------')
|
|
|
|
#monkey patching :
|
|
## if hasattr(stats.beta, '_fitstart'):
|
|
## del stats.beta._fitstart #bug in _fitstart #raises AttributeError: _fitstart
|
|
#stats.distributions.beta_gen._fitstart = lambda self, data : np.array([1,1,0,1])
|
|
#_fitstart seems to require a tuple
|
|
stats.distributions.beta_gen._fitstart = lambda self, data : (5,5,0,1)
|
|
|
|
pq = np.array([0.01, 0.05,0.1,0.4,0.6,0.9,0.95,0.99])
|
|
#rvsb = stats.beta.rvs(0.5,0.15,size=200)
|
|
rvsb = stats.beta.rvs(10,15,size=2000)
|
|
print('true params', 10, 15, 0, 1)
|
|
print(stats.beta.fit(rvsb))
|
|
xqsb = [stats.scoreatpercentile(rvsb, p) for p in pq*100]
|
|
mom2s = np.array([rvsb.mean(), rvsb.var()])
|
|
betaparest_gmmquantile = optimize.fmin(lambda params:np.sum(momentcondquant(stats.beta, params, mom2s,(pq,xqsb), shape=None)**2),
|
|
[10,10, 0., 1.], maxiter=2000)
|
|
print('betaparest_gmmquantile', betaparest_gmmquantile)
|
|
#result sensitive to initial condition
|
|
|
|
|
|
#Example t - distribution
|
|
#------------------------
|
|
|
|
print('\n\nExample: t Distribution')
|
|
print( '-----------------------')
|
|
|
|
nobs = 1000
|
|
distfn = stats.t
|
|
pq = np.array([0.1,0.9])
|
|
paramsdgp = (5, 0, 1)
|
|
trvs = distfn.rvs(5, 0, 1, size=nobs)
|
|
xqs = [stats.scoreatpercentile(trvs, p) for p in pq*100]
|
|
mom2th = distfn.stats(*paramsdgp)
|
|
mom2s = np.array([trvs.mean(), trvs.var()])
|
|
tparest_gmm3quantilefsolve = optimize.fsolve(lambda params:momentcondunbound(distfn,params, mom2s,(pq,xqs)), [10,1.,2.])
|
|
print('tparest_gmm3quantilefsolve', tparest_gmm3quantilefsolve)
|
|
tparest_gmm3quantile = optimize.fmin(lambda params:np.sum(momentcondunbound(distfn,params, mom2s,(pq,xqs))**2), [10,1.,2.])
|
|
print('tparest_gmm3quantile', tparest_gmm3quantile)
|
|
print(distfn.fit(trvs))
|
|
|
|
##
|
|
|
|
##distfn = stats.t
|
|
##pq = np.array([0.1,0.9])
|
|
##paramsdgp = (5, 0, 1)
|
|
##trvs = distfn.rvs(5, 0, 1, size=nobs)
|
|
##xqs = [stats.scoreatpercentile(trvs, p) for p in pq*100]
|
|
##mom2th = distfn.stats(*paramsdgp)
|
|
##mom2s = np.array([trvs.mean(), trvs.var()])
|
|
print(optimize.fsolve(lambda params:momentcondunboundls(distfn, params, mom2s,shape=5), [1.,2.]))
|
|
print(optimize.fmin(lambda params:np.sum(momentcondunboundls(distfn, params, mom2s,shape=5)**2), [1.,2.]))
|
|
print(distfn.fit(trvs))
|
|
#loc, scale, based on quantiles
|
|
print(optimize.fsolve(lambda params:momentcondunboundls(distfn, params, mom2s,(pq,xqs),shape=5), [1.,2.]))
|
|
|
|
##
|
|
|
|
pq = np.array([0.01, 0.05,0.1,0.4,0.6,0.9,0.95,0.99])
|
|
#paramsdgp = (5, 0, 1)
|
|
xqs = [stats.scoreatpercentile(trvs, p) for p in pq*100]
|
|
tparest_gmmquantile = optimize.fmin(lambda params:np.sum(momentcondquant(distfn, params, mom2s,(pq,xqs), shape=None)**2), [10, 1.,2.])
|
|
print('tparest_gmmquantile', tparest_gmmquantile)
|
|
tparest_gmmquantile2 = fitquantilesgmm(distfn, trvs, start=[10, 1.,2.], pquant=None, frozen=None)
|
|
print('tparest_gmmquantile2', tparest_gmmquantile2)
|
|
|
|
|
|
##
|
|
|
|
|
|
#use trvs from before
|
|
bt = stats.t.ppf(np.linspace(0,1,21),5)
|
|
ft,bt = np.histogram(trvs,bins=bt)
|
|
print('fitbinned t-distribution')
|
|
tparest_mlebinew = fitbinned(stats.t, ft, bt, [10, 0, 1])
|
|
tparest_gmmbinewidentity = fitbinnedgmm(stats.t, ft, bt, [10, 0, 1])
|
|
tparest_gmmbinewoptimal = fitbinnedgmm(stats.t, ft, bt, [10, 0, 1], weightsoptimal=False)
|
|
print(paramsdgp)
|
|
|
|
#Note: this can be used for chisquare test and then has correct asymptotic
|
|
# distribution for a distribution with estimated parameters, find ref again
|
|
#TODO combine into test with binning included, check rule for number of bins
|
|
|
|
#bt2 = stats.t.ppf(np.linspace(trvs.,1,21),5)
|
|
ft2,bt2 = np.histogram(trvs,bins=50)
|
|
'fitbinned t-distribution'
|
|
tparest_mlebinel = fitbinned(stats.t, ft2, bt2, [10, 0, 1])
|
|
tparest_gmmbinelidentity = fitbinnedgmm(stats.t, ft2, bt2, [10, 0, 1])
|
|
tparest_gmmbineloptimal = fitbinnedgmm(stats.t, ft2, bt2, [10, 0, 1], weightsoptimal=False)
|
|
tparest_mle = stats.t.fit(trvs)
|
|
|
|
np.set_printoptions(precision=6)
|
|
print('sample size', nobs)
|
|
print('true (df, loc, scale) ', paramsdgp)
|
|
print('parest_mle ', tparest_mle)
|
|
print
|
|
print('tparest_mlebinel ', tparest_mlebinel)
|
|
print('tparest_gmmbinelidentity ', tparest_gmmbinelidentity)
|
|
print('tparest_gmmbineloptimal ', tparest_gmmbineloptimal)
|
|
print
|
|
print('tparest_mlebinew ', tparest_mlebinew)
|
|
print('tparest_gmmbinewidentity ', tparest_gmmbinewidentity)
|
|
print('tparest_gmmbinewoptimal ', tparest_gmmbinewoptimal)
|
|
print
|
|
print('tparest_gmmquantileidentity', tparest_gmmquantile)
|
|
print('tparest_gmm3quantilefsolve ', tparest_gmm3quantilefsolve)
|
|
print('tparest_gmm3quantile ', tparest_gmm3quantile)
|
|
|
|
''' example results:
|
|
standard error for df estimate looks large
|
|
note: iI do not impose that df is an integer, (b/c not necessary)
|
|
need Monte Carlo to check variance of estimators
|
|
|
|
|
|
sample size 1000
|
|
true (df, loc, scale) (5, 0, 1)
|
|
parest_mle [ 4.571405 -0.021493 1.028584]
|
|
|
|
tparest_mlebinel [ 4.534069 -0.022605 1.02962 ]
|
|
tparest_gmmbinelidentity [ 2.653056 0.012807 0.896958]
|
|
tparest_gmmbineloptimal [ 2.437261 -0.020491 0.923308]
|
|
|
|
tparest_mlebinew [ 2.999124 -0.0199 0.948811]
|
|
tparest_gmmbinewidentity [ 2.900939 -0.020159 0.93481 ]
|
|
tparest_gmmbinewoptimal [ 2.977764 -0.024925 0.946487]
|
|
|
|
tparest_gmmquantileidentity [ 3.940797 -0.046469 1.002001]
|
|
tparest_gmm3quantilefsolve [ 10. 1. 2.]
|
|
tparest_gmm3quantile [ 6.376101 -0.029322 1.112403]
|
|
'''
|
|
|
|
#Example with Maximum Product of Spacings Estimation
|
|
#===================================================
|
|
|
|
#Example: Lognormal Distribution
|
|
#-------------------------------
|
|
|
|
#tough problem for MLE according to JKB
|
|
#but not sure for which parameters
|
|
|
|
print('\n\nExample: Lognormal Distribution')
|
|
print( '-------------------------------')
|
|
|
|
sh = np.exp(10)
|
|
sh = 0.01
|
|
print(sh)
|
|
x = stats.lognorm.rvs(sh,loc=100, scale=10,size=200)
|
|
|
|
print(x.min())
|
|
print(stats.lognorm.fit(x, 1.,loc=x.min()-1,scale=1))
|
|
|
|
xsorted = np.sort(x)
|
|
|
|
x0 = [1., x.min()-1, 1]
|
|
args = (xsorted, stats.lognorm)
|
|
print(optimize.fmin(logmps,x0,args=args))
|
|
|
|
|
|
#Example: Lomax, Pareto, Generalized Pareto Distributions
|
|
#--------------------------------------------------------
|
|
|
|
#partially a follow-up to the discussion about numpy.random.pareto
|
|
#Reference: JKB
|
|
#example Maximum Product of Spacings Estimation
|
|
|
|
# current results:
|
|
# does not look very good yet sensitivity to starting values
|
|
# Pareto and Generalized Pareto look like a tough estimation problemprint('\n\nExample: Lognormal Distribution'
|
|
|
|
print('\n\nExample: Lomax, Pareto, Generalized Pareto Distributions')
|
|
print( '--------------------------------------------------------')
|
|
|
|
p2rvs = stats.genpareto.rvs(2, size=500)
|
|
#Note: is Lomax without +1; and classical Pareto with +1
|
|
p2rvssorted = np.sort(p2rvs)
|
|
argsp = (p2rvssorted, stats.pareto)
|
|
x0p = [1., p2rvs.min()-5, 1]
|
|
print(optimize.fmin(logmps,x0p,args=argsp))
|
|
print(stats.pareto.fit(p2rvs, 0.5, loc=-20, scale=0.5))
|
|
print('gpdparest_ mle', stats.genpareto.fit(p2rvs))
|
|
parsgpd = fit_mps(stats.genpareto, p2rvs)
|
|
print('gpdparest_ mps', parsgpd)
|
|
argsgpd = (p2rvssorted, stats.genpareto)
|
|
options = dict(stepFix=1e-7)
|
|
#hess_ndt(fun, pars, argsgdp, options)
|
|
#the results for the following look strange, maybe refactoring error
|
|
he, h = hess_ndt(logmps, parsgpd, argsgpd, options)
|
|
print(np.linalg.eigh(he)[0])
|
|
f = lambda params: logmps(params, *argsgpd)
|
|
print(f(parsgpd))
|
|
#add binned
|
|
fp2, bp2 = np.histogram(p2rvs, bins=50)
|
|
'fitbinned t-distribution'
|
|
gpdparest_mlebinel = fitbinned(stats.genpareto, fp2, bp2, x0p)
|
|
gpdparest_gmmbinelidentity = fitbinnedgmm(stats.genpareto, fp2, bp2, x0p)
|
|
print('gpdparest_mlebinel', gpdparest_mlebinel)
|
|
print('gpdparest_gmmbinelidentity', gpdparest_gmmbinelidentity)
|
|
gpdparest_gmmquantile2 = fitquantilesgmm(
|
|
stats.genpareto, p2rvs, start=x0p, pquant=None, frozen=None)
|
|
print('gpdparest_gmmquantile2', gpdparest_gmmquantile2)
|
|
|
|
print(fitquantilesgmm(stats.genpareto, p2rvs, start=x0p,
|
|
pquant=np.linspace(0.01,0.99,10), frozen=None))
|
|
fp2, bp2 = np.histogram(
|
|
p2rvs,
|
|
bins=stats.genpareto(2).ppf(np.linspace(0,0.99,10)))
|
|
print('fitbinnedgmm equal weight bins')
|
|
print(fitbinnedgmm(stats.genpareto, fp2, bp2, x0p))
|