1038 lines
35 KiB
Python
1038 lines
35 KiB
Python
|
import warnings
|
||
|
|
||
|
import numpy as np
|
||
|
from numpy.linalg import eigh, inv, norm, matrix_rank
|
||
|
import pandas as pd
|
||
|
from scipy.optimize import minimize
|
||
|
|
||
|
from statsmodels.tools.decorators import cache_readonly
|
||
|
from statsmodels.base.model import Model
|
||
|
from statsmodels.iolib import summary2
|
||
|
from statsmodels.graphics.utils import _import_mpl
|
||
|
|
||
|
from .factor_rotation import rotate_factors, promax
|
||
|
|
||
|
|
||
|
_opt_defaults = {'gtol': 1e-7}
|
||
|
|
||
|
|
||
|
def _check_args_1(endog, n_factor, corr, nobs):
|
||
|
|
||
|
msg = "Either endog or corr must be provided."
|
||
|
if endog is not None and corr is not None:
|
||
|
raise ValueError(msg)
|
||
|
if endog is None and corr is None:
|
||
|
warnings.warn('Both endog and corr are provided, ' +
|
||
|
'corr will be used for factor analysis.')
|
||
|
|
||
|
if n_factor <= 0:
|
||
|
raise ValueError('n_factor must be larger than 0! %d < 0' %
|
||
|
(n_factor))
|
||
|
|
||
|
if nobs is not None and endog is not None:
|
||
|
warnings.warn("nobs is ignored when endog is provided")
|
||
|
|
||
|
|
||
|
def _check_args_2(endog, n_factor, corr, nobs, k_endog):
|
||
|
|
||
|
if n_factor > k_endog:
|
||
|
raise ValueError('n_factor cannot be greater than the number'
|
||
|
' of variables! %d > %d' %
|
||
|
(n_factor, k_endog))
|
||
|
|
||
|
if np.max(np.abs(np.diag(corr) - 1)) > 1e-10:
|
||
|
raise ValueError("corr must be a correlation matrix")
|
||
|
|
||
|
if corr.shape[0] != corr.shape[1]:
|
||
|
raise ValueError('Correlation matrix corr must be a square '
|
||
|
'(rows %d != cols %d)' % corr.shape)
|
||
|
|
||
|
|
||
|
class Factor(Model):
|
||
|
"""
|
||
|
Factor analysis
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : array_like
|
||
|
Variables in columns, observations in rows. May be `None` if
|
||
|
`corr` is not `None`.
|
||
|
n_factor : int
|
||
|
The number of factors to extract
|
||
|
corr : array_like
|
||
|
Directly specify the correlation matrix instead of estimating
|
||
|
it from `endog`. If provided, `endog` is not used for the
|
||
|
factor analysis, it may be used in post-estimation.
|
||
|
method : str
|
||
|
The method to extract factors, currently must be either 'pa'
|
||
|
for principal axis factor analysis or 'ml' for maximum
|
||
|
likelihood estimation.
|
||
|
smc : True or False
|
||
|
Whether or not to apply squared multiple correlations (method='pa')
|
||
|
endog_names : str
|
||
|
Names of endogenous variables. If specified, it will be used
|
||
|
instead of the column names in endog
|
||
|
nobs : int
|
||
|
The number of observations, not used if endog is present. Needs to
|
||
|
be provided for inference if endog is None.
|
||
|
missing : 'none', 'drop', or 'raise'
|
||
|
Missing value handling for endog, default is row-wise deletion 'drop'
|
||
|
If 'none', no nan checking is done. If 'drop', any observations with
|
||
|
nans are dropped. If 'raise', an error is raised.
|
||
|
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
**Experimental**
|
||
|
|
||
|
Supported rotations: 'varimax', 'quartimax', 'biquartimax',
|
||
|
'equamax', 'oblimin', 'parsimax', 'parsimony', 'biquartimin',
|
||
|
'promax'
|
||
|
|
||
|
If method='ml', the factors are rotated to satisfy condition IC3
|
||
|
of Bai and Li (2012). This means that the scores have covariance
|
||
|
I, so the model for the covariance matrix is L * L' + diag(U),
|
||
|
where L are the loadings and U are the uniquenesses. In addition,
|
||
|
L' * diag(U)^{-1} L must be diagonal.
|
||
|
|
||
|
References
|
||
|
----------
|
||
|
.. [*] Hofacker, C. (2004). Exploratory Factor Analysis, Mathematical
|
||
|
Marketing. http://www.openaccesstexts.org/pdf/Quant_Chapter_11_efa.pdf
|
||
|
.. [*] J Bai, K Li (2012). Statistical analysis of factor models of high
|
||
|
dimension. Annals of Statistics. https://arxiv.org/pdf/1205.6617.pdf
|
||
|
"""
|
||
|
def __init__(self, endog=None, n_factor=1, corr=None, method='pa',
|
||
|
smc=True, endog_names=None, nobs=None, missing='drop'):
|
||
|
|
||
|
_check_args_1(endog, n_factor, corr, nobs)
|
||
|
|
||
|
if endog is not None:
|
||
|
super().__init__(endog, exog=None, missing=missing)
|
||
|
endog = self.endog # after preprocessing like missing, asarray
|
||
|
k_endog = endog.shape[1]
|
||
|
nobs = endog.shape[0]
|
||
|
corr = self.corr = np.corrcoef(endog, rowvar=0)
|
||
|
elif corr is not None:
|
||
|
corr = self.corr = np.asarray(corr)
|
||
|
k_endog = self.corr.shape[0]
|
||
|
self.endog = None
|
||
|
else:
|
||
|
msg = "Either endog or corr must be provided."
|
||
|
raise ValueError(msg)
|
||
|
|
||
|
_check_args_2(endog, n_factor, corr, nobs, k_endog)
|
||
|
|
||
|
self.n_factor = n_factor
|
||
|
self.loadings = None
|
||
|
self.communality = None
|
||
|
self.method = method
|
||
|
self.smc = smc
|
||
|
self.nobs = nobs
|
||
|
self.method = method
|
||
|
self.corr = corr
|
||
|
self.k_endog = k_endog
|
||
|
|
||
|
if endog_names is None:
|
||
|
if hasattr(corr, 'index'):
|
||
|
endog_names = corr.index
|
||
|
if hasattr(corr, 'columns'):
|
||
|
endog_names = corr.columns
|
||
|
self.endog_names = endog_names
|
||
|
|
||
|
@property
|
||
|
def endog_names(self):
|
||
|
"""Names of endogenous variables"""
|
||
|
if self._endog_names is not None:
|
||
|
return self._endog_names
|
||
|
else:
|
||
|
if self.endog is not None:
|
||
|
return self.data.ynames
|
||
|
else:
|
||
|
d = 0
|
||
|
n = self.corr.shape[0] - 1
|
||
|
while n > 0:
|
||
|
d += 1
|
||
|
n //= 10
|
||
|
return [('var%0' + str(d) + 'd') % i
|
||
|
for i in range(self.corr.shape[0])]
|
||
|
|
||
|
@endog_names.setter
|
||
|
def endog_names(self, value):
|
||
|
# Check validity of endog_names:
|
||
|
if value is not None:
|
||
|
if len(value) != self.corr.shape[0]:
|
||
|
raise ValueError('The length of `endog_names` must '
|
||
|
'equal the number of variables.')
|
||
|
self._endog_names = np.asarray(value)
|
||
|
else:
|
||
|
self._endog_names = None
|
||
|
|
||
|
def fit(self, maxiter=50, tol=1e-8, start=None, opt_method='BFGS',
|
||
|
opt=None, em_iter=3):
|
||
|
"""
|
||
|
Estimate factor model parameters.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
maxiter : int
|
||
|
Maximum number of iterations for iterative estimation algorithms
|
||
|
tol : float
|
||
|
Stopping criteria (error tolerance) for iterative estimation
|
||
|
algorithms
|
||
|
start : array_like
|
||
|
Starting values, currently only used for ML estimation
|
||
|
opt_method : str
|
||
|
Optimization method for ML estimation
|
||
|
opt : dict-like
|
||
|
Keyword arguments passed to optimizer, only used for ML estimation
|
||
|
em_iter : int
|
||
|
The number of EM iterations before starting gradient optimization,
|
||
|
only used for ML estimation.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
FactorResults
|
||
|
Results class instance.
|
||
|
"""
|
||
|
method = self.method.lower()
|
||
|
if method == 'pa':
|
||
|
return self._fit_pa(maxiter=maxiter, tol=tol)
|
||
|
elif method == 'ml':
|
||
|
return self._fit_ml(start, em_iter, opt_method, opt)
|
||
|
else:
|
||
|
msg = "Unknown factor extraction approach '%s'" % self.method
|
||
|
raise ValueError(msg)
|
||
|
|
||
|
def _fit_pa(self, maxiter=50, tol=1e-8):
|
||
|
"""
|
||
|
Extract factors using the iterative principal axis method
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
maxiter : int
|
||
|
Maximum number of iterations for communality estimation
|
||
|
tol : float
|
||
|
If `norm(communality - last_communality) < tolerance`,
|
||
|
estimation stops
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
results : FactorResults instance
|
||
|
"""
|
||
|
|
||
|
R = self.corr.copy() # inplace modification below
|
||
|
|
||
|
# Parameter validation
|
||
|
self.n_comp = matrix_rank(R)
|
||
|
if self.n_factor > self.n_comp:
|
||
|
raise ValueError('n_factor must be smaller or equal to the rank'
|
||
|
' of endog! %d > %d' %
|
||
|
(self.n_factor, self.n_comp))
|
||
|
if maxiter <= 0:
|
||
|
raise ValueError('n_max_iter must be larger than 0! %d < 0' %
|
||
|
(maxiter))
|
||
|
if tol <= 0 or tol > 0.01:
|
||
|
raise ValueError('tolerance must be larger than 0 and smaller than'
|
||
|
' 0.01! Got %f instead' % (tol))
|
||
|
|
||
|
# Initial communality estimation
|
||
|
if self.smc:
|
||
|
c = 1 - 1 / np.diag(inv(R))
|
||
|
else:
|
||
|
c = np.ones(len(R))
|
||
|
|
||
|
# Iterative communality estimation
|
||
|
eigenvals = None
|
||
|
for i in range(maxiter):
|
||
|
# Get eigenvalues/eigenvectors of R with diag replaced by
|
||
|
# communality
|
||
|
for j in range(len(R)):
|
||
|
R[j, j] = c[j]
|
||
|
L, V = eigh(R, UPLO='U')
|
||
|
c_last = np.array(c)
|
||
|
ind = np.argsort(L)
|
||
|
ind = ind[::-1]
|
||
|
L = L[ind]
|
||
|
n_pos = (L > 0).sum()
|
||
|
V = V[:, ind]
|
||
|
eigenvals = np.array(L)
|
||
|
|
||
|
# Select eigenvectors with positive eigenvalues
|
||
|
n = np.min([n_pos, self.n_factor])
|
||
|
sL = np.diag(np.sqrt(L[:n]))
|
||
|
V = V[:, :n]
|
||
|
|
||
|
# Calculate new loadings and communality
|
||
|
A = V.dot(sL)
|
||
|
c = np.power(A, 2).sum(axis=1)
|
||
|
if norm(c_last - c) < tol:
|
||
|
break
|
||
|
|
||
|
self.eigenvals = eigenvals
|
||
|
self.communality = c
|
||
|
self.uniqueness = 1 - c
|
||
|
self.loadings = A
|
||
|
return FactorResults(self)
|
||
|
|
||
|
# Unpacks the model parameters from a flat vector, used for ML
|
||
|
# estimation. The first k_endog elements of par are the square
|
||
|
# roots of the uniquenesses. The remaining elements are the
|
||
|
# factor loadings, packed one factor at a time.
|
||
|
def _unpack(self, par):
|
||
|
return (par[0:self.k_endog]**2,
|
||
|
np.reshape(par[self.k_endog:], (-1, self.k_endog)).T)
|
||
|
|
||
|
# Packs the model parameters into a flat parameter, used for ML
|
||
|
# estimation.
|
||
|
def _pack(self, load, uniq):
|
||
|
return np.concatenate((np.sqrt(uniq), load.T.flat))
|
||
|
|
||
|
def loglike(self, par):
|
||
|
"""
|
||
|
Evaluate the log-likelihood function.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
par : ndarray or tuple of 2 ndarray's
|
||
|
The model parameters, either a packed representation of
|
||
|
the model parameters or a 2-tuple containing a `k_endog x
|
||
|
n_factor` matrix of factor loadings and a `k_endog` vector
|
||
|
of uniquenesses.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
float
|
||
|
The value of the log-likelihood evaluated at par.
|
||
|
"""
|
||
|
|
||
|
if type(par) is np.ndarray:
|
||
|
uniq, load = self._unpack(par)
|
||
|
else:
|
||
|
load, uniq = par[0], par[1]
|
||
|
|
||
|
loadu = load / uniq[:, None]
|
||
|
lul = np.dot(load.T, loadu)
|
||
|
|
||
|
# log|GG' + S|
|
||
|
# Using matrix determinant lemma:
|
||
|
# |GG' + S| = |I + G'S^{-1}G|*|S|
|
||
|
lul.flat[::lul.shape[0]+1] += 1
|
||
|
_, ld = np.linalg.slogdet(lul)
|
||
|
v = np.sum(np.log(uniq)) + ld
|
||
|
|
||
|
# tr((GG' + S)^{-1}C)
|
||
|
# Using Sherman-Morrison-Woodbury
|
||
|
w = np.sum(1 / uniq)
|
||
|
b = np.dot(load.T, self.corr / uniq[:, None])
|
||
|
b = np.linalg.solve(lul, b)
|
||
|
b = np.dot(loadu, b)
|
||
|
w -= np.trace(b)
|
||
|
|
||
|
# Scaled log-likelihood
|
||
|
return -(v + w) / (2*self.k_endog)
|
||
|
|
||
|
def score(self, par):
|
||
|
"""
|
||
|
Evaluate the score function (first derivative of loglike).
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
par : ndarray or tuple of 2 ndarray's
|
||
|
The model parameters, either a packed representation of
|
||
|
the model parameters or a 2-tuple containing a `k_endog x
|
||
|
n_factor` matrix of factor loadings and a `k_endog` vector
|
||
|
of uniquenesses.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
ndarray
|
||
|
The score function evaluated at par.
|
||
|
"""
|
||
|
|
||
|
if type(par) is np.ndarray:
|
||
|
uniq, load = self._unpack(par)
|
||
|
else:
|
||
|
load, uniq = par[0], par[1]
|
||
|
|
||
|
# Center term of SMW
|
||
|
loadu = load / uniq[:, None]
|
||
|
c = np.dot(load.T, loadu)
|
||
|
c.flat[::c.shape[0]+1] += 1
|
||
|
d = np.linalg.solve(c, load.T)
|
||
|
|
||
|
# Precompute these terms
|
||
|
lud = np.dot(loadu, d)
|
||
|
cu = (self.corr / uniq) / uniq[:, None]
|
||
|
r = np.dot(cu, load)
|
||
|
lul = np.dot(lud.T, load)
|
||
|
luz = np.dot(cu, lul)
|
||
|
|
||
|
# First term
|
||
|
du = 2*np.sqrt(uniq) * (1/uniq - (d * load.T).sum(0) / uniq**2)
|
||
|
dl = 2*(loadu - np.dot(lud, loadu))
|
||
|
|
||
|
# Second term
|
||
|
h = np.dot(lud, cu)
|
||
|
f = np.dot(h, lud.T)
|
||
|
du -= 2*np.sqrt(uniq) * (np.diag(cu) - 2*np.diag(h) + np.diag(f))
|
||
|
dl -= 2*r
|
||
|
dl += 2*np.dot(lud, r)
|
||
|
dl += 2*luz
|
||
|
dl -= 2*np.dot(lud, luz)
|
||
|
|
||
|
# Cannot use _pack because we are working with the square root
|
||
|
# uniquenesses directly.
|
||
|
return -np.concatenate((du, dl.T.flat)) / (2*self.k_endog)
|
||
|
|
||
|
# Maximum likelihood factor analysis.
|
||
|
def _fit_ml(self, start, em_iter, opt_method, opt):
|
||
|
"""estimate Factor model using Maximum Likelihood
|
||
|
"""
|
||
|
|
||
|
# Starting values
|
||
|
if start is None:
|
||
|
load, uniq = self._fit_ml_em(em_iter)
|
||
|
start = self._pack(load, uniq)
|
||
|
elif len(start) == 2:
|
||
|
if len(start[1]) != start[0].shape[0]:
|
||
|
msg = "Starting values have incompatible dimensions"
|
||
|
raise ValueError(msg)
|
||
|
start = self._pack(start[0], start[1])
|
||
|
else:
|
||
|
raise ValueError("Invalid starting values")
|
||
|
|
||
|
def nloglike(par):
|
||
|
return -self.loglike(par)
|
||
|
|
||
|
def nscore(par):
|
||
|
return -self.score(par)
|
||
|
|
||
|
# Do the optimization
|
||
|
if opt is None:
|
||
|
opt = _opt_defaults
|
||
|
r = minimize(nloglike, start, jac=nscore, method=opt_method,
|
||
|
options=opt)
|
||
|
if not r.success:
|
||
|
warnings.warn("Fitting did not converge")
|
||
|
par = r.x
|
||
|
uniq, load = self._unpack(par)
|
||
|
|
||
|
if uniq.min() < 1e-10:
|
||
|
warnings.warn("Some uniquenesses are nearly zero")
|
||
|
|
||
|
# Rotate solution to satisfy IC3 of Bai and Li
|
||
|
load = self._rotate(load, uniq)
|
||
|
|
||
|
self.uniqueness = uniq
|
||
|
self.communality = 1 - uniq
|
||
|
self.loadings = load
|
||
|
self.mle_retvals = r
|
||
|
|
||
|
return FactorResults(self)
|
||
|
|
||
|
def _fit_ml_em(self, iter, random_state=None):
|
||
|
"""estimate Factor model using EM algorithm
|
||
|
"""
|
||
|
# Starting values
|
||
|
if random_state is None:
|
||
|
random_state = np.random.RandomState(3427)
|
||
|
load = 0.1 * random_state.standard_normal(size=(self.k_endog, self.n_factor))
|
||
|
uniq = 0.5 * np.ones(self.k_endog)
|
||
|
|
||
|
for k in range(iter):
|
||
|
|
||
|
loadu = load / uniq[:, None]
|
||
|
|
||
|
f = np.dot(load.T, loadu)
|
||
|
f.flat[::f.shape[0]+1] += 1
|
||
|
|
||
|
r = np.linalg.solve(f, loadu.T)
|
||
|
q = np.dot(loadu.T, load)
|
||
|
h = np.dot(r, load)
|
||
|
|
||
|
c = load - np.dot(load, h)
|
||
|
c /= uniq[:, None]
|
||
|
|
||
|
g = np.dot(q, r)
|
||
|
e = np.dot(g, self.corr)
|
||
|
d = np.dot(loadu.T, self.corr) - e
|
||
|
|
||
|
a = np.dot(d, c)
|
||
|
a -= np.dot(load.T, c)
|
||
|
a.flat[::a.shape[0]+1] += 1
|
||
|
|
||
|
b = np.dot(self.corr, c)
|
||
|
|
||
|
load = np.linalg.solve(a, b.T).T
|
||
|
uniq = np.diag(self.corr) - (load * d.T).sum(1)
|
||
|
|
||
|
return load, uniq
|
||
|
|
||
|
def _rotate(self, load, uniq):
|
||
|
"""rotate loadings for MLE
|
||
|
"""
|
||
|
# Rotations used in ML estimation.
|
||
|
load, s, _ = np.linalg.svd(load, 0)
|
||
|
load *= s
|
||
|
|
||
|
if self.nobs is None:
|
||
|
nobs = 1
|
||
|
else:
|
||
|
nobs = self.nobs
|
||
|
|
||
|
cm = np.dot(load.T, load / uniq[:, None]) / nobs
|
||
|
_, f = np.linalg.eig(cm)
|
||
|
load = np.dot(load, f)
|
||
|
return load
|
||
|
|
||
|
|
||
|
class FactorResults:
|
||
|
"""
|
||
|
Factor results class
|
||
|
|
||
|
For result summary, scree/loading plots and factor rotations
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
factor : Factor
|
||
|
Fitted Factor class
|
||
|
|
||
|
Attributes
|
||
|
----------
|
||
|
uniqueness : ndarray
|
||
|
The uniqueness (variance of uncorrelated errors unique to
|
||
|
each variable)
|
||
|
communality : ndarray
|
||
|
1 - uniqueness
|
||
|
loadings : ndarray
|
||
|
Each column is the loading vector for one factor
|
||
|
loadings_no_rot : ndarray
|
||
|
Unrotated loadings, not available under maximum likelihood
|
||
|
analysis.
|
||
|
eigenvals : ndarray
|
||
|
The eigenvalues for a factor analysis obtained using
|
||
|
principal components; not available under ML estimation.
|
||
|
n_comp : int
|
||
|
Number of components (factors)
|
||
|
nbs : int
|
||
|
Number of observations
|
||
|
fa_method : str
|
||
|
The method used to obtain the decomposition, either 'pa' for
|
||
|
'principal axes' or 'ml' for maximum likelihood.
|
||
|
df : int
|
||
|
Degrees of freedom of the factor model.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
Under ML estimation, the default rotation (used for `loadings`) is
|
||
|
condition IC3 of Bai and Li (2012). Under this rotation, the
|
||
|
factor scores are iid and standardized. If `G` is the canonical
|
||
|
loadings and `U` is the vector of uniquenesses, then the
|
||
|
covariance matrix implied by the factor analysis is `GG' +
|
||
|
diag(U)`.
|
||
|
|
||
|
Status: experimental, Some refactoring will be necessary when new
|
||
|
features are added.
|
||
|
"""
|
||
|
def __init__(self, factor):
|
||
|
self.model = factor
|
||
|
self.endog_names = factor.endog_names
|
||
|
self.loadings_no_rot = factor.loadings
|
||
|
if hasattr(factor, "eigenvals"):
|
||
|
self.eigenvals = factor.eigenvals
|
||
|
|
||
|
self.communality = factor.communality
|
||
|
self.uniqueness = factor.uniqueness
|
||
|
self.rotation_method = None
|
||
|
self.fa_method = factor.method
|
||
|
self.n_comp = factor.loadings.shape[1]
|
||
|
self.nobs = factor.nobs
|
||
|
self._factor = factor
|
||
|
if hasattr(factor, "mle_retvals"):
|
||
|
self.mle_retvals = factor.mle_retvals
|
||
|
|
||
|
p, k = self.loadings_no_rot.shape
|
||
|
self.df = ((p - k)**2 - (p + k)) // 2
|
||
|
|
||
|
# no rotation, overwritten in `rotate`
|
||
|
self.loadings = factor.loadings
|
||
|
self.rotation_matrix = np.eye(self.n_comp)
|
||
|
|
||
|
|
||
|
def __str__(self):
|
||
|
return self.summary().__str__()
|
||
|
|
||
|
def rotate(self, method):
|
||
|
"""
|
||
|
Apply rotation, inplace modification of this Results instance
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
method : str
|
||
|
Rotation to be applied. Allowed methods are varimax,
|
||
|
quartimax, biquartimax, equamax, oblimin, parsimax,
|
||
|
parsimony, biquartimin, promax.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
None : nothing returned, modifications are inplace
|
||
|
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
Warning: 'varimax', 'quartimax' and 'oblimin' are verified against R or
|
||
|
Stata. Some rotation methods such as promax do not produce the same
|
||
|
results as the R or Stata default functions.
|
||
|
|
||
|
See Also
|
||
|
--------
|
||
|
factor_rotation : subpackage that implements rotation methods
|
||
|
"""
|
||
|
self.rotation_method = method
|
||
|
if method not in ['varimax', 'quartimax', 'biquartimax',
|
||
|
'equamax', 'oblimin', 'parsimax', 'parsimony',
|
||
|
'biquartimin', 'promax']:
|
||
|
raise ValueError('Unknown rotation method %s' % (method))
|
||
|
|
||
|
if method in ['varimax', 'quartimax', 'biquartimax', 'equamax',
|
||
|
'parsimax', 'parsimony', 'biquartimin']:
|
||
|
self.loadings, T = rotate_factors(self.loadings_no_rot, method)
|
||
|
elif method == 'oblimin':
|
||
|
self.loadings, T = rotate_factors(self.loadings_no_rot,
|
||
|
'quartimin')
|
||
|
elif method == 'promax':
|
||
|
self.loadings, T = promax(self.loadings_no_rot)
|
||
|
else:
|
||
|
raise ValueError('rotation method not recognized')
|
||
|
|
||
|
self.rotation_matrix = T
|
||
|
|
||
|
def _corr_factors(self):
|
||
|
"""correlation of factors implied by rotation
|
||
|
|
||
|
If the rotation is oblique, then the factors are correlated.
|
||
|
|
||
|
currently not cached
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
corr_f : ndarray
|
||
|
correlation matrix of rotated factors, assuming initial factors are
|
||
|
orthogonal
|
||
|
"""
|
||
|
T = self.rotation_matrix
|
||
|
corr_f = T.T.dot(T)
|
||
|
return corr_f
|
||
|
|
||
|
def factor_score_params(self, method='bartlett'):
|
||
|
"""
|
||
|
Compute factor scoring coefficient matrix
|
||
|
|
||
|
The coefficient matrix is not cached.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
method : 'bartlett' or 'regression'
|
||
|
Method to use for factor scoring.
|
||
|
'regression' can be abbreviated to `reg`
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
coeff_matrix : ndarray
|
||
|
matrix s to compute factors f from a standardized endog ys.
|
||
|
``f = ys dot s``
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
The `regression` method follows the Stata definition.
|
||
|
Method bartlett and regression are verified against Stats.
|
||
|
Two unofficial methods, 'ols' and 'gls', produce similar factor scores
|
||
|
but are not verified.
|
||
|
|
||
|
See Also
|
||
|
--------
|
||
|
statsmodels.multivariate.factor.FactorResults.factor_scoring
|
||
|
"""
|
||
|
L = self.loadings
|
||
|
T = self.rotation_matrix.T
|
||
|
#TODO: check row versus column convention for T
|
||
|
uni = 1 - self.communality #self.uniqueness
|
||
|
|
||
|
if method == 'bartlett':
|
||
|
s_mat = np.linalg.inv(L.T.dot(L/(uni[:,None]))).dot(L.T / uni).T
|
||
|
elif method.startswith('reg'):
|
||
|
corr = self.model.corr
|
||
|
corr_f = self._corr_factors()
|
||
|
# if orthogonal then corr_f is just eye
|
||
|
s_mat = corr_f.dot(L.T.dot(np.linalg.inv(corr))).T
|
||
|
elif method == 'ols':
|
||
|
# not verified
|
||
|
corr = self.model.corr
|
||
|
corr_f = self._corr_factors()
|
||
|
s_mat = corr_f.dot(np.linalg.pinv(L)).T
|
||
|
elif method == 'gls':
|
||
|
# not verified
|
||
|
#s_mat = np.linalg.inv(1*np.eye(L.shape[1]) + L.T.dot(L/(uni[:,None])))
|
||
|
corr = self.model.corr
|
||
|
corr_f = self._corr_factors()
|
||
|
s_mat = np.linalg.inv(np.linalg.inv(corr_f) + L.T.dot(L/(uni[:,None])))
|
||
|
s_mat = s_mat.dot(L.T / uni).T
|
||
|
else:
|
||
|
raise ValueError('method not available, use "bartlett ' +
|
||
|
'or "regression"')
|
||
|
return s_mat
|
||
|
|
||
|
def factor_scoring(self, endog=None, method='bartlett', transform=True):
|
||
|
"""
|
||
|
factor scoring: compute factors for endog
|
||
|
|
||
|
If endog was not provided when creating the factor class, then
|
||
|
a standarized endog needs to be provided here.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
method : 'bartlett' or 'regression'
|
||
|
Method to use for factor scoring.
|
||
|
'regression' can be abbreviated to `reg`
|
||
|
transform : bool
|
||
|
If transform is true and endog is provided, then it will be
|
||
|
standardized using mean and scale of original data, which has to
|
||
|
be available in this case.
|
||
|
If transform is False, then a provided endog will be used unchanged.
|
||
|
The original endog in the Factor class will
|
||
|
always be standardized if endog is None, independently of `transform`.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
factor_score : ndarray
|
||
|
estimated factors using scoring matrix s and standarized endog ys
|
||
|
``f = ys dot s``
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
Status: transform option is experimental and might change.
|
||
|
|
||
|
See Also
|
||
|
--------
|
||
|
statsmodels.multivariate.factor.FactorResults.factor_score_params
|
||
|
"""
|
||
|
|
||
|
if transform is False and endog is not None:
|
||
|
# no transformation in this case
|
||
|
endog = np.asarray(endog)
|
||
|
else:
|
||
|
# we need to standardize with the original mean and scale
|
||
|
if self.model.endog is not None:
|
||
|
m = self.model.endog.mean(0)
|
||
|
s = self.model.endog.std(ddof=1, axis=0)
|
||
|
if endog is None:
|
||
|
endog = self.model.endog
|
||
|
else:
|
||
|
endog = np.asarray(endog)
|
||
|
else:
|
||
|
raise ValueError('If transform is True, then `endog` needs ' +
|
||
|
'to be available in the Factor instance.')
|
||
|
|
||
|
endog = (endog - m) / s
|
||
|
|
||
|
s_mat = self.factor_score_params(method=method)
|
||
|
factors = endog.dot(s_mat)
|
||
|
return factors
|
||
|
|
||
|
def summary(self):
|
||
|
"""Summary"""
|
||
|
summ = summary2.Summary()
|
||
|
summ.add_title('Factor analysis results')
|
||
|
loadings_no_rot = pd.DataFrame(
|
||
|
self.loadings_no_rot,
|
||
|
columns=["factor %d" % (i)
|
||
|
for i in range(self.loadings_no_rot.shape[1])],
|
||
|
index=self.endog_names
|
||
|
)
|
||
|
if hasattr(self, "eigenvals"):
|
||
|
# eigenvals not available for ML method
|
||
|
eigenvals = pd.DataFrame(
|
||
|
[self.eigenvals], columns=self.endog_names, index=[''])
|
||
|
summ.add_dict({'': 'Eigenvalues'})
|
||
|
summ.add_df(eigenvals)
|
||
|
communality = pd.DataFrame([self.communality],
|
||
|
columns=self.endog_names, index=[''])
|
||
|
summ.add_dict({'': ''})
|
||
|
summ.add_dict({'': 'Communality'})
|
||
|
summ.add_df(communality)
|
||
|
summ.add_dict({'': ''})
|
||
|
summ.add_dict({'': 'Pre-rotated loadings'})
|
||
|
summ.add_df(loadings_no_rot)
|
||
|
summ.add_dict({'': ''})
|
||
|
if self.rotation_method is not None:
|
||
|
loadings = pd.DataFrame(
|
||
|
self.loadings,
|
||
|
columns=["factor %d" % (i)
|
||
|
for i in range(self.loadings.shape[1])],
|
||
|
index=self.endog_names
|
||
|
)
|
||
|
summ.add_dict({'': '%s rotated loadings' % (self.rotation_method)})
|
||
|
summ.add_df(loadings)
|
||
|
return summ
|
||
|
|
||
|
def get_loadings_frame(self, style='display', sort_=True, threshold=0.3,
|
||
|
highlight_max=True, color_max='yellow',
|
||
|
decimals=None):
|
||
|
"""get loadings matrix as DataFrame or pandas Styler
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
style : 'display' (default), 'raw' or 'strings'
|
||
|
Style to use for display
|
||
|
|
||
|
* 'raw' returns just a DataFrame of the loadings matrix, no options are
|
||
|
applied
|
||
|
* 'display' add sorting and styling as defined by other keywords
|
||
|
* 'strings' returns a DataFrame with string elements with optional sorting
|
||
|
and suppressing small loading coefficients.
|
||
|
|
||
|
sort_ : bool
|
||
|
If True, then the rows of the DataFrame is sorted by contribution of each
|
||
|
factor. applies if style is either 'display' or 'strings'
|
||
|
threshold : float
|
||
|
If the threshold is larger than zero, then loading coefficients are
|
||
|
either colored white (if style is 'display') or replace by empty
|
||
|
string (if style is 'strings').
|
||
|
highlight_max : bool
|
||
|
This add a background color to the largest coefficient in each row.
|
||
|
color_max : html color
|
||
|
default is 'yellow'. color for background of row maximum
|
||
|
decimals : None or int
|
||
|
If None, then pandas default precision applies. Otherwise values are
|
||
|
rounded to the specified decimals. If style is 'display', then the
|
||
|
underlying dataframe is not changed. If style is 'strings', then
|
||
|
values are rounded before conversion to strings.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
loadings : DataFrame or pandas Styler instance
|
||
|
The return is a pandas Styler instance, if style is 'display' and
|
||
|
at least one of highlight_max, threshold or decimals is applied.
|
||
|
Otherwise, the returned loadings is a DataFrame.
|
||
|
|
||
|
Examples
|
||
|
--------
|
||
|
>>> mod = Factor(df, 3, smc=True)
|
||
|
>>> res = mod.fit()
|
||
|
>>> res.get_loadings_frame(style='display', decimals=3, threshold=0.2)
|
||
|
|
||
|
To get a sorted DataFrame, all styling options need to be turned off:
|
||
|
|
||
|
>>> df_sorted = res.get_loadings_frame(style='display',
|
||
|
... highlight_max=False, decimals=None, threshold=0)
|
||
|
|
||
|
Options except for highlighting are available for plain test or Latex
|
||
|
usage:
|
||
|
|
||
|
>>> lds = res_u.get_loadings_frame(style='strings', decimals=3,
|
||
|
... threshold=0.3)
|
||
|
>>> print(lds.to_latex())
|
||
|
"""
|
||
|
|
||
|
loadings_df = pd.DataFrame(
|
||
|
self.loadings,
|
||
|
columns=["factor %d" % (i)
|
||
|
for i in range(self.loadings.shape[1])],
|
||
|
index=self.endog_names
|
||
|
)
|
||
|
|
||
|
if style not in ['raw', 'display', 'strings']:
|
||
|
msg = "style has to be one of 'raw', 'display', 'strings'"
|
||
|
raise ValueError(msg)
|
||
|
|
||
|
if style == 'raw':
|
||
|
return loadings_df
|
||
|
|
||
|
# add sorting and some formatting
|
||
|
if sort_ is True:
|
||
|
loadings_df2 = loadings_df.copy()
|
||
|
n_f = len(loadings_df2)
|
||
|
high = np.abs(loadings_df2.values).argmax(1)
|
||
|
loadings_df2['high'] = high
|
||
|
loadings_df2['largest'] = np.abs(loadings_df.values[np.arange(n_f), high])
|
||
|
loadings_df2.sort_values(by=['high', 'largest'], ascending=[True, False], inplace=True)
|
||
|
loadings_df = loadings_df2.drop(['high', 'largest'], axis=1)
|
||
|
|
||
|
if style == 'display':
|
||
|
sty = None
|
||
|
if threshold > 0:
|
||
|
def color_white_small(val):
|
||
|
"""
|
||
|
Takes a scalar and returns a string with
|
||
|
the css property `'color: white'` for small values, black otherwise.
|
||
|
|
||
|
takes threshold from outer scope
|
||
|
"""
|
||
|
color = 'white' if np.abs(val) < threshold else 'black'
|
||
|
return 'color: %s' % color
|
||
|
try:
|
||
|
sty = loadings_df.style.map(color_white_small)
|
||
|
except AttributeError:
|
||
|
# Deprecated in pandas 2.1
|
||
|
sty = loadings_df.style.applymap(color_white_small)
|
||
|
|
||
|
if highlight_max is True:
|
||
|
def highlight_max(s):
|
||
|
'''
|
||
|
highlight the maximum in a Series yellow.
|
||
|
'''
|
||
|
s = np.abs(s)
|
||
|
is_max = s == s.max()
|
||
|
return ['background-color: '+ color_max if v else '' for v in is_max]
|
||
|
|
||
|
if sty is None:
|
||
|
sty = loadings_df.style
|
||
|
|
||
|
sty = sty.apply(highlight_max, axis=1)
|
||
|
|
||
|
if decimals is not None:
|
||
|
if sty is None:
|
||
|
sty = loadings_df.style
|
||
|
|
||
|
sty.format("{:.%sf}" % decimals)
|
||
|
|
||
|
if sty is None:
|
||
|
return loadings_df
|
||
|
else:
|
||
|
return sty
|
||
|
|
||
|
if style == 'strings':
|
||
|
ld = loadings_df
|
||
|
if decimals is not None:
|
||
|
ld = ld.round(decimals)
|
||
|
ld = ld.astype(str)
|
||
|
if threshold > 0:
|
||
|
ld[loadings_df.abs() < threshold] = ''
|
||
|
return ld
|
||
|
|
||
|
def plot_scree(self, ncomp=None):
|
||
|
"""
|
||
|
Plot of the ordered eigenvalues and variance explained for the loadings
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
ncomp : int, optional
|
||
|
Number of loadings to include in the plot. If None, will
|
||
|
included the same as the number of maximum possible loadings
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
Figure
|
||
|
Handle to the figure.
|
||
|
"""
|
||
|
_import_mpl()
|
||
|
from .plots import plot_scree
|
||
|
return plot_scree(self.eigenvals, self.n_comp, ncomp)
|
||
|
|
||
|
def plot_loadings(self, loading_pairs=None, plot_prerotated=False):
|
||
|
"""
|
||
|
Plot factor loadings in 2-d plots
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
loading_pairs : None or a list of tuples
|
||
|
Specify plots. Each tuple (i, j) represent one figure, i and j is
|
||
|
the loading number for x-axis and y-axis, respectively. If `None`,
|
||
|
all combinations of the loadings will be plotted.
|
||
|
plot_prerotated : True or False
|
||
|
If True, the loadings before rotation applied will be plotted. If
|
||
|
False, rotated loadings will be plotted.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
figs : a list of figure handles
|
||
|
"""
|
||
|
_import_mpl()
|
||
|
from .plots import plot_loadings
|
||
|
|
||
|
if self.rotation_method is None:
|
||
|
plot_prerotated = True
|
||
|
loadings = self.loadings_no_rot if plot_prerotated else self.loadings
|
||
|
if plot_prerotated:
|
||
|
title = 'Prerotated Factor Pattern'
|
||
|
else:
|
||
|
title = '%s Rotated Factor Pattern' % (self.rotation_method)
|
||
|
var_explained = self.eigenvals / self.n_comp * 100
|
||
|
|
||
|
return plot_loadings(loadings, loading_pairs=loading_pairs,
|
||
|
title=title, row_names=self.endog_names,
|
||
|
percent_variance=var_explained)
|
||
|
|
||
|
@cache_readonly
|
||
|
def fitted_cov(self):
|
||
|
"""
|
||
|
Returns the fitted covariance matrix.
|
||
|
"""
|
||
|
|
||
|
c = np.dot(self.loadings, self.loadings.T)
|
||
|
c.flat[::c.shape[0]+1] += self.uniqueness
|
||
|
return c
|
||
|
|
||
|
@cache_readonly
|
||
|
def uniq_stderr(self, kurt=0):
|
||
|
"""
|
||
|
The standard errors of the uniquenesses.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
kurt : float
|
||
|
Excess kurtosis
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
If excess kurtosis is known, provide as `kurt`. Standard
|
||
|
errors are only available if the model was fit using maximum
|
||
|
likelihood. If `endog` is not provided, `nobs` must be
|
||
|
provided to obtain standard errors.
|
||
|
|
||
|
These are asymptotic standard errors. See Bai and Li (2012)
|
||
|
for conditions under which the standard errors are valid.
|
||
|
|
||
|
The standard errors are only applicable to the original,
|
||
|
unrotated maximum likelihood solution.
|
||
|
"""
|
||
|
|
||
|
if self.fa_method.lower() != "ml":
|
||
|
msg = "Standard errors only available under ML estimation"
|
||
|
raise ValueError(msg)
|
||
|
|
||
|
if self.nobs is None:
|
||
|
msg = "nobs is required to obtain standard errors."
|
||
|
raise ValueError(msg)
|
||
|
|
||
|
v = self.uniqueness**2 * (2 + kurt)
|
||
|
return np.sqrt(v / self.nobs)
|
||
|
|
||
|
@cache_readonly
|
||
|
def load_stderr(self):
|
||
|
"""
|
||
|
The standard errors of the loadings.
|
||
|
|
||
|
Standard errors are only available if the model was fit using
|
||
|
maximum likelihood. If `endog` is not provided, `nobs` must be
|
||
|
provided to obtain standard errors.
|
||
|
|
||
|
These are asymptotic standard errors. See Bai and Li (2012)
|
||
|
for conditions under which the standard errors are valid.
|
||
|
|
||
|
The standard errors are only applicable to the original,
|
||
|
unrotated maximum likelihood solution.
|
||
|
"""
|
||
|
|
||
|
if self.fa_method.lower() != "ml":
|
||
|
msg = "Standard errors only available under ML estimation"
|
||
|
raise ValueError(msg)
|
||
|
|
||
|
if self.nobs is None:
|
||
|
msg = "nobs is required to obtain standard errors."
|
||
|
raise ValueError(msg)
|
||
|
|
||
|
v = np.outer(self.uniqueness, np.ones(self.loadings.shape[1]))
|
||
|
return np.sqrt(v / self.nobs)
|