'''
Working with categorical data
=============================

use of dummy variables, group statistics, within and between statistics
examples for efficient matrix algebra

dummy versions require that the number of unique groups or categories is not
too large
group statistics with scipy.ndimage can handle large number of observations
and groups
scipy.ndimage stats is missing count

new: np.bincount can also be used for calculating values per label
'''
try:
    from statsmodels.compat.python import lrange
except ImportError:  # keep the module importable outside of statsmodels
    def lrange(*args, **kwargs):
        '''Return ``range(*args, **kwargs)`` as a list.'''
        return list(range(*args, **kwargs))

import numpy as np
from scipy import ndimage

# problem: ndimage does not allow axis argument,
#   calculates mean or var corresponding to axis=None in np.mean, np.var
#   useless for multivariate application


def labelmeanfilter(y, x):
    '''Return, for each observation, the mean of `x` within its group.

    Parameters
    ----------
    y : ndarray of int
        Group label of each observation. Means are computed for the labels
        ``0, 1, ..., max(y)`` and `y` is used to index into them.
    x : ndarray
        Data, passed to ``ndimage.mean`` together with `y` as labels.

    Returns
    -------
    ndarray
        Group mean of `x` for each original observation.
    '''
    # requires integer labels
    # from mailing list scipy-user 2009-02-11
    labelsunique = np.arange(np.max(y) + 1)
    labelmeans = np.array(ndimage.mean(x, labels=y, index=labelsunique))
    # returns label means for each original observation
    return labelmeans[y]


# groupcount: i.e. number of observation by group/label
# np.array(ndimage.histogram(yrvs[:,0],0,10,1,labels=yrvs[:,0],
#                            index=np.unique(yrvs[:,0])))

def labelmeanfilter_nd(y, x):
    '''Group counts and group means for the columns of a 2d array.

    Parameters
    ----------
    y : ndarray of int
        Group label of each observation; labels ``0, ..., max(y)`` are used.
    x : ndarray, 2d
        Data with variables in columns; each column is processed separately.

    Returns
    -------
    labelcount : ndarray
        Per-label result of ``ndimage.histogram`` with a single bin, i.e. the
        number of observations for each label.
    labmeans : ndarray
        Group means, one row per column of `x`.
    labmeansdata : ndarray
        Group mean for each original observation, same layout as `x`.
    '''
    # requires integer labels
    # from mailing list scipy-user 2009-02-11
    # adjusted for 2d x with column variables
    labelsunique = np.arange(np.max(y) + 1)
    labmeansdata = []
    labmeans = []

    for xx in x.T:
        labelmeans = np.array(ndimage.mean(xx, labels=y, index=labelsunique))
        labmeansdata.append(labelmeans[y])
        labmeans.append(labelmeans)

    # group count: one histogram bin spanning all label values, per label
    labelcount = np.array(ndimage.histogram(y, labelsunique[0],
                                            labelsunique[-1] + 1, 1,
                                            labels=y, index=labelsunique))

    # returns array of label/group counts and of label/group means
    #     and label/group means for each original observation
    return labelcount, np.array(labmeans), np.array(labmeansdata).T


def labelmeanfilter_str(ys, x):
    '''Return, for each observation, the mean of `x` within its group.

    Works also for string labels in `ys`, but requires 1D.

    Parameters
    ----------
    ys : ndarray, 1d
        Group labels; converted to integer codes with ``np.unique``.
    x : ndarray, 1d
        Data.

    Returns
    -------
    ndarray
        Group mean of `x` for each original observation.
    '''
    # from mailing list scipy-user 2009-02-11
    unil, unilinv = np.unique(ys, return_index=False, return_inverse=True)
    # index over the integer codes, not over the label values themselves:
    # arithmetic on the labels is not possible for string labels
    labelmeans = np.array(ndimage.mean(x, labels=unilinv,
                                       index=np.arange(len(unil))))
    return labelmeans[unilinv]


def groupstatsbin(factors, values):
    '''Group count, mean and within-group variance using np.bincount.

    Parameters
    ----------
    factors : array_like
        Group labels; converted to integer codes with ``np.unique``.
    values : array_like
        Data, used as weights in ``np.bincount``.

    Returns
    -------
    gcount : ndarray
        Number of observations per group.
    gmean : ndarray
        Mean of `values` per group.
    meanarr : ndarray
        Group mean for each original observation.
    withinvar : ndarray
        Mean squared deviation from the group mean, per group.
    withinvararr : ndarray
        `withinvar` for each original observation.
    '''
    _, rind = np.unique(factors, return_inverse=True)
    gcount = np.bincount(rind)
    gmean = np.bincount(rind, weights=values) / (1.0 * gcount)
    meanarr = gmean[rind]
    withinvar = np.bincount(rind, weights=(values - meanarr)**2) / (1.0 * gcount)
    withinvararr = withinvar[rind]
    return gcount, gmean, meanarr, withinvar, withinvararr


def convertlabels(ys, indices=None):
    '''convert labels based on multiple variables or string labels to unique
    index labels 0,1,2,...,nk-1 where nk is the number of distinct labels

    Parameters
    ----------
    ys : ndarray
        Labels. If `ys` is 2d and more than one index is given, each row is
        turned into a single string label built from the selected columns.
    indices : None or sequence of int
        Columns of `ys` that jointly define the label. If None, or if it
        selects a single column, `ys` is used as is.

    Returns
    -------
    unilinv : ndarray
        Integer code of each label, as returned by ``np.unique``.
    uniques : ndarray
        ``np.arange(nk)``.
    unil : ndarray
        The distinct labels.
    '''
    if indices is None:
        ylabel = ys
    else:
        idx = np.array(indices)
        if idx.size > 1 and ys.ndim == 2:
            # raw bytes of the selected columns serve as a joint row label
            ylabel = np.array(['@%s@' % ii[idx].tobytes()
                               for ii in ys])[:, np.newaxis]
        else:
            # there might be a problem here
            ylabel = ys

    unil, unilinv = np.unique(ylabel, return_index=False, return_inverse=True)
    return unilinv, np.arange(len(unil)), unil


def groupsstats_1d(y, x, labelsunique):
    '''use ndimage to get fast mean and variance

    Parameters
    ----------
    y : ndarray of int
        Group label of each observation.
    x : ndarray
        Data.
    labelsunique : array_like
        Labels for which the statistics are computed.

    Returns
    -------
    labelmeans : ndarray
        Result of ``ndimage.mean`` for each label.
    labelvars : ndarray
        Result of ``ndimage.variance`` for each label.
    '''
    labelmeans = np.array(ndimage.mean(x, labels=y, index=labelsunique))
    labelvars = np.array(ndimage.variance(x, labels=y, index=labelsunique))
    return labelmeans, labelvars


def cat2dummy(y, nonseq=0):
    '''Build a matrix of dummy variables from category labels.

    Parameters
    ----------
    y : ndarray
        Category labels, 1d or 2d.
    nonseq : bool or int
        If false and `y` has at most one column, the categories are all
        integers from ``y.min()`` to ``y.max()``. Otherwise the labels are
        first converted to integer codes with ``convertlabels``; a `y` with
        several columns is always converted, using all columns jointly.

    Returns
    -------
    dummy : ndarray of int
        Array with one row per observation and one column per category.
    '''
    if nonseq or (y.ndim == 2 and y.shape[1] > 1):
        # 1d labels have no columns to combine
        indices = lrange(y.shape[1]) if y.ndim == 2 else None
        ycat, uniques, unitransl = convertlabels(y, indices)
    else:
        ycat = y.copy()
        uniques = np.arange(y.min(), y.max() + 1)
    if ycat.ndim == 1:
        ycat = ycat[:, np.newaxis]
    # this builds matrix nobs*ncat
    dummy = (ycat == uniques).astype(int)
    return dummy


def groupsstats_dummy(y, x, nonseq=0):
    '''Group means and variances computed with a dummy variable matrix.

    Parameters
    ----------
    y : ndarray
        Category labels, see ``cat2dummy``.
    x : ndarray
        Data, 1d or 2d with variables in columns.
    nonseq : bool or int
        Passed on to ``cat2dummy``.

    Returns
    -------
    meangr : ndarray
        Group means, one row per variable and one column per category.
        Categories without observations have nan.
    vargr : ndarray
        Mean squared deviation from the group mean, same layout as `meangr`.
    xdevmeangr : ndarray
        Deviation of each observation from its group mean.
    countgr : ndarray of float
        Number of observations per category.
    '''
    if x.ndim == 1:
        # use groupsstats_1d
        x = x[:, np.newaxis]
    dummy = cat2dummy(y, nonseq=nonseq)
    countgr = dummy.sum(0, dtype=float)
    nonempty = countgr > 0
    with np.errstate(divide='ignore', invalid='ignore'):
        meangr = np.dot(x.T, dummy) / countgr
    # category/group means as array in shape of x; the nan of an empty
    # category is replaced by 0 here because 0 * nan would otherwise turn the
    # mean of every observation into nan
    meandata = np.dot(dummy, np.where(nonempty, meangr, 0.0).T)
    xdevmeangr = x - meandata  # deviation from category/group mean
    with np.errstate(divide='ignore', invalid='ignore'):
        vargr = np.dot((xdevmeangr * xdevmeangr).T, dummy) / countgr
    return meangr, vargr, xdevmeangr, countgr