Scripts:mkndhist.py
From UC_Chemistry
This program makes histograms in any number of variables, and outputs the results as a string of counts which must be read in the appropriate shape later. Its usage should be pretty self-explanatory. See also its cousin, Scripts:mkhist.py, which only does 1-D and doesn't output the counts.
[edit]
Source
</nowiki>
#!/usr/bin/env python
import sys
try:
from pylab import *
except RuntimeError:
pass
from numpy import *
UsageInfo = "Usage: calc_prob.py <input data file> <number of bins> <output>\n"\ "Options:\n" \
"\t-n [data columns] Columns of input file containing data points.\n"\
"\t-start [vec] Start of bins.\n"\
"\t-step [vec] Bin step sizes.\n"\
"\t-wrap d x0 x1 Wrap data in output dimension d to range [x0,x1).\n"\
"\t-w [wt col] Column containing weight for each sample.\n"\
"\t-wf [wt file] File containing weight for each sample.\n"\
"\t-scale [vec] Scale input data.\n"\
"\t-shift [vec] Shift input data.\n"\
"\t-b [time] Discard data points with column 0 below this value.\n"\
"\t-e [time] Discard data points with column 0 above this value.\n"\
"\t-g Write the output as a graph instead of a list.\n"\
"\t-norm Normalize output histogram to 1 (default is no).\n"\
"\t-h,--help Print out this useful reference.\n\n"\
"Comments:\n" \
"\tData are read from an input file in matrix format. Column 0 is assumed\n"\
" to contain the consecutive time index for -b. Input data is read from\n"\
" the file and -b is applied. Next -wrap, -scale, and -shift are applied\n"\" in that order. Note that the vectors must be contained in a single\n"\
" argument i.e. use quotes around them. If a weight col. is not given,\n"\
" any weight input file is assumed to be in binary format.\n"
def main(argv):
col = []
x0 = []
dx = []
graph = 0
wraps = []
scale = []
shift = []
start = "first"
end = "last"
weighted = 0
norm = 0
wtn = None
wt = None
while(len(argv) > 3):
if(argv[1] == "-b"):
start = float(argv[2])
argv = argv[2:]
elif(argv[1] == "-e"):
end = float(argv[2])
argv = argv[2:]
elif(argv[1] == "-start"):
x0 += [float(tok) for tok in argv[2].split()]
argv = argv[2:]
elif(argv[1] == "-step"):
dx += [float(tok) for tok in argv[2].split()]
argv = argv[2:]
elif(argv[1] == "-n"):
col += [int(tok) for tok in argv[2].split()]
argv = argv[2:]
elif(argv[1] == "-w"):
weighted = 1
wtn = int(argv[2])
argv = argv[2:]
elif(argv[1] == "-wf"):
weighted = 1
wt = argv[2]
argv = argv[2:]
elif(argv[1] == "-norm"):
norm = 1
argv = argv[1:]
elif(argv[1] == "-g"):
graph = 1
argv = argv[1:]
elif(argv[1] == "-wrap"):
wraps.append([int(argv[2]), float(argv[3]), float(argv[4])])
argv = argv[4:]
elif(argv[1] == "-scale"):
scale += [float(tok) for tok in argv[2].split()]
argv = argv[2:]
elif(argv[1] == "-shift"):
shift += [float(tok) for tok in argv[2].split()]
argv = argv[2:]
elif(argv[1] == "--"):
del argv[1]
break
elif(argv[1] in ["-h","-help","--help"]):
print UsageInfo
return 0
else:
break
if(len(argv) < 4):
print UsageInfo
return 1
data = readdata(argv[1])
#print D
bins = []
for i in argv[2:-1]:
for tok in i.split():
bins.append(int(tok))
n = len(bins) # Dimensionality
# Check input.
if(n == 0):
print "Error! Number of bins in ea. dimension must be specified."
print UsageInfo
return 1
# Assign defaults
if(col == []):
col = range(1, n+1)
if(len(col) != n):
print "Error! Number of input col.s (%d) doesn't match "\
"dimensionality (%d)"%(len(col), n)
return 1
if(scale == []):
scale = [1.0]*n
if(len(scale) != n):
print "Error! Number of input scales (%d) doesn't match "\
"dimensionality (%d)"%(len(scale), n)
return 1
if(shift == []):
shift = [0.0]*n
if(len(shift) != n):
print "Error! Number of input shifts (%d) doesn't match "\
"dimensionality (%d)"%(len(shift), n)
return 1
# Read weight file.
if weighted:
if wtn == None: # No col. given, assume file is binary.
wt = fromfile(wt, float)
elif wt == None: # No file given, take from input.
wt = data[:,wtn]
else: # Both given, read given col. of weight file.
wt = readdata(wt)[:,wtn]
if len(wt) != data.shape[0]:
print "Error! Number of input weights (%d) does not match "\ "number of input samples (%d)"%(len(wt), data.shape[0])
# Cut out important columns
D = zeros((data.shape[0], n), float)
for i,d in enumerate(col):
D[:,i] = data[:,d]
for wrap in wraps:
print "Wrapping column %d data to range [ %f, %f )."%(wrap[0],\
wrap[1], wrap[2])
L = float(wrap[2]-wrap[1])
D[:,wrap[0]] -= L*floor((D[:,wrap[0]]-wrap[1])/L)
if(start != "first" or end != "last"): # by assumption, 0 is the time
time = data[:,0]
if(start == "first"):
start = time[0]
if(end == "last"):
end = time[-1]
if end < start:
raise RuntimeError, "Error! start and end time are bogus!"
if(time[-1] < start or time[0] > end):
print "No valid time points in data, using all!"
i = 0
j = -1
else:
for i in range(len(time)): # search for start time
if(time[i] >= start):
break
for j in range(len(time)-1, i-1, -1):
if(time[j] <= end): # search backward for end time
break
j += 1
time = time[i:j]
else:
i = 0
j = -1
if(j != -1):
D = D[i:j]*scale+shift
if weighted:
wt = wt[i:j]
else:
D = D[i:]*scale+shift
if weighted:
wt = wt[i:]
del data
if(x0 == []):
for i in range(n):
x0.append(min(D[:,i]))
if(len(x0) != n):
print "Error! Number of input bin starts (%d) doesn't match "\
"dimensionality (%d)"%(len(x0), n)
return 1
if(dx == []):
for i,st,m in zip(range(n),x0,bins):
l = max(D[:,i])-st+2.0e-10
dx.append(l/m)
if(len(dx) != n):
print "Error! Number of input bin widths (%d) doesn't match "\
"dimensionality (%d)"%(len(dx), n)
return 1
bin_spec = [(start,step,b) for start,step,b in zip(x0,dx,bins)]
# The user output.
print "Bin Start Step N"
for i,s in enumerate(bin_spec):
print "%d %8.3f %8.3e %5d"%(i+1,s[0],s[1],s[2])
# The buisness.
ostr = " %3d"
if weighted:
print "Creating weighted histograms..."
ostr = " %e"
hist = counter(bin_spec, float)
for v,w in zip(D,wt):
hist.append(v,w)
hdat = hist.stats()
if norm:
hdat /= hist.N
else:
print "Creating histograms..."
hist = counter(bin_spec)
for v in D:
hist.append(v)
hdat = hist.stats()
if norm:
hdat /= float(hist.N)
ostr = " %e"
if graph and len(bin_spec) == 1:
write_graph(argv[-1], hdat, bin_spec[0], '%10f'+ostr+'\n')
else:
write_his(argv[-1], hdat, ostr)
print "%d unclassified points."%(hist.bin[-1])
if(n == 2):
x = arange(x0[0], x0[0]+dx[0]*(bins[0]+0.5), dx[0])
y = arange(x0[1], x0[1]+dx[1]*(bins[1]+0.5), dx[1])
#X,Y = meshgrid(x,y)
#pcolor(X, Y, hist.stats(), shading='flat') # slow
rc('image', cmap='jet')
im = imshow(transpose(hist.stats()), \
interpolation='nearest',origin='lowerleft', \
extent=(x[0],x[-1],y[0],y[-1]))
colorbar(im)
show()
if not weighted:
wt = ones(len(D))
m = sum(wt*D[:,0])/sum(wt), sum(wt*D[:,1])/sum(wt)
c00 = sum(wt*(D[:,0]-m[0])*(D[:,0]-m[0]))/sum(wt)
c01 = sum(wt*(D[:,0]-m[0])*(D[:,1]-m[1]))/sum(wt)
c11 = sum(wt*(D[:,1]-m[1])*(D[:,1]-m[1]))/sum(wt)
print "Mean: %e %e"%(m[0], m[1])
print "Cov: %e %e"%(c00, c01)
print " %e %e"%(c01, c11)
print "rho: %e"%(c01/sqrt(c00*c11))
# print mean matrix
line=" "
for i in range(n):
line += " %3d "%(i+1)
print line
line=" "
for i in range(n):
line += " %7e"%(mean(hist.marginaldist(i), bin_spec[i]))
print line
print
# print variance-covariance matrix
line=" "
for i in range(n):
line += " %3d "%(i+1)
print line
for i in range(n):
line="%3d"%(i+1)
for j in range(n):
if i<j:
print " "
continue
line += " %7e"%(cov(hist.marginaldist(i,j), \
bin_spec[i], bin_spec[j]))
print line
def readdata(name):
digits = "01234567890.-+"
file = open(name)
list = []
cols = -1
i = 0
for line in file.xreadlines():
j = 0
for tok in line.split():
if(tok[0] not in digits):
break
list.append(float(tok))
j += 1
if(j > 0):
if(cols == -1):
cols = j
elif(j != cols):
raise ValueError, "Error, line %d contains wrong " \
"number of columns"%(i+1)
i += 1
if(cols == -1):
raise ValueError, "No data!"
file.close()
return reshape(array(list), (i,cols))
class counter:
def __init__(self, bin_spec, bintype=int):
self.dim = len(bin_spec)
self.start = []
self.step = []
self.n = []
self.N = bintype(0)
l = 1
for i in range(self.dim):
self.start.append(bin_spec[i][0])
self.step.append(bin_spec[i][1])
self.n.append(int(bin_spec[i][2]))
#print "start = %f, step = %f, n = %d"%(self.start[i],\
# self.step[i], self.n[i])
l *= self.n[i]
print "len = %d"%(l+1)
# extra space in array to hold total hits outside of query
self.bin = zeros(l+1, bintype)
def append(self, x, w=1):
if(len(x) != self.dim):
raise RuntimeError, "Bad call to counter.append()"
try:
self.bin[self.b(x)] += w
except IndexError:
print x, self.b(x)
self.N += w
def b(self, x): # build index to bin array
i = 0 # index
N = 1 # step size
# starting at least significant side (right), add to index
for d in range(self.dim-1, -1, -1):
j = int((x[d]-self.start[d])/self.step[d])
if(x[d] < self.start[d] or j < 0 or j >= self.n[d]):
i = -1
break
else:
i += N*j
N *= self.n[d]
return i
def marginaldist(self, *dims):
doff = 0
x = reshape(self.bin[:-1], self.n)
#print dims
for i in reversed(range(self.dim)):
if i not in dims:
x = sum(x, i)
#print x.shape
if len(x.shape) == 1 and len(dims) == 2:
return diag(x)
return x
def N(self):
return self.N
def stats(self):
return reshape(self.bin[:-1], self.n)
def mean(x, spec):
y = spec[1]*array(range(spec[2]))+spec[0]
return sum(x*y)/float(sum(x))
def cov(xy,xspec,yspec):
x = sum(xy, 0)
y = sum(xy, 1)
xv = xspec[1]*array(range(xspec[2]))+xspec[0]
yv = yspec[1]*array(range(yspec[2]))+yspec[0]
xv -= sum(x*xv)/float(sum(x)) # subtract mean
yv -= sum(y*yv)/float(sum(y)) # subtract mean
return sum(xy*xv*yv[:,newaxis])/sum(xy)
def write_his(name, bin, ostr=" %3d"):
out = open(name, 'w')
sn = bin.shape
bin = reshape(bin, bin.size)
# O(n log n) method for finding number of \n-s
for i in range(int(prod(sn[0:-1]))):
line = "" # Write last dim. on one line.
for j in range(sn[-1]*i, sn[-1]*(i+1)):
line += ostr%(bin[j])
out.write(line+'\n')
del line
if(len(sn) > 1): # Figure out number of \n-s
p = 1
for n in range(len(sn)-2, -1, -1):
p *= sn[n]
if(i % p == p-1):
out.write('\n')
out.close()
def write_graph(name, list, dim=(0., 1.), fmt="%10f %e\n"):
out = open(name, 'w')
if(len(list.shape) != 1):
raise ValueError, "Error: List should be 1D!"
for i in range(len(list)):
out.write(fmt%(dim[0]+(i+0.5)*dim[1], list[i]))
out.write('\n')
out.close()
if __name__=='__main__':
main(sys.argv)
</nowiki>
