Scripts:mkndhist.py

From UC_Chemistry

Jump to: navigation, search

This program makes histograms in any number of variables, and outputs the results as a string of counts which must be read in the appropriate shape later. Its usage should be pretty self-explanatory. See also its cousin, Scripts:mkhist.py, which only does 1-D and doesn't output the counts.

Source

</nowiki>
#!/usr/bin/env python

import sys

try:
        from pylab import *
except RuntimeError:
        pass

from numpy import *

UsageInfo = "Usage: calc_prob.py <input data file> <number of bins> <output>\n"\   "Options:\n" \
   "\t-n [data columns] Columns of input file containing data points.\n"\
   "\t-start [vec]      Start of bins.\n"\
   "\t-step  [vec]      Bin step sizes.\n"\
   "\t-wrap  d x0 x1    Wrap data in output dimension d to range [x0,x1).\n"\
   "\t-w     [wt col]   Column containing weight for each sample.\n"\
   "\t-wf    [wt file]  File containing weight for each sample.\n"\
   "\t-scale [vec]      Scale input data.\n"\
   "\t-shift [vec]      Shift input data.\n"\
   "\t-b     [time]     Discard data points with column 0 below this value.\n"\
   "\t-e     [time]     Discard data points with column 0 above this value.\n"\
   "\t-g                Write the output as a graph instead of a list.\n"\
   "\t-norm             Normalize output histogram to 1 (default is no).\n"\
   "\t-h,--help         Print out this useful reference.\n\n"\
   "Comments:\n" \
"\tData are read from an input file in matrix format.  Column 0 is assumed\n"\
"    to contain the consecutive time index for -b.  Input data is read from\n"\
"    the file and -b is applied.  Next -wrap, -scale, and -shift are applied\n"\"    in that order.  Note that the vectors must be contained in a single\n"\
"    argument i.e. use quotes around them.  If a weight col. is not given,\n"\
"    any weight input file is assumed to be in binary format.\n"

def main(argv):
        col = []
        x0 = []
        dx = []
        graph = 0
        wraps = []
        scale = []
        shift = []
        start = "first"
        end = "last"
        weighted = 0
        norm = 0
        wtn = None
        wt = None
        while(len(argv) > 3):
            if(argv[1] == "-b"):
                start = float(argv[2])
                argv = argv[2:]
            elif(argv[1] == "-e"):
                end = float(argv[2])
                argv = argv[2:]
            elif(argv[1] == "-start"):
                x0 += [float(tok) for tok in argv[2].split()]
                argv = argv[2:]
            elif(argv[1] == "-step"):
                dx += [float(tok) for tok in argv[2].split()]
                argv = argv[2:]
            elif(argv[1] == "-n"):
                col += [int(tok) for tok in argv[2].split()]
                argv = argv[2:]
            elif(argv[1] == "-w"):
                weighted = 1
                wtn = int(argv[2])
                argv = argv[2:]
            elif(argv[1] == "-wf"):
                weighted = 1
                wt = argv[2]
                argv = argv[2:]
            elif(argv[1] == "-norm"):
                norm = 1
                argv = argv[1:]
            elif(argv[1] == "-g"):
                graph = 1
                argv = argv[1:]
            elif(argv[1] == "-wrap"):
                wraps.append([int(argv[2]), float(argv[3]), float(argv[4])])
                argv = argv[4:]
            elif(argv[1] == "-scale"):
                scale += [float(tok) for tok in argv[2].split()]
                argv = argv[2:]
            elif(argv[1] == "-shift"):
                shift += [float(tok) for tok in argv[2].split()]
                argv = argv[2:]
            elif(argv[1] == "--"):
                del argv[1]
                break
            elif(argv[1] in ["-h","-help","--help"]):
                print UsageInfo
                return 0
            else:
                break

        if(len(argv) < 4):
                print UsageInfo
                return 1

        data = readdata(argv[1])
        #print D
        bins = []
        for i in argv[2:-1]:
            for tok in i.split():
                bins.append(int(tok))
        n = len(bins) # Dimensionality

        # Check input.
        if(n == 0):
            print "Error! Number of bins in ea. dimension must be specified."
            print UsageInfo
            return 1

        # Assign defaults
        if(col == []):
                col = range(1, n+1)
        if(len(col) != n):
                print "Error! Number of input col.s (%d) doesn't match "\
                        "dimensionality (%d)"%(len(col), n)
                return 1
        if(scale == []):
                scale = [1.0]*n
        if(len(scale) != n):
                print "Error! Number of input scales (%d) doesn't match "\
                        "dimensionality (%d)"%(len(scale), n)
                return 1
        if(shift == []):
                shift = [0.0]*n
        if(len(shift) != n):
                print "Error! Number of input shifts (%d) doesn't match "\
                        "dimensionality (%d)"%(len(shift), n)
                return 1

        # Read weight file.
        if weighted:
                if wtn == None: # No col. given, assume file is binary.
                        wt = fromfile(wt, float)
                elif wt == None: # No file given, take from input.
                        wt = data[:,wtn]
                else: # Both given, read given col. of weight file.
                        wt = readdata(wt)[:,wtn]

                if len(wt) != data.shape[0]:
                    print "Error! Number of input weights (%d) does not match "\                        "number of input samples (%d)"%(len(wt), data.shape[0])

        # Cut out important columns
        D = zeros((data.shape[0], n), float)
        for i,d in enumerate(col):
                D[:,i] = data[:,d]

        for wrap in wraps:
                print "Wrapping column %d data to range [ %f, %f )."%(wrap[0],\
                                                wrap[1], wrap[2])
                L = float(wrap[2]-wrap[1])
                D[:,wrap[0]] -= L*floor((D[:,wrap[0]]-wrap[1])/L)

        if(start != "first" or end != "last"): # by assumption, 0 is the time
            time = data[:,0]
            if(start == "first"):
                start = time[0]
            if(end == "last"):
                end = time[-1]
            if end < start:
                raise RuntimeError, "Error! start and end time are bogus!"
            if(time[-1] < start or time[0] > end):
                print "No valid time points in data, using all!"
                i = 0
                j = -1
            else:
              for i in range(len(time)):        # search for start time
                if(time[i] >= start):
                        break
              for j in range(len(time)-1, i-1, -1):
                if(time[j] <= end): # search backward for end time
                        break
              j += 1
            time = time[i:j]
        else:
            i = 0
            j = -1
        if(j != -1):
                D = D[i:j]*scale+shift
                if weighted:
                        wt = wt[i:j]
        else:
                D = D[i:]*scale+shift
                if weighted:
                        wt = wt[i:]
        del data


        if(x0 == []):
                for i in range(n):
                        x0.append(min(D[:,i]))
        if(len(x0) != n):
                print "Error! Number of input bin starts (%d) doesn't match "\
                        "dimensionality (%d)"%(len(x0), n)
                return 1
        if(dx == []):
                for i,st,m in zip(range(n),x0,bins):
                        l = max(D[:,i])-st+2.0e-10
                        dx.append(l/m)
        if(len(dx) != n):
                print "Error! Number of input bin widths (%d) doesn't match "\
                        "dimensionality (%d)"%(len(dx), n)
                return 1

        bin_spec = [(start,step,b) for start,step,b in zip(x0,dx,bins)]
        # The user output.
        print "Bin  Start   Step     N"
        for i,s in enumerate(bin_spec):
                print "%d %8.3f %8.3e %5d"%(i+1,s[0],s[1],s[2])

        # The buisness.
        ostr = " %3d"
        if weighted:
            print "Creating weighted histograms..."
            ostr = " %e"
            hist = counter(bin_spec, float)
            for v,w in zip(D,wt):
                hist.append(v,w)
            hdat = hist.stats()
            if norm:
                hdat /= hist.N
        else:
            print "Creating histograms..."
            hist = counter(bin_spec)
            for v in D:
                hist.append(v)
            hdat = hist.stats()
            if norm:
                hdat /= float(hist.N)
                ostr = " %e"
        if graph and len(bin_spec) == 1:
                write_graph(argv[-1], hdat, bin_spec[0], '%10f'+ostr+'\n')
        else:
                write_his(argv[-1], hdat, ostr)

        print "%d unclassified points."%(hist.bin[-1])

        if(n == 2):
                x = arange(x0[0], x0[0]+dx[0]*(bins[0]+0.5), dx[0])
                y = arange(x0[1], x0[1]+dx[1]*(bins[1]+0.5), dx[1])
                #X,Y = meshgrid(x,y)
                #pcolor(X, Y, hist.stats(), shading='flat') # slow
                rc('image', cmap='jet')
                im = imshow(transpose(hist.stats()), \
                        interpolation='nearest',origin='lowerleft', \
                        extent=(x[0],x[-1],y[0],y[-1]))
                colorbar(im)
                show()
                if not weighted:
                        wt = ones(len(D))
                m = sum(wt*D[:,0])/sum(wt), sum(wt*D[:,1])/sum(wt)
                c00 = sum(wt*(D[:,0]-m[0])*(D[:,0]-m[0]))/sum(wt)
                c01 = sum(wt*(D[:,0]-m[0])*(D[:,1]-m[1]))/sum(wt)
                c11 = sum(wt*(D[:,1]-m[1])*(D[:,1]-m[1]))/sum(wt)
                print "Mean: %e %e"%(m[0], m[1])
                print "Cov:  %e %e"%(c00, c01)
                print "      %e %e"%(c01, c11)
                print "rho:  %e"%(c01/sqrt(c00*c11))

        # print mean matrix
        line="   "
        for i in range(n):
                line += "  %3d   "%(i+1)
        print line
        line="   "
        for i in range(n):
                line += " %7e"%(mean(hist.marginaldist(i), bin_spec[i]))
        print line
        print

        # print variance-covariance matrix
        line="   "
        for i in range(n):
                line += "  %3d   "%(i+1)
        print line
        for i in range(n):
                line="%3d"%(i+1)
                for j in range(n):
                        if i<j:
                                print "        "
                                continue
                        line += " %7e"%(cov(hist.marginaldist(i,j), \
                                                bin_spec[i], bin_spec[j]))
                print line

def readdata(name):
        digits = "01234567890.-+"

        file = open(name)
        list = []

        cols = -1
        i = 0
        for line in file.xreadlines():
                j = 0
                for tok in line.split():
                        if(tok[0] not in digits):
                                break
                        list.append(float(tok))
                        j += 1
                if(j > 0):
                    if(cols == -1):
                        cols = j
                    elif(j != cols):
                      raise ValueError, "Error, line %d contains wrong " \
                                "number of columns"%(i+1)
                    i += 1
        if(cols == -1):
                raise ValueError, "No data!"

        file.close()
        return reshape(array(list), (i,cols))

class counter:
        def __init__(self, bin_spec, bintype=int):
                self.dim = len(bin_spec)
                self.start = []
                self.step = []
                self.n = []
                self.N = bintype(0)
                l = 1
                for i in range(self.dim):
                        self.start.append(bin_spec[i][0])
                        self.step.append(bin_spec[i][1])
                        self.n.append(int(bin_spec[i][2]))
                        #print "start = %f, step = %f, n = %d"%(self.start[i],\
                        #               self.step[i], self.n[i])
                        l *= self.n[i]
                print "len = %d"%(l+1)
                # extra space in array to hold total hits outside of query
                self.bin = zeros(l+1, bintype)

        def append(self, x, w=1):
                if(len(x) != self.dim):
                        raise RuntimeError, "Bad call to counter.append()"
                try:
                        self.bin[self.b(x)] += w
                except IndexError:
                        print x, self.b(x)
                self.N += w

        def b(self, x):         # build index to bin array
                i = 0   # index
                N = 1   # step size
                # starting at least significant side (right), add to index
                for d in range(self.dim-1, -1, -1):
                        j = int((x[d]-self.start[d])/self.step[d])
                        if(x[d] < self.start[d] or j < 0 or j >= self.n[d]):
                                i = -1
                                break
                        else:
                                i += N*j
                        N *= self.n[d]
                return i

        def marginaldist(self, *dims):
                doff = 0
                x = reshape(self.bin[:-1], self.n)
                #print dims
                for i in reversed(range(self.dim)):
                        if i not in dims:
                                x = sum(x, i)
                #print x.shape
                if len(x.shape) == 1 and len(dims) == 2:
                        return diag(x)
                return x

        def N(self):
                return self.N

        def stats(self):
                return reshape(self.bin[:-1], self.n)

def mean(x, spec):
        y = spec[1]*array(range(spec[2]))+spec[0]
        return sum(x*y)/float(sum(x))

def cov(xy,xspec,yspec):
        x = sum(xy, 0)
        y = sum(xy, 1)
        xv = xspec[1]*array(range(xspec[2]))+xspec[0]
        yv = yspec[1]*array(range(yspec[2]))+yspec[0]
        xv -= sum(x*xv)/float(sum(x)) # subtract mean
        yv -= sum(y*yv)/float(sum(y)) # subtract mean

        return sum(xy*xv*yv[:,newaxis])/sum(xy)

def write_his(name, bin, ostr=" %3d"):
        out = open(name, 'w')

        sn = bin.shape
        bin = reshape(bin, bin.size)
        # O(n log n) method for finding number of \n-s
        for i in range(int(prod(sn[0:-1]))):
                line = "" # Write last dim. on one line.
                for j in range(sn[-1]*i, sn[-1]*(i+1)):
                    line += ostr%(bin[j])
                out.write(line+'\n')
                del line

                if(len(sn) > 1): # Figure out number of \n-s
                    p = 1
                    for n in range(len(sn)-2, -1, -1):
                        p *= sn[n]
                        if(i % p == p-1):
                            out.write('\n')

        out.close()

def write_graph(name, list, dim=(0., 1.), fmt="%10f %e\n"):
        out = open(name, 'w')

        if(len(list.shape) != 1):
                raise ValueError, "Error: List should be 1D!"

        for i in range(len(list)):
                out.write(fmt%(dim[0]+(i+0.5)*dim[1], list[i]))
        out.write('\n')

        out.close()

if __name__=='__main__':
        main(sys.argv)
</nowiki>
Personal tools