Scripts:mkhist.py

From UC_Chemistry

Jump to: navigation, search

This program is self-documenting, and basically a wrapper for pylab's hist() function. It pops-up a plot of the histogram which can then be saved to various image file formats.

Source

<nowiki>
#!/usr/bin/python

import sys
from matplotlib.pylab import *
from math import sqrt, exp, pi

UsageInfo =  "Usage: mkhist.py [options] <input data file> <number of bins>\n" \   "Options:\n" \
   "\t-n  [data column] Column of input file containing data points.\n"\
   "\t-t     [title]    Title your graph.\n"\
   "\t-x     [xlabel]   X-Axis label.\n"\
   "\t-y     [ylabel]   Y-Axis label.\n"\
   "\t-b     [time]     Discard data points with column 0 below this value.\n"\
   "\t-e     [time]     Discard data points with column 0 above this value.\n"\
   "\t-L     [-inf inf) Wrap input data to specified interval.\n"\
   "\t-scale [1.0]      Scale input data.\n"\
   "\t-shift [0.0]      Shift input data.\n"\
   "\t-osd   [dev]      Discard points > \"dev\" std. dev.s from mean.\n"\
   "\t-omax  [max]      Discard points > \"max\".\n"\
   "\t-omin  [max]      Discard points < \"min\".\n"\
   "\t-orec             Recalculate avg. and dev. after above discards.\n"\
   "\t-out <filename>   Output data after transformations.\n"\
   "\t-norm [spec/off]  Display normal distribution corresponding to data\n"\
   "\t                     with graph specification \"spec\" -- default g--.\n"\   "\t-h,--help         Print out this useful reference.\n\n"\
   "Comments:\n" \
"\tData are read from an input file in matrix format.  Column 0 is assumed\n"\
"    to contain the consecutive time index for -b.  Input data is read from\n"\
"    the file and -b is applied.  Next -scale and -shift are applied in that\n"\"    order.  Finally the average and deviation are calculated and outliers\n"\
"    are discarded (if -osd, -omax, or -omin are selected).\n"

def main(argv):
        xatitle = "X"
        yatitle = "N"
        col = 1
        start = "first"
        end = "last"
        ptitle = "Histogram Plot"
        outrecalc = 0
        norm = "g--"
        scale = 1.0
        shift = 0.0
        osd = "no"
        omax = "no"
        omin = "no"
        wrap = "no"
        outdat = -1
        while(len(argv) > 3):
            if(argv[1] == "-x"):
                xatitle = argv[2]
                argv = argv[2:]
            elif(argv[1] == "-y"):
                yatitle = argv[2]
                argv = argv[2:]
            elif(argv[1] == "-t"):
                ptitle = argv[2]
                argv = argv[2:]
            elif(argv[1] == "-b"):
                start = float(argv[2])
                argv = argv[2:]
            elif(argv[1] == "-e"):
                end = float(argv[2])
                argv = argv[2:]
            elif(argv[1] == "-n"):
                col = int(argv[2])
                argv = argv[2:]
            elif(argv[1] == "-out"):
                if(outdat != -1):
                        print "Warning! out defined twice -- using last def."
                outdat = argv[2]
                argv = argv[2:]
            elif(argv[1] == "-L"):
                wrap = (float(argv[2]), float(argv[3]))
                argv = argv[3:]
            elif(argv[1] == "-scale"):
                scale = float(argv[2])
                argv = argv[2:]
            elif(argv[1] == "-shift"):
                shift = float(argv[2])
                argv = argv[2:]
            elif(argv[1] == "-osd"): # outlier definition by x devs
                if(osd != "no"):
                        print "Warning! osd defined twice -- using last def."
                osd = float(argv[2])
                argv = argv[2:]
            elif(argv[1] == "-omax"): # outlier definition by max
                if(omax != "no"):
                        print "Warning! omax defined twice -- using last def."
                omax = float(argv[2])
                argv = argv[2:]
            elif(argv[1] == "-omin"): # outlier definition by min
                if(omin != "no"):
                        print "Warning! omin defined twice -- using last def."
                omin = float(argv[2])
                argv = argv[2:]
            elif(argv[1] == "-norm"):
                if(argv[2].lower() in ["off", "0", "no"]):
                        norm = "no"
                else:
                        norm = argv[2]
                argv = argv[2:]
            # 0-parameter options (flags)
            elif(argv[1] == "-orec"): # re-calculate avg and dev.
                outrecalc = 1
                argv = argv[1:]
            elif(argv[1] in ["-h","-help","--help"]):
                print UsageInfo
                return 0
                argv = argv[1:]
            else:
                break

        if(len(argv) < 3):
                print UsageInfo
                return 1


        #list = readdata(argv[1])[:,col]
        D = readdata(argv[1])
        #print D
        if(start != "first" or end != "last"): # by assumption, 0 is the time
            time = D[:,0]
            if(start == "first"):
                start = time[0]
            if(end == "last"):
                end = time[-1]
            if end < start:
                raise RuntimeError, "Error! start and end time are bogus!"
            if(time[-1] < start or time[0] > end):
                print "No valid time points in data, using all!"
                i = 0
                j = -1
            else:
              for i in range(len(time)):        # search for start time
                if(time[i] >= start):
                        break
              for j in range(len(time)-1, i-1, -1):
                if(time[j] <= end): # search backward for end time
                        break
              j += 1
            time = time[i:j]
        else:
            i = 0
            j = -1
        if(j != -1):
                list = D[i:j,col]*scale+shift
        else:
                list = D[i:,col]*scale+shift
        del D
        if wrap != "no":
                print "Wrapping data to range [ %f, %f )."%(wrap[0], wrap[1])
                L = float(wrap[1]-wrap[0])
                list -= L*floor((list-wrap[0])/L)
        #print time

        print "Using %d samples"%(len(list))
        mu = avg(list)
        sigma = dev(list, mu)
        #list = mu + sigma*randn(10000) # ideal random normal data
        if(osd != "no"):
                list = strip_outliers(list, mu-osd*sigma, mu+osd*sigma)
        elif(omin != "no"):
            if(omax != "no"):
                list = strip_outliers(list, omin, omax)
            else:
                omax = max(list)*1.000001
                list = strip_outliers(list, omin, omax)
        elif(omax != "no"):
                omin = min(list)*0.99999
                list = strip_outliers(list, omin, omax)
        list = array(list)

        if outrecalc:
                print "Using %d samples for re-calculation."%(len(list))
                mu = avg(list)
                sigma = dev(list, mu)

        n, bins, patches = hist(list, int(argv[2]), normed=0)
        width = bins[1] - bins[0]

        print "n = " + str(n)
        print "start\t width"
        print "%8f %8f"%(bins[0], width)
        print "avg.\t var.\t sigma."
        print "%8f %8f %8f"%(mu, sigma**2, sigma)
        if(outdat != -1):
                write_list(outdat, list)

        if norm != "no": # add a 'best fit' line
                x = arange(bins[0], bins[-1]+width, 1.0e-3*(bins[-1]-bins[0]))
                y = normal(x, mu, sigma)*len(list)*width
                l = plot(x, y, norm, linewidth=2)

        xlabel(xatitle)
        ylabel(yatitle)
        title(ptitle)
        #axis([-20.,-5.,0,350])

        show()


def strip_outliers(list, minimum, maximum):
        out = []
        canh = 0
        canl = 0
        for x in list:
                if(x <= maximum):
                        if(x >= minimum):
                                out.append(x)
                        else:
                                canl += 1
                else:
                        canh += 1
        print "Removed %d outliers >%e and %d <%e from list.\n"%(\
                                canh, maximum, canl, minimum)
        return out

def readdata(name):
        digits = "01234567890.-+"

        file = open(name)
        list = []

        cols = -1
        i = 0
        for line in file.xreadlines():
                j = 0
                for tok in line.split():
                        if(tok[0] not in digits):
                                break
                        list.append(float(tok))
                        j += 1
                if(j > 0):
                    if(cols == -1):
                        cols = j
                    elif(j != cols):
                      raise ValueError, "Error, line %d contains wrong " \
                                "number of columns"%(i+1)
                    i += 1
        if(cols == -1):
                raise ValueError, "No data!"

        file.close()
        return reshape(array(list), (i,cols))

def avg(list):
        return sum(list)/len(list)

def dev(list, avg):
        return sqrt( sum( (list-avg)**2 )/len(list) )

def normal(x, mu, sigma):
        s = 2.0*(sigma**2)

        norms = x.copy()
        for i in range(len(norms)):
                norms[i] = exp(-(norms[i]-mu)**2 / s)

        return norms/(sigma*sqrt(2.0*pi))

def write_list(name, list):
        out = open(name, 'w')
        for x in list:
                out.write("%12e\n"%(x))
        out.close()

if __name__ == "__main__":
        main(sys.argv)
</nowiki>
Personal tools