Scripts:calc du.py

From UC_Chemistry

Jump to: navigation, search

Notes:

  • Make sure kT is correct for your system.
  • Reading the rmin data multiplies by 10 to convert Gromacs nm to Angstrom units.
  • h is in Angstroms and is arbitrary, but should be consistent with other plots.
  • gnuplot line for output data:
u 1:3:($4/sqrt($2-1))

Output Datafile Columns:

  1. lambda (Angstroms)
  2. Effective number of samples (N)
  3. <DU|lambda> (input energy units -- CHECK WITH kT!)
  4. <variance DU | lambda> (input energy units ^2)
  5. simulation number (starting at zero) from which best bound was taken
  6. best upper / lower bound, depending on direction (input energy units)
  7. Standard deviation (estimation error) of best bound.

Source

#!/usr/bin/python

import sys
from ucgrad import *

kT = Boltzmann*298.15/4.184
step = 0.01

def main(argv):
        if len(argv) < 4:
                print "Usage: %s <+/-> <M en> <rmin> <enlist> [-d diff. file (subtracted from last enlist)]"%(argv[0])
                return 1

        if argv[1] == '-': # upper bound
                a = 1.0
                arg = argmin
        elif argv[1] == '+': # lower bound
                a = -1.0
                arg = argmax
        else:
                raise runtimeError, "No sign selected."
        del argv[1]

        NS = 0
        m = []
        rmin = []
        du = []
        mcol = -1
        rcol = -1
        ucol = -1
        while len(argv) > 1:
                if argv[1] == '-d':
                        if len(du) < 1:
                          raise runtimeError, "No complete input sets found!"
                        du[-1] -= read_matrix(argv[2])[:,ucol]
                        del argv[1:3]
                        continue
                elif argv[1] == '-m':
                        mcol = int(argv[2])
                        del argv[1:3]
                        continue
                elif argv[1] == '-r':
                        rcol = int(argv[2])
                        del argv[1:3]
                        continue
                elif argv[1] == '-u':
                        ucol = int(argv[2])
                        del argv[1:3]
                        continue
                # Append m, rmin (converted to Ang.) , u
                m.append(read_matrix(argv[1])[:,mcol])
                rmin.append(read_matrix(argv[2])[:,rcol]*10.0)
                #rmin.append(read_matrix(argv[2])[:,rcol])
                du.append(read_matrix(argv[3])[:,ucol])
                NS += 1
                del argv[1:4]
        print >>sys.stderr, "%d input sets read."%NS

        # Sort samples according to rmin.
        wt = []
        for i in range(NS):
            lt = zip(rmin[i].copy(),m[i].copy(),du[i].copy()) # list of tuples
            lt.sort()
            for j,l in enumerate(lt): # re-order original data.
                rmin[i][j] = l[0]
                m[i][j] = l[1]
                du[i][j] = l[2]
            wt.append(exp(m[i]/kT))
            #M = zeros((len(wt[-1]), 4))
            #M[:,0] = rmin[i]
            #M[:,1] = m[i]
            #M[:,2] = du[i]
            #M[:,3] = wt[i]
            #write_matrix("dat.%d"%i, M)

        # Create list of minimum observed rmin-s to act as an "on"
        # switch and avoid bias during averaging.
        lon = array([rmin[i][0] for i in range(NS)])

        # Loop through lambda values and calculate stuff.
        st = zeros(NS, int)
        i = 0
        start = floor(min(lon)/step)*step # Find a round starting point.
        while 1:
                i += 1
                lm = start+i*step
                on = lon <= lm
                # Update start indices.
                for j in range(NS):
                        while st[j] < len(rmin[j]) and rmin[j][st[j]] <= lm:
                                st[j] += 1
                        # Turn off those without further data.
                        if st[j] == len(rmin[j]):
                                on[j] = False

                if not any(on): # Do any simulations merit inclusion?
                        if any(lon > lm):
                                print "Unable to use highlighted datasets: " \
                                        + str(lon > lm)
                        break

                #print >>sys.stderr, "%4.2f"%lm
                S,mu,v = calc_mv([du[j] for j in range(NS) if on[j]],\
                                [st[j] for j in range(NS) if on[j]],\
                                [wt[j] for j in range(NS) if on[j]])
                if S <= 1.0: # Break in the data or none at all, either way
                        if any(lon > lm):
                                print "Unable to use highlighted datasets: " \
                                        + str(lon > lm)
                        break # we can't continue the calc.

                line = "%4.2f %.19f %.19f %.19f "%(lm, S, mu, v)
                nb, b, be = calc_best_bound(m,du,on,st,a,arg)
                if nb < 0: # None of the sim.s has more than one sample.
                        continue
                line += "%.19f %.19f %5d"%(b, be, nb)
                print line

        return 0

# The bound is of the form <U>+a*<VM>/kT for each sim.
# and we want the max/min of that from any sim.
def calc_best_bound(m,du,on,st,a,arg):
        oni = on.copy()
        b = zeros(len(m))
        be = zeros(len(m))
        nb = -1
        for i in range(len(m)):
            if oni[i]:
                N,mu,v = calc_mv([du[i]],[st[i]])
                if N <= 1.0:
                        oni[i] = False
                        continue
                b[i] = mu
                be[i] = v/(N-1.0)
                N,mu,v = calc_mv([m[i]],[st[i]])
                b[i] += a*v/kT
                be[i] += (2.0*a*v/kT)**2/(N-1.0)
                nb = i
        be = sqrt(be) # change var. to stdev

        # Find best in "on" list.
        best = b[nb]+a*be[nb]
        for i in range(len(m)):
            if oni[i] and arg([b[i]+a*be[i], best]) == 0:
                nb = i
                best = b[i]+a*be[i]
        return nb, b[nb], be[nb]


# Effective number of samples is the tricky one.
# We set the wt.s such that the max from each sim. is 1
# then sum all the wt.s to get an effective N.
def calc_mv(x, st, wt=None):
        if wt == None: # Settle the issue of weights.
                wt = [ones(len(xi)) for xi in x]
        # Splice all data into a super-data element.
        y = []
        w = []
        #i = 0
        for xi, si, wi in zip(x,st,wt):
        #    i += 1
            if si < len(xi):
        #       print "Dataset %d, %d samples, maxwt = %f, Neff = %f"%(i,len(wi[si:]), max(wi[si:]), sum(wi[si:])/max(wi[si:]))
                y += xi[si:].tolist()
                w += (wi[si:]/max(wi[si:])).tolist()
        y = array(y)
        w = array(w)
        N = sum(w)
        m = sum(y*w)/N
        v = sum((y-m)*(y-m)*w)/N
        return N, m, v

def cov(x,y):
        return (sum(x*y) - sum(x)*sum(y)/len(x)) / len(x)

def pfee(x):
        m, v, me, ve = stats(exp(x/kT))
        return kT*log(m), kT*me/m

def nfee(x):
        m, v, me, ve = stats(exp(-x/kT))
        return -kT*log(m), kT*me/m

if __name__=="__main__":
        main(sys.argv)
Personal tools