Scripts:fasta reader.py

From UC_Chemistry

Jump to: navigation, search

Many of functions here are not actually used

import sys,os
def SaveLines(file, lines, option = "w"):
       f = open(file, option)
       for l in lines:
               if l[-1] <> '\n':
                       l = l + '\n'
               f.write(l)
       f.close()
def FastaNames(file):
       names = []
       f = open(file)
       for i in f.readlines():
               if i[0] == '>':
                       s = i[1:-1]
                       s = s.split()
                       names.append(s[0].lower())
       f.close()
       #failsave if file with names doesn't have '>' as the first character
       if len(names) ==0:
               f=open(file)
               for i in f.readlines():
                       names.append(i[:-1].lower())
               f.close()
       return names
def CheckStartEnd(start, end):
       if len(start) <> len(end):
               print 'Mismatch start and end section'
               print start
               print end
               sys.exit()
       if len(start) == 0:
               print 'could not create start indeces'
               print start
               print end
               sys.exit()
def FastaReader(filename, names, seqs):
       """
       input: aln-fasta filename
       output: list of names, and list of seqs
       but better use FastaReaderSeqs
       example:
       fastafile = 'proteins.fasta'
       names = []
       seqs = []
       FastaReader(fastafile, names, seqs)
       for (name,seq) in zip(names,seqs):
               print name
               print seq
       """
       f = open(filename)
       #print 'reading file',filename
       lines = f.readlines()
       f.close()
       content = 
       start = []
       end = []
       i = 0
       for l in lines:
               if l[0] == '>':
                       names.append(l[:-1])
                       start.append(i+1)
                       if len(names) > 1: end.append(i)
               i += 1 #line counter
       end.append(i) #the very last block didn't have the end
       CheckStartEnd(start, end)
       i = 0
       for name in names:
               protname = name[1:-1]
               seq = lines[start[i]:end[i]]
               seqline = 
               for iseq in seq:
                       iseq = iseq.replace('\n', )
                       seqline += iseq   #get rid of the '\n' character
               seqs.append(seqline)
               i += 1  #name counter
def FastaReaderSeqs(filename, names):
       seqs = []
       f = open(filename)
       #print 'reading file',filename
       lines = f.readlines()
       f.close()
       content = 
       start = []
       end = []
       i = 0
       for l in lines:
               if l[0] == '>':
                       start.append(i+1)
                       if len(start) > 1: end.append(i)
               i += 1 # line counter
       end.append(i) #last block end
       CheckStartEnd(start, end)
       #print start,end
       print names
       i = 0
       for name in names:
               protname = name[1:-1]
               seq = lines[start[i]:end[i]]
               seqline = 
               for iseq in seq:
                       #get rid of the '\n' character
                       iseq = iseq.replace('\n',)
                       seqline += iseq
               seqs.append(seqline)
               i += 1 #name counter
       if len(seqs) <> len(names):
               print 'While reading file', filename
               raise NameError, 'Cannot read all sequence for given names'
       #print seqs
       return seqs
def FastaReaderSeqsSymbol(file, names):
       result = FastaReaderSeqs(file, names)
       #get rid of EOL symbol
       for i in range(len(result)):
               result[i] = result[i].replace('\n',)
       return result
def FastaReaderSeqs100(file, names):
       result = []
       temp = FastaReaderSeqs(file, names)
       for p in temp:
               num = []
               seq = p.replace('\n', ' ')
               for number in seq.split():      num.append(int(number))
               result.append(num)
       return result
def PDBReader(file, frame = 1):
       f = open(file)
       lines = f.readlines()
       f.close()
       result = []
       for l in lines:
               if (frame == 1) and (l[:6] == 'ENDMDL'):
                       return result
               if l[:4] == 'ATOM':
                       result.append(l)
       if len(result) == 0:
               raise NameError, 'PDB file doesn\'t have ATOM records'
       return result

def PDBLine(line, extract):
       message = 
       if ('res' in extract):
               if ('num' in extract) and ('string' in extract):
                       return line[23:27]
               if ('num' in extract):
                       return int(line[23:27])
               if ('name' in extract):
                       return line[17:20]
       if ('at' in extract):
               if ('num' in extract):
                       return int(line[4:11])
               if ('name' in extract):
                       atname = line[12:16]
                       if ('no spac' in extract):
                               atname = atname.replace(' ',)
                       return atname
       if ('xyz' in extract) or ('coor' in extract) or ('pos' in extract):
               if '*' in line[30:54]:
                       message = ' Bad coordinates were set to zero'
                       print message
                       return [0.0,0.0,0.0]
               xyz = []
               xyz.append(float(line[30:38]))
               xyz.append(float(line[38:46]))
               xyz.append(float(line[46:54]))
               return xyz
       print 'ERROR: when trying to extract '
       print '<'+extract+'>'
       print 'From line '
       print '<'+line[:-1]+'>'
       raise NameError, 'Can not parse PDB line'
def PDBGetResnum(pdbline):
       #insertion code is converted in float
       #no code converts to 0, Z converts to 0.96...
       alphabet = ' ABCDEFGHIJKLMNOPQRSTUVWXYZ'
       sres = PDBLine(pdbline,"res num string")
       #separate residue number from insertion code
       sresnum = sres[0:-1]
       sreslet = sres[-1]
       return float(sresnum) + alphabet.find(sreslet)/float(len(alphabet))
def PDBatomXYZ(reslines,atname):
       for line in reslines:
               if PDBLine(line, "at name") == atname:
                       return PDBLine(line, "xyz")
       print 'ERROR: PDBatomXYZ -> atom <%s> not found in'%(atname)
       print reslines
       raise NameError, 'Can not find atom in residue'
def InsertResnumInPDBLine(line, resnum):
       if resnum == int(line[23:27]):
               return line
       sres = '%4d' % resnum
       res = line[:22]+sres+line[26:]
       return res
def InsertXYZInPDBLine(line , x, y, z):
       res = line[:30]
       res += '%8.3f%8.3f%8.3f' % (x, y, z)
       res += line[54:]
       return res
def XMLReader(lines, xmlname):
       """
       reading text in xmlbrackets from file
       """
       result = []
       begin = '<'+xmlname+'>'
       end = '</' + xmlname+'>'
       Start = False
       for l in lines:
               if len(l)==0 or l[0]=='#' or l=="\n": continue
               if l[-1]=="\n": l=l[:-1]
               if begin == l[0:len(begin)]: Start=True
               elif Start:
                       if end==l[0:len(end)]:
                               #print end,l[0:len(end)]
                               return result
                       result.append(l+"\n")
       if len(result) == 0:
               print 'XML bracket name and file', xmlname, file
               #raise NameError, 'File doesn\'t have xmlname'
       return result

def XMLWriter(file, xmlname, lines, option = "w"):
       f = open(file, option)
       f.write(xmlname + '\n')
       for line in lines:
               f.write(line + '\n')
       f.write(xmlname[0] + '/' + xmlname[1:] + '\n')
       f.close()
def ExtractXML(lines, START, END):
       """
       use XMLReader instead of this!
       given lines of text, return only lines inside xml brackets
       """
       result = []
       inside = False
       for line in lines:
               if END in line:
                       return result
               if START in line:
                       inside = True
               elif inside:
                       result.append(line)
       raise NameError, 'ExtractXML'
def ResReader(file):
       f = open(file)
       lines = f.readlines()
       f.close()
       result = []
       for line in lines:
                res = []
                temp = line.split()
                if line[0] == '#': continue
                if len(temp) <> 8:
                       print 'ERROR: format of res file was changed or res file is corrupted'
                       print 'Update fasta_reader.py or fix res-file'
                       raise NameError, 'ResReader'
                res.append(int(temp[0]))
                res.append(temp[1])
                res.append(int(temp[2]))
                res.append(int(temp[3]))
                res.append(int(temp[4]))
                res.append(int(temp[5]))
                res.append(temp[6])
                res.append(int(temp[7]))
                result.append(res)
       return result
def ResReaderDict(file):
       """
       create a list of dictionary for each residue
       """
       temp = ResReader(file)
       result = []
       for res in temp:
               dict = {}
               dict['resnum'] = res[0]
               #add or remove here additional energy dictionary terms
               dict['reslett'] = res[1]
               dict['ss_h'] = res[2]
               dict['ss_c'] = res[3]
               dict['ss_e'] = res[4]
               dict['sa'] = res[5]
               dict['rla'] = res[6]
               dict['tm'] = res[7]
               result.append(dict)
       return result
def GetFileNamesFromList(dir, extension, names):
       files = os.listdir(dir)
       temp = []
       for file in files:
               fullname = dir + '/' + file
               lenext = len(extension)
               ext = file[-lenext:]
               if extension == ext:    temp.append(fullname)
       res = []
       for name in names:
               for file in temp:
                       if name in file:
                               res.append(file)
       if len(names) <> len(res):
               print 'Not all files found'
               print 'Names',names
               print 'Files',files
               raise NameError, 'GetFileNamesFromList'
       return res
def Lines2Lists(lines):
       """
       convert ['ab','cde'] into [['a','b'],['c','d','e']]
       """
       lists = []
       for line in lines:
               list = []
               for s in line:
                       list.append(s)
               lists.append(list)
       return lists
def FindContinuous(list, minlength):
       """
       returns a list of two indeces with continuous number with min list length
       specified
       """
       result = []
       temp = []
       for i in range(len(list) - 1):
               #print list[i],
               temp.append(list[i])
               if list[i+1] - list[i] > 1:
                       if len(temp) >= minlength:
                               result.append(temp)
                       temp = []
       return result
def FindAlnCore(Seqs, mincorelength = 9):
       """
       input:
               Seqs an output from Lines2Lists
               mincorelength = 9
       output:
               list of no-gap column positions in alnfasta file
       """
       #result = []
       Aln = [] # zero based position in aln-fasta
       for i in range(len(Seqs[0])):
               aln = 1
               for iseq in range(len(Seqs)):
                       if Seqs[iseq][i] == '-':
                               aln = 0
                               break
               if aln == 1:
                       Aln.append(i)
       #print Aln
       #mincorelength = 9
       Core = FindContinuous(Aln, mincorelength)
       print ' Found ', len(Core), 'cores with lengths:'
       corenum = 1
       for core in Core:
               print '  core_'+str(corenum),len(core)
               corenum += 1
       return Core
def Alnfasta2Fasta(Seqs):
       """
       input: list of sequencies with gaps
       output: list of the same shape only with numbers of residues in the
                  places where there is no gap
                 aa numbering is 1-based
       """
       result = []
       for prot in Seqs:
               temp = []
               i = 0
               for aa in prot:
                       if aa <> '-':
                               i += 1
                       temp.append(i)
               result.append(temp)
       return result
 
def GetFastaCore(names, alnfastafile, mincorelength = 9):
       """
       the format of output is:
       for current_core_in_all_prots in Core:
               for current_core in current_core_in_all_prots:
                       first_res = current_core[0]
                       last_res = current_core[-1]
                       sequence = current_core[1]
       """
       print ' Reading alignment file in fasta format',alnfastafile
       seqs = FastaReaderSeqs(alnfastafile, names)
       print ' All lengths must be the same', map(len, seqs)
       Seqs = Lines2Lists(seqs)
       Alncore = FindAlnCore(Seqs, mincorelength)
       #convert aln positions into the real residue numbering
       SeqsNumbered = Alnfasta2Fasta(Seqs)
       print 'len(SeqsNumbered[0])=', len(SeqsNumbered[0])
       print 'len(Seqs[0])=', len(Seqs[0])
       #find list of aa in core
       i = 0
       result = []
       for alncore in Alncore:
               i += 1
               core = []
               iprot = 0
               for iprot in range(0, len(names)):
                       name = names[iprot]
                       protaanumber = SeqsNumbered[iprot]
                       sequence = Seqs[iprot]
                       #print '>' + name + ' core ' + str(i)
                       seq = []
                       #format: first_aa_number, all_sequence, last_aa_number
                       seq.append(protaanumber[alncore[0]])
                       line = ""
                       for alnpos in alncore:
                               aa = sequence[alnpos]
                               line += aa
                               #print protaanumber[aa],
                       #print 
                       seq.append(line)
                       seq.append(protaanumber[alncore[-1]])
                       #print seq
                       #sys.exit(0)
                       core.append(seq)
               result.append(core)
       return result
def Atm2PDB(atmfile, pdbfile):
       atmlines = XMLReader(atmfile, '<ATOM>')
       pdblines = PDBReader(pdbfile)
       if len(atmlines) <> len(pdblines):
               print '.atm and .pdb files mismatch', len(atmlines), len(pdblines)
               raise NameError, 'ConvertAtm2PDB files mismatch'
       result = []
       for (atmline, pdbline) in zip(atmlines, pdblines):
               x = float(atmline.split()[0])
               y = float(atmline.split()[1])
               z = float(atmline.split()[2])
               line = InsertXYZInPDBLine(pdbline , x, y, z)
               result.append(line)
       return result
def PDBContent2Atm(pdbcontent):
       result = []
       for line in pdbcontent:
               coor = PDBLine(line, 'coordinates')
               resnum = PDBLine(line, 'residue number')
               line = "%12.3f %12.3f %12.3f %d" % (coor[0], coor[1], coor[2], resnum)
               result.append(line)
       return result
def PDB2Atm(pdbfile):
       content = PDBReader(pdbfile)
       result = PDBContent2Atm(content)
       return result
def VardefReader(fname, varname):
       lines = XMLReader(fname, varname)
       result = []
       for i in range(len(lines)):
               temp = []
               res = []
               if i % 5 == 0:
                       temp = lines[i].split()[:4]
                       for j in temp: res.append(int(j))
                       result.append(res)
       return result

def CreateSlidingWindowPairs(Elements):
       """
       i think i already have similar function somewhere
       create pairs [i, i+1], [i+2, i+3], ..., [i, i+k], [i+1, i+k+1],...
       """
       result = []
       for k in range(1, len(Elements)):
               for i in range(0, len(Elements) - k):
                       pair = [int(Elements[i]), int(Elements[i + k])]
                       result.append(pair)
       return result
def CreatePairs(Elements):
       """
       create pairs [i, j], like in upper trigonal matrix
       """
       result = []
       for i in range(0, len(Elements)):
               for j in range(i+1, len(Elements)):
                       pair = [int(Elements[i]), int(Elements[j])]
                       result.append(pair)
       return result
def CreatePairsInWindow(Elements, min, max):
       """
       create pairs [i, j], such as min <= abs(i-j) <= max
       """
       result = []
       for i in range(0, len(Elements)):
               for j in range(i+1, len(Elements)):
                       pair = [int(Elements[i]), int(Elements[j])]
                       diff = abs(pair[0] - pair[1])
                       if diff < min: continue
                       if (diff > max) and (max <> -1): continue
                       result.append(pair)
       return result
Personal tools