Scripts:fasta reader.py
From UC_Chemistry
Revision as of 22:18, 14 December 2008; view current revision
←Older revision | Newer revision→
←Older revision | Newer revision→
Many of functions here are not actually used
import sys,os
def SaveLines(file, lines, option = "w"):
f = open(file, option)
for l in lines:
if l[-1] <> '\n':
l = l + '\n'
f.write(l)
f.close()
def FastaNames(file):
names = []
f = open(file)
for i in f.readlines():
if i[0] == '>':
s = i[1:-1]
s = s.split()
names.append(s[0].lower())
f.close()
#failsave if file with names doesn't have '>' as the first character
if len(names) ==0:
f=open(file)
for i in f.readlines():
names.append(i[:-1].lower())
f.close()
return names
def CheckStartEnd(start, end):
if len(start) <> len(end):
print 'Mismatch start and end section'
print start
print end
sys.exit()
if len(start) == 0:
print 'could not create start indeces'
print start
print end
sys.exit()
def FastaReader(filename, names, seqs):
"""
input: aln-fasta filename
output: list of names, and list of seqs
but better use FastaReaderSeqs
example:
fastafile = 'proteins.fasta'
names = []
seqs = []
FastaReader(fastafile, names, seqs)
for (name,seq) in zip(names,seqs):
print name
print seq
"""
f = open(filename)
#print 'reading file',filename
lines = f.readlines()
f.close()
content =
start = []
end = []
i = 0
for l in lines:
if l[0] == '>':
names.append(l[:-1])
start.append(i+1)
if len(names) > 1: end.append(i)
i += 1 #line counter
end.append(i) #the very last block didn't have the end
CheckStartEnd(start, end)
i = 0
for name in names:
protname = name[1:-1]
seq = lines[start[i]:end[i]]
seqline =
for iseq in seq:
iseq = iseq.replace('\n', )
seqline += iseq #get rid of the '\n' character
seqs.append(seqline)
i += 1 #name counter
def FastaReaderSeqs(filename, names):
seqs = []
f = open(filename)
#print 'reading file',filename
lines = f.readlines()
f.close()
content =
start = []
end = []
i = 0
for l in lines:
if l[0] == '>':
start.append(i+1)
if len(start) > 1: end.append(i)
i += 1 # line counter
end.append(i) #last block end
CheckStartEnd(start, end)
#print start,end
print names
i = 0
for name in names:
protname = name[1:-1]
seq = lines[start[i]:end[i]]
seqline =
for iseq in seq:
#get rid of the '\n' character
iseq = iseq.replace('\n',)
seqline += iseq
seqs.append(seqline)
i += 1 #name counter
if len(seqs) <> len(names):
print 'While reading file', filename
raise NameError, 'Cannot read all sequence for given names'
#print seqs
return seqs
def FastaReaderSeqsSymbol(file, names):
result = FastaReaderSeqs(file, names)
#get rid of EOL symbol
for i in range(len(result)):
result[i] = result[i].replace('\n',)
return result
def FastaReaderSeqs100(file, names):
result = []
temp = FastaReaderSeqs(file, names)
for p in temp:
num = []
seq = p.replace('\n', ' ')
for number in seq.split(): num.append(int(number))
result.append(num)
return result
def PDBReader(file, frame = 1):
f = open(file)
lines = f.readlines()
f.close()
result = []
for l in lines:
if (frame == 1) and (l[:6] == 'ENDMDL'):
return result
if l[:4] == 'ATOM':
result.append(l)
if len(result) == 0:
raise NameError, 'PDB file doesn\'t have ATOM records'
return result
def PDBLine(line, extract):
message =
if ('res' in extract):
if ('num' in extract) and ('string' in extract):
return line[23:27]
if ('num' in extract):
return int(line[23:27])
if ('name' in extract):
return line[17:20]
if ('at' in extract):
if ('num' in extract):
return int(line[4:11])
if ('name' in extract):
atname = line[12:16]
if ('no spac' in extract):
atname = atname.replace(' ',)
return atname
if ('xyz' in extract) or ('coor' in extract) or ('pos' in extract):
if '*' in line[30:54]:
message = ' Bad coordinates were set to zero'
print message
return [0.0,0.0,0.0]
xyz = []
xyz.append(float(line[30:38]))
xyz.append(float(line[38:46]))
xyz.append(float(line[46:54]))
return xyz
print 'ERROR: when trying to extract '
print '<'+extract+'>'
print 'From line '
print '<'+line[:-1]+'>'
raise NameError, 'Can not parse PDB line'
def PDBGetResnum(pdbline):
#insertion code is converted in float
#no code converts to 0, Z converts to 0.96...
alphabet = ' ABCDEFGHIJKLMNOPQRSTUVWXYZ'
sres = PDBLine(pdbline,"res num string")
#separate residue number from insertion code
sresnum = sres[0:-1]
sreslet = sres[-1]
return float(sresnum) + alphabet.find(sreslet)/float(len(alphabet))
def PDBatomXYZ(reslines,atname):
for line in reslines:
if PDBLine(line, "at name") == atname:
return PDBLine(line, "xyz")
print 'ERROR: PDBatomXYZ -> atom <%s> not found in'%(atname)
print reslines
raise NameError, 'Can not find atom in residue'
def InsertResnumInPDBLine(line, resnum):
if resnum == int(line[23:27]):
return line
sres = '%4d' % resnum
res = line[:22]+sres+line[26:]
return res
def InsertXYZInPDBLine(line , x, y, z):
res = line[:30]
res += '%8.3f%8.3f%8.3f' % (x, y, z)
res += line[54:]
return res
def XMLReader(lines, xmlname):
"""
reading text in xmlbrackets from file
"""
result = []
begin = '<'+xmlname+'>'
end = '</' + xmlname+'>'
Start = False
for l in lines:
if len(l)==0 or l[0]=='#' or l=="\n": continue
if l[-1]=="\n": l=l[:-1]
if begin == l[0:len(begin)]: Start=True
elif Start:
if end==l[0:len(end)]:
#print end,l[0:len(end)]
return result
result.append(l+"\n")
if len(result) == 0:
print 'XML bracket name and file', xmlname, file
#raise NameError, 'File doesn\'t have xmlname'
return result
def XMLWriter(file, xmlname, lines, option = "w"):
f = open(file, option)
f.write(xmlname + '\n')
for line in lines:
f.write(line + '\n')
f.write(xmlname[0] + '/' + xmlname[1:] + '\n')
f.close()
def ExtractXML(lines, START, END):
"""
use XMLReader instead of this!
given lines of text, return only lines inside xml brackets
"""
result = []
inside = False
for line in lines:
if END in line:
return result
if START in line:
inside = True
elif inside:
result.append(line)
raise NameError, 'ExtractXML'
def ResReader(file):
f = open(file)
lines = f.readlines()
f.close()
result = []
for line in lines:
res = []
temp = line.split()
if line[0] == '#': continue
if len(temp) <> 8:
print 'ERROR: format of res file was changed or res file is corrupted'
print 'Update fasta_reader.py or fix res-file'
raise NameError, 'ResReader'
res.append(int(temp[0]))
res.append(temp[1])
res.append(int(temp[2]))
res.append(int(temp[3]))
res.append(int(temp[4]))
res.append(int(temp[5]))
res.append(temp[6])
res.append(int(temp[7]))
result.append(res)
return result
def ResReaderDict(file):
"""
create a list of dictionary for each residue
"""
temp = ResReader(file)
result = []
for res in temp:
dict = {}
dict['resnum'] = res[0]
#add or remove here additional energy dictionary terms
dict['reslett'] = res[1]
dict['ss_h'] = res[2]
dict['ss_c'] = res[3]
dict['ss_e'] = res[4]
dict['sa'] = res[5]
dict['rla'] = res[6]
dict['tm'] = res[7]
result.append(dict)
return result
def GetFileNamesFromList(dir, extension, names):
files = os.listdir(dir)
temp = []
for file in files:
fullname = dir + '/' + file
lenext = len(extension)
ext = file[-lenext:]
if extension == ext: temp.append(fullname)
res = []
for name in names:
for file in temp:
if name in file:
res.append(file)
if len(names) <> len(res):
print 'Not all files found'
print 'Names',names
print 'Files',files
raise NameError, 'GetFileNamesFromList'
return res
def Lines2Lists(lines):
"""
convert ['ab','cde'] into [['a','b'],['c','d','e']]
"""
lists = []
for line in lines:
list = []
for s in line:
list.append(s)
lists.append(list)
return lists
def FindContinuous(list, minlength):
"""
returns a list of two indeces with continuous number with min list length
specified
"""
result = []
temp = []
for i in range(len(list) - 1):
#print list[i],
temp.append(list[i])
if list[i+1] - list[i] > 1:
if len(temp) >= minlength:
result.append(temp)
temp = []
return result
def FindAlnCore(Seqs, mincorelength = 9):
"""
input:
Seqs an output from Lines2Lists
mincorelength = 9
output:
list of no-gap column positions in alnfasta file
"""
#result = []
Aln = [] # zero based position in aln-fasta
for i in range(len(Seqs[0])):
aln = 1
for iseq in range(len(Seqs)):
if Seqs[iseq][i] == '-':
aln = 0
break
if aln == 1:
Aln.append(i)
#print Aln
#mincorelength = 9
Core = FindContinuous(Aln, mincorelength)
print ' Found ', len(Core), 'cores with lengths:'
corenum = 1
for core in Core:
print ' core_'+str(corenum),len(core)
corenum += 1
return Core
def Alnfasta2Fasta(Seqs):
"""
input: list of sequencies with gaps
output: list of the same shape only with numbers of residues in the
places where there is no gap
aa numbering is 1-based
"""
result = []
for prot in Seqs:
temp = []
i = 0
for aa in prot:
if aa <> '-':
i += 1
temp.append(i)
result.append(temp)
return result
def GetFastaCore(names, alnfastafile, mincorelength = 9):
"""
the format of output is:
for current_core_in_all_prots in Core:
for current_core in current_core_in_all_prots:
first_res = current_core[0]
last_res = current_core[-1]
sequence = current_core[1]
"""
print ' Reading alignment file in fasta format',alnfastafile
seqs = FastaReaderSeqs(alnfastafile, names)
print ' All lengths must be the same', map(len, seqs)
Seqs = Lines2Lists(seqs)
Alncore = FindAlnCore(Seqs, mincorelength)
#convert aln positions into the real residue numbering
SeqsNumbered = Alnfasta2Fasta(Seqs)
print 'len(SeqsNumbered[0])=', len(SeqsNumbered[0])
print 'len(Seqs[0])=', len(Seqs[0])
#find list of aa in core
i = 0
result = []
for alncore in Alncore:
i += 1
core = []
iprot = 0
for iprot in range(0, len(names)):
name = names[iprot]
protaanumber = SeqsNumbered[iprot]
sequence = Seqs[iprot]
#print '>' + name + ' core ' + str(i)
seq = []
#format: first_aa_number, all_sequence, last_aa_number
seq.append(protaanumber[alncore[0]])
line = ""
for alnpos in alncore:
aa = sequence[alnpos]
line += aa
#print protaanumber[aa],
#print
seq.append(line)
seq.append(protaanumber[alncore[-1]])
#print seq
#sys.exit(0)
core.append(seq)
result.append(core)
return result
def Atm2PDB(atmfile, pdbfile):
atmlines = XMLReader(atmfile, '<ATOM>')
pdblines = PDBReader(pdbfile)
if len(atmlines) <> len(pdblines):
print '.atm and .pdb files mismatch', len(atmlines), len(pdblines)
raise NameError, 'ConvertAtm2PDB files mismatch'
result = []
for (atmline, pdbline) in zip(atmlines, pdblines):
x = float(atmline.split()[0])
y = float(atmline.split()[1])
z = float(atmline.split()[2])
line = InsertXYZInPDBLine(pdbline , x, y, z)
result.append(line)
return result
def PDBContent2Atm(pdbcontent):
result = []
for line in pdbcontent:
coor = PDBLine(line, 'coordinates')
resnum = PDBLine(line, 'residue number')
line = "%12.3f %12.3f %12.3f %d" % (coor[0], coor[1], coor[2], resnum)
result.append(line)
return result
def PDB2Atm(pdbfile):
content = PDBReader(pdbfile)
result = PDBContent2Atm(content)
return result
def VardefReader(fname, varname):
lines = XMLReader(fname, varname)
result = []
for i in range(len(lines)):
temp = []
res = []
if i % 5 == 0:
temp = lines[i].split()[:4]
for j in temp: res.append(int(j))
result.append(res)
return result
def CreateSlidingWindowPairs(Elements):
"""
i think i already have similar function somewhere
create pairs [i, i+1], [i+2, i+3], ..., [i, i+k], [i+1, i+k+1],...
"""
result = []
for k in range(1, len(Elements)):
for i in range(0, len(Elements) - k):
pair = [int(Elements[i]), int(Elements[i + k])]
result.append(pair)
return result
def CreatePairs(Elements):
"""
create pairs [i, j], like in upper trigonal matrix
"""
result = []
for i in range(0, len(Elements)):
for j in range(i+1, len(Elements)):
pair = [int(Elements[i]), int(Elements[j])]
result.append(pair)
return result
def CreatePairsInWindow(Elements, min, max):
"""
create pairs [i, j], such as min <= abs(i-j) <= max
"""
result = []
for i in range(0, len(Elements)):
for j in range(i+1, len(Elements)):
pair = [int(Elements[i]), int(Elements[j])]
diff = abs(pair[0] - pair[1])
if diff < min: continue
if (diff > max) and (max <> -1): continue
result.append(pair)
return result
