my file is of the form
01 "\t" 10.19 "\t" 0.00 "\t" 10.65
02 "\t" 11.19 "\t" 10.12 "\t" 99.99
and I need to access the individual floating-point numbers in it.
For example, the first number is 10.19; I want to read that value and add one to it. -
filename=open("half.transfac","r")
-
file_content=filename.readlines()
-
sam=""
-
for line in file_content:
-
for char in line:
-
if char=="\tchar\t\n":
-
sam+=char
-
print sam
-
The "for char" loop visits every single character rather than the whole numbers ("10.19", "0.00", etc.). How do I get at the numbers instead? Please help.
Jul 5 '07
60 4319
Thanks for that. Now, instead of using the sequence I had, I am generating a random sequence and calculating the score for it.
What should happen is this:
Suppose my sequence contains 50 letters. Each iteration should consider 16 letters: the first iteration takes the first 16 letters, the next takes the 16 letters starting from the second letter, and so on, until only a final window of length 16 remains (windows shorter than 16 are omitted).
I have a program that calculates the score (the same value I previously calculated from my input file); this score is called "res" in my code.
For the same sequence I calculate another "score" by assigning a specific value to each letter, and then compute log(res/score). I get an error that doesn't make any sense to me. Please tell me what I should change.
here is my code: -
from math import *
-
import random
-
f=open("deeps1.txt","r")
-
line=f.next()
-
while not line.startswith('PO'):
-
line=f.next()
-
-
headerlist=line.strip().split()[1:]
-
linelist=[]
-
-
-
line=f.next().strip()
-
while not line.startswith('/'):
-
if line != '':
-
linelist.append(line.strip().split())
-
line=f.next().strip()
-
-
keys=[i[0] for i in linelist]
-
values=[[float(s) for s in item] for item in [j[1:] for j in linelist]]
-
array={}
-
linedict=dict(zip(keys,values))
-
keys = linedict.keys()
-
keys.sort()
-
for key in keys:
-
array=[key,linedict[key]]
-
-
datadict={}
-
datadict1={}
-
for i,item in enumerate(headerlist):
-
datadict[item]={}
-
for key_ in linedict:
-
datadict[item][key_]=linedict[key_][i]
-
-
for keymain in datadict:
-
for keysub in datadict[keymain]:
-
datadict[keymain][keysub]+=1.0
-
-
def random_seq(length=1000):
    """Return a random DNA sequence.

    Every character is drawn independently from the letters "ATGC".

    length -- number of characters to generate.  Defaults to 1000, the
              size that used to be hard-coded, so existing calls with no
              argument behave as before.
    """
    # Collect the letters and join once instead of growing the string one
    # character at a time.
    return "".join([random.choice("ATGC") for _ in range(length)])
-
-
-
p=random_seq()
-
-
#def my_rand():
-
#
-
#print p
-
# part=""
-
# q=len(p)
-
# seqq=""
-
-
# for i in range(0,q):
-
# part= p[i:i+16]
-
# if len(part)==16:
-
# seqq=part
-
# return seqq
-
-
-
-
#my_seq=my_rand()
-
#print len(my_seq)
-
-
-
-
-
res=1
-
part=""
-
q=len(p)
-
seqq=""
-
for i in range(0,q):
-
part=p[i:i+16]
-
if len(part)==16:
-
seqq=part
-
for i in range(0,16):
-
key=p[i]
-
print p[i]
-
res*=datadict[key]["%02d"%(i+1)]
-
print res,"&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&"
-
#score=1
-
#value={"A":"0.3","T":"0.3","C":"0.2","G":"0.2"}
-
# for it in value:
-
# for key in p:
-
# if it==key:
-
# score=score*float(value[it])
-
#log_ratio=(res/score)
-
#print log(log_ratio)
-
The problem is that instead of printing a value for res, the program prints something like
inf &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
I think there is an error in this line:
res*=datadict[key]["%02d"%(i+1)]
Please help.
waiting for ur reply,
cheers!
hey,
sorry,i found the mistake and got the output!:)
cheers!!
There is one more thing I have to do in my code, and I don't know how to do it.
After adding one to every element of the input file (which I have already done), I have to normalize the rows: every element should be divided by the sum of the elements in its row.
my input file is :
NA bap
PO A C G T
01 0.00 3.67 0.00 0.00
02 0.00 0.00 3.67 0.00
03 0.00 0.00 0.00 3.67
04 0.00 3.67 0.00 0.00
05 3.67 0.00 0.00 0.00
06 3.46 0.00 0.22 0.00
07 0.00 0.00 3.67 0.00
08 0.00 0.00 0.00 3.67
09 0.00 0.00 0.00 3.67
10 0.00 3.67 0.00 0.00
11 3.67 0.00 0.00 0.00
12 3.67 0.00 0.00 0.00
13 0.00 0.00 3.67 0.00
14 0.00 0.00 0.00 3.67
15 0.00 0.00 3.67 0.00
16 0.00 3.67 0.00 0.00
//
//
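For example, A[01] = 1.0 / (1.0 + 4.67 + 1.0 + 1.0); the numerator is 1.0 because one has already been added to the element, and the denominator is the sum of row 01 after adding one to each of its four elements.
The same has to be done for every element in every row.
The basic formula is:
normalized element = element / (sum of the elements of that row)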
My code with one already added is -
from math import *
-
import random
-
f=open("deeps1.txt","r")
-
line=f.next()
-
while not line.startswith('PO'):
-
line=f.next()
-
-
headerlist=line.strip().split()[1:]
-
linelist=[]
-
-
-
line=f.next().strip()
-
while not line.startswith('/'):
-
if line != '':
-
linelist.append(line.strip().split())
-
line=f.next().strip()
-
-
keys=[i[0] for i in linelist]
-
values=[[float(s) for s in item] for item in [j[1:] for j in linelist]]
-
array={}
-
linedict=dict(zip(keys,values))
-
keys = linedict.keys()
-
keys.sort()
-
for key in keys:
-
array=[key,linedict[key]]
-
-
datadict={}
-
datadict1={}
-
for i,item in enumerate(headerlist):
-
datadict[item]={}
-
for key_ in linedict:
-
datadict[item][key_]=linedict[key_][i]
-
-
for keymain in datadict:
-
for keysub in datadict[keymain]:
-
datadict[keymain][keysub]+=1.0
-
# here one has been added to all elements now how do i normalize it?
-
waiting for ur reply,
cheers!
bvdet 2,851
Expert Mod 2GB
There is one thing i have got to do to my code and i donno how to do that.
After adding one to every element in the input file(until this i have already done).i have to normalize the rows as in every element should be divided by sum of the elements of that row
my input file is :
NA bap
PO A C G T
01 0.00 3.67 0.00 0.00
02 0.00 0.00 3.67 0.00
03 0.00 0.00 0.00 3.67
04 0.00 3.67 0.00 0.00
05 3.67 0.00 0.00 0.00
06 3.46 0.00 0.22 0.00
07 0.00 0.00 3.67 0.00
08 0.00 0.00 0.00 3.67
09 0.00 0.00 0.00 3.67
10 0.00 3.67 0.00 0.00
11 3.67 0.00 0.00 0.00
12 3.67 0.00 0.00 0.00
13 0.00 0.00 3.67 0.00
14 0.00 0.00 0.00 3.67
15 0.00 0.00 3.67 0.00
16 0.00 3.67 0.00 0.00
//
//
A[01]=1.0(this is because a have already added one to the element)/[1.0+4.67+1.0+1.0]
similarly it has to be done for every element in every row.
the basic formula
formula=element/(sum of the elements of that row)
My code with one already added is -
from math import *
-
import random
-
f=open("deeps1.txt","r")
-
line=f.next()
-
while not line.startswith('PO'):
-
line=f.next()
-
-
headerlist=line.strip().split()[1:]
-
linelist=[]
-
-
-
line=f.next().strip()
-
while not line.startswith('/'):
-
if line != '':
-
linelist.append(line.strip().split())
-
line=f.next().strip()
-
-
keys=[i[0] for i in linelist]
-
values=[[float(s) for s in item] for item in [j[1:] for j in linelist]]
-
array={}
-
linedict=dict(zip(keys,values))
-
keys = linedict.keys()
-
keys.sort()
-
for key in keys:
-
array=[key,linedict[key]]
-
-
datadict={}
-
datadict1={}
-
for i,item in enumerate(headerlist):
-
datadict[item]={}
-
for key_ in linedict:
-
datadict[item][key_]=linedict[key_][i]
-
-
for keymain in datadict:
-
for keysub in datadict[keymain]:
-
datadict[keymain][keysub]+=1.0
-
# here one has been added to all elements now how do i normalize it?
-
waiting for ur reply,
cheers!
Create a new list of the sums of the items on each row of the original data: - valueSums = [sum(item)+4 for item in values]
The +4 accounts for the 1.0 that has already been added to each of the four columns of a row. Since there are 16 lines in the first data set, the list should have 16 elements. Keep in mind that lists are ordered and dictionaries are not. Iterate over each subdictionary of datadict, create a sorted list of the subdictionary keys, iterate (using enumerate) over that sorted list, and update each element using the indexing operator.
Create a new list of the sums of the items on each row of the original data: - valueSums = [sum(item)+4 for item in values]
Since there are 16 lines in the first data set, there should be 16 elements. Keep in mind lists are ordered and dictionaries are not. Iterate on each subdictionary of dataDict, create a sorted list of subdictionary keys, iterate (use enumerate) on the sorted list of keys, and update each element using the indexing operator.
Or much easier: -
datadict1 = datadict.copy()
-
for keymain in datadict:
-
for keysub in datadict[keymain]:
-
datadict1[keymain][keysub] = datadict[keymain][keysub] / (sum(values[int(keysub) - 1]) + 4)
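I take it the second dictionary is meant to hold the normalized values, but the copy does not change anything: dict.copy() is shallow, so datadict1 shares its inner dictionaries with datadict and both end up normalized. You use the old keys and subkeys from datadict directly and fill in the copy. If you store the results in the same dictionary, you don't need the copy at all.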
-
from math import *
-
import random
-
f=open("deeps1.txt","r")
-
line=f.next()
-
while not line.startswith('PO'):
-
line=f.next()
-
-
headerlist=line.strip().split()[1:]
-
linelist=[]
-
-
-
line=f.next().strip()
-
while not line.startswith('/'):
-
if line != '':
-
linelist.append(line.strip().split())
-
line=f.next().strip()
-
-
keys=[i[0] for i in linelist]
-
values=[[float(s) for s in item] for item in [j[1:] for j in linelist]]
-
-
array={}
-
linedict=dict(zip(keys,values))
-
keys = linedict.keys()
-
keys.sort()
-
for key in keys:
-
array=[key,linedict[key]]
-
-
datadict={}
-
datadict1={}
-
for i,item in enumerate(headerlist):
-
datadict[item]={}
-
for key_ in linedict:
-
datadict[item][key_]=linedict[key_][i]
-
-
-
for keymain in datadict:
-
for keysub in datadict[keymain]:
-
datadict[keymain][keysub]+=1.0
-
datadict1=datadict.copy()
-
for keysub in datadict:
-
for keysub in datadict[keymain]:
-
datadict1[keymain][keysub]=datadict[keymain][keysub]/(sum(values[int(keysub)-1])+4)
-
-
def random_seq(nchars, insertat, astring):
    """Return a random DNA sequence with a fixed string spliced into it.

    nchars   -- number of random letters (drawn from "ATGC") to generate.
    insertat -- index of the random letter in front of which astring is
                inserted.  If it is not in range(nchars), astring is not
                inserted at all.
    astring  -- the string to splice in.

    The finished sequence is printed as well as returned.
    """
    pieces = []
    for i in range(nchars):
        if i == insertat:
            pieces.append(astring)
        pieces.append(random.choice("ATGC"))
    # Join once rather than growing the string letter by letter.
    seq = "".join(pieces)
    # Parenthesised form works both as a Python 2 statement and as a
    # Python 3 function call.
    print(seq)
    return seq
-
thestring="CGTCAAGTTCAAGTGCAAAA"
-
count=50-len(thestring)
-
p=random_seq(count,15,thestring)
-
file=open("temp.txt",'w')
-
#consensus="CGTCAAGTTCAAGTGCAAAA"
-
#file.write(consensus)
-
file.write(str(p))
-
file.close()
-
-
def file_chk():
    """Return the entire contents of temp.txt as a single string."""
    handle = open("temp.txt", "r")
    # try/finally guarantees the file is closed even if reading fails, and
    # needs no newer syntax than the rest of this Python 2 script uses.
    try:
        return handle.read()
    finally:
        handle.close()
-
-
-
-
-
-
#p=file_chk()
-
-
-
#def my_rand():
-
#
-
#print p
-
# part=""
-
# q=len(p)
-
# seqq=""
-
-
# for i in range(0,q):
-
# part= p[i:i+16]
-
# if len(part)==16:
-
# seqq=part
-
# return seqq
-
-
-
-
#my_seq=my_rand()
-
#print len(my_seq)
-
-
-
-
-
res=1
-
part=""
-
q=len(p)
-
seqq=""
-
for i in range(0,q):
-
part=p[i:i+16]
-
if len(part)==16:
-
seqq=part
-
res=1
-
for j in range(0,16):
-
key=seqq[j]
-
res=res*datadict[key]["%02d"%(j+1)]
-
print res
-
score=1
-
value={"A":"0.3","T":"0.3","C":"0.2","G":"0.2"}
-
for it in value:
-
for key in seqq:
-
if it==key:
-
score=score*float(value[it])
-
#print score,"*******************",res
-
log_ratio=log10(res/score)
-
#print i,log_ratio
-
This is my full code, in which I calculate the scores, divide by a background value, and finally take a logarithm. After the normalisation, some values are becoming zero:
when I print the normalised values, some of them are zero.
Sorry, I am not able to paste my output file,
and I don't know why this is happening.
bvdet 2,851
Expert Mod 2GB
Or much easier: -
datadict1 = datadict.copy()
-
for keymain in datadict:
-
for keysub in datadict[keymain]:
-
datadict1[keymain][keysub] = datadict[keymain][keysub] / (sum(values[int(keysub) - 1]) + 4)
I take it the second dictionary is for the normalized values, but this does not change anything. You use the old keys and subkeys from dictdata directly and fill out the copy. If you fill it in the same dictionary you don't need the copy.
Yep, you can index on int(keySub)-1: - valueSums = [sum(item)+4 for item in values]
-
-
for keyMain in dataDict:
-
for keySub in dataDict[keyMain]:
-
dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]
-
from math import *
-
import random
-
f=open("deeps1.txt","r")
-
line=f.next()
-
while not line.startswith('PO'):
-
line=f.next()
-
-
headerlist=line.strip().split()[1:]
-
linelist=[]
-
-
-
line=f.next().strip()
-
while not line.startswith('/'):
-
if line != '':
-
linelist.append(line.strip().split())
-
line=f.next().strip()
-
-
keys=[i[0] for i in linelist]
-
values=[[float(s) for s in item] for item in [j[1:] for j in linelist]]
-
valueSums = [sum(item)+4 for item in values]
-
-
array={}
-
linedict=dict(zip(keys,values))
-
keys = linedict.keys()
-
keys.sort()
-
for key in keys:
-
array=[key,linedict[key]]
-
-
datadict={}
-
datadict1={}
-
for i,item in enumerate(headerlist):
-
datadict[item]={}
-
for key_ in linedict:
-
datadict[item][key_]=linedict[key_][i]
-
-
-
for keymain in datadict:
-
for keysub in datadict[keymain]:
-
datadict[keymain][keysub]+=1.0
-
for keyMain in datadict:
-
for keySub in datadict[keyMain]:
-
datadict[keyMain][keySub] /= valueSums[int(keySub)-1]
-
-
-
-
def random_seq(nchars, insertat, astring):
    """Return a random DNA sequence with a fixed string spliced into it.

    nchars   -- number of random letters (drawn from "ATGC") to generate.
    insertat -- index of the random letter in front of which astring is
                inserted.  If it is not in range(nchars), astring is not
                inserted at all.
    astring  -- the string to splice in.

    The finished sequence is printed as well as returned.
    """
    pieces = []
    for i in range(nchars):
        if i == insertat:
            pieces.append(astring)
        pieces.append(random.choice("ATGC"))
    # Join once rather than growing the string letter by letter.
    seq = "".join(pieces)
    # Parenthesised form works both as a Python 2 statement and as a
    # Python 3 function call.
    print(seq)
    return seq
-
thestring="CGTCAAGTTCAAGTGCAAAA"
-
count=50-len(thestring)
-
p=random_seq(count,15,thestring)
-
file=open("temp.txt",'w')
-
#consensus="CGTCAAGTTCAAGTGCAAAA"
-
#file.write(consensus)
-
file.write(str(p))
-
file.close()
-
-
def file_chk():
    """Return the entire contents of temp.txt as a single string."""
    handle = open("temp.txt", "r")
    # try/finally guarantees the file is closed even if reading fails, and
    # needs no newer syntax than the rest of this Python 2 script uses.
    try:
        return handle.read()
    finally:
        handle.close()
-
-
-
-
-
-
#p=file_chk()
-
-
-
#def my_rand():
-
#
-
#print p
-
# part=""
-
# q=len(p)
-
# seqq=""
-
-
# for i in range(0,q):
-
# part= p[i:i+16]
-
# if len(part)==16:
-
# seqq=part
-
# return seqq
-
-
-
-
#my_seq=my_rand()
-
#print len(my_seq)
-
-
-
-
-
res=1
-
part=""
-
q=len(p)
-
seqq=""
-
for i in range(0,q):
-
part=p[i:i+16]
-
if len(part)==16:
-
seqq=part
-
res=1
-
for j in range(0,16):
-
key=seqq[j]
-
res=res*datadict[key]["%02d"%(j+1)]
-
print res
-
score=1
-
value={"A":"0.3","T":"0.3","C":"0.2","G":"0.2"}
-
for it in value:
-
for key in seqq:
-
if it==key:
-
score=score*float(value[it])
-
#print score,"*******************",res
-
log_ratio=log(res/score)
-
#print i,log_ratio
-
-
Since we are adding one to each element, I don't think my res value could be zero. Do you see any mistake, or is it because it is going negative? Please help.
Waiting for your reply,
cheers!
When I print valueSums,
many of those values are zero, but this should not be possible, right?
this is my full code where i am calculating the scores dividing by another background value and ultimately taking a log. because of this normalisation some values are becomin zero.
like when i print the normalised values some values are becoming zero.
sorry but am not able to paste my o/p file..
but donno why this is happening
-
from math import *
-
import random
-
f=open("deeps1.txt","r")
-
line=f.next()
-
while not line.startswith('PO'):
-
line=f.next()
-
-
headerlist=line.strip().split()[1:]
-
linelist=[]
-
-
-
line=f.next().strip()
-
while not line.startswith('/'):
-
if line != '':
-
linelist.append(line.strip().split())
-
line=f.next().strip()
-
-
keys=[i[0] for i in linelist]
-
values=[[float(s) for s in item] for item in [j[1:] for j in linelist]]
-
-
array={}
-
linedict=dict(zip(keys,values))
-
keys = linedict.keys()
-
keys.sort()
-
for key in keys:
-
array=[key,linedict[key]]
-
-
datadict={}
-
datadict1={}
-
for i,item in enumerate(headerlist):
-
datadict[item]={}
-
for key_ in linedict:
-
datadict[item][key_]=linedict[key_][i]
-
-
-
for keymain in datadict:
-
for keysub in datadict[keymain]:
-
datadict[keymain][keysub]+=1.0
-
-
# Normalise the pseudo-counted matrix: divide every entry by the total of its
# row (matrix position).  The outer loop must bind keymain; looping with
# keysub there left keymain stuck on one base, so only that base was
# normalised, and repeatedly.  Fresh inner dictionaries are used because
# datadict.copy() is shallow and would share them with datadict.
datadict1 = {}
for keymain in datadict:
    datadict1[keymain] = {}
    for keysub in datadict[keymain]:
        row = linedict[keysub]
        # linedict still holds the raw counts of this row; adding len(row)
        # accounts for the 1.0 already added to each of its columns.
        row_total = sum(row) + len(row)
        datadict1[keymain][keysub] = datadict[keymain][keysub] / row_total
-
-
-
def random_seq(nchars, insertat, astring):
    """Return a random DNA sequence with a fixed string spliced into it.

    nchars   -- number of random letters (drawn from "ATGC") to generate.
    insertat -- index of the random letter in front of which astring is
                inserted.  If it is not in range(nchars), astring is not
                inserted at all.
    astring  -- the string to splice in.

    The finished sequence is printed as well as returned.
    """
    pieces = []
    for i in range(nchars):
        if i == insertat:
            pieces.append(astring)
        pieces.append(random.choice("ATGC"))
    # Join once rather than growing the string letter by letter.
    seq = "".join(pieces)
    # Parenthesised form works both as a Python 2 statement and as a
    # Python 3 function call.
    print(seq)
    return seq
-
-
thestring="CGTCAAGTTCAAGTGCAAAA"
-
count=50-len(thestring)
-
p=random_seq(count,15,thestring)
-
file=open("temp.txt",'w')
-
##consensus="CGTCAAGTTCAAGTGCAAAA"
-
##file.write(consensus)
-
file.write(str(p))
-
file.close()
-
-
def file_chk():
    """Return the entire contents of temp.txt as a single string."""
    f = open("temp.txt", "r")
    # try/finally guarantees the file is closed even if reading fails, and
    # needs no newer syntax than the rest of this Python 2 script uses.
    try:
        return f.read()
    finally:
        f.close()
-
-
#p=file_chk()
-
-
-
#def my_rand():
-
#
-
#print p
-
# part=""
-
# q=len(p)
-
# seqq=""
-
-
# for i in range(0,q):
-
# part= p[i:i+16]
-
# if len(part)==16:
-
# seqq=part
-
# return seqq
-
-
-
-
#my_seq=my_rand()
-
#print len(my_seq)
-
-
# Score every 16-letter window of p against the normalised matrix and print
# the log-odds of each window relative to a fixed background model.
window = 16
# Background probability assigned to each letter.
value = {"A": 0.3, "T": 0.3, "C": 0.2, "G": 0.2}
q = len(p)
# q - window + 1 start positions, so the final full-length window is scored
# too; range(q - 16) stopped one window short.
for i in range(q - window + 1):
    seqq = p[i:i + window]
    res = 1
    score = 1
    for j, key in enumerate(seqq):
        # Matrix rows are keyed by zero-padded, 1-based position ("01".."16").
        res = res * datadict1[key]["%02d" % (j + 1)]
        score = score * value[key]
    log_ratio = log10(res / score)
    # Single formatted argument keeps the output identical under the
    # Python 2 print statement and the Python 3 print function.
    print("%s %s" % (i, log_ratio))
-
I think you had some problems with indentation, and I simplified a lot of the last part with the score and log. I think it is ok now. I don't know why you got 0's, but I think I know what you want to do, so it looks good now.
thanks a lot buddy!!:)
its working!!
:)
cheers!!
Post your reply Sign in to post your reply or Sign up for a free account.
Similar topics
1 post
views
Thread by SUPER_SOCKO |
last post: by
|
4 posts
views
Thread by Rick Brown |
last post: by
|
4 posts
views
Thread by sitemap |
last post: by
|
9 posts
views
Thread by sean.scanlon |
last post: by
|
4 posts
views
Thread by deLenn |
last post: by
| | | |
12 posts
views
Thread by Nezhate |
last post: by
| | | | | | | | | | |