now what i should be doing is that my i/p sequence is a folder containing many files.
each file will have two sets of sequences. in all the files i should calculate the log values of the first file seperately and second set of sequences seperately. then i should take an average of the values of the first sequence and avg of the values of the second sequence. for simplicity i will take only one weight matrix as the first step.
my code that computes the log values is here
Expand|Select|Wrap|Line Numbers
- from math import *
- def parseArray(fn, dataset=1, key='PO', term='/'):
- '''
- Read a formatted data file in matrix format and
- compile data into a dictionary
- '''
- f = open(fn)
- # skip to required data set
- for _ in range(dataset):
- try:
- line = f.next()
- while not line.startswith(key):
- line = f.next()
- except StopIteration, e:
- print 'We have reached the end of the file!'
- f.close()
- return False
- headerList = line.strip().split()[1:]
- lineList = []
- line = f.next().strip()
- while not line.startswith(term):
- if line != '':
- lineList.append(line.strip().split())
- line = f.next().strip()
- f.close()
- # Key list
- keys = [i[0] for i in lineList]
- # Values list
- values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]
- # Create a dictionary from keys and values
- lineDict = dict(zip(keys, values))
- dataDict = {}
- for i, item in enumerate(headerList):
- dataDict[item] = {}
- for key in lineDict:
- dataDict[item][key] = lineDict[key][i]
- # Add 1.0 to every element in dataDict subdictionaries
- for keyMain in dataDict:
- for keySub in dataDict[keyMain]:
- dataDict[keyMain][keySub] += 1.0
- # Normalize original data (with 1 added) and update data
- valueSums = [sum(item)+4 for item in values]
- # print valueSums
- for keyMain in dataDict:
- for keySub in dataDict[keyMain]:
- dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]
- return dataDict
- def parseData(fn, dataset=1, key='>'):
- '''
- Read a formatted data file of sequences
- Return a list of sequences
- The first element in the list is the header
- '''
- # initialize output list
- dataList = []
- # open file for reading
- f = open(fn)
- # skip to required data set
- for _ in range(dataset):
- try:
- s = f.next()
- while not s.startswith(key):
- s = f.next()
- except StopIteration, e:
- print 'We have reached the end of the file!'
- f.close()
- return False
- # initialize output list
- dataList = [s,]
- for line in f:
- if not line.startswith(key):
- dataList.append(line.strip())
- else:
- break
- f.close()
- return dataList
- def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
- # sequence factor dictionary
- value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
- dataArray = parseArray(fnArray, arraySet)
- if dataArray:
- dataSeq = parseData(fnSeq, seqSet)
- if not dataSeq:
- return False
- else:
- return None
- # This is the complete sequence
- seq = ''.join(dataSeq[1:])
- # These are the subkeys of dataArray - '01', '02', '03',.............
- subKeys = dataArray['A'].keys()
- subKeys.sort()
- # Calculate num/den for each slice of sequence
- # Each sequence slice length = length of subKeys
- # Example:
- # seq = 'ATCGATA'
- # subKeys length = 3
- # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'
- numList = []
- denList = []
- seqList = []
- for i in xrange(len(seq) - len(subKeys)):
- subseq = seq[0:len(subKeys)]
- seqList.append(subseq)
- num, den = 1, 1
- for j, s in enumerate(subseq):
- num *= dataArray[s][subKeys[j]]
- den *= value[s]
- numList.append(num)
- denList.append(den)
- seq = seq[1:]
- resultList = []
- for i, num in enumerate(numList):
- #p=log10(num/denList[i])
- #if (p) >=2:
- #print "#########",abs(int(p))
- if (log10(num/denList[i]))>=2:
- #print "i am here"
- resultList.append(int(abs(1)))
- #print resultList
- #for i in resultList:
- #mean=sum(resultList)/len(resultList)
- #sub=mean-i
- #queue = []
- #queue = (sub)**2
- #print sqrt(queue/len(resultList))
- #print mean,"@@@@@@@@@@"
- outStr = '\n'.join(['Sequence = %s Calculation = %d' % (seqList[i], res) for i, res in enumerate(resultList)])
- #print "this is line 294"
- return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
- if __name__ == '__main__':
- fnArray ='one_redfly.transfac'
- fnSeq = 'upstream_regions'
- outputfile = "sequence_calc_data.txt"
- arraySet = 1
- outList = []
- calcdata = 1
- while not calcdata is None:
- seqSet = 1
- while True:
- calcdata = compileData(fnArray, fnSeq, arraySet, seqSet)
- print calcdata
- if calcdata:
- outList.append(calcdata)
- seqSet += 1
- else:
- break
- arraySet += 1
- f = open(outputfile, 'w')
- f.write('\n'.join(outList))
- f.close()
- #f=open(outputfile,"r")
- #file_con=f.readlines()
- #for line in file_con:
- # print line