def X_i_predictions(binned_data, energy_matrix):
czx = zeros((2,32))
for k in range(shape(binned_data)[0]):
energy = 0
for i in range(8):
if binned_data[k,0][i] == 'A':
energy += energy_matrix[0][0,i]
elif binned_data[k,0][i] == 'T':
energy += energy_matrix[0][1,i]
elif binned_data[k,0][i] == 'G':
energy += energy_matrix[0][2,i]
elif binned_data[k,0][i] == 'C':
energy += energy_matrix[0][3,i]
if energy < energy_matrix[1] :
czx[0,binned_data[k,2]] += 1
else:
czx[1,binned_data[k,2]] += 1
return czx
def likelihood(czx):
likelihood_numerator = 0
for j in range(shape(czx)[0]):
for k in range(shape(czx)[1]):
likelihood_numerator += math.lgamma(czx[j,k]+1)
likelihood_denominator = 0
for j in range(shape(czx)[0]):
likelihood_denominator += math.lgamma(31+sum(czx[j,:])+1)
return likelihood_numerator - likelihood_denominator
def matrix_perturbation(matrix):
choose = numpy.random.choice(3)
perturbed = copy(matrix[0])
if choose == 0:
x = numpy.random.choice(4)
y = numpy.random.choice(8)
perturbed[x,y] = perturbed[x,y]+numpy.random.normal(0.0,.1)
return perturbed, matrix[1]
elif choose == 1:
mu2=matrix[1]+numpy.random.normal(0.0,.1)
return perturbed, mu2
elif choose == 2:
temp = numpy.random.normal(0.0,.1)
mu2=matrix[1]+temp
temp2 = numpy.random.choice(8)
for k in range(4):
perturbed[k,temp2] = perturbed[k,temp2] + temp
return perturbed, mu2
def matrix_normalisation(energy_matrix_preturbed):
temp = copy(energy_matrix_preturbed[0])
x= 0
for i in range(8):
x+=min(temp[:,i])
for i in range(8):
for j in range(4):
energy_matrix_preturbed[0][j,i] = (energy_matrix_preturbed[0][j,i] - min(temp[:,i]))/(energy_matrix_preturbed[1]-x)
energy_matrix_preturbed[1] = 1.0
return copy(energy_matrix_preturbed)
def seed_matrix(initial_sequences):
""" Takes a set of squences and returns a matrix with starting energies for matrix inference using
kinney et al method
"""
matrix = zeros((4,8))
for eightmere in initial_sequences:
for base in range(len(eightmere[0])):
if 'A' in eightmere[0][base]:
matrix[0,base]=1+matrix[0,base]
elif 'T' in eightmere[0][base]:
matrix[1,base]=1+matrix[1,base]
elif 'G' in eightmere[0][base]:
matrix[2,base]=1+matrix[2,base]
elif 'C' in eightmere[0][base]:
matrix[3,base]=1+matrix[3,base]
for i in range(matrix.shape[1]):
matrix[:,i]= matrix[:,i]/sum(matrix[:,i])
matrix[:,i]= matrix[:,i]/max(matrix[:,i])
matrix[:,i] = abs(matrix[:,i]-max(matrix[:,i]))
return matrix
import pandas
import pandas
import numpy
cca1_pbm = pandas.read_excel('LUX_BINDING_PBM.xls',sheetname=1)
cca1_pbm
8-mer | Reverse Complement | Design_1_Chamber_1 | Design_1_Chamber_2 | Design_1_Chamber_3 | Design_1_average | Design_2_Chamber_1 | Design_2_Chamber_2 | Design_2_Chamber_3 | Design_2_average | |
---|---|---|---|---|---|---|---|---|---|---|
0 | AAAAAAAA | TTTTTTTT | 0.422272 | 0.428349 | 0.430615 | 0.427079 | 0.306605 | 0.320749 | 0.322564 | 0.316639 |
1 | AAAAAAAC | GTTTTTTT | 0.330570 | 0.325226 | 0.336900 | 0.330898 | 0.254036 | 0.209630 | 0.166244 | 0.209970 |
2 | AAAAAAAG | CTTTTTTT | 0.288988 | 0.272463 | 0.307813 | 0.289755 | 0.226867 | 0.229429 | 0.276126 | 0.244141 |
3 | AAAAAAAT | ATTTTTTT | 0.330156 | 0.350296 | 0.313538 | 0.331330 | 0.346196 | 0.345928 | 0.349245 | 0.347123 |
4 | AAAAAACA | TGTTTTTT | 0.310488 | 0.293285 | 0.259394 | 0.287722 | 0.262225 | 0.199524 | 0.271337 | 0.244362 |
5 | AAAAAACC | GGTTTTTT | 0.219286 | 0.265552 | 0.223351 | 0.236063 | 0.166632 | 0.072022 | 0.021618 | 0.086757 |
6 | AAAAAACG | CGTTTTTT | 0.276303 | 0.227090 | 0.241953 | 0.248449 | 0.232417 | 0.190323 | 0.219157 | 0.213966 |
7 | AAAAAACT | AGTTTTTT | 0.264810 | 0.268607 | 0.250700 | 0.261372 | 0.116893 | 0.115539 | 0.116169 | 0.116201 |
8 | AAAAAAGA | TCTTTTTT | 0.392561 | 0.400577 | 0.393285 | 0.395475 | 0.292466 | 0.294417 | 0.325462 | 0.304115 |
9 | AAAAAAGC | GCTTTTTT | 0.166378 | 0.181442 | 0.228957 | 0.192259 | 0.200812 | 0.196035 | 0.218971 | 0.205273 |
10 | AAAAAAGG | CCTTTTTT | 0.249919 | 0.252611 | 0.281559 | 0.261363 | 0.205091 | 0.075833 | 0.193720 | 0.158214 |
11 | AAAAAAGT | ACTTTTTT | 0.251634 | 0.203100 | 0.279082 | 0.244605 | 0.232201 | 0.150461 | 0.167837 | 0.183499 |
12 | AAAAAATA | TATTTTTT | 0.348749 | 0.347228 | 0.365290 | 0.353755 | 0.324335 | 0.327762 | 0.329378 | 0.327158 |
13 | AAAAAATC | GATTTTTT | 0.357014 | 0.336774 | 0.343128 | 0.345638 | 0.390423 | 0.416201 | 0.407306 | 0.404644 |
14 | AAAAAATG | CATTTTTT | 0.256453 | 0.309220 | 0.200107 | 0.255260 | 0.255403 | 0.264428 | 0.285857 | 0.268563 |
15 | AAAAAATT | AATTTTTT | 0.328864 | 0.318362 | 0.272467 | 0.306564 | 0.220015 | 0.169384 | 0.216349 | 0.201916 |
16 | AAAAACAA | TTGTTTTT | 0.230935 | 0.215060 | 0.265333 | 0.237109 | 0.265211 | 0.239782 | 0.295746 | 0.266913 |
17 | AAAAACAC | GTGTTTTT | 0.114623 | 0.105541 | 0.030223 | 0.083462 | 0.137719 | 0.124203 | 0.143631 | 0.135185 |
18 | AAAAACAG | CTGTTTTT | 0.190192 | 0.294407 | 0.178118 | 0.220906 | 0.090782 | 0.045491 | 0.115808 | 0.084027 |
19 | AAAAACAT | ATGTTTTT | 0.174731 | 0.230441 | 0.226302 | 0.210491 | 0.099572 | 0.073962 | 0.130894 | 0.101476 |
20 | AAAAACCA | TGGTTTTT | 0.110617 | 0.145486 | 0.135722 | 0.130608 | 0.163244 | 0.083935 | 0.225819 | 0.157666 |
21 | AAAAACCC | GGGTTTTT | 0.217686 | 0.183476 | 0.152233 | 0.184465 | 0.068188 | -0.028239 | 0.053806 | 0.031252 |
22 | AAAAACCG | CGGTTTTT | 0.123432 | 0.206902 | 0.145712 | 0.158682 | 0.066351 | 0.070066 | -0.055011 | 0.027135 |
23 | AAAAACCT | AGGTTTTT | 0.196419 | 0.215522 | 0.212407 | 0.208116 | 0.177161 | 0.105783 | 0.171029 | 0.151324 |
24 | AAAAACGA | TCGTTTTT | 0.167552 | 0.163307 | 0.178810 | 0.169890 | 0.204572 | 0.232756 | 0.212442 | 0.216590 |
25 | AAAAACGC | GCGTTTTT | 0.028226 | 0.076835 | 0.077888 | 0.060983 | 0.143877 | 0.111680 | 0.112706 | 0.122755 |
26 | AAAAACGG | CCGTTTTT | 0.142215 | 0.197073 | 0.223731 | 0.187673 | 0.007106 | -0.079945 | -0.070241 | -0.047693 |
27 | AAAAACGT | ACGTTTTT | 0.163911 | 0.036526 | 0.086135 | 0.095524 | -0.008190 | 0.012131 | 0.042511 | 0.015484 |
28 | AAAAACTA | TAGTTTTT | 0.090453 | 0.037614 | 0.139971 | 0.089346 | 0.142185 | 0.184006 | 0.180998 | 0.169063 |
29 | AAAAACTC | GAGTTTTT | 0.214239 | 0.187912 | 0.173877 | 0.192009 | 0.210343 | 0.221153 | 0.249698 | 0.227065 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
32866 | TTGATAAA | TTTATCAA | 0.109797 | 0.111236 | 0.161831 | 0.127621 | 0.311204 | 0.274810 | 0.333401 | 0.306472 |
32867 | TTGATCAA | TTGATCAA | 0.056410 | 0.157985 | -0.041491 | 0.057635 | 0.272114 | 0.227745 | 0.247943 | 0.249267 |
32868 | TTGCAAAA | TTTTGCAA | 0.165602 | 0.171290 | 0.078036 | 0.138310 | 0.130109 | 0.161902 | 0.176161 | 0.156057 |
32869 | TTGCACAA | TTGTGCAA | -0.014877 | 0.096333 | -0.028160 | 0.017765 | -0.041392 | 0.069791 | -0.035939 | -0.002513 |
32870 | TTGCCAAA | TTTGGCAA | 0.027593 | 0.034636 | -0.018277 | 0.014651 | -0.051772 | -0.048367 | -0.076957 | -0.059032 |
32871 | TTGCCCAA | TTGGGCAA | -0.103538 | 0.048463 | -0.050475 | -0.035183 | -0.020473 | -0.054248 | -0.017967 | -0.030896 |
32872 | TTGCGAAA | TTTCGCAA | -0.072650 | -0.062646 | -0.126367 | -0.087221 | 0.102273 | 0.168009 | 0.131985 | 0.134089 |
32873 | TTGCGCAA | TTGCGCAA | -0.011355 | -0.097022 | -0.040345 | -0.049574 | -0.012870 | 0.098997 | -0.022846 | 0.021093 |
32874 | TTGCTAAA | TTTAGCAA | -0.029122 | 0.104486 | 0.028793 | 0.034719 | 0.134686 | 0.071393 | 0.041492 | 0.082524 |
32875 | TTGGAAAA | TTTTCCAA | 0.073490 | 0.155892 | 0.218411 | 0.149264 | 0.118250 | 0.154993 | 0.133715 | 0.135652 |
32876 | TTGGACAA | TTGTCCAA | 0.066805 | -0.032272 | 0.026786 | 0.020440 | 0.032265 | 0.084180 | 0.000130 | 0.038858 |
32877 | TTGGCAAA | TTTGCCAA | 0.060743 | -0.012010 | -0.068178 | -0.006482 | 0.083153 | 0.129011 | 0.163485 | 0.125216 |
32878 | TTGGCCAA | TTGGCCAA | 0.039338 | 0.170602 | -0.044705 | 0.055079 | -0.054119 | 0.049829 | -0.099437 | -0.034576 |
32879 | TTGGGAAA | TTTCCCAA | 0.140833 | 0.180685 | 0.195757 | 0.172425 | 0.239048 | 0.112489 | 0.208793 | 0.186777 |
32880 | TTGGTAAA | TTTACCAA | 0.086611 | 0.179885 | 0.060742 | 0.109079 | 0.173517 | 0.177609 | 0.235698 | 0.195608 |
32881 | TTGTAAAA | TTTTACAA | 0.072319 | 0.110950 | 0.095006 | 0.092758 | 0.066172 | 0.179113 | 0.202572 | 0.149286 |
32882 | TTGTACAA | TTGTACAA | 0.034769 | 0.169711 | 0.240772 | 0.148418 | 0.226017 | 0.136729 | 0.243268 | 0.202004 |
32883 | TTGTCAAA | TTTGACAA | 0.185108 | 0.187400 | 0.155532 | 0.176013 | 0.225447 | 0.233697 | 0.149250 | 0.202798 |
32884 | TTGTGAAA | TTTCACAA | 0.275997 | 0.199434 | 0.211053 | 0.228828 | 0.005610 | 0.072999 | 0.033766 | 0.037458 |
32885 | TTGTTAAA | TTTAACAA | 0.146552 | 0.147621 | 0.192682 | 0.162285 | 0.156734 | 0.145416 | 0.217605 | 0.173252 |
32886 | TTTAAAAA | TTTTTAAA | 0.301812 | 0.305777 | 0.293187 | 0.300259 | 0.317709 | 0.286817 | 0.302072 | 0.302199 |
32887 | TTTACAAA | TTTGTAAA | 0.042153 | 0.057068 | 0.088294 | 0.062505 | 0.116354 | 0.179100 | 0.190024 | 0.161826 |
32888 | TTTAGAAA | TTTCTAAA | 0.201500 | 0.237387 | 0.238218 | 0.225701 | 0.236961 | 0.216961 | 0.222419 | 0.225447 |
32889 | TTTATAAA | TTTATAAA | -0.000997 | 0.136347 | 0.097556 | 0.077635 | 0.215377 | 0.209747 | 0.246690 | 0.223938 |
32890 | TTTCAAAA | TTTTGAAA | 0.194390 | 0.076636 | 0.125115 | 0.132047 | 0.299054 | 0.346945 | 0.317149 | 0.321049 |
32891 | TTTCCAAA | TTTGGAAA | 0.039187 | 0.119860 | 0.171609 | 0.110219 | 0.194773 | 0.172378 | 0.072369 | 0.146506 |
32892 | TTTCGAAA | TTTCGAAA | -0.005214 | 0.104952 | -0.029424 | 0.023438 | 0.175029 | 0.218847 | 0.223439 | 0.205772 |
32893 | TTTGAAAA | TTTTCAAA | 0.167543 | 0.221034 | 0.152008 | 0.180195 | 0.247508 | 0.171789 | 0.258319 | 0.225872 |
32894 | TTTGCAAA | TTTGCAAA | 0.092377 | 0.120757 | 0.051627 | 0.088254 | -0.010703 | 0.045173 | 0.089589 | 0.041353 |
32895 | TTTTAAAA | TTTTAAAA | 0.192480 | 0.156368 | 0.109922 | 0.152923 | 0.351096 | 0.354591 | 0.277783 | 0.327823 |
32896 rows × 10 columns
forward = dict([])
reverse = dict([])
forward[cca1_pbm['8-mer'][0]] = 0
for i in range(len(cca1_pbm['8-mer'])-1):
if cca1_pbm['8-mer'][i+1][::-1] in forward:
reverse[cca1_pbm['8-mer'][i+1]] = i+1
else:
forward[cca1_pbm['8-mer'][i+1]] = i+1
cca1_pbm=cca1_pbm.loc[forward.values()]
cca1_pbm_sorted= cca1_pbm[['8-mer','Design_1_Chamber_1']].sort('Design_1_Chamber_1')
cca1_pbm_sorted['bin'] = np.zeros(len(cca1_pbm_sorted.index))
cca1_pbm_sorted.head()
cca1_pbm_sorted_array = asarray(cca1_pbm_sorted)
cca1_pbm_sorted_array
len(cca1_pbm_sorted_array)
16512
bin_counter = 0
counter = 0
for i in range(shape(cca1_pbm_sorted_array)[0]):
cca1_pbm_sorted_array[i,2] = bin_counter
if counter == 516:
bin_counter = bin_counter + 1
counter = 0
counter = counter + 1
figure(figsize=(10,6))
hist(cca1_pbm_sorted_array[:,1], bins=100)
title("E-score", fontsize=20)
xticks(fontsize=20)
yticks(fontsize=20)
(array([ 0., 100., 200., 300., 400., 500.]), <a list of 6 Text yticklabel objects>)
initial_matrix= cca1_pbm_sorted_array[-10:]
ensemble = []
print seed_matrix(initial_matrix)
energy_matrix_preturbed = (seed_matrix(initial_matrix),3.)
steps = 10000000
chain = []
acceptance = 0
rejection = 0
M1_predictions = X_i_predictions(cca1_pbm_sorted_array,matrix_perturbation(energy_matrix_preturbed))
L1 = likelihood(M1_predictions)
#print energy_matrix_preturbed
[[ 0. 0.57142857 0. 0.57142857 0.71428571 0.83333333 1. 1. ] [ 1. 1. 1. 0. 0. 0.5 0.4 0.5 ] [ 1. 0. 0.57142857 1. 1. 1. 0. 0. ] [ 1. 1. 1. 1. 0.85714286 0. 0.6 0. ]]
for i in range(steps):
energy_matrix_preturbed2 = matrix_perturbation(energy_matrix_preturbed)
if energy_matrix_preturbed2[1] >= 0.0 and energy_matrix_preturbed2[1] <= 8. and energy_matrix_preturbed2[0].min() >= 0. and energy_matrix_preturbed2[0].max() <=1. :
M2_predictions = X_i_predictions(cca1_pbm_sorted_array,energy_matrix_preturbed2)
L2 = likelihood(M2_predictions)
#print L2-L1
if numpy.random.binomial(1,min(1.,exp(L2-L1))):
#print energy_matrix_preturbed2
if i % 5 == 0:
print 'Accepted '+str(len(ensemble))
print 'Ratio '+ str(acceptance/(rejection+acceptance))
L1 = L2
energy_matrix_preturbed = energy_matrix_preturbed2
chain.append(L2)
acceptance +=1.
ensemble.append(copy(energy_matrix_preturbed2))
#print "MCMC steps " + str(len(chain))
else:
#print 'Rejected '+str(i)
rejection +=1.
plot(chain[:])
[<matplotlib.lines.Line2D at 0x119a79ed0>]
parameter = []
for i in range(len(ensemble)):
parameter.append(ensemble[i][0][1,4])
plot(parameter,'o-')
[<matplotlib.lines.Line2D at 0x119de8350>]
figure(figsize=(10,5))
imshow(ensemble[-1][0], interpolation='none')
yticks([])
xticks(fontsize=22)
#savefig('matrix.png', dpi=600, format='png')
(array([-1., 0., 1., 2., 3., 4., 5., 6., 7., 8.]), <a list of 10 Text xticklabel objects>)
figure(figsize=(15,10))
M1_predictions = X_i_predictions(cca1_pbm_sorted_array,ensemble[-1])
imshow(M1_predictions, interpolation='none')
yticks([])
xticks(fontsize=22)
(array([ -5., 0., 5., 10., 15., 20., 25., 30., 35.]), <a list of 9 Text xticklabel objects>)
save('lux_ensemble.npy',ensemble)