Inference of LUX energy matrix using EMA¶

In [26]:
def X_i_predictions(binned_data, energy_matrix):
    czx = zeros((2,32))
    for k in range(shape(binned_data)[0]):
        energy = 0
        for i in range(8):
            if binned_data[k,0][i] == 'A':
                energy += energy_matrix[0][0,i]
            elif binned_data[k,0][i] == 'T':
                energy += energy_matrix[0][1,i]
            elif binned_data[k,0][i] == 'G':
                energy += energy_matrix[0][2,i]
            elif binned_data[k,0][i] == 'C':
                energy += energy_matrix[0][3,i]
        if energy < energy_matrix[1] :
            czx[0,binned_data[k,2]] += 1
        else:
            czx[1,binned_data[k,2]] += 1
    return czx


def likelihood(czx):
    likelihood_numerator = 0
    for j in range(shape(czx)[0]):
        for k in  range(shape(czx)[1]):
            likelihood_numerator += math.lgamma(czx[j,k]+1)
            
    likelihood_denominator = 0
    for j in range(shape(czx)[0]):
        likelihood_denominator += math.lgamma(31+sum(czx[j,:])+1)
    
    return likelihood_numerator - likelihood_denominator


def matrix_perturbation(matrix):
    choose = numpy.random.choice(3)
    perturbed = copy(matrix[0])
    
    if choose == 0:
        x = numpy.random.choice(4)
        y = numpy.random.choice(8)
        perturbed[x,y] = perturbed[x,y]+numpy.random.normal(0.0,.1)
        return perturbed, matrix[1]
        
    elif choose == 1:
        mu2=matrix[1]+numpy.random.normal(0.0,.1)
        
        return perturbed, mu2
    
    elif choose == 2:
        temp = numpy.random.normal(0.0,.1)
        mu2=matrix[1]+temp
        temp2 = numpy.random.choice(8)
        for k in range(4):
            perturbed[k,temp2] = perturbed[k,temp2] + temp        
        
        return perturbed, mu2

def matrix_normalisation(energy_matrix_preturbed):
    temp = copy(energy_matrix_preturbed[0])
    x= 0
    for i in range(8):
        x+=min(temp[:,i]) 
    
    for i in range(8):
        for j in range(4):
            energy_matrix_preturbed[0][j,i] = (energy_matrix_preturbed[0][j,i] - min(temp[:,i]))/(energy_matrix_preturbed[1]-x)
    
    energy_matrix_preturbed[1] = 1.0
    return copy(energy_matrix_preturbed)


def seed_matrix(initial_sequences):
    """ Takes a set of squences and returns a matrix with starting energies for matrix inference using
    kinney et al method
    """
    matrix = zeros((4,8))
    for eightmere in initial_sequences:
        for base in range(len(eightmere[0])):
            if 'A' in eightmere[0][base]:
                matrix[0,base]=1+matrix[0,base]
            elif 'T' in eightmere[0][base]:
                matrix[1,base]=1+matrix[1,base]
            elif 'G' in eightmere[0][base]:
                matrix[2,base]=1+matrix[2,base]
            elif 'C' in eightmere[0][base]:
                matrix[3,base]=1+matrix[3,base]

    for i in range(matrix.shape[1]):
        matrix[:,i]= matrix[:,i]/sum(matrix[:,i])
        matrix[:,i]= matrix[:,i]/max(matrix[:,i])
        matrix[:,i] = abs(matrix[:,i]-max(matrix[:,i]))
    
    return matrix
In [2]:
import pandas
In [16]:
import pandas 
import numpy
cca1_pbm = pandas.read_excel('LUX_BINDING_PBM.xls',sheetname=1)
cca1_pbm
Out[16]:
8-mer Reverse Complement Design_1_Chamber_1 Design_1_Chamber_2 Design_1_Chamber_3 Design_1_average Design_2_Chamber_1 Design_2_Chamber_2 Design_2_Chamber_3 Design_2_average
0 AAAAAAAA TTTTTTTT 0.422272 0.428349 0.430615 0.427079 0.306605 0.320749 0.322564 0.316639
1 AAAAAAAC GTTTTTTT 0.330570 0.325226 0.336900 0.330898 0.254036 0.209630 0.166244 0.209970
2 AAAAAAAG CTTTTTTT 0.288988 0.272463 0.307813 0.289755 0.226867 0.229429 0.276126 0.244141
3 AAAAAAAT ATTTTTTT 0.330156 0.350296 0.313538 0.331330 0.346196 0.345928 0.349245 0.347123
4 AAAAAACA TGTTTTTT 0.310488 0.293285 0.259394 0.287722 0.262225 0.199524 0.271337 0.244362
5 AAAAAACC GGTTTTTT 0.219286 0.265552 0.223351 0.236063 0.166632 0.072022 0.021618 0.086757
6 AAAAAACG CGTTTTTT 0.276303 0.227090 0.241953 0.248449 0.232417 0.190323 0.219157 0.213966
7 AAAAAACT AGTTTTTT 0.264810 0.268607 0.250700 0.261372 0.116893 0.115539 0.116169 0.116201
8 AAAAAAGA TCTTTTTT 0.392561 0.400577 0.393285 0.395475 0.292466 0.294417 0.325462 0.304115
9 AAAAAAGC GCTTTTTT 0.166378 0.181442 0.228957 0.192259 0.200812 0.196035 0.218971 0.205273
10 AAAAAAGG CCTTTTTT 0.249919 0.252611 0.281559 0.261363 0.205091 0.075833 0.193720 0.158214
11 AAAAAAGT ACTTTTTT 0.251634 0.203100 0.279082 0.244605 0.232201 0.150461 0.167837 0.183499
12 AAAAAATA TATTTTTT 0.348749 0.347228 0.365290 0.353755 0.324335 0.327762 0.329378 0.327158
13 AAAAAATC GATTTTTT 0.357014 0.336774 0.343128 0.345638 0.390423 0.416201 0.407306 0.404644
14 AAAAAATG CATTTTTT 0.256453 0.309220 0.200107 0.255260 0.255403 0.264428 0.285857 0.268563
15 AAAAAATT AATTTTTT 0.328864 0.318362 0.272467 0.306564 0.220015 0.169384 0.216349 0.201916
16 AAAAACAA TTGTTTTT 0.230935 0.215060 0.265333 0.237109 0.265211 0.239782 0.295746 0.266913
17 AAAAACAC GTGTTTTT 0.114623 0.105541 0.030223 0.083462 0.137719 0.124203 0.143631 0.135185
18 AAAAACAG CTGTTTTT 0.190192 0.294407 0.178118 0.220906 0.090782 0.045491 0.115808 0.084027
19 AAAAACAT ATGTTTTT 0.174731 0.230441 0.226302 0.210491 0.099572 0.073962 0.130894 0.101476
20 AAAAACCA TGGTTTTT 0.110617 0.145486 0.135722 0.130608 0.163244 0.083935 0.225819 0.157666
21 AAAAACCC GGGTTTTT 0.217686 0.183476 0.152233 0.184465 0.068188 -0.028239 0.053806 0.031252
22 AAAAACCG CGGTTTTT 0.123432 0.206902 0.145712 0.158682 0.066351 0.070066 -0.055011 0.027135
23 AAAAACCT AGGTTTTT 0.196419 0.215522 0.212407 0.208116 0.177161 0.105783 0.171029 0.151324
24 AAAAACGA TCGTTTTT 0.167552 0.163307 0.178810 0.169890 0.204572 0.232756 0.212442 0.216590
25 AAAAACGC GCGTTTTT 0.028226 0.076835 0.077888 0.060983 0.143877 0.111680 0.112706 0.122755
26 AAAAACGG CCGTTTTT 0.142215 0.197073 0.223731 0.187673 0.007106 -0.079945 -0.070241 -0.047693
27 AAAAACGT ACGTTTTT 0.163911 0.036526 0.086135 0.095524 -0.008190 0.012131 0.042511 0.015484
28 AAAAACTA TAGTTTTT 0.090453 0.037614 0.139971 0.089346 0.142185 0.184006 0.180998 0.169063
29 AAAAACTC GAGTTTTT 0.214239 0.187912 0.173877 0.192009 0.210343 0.221153 0.249698 0.227065
... ... ... ... ... ... ... ... ... ... ...
32866 TTGATAAA TTTATCAA 0.109797 0.111236 0.161831 0.127621 0.311204 0.274810 0.333401 0.306472
32867 TTGATCAA TTGATCAA 0.056410 0.157985 -0.041491 0.057635 0.272114 0.227745 0.247943 0.249267
32868 TTGCAAAA TTTTGCAA 0.165602 0.171290 0.078036 0.138310 0.130109 0.161902 0.176161 0.156057
32869 TTGCACAA TTGTGCAA -0.014877 0.096333 -0.028160 0.017765 -0.041392 0.069791 -0.035939 -0.002513
32870 TTGCCAAA TTTGGCAA 0.027593 0.034636 -0.018277 0.014651 -0.051772 -0.048367 -0.076957 -0.059032
32871 TTGCCCAA TTGGGCAA -0.103538 0.048463 -0.050475 -0.035183 -0.020473 -0.054248 -0.017967 -0.030896
32872 TTGCGAAA TTTCGCAA -0.072650 -0.062646 -0.126367 -0.087221 0.102273 0.168009 0.131985 0.134089
32873 TTGCGCAA TTGCGCAA -0.011355 -0.097022 -0.040345 -0.049574 -0.012870 0.098997 -0.022846 0.021093
32874 TTGCTAAA TTTAGCAA -0.029122 0.104486 0.028793 0.034719 0.134686 0.071393 0.041492 0.082524
32875 TTGGAAAA TTTTCCAA 0.073490 0.155892 0.218411 0.149264 0.118250 0.154993 0.133715 0.135652
32876 TTGGACAA TTGTCCAA 0.066805 -0.032272 0.026786 0.020440 0.032265 0.084180 0.000130 0.038858
32877 TTGGCAAA TTTGCCAA 0.060743 -0.012010 -0.068178 -0.006482 0.083153 0.129011 0.163485 0.125216
32878 TTGGCCAA TTGGCCAA 0.039338 0.170602 -0.044705 0.055079 -0.054119 0.049829 -0.099437 -0.034576
32879 TTGGGAAA TTTCCCAA 0.140833 0.180685 0.195757 0.172425 0.239048 0.112489 0.208793 0.186777
32880 TTGGTAAA TTTACCAA 0.086611 0.179885 0.060742 0.109079 0.173517 0.177609 0.235698 0.195608
32881 TTGTAAAA TTTTACAA 0.072319 0.110950 0.095006 0.092758 0.066172 0.179113 0.202572 0.149286
32882 TTGTACAA TTGTACAA 0.034769 0.169711 0.240772 0.148418 0.226017 0.136729 0.243268 0.202004
32883 TTGTCAAA TTTGACAA 0.185108 0.187400 0.155532 0.176013 0.225447 0.233697 0.149250 0.202798
32884 TTGTGAAA TTTCACAA 0.275997 0.199434 0.211053 0.228828 0.005610 0.072999 0.033766 0.037458
32885 TTGTTAAA TTTAACAA 0.146552 0.147621 0.192682 0.162285 0.156734 0.145416 0.217605 0.173252
32886 TTTAAAAA TTTTTAAA 0.301812 0.305777 0.293187 0.300259 0.317709 0.286817 0.302072 0.302199
32887 TTTACAAA TTTGTAAA 0.042153 0.057068 0.088294 0.062505 0.116354 0.179100 0.190024 0.161826
32888 TTTAGAAA TTTCTAAA 0.201500 0.237387 0.238218 0.225701 0.236961 0.216961 0.222419 0.225447
32889 TTTATAAA TTTATAAA -0.000997 0.136347 0.097556 0.077635 0.215377 0.209747 0.246690 0.223938
32890 TTTCAAAA TTTTGAAA 0.194390 0.076636 0.125115 0.132047 0.299054 0.346945 0.317149 0.321049
32891 TTTCCAAA TTTGGAAA 0.039187 0.119860 0.171609 0.110219 0.194773 0.172378 0.072369 0.146506
32892 TTTCGAAA TTTCGAAA -0.005214 0.104952 -0.029424 0.023438 0.175029 0.218847 0.223439 0.205772
32893 TTTGAAAA TTTTCAAA 0.167543 0.221034 0.152008 0.180195 0.247508 0.171789 0.258319 0.225872
32894 TTTGCAAA TTTGCAAA 0.092377 0.120757 0.051627 0.088254 -0.010703 0.045173 0.089589 0.041353
32895 TTTTAAAA TTTTAAAA 0.192480 0.156368 0.109922 0.152923 0.351096 0.354591 0.277783 0.327823

32896 rows × 10 columns

In [17]:
forward = dict([])
reverse = dict([])
forward[cca1_pbm['8-mer'][0]] = 0
for i in range(len(cca1_pbm['8-mer'])-1):
    if cca1_pbm['8-mer'][i+1][::-1] in forward:
        reverse[cca1_pbm['8-mer'][i+1]] = i+1
    else:
        forward[cca1_pbm['8-mer'][i+1]] = i+1
    
cca1_pbm=cca1_pbm.loc[forward.values()]
In [21]:
cca1_pbm_sorted= cca1_pbm[['8-mer','Design_1_Chamber_1']].sort('Design_1_Chamber_1')
cca1_pbm_sorted['bin'] = np.zeros(len(cca1_pbm_sorted.index))
cca1_pbm_sorted.head()
cca1_pbm_sorted_array = asarray(cca1_pbm_sorted)
cca1_pbm_sorted_array
len(cca1_pbm_sorted_array)
Out[21]:
16512
In [23]:
bin_counter = 0
counter = 0
for i in range(shape(cca1_pbm_sorted_array)[0]):
    cca1_pbm_sorted_array[i,2] = bin_counter
    if counter == 516:
        bin_counter = bin_counter + 1
        counter = 0
    counter = counter + 1
In [25]:
figure(figsize=(10,6))
hist(cca1_pbm_sorted_array[:,1], bins=100)
title("E-score", fontsize=20)
xticks(fontsize=20)
yticks(fontsize=20)
Out[25]:
(array([   0.,  100.,  200.,  300.,  400.,  500.]),
 <a list of 6 Text yticklabel objects>)
In [27]:
initial_matrix= cca1_pbm_sorted_array[-10:]
In [28]:
ensemble = []
print seed_matrix(initial_matrix)
energy_matrix_preturbed = (seed_matrix(initial_matrix),3.)
steps = 10000000
chain = []
acceptance = 0
rejection = 0

M1_predictions = X_i_predictions(cca1_pbm_sorted_array,matrix_perturbation(energy_matrix_preturbed))
L1 = likelihood(M1_predictions)
#print energy_matrix_preturbed
[[ 0.          0.57142857  0.          0.57142857  0.71428571  0.83333333
   1.          1.        ]
 [ 1.          1.          1.          0.          0.          0.5         0.4
   0.5       ]
 [ 1.          0.          0.57142857  1.          1.          1.          0.
   0.        ]
 [ 1.          1.          1.          1.          0.85714286  0.          0.6
   0.        ]]
In [ ]:
for i in range(steps):    
    
    energy_matrix_preturbed2 = matrix_perturbation(energy_matrix_preturbed)
    

    if  energy_matrix_preturbed2[1] >= 0.0  and energy_matrix_preturbed2[1] <=  8. and energy_matrix_preturbed2[0].min() >= 0. and energy_matrix_preturbed2[0].max() <=1. :  
        M2_predictions = X_i_predictions(cca1_pbm_sorted_array,energy_matrix_preturbed2)
        L2 = likelihood(M2_predictions)
        #print L2-L1
        
        if numpy.random.binomial(1,min(1.,exp(L2-L1))):
            #print energy_matrix_preturbed2
            if i % 5 == 0:
                print 'Accepted '+str(len(ensemble))
                print 'Ratio '+ str(acceptance/(rejection+acceptance))
            L1 = L2
            
            energy_matrix_preturbed = energy_matrix_preturbed2
            
            chain.append(L2)
            acceptance +=1.
            ensemble.append(copy(energy_matrix_preturbed2))
            #print "MCMC steps " + str(len(chain))

    else:
        #print 'Rejected '+str(i)
        rejection +=1.
    
In [51]:
plot(chain[:])
Out[51]:
[<matplotlib.lines.Line2D at 0x119a79ed0>]
In [52]:
parameter = []

for i in range(len(ensemble)):
    parameter.append(ensemble[i][0][1,4])

plot(parameter,'o-')
Out[52]:
[<matplotlib.lines.Line2D at 0x119de8350>]
In [53]:
figure(figsize=(10,5))
imshow(ensemble[-1][0], interpolation='none')
yticks([])
xticks(fontsize=22)
#savefig('matrix.png', dpi=600, format='png')
Out[53]:
(array([-1.,  0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.]),
 <a list of 10 Text xticklabel objects>)
In [54]:
figure(figsize=(15,10))
M1_predictions = X_i_predictions(cca1_pbm_sorted_array,ensemble[-1])
imshow(M1_predictions, interpolation='none')
yticks([])
xticks(fontsize=22)
Out[54]:
(array([ -5.,   0.,   5.,  10.,  15.,  20.,  25.,  30.,  35.]),
 <a list of 9 Text xticklabel objects>)
In [55]:
save('lux_ensemble.npy',ensemble)
In [ ]: