%pylab inline
Populating the interactive namespace from numpy and matplotlib
import pandas
import numpy as np
cca1_pbm = pandas.read_csv('CCA1_RVE1_8mers/CCA1_8mers.txt', sep='\t')
forward = dict([])
reverse = dict([])
forward[cca1_pbm['8-mer'][0]] = 0
for i in range(len(cca1_pbm['8-mer'])-1):
if cca1_pbm['8-mer'][i+1][::-1] in forward:
reverse[cca1_pbm['8-mer'][i+1]] = i+1
else:
forward[cca1_pbm['8-mer'][i+1]] = i+1
cca1_pbm=cca1_pbm.loc[forward.values()]
def X_i_predictions(binned_data, energy_matrix):
czx = zeros((2,384))
for k in range(shape(binned_data)[0]):
energy = 0
for i in range(8):
if binned_data[k,0][i] == 'A':
energy += energy_matrix[0][0,i]
elif binned_data[k,0][i] == 'T':
energy += energy_matrix[0][1,i]
elif binned_data[k,0][i] == 'G':
energy += energy_matrix[0][2,i]
elif binned_data[k,0][i] == 'C':
energy += energy_matrix[0][3,i]
if energy < energy_matrix[1] :
czx[0,binned_data[k,2]] += 1
else:
czx[1,binned_data[k,2]] += 1
return czx
def likelihood(czx):
likelihood_numerator = 0
for j in range(shape(czx)[0]):
for k in range(shape(czx)[1]):
likelihood_numerator += math.lgamma(czx[j,k]+1)
likelihood_denominator = 0
for j in range(shape(czx)[0]):
likelihood_denominator += math.lgamma(31+sum(czx[j,:]))
print likelihood_numerator
print likelihood_numerator
return likelihood_numerator - likelihood_denominator
def matrix_perturbation(matrix):
choose = numpy.random.choice(3)
perturbed = copy(matrix[0])
if choose == 0:
x = numpy.random.choice(4)
y = numpy.random.choice(8)
perturbed[x,y] = perturbed[x,y]+numpy.random.normal(0.0,.1)
return perturbed, matrix[1]
elif choose == 1:
mu2=matrix[1]+numpy.random.normal(0.0,.1)
return perturbed, mu2
elif choose == 2:
temp = numpy.random.normal(0.0,.1)
mu2=matrix[1]+temp
temp2 = numpy.random.choice(8)
for k in range(4):
perturbed[k,temp2] = perturbed[k,temp2] + temp
return perturbed, mu2
def matrix_normalisation(energy_matrix_preturbed):
temp = copy(energy_matrix_preturbed[0])
x= 0
for i in range(8):
x+=min(temp[:,i])
for i in range(8):
for j in range(4):
energy_matrix_preturbed[0][j,i] = (energy_matrix_preturbed[0][j,i] - min(temp[:,i]))/(energy_matrix_preturbed[1]-x)
energy_matrix_preturbed[1] = 1.0
return copy(energy_matrix_preturbed)
def seed_matrix(initial_sequences):
""" Takes a set of squences and returns a matrix with starting energies for matrix inference using
kinney et al method
"""
matrix = zeros((4,8))
for eightmere in initial_sequences:
for base in range(len(eightmere[0])):
if 'A' in eightmere[0][base]:
matrix[0,base]=1+matrix[0,base]
elif 'T' in eightmere[0][base]:
matrix[1,base]=1+matrix[1,base]
elif 'G' in eightmere[0][base]:
matrix[2,base]=1+matrix[2,base]
elif 'C' in eightmere[0][base]:
matrix[3,base]=1+matrix[3,base]
for i in range(matrix.shape[1]):
matrix[:,i]= matrix[:,i]/sum(matrix[:,i])
matrix[:,i]= matrix[:,i]/max(matrix[:,i])
matrix[:,i] = abs(matrix[:,i]-max(matrix[:,i]))
return matrix
cca1_pbm_sorted=cca1_pbm.sort_values('E-score')
cca1_pbm_sorted['bin'] = np.zeros(len(cca1_pbm_sorted.index))
cca1_pbm_sorted.head()
cca1_pbm_sorted_array = asarray(cca1_pbm_sorted)
cca1_pbm_sorted_array = cca1_pbm_sorted_array[:,[0,2,5]]
cca1_pbm_sorted_array[10:100]
len(cca1_pbm_sorted_array)
16512
bin_counter = 0
counter = 0
for i in range(shape(cca1_pbm_sorted_array)[0]):
cca1_pbm_sorted_array[i,2] = bin_counter
if counter == 43:
bin_counter = bin_counter + 1
counter = 0
counter = counter + 1
#figure(figsize=(10,6))
hist(cca1_pbm_sorted_array[:,1], bins=10)
title("E-score", fontsize=20)
xticks(fontsize=20)
yticks(fontsize=20)
#savefig("CCA1-escore.pdf", format='pdf', dpi=600)
initial_matrix= cca1_pbm_sorted_array[-10:]
s = seed_matrix(initial_matrix)
for i in range(shape(s)[0]):
for j in range(shape(s)[1]):
print s[i][j]
print "\n"
0.0 0.0 0.0 0.0 0.0 0.4 1.0 0.0 1.0 1.0 0.7142857142857142 0.0 0.0 0.0 0.0 0.0 1.0 0.75 0.8571428571428571 1.0 1.0 1.0 1.0 0.6666666666666666 0.8888888888888888 1.0 1.0 1.0 1.0 0.6 0.33333333333333326 0.0
ensemble = []
print seed_matrix(initial_matrix)
energy_matrix_preturbed = (seed_matrix(initial_matrix),3.)
steps = 10000000
chain = []
acceptance = 0
rejection = 0
M1_predictions = X_i_predictions(cca1_pbm_sorted_array,matrix_perturbation(energy_matrix_preturbed))
L1 = likelihood(M1_predictions)
#print energy_matrix_preturbed
[[0. 0. 0. 0. 0. 0.4 1. 0. ] [1. 1. 0.71428571 0. 0. 0. 0. 0. ] [1. 0.75 0.85714286 1. 1. 1. 1. 0.66666667] [0.88888889 1. 1. 1. 1. 0.6 0.33333333 0. ]] 39174.1084414 39174.1084414
for i in range(steps):
energy_matrix_preturbed2 = matrix_perturbation(energy_matrix_preturbed)
if energy_matrix_preturbed2[1] >= 0.0 and energy_matrix_preturbed2[1] <= 8. and energy_matrix_preturbed2[0].min() >= 0. and energy_matrix_preturbed2[0].max() <=1. :
M2_predictions = X_i_predictions(cca1_pbm_sorted_array,energy_matrix_preturbed2)
L2 = likelihood(M2_predictions)
#print L2-L1
if numpy.random.binomial(1,min(1.,exp(L2-L1))):
#print energy_matrix_preturbed2
if i % 5 == 0:
print 'Accepted '+str(len(ensemble))
print 'Ratio '+ str(acceptance/(rejection+acceptance))
L1 = L2
energy_matrix_preturbed = energy_matrix_preturbed2
chain.append(L2)
acceptance +=1.
ensemble.append(copy(energy_matrix_preturbed2))
#print "MCMC steps " + str(len(chain))
else:
#print 'Rejected '+str(i)
rejection +=1.
plot(chain[:])
[<matplotlib.lines.Line2D at 0x12c36f410>]
len(ensemble)
8
parameter = []
for i in range(len(ensemble)):
parameter.append(ensemble[i][0][1,4])
plot(parameter,'o-')
[<matplotlib.lines.Line2D at 0x11dbc8d10>]
figure(figsize=(10,5))
imshow(ensemble[-1][0], interpolation='none')
yticks([])
xticks(fontsize=22)
#savefig('matrix.png', dpi=600, format='png')
(array([-1., 0., 1., 2., 3., 4., 5., 6., 7., 8.]), <a list of 10 Text xticklabel objects>)
figure(figsize=(20,20))
M1_predictions = X_i_predictions(cca1_pbm_sorted_array,ensemble[-1])
imshow(M1_predictions, interpolation='none')
yticks([])
xticks(fontsize=22)
(array([-50., 0., 50., 100., 150., 200., 250., 300., 350., 400.]), <a list of 10 Text xticklabel objects>)
#save('cca1_ensemble.npy',ensemble)
ensemble = np.load('ensemble.bp.npy')
ensemble[-1]
array([array([[0.04215856, 0.22217542, 0.03524763, 0.08532475, 0.22279823, 0.04915916, 0.20279689, 0.14647833], [0.04046658, 0.02910011, 0.14568986, 0.03956605, 0.24706214, 0.07064368, 0.19785632, 0.15683305], [0.90240634, 0.88073234, 0.95954551, 0.97240876, 0.98444506, 0.73813593, 0.86855906, 0.83129392], [0.89654698, 0.96924305, 0.77918178, 0.77211724, 0.90425981, 0.74640472, 0.88216585, 0.83982523]]), 3.7147668695068337], dtype=object)
ensemble[-1]
array([ array([[ 0.04215856, 0.22217542, 0.03524763, 0.08532475, 0.22279823, 0.04915916, 0.20279689, 0.14647833], [ 0.04046658, 0.02910011, 0.14568986, 0.03956605, 0.24706214, 0.07064368, 0.19785632, 0.15683305], [ 0.90240634, 0.88073234, 0.95954551, 0.97240876, 0.98444506, 0.73813593, 0.86855906, 0.83129392], [ 0.89654698, 0.96924305, 0.77918178, 0.77211724, 0.90425981, 0.74640472, 0.88216585, 0.83982523]]), 3.7147668695068337], dtype=object)
normalised_ensemble = []
for i in ensemble:
normalised_ensemble.append((matrix_normalisation(i)[0],matrix_normalisation(i)[1]))
parameter = []
parameter2 = []
for i in ensemble[90000:len(ensemble)]:
parameter.append(i[0][0,7])
parameter2.append(i[0][1,7])
plot(parameter,parameter2,'o-')
[<matplotlib.lines.Line2D at 0x1097cfa50>]
with open ("TOC1_upstream.txt", "r") as myfile:
data=myfile.read().replace('\n', '')
with open ("NC_003076.8[25252009..25252439].fa", "r") as myfile:
data2=myfile.read().replace('\n', '')
!pip install biopython==1.68
Collecting Biopython==1.68 Downloading https://files.pythonhosted.org/packages/72/6c/e1e13b9df73f9c2539b67d12bc22be6b19779230cadbed04c24f3f3e5ef4/biopython-1.68.tar.gz (14.4MB) 100% |################################| 14.4MB 23kB/s Building wheels for collected packages: Biopython Running setup.py bdist_wheel for Biopython ... done Stored in directory: /root/.cache/pip/wheels/7a/c8/ed/fc0668ec5b5037afd61e05f82a3866376365c78ebf55aed195 Successfully built Biopython Installing collected packages: Biopython Successfully installed Biopython-1.68
from Bio.Seq import *
TOC1_uppstream = Seq(data, alphabet=IUPAC.IUPACUnambiguousDNA())
TOC1_uppstream
Seq('CATATTTTTCGTAATAAGAATCAATGGGACAAAGAGAACAACAAAAGAATCTCA...ATC', IUPACUnambiguousDNA())
TOC1_uppstream_RC = TOC1_uppstream.complement()
TOC1_uppstream_RC
Seq('GTATAAAAAGCATTATTCTTAGTTACCCTGTTTCTCTTGTTGTTTTCTTAGAGT...TAG', IUPACUnambiguousDNA())
TOC1 = str(TOC1_uppstream_RC)
def free_energy(seq,energy_matrix):
"""The function gives choose is the sequence is bound by the TF given a energy matrix model.
"""
energy = 0
for i in range(8):
if seq[i] == 'A':
energy += energy_matrix[0][0,i]
elif seq[i] == 'T':
energy += energy_matrix[0][1,i]
elif seq[i] == 'G':
energy += energy_matrix[0][2,i]
elif seq[i] == 'C':
energy += energy_matrix[0][3,i]
if energy < energy_matrix[1] :
return energy
else:
return energy
seq = 'AAAAATCT'
print 'CBS '+str(free_energy(seq,normalised_ensemble[-1]))
seq = 'AAATATCT'
print 'EE '+str(free_energy(seq,normalised_ensemble[-1]))
seq = 'ATATAGCG'
print 'EE_mutant '+str(free_energy(seq,normalised_ensemble[-1]))
seq = 'AAATCGAG'
print 'CCR2_mutant '+str(free_energy(seq,normalised_ensemble[-1]))
CBS 0.3338934682820296 EE 0.31271296138796245 EE_mutant 0.7083813720428789 CCR2_mutant 0.7657975659409175
wtCCR2_for = 'GAGGTCAAACCTAGAAAATATCTAAACCTTGAAACCTAG'
wtCCR2_rev = 'CTAGGTTTCAAGGTTTAGATATTTTCTAGGTTTGACCTC'
wtPRR9_for= 'CGATCACAACCACGAAAATATCTTCTCAGAGAAAGAAGA'
wtPRR9_rev= 'TCTTCTTTCTCTGAGAAGATATTTTCGTGGTTGTGATCG'
CBS_PRR9_for='CGATCACAACCACGAAAAAATCTTCTCAGAGAAAGAAGA'
CBS_PRR9_rev='TCTTCTTTCTCTGAGAAGATTTTTTCGTGGTTGTGATCG'
mutCCR2_for = 'GAGGTCAAACCTAGAAAATCGAGAAACCTTGAAACCTAG'
mutCCR2_rev = 'CTAGGTTTCAAGGTTTCTCGATTTTCTAGGTTTGACCTC'
mutPRR9_for = 'CGATCACAACCACGAAAATCGAGTCTCAGAGAAAGAAGA'
mutPRR9_rev = 'TCTTCTTTCTCTGAGACTCGATTTTCGTGGTTGTGATCG'
figure(dpi=600)
prediction = []
for i in range(len(wtCCR2_for)-8):
prediction.append(free_energy(wtCCR2_for[0+i:8+i],normalised_ensemble[-1]))
ylabel("Normalised Engery", fontsize=22)
title("CCR2 WT", fontsize=22)
xlabel("bp position, window=8 bp", fontsize=22)
ylim(0,1.5)
plot(prediction, '-o',lw=2)
savefig("CCA1_EMA_binding_prediction_CCR2.pdf", format="pdf", dpi=600)
savefig("CCA1_EMA_binding_prediction_CCR2.svg", format="svg", dpi=600)
savefig("CCA1_EMA_binding_prediction_CCR2.png", format="png", dpi=600)
figure(dpi=600)
prediction = []
for i in range(len(mutCCR2_for)-8):
prediction.append(free_energy(mutCCR2_for[0+i:8+i],normalised_ensemble[-1]))
ylabel("Normalised Engery", fontsize=22)
title("CCR2 mut", fontsize=22)
xlabel("bp position, window=8 bp", fontsize=22)
ylim(0,1.5)
plot(prediction, '-o',lw=2)
savefig("CCA1_EMA_binding_prediction_CCR2_mut.pdf", format="pdf", dpi=600)
savefig("CCA1_EMA_binding_prediction_CCR2_mut.svg", format="svg", dpi=600)
savefig("CCA1_EMA_binding_prediction_CCR2_mut.png", format="png", dpi=600)
figure(dpi=600)
prediction1 = []
for i in range(len(wtCCR2_for)-8):
prediction1.append(free_energy(wtCCR2_for[0+i:8+i],normalised_ensemble[-1]))
print("CCR2wt=",sum(prediction1))
plot(prediction1, '-o',lw=2, label="WT")
prediction2 = []
for i in range(len(mutCCR2_for)-8):
prediction2.append(free_energy(mutCCR2_for[0+i:8+i],normalised_ensemble[-1]))
print("CCR2mut=",sum(prediction2))
plot(prediction2, '-o',lw=2, color="orange", label="mut")
ylabel("Normalised Engery", fontsize=22)
title("CCR2", fontsize=22)
xlabel("bp position, window=8 bp", fontsize=22)
ylim(0,1.5)
yticks(fontsize=18)
xticks(fontsize=18)
legend(loc="lower left", fontsize=17)
tight_layout()
savefig("CCA1_EMA_binding_prediction_CCR2wtvsmut.pdf", format="pdf", dpi=600, transparent=True)
savefig("CCA1_EMA_binding_prediction_CCR2wtvsmut.svg", format="svg", dpi=600, transparent=True)
savefig("CCA1_EMA_binding_prediction_CCR2wtvsmut.png", format="png", dpi=600, transparent=True)
('CCR2wt=', 21.692248736875847) ('CCR2mut=', 25.909194150930727)
log(2.5)
log(10)
2.302585092994046
figure(dpi=600)
prediction = []
for i in range(len(wtCCR2_rev)-8):
prediction.append(free_energy(wtCCR2_rev[0+i:8+i],normalised_ensemble[-1]))
ylabel("Normalised Engery", fontsize=22)
title("CCR2 rev WT", fontsize=22)
xlabel("bp position, window=8 bp", fontsize=22)
ylim(0,1.5)
plot(prediction, '-o',lw=2)
savefig("CCA1_EMA_binding_prediction_CCR2_rev.pdf", format="pdf", dpi=600)
savefig("CCA1_EMA_binding_prediction_CCR2_rev.svg", format="svg", dpi=600)
savefig("CCA1_EMA_binding_prediction_CCR2_rev.png", format="png", dpi=600)
figure(dpi=600)
prediction = []
for i in range(len(wtPRR9_for)-8):
prediction.append(free_energy(wtPRR9_for[0+i:8+i],normalised_ensemble[-1]))
ylabel("Normalised Engery", fontsize=22)
title("PRR9 WT", fontsize=22)
xlabel("bp position, window=8 bp", fontsize=22)
ylim(0,1.5)
plot(prediction, '-o',lw=2)
savefig("CCA1_EMA_binding_prediction_PRR9.pdf", format="pdf", dpi=600)
savefig("CCA1_EMA_binding_prediction_PRR9.svg", format="svg", dpi=600)
savefig("CCA1_EMA_binding_prediction_PRR9.png", format="png", dpi=600)
figure(dpi=600)
prediction1 = []
for i in range(len(wtPRR9_for)-8):
prediction1.append(free_energy(wtPRR9_for[0+i:8+i],normalised_ensemble[-1]))
print("PRR9wt=",sum(prediction1))
plot(prediction1, '-o',lw=2, label="WT")
prediction2 = []
for i in range(len(mutPRR9_for)-8):
prediction2.append(free_energy(mutPRR9_for[0+i:8+i],normalised_ensemble[-1]))
plot(prediction2, '-o',lw=2, color="orange", label="mut")
print("PRR9mut=",sum(prediction2))
ylabel("Normalised Engery", fontsize=22)
title("PRR9", fontsize=22)
xlabel("bp position, window=8 bp", fontsize=22)
ylim(0,1.5)
legend(loc="lower left", fontsize=17)
yticks(fontsize=18)
xticks(fontsize=18)
tight_layout()
savefig("CCA1_EMA_binding_prediction_PRR9wtvsmut.pdf", format="pdf", dpi=600, transparent=True)
savefig("CCA1_EMA_binding_prediction_PRR9wtvsmut.svg", format="svg", dpi=600, transparent=True)
savefig("CCA1_EMA_binding_prediction_PRR9wtvsmut.png", format="png", dpi=600, transparent=True)
('PRR9wt=', 25.30645981652274) ('PRR9mut=', 29.52340523057762)
figure(dpi=600)
prediction = []
for i in range(len(CBS_PRR9_for)-8):
prediction.append(free_energy(CBS_PRR9_for[0+i:8+i],normalised_ensemble[-1]))
print("PRR9CBS=",sum(prediction))
ylabel("Normalised Engery", fontsize=22)
title("CBS PRR9 WT", fontsize=22)
xlabel("bp position, window=8 bp", fontsize=22)
ylim(0,1.5)
plot(prediction, '-o',lw=2)
savefig("CCA1_EMA_binding_prediction_CBS.pdf", format="pdf", dpi=600)
savefig("CCA1_EMA_binding_prediction_CBS.svg", format="svg", dpi=600)
savefig("CCA1_EMA_binding_prediction_CBS.png", format="png", dpi=600)
('PRR9CBS=', 25.34766627346645)