!pip install seaborn
!pip install biopython==1.76

Collecting seaborn
  Downloading https://files.pythonhosted.org/packages/b2/86/43b8c9138ef4c2a1c492fee92792c83c13799d0e2061ff810d3826d06cd1/seaborn-0.9.1-py2.py3-none-any.whl (216kB)
    100% |################################| 225kB 1.4MB/s 
Requirement already satisfied: scipy>=0.17.1 in /usr/local/lib/python2.7/dist-packages (from seaborn)
Requirement already satisfied: pandas>=0.17.1 in /usr/local/lib/python2.7/dist-packages (from seaborn)
Requirement already satisfied: matplotlib>=1.5.3 in /usr/local/lib/python2.7/dist-packages (from seaborn)
Requirement already satisfied: numpy>=1.10.4 in /usr/local/lib/python2.7/dist-packages (from seaborn)
Requirement already satisfied: pytz>=2011k in /usr/local/lib/python2.7/dist-packages (from pandas>=0.17.1->seaborn)
Requirement already satisfied: python-dateutil>=2.5.0 in /usr/local/lib/python2.7/dist-packages (from pandas>=0.17.1->seaborn)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python2.7/dist-packages (from matplotlib>=1.5.3->seaborn)
Requirement already satisfied: backports.functools-lru-cache in /usr/local/lib/python2.7/dist-packages (from matplotlib>=1.5.3->seaborn)
Requirement already satisfied: subprocess32 in /usr/local/lib/python2.7/dist-packages (from matplotlib>=1.5.3->seaborn)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python2.7/dist-packages (from matplotlib>=1.5.3->seaborn)
Requirement already satisfied: six>=1.10 in /usr/lib/python2.7/dist-packages (from matplotlib>=1.5.3->seaborn)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python2.7/dist-packages (from matplotlib>=1.5.3->seaborn)
Requirement already satisfied: setuptools in /usr/lib/python2.7/dist-packages (from kiwisolver>=1.0.1->matplotlib>=1.5.3->seaborn)
Installing collected packages: seaborn
Successfully installed seaborn-0.9.1
Collecting biopython==1.76
  Downloading https://files.pythonhosted.org/packages/ff/f4/0ce39bebcbb0ff619426f2bbe86e60bc549ace318c5a9113ae480ab2adc7/biopython-1.76.tar.gz (16.3MB)
    100% |################################| 16.3MB 15kB/s 
Requirement already satisfied: numpy in /usr/local/lib/python2.7/dist-packages (from biopython==1.76)
Building wheels for collected packages: biopython
  Running setup.py bdist_wheel for biopython ... done
  Stored in directory: /root/.cache/pip/wheels/70/34/e9/b3eebf55f8e404acb8717a80830011bc3e88e881b895e1ec1c
Successfully built biopython
Installing collected packages: biopython
Successfully installed biopython-1.76


%pylab inline
import seaborn as sns
import os
import pickle
import pandas
from scipy.stats import linregress
from Bio import SeqIO, Seq
from scipy.stats import norm

Populating the interactive namespace from numpy and matplotlib


LUX_pandas = array(pandas.read_excel('PBMS/LUX_BINDING_PBM.xls'))


#figure(figsize=(15,7))
subplot(1,2,1)
hist(LUX_pandas[:,5], bins=50)
subplot(1,2,2)
hist(LUX_pandas[:,9], bins=50)


LUX_pandas = pandas.read_excel('PBMS/LUX_BINDING_PBM.xls')
LUX_pandas=LUX_pandas.sort_values(['Design_1_average'], ascending=[0])
LUX_pandas.head()


Seq.Seq('ATGATGTCTTCTCAAGATTCGATAAAAATGGTGTTG')


seq = Seq.Seq('ATGATGTCTTCTCAAGATTCGATAAAAATGGTGTTG')


seq = Seq.Seq('AGATACGC')


hits = {}
for idx, i in enumerate(LUX_pandas['8-mer']):
    for l in range(0,len(seq)-8):
        if i in seq[0+l:8+l]:
            hits[l] = LUX_pandas.iloc[idx,5]
            print l, i, LUX_pandas.iloc[idx,5]


a = array([36.7,6.5,14,17,43])
b = array([0.456846659,0.484437756,0.470610759,0.293687946,0.138534885])
linear_regession = linregress(log(a),b)
linear_regession

LinregressResult(slope=-0.11748937527465714, intercept=0.7144315156534644, rvalue=-0.6006953984442779, pvalue=0.2840488838578199, stderr=0.09027971259048806)


linear_regession.rvalue**2

0.36083496171212975


#figure(figsize=(7*2,5*2))
#The slope come from infering it in CCA1
slope = -0.05757766811159118
escore = 0.456846659
escore2 = (0.484437756,6.5)
escore3=(0.470610759,14)
escore4=(0.293687946,17)
escore5=(0.138534885,43)

kd= 36.7
intercept = escore-slope*log(kd)
xticks(fontsize=25)
yticks(fontsize=25)
plot(linspace(0,8), slope*linspace(0,8)+intercept, 'green', lw=5,label="with CCA1 slope")
plot(linspace(0,8), linear_regession.slope*linspace(0,8)+linear_regession.intercept,"black", lw=5,label="inferred with new data")
plot(log(36.7),escore, 'bo',markersize=10)
plot(log(escore2[1]),escore2[0],'ro',markersize=10)
plot(log(escore3[1]),escore3[0],'ro',markersize=10)
plot(log(escore4[1]),escore4[0],'ro',markersize=10)
plot(log(escore5[1]),escore5[0],'ro',markersize=10)
xlabel('log(Kd)', fontsize=30)
ylabel('E-score', fontsize=30)
ylim(-0.5,1.5)
tight_layout()

legend(loc="upper right", fontsize=18)
savefig('LUX_regresion_new_data_myb.png',format='png',dpi=600)
savefig('LUX_regresion_new_data_myb.pdf',format='pdf',dpi=600)
savefig('LUX_regresion_new_data_myb.svg',format='svg',dpi=600)
#savefig('LUX_regresion.png',format='png',dpi=300)


a = array([93,98,93,118,117])
b = array([0.456846659,0.484437756,0.470610759,0.293687946,0.138534885])


linear_regession = linregress(log(a),b)


figure(figsize=(8,6))
#The slope come from infering it in CCA1
slope = -0.074770270470347427
escore = 0.456846659
escore2 = (0.484437756,98)
escore3=(0.470610759,93)
escore4=(0.293687946,118)
escore5=(0.138534885,178)

kd= 93
intercept = escore-slope*log(kd)
xticks(fontsize=16)
yticks(fontsize=16)
#plot(linspace(0,10), slope*linspace(0,8)+intercept, 'black')
plot(linspace(0,8), linear_regession.slope*linspace(0,8)+linear_regession.intercept,"black")
plot(log(36.7),escore, 'ro')
plot(log(escore2[1]),escore2[0],'ro')
plot(log(escore3[1]),escore3[0],'ro')
plot(log(escore4[1]),escore4[0],'ro')
plot(log(escore5[1]),escore5[0],'ro')
xlabel('log(Kd)', fontsize=22)
ylabel('E-score', fontsize=22)
ylim(-0.5,1.5)
tight_layout()
#savefig('LUX_regresion_new_data.png',format='png',dpi=600)
#savefig('LUX_regresion_new_data.pdf',format='pdf',dpi=600)
#savefig('LUX_regresion_new_data.svg',format='svg',dpi=600)


genome_kds_for = {}
genome_kds_rev = {}
for i in asarray(LUX_pandas):
    genome_kds_for[i[0]]=((i[5]-intercept)/slope)
    genome_kds_rev[i[1]]=((i[5]-intercept)/slope)


clockgenes = {'CCA1' : 'AT2G46830', 'LHY': 'AT1G01060', 
              'TOC1' : 'AT5G61380', 'GI':'AT1G22770',
              'PRR7':'AT5G02810', 'PRR9':'AT2G46790',
              'PRR5':'AT5G24470', 'LUX':'AT3G46640',
              'ELF3':'AT2G25930', 'ELF4':'AT2G40080',
              'ZTL':'AT5G57360'}


chromosome_dict = {'AT1':'NC_003070.gbk','AT2':'NC_003071.gbk','AT3':'NC_003074.gbk','AT4':'NC_003075.gbk','AT5':'NC_003076.gbk'}
chromosomes_seq={}
chromosomes_genes={}
chromosome_concatenated = 'A'
for chromosome in os.listdir('ATGenome/'):
    print 'Processing chromosome ', chromosome
    genes={}
    chromosomes_seq[chromosome] = SeqIO.read('ATGenome/'+chromosome, 'genbank')
    chromosome_concatenated = chromosome_concatenated+chromosomes_seq[chromosome].seq
    
    for f in chromosomes_seq[chromosome].features:
        if f.type == 'gene':
            genes[f.qualifiers['locus_tag'][0]] = f
    chromosomes_genes[chromosome] = genes

Processing chromosome  NC_003070.gbk
Processing chromosome  NC_003071.gbk
Processing chromosome  NC_003074.gbk
Processing chromosome  NC_003075.gbk
Processing chromosome  NC_003076.gbk


##Create a data frame for the genes which can then be sorted by chromosome position 
gene_id_data_frame=[]
gene_location_start_data_frame=[]
gene_location_end_data_frame=[]
gene_strand = []
for gene in chromosomes_genes[chromosome_dict["AT1"]].keys():
    gene_id_data_frame.append(gene)
    gene_location_start_data_frame.append(chromosomes_genes[chromosome_dict["AT1"]][gene].location.start.position)
    gene_location_end_data_frame.append(chromosomes_genes[chromosome_dict["AT1"]][gene].location.end.position)
    gene_strand.append(chromosomes_genes[chromosome_dict["AT1"]][gene].location.strand)
gene_locations = pandas.DataFrame({"1id":gene_id_data_frame,"1start":gene_location_start_data_frame,
                                 "2end":gene_location_end_data_frame,"3strand":gene_strand})
gene_locations = gene_locations.sort_values("1start")


promoters = {}
for idx, strand in enumerate(gene_locations.iloc[:,3]):
    if idx==0:
        if strand == 1:
            promoters[gene_locations.iloc[idx,0]]=([0,gene_locations.iloc[idx,1]])
    else:
        if strand == 1:
            promoters[gene_locations.iloc[idx,0]]=([gene_locations.iloc[idx-1,2],gene_locations.iloc[idx,1]])
        elif strand == -1:
            promoters[gene_locations.iloc[idx,0]]=([gene_locations.iloc[idx,2],gene_locations.iloc[idx+1,1]])


LUX_peaks = pandas.read_excel("41477_2017_BFnplants201787_MOESM3_ESM.xlsx", sheet_name="LUX_17")


Here we find the promoter that contains peaks of LUX at 17ºC


gene_and_its_peaks = {}
for idx, chro in enumerate(LUX_peaks.iloc[:,0]):
    if chro==1:
        peak = int((LUX_peaks.iloc[idx,1]+LUX_peaks.iloc[idx,2])/2)
        for promoter in promoters.keys():
            if peak >= promoters[promoter][0] and peak <= promoters[promoter][1]:
                gene_and_its_peaks[promoter] = []


for idx, chro in enumerate(LUX_peaks.iloc[:,0]):
    if chro==1:
        peak = int((LUX_peaks.iloc[idx,1]+LUX_peaks.iloc[idx,2])/2)
        for promoter in promoters.keys():
            if peak >= promoters[promoter][0] and peak <= promoters[promoter][1]:
                gene_and_its_peaks[promoter] = gene_and_its_peaks[promoter].append((LUX_peaks.iloc[idx,1],LUX_peaks.iloc[idx,2]))


f = chromosomes_genes[chromosome_dict["AT1"]]["AT1G06190"]

print(f.location.start)
print(f.location.end)

1892272
1894148
-1


promoter_length = 400
pos = range(0,len(chromosome_concatenated)-promoter_length)
random_regions = random.choice(pos,1000)


LUX_peaks = pandas.read_excel("41477_2017_BFnplants201787_MOESM3_ESM.xlsx", sheet_name="LUX_17")


LUX_peaks.head()


ensemble = load('PBM_matrix_inference/LUX_ensemble.npy')


def seq_energy(seq, energy_matrix):
    bound = {}
    for j in range(0,len(seq)-7):
            energy = 0
            for i in range(8):
                if seq[j+i] == 'A':
                    energy += energy_matrix[0][0,i]
                elif seq[j+i] == 'T':
                    energy += energy_matrix[0][1,i]
                elif seq[j+i] == 'G':
                    energy += energy_matrix[0][2,i]
                elif seq[j+i] == 'C':
                    energy += energy_matrix[0][3,i]
            if energy < energy_matrix[1] :
                bound[j] = 1
    return bound

def matrix_normalisation(energy_matrix_preturbed):
    temp = copy(energy_matrix_preturbed[0])
    x= 0
    for i in range(8):
        x+=min(temp[:,i]) 
    
    for i in range(8):
        for j in range(4):
            energy_matrix_preturbed[0][j,i] = (energy_matrix_preturbed[0][j,i] - min(temp[:,i]))/(energy_matrix_preturbed[1]-x)
    
    energy_matrix_preturbed[1] = 1.0
    return copy(energy_matrix_preturbed)


norm_mat = matrix_normalisation(ensemble[-20000])
boundf =  seq_energy(seq,norm_mat)
boundr =  seq_energy(seq.reverse_complement(),norm_mat)
print boundf
print boundr

{0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1, 22: 1, 23: 1, 24: 1, 25: 1, 26: 1, 27: 1}
{1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1, 22: 1, 24: 1, 25: 1, 26: 1, 27: 1, 28: 1}


def seq_energy_eightmere(seq, energy_matrix):
        energy = 0
        for i in range(8):
            if seq[i] == 'A':
                energy += energy_matrix[0][0,i]
            elif seq[i] == 'T':
                energy += energy_matrix[0][1,i]
            elif seq[i] == 'G':
                energy += energy_matrix[0][2,i]
            elif seq[i] == 'C':
                energy += energy_matrix[0][3,i]
        if energy < energy_matrix[1] :
            return 1
        else:
            return 0


CCA1_peaks = asanyarray(pandas.read_csv('CCA1peaks_Kamioka2016', delimiter='\t', header=None))


LUX_pandas = pandas.read_excel('PBMS/LUX_BINDING_PBM.xls')
LUX_pandas.head()


LUX_pandas = pandas.read_excel('PBMS/LUX_BINDING_PBM.xls')

bound_bin = []
unbound_bin =[]
for idx, s in enumerate(LUX_pandas['8-mer']):
    if seq_energy_eightmere(s,norm_mat):
        bound_bin.append(LUX_pandas.iloc[idx]['Design_1_average'])
    else:
        unbound_bin.append(LUX_pandas.iloc[idx]['Design_1_average'])


figure(figsize=(7*2,5*2))
hist(unbound_bin, alpha=0.5, label='Unbound', normed=False, bins=30)
hist(bound_bin, alpha=0.5, color='orange', label='Bound', normed=False, bins=30)
legend(loc='upper right', fontsize=22)
xticks(fontsize=22)
yticks(fontsize=22)
ylabel('Ocurrence', fontsize=22)
xlabel('E-score', fontsize=22)
title('LUX PBM', fontsize=30)
savefig('images/EMA_LUX_separation.pdf', format='pdf', dpi=300)
savefig('images/EMA_LUX_separation.png', format='png', dpi=600)
savefig('images/EMA_LUX_separation.svg', format='svg', dpi=600)


def affinity_calc_energy_mat(seq, genome_kds_for, genome_kds_rev, boundf,boundr):
    affinities = 0
    k = boundf.keys()
    for pos in k:
        try:
            affinities+=(1/exp(genome_kds_for[seq[pos:pos+8]]))
        except:
            pass
    k = boundr.keys()
    len_seq = len(seq)
    for pos in k:
        try:
            affinities+=(1/exp(genome_kds_rev[seq[abs(pos-len_seq):abs(pos-len_seq)+8]]))
        except:
            pass
    return affinities


def affinity_calc(seq, genome_kds_for, genome_kds_rev):
    affinities = 0
    for pos in range(len(seq)-8):
        try:
            affinities+=(1/exp(genome_kds_for[seq[pos:pos+8]]))
        except:
            pass
        try:
            affinities+=(1/exp(genome_kds_rev[seq[pos:pos+8]]))
        except:
            pass
    return affinities


affinities = {}
background = {}
gene_num = 863
for idx, gene in enumerate(CCA1_peaks[:,4]):
    seq = chromosomes_seq[chromosome_dict[gene[0:3]]].seq[CCA1_peaks[idx,1]:CCA1_peaks[idx,2]]
    norm_mat = matrix_normalisation(ensemble[-20000])
    boundf =  seq_energy(seq,norm_mat)
    boundr =  seq_energy(seq.reverse_complement(),norm_mat)
    affinities[gene] = affinity_calc_energy_mat(seq,genome_kds_for,genome_kds_rev,boundf,boundr)
    print gene, float(idx)/gene_num 
    promoter_length = CCA1_peaks[idx,2]-CCA1_peaks[idx,1]
    pos = range(0,len(chromosome_concatenated)-promoter_length)
    random_regions = random.choice(pos,1000)
    affinities_background = []
    for randome_region in random_regions:
        affinities_background.append(affinity_calc_energy_mat(chromosome_concatenated[randome_region:(randome_region+promoter_length)], 
                      genome_kds_for, genome_kds_rev, boundf,boundr))
    background[gene] = affinities_background


#import pickle
#with open('lux_background_with_matrix.pickle', 'wb') as handle:
#    pickle.dump(background, handle)
#with open('lux_affinities_with_matrix.pickle', 'wb') as handle:
#    pickle.dump(affinities, handle)


with open('lux_background_with_matrix_2.pickle', 'rb') as handle:
    background_matrix = pickle.load(handle)
with open('lux_affinities_with_matrix_2.pickle', 'rb') as handle:
    affinities_matrix = pickle.load(handle)


with open('lux_background_with_matrix.pickle', 'rb') as handle:
    background = pickle.load(handle)
with open('lux_affinities_with_matrix.pickle', 'rb') as handle:
    affinities = pickle.load(handle)


b_dist = array(background[clockgenes['TOC1']])[array(background[clockgenes['TOC1']]) > 0]
figure(figsize=(10,7))
hist(b_dist, bins=100, normed=True, label='Full E-scores')
#savefig('images/toc1_background_raw_matrix.pdf', format='pdf', dpi=300)
b_dist = array(background_matrix[clockgenes['TOC1']])[array(background_matrix[clockgenes['TOC1']]) > 0]
hist(b_dist, bins=100, normed=True, alpha=0.6, color='orange', label='EMA selected')
xlabel('Affinity ($1/K_d$)', fontsize=22)
xticks(fontsize=16)
yticks(fontsize=16)
ylabel('Frequency', fontsize=22)
legend(loc='upper right', fontsize=22)
#savefig('images/toc1_background_raw.pdf', format='pdf', dpi=300)

<matplotlib.legend.Legend at 0x25e0d7510>


figure(figsize=(10,7))
b_dist = array(background[clockgenes['TOC1']])[array(background[clockgenes['TOC1']]) > 0]
hist((log(b_dist)-mean(log(b_dist)))/std(log(b_dist)), bins=100, normed=True, label='Normed Ln(affinites)')
plot(linspace(-5,5,100),norm.pdf(linspace(-5,5,100),loc=0), lw=3, color='r', label='$N(0,1)$')
axvline((log(affinities[clockgenes['TOC1']])-mean(log(b_dist)))/std(log(b_dist)), color='g', lw=3, label='TOC1 z-score')
xticks(fontsize=16)
yticks(fontsize=16)
xlabel('z-score', fontsize=22)
ylabel('Frequency', fontsize=22)
legend(loc='upper left', fontsize=16)
#savefig('images/toc1_background_matrix.pdf',format='pdf',dpi=300)

<matplotlib.legend.Legend at 0x2600f6bd0>


figure(figsize=(10,7))
b_dist = array(background[clockgenes['PRR9']])[array(background[clockgenes['PRR9']]) > 0]
hist((log(b_dist)-mean(log(b_dist)))/std(log(b_dist)), bins=100, normed=True, label='Normed Ln(affinites)')
plot(linspace(-5,5,100),norm.pdf(linspace(-5,5,100),loc=0), lw=3, color='r', label='$N(0,1)$')
axvline((log(affinities[clockgenes['PRR9']])-mean(log(b_dist)))/std(log(b_dist)), color='g', lw=3, label='PRR9 z-score')
xticks(fontsize=16)
yticks(fontsize=16)
xlabel('z-score', fontsize=22)
ylabel('Frequency', fontsize=22)
legend(loc='upper left', fontsize=16)
#savefig('images/toc1_background_matrix.pdf',format='pdf',dpi=300)

<matplotlib.legend.Legend at 0x26d1b88d0>


figure(figsize=(10,7))
b_dist = array(background[clockgenes['PRR7']])[array(background[clockgenes['PRR7']]) > 0]
hist((log(b_dist)-mean(log(b_dist)))/std(log(b_dist)), bins=100, normed=True, label='Normed Ln(affinites)')
plot(linspace(-5,5,100),norm.pdf(linspace(-5,5,100),loc=0), lw=3, color='r', label='$N(0,1)$')
axvline((log(affinities[clockgenes['PRR7']])-mean(log(b_dist)))/std(log(b_dist)), color='g', lw=3, label='PRR7 z-score')
xticks(fontsize=16)
yticks(fontsize=16)
xlabel('z-score', fontsize=22)
ylabel('Frequency', fontsize=22)
legend(loc='upper left', fontsize=16)
#savefig('images/toc1_background_matrix.pdf',format='pdf',dpi=300)

<matplotlib.legend.Legend at 0x26d35f450>


figure(figsize=(10,7))
b_dist = array(background[clockgenes['PRR5']])[array(background[clockgenes['PRR5']]) > 0]
hist((log(b_dist)-mean(log(b_dist)))/std(log(b_dist)), bins=100, normed=True, label='Normed Ln(affinites)')
plot(linspace(-5,5,100),norm.pdf(linspace(-5,5,100),loc=0), lw=3, color='r', label='$N(0,1)$')
axvline((log(affinities[clockgenes['PRR5']])-mean(log(b_dist)))/std(log(b_dist)), color='g', lw=3, label='PRR5 z-score')
xticks(fontsize=16)
yticks(fontsize=16)
xlabel('z-score', fontsize=22)
ylabel('Frequency', fontsize=22)
legend(loc='upper left', fontsize=16)
#savefig('images/toc1_background_matrix.pdf',format='pdf',dpi=300)

<matplotlib.legend.Legend at 0x26954df50>


figure(figsize=(10,7))
b_dist = array(background[clockgenes['CCA1']])[array(background[clockgenes['CCA1']]) > 0]
hist((log(b_dist)-mean(log(b_dist)))/std(log(b_dist)), bins=100, normed=True, label='Normed Ln(affinites)')
plot(linspace(-5,5,100),norm.pdf(linspace(-5,5,100),loc=0), lw=3, color='r', label='$N(0,1)$')
axvline((log(affinities[clockgenes['CCA1']])-mean(log(b_dist)))/std(log(b_dist)), color='g', lw=3, label='CCA1 z-score')
xticks(fontsize=16)
yticks(fontsize=16)
xlabel('z-score', fontsize=22)
ylabel('Frequency', fontsize=22)
legend(loc='upper left', fontsize=16)
#savefig('images/toc1_background_matrix.pdf',format='pdf',dpi=300)

<matplotlib.legend.Legend at 0x26699f110>


figure(figsize=(10,7))
b_dist = array(background[clockgenes['GI']])[array(background[clockgenes['GI']]) > 0]
hist((log(b_dist)-mean(log(b_dist)))/std(log(b_dist)), bins=100, normed=True, label='Normed Ln(affinites)')
plot(linspace(-5,5,100),norm.pdf(linspace(-5,5,100),loc=0), lw=3, color='r', label='$N(0,1)$')
axvline((log(affinities[clockgenes['GI']])-mean(log(b_dist)))/std(log(b_dist)), color='g', lw=3, label='GI z-score')
xticks(fontsize=16)
yticks(fontsize=16)
xlabel('z-score', fontsize=22)
ylabel('Frequency', fontsize=22)
legend(loc='upper left', fontsize=16)
#savefig('images/toc1_background_matrix.pdf',format='pdf',dpi=300)

<matplotlib.legend.Legend at 0x25bf41d10>


figure(figsize=(10,7))
b_dist = array(background['AT5G28640'])[array(background['AT5G28640']) > 0]
hist((log(b_dist)-mean(log(b_dist)))/std(log(b_dist)), bins=100, normed=True, label='Normed Ln(affinites)')
plot(linspace(-5,5,100),norm.pdf(linspace(-5,5,100),loc=0), lw=3, color='r', label='$N(0,1)$')
axvline((log(affinities['AT5G28640'])-mean(log(b_dist)))/std(log(b_dist)), color='g', lw=3, label='AN3 z-score')
title('LUX binding', fontsize=22)
xticks(fontsize=16)
yticks(fontsize=16)
xlabel('z-score', fontsize=22)
ylabel('Frequency', fontsize=22)
legend(loc='upper left', fontsize=16)
savefig('images/AN3_lux_background_matrix.pdf',format='pdf',dpi=300)


figure(figsize=(10,7))
gene_name = 'ELF4'
b_dist = array(background[clockgenes[gene_name]])[array(background[clockgenes[gene_name]]) > 0]
hist((log(b_dist)-mean(log(b_dist)))/std(log(b_dist)), bins=100, normed=True, label='Normed Ln(affinites)')
plot(linspace(-5,5,100),norm.pdf(linspace(-5,5,100),loc=0), lw=3, color='r', label='$N(0,1)$')
axvline((log(affinities[clockgenes['ELF4']])-mean(log(b_dist)))/std(log(b_dist)), color='g', lw=3, label=gene_name+' z-score')
xticks(fontsize=16)
yticks(fontsize=16)
xlabel('z-score', fontsize=22)
ylabel('Frequency', fontsize=22)
legend(loc='upper left', fontsize=16)
#savefig('images/toc1_background_matrix.pdf',format='pdf',dpi=300)


z_dist = {}
for b in background.keys():
    abackground = array(background[b])
    abackground = abackground[abackground>0]
    z_dist[b]=(log(affinities[b])-mean(log(abackground)))/std(log(abackground))

    
z_dist_matrix = {}
for b in background.keys():
    abackground = array(background_matrix[b])
    abackground = abackground[abackground>0]
    z_dist_matrix[b]=(log(affinities_matrix[b])-mean(log(abackground)))/std(log(abackground))


figure(figsize=(10,7))
hist(z_dist.values(),bins=50, normed=True, label='Full E-scores')
for c in clockgenes.keys():
    try:
        #axvline(z_dist[clockgenes[c]],color='blue', lw=2)
        if z_dist[clockgenes[c]] < 1:
            print 'Less ',c,z_dist[clockgenes[c]]
        else:
            print 'More ',c,z_dist[clockgenes[c]]
    except:
        print 'Not in list ',c
        pass
xticks(fontsize=16)
yticks(fontsize=16)
xlabel('z-score', fontsize=22)
ylabel('Frequency', fontsize=22)

hist(z_dist_matrix.values(),bins=50, normed=True, color='orange', alpha=0.6, label = 'EMA selected')
for c in clockgenes.keys():
    try:
        #axvline(z_dist_matrix[clockgenes[c]],color='orange', lw=2)
        if z_dist_matrix[clockgenes[c]] < 1:
            print 'Less ',c,z_dist_matrix[clockgenes[c]]
        else:
            print 'More ',c,z_dist_matrix[clockgenes[c]]
    except:
        print 'Not in list ',c
        pass
xticks(fontsize=16)
yticks(fontsize=16)
xlabel('z-score', fontsize=22)
ylabel('Frequency', fontsize=22)

legend(loc='upper left', fontsize=22)
#savefig('images/z_scores_across_genome.pdf', format='pdf', dpi=300)

Not in list  ZTL
Less  LUX 0.994916702971
Not in list  ELF3
More  ELF4 1.73328920584
Less  GI 0.303650466338
More  PRR9 1.01610674353
Less  CCA1 -0.515151794548
More  PRR5 1.39268921025
More  PRR7 2.16444495771
Not in list  LHY
Less  TOC1 0.234697646675
Not in list  ZTL
More  LUX 1.45536989606
Not in list  ELF3
More  ELF4 2.22049396431
Less  GI 0.322936012696
More  PRR9 1.96657705897
Less  CCA1 0.203273432275
More  PRR5 1.7991233374
More  PRR7 3.10362113223
Not in list  LHY
Less  TOC1 0.130989170065

<matplotlib.legend.Legend at 0x270c83350>


print 'Gene\tKd\t\tz-val'
for cg in clockgenes.keys():
    try:
        print cg+'\t'+ str(1/affinities[clockgenes[cg]])+'\t'+str(z_dist[clockgenes[cg]])
    except:
        pass

Gene	Kd		z-val
LUX	1.81221231565	0.994916702971
ELF4	1.67726966106	1.73328920584
GI	2.4909602976	0.303650466338
PRR9	2.54456399207	1.01610674353
CCA1	3.54808502084	-0.515151794548
PRR5	2.8726197252	1.39268921025
PRR7	1.27235587464	2.16444495771
TOC1	3.17038777374	0.234697646675

	chr	start	stop
0	1	116576	117835
1	1	182044	182730
2	1	183069	183795
3	1	189790	190541
4	1	219331	221195

	8-mer	Reverse Complement	Design_1_Chamber_1	Design_1_Chamber_2	Design_1_Chamber_3	Design_1_average	Design_2_Chamber_1	Design_2_Chamber_2	Design_2_Chamber_3	Design_2_average
0	AAAAAAAA	TTTTTTTT	0.422272	0.428349	0.430615	0.427079	0.306605	0.320749	0.322564	0.316639
1	AAAAAAAC	GTTTTTTT	0.330570	0.325226	0.336900	0.330898	0.254036	0.209630	0.166244	0.209970
2	AAAAAAAG	CTTTTTTT	0.288988	0.272463	0.307813	0.289755	0.226867	0.229429	0.276126	0.244141
3	AAAAAAAT	ATTTTTTT	0.330156	0.350296	0.313538	0.331330	0.346196	0.345928	0.349245	0.347123
4	AAAAAACA	TGTTTTTT	0.310488	0.293285	0.259394	0.287722	0.262225	0.199524	0.271337	0.244362

Genomic Regulation by LUX¶

We want to filter the peaks for those that on the promoters of genes¶

This matrix was obtained by Error Model Average (EMA) Kinney JB et al 2007 PNAS coded in python by me¶

Analysis of affinity of LUX for clock genes¶