from matplotlib.pyplot import *
from numpy import average,std,polyfit,poly1d
import numpy as np
from scipy import interpolate
from scipy.stats import pearsonr, spearmanr, sem,mannwhitneyu, ks_2samp, wilcoxon
import pandas as pd
import os
from seatongraf_utility_functions import normalise_mean, interpolate_integrate

##############################################################################
# Load and process datasets
##############################################################################

#Load proteomics data
protein_data = pd.read_excel('./Table EV1, Quantitiative proteomics dataset.xlsx',
                             index_col=0)

photoperiod_1 = 6  #SPP
photoperiod_2 = 18  #LPP
# Names of columns to get average protein abundances from:
idx1 = 'mean_{}h'.format(photoperiod_1)
idx2 = 'mean_{}h'.format(photoperiod_2)
protein_data['Measured change (LPP-SPP)'] = (protein_data[idx2] - protein_data[idx1])/0.5/(protein_data[idx2] + protein_data[idx1])


#Load transcript data
transcript_data = pd.read_csv('./DIURNAL_LDHH_ST.txt',sep='\t',index_col=1)

# Calculate amplitudes
timepoints = ['ZT{}'.format(x) for x in range(0,45,4)]
transcript_data['Amplitude'] = transcript_data.loc[:, timepoints].apply(lambda x: max(normalise_mean(x)), axis=1)

# compress transcript data (average timeseries) and duplicate end point, 
# replacing previous data
first_day_timepoints = ['ZT{}'.format(x) for x in range(0,21,4)]
second_day_timepoints = ['ZT{}'.format(x) for x in range(24,45,4)]
transcript_data.loc[:, first_day_timepoints] = (transcript_data.loc[:, first_day_timepoints].values + 
                                                transcript_data.loc[:, second_day_timepoints].values)/2
transcript_data.drop(second_day_timepoints,axis=1,inplace=True)
transcript_data['ZT24'] = transcript_data['ZT0']

transcript_data = transcript_data[~transcript_data.index.duplicated(keep='last')]

# now we only have timepoints covering one day, from ZT0 to ZT24 inclusive
timepoints = [idx*4.0 for idx in range(7)] #numerical values
timepoints_column_names = ['ZT{}'.format(int(x)) for x in timepoints] #for retreival from dataframe


##############################################################################
# Select genes
##############################################################################

# select rhythmic genes, according to amplitude and self correlation (i.e. 
# correlation of gene expression across successive days).
corr_threshold = 0.9
amp_threshold = 1.7

#phase and transcript data
transcript_data = transcript_data.query('Correlation>@corr_threshold & Amplitude>@amp_threshold')

#set of common genes (rhythmic transcripts with available protein measurements)
gene_list = list( set(protein_data.index) & set(transcript_data.index))

protein_data = protein_data.loc[gene_list,:]
transcript_data = transcript_data.loc[gene_list,:]

##############################################################################
# Modelling analysis
##############################################################################

light_factor = 3.5 #from Pal2013, page 1256

dilution_factor_1 = (photoperiod_1*light_factor+(24-photoperiod_1))
dilution_factor_2 = (photoperiod_2*light_factor+(24-photoperiod_2))
protein_data['Modelled change (LPP-SPP)'] = np.nan
for gene_name in gene_list:
    transcript_timeseries = transcript_data.loc[gene_name,timepoints_column_names].tolist()
    protein_modelled_photoperiod_1 = 1/dilution_factor_1*(light_factor*interpolate_integrate(timepoints,transcript_timeseries,0,photoperiod_1) + interpolate_integrate(timepoints,transcript_timeseries,photoperiod_1,24))
    protein_modelled_photoperiod_2 = 1/dilution_factor_2*(light_factor*interpolate_integrate(timepoints,transcript_timeseries,0,photoperiod_2) + interpolate_integrate(timepoints,transcript_timeseries,photoperiod_2,24))
    protein_data.loc[gene_name,'Modelled change (LPP-SPP)'] = (protein_modelled_photoperiod_2-protein_modelled_photoperiod_1)/0.5/(protein_modelled_photoperiod_1+protein_modelled_photoperiod_2)


# Bin and plot the data    
protein_reg_est_bins = dict()
bin_size = 0.1
for gene_name in gene_list:
    bin_address_determinant = protein_data.loc[gene_name,'Modelled change (LPP-SPP)']
    to_be_binned = protein_data.loc[gene_name,'Measured change (LPP-SPP)']
    try:
        protein_reg_est_bins[bin_address_determinant//bin_size].append(to_be_binned)
    except KeyError:
        # there is no such bin yet, so we need to create it
        protein_reg_est_bins[bin_address_determinant//bin_size] = [to_be_binned]


x_data = protein_data.loc[gene_list, 'Modelled change (LPP-SPP)',].tolist()
y_data = protein_data.loc[gene_list, 'Measured change (LPP-SPP)'].tolist()
P_R,P_p = pearsonr(x_data,y_data)
S_R,S_p = spearmanr(x_data,y_data)
print 'Pearson\'s R = {}, p = {}'.format(P_R,P_p)
fit = polyfit(x_data, y_data, 1)
fit_fn = poly1d(fit)
print 'slope = {}, intercept = {}'.format(fit[0], fit[1])


##############################################################################
# Plot figures
##############################################################################

out_directory = './'

xlower = -0.4
xupper = 0.4
ylower = -0.7
yupper = 0.7
FS = 18
MS = 10
LW = 2.3

fig=figure(figsize=(6,5))
ax = subplot(111)
boxplot(protein_reg_est_bins.values(),positions=[bin_size/2.0 + x*bin_size for x in protein_reg_est_bins.keys()],widths=0.05,notch=False)
plot([xlower,xupper], fit_fn([xlower,xupper]), '-k')
plot([xlower,xupper], [xlower,xupper], '--k')
xlabel('Predicted regulation',fontsize=FS)
ylabel('Measured regulation',fontsize=FS)
xticks(np.arange(-0.4,0.41,0.2),[-0.4,-0.2,0,0.2,0.4],fontsize=FS-2)
yticks([-0.6+x*0.2 for x in range(9)],fontsize=FS-2)

ylim([ylower,yupper])
xlim([xlower,xupper])

tight_layout()

fig.savefig(os.path.join(out_directory,'seatongraf_translational_coincidence_model_comparison.svg'),transparent=True,format='svg')