from matplotlib.pyplot import * from numpy import average,std,polyfit,poly1d import numpy as np from scipy import interpolate from scipy.stats import pearsonr, spearmanr, sem,mannwhitneyu, ks_2samp, wilcoxon import pandas as pd import os from seatongraf_utility_functions import normalise_mean, interpolate_integrate ############################################################################## # Load and process datasets ############################################################################## #Load proteomics data protein_data = pd.read_excel('./Table EV1, Quantitiative proteomics dataset.xlsx', index_col=0) photoperiod_1 = 6 #SPP photoperiod_2 = 18 #LPP # Names of columns to get average protein abundances from: idx1 = 'mean_{}h'.format(photoperiod_1) idx2 = 'mean_{}h'.format(photoperiod_2) protein_data['Measured change (LPP-SPP)'] = (protein_data[idx2] - protein_data[idx1])/0.5/(protein_data[idx2] + protein_data[idx1]) #Load transcript data transcript_data = pd.read_csv('./DIURNAL_LDHH_ST.txt',sep='\t',index_col=1) # Calculate amplitudes timepoints = ['ZT{}'.format(x) for x in range(0,45,4)] transcript_data['Amplitude'] = transcript_data.loc[:, timepoints].apply(lambda x: max(normalise_mean(x)), axis=1) # compress transcript data (average timeseries) and duplicate end point, # replacing previous data first_day_timepoints = ['ZT{}'.format(x) for x in range(0,21,4)] second_day_timepoints = ['ZT{}'.format(x) for x in range(24,45,4)] transcript_data.loc[:, first_day_timepoints] = (transcript_data.loc[:, first_day_timepoints].values + transcript_data.loc[:, second_day_timepoints].values)/2 transcript_data.drop(second_day_timepoints,axis=1,inplace=True) transcript_data['ZT24'] = transcript_data['ZT0'] transcript_data = transcript_data[~transcript_data.index.duplicated(keep='last')] # now we only have timepoints covering one day, from ZT0 to ZT24 inclusive timepoints = [idx*4.0 for idx in range(7)] #numerical values timepoints_column_names = ['ZT{}'.format(int(x)) for x in timepoints] #for retreival from dataframe ############################################################################## # Select genes ############################################################################## # select rhythmic genes, according to amplitude and self correlation (i.e. # correlation of gene expression across successive days). corr_threshold = 0.9 amp_threshold = 1.7 #phase and transcript data transcript_data = transcript_data.query('Correlation>@corr_threshold & Amplitude>@amp_threshold') #set of common genes (rhythmic transcripts with available protein measurements) gene_list = list( set(protein_data.index) & set(transcript_data.index)) protein_data = protein_data.loc[gene_list,:] transcript_data = transcript_data.loc[gene_list,:] ############################################################################## # Modelling analysis ############################################################################## light_factor = 3.5 #from Pal2013, page 1256 dilution_factor_1 = (photoperiod_1*light_factor+(24-photoperiod_1)) dilution_factor_2 = (photoperiod_2*light_factor+(24-photoperiod_2)) protein_data['Modelled change (LPP-SPP)'] = np.nan for gene_name in gene_list: transcript_timeseries = transcript_data.loc[gene_name,timepoints_column_names].tolist() protein_modelled_photoperiod_1 = 1/dilution_factor_1*(light_factor*interpolate_integrate(timepoints,transcript_timeseries,0,photoperiod_1) + interpolate_integrate(timepoints,transcript_timeseries,photoperiod_1,24)) protein_modelled_photoperiod_2 = 1/dilution_factor_2*(light_factor*interpolate_integrate(timepoints,transcript_timeseries,0,photoperiod_2) + interpolate_integrate(timepoints,transcript_timeseries,photoperiod_2,24)) protein_data.loc[gene_name,'Modelled change (LPP-SPP)'] = (protein_modelled_photoperiod_2-protein_modelled_photoperiod_1)/0.5/(protein_modelled_photoperiod_1+protein_modelled_photoperiod_2) # Bin and plot the data protein_reg_est_bins = dict() bin_size = 0.1 for gene_name in gene_list: bin_address_determinant = protein_data.loc[gene_name,'Modelled change (LPP-SPP)'] to_be_binned = protein_data.loc[gene_name,'Measured change (LPP-SPP)'] try: protein_reg_est_bins[bin_address_determinant//bin_size].append(to_be_binned) except KeyError: # there is no such bin yet, so we need to create it protein_reg_est_bins[bin_address_determinant//bin_size] = [to_be_binned] x_data = protein_data.loc[gene_list, 'Modelled change (LPP-SPP)',].tolist() y_data = protein_data.loc[gene_list, 'Measured change (LPP-SPP)'].tolist() P_R,P_p = pearsonr(x_data,y_data) S_R,S_p = spearmanr(x_data,y_data) print 'Pearson\'s R = {}, p = {}'.format(P_R,P_p) fit = polyfit(x_data, y_data, 1) fit_fn = poly1d(fit) print 'slope = {}, intercept = {}'.format(fit[0], fit[1]) ############################################################################## # Plot figures ############################################################################## out_directory = './' xlower = -0.4 xupper = 0.4 ylower = -0.7 yupper = 0.7 FS = 18 MS = 10 LW = 2.3 fig=figure(figsize=(6,5)) ax = subplot(111) boxplot(protein_reg_est_bins.values(),positions=[bin_size/2.0 + x*bin_size for x in protein_reg_est_bins.keys()],widths=0.05,notch=False) plot([xlower,xupper], fit_fn([xlower,xupper]), '-k') plot([xlower,xupper], [xlower,xupper], '--k') xlabel('Predicted regulation',fontsize=FS) ylabel('Measured regulation',fontsize=FS) xticks(np.arange(-0.4,0.41,0.2),[-0.4,-0.2,0,0.2,0.4],fontsize=FS-2) yticks([-0.6+x*0.2 for x in range(9)],fontsize=FS-2) ylim([ylower,yupper]) xlim([xlower,xupper]) tight_layout() fig.savefig(os.path.join(out_directory,'seatongraf_translational_coincidence_model_comparison.svg'),transparent=True,format='svg')