from scipy.stats import pearsonr
import pandas as pd
import numpy as np


#1. Load data
protein_data = pd.read_csv('aryal2011_proteomics_data_relative_isotope_abundance_timecourse.csv')
#columns at positions 4-16 hold the relative isotope abundance timecourse
timepoint_strings = protein_data.columns[4:17]
nC = len(protein_data.columns)  #total number of columns
nT = len(timepoint_strings)     #number of timepoints


#convert timepoint column labels to sampling times in hours (first sample at 13 h, every 4 h thereafter)
timestep = 4
timepoints = [13+idx*timestep for idx in range(len(timepoint_strings))]
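#e.g. with the 13 timecourse columns selected above this gives [13, 17, 21, ..., 57, 61] h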

protein_data['Degradation rate (h-1)'] = np.nan
protein_data['Half life (h)'] = np.nan

#keep only proteins quantified at 10 or more timepoints
bool_subset = protein_data[timepoint_strings].notna().sum(axis=1) >= 10
protein_data = protein_data[bool_subset]

#Pearson correlation threshold for accepting a log-linear fit
corr_threshold = 0.9
#estimated fraction of synthesis that is labelled (ksH/(ksH+ksL)), i.e. the labelling plateau
limiting_label_fraction = 0.8
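#Model sketch (an assumption consistent with the fit below, not stated in the data file):
#if the labelled fraction rises as y(t) = f*(1 - exp(-k_deg*t)) towards the plateau
#f = limiting_label_fraction, then f - y(t) = f*exp(-k_deg*t), so
#log(f - y) = log(f) - k_deg*t  -> the slope of the linear fit is -k_deg.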
for idx in protein_data.index:
    protein_timeseries = protein_data[timepoint_strings].loc[idx]
    #filter out NaN entries and log-transform the distance from the labelling plateau
    timepoints_temp = [x for (x, y) in zip(timepoints, protein_timeseries) if not np.isnan(y)]
    logged_protein_timeseries = [np.log(limiting_label_fraction - y) for y in protein_timeseries if not np.isnan(y)]
    #require a strong linear correlation of log(f - y) with time before fitting
    R_p, P_p = pearsonr(timepoints_temp, logged_protein_timeseries)
    if len(logged_protein_timeseries) >= 10 and abs(R_p) > corr_threshold:
        #slope of the linear fit is -k_deg (see model sketch above)
        fit_coeffs = np.polyfit(timepoints_temp, logged_protein_timeseries, 1)
        k_deg = -fit_coeffs[0]
        protein_data.loc[idx,'Degradation rate (h-1)'] = k_deg
        protein_data.loc[idx,'Half life (h)'] = np.log(2)/k_deg

#2. Save fitted degradation rates and half-lives
out_data = protein_data[['ORF**','Degradation rate (h-1)','Half life (h)']]
out_data.to_csv('Cyanothece_degradation_rates.csv',index=False)
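
#Optional quick check of the output (a usage sketch, not part of the original analysis;
#reuses the pandas import above and the column names defined in this script)
results = pd.read_csv('Cyanothece_degradation_rates.csv')
fitted = results.dropna(subset=['Half life (h)'])
print('%d of %d proteins passed the fit criteria' % (len(fitted), len(results)))
print('median half-life (h): %.2f' % fitted['Half life (h)'].median())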
