from scipy.stats import pearsonr
import pandas as pd
import numpy as np

# 1. Load data: relative isotope abundance timecourse (Aryal et al. 2011)
protein_data = pd.read_csv('aryal2011_proteomics_data_relative_isotope_abundance_timecourse.csv')

# Columns 4-16 hold the labelling timepoints
timepoint_strings = protein_data.columns[4:17]
nC = len(protein_data.columns)
nT = len(timepoint_strings)

# Convert timepoint labels to real times: first sample at 13 h, then every 4 h
timestep = 4
timepoints = [13 + idx * timestep for idx in range(nT)]

# Output columns, filled in per protein below
protein_data['Degradation rate (h-1)'] = np.nan
protein_data['Half life (h)'] = np.nan

# Select only proteins measured at 10 or more timepoints
bool_subset = protein_data[timepoint_strings].notna().sum(axis=1) >= 10
protein_data = protein_data[bool_subset]

# Pearson correlation threshold for accepting a fit
corr_threshold = 0.9
# Estimated fraction of synthesis that is labelled (ksH/(ksH+ksL))
limiting_label_fraction = 0.8

for idx in protein_data.index:
    protein_timeseries = protein_data.loc[idx, timepoint_strings]
    # Filter out NaN entries and log-transform the protein data:
    # ln(limiting_label_fraction - y) is linear in time with slope -k_deg
    timepoints_temp = [x for (x, y) in zip(timepoints, protein_timeseries) if not np.isnan(y)]
    logged_protein_timeseries = [np.log(limiting_label_fraction - y)
                                 for y in protein_timeseries if not np.isnan(y)]
    # Linear regression applied to the logged data; keep the fit only if it is strongly linear
    R_p, P_p = pearsonr(timepoints_temp, logged_protein_timeseries)
    if len(logged_protein_timeseries) >= 10 and abs(R_p) > corr_threshold:
        fit_fn = np.polyfit(timepoints_temp, logged_protein_timeseries, 1)
        k_deg = -fit_fn[0]
        protein_data.loc[idx, 'Degradation rate (h-1)'] = k_deg
        protein_data.loc[idx, 'Half life (h)'] = np.log(2) / k_deg

out_data = protein_data[['ORF**', 'Degradation rate (h-1)', 'Half life (h)']]
out_data.to_csv('Cyanothece_degradation_rates.csv', index=False)
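
# ---------------------------------------------------------------------------
# Sanity check (illustrative sketch only, not part of the published pipeline):
# the fit above assumes first-order labelling kinetics,
#     y(t) = f * (1 - exp(-k_deg * t)),  with f = limiting_label_fraction,
# so that ln(f - y) = ln(f) - k_deg * t is linear in t with slope -k_deg.
# The block below builds a noise-free synthetic timecourse with an assumed
# k_deg of 0.05 h^-1 (a made-up value for illustration) and confirms that the
# same log-transform + polyfit procedure recovers it.
k_true = 0.05  # assumed degradation rate (h^-1), illustrative only
y_synth = [limiting_label_fraction * (1 - np.exp(-k_true * t)) for t in timepoints]
logged_synth = [np.log(limiting_label_fraction - y) for y in y_synth]
slope, intercept = np.polyfit(timepoints, logged_synth, 1)
assert abs(-slope - k_true) < 1e-6, 'recovered rate should match the input rate'
print('sanity check: recovered k_deg = %.4f h^-1 (input %.4f)' % (-slope, k_true))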