# -*- coding: utf-8 -*-
"""
preprocessing.py

Created on Wed Jun 12 10:02:27 2019

@author: mhill

Takes the prepared cell and nuclei feature tables and maps them to lower
dimensions, so that clustering and visualization get easier.

Five approaches are run on both data sets:
    1. PCA -> 3 dimensions (shown as interactive 3D scatter plots)
    2. PCA -> 2 dimensions (first two components of approach 1)
    3. t-SNE -> 2 dimensions
    4. PCA -> 10 dimensions -> t-SNE -> 2 dimensions
    5. UMAP -> 2 dimensions

Every embedding is stored as columns of ``mapped_cell_data`` /
``mapped_nuclei_data``; the 2D embeddings are additionally saved as PNG
scatter plots and both tables are written to CSV at the end.
"""

# ---------------------------------------------------------------- imports
import inspect
import time
from collections import namedtuple

import umap
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Imported for its side effect only: registers the '3d' projection on
# older matplotlib releases.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# ----------------------------------------------------------- declarations
# datafilename = location of your datafile
cell_datafile = 'H:/Internship/ML-Models/Data/cell_data.csv'
nuclei_datafile = 'H:/Internship/ML-Models/Data/nuclei_data.csv'

# save_location = where you want to store your results
save_location = 'H:/Internship/ML-Models/Results/Preprocessing/'

# ICA_iter = max number of iterations for your ICA
# (not referenced by the code below; kept for compatibility)
ICA_iter = 600

# perplexity parameter for t-SNE (5, 8, 11, 14, 17, 20)
perplexity = range(5, 23, 3)

# parameters for UMAP
neighbors = [2, 10, 15, 30, 100, 200]
distance = [0.0, 0.1, 0.3, 0.5, 0.9]

# scikit-learn 1.5 renamed TSNE's ``n_iter`` argument to ``max_iter`` and
# removed the old name in 1.7, so pick whichever the installed version has.
_TSNE_ITER_KWARG = (
    'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters
    else 'n_iter'
)

# One entry per data set that is pushed through every approach.
#   prefix     - used in output file names ('cell', 'nuclei')
#   log_name   - used in console messages ('cells', 'nuclei')
#   title_name - used in plot titles ('Cell', 'Nuclei')
#   features   - DataFrame holding the input features
#   mapped     - DataFrame collecting the low-dimensional embeddings
_Dataset = namedtuple(
    '_Dataset', ['prefix', 'log_name', 'title_name', 'features', 'mapped']
)


# ---------------------------------------------------------------- helpers
def _load_features(path):
    """Read a feature CSV and drop the 'Unnamed: 0' column if present."""
    data = pd.read_csv(path)
    # 'Unnamed: 0' is presumably the index column written by a previous
    # DataFrame.to_csv call; tolerate files that were saved without it.
    return data.drop(['Unnamed: 0'], axis=1, errors='ignore')


def _store_2d(mapped, embedding, column_stem, suffix):
    """Store a 2D embedding in ``mapped`` and return the two column names.

    The columns are named ``<column_stem>_1_<suffix>`` and
    ``<column_stem>_2_<suffix>``.
    """
    x_col = column_stem + '_1_' + suffix
    y_col = column_stem + '_2_' + suffix
    mapped[x_col] = embedding[:, 0]
    mapped[y_col] = embedding[:, 1]
    return x_col, y_col


def _save_scatter(mapped, x_col, y_col, filename,
                  method=None, title=None, figsize=None):
    """Save a 2D scatter plot of two ``mapped`` columns to ``save_location``.

    method  - if given, the axes are labelled '<method> dimension 1/2'
    title   - if given, used as the plot title
    figsize - forwarded to plt.figure (None = matplotlib default)
    """
    plt.figure(figsize=figsize)
    sns.scatterplot(x=x_col, y=y_col, data=mapped)
    if method is not None:
        plt.xlabel(method + " dimension 1", fontsize='large')
        plt.ylabel(method + " dimension 2", fontsize='large')
    if title is not None:
        plt.title(title)
    plt.savefig(save_location + filename)
    plt.close()


def _show_pca_3d(mapped):
    """Show the three PCA columns of ``mapped`` as a 3D scatter plot."""
    # Figure.gca(projection=...) was removed in matplotlib 3.6;
    # add_subplot works on old and new releases alike.
    ax = plt.figure().add_subplot(111, projection='3d')
    ax.scatter(xs=mapped["PCA_1"], ys=mapped["PCA_2"], zs=mapped["PCA_3"])
    ax.set_xlabel('PCA_1')
    ax.set_ylabel('PCA_2')
    ax.set_zlabel('PCA_3')
    plt.show()


def _tsne_2d(values, p, log_name):
    """Embed ``values`` in 2D with t-SNE at perplexity ``p`` and log timing."""
    tsne = TSNE(n_components=2, verbose=1, perplexity=p,
                **{_TSNE_ITER_KWARG: 10000})
    time_start = time.time()
    embedding = tsne.fit_transform(values)
    print('t-SNE for {} done! Reduced from {} to 2 dimensions! '
          'Time elapsed: {} seconds'.format(
              log_name, values.shape[1], time.time() - time_start))
    return embedding


# ------------------------------------------------------------ loading data
cell_data = _load_features(cell_datafile)
nuclei_data = _load_features(nuclei_datafile)

# create the dataframes, where the mapped data should be stored
mapped_cell_data = pd.DataFrame()
mapped_nuclei_data = pd.DataFrame()

_DATASETS = (
    _Dataset('cell', 'cells', 'Cell', cell_data, mapped_cell_data),
    _Dataset('nuclei', 'nuclei', 'Nuclei', nuclei_data, mapped_nuclei_data),
)

# -------------------------------------------------------- using the models
# approach 1:
# PCA -> 3 dimensions
for ds in _DATASETS:
    # A separate transformer per data set, so the explained variance that
    # is reported really belongs to the data set named in the message.
    pca_3_transformer = PCA(n_components=3)
    pca_3 = pca_3_transformer.fit_transform(ds.features.values)
    for component in range(3):
        ds.mapped['PCA_' + str(component + 1)] = pca_3[:, component]
    print('Explained variation per principal component - 3dim PCA ({}): {}'
          .format(ds.log_name, pca_3_transformer.explained_variance_ratio_))

# plot the 3D PCA data
for ds in _DATASETS:
    _show_pca_3d(ds.mapped)

# approach 2:
# PCA -> 2 dimensions (the first two components computed above)
for ds in _DATASETS:
    _save_scatter(ds.mapped, "PCA_1", "PCA_2",
                  ds.prefix + '_pca_2.png',
                  title=ds.title_name + " Data - PCA - 2 dimensions",
                  figsize=(16, 10))

# approach 3 and 4:
# t-sne -> 2 dimensions
# PCA -> 10 dimensions -> t-sne -> 2 dimensions
# The 10D PCA does not depend on the perplexity, so compute it once.
pca_10 = {
    ds.prefix: PCA(n_components=10).fit_transform(ds.features.values)
    for ds in _DATASETS
}

# loop over the perplexity parameter
for p in perplexity:
    suffix = 'perplexity_' + str(p)

    # approach 3
    # t-SNE -> 2 dimensions
    for ds in _DATASETS:
        embedding = _tsne_2d(ds.features.values, p, ds.log_name)
        x_col, y_col = _store_2d(ds.mapped, embedding, '2D_TSNE', suffix)
        _save_scatter(ds.mapped, x_col, y_col,
                      ds.prefix + '_tsne_2_' + suffix + '.png',
                      method="t-SNE")

    # approach 4
    # PCA -> 10 dimensions -> t-sne -> 2 dimensions
    for ds in _DATASETS:
        embedding = _tsne_2d(pca_10[ds.prefix], p, ds.log_name)
        x_col, y_col = _store_2d(ds.mapped, embedding,
                                 '10D_PCA_2D_TSNE', suffix)
        _save_scatter(ds.mapped, x_col, y_col,
                      ds.prefix + '_pca_10_tsne_2_' + suffix + '.png',
                      method="t-SNE")

# approach 5:
# UMAP -> 2 dimensions
# loop over the UMAP parameters
for n in neighbors:
    for dist in distance:
        suffix = 'neighbors_' + str(n) + '_dist_' + str(dist)
        for ds in _DATASETS:
            umap_transformer = umap.UMAP(n_neighbors=n, min_dist=dist,
                                         n_components=2)
            embedding = umap_transformer.fit_transform(ds.features.values)
            x_col, y_col = _store_2d(ds.mapped, embedding, '2D_UMAP', suffix)
            _save_scatter(ds.mapped, x_col, y_col,
                          ds.prefix + '_umap_2_' + suffix + '.png',
                          method="UMAP")

# ------------------------------------------------------------------ output
mapped_cell_data.to_csv(save_location + 'mapped_cell_data.csv')
mapped_nuclei_data.to_csv(save_location + 'mapped_nuclei_data.csv')