# -*- coding: utf-8 -*-
"""
Created on Wed Jun 12 10:02:27 2019

@author: mhill
"""

"""
preprocessing.py

Takes the prepared data and maps it to lower dimensions, so that clustering
and visualization become easier.
"""

""" importing stuff """

import time
import umap
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

""" declarations """

# input tables: one CSV file for the cell data, one for the nuclei data
cell_datafile = 'H:/Internship/ML-Models/Data/cell_data.csv'
nuclei_datafile = 'H:/Internship/ML-Models/Data/nuclei_data.csv'

# output prefix for all results; file names are appended to it directly,
# so it has to end with a path separator
save_location = 'H:/Internship/ML-Models/Results/Preprocessing/'

# maximum number of iterations for an ICA
# (not referenced in the visible part of this script)
ICA_iter = 600

# perplexity values swept by the t-SNE runs: 5, 8, 11, 14, 17, 20
perplexity = range(5, 23, 3)

# UMAP parameter grid: every n_neighbors value is combined with every
# min_dist value
neighbors = [2, 10, 15, 30, 100, 200]
distance = [0.0, 0.1, 0.3, 0.5, 0.9]

""" loading features, data and models """

# read both tables and discard their 'Unnamed: 0' column
# (presumably an index column that was written along with the data)
cell_data = pd.read_csv(cell_datafile).drop(columns=['Unnamed: 0'])
nuclei_data = pd.read_csv(nuclei_datafile).drop(columns=['Unnamed: 0'])

# empty frames that collect the low-dimensional coordinates produced by
# every mapping below
mapped_cell_data, mapped_nuclei_data = pd.DataFrame(), pd.DataFrame()

""" using the models """

# approach 1:
# PCA -> 3 dimensions


def _add_pca_columns(mapped, components):
    """Store every column of `components` in `mapped` as PCA_1, PCA_2, ..."""
    for i in range(components.shape[1]):
        mapped['PCA_{}'.format(i + 1)] = components[:, i]


def _show_pca_3d(mapped):
    """Display a 3D scatter plot of the PCA_1, PCA_2 and PCA_3 columns of `mapped`."""
    # add_subplot(..., projection='3d') replaces Figure.gca(projection='3d'):
    # keyword arguments to gca() were deprecated in Matplotlib 3.4 and later
    # removed, so the old call raises a TypeError on current releases
    ax = plt.figure().add_subplot(111, projection='3d')
    ax.scatter(xs=mapped["PCA_1"], ys=mapped["PCA_2"], zs=mapped["PCA_3"])
    ax.set_xlabel('PCA_1')
    ax.set_ylabel('PCA_2')
    ax.set_zlabel('PCA_3')
    plt.show()


pca_3_transformer = PCA(n_components=3)

# The same transformer is refitted for the second data set, which overwrites
# explained_variance_ratio_. Report the ratios right after each fit, so the
# values of both data sets are printed and labelled.
cell_pca_3 = pca_3_transformer.fit_transform(cell_data.values)
print('Explained variation per principal component - 3dim PCA (cells): {}'.format(pca_3_transformer.explained_variance_ratio_))

nuclei_pca_3 = pca_3_transformer.fit_transform(nuclei_data.values)
print('Explained variation per principal component - 3dim PCA (nuclei): {}'.format(pca_3_transformer.explained_variance_ratio_))

_add_pca_columns(mapped_cell_data, cell_pca_3)
_add_pca_columns(mapped_nuclei_data, nuclei_pca_3)

# plot the 3D PCA data
_show_pca_3d(mapped_cell_data)
_show_pca_3d(mapped_nuclei_data)

# approach 2:
# PCA -> 2 dimensions

# No separate PCA is fitted here: the PCA_1 and PCA_2 columns already stored
# in the mapped frames are plotted against each other and saved as PNG.
for pca_frame, pca_title, pca_png in (
        (mapped_cell_data, "Cell Data - PCA - 2 dimensions", 'cell_pca_2.png'),
        (mapped_nuclei_data, "Nuclei Data - PCA - 2 dimensions", 'nuclei_pca_2.png'),
):
    plt.figure(figsize=(16, 10))
    sns.scatterplot(x="PCA_1", y="PCA_2", data=pca_frame)
    plt.title(pca_title)
    plt.savefig(save_location + pca_png)
    # close the figure after saving so that open figures do not pile up
    plt.close()

# approach 3 and 4:
# t-sne -> 2 dimensions
# PCA -> 10 dimensions -> t-sne -> 2 dimensions


def _make_tsne(p):
    """Create a 2D t-SNE model with perplexity `p` and an iteration limit of 10000."""
    try:
        # scikit-learn >= 1.5 calls the iteration limit `max_iter`
        return TSNE(n_components=2, verbose=1, perplexity=p, max_iter=10000)
    except TypeError:
        # older releases only accept the original name `n_iter`
        return TSNE(n_components=2, verbose=1, perplexity=p, n_iter=10000)


def _timed_tsne(model, features, done_text):
    """Fit `model` on `features` and return the embedding.

    Prints `done_text` followed by the input dimensionality (taken from
    `features` instead of being hard-coded) and the elapsed wall-clock time.
    """
    time_start = time.time()
    embedding = model.fit_transform(features)
    print('{} Reduced from {} to 2 dimensions! Time elapsed: {} seconds'.format(
        done_text, features.shape[1], time.time()-time_start))
    return embedding


def _save_tsne_plot(mapped, x_col, y_col, filename):
    """Scatter the columns `x_col`/`y_col` of `mapped` and save the figure as `filename`."""
    plt.figure()
    sns.scatterplot(x=x_col, y=y_col, data=mapped)
    plt.xlabel("t-SNE dimension 1", fontsize='large')
    plt.ylabel("t-SNE dimension 2", fontsize='large')
    plt.savefig(save_location+filename)
    plt.close()


# 10-component PCA used as input for approach 4; it does not depend on the
# perplexity, so it is computed once before the loop
pca_10_transformer = PCA(n_components=10)
cell_pca_10 = pca_10_transformer.fit_transform(cell_data.values)
nuclei_pca_10 = pca_10_transformer.fit_transform(nuclei_data.values)

# loop over the perplexity parameter
for p in perplexity:

    # one model per perplexity; every fit_transform call below refits it
    tsne = _make_tsne(p)

    # approach 3
    # t-SNE -> 2 dimensions
    cell_tsne_2 = _timed_tsne(tsne, cell_data.values, 't-SNE for cells done!')
    nuclei_tsne_2 = _timed_tsne(tsne, nuclei_data.values, 't-SNE for nuclei done!')

    tsne_x = '2D_TSNE_1_perplexity_'+str(p)
    tsne_y = '2D_TSNE_2_perplexity_'+str(p)

    mapped_cell_data[tsne_x] = cell_tsne_2[:,0]
    mapped_cell_data[tsne_y] = cell_tsne_2[:,1]

    mapped_nuclei_data[tsne_x] = nuclei_tsne_2[:,0]
    mapped_nuclei_data[tsne_y] = nuclei_tsne_2[:,1]

    # plot the 2D t-SNE data
    _save_tsne_plot(mapped_cell_data, tsne_x, tsne_y,
                    'cell_tsne_2_perplexity_'+str(p)+'.png')
    _save_tsne_plot(mapped_nuclei_data, tsne_x, tsne_y,
                    'nuclei_tsne_2_perplexity_'+str(p)+'.png')

    # approach 4
    # PCA -> 10 dimensions -> t-sne -> 2 dimensions
    cell_pca_10_tsne_2 = _timed_tsne(tsne, cell_pca_10, 't-SNE for cells done!')
    nuclei_pca_10_tsne_2 = _timed_tsne(tsne, nuclei_pca_10, 't-SNE nuclei done!')

    pca_tsne_x = '10D_PCA_2D_TSNE_1_perplexity_'+str(p)
    pca_tsne_y = '10D_PCA_2D_TSNE_2_perplexity_'+str(p)

    mapped_cell_data[pca_tsne_x] = cell_pca_10_tsne_2[:,0]
    mapped_cell_data[pca_tsne_y] = cell_pca_10_tsne_2[:,1]

    mapped_nuclei_data[pca_tsne_x] = nuclei_pca_10_tsne_2[:,0]
    mapped_nuclei_data[pca_tsne_y] = nuclei_pca_10_tsne_2[:,1]

    # plot the 2D PCA t-SNE data
    _save_tsne_plot(mapped_cell_data, pca_tsne_x, pca_tsne_y,
                    'cell_pca_10_tsne_2_perplexity_'+str(p)+'.png')
    _save_tsne_plot(mapped_nuclei_data, pca_tsne_x, pca_tsne_y,
                    'nuclei_pca_10_tsne_2_perplexity_'+str(p)+'.png')

# approach 5:
# UMAP -> 2 dimensions

# sweep every combination of n_neighbors and min_dist
for n in neighbors:
    for dist in distance:

        # identifies this parameter combination in column and file names
        umap_suffix = 'neighbors_'+str(n)+'_dist_'+str(dist)
        umap_x = '2D_UMAP_1_'+umap_suffix
        umap_y = '2D_UMAP_2_'+umap_suffix

        # one model per combination: fitted on the cells first, then
        # refitted on the nuclei
        umap_transformer = umap.UMAP(n_components=2, n_neighbors=n, min_dist=dist)
        cell_umap_2 = umap_transformer.fit_transform(cell_data.values)
        nuclei_umap_2 = umap_transformer.fit_transform(nuclei_data.values)

        # store both embedding axes of each data set
        for umap_frame, umap_embedding in ((mapped_cell_data, cell_umap_2),
                                           (mapped_nuclei_data, nuclei_umap_2)):
            umap_frame[umap_x] = umap_embedding[:, 0]
            umap_frame[umap_y] = umap_embedding[:, 1]

        # plot the 2D UMAP data, one PNG per data set
        for umap_frame, umap_prefix in ((mapped_cell_data, 'cell'),
                                        (mapped_nuclei_data, 'nuclei')):
            plt.figure()
            sns.scatterplot(x=umap_x, y=umap_y, data=umap_frame)
            plt.xlabel("UMAP dimension 1", fontsize='large')
            plt.ylabel("UMAP dimension 2", fontsize='large')
            plt.savefig(save_location+umap_prefix+'_umap_2_'+umap_suffix+'.png')
            plt.close()

''' Output '''

# write the collected low-dimensional coordinates of both data sets as CSV
# files below save_location
for result_frame, result_csv in ((mapped_cell_data, 'mapped_cell_data.csv'),
                                 (mapped_nuclei_data, 'mapped_nuclei_data.csv')):
    result_frame.to_csv(save_location + result_csv)