# -*- coding: utf-8 -*-
"""
Created on Thu Jun 27 12:06:13 2019

@author: mhill
"""

"""
clustering.py

clusters preprocessed data using DBSCAN
"""

""" importing stuff """

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.cluster import DBSCAN

""" declarations and loading data """
# --- input / output folders ---------------------------------------------
# file_location: folder holding the preprocessed (mapped) CSV files
# save_location: folder receiving the figures and the clustered CSV files
file_location = 'H:/Internship/ML-Models/Results/2D/Preprocessing/'
save_location = 'H:/Internship/ML-Models/Results/2D/Clustering/'

# --- DBSCAN parameter grid ----------------------------------------------
# every eps value is combined with every min_samples value
epsilon = [0.3, 0.5, 2, 4, 5, 10]
neighbors = [5, 10, 20]

# --- embedding columns used as clustering input --------------------------
cell_mapping = [
    '2D_TSNE_1_perplexity_8',
    '2D_TSNE_2_perplexity_8',
]
nuclei_mapping = [
    '2D_UMAP_1_neighbors_10_dist_0.0',
    '2D_UMAP_2_neighbors_10_dist_0.0',
]

# set to 1 to display every figure, any other value to skip displaying
show_flag = 1

# --- result containers: one label column is added per parameter pair ----
cluster_results_cells, cluster_results_nuclei = pd.DataFrame(), pd.DataFrame()

# --- load the preprocessed tables ----------------------------------------
# the 'Unnamed: 0' column found in the CSV files is discarded on load
cell_data, nuclei_data = (
    pd.read_csv(file_location + csv_name).drop(columns=['Unnamed: 0'])
    for csv_name in ('mapped_cell_data.csv', 'mapped_nuclei_data.csv')
)

""" clustering """

def _cluster_and_plot(data, mapping, results, eps, min_samples,
                      label, axis_name, file_prefix, save_dir, show):
    """Cluster one 2D embedding with DBSCAN, record the labels and plot them.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the columns listed in ``mapping``.
    mapping : list of str
        Names of the two columns that are used as clustering input and as
        the x / y coordinates of the scatter plot.
    results : pandas.DataFrame
        Modified in place: receives a column named
        ``eps_<eps>_samples_<min_samples>`` holding the DBSCAN labels
        (``-1`` marks noise points).
    eps : float
        DBSCAN ``eps`` parameter.
    min_samples : int
        DBSCAN ``min_samples`` parameter.
    label : str
        Prefix of the printed statistics (e.g. ``'Cells'``).
    axis_name : str
        Name of the embedding used in the axis labels (e.g. ``'t-SNE'``).
    file_prefix : str
        Prefix of the saved PNG file name (e.g. ``'cell'``).
    save_dir : str
        Folder (with trailing separator) the PNG is written to.
    show : bool
        Whether to display the figure after it has been saved.
    """
    points = data[mapping].values
    run_name = 'eps_' + str(eps) + '_samples_' + str(min_samples)

    # create a DBSCAN clustering with the set of parameters
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(points)
    labels = db.labels_
    core_samples_mask = np.zeros_like(labels, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    results[run_name] = labels

    # the noise label (-1) does not count as a cluster
    unique_labels = set(labels)
    n_clusters_ = len(unique_labels) - (1 if -1 in labels else 0)
    n_noise_ = int(np.count_nonzero(labels == -1))

    # print statistics
    print('%s: Estimated number of clusters for eps = %s and min_samples = %s: %d'
          % (label, eps, min_samples, n_clusters_))
    print('%s: Estimated number of noise points for eps = %s and min_samples = %s: %d'
          % (label, eps, min_samples, n_noise_))

    # create a figure: one colour per label, core samples drawn larger
    fig = plt.figure()
    colors = [plt.cm.Spectral(each)
              for each in np.linspace(0, 1, len(unique_labels))]
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = [0, 0, 0, 1]

        class_member_mask = (labels == k)
        for member_mask, size in ((core_samples_mask, 14),
                                  (~core_samples_mask, 6)):
            xy = points[class_member_mask & member_mask]
            plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                     markeredgecolor='k', markersize=size)

    plt.xlabel(axis_name + " dimension 1", fontsize='large')
    plt.ylabel(axis_name + " dimension 2", fontsize='large')

    # Save BEFORE showing: once plt.show() returns the figure may already
    # have been closed, in which case saving afterwards writes an empty image.
    fig.savefig(save_dir + file_prefix + '_clusters_' + run_name + '.png')

    # optionally show the figure
    if show:
        plt.show()

    # Always release the figure; otherwise one figure per parameter pair and
    # data set stays open for the whole run when nothing is shown.
    plt.close(fig)


# loop over the parameters for DBSCAN
for e in epsilon:
    for n in neighbors:
        # cells are clustered on the t-SNE mapping
        _cluster_and_plot(cell_data, cell_mapping, cluster_results_cells,
                          e, n, label='Cells', axis_name='t-SNE',
                          file_prefix='cell', save_dir=save_location,
                          show=(show_flag == 1))

        # nuclei are clustered on the UMAP mapping
        _cluster_and_plot(nuclei_data, nuclei_mapping, cluster_results_nuclei,
                          e, n, label='Nuclei', axis_name='UMAP',
                          file_prefix='nuclei', save_dir=save_location,
                          show=(show_flag == 1))

""" saving the clustered data """

# write one CSV per data set; each column holds the labels of one
# (eps, min_samples) combination
for results_table, csv_name in (
        (cluster_results_cells, 'clustered_cell_data.csv'),
        (cluster_results_nuclei, 'clustered_nuclei_data.csv'),
):
    results_table.to_csv(save_location + csv_name)