# -*- coding: utf-8 -*-
"""
Created on Thu Jun 27 12:06:13 2019

@author: mhill

clustering.py clusters preprocessed data using DBSCAN.

For every (eps, min_samples) combination the script clusters the chosen 2D
mapping of the cell data and of the nuclei data, prints the number of
clusters and noise points, saves a scatter plot of the clustering and
finally writes all label columns to one CSV file per data set.
"""

# importing stuff
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.cluster import DBSCAN


def _cluster_and_plot(points, eps, min_samples, entity, axis_name,
                      file_prefix, out_dir, show):
    """Run DBSCAN on 2D points, report statistics and save a scatter plot.

    Parameters
    ----------
    points : 2D array; columns 0 and 1 are used as plot coordinates.
    eps, min_samples : passed straight through to ``DBSCAN``.
    entity : prefix of the printed statistics (e.g. ``'Cells'``).
    axis_name : name used in the axis labels (e.g. ``'t-SNE'``).
    file_prefix : prefix of the saved figure's file name (e.g. ``'cell'``).
    out_dir : directory prefix (including the trailing separator) the
        figure is written to.
    show : if true, the figure is also displayed after it was saved.

    Returns
    -------
    The label array produced by DBSCAN (``-1`` marks noise points).
    """
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(points)
    labels = db.labels_

    # boolean mask that is True for every core sample
    core_samples_mask = np.zeros_like(labels, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True

    # the noise label -1 does not count as a cluster
    unique_labels = set(labels)
    n_clusters = len(unique_labels) - (1 if -1 in labels else 0)
    n_noise = list(labels).count(-1)

    # print statistics
    print('{}: Estimated number of clusters for eps = {} and min_samples = {}: {}'
          .format(entity, eps, min_samples, n_clusters))
    print('{}: Estimated number of noise points for eps = {} and min_samples = {}: {}'
          .format(entity, eps, min_samples, n_noise))

    # create a figure with one colour per label
    fig = plt.figure()
    colors = [plt.cm.Spectral(each)
              for each in np.linspace(0, 1, len(unique_labels))]
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = [0, 0, 0, 1]
        class_member_mask = (labels == k)
        # core samples are drawn with large markers, the rest with small ones
        for member_mask, size in ((core_samples_mask, 14),
                                  (~core_samples_mask, 6)):
            xy = points[class_member_mask & member_mask]
            plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                     markeredgecolor='k', markersize=size)
    plt.xlabel(axis_name + ' dimension 1', fontsize='large')
    plt.ylabel(axis_name + ' dimension 2', fontsize='large')

    # Save BEFORE showing: after a blocking plt.show() returns, the shown
    # figure may already be torn down, and saving afterwards can produce an
    # empty image.
    fig.savefig(out_dir + file_prefix + '_clusters_eps_' + str(eps)
                + '_samples_' + str(min_samples) + '.png')

    # optionally show the figure
    if show:
        plt.show()

    # always release the figure, otherwise one figure per parameter
    # combination stays open for the whole run
    plt.close(fig)

    return labels


# declarations and loading data

# locations, where the preprocessed data is stored and where the clustered data should be stored
file_location = 'H:/Internship/ML-Models/Results/2D/Preprocessing/'
save_location = 'H:/Internship/ML-Models/Results/2D/Clustering/'

# create Dataframes, where the clustered data should be stored
cluster_results_cells = pd.DataFrame()
cluster_results_nuclei = pd.DataFrame()

# define parameters for DBSCAN
epsilon = [0.3, 0.5, 2, 4, 5, 10]
neighbors = [5, 10, 20]

# choose the mapping, on which you want to cluster
cell_mapping = ['2D_TSNE_1_perplexity_8', '2D_TSNE_2_perplexity_8']
nuclei_mapping = ['2D_UMAP_1_neighbors_10_dist_0.0', '2D_UMAP_2_neighbors_10_dist_0.0']

# show the clustering or not?
show_flag = 1

# read in the preprocessed data
cell_data = pd.read_csv(file_location + 'mapped_cell_data.csv').drop(columns=['Unnamed: 0'])
nuclei_data = pd.read_csv(file_location + 'mapped_nuclei_data.csv').drop(columns=['Unnamed: 0'])

# extract the coordinates once instead of once per plotted label
cell_points = cell_data[cell_mapping].values
nuclei_points = nuclei_data[nuclei_mapping].values

# clustering

# loop over the parameters for DBSCAN
for e in epsilon:
    for n in neighbors:
        column = 'eps_' + str(e) + '_samples_' + str(n)

        # cluster the cells with this set of parameters
        cluster_results_cells[column] = _cluster_and_plot(
            cell_points, e, n, 'Cells', 't-SNE', 'cell',
            save_location, show_flag == 1)

        # cluster the nuclei with this set of parameters
        cluster_results_nuclei[column] = _cluster_and_plot(
            nuclei_points, e, n, 'Nuclei', 'UMAP', 'nuclei',
            save_location, show_flag == 1)

# saving the clustered data
cluster_results_cells.to_csv(save_location + 'clustered_cell_data.csv')
cluster_results_nuclei.to_csv(save_location + 'clustered_nuclei_data.csv')