# -*- coding: utf-8 -*-
"""
Created on Wed Jul 31 15:19:18 2019

@author: mhill

AdaBoost.py uses AdaBoost on the data.

For each ensemble size in ``no_of_est`` one AdaBoost classifier is trained on
the cell features and one on the nuclei features, using a selected clustering
column as ground truth. Test accuracies and feature importances are written
as CSV files, and three bar charts are saved as PNG files.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split


def _plot_mean_importances(mean_importances, out_path):
    """Draw a bar chart of mean feature importances, save it, then show it.

    Parameters
    ----------
    mean_importances : 1-D array of one importance value per feature.
    out_path : file path the figure is written to.
    """
    # x ticks are positional feature indices; the matching column names are
    # the headers of the importance CSV files written by this script.
    positions = range(len(mean_importances))
    plt.figure()
    plt.bar(positions, mean_importances)
    plt.xticks(positions, fontsize='large')
    plt.xlabel('Feature', fontsize='large')
    plt.ylabel('Mean Feature Importance', fontsize='large')
    plt.grid(axis='y')
    # Save before showing: once show() returns the figure may already be
    # closed, in which case savefig() would write an empty image.
    plt.savefig(out_path)
    plt.show()


# ---------------------------------------------------------------------------
# declarations and loading data
# ---------------------------------------------------------------------------

# where is your data stored?
data_location = "H:/Internship/ML-Models/Data/2D"
# where is the ground truth stored?
truth_location = "H:/Internship/ML-Models/Results/2D/Clustering"
# where are the results written to?
results_location = "H:/Internship/ML-Models/Results/2D/Classification"
# which clustering shall be the ground truth? (exactly one column each)
truth_cluster_cells = ['eps_10_samples_5']
truth_cluster_nuclei = ['eps_0.5_samples_5']

# load data
cell_data = pd.read_csv(data_location + "/cell_data.csv").drop(
    columns=["Unnamed: 0"])
nuclei_data = pd.read_csv(data_location + "/nuclei_data.csv").drop(
    columns=["Unnamed: 0"])

# load ground truth as 1-D label vectors; the classifiers expect 1-D targets,
# not single-column DataFrames
cell_ground_truth = (
    pd.read_csv(truth_location + "/clustered_cell_data.csv")[truth_cluster_cells]
    .squeeze(axis=1)
    .rename('ground_truth')
)
nuclei_ground_truth = (
    pd.read_csv(truth_location + "/clustered_nuclei_data.csv")[truth_cluster_nuclei]
    .squeeze(axis=1)
    .rename('ground_truth')
)

# NOTE: no separate shuffling step. train_test_split (below) shuffles features
# and labels together by default. DataFrame.sample() returns a new object, so
# a bare `data.sample(frac=1)` call does not shuffle anything in place.

# number of deciders in AdaBoost
no_of_est = [20, 40, 60, 80, 100]

# arrays where accuracy and importances are stored
# acc_array: row 0 = cells, row 1 = nuclei; one column per ensemble size
acc_array = np.zeros((2, len(no_of_est)))
# importance arrays: one row per ensemble size, one column per feature
imp_array_cells = np.zeros((len(no_of_est), len(cell_data.columns)))
imp_array_nuclei = np.zeros((len(no_of_est), len(nuclei_data.columns)))

# ---------------------------------------------------------------------------
# create training and test sets
# ---------------------------------------------------------------------------

cell_train, cell_test, cell_label_train, cell_label_test = train_test_split(
    cell_data, cell_ground_truth, test_size=0.2)
nuclei_train, nuclei_test, nuclei_label_train, nuclei_label_test = train_test_split(
    nuclei_data, nuclei_ground_truth, test_size=0.2)

# ---------------------------------------------------------------------------
# AdaBoost
# ---------------------------------------------------------------------------

for i, n_estimators in enumerate(no_of_est):
    # defining
    ada_cells = AdaBoostClassifier(n_estimators=n_estimators)
    ada_nuclei = AdaBoostClassifier(n_estimators=n_estimators)

    # training
    ada_cells.fit(cell_train, cell_label_train)
    ada_nuclei.fit(nuclei_train, nuclei_label_train)

    # testing
    acc_array[0, i] = ada_cells.score(cell_test, cell_label_test)
    acc_array[1, i] = ada_nuclei.score(nuclei_test, nuclei_label_test)

    # feature importances
    imp_array_cells[i, :] = ada_cells.feature_importances_
    imp_array_nuclei[i, :] = ada_nuclei.feature_importances_

# ---------------------------------------------------------------------------
# making the data more comprehensible
# ---------------------------------------------------------------------------

# importances: rows indexed by number of estimators, columns named by feature
estimator_index = pd.Index(no_of_est, name='number_of_estimators')

imp_cells = pd.DataFrame(
    data=imp_array_cells, index=estimator_index, columns=cell_data.columns)
imp_cells.to_csv(results_location + '/AB_imp_cells.csv')

imp_nuclei = pd.DataFrame(
    data=imp_array_nuclei, index=estimator_index, columns=nuclei_data.columns)
imp_nuclei.to_csv(results_location + '/AB_imp_nuclei.csv')

accuracy = pd.DataFrame(
    data=acc_array, index=['Cells', 'Nuclei'], columns=no_of_est)
accuracy.to_csv(results_location + '/AB_accuracy.csv')

# ---------------------------------------------------------------------------
# plotting the accuracies
# ---------------------------------------------------------------------------

ind = np.arange(len(no_of_est))  # the x locations for the groups
width = 0.35  # the width of the bars

fig, ax = plt.subplots()
ax.bar(ind - width / 2, accuracy.loc['Cells'].values, width, label='Cells')
ax.bar(ind + width / 2, accuracy.loc['Nuclei'].values, width, label='Nuclei')

# labels and custom x-axis tick labels
ax.set_ylabel('Accuracy', fontsize='large')
ax.set_ylim([0, 1])
ax.set_xlabel('number of estimators', fontsize='large')
ax.set_xticks(ind)
# derive the tick labels from no_of_est so both stay in sync
ax.set_xticklabels([str(n) for n in no_of_est], fontsize='large')
ax.grid(axis='y')
ax.legend(fontsize='x-large')
fig.savefig(results_location + '/AB_Acc.png')

# ---------------------------------------------------------------------------
# plotting the importances
# ---------------------------------------------------------------------------

# mean importance of every feature over all ensemble sizes
mean_imp_cells = imp_array_cells.mean(axis=0)
mean_imp_nuclei = imp_array_nuclei.mean(axis=0)

_plot_mean_importances(mean_imp_cells, results_location + '/AB_Imp_Cells.png')
_plot_mean_importances(mean_imp_nuclei, results_location + '/AB_Imp_Nuclei.png')