# -*- coding: utf-8 -*-
"""
Created on Thu Jul 25 11:58:28 2019

@author: mhill
"""

""" RF.py uses Random Forest to classify the data """

""" importing stuff """
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

""" declarations and loading data """
# where is your data stored?
data_location = "H:/Internship/ML-Models/Data"
# where is the ground truth stored?
truth_location = "H:/Internship/ML-Models/Results/Clustering"
# where shall the results be stored?
results_location = "H:/Internship/ML-Models/Results/Classification"
# which clustering shall be the ground truth?
truth_cluster_cells = ['eps_10_samples_5']
truth_cluster_nuclei = ['eps_2_samples_5']

# load data
cell_data = pd.read_csv(data_location+"/cell_data.csv").drop(columns=["Unnamed: 0"])
nuclei_data = pd.read_csv(data_location+"/nuclei_data.csv").drop(columns=["Unnamed: 0"])

# load ground truth
cell_ground_truth = pd.read_csv(truth_location+"/clustered_cell_data.csv")[truth_cluster_cells]
nuclei_ground_truth = pd.read_csv(truth_location+"/clustered_nuclei_data.csv")[truth_cluster_nuclei]

# append ground truth to data, so the labels travel with their rows while shuffling
cell_data['ground_truth'] = cell_ground_truth
cell_ground_truth.drop(columns=truth_cluster_cells, inplace=True)
nuclei_data['ground_truth'] = nuclei_ground_truth
nuclei_ground_truth.drop(columns=truth_cluster_nuclei, inplace=True)

# shuffle data (sample() returns a copy, so the result has to be assigned back)
cell_data = cell_data.sample(frac=1).reset_index(drop=True)
nuclei_data = nuclei_data.sample(frac=1).reset_index(drop=True)

# split the labels off again
cell_ground_truth['ground_truth'] = cell_data['ground_truth']
cell_data.drop(columns=['ground_truth'], inplace=True)
nuclei_ground_truth['ground_truth'] = nuclei_data['ground_truth']
nuclei_data.drop(columns=['ground_truth'], inplace=True)

# number of trees in the random forests
no_of_trees = [20, 40, 60, 80, 100]

# create arrays, where accuracy and importances shall be stored
acc_array = np.zeros((2, len(no_of_trees)))
imp_array_cells = np.zeros((len(no_of_trees), len(cell_data.columns.values)))
imp_array_nuclei = np.zeros((len(no_of_trees), len(nuclei_data.columns.values)))

""" create training and test sets """
cell_train, cell_test, cell_label_train, cell_label_test = train_test_split(cell_data, cell_ground_truth, test_size=0.2)
nuclei_train, nuclei_test, nuclei_label_train, nuclei_label_test = train_test_split(nuclei_data, nuclei_ground_truth, test_size=0.2)

""" Random Forest """
for i in range(len(no_of_trees)):
    # defining
    rf_cells = RandomForestClassifier(n_estimators=no_of_trees[i])
    rf_nuclei = RandomForestClassifier(n_estimators=no_of_trees[i])

    # training (labels are ravelled to 1-D arrays to avoid sklearn's column-vector warning)
    rf_cells.fit(cell_train, cell_label_train.values.ravel())
    rf_nuclei.fit(nuclei_train, nuclei_label_train.values.ravel())

    # testing
    cell_acc = rf_cells.score(cell_test, cell_label_test.values.ravel())
    acc_array[0, i] = cell_acc
    nuclei_acc = rf_nuclei.score(nuclei_test, nuclei_label_test.values.ravel())
    acc_array[1, i] = nuclei_acc

    # feature importance
    cell_feat_importance = rf_cells.feature_importances_
    imp_array_cells[i, :] = cell_feat_importance
    nuclei_feat_importance = rf_nuclei.feature_importances_
    imp_array_nuclei[i, :] = nuclei_feat_importance

    """ putting stuff out there """
#    # printing accuracies
#    print("Mean accuracy score for cell predictions - %d trees: %.4f" %(no_of_trees[i], cell_acc))
#    print("Mean accuracy score for nuclei predictions - %d trees: %.4f" %(no_of_trees[i], nuclei_acc))
#
#    # feature importance graphs
#    plt.figure()
#    plt.title("Cell Classification with %d trees: Feature importances" %no_of_trees[i])
#    plt.bar(range(cell_feat_importance.shape[0]), cell_feat_importance)
#    plt.xticks(range(cell_feat_importance.shape[0]), cell_data.columns.values,
#               rotation=45, ha='right', fontsize='small')
#    plt.show()
#
#    plt.figure()
#    plt.title("Nuclei Classification with %d trees: Feature importances" %no_of_trees[i])
#    plt.bar(range(nuclei_feat_importance.shape[0]), nuclei_feat_importance)
#    plt.xticks(range(nuclei_feat_importance.shape[0]), nuclei_data.columns.values,
#               rotation=45, ha='right', fontsize='small')
#    plt.show()
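
""" optional: inspect a single, seeded forest in more detail """
# Hedged sketch, not part of the original pipeline: the helper name, the fixed seed and
# the use of a confusion matrix are additions for illustration only. It shows how one of
# the forests above could be examined beyond its plain accuracy score.
from sklearn.metrics import confusion_matrix

def evaluate_forest(train_X, train_y, test_X, test_y, n_trees=100, seed=0):
    """Fit one seeded RandomForestClassifier and return (accuracy, confusion matrix)."""
    clf = RandomForestClassifier(n_estimators=n_trees, random_state=seed)
    clf.fit(train_X, train_y.values.ravel())
    preds = clf.predict(test_X)
    return clf.score(test_X, test_y.values.ravel()), confusion_matrix(test_y.values.ravel(), preds)

# example call (commented out so the original flow is unchanged):
#cell_acc_seeded, cell_cm = evaluate_forest(cell_train, cell_label_train, cell_test, cell_label_test)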
""" making the data more comprehensible """
# creating dataframes
imp_cells = pd.DataFrame(data=imp_array_cells, columns=cell_data.columns)
imp_cells['number_of_trees'] = no_of_trees
imp_cells.set_index('number_of_trees', inplace=True)
imp_cells.to_csv(results_location+"/RF_imp_cells.csv")

imp_nuclei = pd.DataFrame(data=imp_array_nuclei, columns=nuclei_data.columns)
imp_nuclei['number_of_trees'] = no_of_trees
imp_nuclei.set_index('number_of_trees', inplace=True)
imp_nuclei.to_csv(results_location+"/RF_imp_nuclei.csv")

accuracy = pd.DataFrame(data=acc_array, index=['Cells', 'Nuclei'], columns=no_of_trees)
accuracy.to_csv(results_location+"/RF_accuracy.csv")

""" plotting the accuracies """
ind = np.arange(len(no_of_trees))  # the x locations for the groups
width = 0.35                       # the width of the bars

fig, ax = plt.subplots()
rects1 = ax.bar(ind - width/2, accuracy.loc['Cells'].values, width, label='Cells')
rects2 = ax.bar(ind + width/2, accuracy.loc['Nuclei'].values, width, label='Nuclei')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Accuracy', fontsize='large')
ax.set_ylim([0.85, 1])
#ax.set_title('Accuracy by number of trees and dataset')
ax.set_xlabel('number of trees', fontsize='large')
ax.set_xticks(ind)
ax.set_xticklabels(('20', '40', '60', '80', '100'), fontsize='large')
plt.grid(axis='y')
ax.legend(fontsize='x-large')
plt.savefig(results_location+"/RF_Acc.png")

""" plotting the importances """
# taking the mean of the importances over all tree counts
mean_imp_cells = imp_array_cells.mean(axis=0)
mean_imp_nuclei = imp_array_nuclei.mean(axis=0)

plt.figure()
#plt.title("Mean importances for Cell Classification with Random Forests")
plt.bar(range(cell_feat_importance.shape[0]), mean_imp_cells)
#plt.xticks(range(cell_feat_importance.shape[0]), cell_data.columns.values,
#           rotation=45, ha='right', fontsize='small')
plt.xticks(range(cell_feat_importance.shape[0]), fontsize='large')
plt.xlabel('Feature', fontsize='large')
plt.ylabel('Mean Feature Importance', fontsize='large')
plt.grid(axis='y')
# save before show, otherwise an empty figure may be written to disk
plt.savefig(results_location+"/RF_Imp_Cells.png")
plt.show()

plt.figure()
#plt.title("Mean importances for Nuclei Classification with Random Forests")
plt.bar(range(nuclei_feat_importance.shape[0]), mean_imp_nuclei)
#plt.xticks(range(nuclei_feat_importance.shape[0]), nuclei_data.columns.values,
#           rotation=45, ha='right', fontsize='small')
plt.xticks(range(nuclei_feat_importance.shape[0]), fontsize='large')
plt.xlabel('Feature', fontsize='large')
plt.ylabel('Mean Feature Importance', fontsize='large')
plt.grid(axis='y')
# save before show, otherwise an empty figure may be written to disk
plt.savefig(results_location+"/RF_Imp_Nuclei.png")
plt.show()
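
""" optional: name and rank the mean importances """
# Hedged sketch, not in the original script: the variable names below are additions for
# illustration. Pairing the mean importances with their column names and sorting them
# makes the bar charts above easier to read back without counting bars.
mean_imp_cells_named = pd.Series(mean_imp_cells, index=cell_data.columns).sort_values(ascending=False)
mean_imp_nuclei_named = pd.Series(mean_imp_nuclei, index=nuclei_data.columns).sort_values(ascending=False)
print(mean_imp_cells_named.head(10))
print(mean_imp_nuclei_named.head(10))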