# -*- coding: utf-8 -*-
"""
Created on Thu Jul 25 11:58:28 2019

@author: mhill
"""

""" RF.py uses Random Forest to classify the data """

""" importing stuff """
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

""" declarations and loading data """
# where is your data stored?
data_location = "H:/Internship/ML-Models/Data"
# where is the ground truth stored?
truth_location = "H:/Internship/ML-Models/Results/Clustering"
# where shall the results be stored?
results_location = "H:/Internship/ML-Models/Results/Classification"
# which clustering shall be the ground truth?
truth_cluster_cells = ['eps_10_samples_5']
truth_cluster_nuclei = ['eps_2_samples_5']

# load data
cell_data = pd.read_csv(data_location+"/cell_data.csv").drop(columns=["Unnamed: 0"])
nuclei_data = pd.read_csv(data_location+"/nuclei_data.csv").drop(columns=["Unnamed: 0"])

# load ground truth
cell_ground_truth = pd.read_csv(truth_location+"/clustered_cell_data.csv")[truth_cluster_cells]
nuclei_ground_truth = pd.read_csv(truth_location+"/clustered_nuclei_data.csv")[truth_cluster_nuclei]

# append ground truth to data, so the labels travel with their rows while shuffling
cell_data['ground_truth'] = cell_ground_truth
cell_ground_truth.drop(columns=truth_cluster_cells, inplace=True)
nuclei_data['ground_truth'] = nuclei_ground_truth
nuclei_ground_truth.drop(columns=truth_cluster_nuclei, inplace=True)

# shuffle data (sample() returns a copy, so the result has to be assigned back)
cell_data = cell_data.sample(frac=1).reset_index(drop=True)
nuclei_data = nuclei_data.sample(frac=1).reset_index(drop=True)

# split the labels off again
cell_ground_truth['ground_truth'] = cell_data['ground_truth']
cell_data.drop(columns=['ground_truth'], inplace=True)
nuclei_ground_truth['ground_truth'] = nuclei_data['ground_truth']
nuclei_data.drop(columns=['ground_truth'], inplace=True)

# number of trees in the random forests
no_of_trees = [20, 40, 60, 80, 100]

# create arrays, where accuracy and importances shall be stored
acc_array = np.zeros((2, len(no_of_trees)))
imp_array_cells = np.zeros((len(no_of_trees), len(cell_data.columns.values)))
imp_array_nuclei = np.zeros((len(no_of_trees), len(nuclei_data.columns.values)))

""" create training and test sets """
cell_train, cell_test, cell_label_train, cell_label_test = train_test_split(cell_data, cell_ground_truth, test_size=0.2)
nuclei_train, nuclei_test, nuclei_label_train, nuclei_label_test = train_test_split(nuclei_data, nuclei_ground_truth, test_size=0.2)

""" Random Forest """
for i in range(len(no_of_trees)):
    # defining
    rf_cells = RandomForestClassifier(n_estimators=no_of_trees[i])
    rf_nuclei = RandomForestClassifier(n_estimators=no_of_trees[i])

    # training (labels are ravelled to 1-D arrays to avoid sklearn's column-vector warning)
    rf_cells.fit(cell_train, cell_label_train.values.ravel())
    rf_nuclei.fit(nuclei_train, nuclei_label_train.values.ravel())

    # testing
    cell_acc = rf_cells.score(cell_test, cell_label_test.values.ravel())
    acc_array[0, i] = cell_acc
    nuclei_acc = rf_nuclei.score(nuclei_test, nuclei_label_test.values.ravel())
    acc_array[1, i] = nuclei_acc

    # feature importance
    cell_feat_importance = rf_cells.feature_importances_
    imp_array_cells[i, :] = cell_feat_importance
    nuclei_feat_importance = rf_nuclei.feature_importances_
    imp_array_nuclei[i, :] = nuclei_feat_importance

    """ putting stuff out there """
#    # printing accuracies
#    print("Mean accuracy score for cell predictions - %d trees: %.4f" %(no_of_trees[i], cell_acc))
#    print("Mean accuracy score for nuclei predictions - %d trees: %.4f" %(no_of_trees[i], nuclei_acc))
#
#    # feature importance graphs
#    plt.figure()
#    plt.title("Cell Classification with %d trees: Feature importances" %no_of_trees[i])
#    plt.bar(range(cell_feat_importance.shape[0]), cell_feat_importance)
#    plt.xticks(range(cell_feat_importance.shape[0]), cell_data.columns.values,
#               rotation=45, ha='right', fontsize='small')
#    plt.show()
#
#    plt.figure()
#    plt.title("Nuclei Classification with %d trees: Feature importances" %no_of_trees[i])
#    plt.bar(range(nuclei_feat_importance.shape[0]), nuclei_feat_importance)
#    plt.xticks(range(nuclei_feat_importance.shape[0]), nuclei_data.columns.values,
#               rotation=45, ha='right', fontsize='small')
#    plt.show()
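
""" optional: inspect a single, seeded forest in more detail """
# Hedged sketch, not part of the original pipeline: the helper name, the fixed seed and
# the use of a confusion matrix are additions for illustration only. It shows how one of
# the forests above could be examined beyond its plain accuracy score.
from sklearn.metrics import confusion_matrix

def evaluate_forest(train_X, train_y, test_X, test_y, n_trees=100, seed=0):
    """Fit one seeded RandomForestClassifier and return (accuracy, confusion matrix)."""
    clf = RandomForestClassifier(n_estimators=n_trees, random_state=seed)
    clf.fit(train_X, train_y.values.ravel())
    preds = clf.predict(test_X)
    return clf.score(test_X, test_y.values.ravel()), confusion_matrix(test_y.values.ravel(), preds)

# example call (commented out so the original flow is unchanged):
#cell_acc_seeded, cell_cm = evaluate_forest(cell_train, cell_label_train, cell_test, cell_label_test)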
""" making the data more comprehensible """
# creating dataframes
imp_cells = pd.DataFrame(data=imp_array_cells, columns=cell_data.columns)
imp_cells['number_of_trees'] = no_of_trees
imp_cells.set_index('number_of_trees', inplace=True)
imp_cells.to_csv(results_location+"/RF_imp_cells.csv")

imp_nuclei = pd.DataFrame(data=imp_array_nuclei, columns=nuclei_data.columns)
imp_nuclei['number_of_trees'] = no_of_trees
imp_nuclei.set_index('number_of_trees', inplace=True)
imp_nuclei.to_csv(results_location+"/RF_imp_nuclei.csv")

accuracy = pd.DataFrame(data=acc_array, index=['Cells', 'Nuclei'], columns=no_of_trees)
accuracy.to_csv(results_location+"/RF_accuracy.csv")

""" plotting the accuracies """
ind = np.arange(len(no_of_trees))  # the x locations for the groups
width = 0.35                       # the width of the bars

fig, ax = plt.subplots()
rects1 = ax.bar(ind - width/2, accuracy.loc['Cells'].values, width, label='Cells')
rects2 = ax.bar(ind + width/2, accuracy.loc['Nuclei'].values, width, label='Nuclei')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Accuracy', fontsize='large')
ax.set_ylim([0.85, 1])
#ax.set_title('Accuracy by number of trees and dataset')
ax.set_xlabel('number of trees', fontsize='large')
ax.set_xticks(ind)
ax.set_xticklabels(('20', '40', '60', '80', '100'), fontsize='large')
plt.grid(axis='y')
ax.legend(fontsize='x-large')
plt.savefig(results_location+"/RF_Acc.png")

""" plotting the importances """
# taking the mean of the importances over all tree counts
mean_imp_cells = imp_array_cells.mean(axis=0)
mean_imp_nuclei = imp_array_nuclei.mean(axis=0)

plt.figure()
#plt.title("Mean importances for Cell Classification with Random Forests")
plt.bar(range(cell_feat_importance.shape[0]), mean_imp_cells)
#plt.xticks(range(cell_feat_importance.shape[0]), cell_data.columns.values,
#           rotation=45, ha='right', fontsize='small')
plt.xticks(range(cell_feat_importance.shape[0]), fontsize='large')
plt.xlabel('Feature', fontsize='large')
plt.ylabel('Mean Feature Importance', fontsize='large')
plt.grid(axis='y')
# save before show, otherwise an empty figure may be written to disk
plt.savefig(results_location+"/RF_Imp_Cells.png")
plt.show()

plt.figure()
#plt.title("Mean importances for Nuclei Classification with Random Forests")
plt.bar(range(nuclei_feat_importance.shape[0]), mean_imp_nuclei)
#plt.xticks(range(nuclei_feat_importance.shape[0]), nuclei_data.columns.values,
#           rotation=45, ha='right', fontsize='small')
plt.xticks(range(nuclei_feat_importance.shape[0]), fontsize='large')
plt.xlabel('Feature', fontsize='large')
plt.ylabel('Mean Feature Importance', fontsize='large')
plt.grid(axis='y')
# save before show, otherwise an empty figure may be written to disk
plt.savefig(results_location+"/RF_Imp_Nuclei.png")
plt.show()
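
""" optional: name and rank the mean importances """
# Hedged sketch, not in the original script: the variable names below are additions for
# illustration. Pairing the mean importances with their column names and sorting them
# makes the bar charts above easier to read back without counting bars.
mean_imp_cells_named = pd.Series(mean_imp_cells, index=cell_data.columns).sort_values(ascending=False)
mean_imp_nuclei_named = pd.Series(mean_imp_nuclei, index=nuclei_data.columns).sort_values(ascending=False)
print(mean_imp_cells_named.head(10))
print(mean_imp_nuclei_named.head(10))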