# -*- coding: utf-8 -*-
"""
Created on Wed Jul 31 15:19:18 2019

@author: mhill
"""

"""
AdaBoost.py

uses AdaBoost on the data
"""

""" importing stuff """

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier

""" declarations and loading data """

# where is your data stored?
data_location = "H:/Internship/ML-Models/Data/2D"

# where is the ground truth stored?
truth_location = "H:/Internship/ML-Models/Results/2D/Clustering"

# which clustering shall be the ground truth?
truth_cluster_cells = ['eps_10_samples_5']
truth_cluster_nuclei = ['eps_0.5_samples_5']

# load data
cell_data = pd.read_csv(data_location+"/cell_data.csv").drop(columns=["Unnamed: 0"])
nuclei_data = pd.read_csv(data_location+"/nuclei_data.csv").drop(columns=["Unnamed: 0"])

# load ground truth
cell_ground_truth = pd.read_csv(truth_location+"/clustered_cell_data.csv")[truth_cluster_cells]
nuclei_ground_truth = pd.read_csv(truth_location+"/clustered_nuclei_data.csv")[truth_cluster_nuclei]

# append ground truth to data
cell_data['ground_truth'] = cell_ground_truth
cell_ground_truth.drop(columns=truth_cluster_cells, inplace=True)
nuclei_data['ground_truth'] = nuclei_ground_truth
nuclei_ground_truth.drop(columns=truth_cluster_nuclei, inplace=True)

# shuffle data
cell_data.sample(frac=1)
nuclei_data.sample(frac=1)

# split the labels off again
cell_ground_truth['ground_truth'] = cell_data['ground_truth']
cell_data.drop(columns=['ground_truth'], inplace=True)
nuclei_ground_truth['ground_truth'] = nuclei_data['ground_truth']
nuclei_data.drop(columns=['ground_truth'], inplace=True)

# ensemble sizes (n_estimators) that will be tried for AdaBoost
no_of_est = [20, 40, 60, 80, 100]

# result buffers, one slot per ensemble size:
#   acc_array        -> row 0 = cells, row 1 = nuclei
#   imp_array_*      -> one row per ensemble size, one column per feature
n_settings = len(no_of_est)
acc_array = np.zeros((2, n_settings))
imp_array_cells = np.zeros((n_settings, cell_data.shape[1]))
imp_array_nuclei = np.zeros((n_settings, nuclei_data.shape[1]))

""" create training and test sets """

# 20 % of each dataset is held out for testing
(cell_train, cell_test,
 cell_label_train, cell_label_test) = train_test_split(
    cell_data, cell_ground_truth, test_size=0.2)
(nuclei_train, nuclei_test,
 nuclei_label_train, nuclei_label_test) = train_test_split(
    nuclei_data, nuclei_ground_truth, test_size=0.2)

""" AdaBoost """

for i in range(len(no_of_est)):

    # defining
    ada_cells = AdaBoostClassifier(n_estimators=no_of_est[i])
    ada_nuclei = AdaBoostClassifier(n_estimators=no_of_est[i])

    # training
    ada_cells.fit(cell_train, cell_label_train)
    ada_nuclei.fit(nuclei_train, nuclei_label_train)

    # testing
    cell_acc = ada_cells.score(cell_test, cell_label_test)
    acc_array[0,i] = cell_acc
    nuclei_acc = ada_nuclei.score(nuclei_test, nuclei_label_test)
    acc_array[1,i] = nuclei_acc

    # feature importances
    cell_feat_imp = ada_cells.feature_importances_
    imp_array_cells[i,:] = cell_feat_imp
    nuclei_feat_imp = ada_nuclei.feature_importances_
    imp_array_nuclei[i,:] = nuclei_feat_imp

""" making the data more comprehensible """

# creating dataframes
imp_cells = pd.DataFrame(data=imp_array_cells ,columns=cell_data.columns)
imp_cells['number_of_estimators'] = no_of_est
imp_cells.set_index('number_of_estimators', inplace=True)
imp_cells.to_csv('H:/Internship/ML-Models/Results/2D/Classification/AB_imp_cells.csv')
imp_nuclei = pd.DataFrame(data=imp_array_nuclei, columns=nuclei_data.columns)
imp_nuclei['number_of_estimators'] = no_of_est
imp_nuclei.set_index('number_of_estimators', inplace=True)
imp_nuclei.to_csv('H:/Internship/ML-Models/Results/2D/Classification/AB_imp_nuclei.csv')
accuracy = pd.DataFrame(data=acc_array, index=['Cells','Nuclei'], columns=no_of_est)
accuracy.to_csv('H:/Internship/ML-Models/Results/2D/Classification/AB_accuracy.csv')

""" plotting like guy fawkes, y'all """

# accuracies
ind = np.arange(len(no_of_est))  # the x locations for the groups
width = 0.35  # the width of the bars

fig, ax = plt.subplots()
rects1 = ax.bar(ind - width/2, accuracy.loc['Cells'].values, width, label='Cells')
rects2 = ax.bar(ind + width/2, accuracy.loc['Nuclei'].values, width, label='Nuclei')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Accuracy', fontsize='large')
ax.set_ylim([0,1])
#ax.set_title('Accuracy by number of trees and dataset')
ax.set_xlabel('number of estimators', fontsize='large')
ax.set_xticks(ind)
ax.set_xticklabels(('20', '40', '60', '80', '100'), fontsize='large')
plt.grid(axis='y')
ax.legend(fontsize='x-large')
plt.savefig('H:/Internship/ML-Models/Results/2D/Classification/AB_Acc.png')

""" plotting the importances """

# summing up and taking the mean of the importances
mean_imp_cells = sum(imp_array_cells) / len(no_of_est)
mean_imp_nuclei = sum(imp_array_nuclei) / len(no_of_est)

plt.figure()
#plt.title("Mean importances for Cell Classification with Random Forests")
plt.bar(range(cell_feat_imp.shape[0]), mean_imp_cells)
#plt.xticks(range(cell_feat_importance.shape[0]), cell_data.columns.values, \
#           rotation=45, ha='right', fontsize='small')
plt.xticks(range(cell_feat_imp.shape[0]), fontsize='large')
plt.xlabel('Feature', fontsize='large')
plt.ylabel('Mean Feature Importance', fontsize='large')
plt.grid(axis='y')
plt.show()
plt.savefig('H:/Internship/ML-Models/Results/2D/Classification/AB_Imp_Cells.png')

plt.figure()
#plt.title("Mean importances for Nuclei Classification with Random Forests")
plt.bar(range(nuclei_feat_imp.shape[0]), mean_imp_nuclei)
#plt.xticks(range(nuclei_feat_importance.shape[0]), nuclei_data.columns.values, \
#          rotation=45, ha='right', fontsize='small')
plt.xticks(range(nuclei_feat_imp.shape[0]), fontsize='large')
plt.xlabel('Feature', fontsize='large')
plt.ylabel('Mean Feature Importance', fontsize='large')
plt.grid(axis='y')
plt.show()
plt.savefig('H:/Internship/ML-Models/Results/2D/Classification/AB_Imp_Nuclei.png')