# -*- coding: utf-8 -*-
"""
Created on Thu Jun 13 12:55:27 2019

@author: mhill
"""

"""
prepare_data.py

Load the data from the individual per-image CSV files and clean it up.
"""

""" importing stuff """

import pandas as pd

""" declarations """

# filename: path prefix shared by all csv files (everything in front of the
# tile, image, object and type parts of the file name)
filename = 'H:/Internship/CellProfiler/Results/Spreadsheets/cardiomyocytes'

# tile_scans: tile scans to include
tile_scans = [1, 2]

# images: image numbers to include for every tile scan
images = range(1, 10)

# neg_cellFeatures: columns of the cell tables that are not wanted/needed
neg_cellFeatures = [
    'ImageNumber',
    'ObjectNumber',
    'Mean_Nuclei_Number_Object_Number',
    'Number_Object_Number',
    'Parent_Cells_small',
    'AreaShape_Center_X',
    'AreaShape_Center_Y',
    'AreaShape_Center_Z',
    'Children_Nuclei_Count',
    'Location_CenterMassIntensity_X_Trop_rescaled',
    'Location_CenterMassIntensity_Y_Trop_rescaled',
    'Location_CenterMassIntensity_Z_Trop_rescaled',
    'Location_Center_X',
    'Location_Center_Y',
    'Location_Center_Z',
    'Location_MaxIntensity_X_Trop_rescaled',
    'Location_MaxIntensity_Y_Trop_rescaled',
    'Location_MaxIntensity_Z_Trop_rescaled',
    'Mean_Nuclei_Distance_Centroid_Cells',
    'Mean_Nuclei_Location_Center_X',
    'Mean_Nuclei_Location_Center_Y',
    'Mean_Nuclei_Location_Center_Z',
]

# neg_nucleiFeatures: columns of the nuclei tables that are not wanted/needed
neg_nucleiFeatures = [
    'ImageNumber',
    'ObjectNumber',
    'AreaShape_Center_X',
    'AreaShape_Center_Y',
    'AreaShape_Center_Z',
    'Location_CenterMassIntensity_X_DAPI_rescaled',
    'Location_CenterMassIntensity_Y_DAPI_rescaled',
    'Location_CenterMassIntensity_Z_DAPI_rescaled',
    'Location_Center_X',
    'Location_Center_Y',
    'Location_Center_Z',
    'Location_MaxIntensity_X_DAPI_rescaled',
    'Location_MaxIntensity_Y_DAPI_rescaled',
    'Location_MaxIntensity_Z_DAPI_rescaled',
    'Number_Object_Number',
    'Parent_Cells',
    'Parent_Nuclei_small',
    'Distance_Centroid_Cells',
]

# savename: folder (with trailing slash) the prepared csv files are written to
savename = 'H:/Internship/ML-Models/Data/'

""" loading csv """

# Collect every per-image table first and concatenate once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# and calling it inside a loop re-copies all accumulated rows each time.
cell_frames = []
nuclei_frames = []

# loop over tiles and images
for i in tile_scans:
    for j in images:

        # files are named <filename>_<tile>_<image>_Cells.csv / _Nuclei.csv
        file_prefix = filename + '_' + str(i) + '_' + str(j)
        cell_frames.append(pd.read_csv(file_prefix + '_Cells.csv'))
        nuclei_frames.append(pd.read_csv(file_prefix + '_Nuclei.csv'))

# pd.concat keeps every file's own row index, exactly as the repeated append
# did. It raises on an empty list, so fall back to an empty frame when no
# tile scan / image was selected.
cell_data = pd.concat(cell_frames) if cell_frames else pd.DataFrame()
nuclei_data = pd.concat(nuclei_frames) if nuclei_frames else pd.DataFrame()

""" preparing data """

# drop columns without important information; drop() returns a new frame,
# so the loaded cell_data / nuclei_data stay untouched
pruned_cell_data = cell_data.drop(columns=neg_cellFeatures)
pruned_nuclei_data = nuclei_data.drop(columns=neg_nucleiFeatures)

# drop columns containing no data
pruned_cell_data.dropna(axis=1, how='all', inplace=True)
pruned_nuclei_data.dropna(axis=1, how='all', inplace=True)

# deal with nan's: replace them with the mean of their column.
# numeric_only=True restricts the means to numeric columns; without it
# pandas >= 2.0 raises a TypeError as soon as a non-numeric column is
# present (older versions silently skipped such columns). NaNs in
# non-numeric columns are left as they are.
pruned_cell_data.fillna(pruned_cell_data.mean(numeric_only=True), inplace=True)
pruned_nuclei_data.fillna(pruned_nuclei_data.mean(numeric_only=True), inplace=True)

""" compute additional features """

# surface area to volume ratio, added as a new column to both tables
for table in (pruned_cell_data, pruned_nuclei_data):
    # divide the raw arrays element by element (row order, no index alignment)
    surface_area = table["AreaShape_SurfaceArea"].values
    volume = table["AreaShape_Volume"].values
    table["surface_area_to_volume_ratio"] = surface_area / volume

""" output """

# write both prepared tables as csv files into the savename folder
for table, target in ((pruned_cell_data, 'cell_data.csv'),
                      (pruned_nuclei_data, 'nuclei_data.csv')):
    table.to_csv(savename + target)