# -*- coding: utf-8 -*-
"""
prepare_data.py

Load the per-tile / per-image CellProfiler spreadsheets, clean them up and
write one combined csv file for the cells and one for the nuclei.

Created on Thu Jun 13 12:55:27 2019

@author: mhill
"""

import pandas as pd


# --------------------------------------------------------------------------
# declarations
# --------------------------------------------------------------------------

# filename = location of your csv files (minus tile, image, object and type)
filename = 'H:/Internship/CellProfiler/Results/Spreadsheets/cardiomyocytes'

# tile_scans = tile scans you want to consider
tile_scans = [1, 2]

# images = images you want to consider
images = range(1, 10)

# neg_cellFeatures / neg_nucleiFeatures = columns you don't want/need
neg_cellFeatures = [
    'ImageNumber',
    'ObjectNumber',
    'Mean_Nuclei_Number_Object_Number',
    'Number_Object_Number',
    'Parent_Cells_small',
    'AreaShape_Center_X',
    'AreaShape_Center_Y',
    'AreaShape_Center_Z',
    'Children_Nuclei_Count',
    'Location_CenterMassIntensity_X_Trop_rescaled',
    'Location_CenterMassIntensity_Y_Trop_rescaled',
    'Location_CenterMassIntensity_Z_Trop_rescaled',
    'Location_Center_X',
    'Location_Center_Y',
    'Location_Center_Z',
    'Location_MaxIntensity_X_Trop_rescaled',
    'Location_MaxIntensity_Y_Trop_rescaled',
    'Location_MaxIntensity_Z_Trop_rescaled',
    'Mean_Nuclei_Distance_Centroid_Cells',
    'Mean_Nuclei_Location_Center_X',
    'Mean_Nuclei_Location_Center_Y',
    'Mean_Nuclei_Location_Center_Z',
]

neg_nucleiFeatures = [
    'ImageNumber',
    'ObjectNumber',
    'AreaShape_Center_X',
    'AreaShape_Center_Y',
    'AreaShape_Center_Z',
    'Location_CenterMassIntensity_X_DAPI_rescaled',
    'Location_CenterMassIntensity_Y_DAPI_rescaled',
    'Location_CenterMassIntensity_Z_DAPI_rescaled',
    'Location_Center_X',
    'Location_Center_Y',
    'Location_Center_Z',
    'Location_MaxIntensity_X_DAPI_rescaled',
    'Location_MaxIntensity_Y_DAPI_rescaled',
    'Location_MaxIntensity_Z_DAPI_rescaled',
    'Number_Object_Number',
    'Parent_Cells',
    'Parent_Nuclei_small',
    'Distance_Centroid_Cells',
]

# savename = location where your prepared csv files shall be stored
savename = 'H:/Internship/ML-Models/Data/'


# --------------------------------------------------------------------------
# helpers
# --------------------------------------------------------------------------

def load_tables(prefix, tiles, image_ids, object_type):
    """Read all spreadsheets of one object type and stack their rows.

    One file named ``<prefix>_<tile>_<image>_<object_type>.csv`` is read for
    every combination of ``tiles`` and ``image_ids`` (tiles vary slowest).

    Parameters
    ----------
    prefix : str
        Path of the csv files without the tile/image/object-type suffix.
    tiles, image_ids : iterable
        Tile-scan and image identifiers to combine.
    object_type : str
        File suffix that selects the object table, e.g. ``'Cells'``.

    Returns
    -------
    pandas.DataFrame
        All rows in reading order. Every file keeps its own row index, so
        index values repeat across files. An empty frame is returned when
        there is nothing to read.
    """
    frames = [
        pd.read_csv('{}_{}_{}_{}.csv'.format(prefix, tile, image, object_type))
        for tile in tiles
        for image in image_ids
    ]
    # pd.concat raises on an empty list, so handle "nothing selected" here.
    if not frames:
        return pd.DataFrame()
    # A single concat replaces the repeated DataFrame.append calls:
    # append was removed in pandas 2.0 and copied the whole frame each time.
    return pd.concat(frames)


def prune_features(table, unwanted_columns):
    """Return a cleaned copy of ``table``; the input is left untouched.

    Steps, in order: drop ``unwanted_columns``, drop columns that contain
    no data at all, then replace the remaining NaNs by the column mean.

    Raises
    ------
    KeyError
        If one of ``unwanted_columns`` is not present in ``table``.
    """
    pruned = table.drop(columns=unwanted_columns)
    pruned = pruned.dropna(axis=1, how='all')
    # numeric_only=True: only numeric columns have a mean. Without it
    # pandas >= 2.0 raises TypeError as soon as a text column is present.
    return pruned.fillna(pruned.mean(numeric_only=True))


def add_surface_area_to_volume_ratio(table):
    """Return a copy of ``table`` with a ``surface_area_to_volume_ratio`` column.

    The ratio is ``AreaShape_SurfaceArea / AreaShape_Volume``, computed row
    by row. A volume of 0 yields ``inf`` (or ``nan`` for 0 / 0).
    """
    # .values divides positionally; the row index repeats across the source
    # files, so index-based alignment is deliberately avoided.
    ratio = table["AreaShape_SurfaceArea"].values / table["AreaShape_Volume"].values
    return table.assign(surface_area_to_volume_ratio=ratio)


# --------------------------------------------------------------------------
# script
# --------------------------------------------------------------------------

# Guarded so that importing this module (to reuse the helpers above) does
# not touch the file system. Running the file as a script still creates the
# same module-level variables as before.
if __name__ == "__main__":
    # loading csv
    cell_data = load_tables(filename, tile_scans, images, 'Cells')
    nuclei_data = load_tables(filename, tile_scans, images, 'Nuclei')

    # preparing data
    pruned_cell_data = prune_features(cell_data, neg_cellFeatures)
    pruned_nuclei_data = prune_features(nuclei_data, neg_nucleiFeatures)

    # compute additional features
    pruned_cell_data = add_surface_area_to_volume_ratio(pruned_cell_data)
    pruned_nuclei_data = add_surface_area_to_volume_ratio(pruned_nuclei_data)

    # output
    pruned_cell_data.to_csv(savename + 'cell_data.csv')
    pruned_nuclei_data.to_csv(savename + 'nuclei_data.csv')