diff --git a/.idea/AIS_Regress.iml b/.idea/AIS_Regress.iml index 8dc09e5476bcb840206461450ae44f23421d964a..74e2033a0a270c6e69b102c4cc7d107819aa6b2b 100644 --- a/.idea/AIS_Regress.iml +++ b/.idea/AIS_Regress.iml @@ -2,7 +2,7 @@ <module type="PYTHON_MODULE" version="4"> <component name="NewModuleRootManager"> <content url="file://$MODULE_DIR$" /> - <orderEntry type="inheritedJdk" /> + <orderEntry type="jdk" jdkName="Python 3.7 (BaseEnv)" jdkType="Python SDK" /> <orderEntry type="sourceFolder" forTests="false" /> </component> <component name="TestRunnerService"> diff --git a/.idea/misc.xml b/.idea/misc.xml index d1e22ecb89619a9c2dcf51a28d891a196d2462a0..e5f7f469df51294ab9ccfd5f85eee4b788eb4dfa 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,7 @@ <?xml version="1.0" encoding="UTF-8"?> <project version="4"> - <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" /> + <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (BaseEnv)" project-jdk-type="Python SDK" /> + <component name="PyCharmProfessionalAdvertiser"> + <option name="shown" value="true" /> + </component> </project> \ No newline at end of file diff --git a/Experiments/1-VariablesEvaluation.py b/Experiments/1-VariablesEvaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..4a57dcddb550d4b2f280328ac11da52efbb8dd66 --- /dev/null +++ b/Experiments/1-VariablesEvaluation.py @@ -0,0 +1,200 @@ +from IO_utils.clean_table import clean_table +from IO_utils.statistics_utils import get_pvalue, compute_basic_statistics, compute_bivariate_statistics +from IO_utils.List_reader_utils import cross_check_dictionary, remove_features, get_all_dict_types, treat_missing_values +from _utils.plot_utils import plot_mv, plot_distribution_categorical, plot_distribution_numerical, \ + plot_significant_values +from IO_utils.FeaturePreprocessing import FeaturePreprocessing + +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd +import os +import yaml +from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV + + +# %% DATALOADIND +def get_statistics(input_df, data_dictionary, output_dir='../../out/data_exploration/statistics'): + tables = ['Admission', 'Pre-EVT', 'Post-EVT', 'After24h'] + + d, all_indices = cross_check_dictionary(input_df, data_dictionary, tables, output=[]) + + # Reorder dataframe according to data dictionary + reordered_df = input_df.reindex(columns=all_indices).drop(columns=['Id']) + + types = [] + + all_df, all_missing_values, clean_keys = remove_features(reordered_df, p=1, exclude=[]) + + for t in tables: + types.extend(get_all_dict_types(data_dictionary[t], types=[])) + + # Compute statistics of the selected tables + statistics = compute_basic_statistics(all_df) + statistics_bivariate = compute_bivariate_statistics(all_df, input_df['dmRS'], input_df['mortality'], types) + + p_dmRS, methods = get_pvalue(all_df, input_df['dmRS'], types) + p_mortality, _ = get_pvalue(all_df, input_df['mortality'], types) + # p_shiftmRS, _ = get_pvalue(all_df, input_df['shift_mRS'], types) + + statistics['p_dmRS'] = p_dmRS + statistics['p_mortality'] = p_mortality + # statistics['shift_mRS'] = p_shiftmRS + statistics['method'] = methods + + statistics['missing_values'] = all_missing_values + statistics['Percentage (%)'] = (statistics['missing_values'] * 100. / input_df.shape[0]).to_list() + statistics['types'] = types + + statistics_bivariate['p_dmRS'] = p_dmRS + statistics_bivariate['p_mortality'] = p_mortality + + #plot_significant_values(statistics, value='p_mortality', th=0.01, out_dir=output_dir) + #plot_significant_values(statistics, value='p_dmRS', th=0.01, out_dir=output_dir) + + #plot_significant_values(statistics, value='p_mortality', th=0.05, out_dir=output_dir) + #plot_significant_values(statistics, value='p_dmRS', th=0.05, out_dir=output_dir) + + # Save file + file_path = os.path.join(output_dir, 'output.xlsx') + # assert os.path.isfile(file_path), 'File already exists' + with pd.ExcelWriter(file_path) as writer: + statistics.to_excel(writer, sheet_name='Sheet1') + + # Save file + file_path_bivariate = os.path.join(output_dir, 'output_bivariate.xlsx') + # assert os.path.isfile(file_path), 'File already exists' + with pd.ExcelWriter(file_path_bivariate) as writer: + statistics_bivariate.to_excel(writer, sheet_name='Sheet1') + + return statistics + + +## Clean original table +excel_dir = "../../data/TheList_anonymous_mv.xlsx" +clean_df = clean_table(excel_dir=excel_dir, pre_mRS=2) +output_features = ['dmRS', 'mRS90d', 'shift_mRS', 'mortality'] +tables = ['Admission', 'Pre-EVT', 'Post-EVT', 'After24h'] +data_dicts = yaml.load(open("../dictionaries/dictionary_timepoints.yml"), Loader=yaml.Loader) +dir_out = "C:/Users/martinca1/PhD/Projects/AI_Stroke/out/results/SR_results/data_exploration" + +# Output paths +missing_values_path = os.path.join(dir_out, 'missing_values') +statistics_path = os.path.join(dir_out, 'statistics') +features_path = os.path.join(dir_out, 'features') + +# Get statistics without removing missing values etc +statistics = get_statistics(clean_df, data_dicts, + output_dir=statistics_path) + +#### Steps from List Reader +selected_d, all_keys = cross_check_dictionary(clean_df, data_dicts, tables, output_features) +reordered_df = clean_df.reindex(columns=all_keys) +clean_df, _, clean_keys = remove_features(reordered_df, p=0.1, exclude=output_features) +keys = [i for c, i in enumerate(all_keys) if not clean_keys[c]] +for k in keys: + selected_d.pop(k) +final_df = treat_missing_values(clean_df, method='median') +FP = FeaturePreprocessing(final_df.drop(columns=['dmRS', 'mRS90d', 'shift_mRS', 'mortality']), selected_d) + +# Remove data dictionaries of removed features +# Get more important features from lr and random forest + +for output_feature in ['dmRS', 'mortality']: + output_vector = final_df[output_feature].to_numpy(dtype=int).squeeze() + feature_vector = FP.create_features(final_df.drop(columns=['dmRS', 'mRS90d', 'shift_mRS', 'mortality'])) + names = FP.get_feature_names() + + rf = RandomForestClassifier(random_state=True, max_depth=5, n_estimators=100) + f = rf.fit(feature_vector, output_vector) + from sklearn.inspection import permutation_importance + result = permutation_importance( + f, feature_vector, output_vector, n_repeats=10, random_state=42, n_jobs=2 + ) + importance = result.importances_mean + importance_indices = np.argsort(importance)[::-1] + + plt.figure(figsize=(15, 6)) + x =[names[x] for x in importance_indices[0:20]] + plt.bar(x, importance[importance_indices[0:20]], yerr=result.importances_std[importance_indices[0:20]]) + plt.xticks(rotation=15, ha='right') + + plt.show() + + +## Missing values +missing_values = statistics['missing_values'] +plot_mv(missing_values, th=0.1 * clean_df.shape[0], out_dir=missing_values_path) + +with pd.ExcelWriter(os.path.join(missing_values_path, 'missing_values.xlsx')) as writer: + statistics[['missing_values', 'Percentage (%)']].to_excel(writer, float_format="%0.1f") + +if not os.path.isdir(os.path.join(features_path, 'Target')): + os.mkdir(os.path.join(features_path, 'Target')) +## Target +plot_distribution_categorical(clean_df, 'mRS90d', table=data_dicts['Output'], + title='Distribution of mRS at 90 days', + out=True, + out_dir=os.path.join(features_path, 'Target')) + +plot_distribution_categorical(clean_df, 'dmRS', table=data_dicts['Output'], + title='Distribution of functional outcome at 90 days', + out=True, + out_dir=os.path.join(features_path, 'Target')) + +plot_distribution_categorical(clean_df, 'shift_mRS', table=data_dicts['Output'], + title='Distribution shift in mRS at 90 days', + out=True, + out_dir=os.path.join(features_path, 'Target')) + +plot_distribution_categorical(clean_df, 'mortality', table=data_dicts['Output'], + title='Distribution of mortality at 90 days', + out=True, + out_dir=os.path.join(features_path, 'Target')) + +#### Table details +tables = ['Admission', 'Pre-EVT', 'Post-EVT', 'After24h'] +for t in tables: + keys_table = list(data_dicts[t].keys()) + if not os.path.isdir(os.path.join(features_path, t)): + os.mkdir(os.path.join(features_path, t)) + + # Save individual statistics of each table + # with pd.ExcelWriter(os.path.join(statistics_path, 'statistics_{}.xlsx'.format(t))) as writer: + # statistics_table = statistics.loc[keys_table, :] + # statistics_table.to_excel(writer, float_format="%0.5f") + + # Add target values to k for visualization and check that they are on the clean table + keys_table = keys_table + ['mRS90d', 'dmRS', 'shift_mRS', 'mortality'] + keys_table = list(set(keys_table)) + keys_table = [a for a in keys_table if a in clean_df.columns] + df_table = clean_df[keys_table] + + # plot_correlation_features(df_table, os.path.join(statistics_path, + # 'Correlation_{}_table.png'.format(t))) + + for k in keys_table: + print(k) + if k in ['mRS90d', 'dmRS', 'shift_mRS', 'mortality']: + continue + p_values = [statistics.loc[k, target] for target in ['p_dmRS', 'p_mortality']] + type_k = data_dicts[t][k]['type'] + if type_k in ['cat', 'ord']: + plot_distribution_categorical(clean_df, k, + table=data_dicts[t], title='Distribution {}'.format(k), + out_dir='C:/Users/martinca1/PhD/Projects/AI_Stroke/out/results/SR_results/data_exploration/features/{}'. + format(t), + p_values=p_values) + + elif type_k in ['int', 'float']: + print('int') + + plot_distribution_numerical(clean_df, k, + title='Distribution {}'.format(k), + out_dir='C:/Users/martinca1/PhD/Projects/AI_Stroke/out/results/SR_results/data_exploration/features/{}' + .format(t), + p_values=p_values) + +plt.close() diff --git a/Experiments/2-VariableSelection.py b/Experiments/2-VariableSelection.py new file mode 100644 index 0000000000000000000000000000000000000000..1205afa82d34c84727170cafb108f9347ff19082 --- /dev/null +++ b/Experiments/2-VariableSelection.py @@ -0,0 +1,162 @@ +from IO_utils.clean_table import clean_table +from IO_utils.List_Reader import TableReader +from IO_utils.split_utils import split_data_cv +from IO_utils.FeaturePreprocessing import FeaturePreprocessing +from IO_utils.Dataloader import MyDataLoader +from _utils.Result_container import Result_container +from train import train_model +from train_graph import train_model_graph +from architectures.ML_algorithms import apply_LR, apply_mlp, apply_random_forest, apply_xgbBoost +# %% DATALOADIND +import torch +import os + +## Clean original table +excel_dir = "../../data/TheList_anonymous_mv.xlsx" +clean_df = clean_table(excel_dir=excel_dir, pre_mRS=2) + +# Given a clean table get features and labels +table = TableReader(input_df=clean_df, tables=['all_timepoints'], data_dictionaries='timepoints', mv_strategy='median', + output_feature=['dmRS']) + +output_vector = table.output_vector +meta_vector = table.meta_df + +fold_indices = split_data_cv(output_vector, seed=5, cv=5) + +methods = ['all', 'p_value', 'random_forest', 'mrmr'] +#methods = ['mrmr'] +Result_c = Result_container(target_metrics=['auc', 'accuracy', 'balanced_accuracy', 'f1', 'cm'], + output=['FCN', 'LR', 'RF', 'MLP', 'XGB', 'Graph']) +#### From all variables evaluate different variable selection methods + +for method in methods: + + for k in [3, 5, 10, 15, 20, 30, 50]: + #for k in [5,10]: + if method == 'all' and k > 3: + continue + + features = table.select_features(method=method, k=k, fold_indices=fold_indices) + + feature_vector = table.final_df[features] + FP = FeaturePreprocessing(feature_vector, table.selected_d) + feature_vector = FP.create_features(feature_vector) + + config = {'lr': 0.001, + 'momentum': 0, + 'weight_decay': 0.001, + 'layers': {'number': 3, + 'layer1': 80, + 'layer2': 20 + + }, + 'dropout': 0, + 'classification': True, + 'out_classes': 2} + + config_graph = { + 'Age': True, + 'beta_Age': 2, + 'Sex': False, + 'pre-mRS': True, + 'beta_mRS': 1, + 'NIHSS': False, + 'beta_NIHSS': 1 + + } + metrics_FCN = {} + metrics_LR = {} + metrics_RF = {} + metrics_MLP = {} + metrics_XGB = {} + metrics_Graph = {} + for f in range(5): + # FCN + ("Training FCN of fold {}".format(f)) + dataloader_fold = MyDataLoader(feature_vector, output_vector, fold_indices[f], + table.selected_d,load_images=False, meta=meta_vector, one_hot=True) + + dl = dataloader_fold.get_loaders() + dl_graph = dataloader_fold.build_graph(config_graph) + + torch.manual_seed(0) + print('-----------------Training FCN--------------------- ') + + result, model = train_model(config, loaders=dl) + metrics_FCN['Fold {0}'.format(f)] = result['val_metrics'] + print('-------------------------------------- ') + + # LR + print('-----------------Training LR--------------------- ') + _, result_LR = apply_LR(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_LR['Fold {0}'.format(f)] = result_LR[1] + print("AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_LR[0]['auc'], + result_LR[1]['auc'], + result_LR[2][ + 'auc'])) + print('-------------------------------------- ') + + # RF + print('-----------------Training RF--------------------- ') + + _, result_RF = apply_random_forest(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_RF['Fold {0}'.format(f)] = result_RF[1] + print("AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_RF[0]['auc'], + result_RF[1]['auc'], + result_RF[2][ + 'auc'])) + print('-------------------------------------- ') + # MLP + print('-----------------Training MLP--------------------- ') + + _, result_MLP = apply_mlp(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_MLP['Fold {0}'.format(f)] = result_MLP[1] + print( + "AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_MLP[0]['auc'], + result_MLP[1]['auc'], + result_MLP[2]['auc'])) + print('-------------------------------------- ') + # XGB Boost + print('-----------------Training XGB Boost--------------------- ') + + _, result_XGB = apply_xgbBoost(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_XGB['Fold {0}'.format(f)] = result_XGB[1] + print( + "AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_XGB[0]['auc'], + result_XGB[1]['auc'], + result_XGB[2]['auc'])) + print('-------------------------------------- ') + print('-----------------Training Graph--------------------- ') + + result, _ = train_model_graph(config, loaders=dl_graph, indices=fold_indices[f]) + metrics_Graph['Fold {0}'.format(f)] = result['val_metrics'] + print('-------------------------------------- ') + + if method == 'all': + Result_c.update('FCN', method, metrics_FCN) + Result_c.update('LR', method, metrics_LR) + Result_c.update('RF', method, metrics_RF) + Result_c.update('MLP', method, metrics_MLP) + Result_c.update('XGB', method, metrics_XGB) + Result_c.update('Graph', method, metrics_Graph) + + else: + Result_c.update('FCN', method + '_' + str(k), metrics_FCN) + Result_c.update('LR', method + '_' + str(k), metrics_LR) + Result_c.update('RF', method + '_' + str(k), metrics_RF) + Result_c.update('MLP', method + '_' + str(k), metrics_MLP) + Result_c.update('XGB', method + '_' + str(k), metrics_XGB) + Result_c.update('Graph', method + '_' + str(k), metrics_Graph) + + #Saves results on validation set + output_dir = 'C:/Users/martinca1/PhD/Projects/AI_Stroke/out/results/preliminary results' + Result_c.save(output_dir=output_dir, name='Variable_selection_val') diff --git a/Experiments/3-MVStrategy.py b/Experiments/3-MVStrategy.py new file mode 100644 index 0000000000000000000000000000000000000000..9b960eee7511441bf85a824df2765582eceb3421 --- /dev/null +++ b/Experiments/3-MVStrategy.py @@ -0,0 +1,150 @@ +from IO_utils.clean_table import clean_table +from IO_utils.List_Reader import TableReader +from IO_utils.split_utils import split_data_cv +from IO_utils.FeaturePreprocessing import FeaturePreprocessing +from IO_utils.Dataloader import MyDataLoader +from _utils.Result_container import Result_container +from train import train_model +from train_graph import train_model_graph +from architectures.ML_algorithms import apply_LR, apply_mlp, apply_random_forest, apply_xgbBoost +# %% DATALOADIND +import torch +import os + +## Clean original table +excel_dir = "../../data/TheList_anonymous_mv.xlsx" +clean_df = clean_table(excel_dir=excel_dir, pre_mRS=2) + +# Given a clean table get features and labels +strategies = [ 'median', 'knn', 'mice'] +Result_c = Result_container(target_metrics=['auc', 'accuracy', 'balanced_accuracy', 'f1', 'cm'], + output=['FCN', 'LR', 'RF', 'MLP', 'XGB', 'Graph']) +#### From all variables evaluate different variable selection methods + +for s in strategies: + + table = TableReader(input_df=clean_df, tables=['all_timepoints'], data_dictionaries='timepoints', + mv_strategy=s, + output_feature=['dmRS']) + + output_vector = table.output_vector + meta_vector = table.meta_df + + fold_indices = split_data_cv(output_vector, seed=5, cv=5) + features = table.select_features(method='mrmr', k=10, fold_indices=fold_indices) + + feature_vector = table.final_df[features] + FP = FeaturePreprocessing(feature_vector, table.selected_d) + feature_vector = FP.create_features(feature_vector) + + config = {'lr': 0.001, + 'momentum': 0, + 'weight_decay': 0.001, + 'layers': {'number': 3, + 'layer1': 40, + 'layer2': 20 + + }, + 'dropout': 0, + 'classification': True, + 'out_classes': 2} + + metrics_FCN = {} + metrics_LR = {} + metrics_RF = {} + metrics_MLP = {} + metrics_XGB = {} + metrics_Graph = {} + + config_graph = { + 'Age': True, + 'beta_Age': 3, + 'Sex': False, + 'pre-mRS': True, + 'beta_mRS': 1, + 'NIHSS': False, + 'beta_NIHSS': 1 + + } + for f in range(5): + ("Training fold {}".format(f)) + + # GCN + print('-----------------Training GCN--------------------- ') + + dataloader_fold = MyDataLoader(feature_vector, output_vector, fold_indices[f], + table.selected_d, load_images=False,meta =meta_vector, one_hot=True) + dl_graph = dataloader_fold.build_graph(config_graph) + torch.manual_seed(0) + + result, _ = train_model_graph(config, loaders=dl_graph, indices=fold_indices[f]) + metrics_Graph['Fold {0}'.format(f)] = result['test_metric'] + + # FCN + print('-----------------Training LR--------------------- ') + + + dl = dataloader_fold.get_loaders() + torch.manual_seed(0) + + result, model = train_model(config, loaders=dl) + metrics_FCN['Fold {0}'.format(f)] = result['test_metric'] + # LR + print('-----------------Training LR--------------------- ') + _, result_LR = apply_LR(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_LR['Fold {0}'.format(f)] = result_LR[2] + print("AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_LR[0]['auc'], + result_LR[1]['auc'], + result_LR[2][ + 'auc'])) + print('-------------------------------------- ') + + # RF + print('-----------------Training RF--------------------- ') + + _, result_RF = apply_random_forest(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_RF['Fold {0}'.format(f)] = result_RF[2] + print("AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_RF[0]['auc'], + result_RF[1]['auc'], + result_RF[2][ + 'auc'])) + print('-------------------------------------- ') + # MLP + print('-----------------Training MLP--------------------- ') + + _, result_MLP = apply_mlp(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_MLP['Fold {0}'.format(f)] = result_MLP[2] + print( + "AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_MLP[0]['auc'], + result_MLP[1]['auc'], + result_MLP[2]['auc'])) + print('-------------------------------------- ') + # XGB Boost + print('-----------------Training XGB Boost--------------------- ') + + _, result_XGB = apply_xgbBoost(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_XGB['Fold {0}'.format(f)] = result_XGB[2] + print( + "AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_XGB[0]['auc'], + result_XGB[1]['auc'], + result_XGB[2]['auc'])) + print('-------------------------------------- ') + + Result_c.update('FCN', s, metrics_FCN) + Result_c.update('LR', s, metrics_LR) + Result_c.update('RF', s, metrics_RF) + Result_c.update('MLP', s, metrics_MLP) + Result_c.update('XGB', s, metrics_XGB) + + Result_c.update('Graph', s, metrics_Graph) + +output_dir = 'C:/Users/martinca1/PhD/Projects/AI_Stroke/out/results/preliminary results' +Result_c.save(output_dir=output_dir, name='MVStrategy_selection_test_mrmr10') diff --git a/Experiments/4-Timepoints.py b/Experiments/4-Timepoints.py new file mode 100644 index 0000000000000000000000000000000000000000..c6d15f96322d0c2c7c41a79d524d66c4d4262dff --- /dev/null +++ b/Experiments/4-Timepoints.py @@ -0,0 +1,164 @@ +from IO_utils.clean_table import clean_table +from IO_utils.List_Reader import TableReader +from IO_utils.split_utils import split_data_cv +from IO_utils.FeaturePreprocessing import FeaturePreprocessing +from IO_utils.Dataloader import MyDataLoader +from _utils.Result_container import Result_container +from train import train_model +from train_graph import train_model_graph +from architectures.ML_algorithms import apply_LR, apply_mlp, apply_random_forest, apply_xgbBoost +# %% DATALOADIND +import torch +import os + +## Clean original table +excel_dir = "../../data/TheList_anonymous_mv.xlsx" +clean_df = clean_table(excel_dir=excel_dir, pre_mRS=2) + +# Given a clean table get features and labels +tables_modalities = [ + #['NCCT', 'CTP', 'CTA'], + #['NCCT', 'CTP', 'CTA', 'Treatment', 'Treatment_out'], + #['NCCT', 'CTP', 'CTA', 'Treatment', 'Treatment_out', 'Control CT'], + + #['Metadata', 'NCCT', 'CTP', 'CTA'], + #['Metadata', 'NCCT', 'CTP', 'CTA', 'Treatment', 'Treatment_out'], + #['Metadata', 'NCCT', 'CTP', 'CTA', 'Treatment', 'Treatment_out', 'Control CT'], + + #['Metadata'], + #['Clinical'], + #['Metadata', 'Clinical'], + #['Treatment', 'Treatment_out', 'Metadata', 'Clinical']] + ['NCCT', 'CTP', 'CTA', 'Metadata', 'Clinical'], + ['NCCT', 'CTP', 'CTA', 'Treatment', 'Treatment_out', 'Metadata', 'Clinical'], + ['NCCT', 'CTP', 'CTA', 'Treatment', 'Treatment_out', 'Control CT', 'Metadata', 'Clinical']] + + +Result_c = Result_container(target_metrics=['auc', 'accuracy', 'balanced_accuracy', 'f1', 'cm'], + output=['FCN', 'LR', 'RF', 'MLP', 'XGB', 'Graph']) +#### From all variables evaluate different variable selection methods + +for t in tables_modalities: + + table = TableReader(input_df=clean_df, tables=t, data_dictionaries='modalities', + mv_strategy='median', + output_feature=['mortality']) + + output_vector = table.output_vector + meta_vector = table.meta_df + + fold_indices = split_data_cv(output_vector, seed=5, cv=5) + print(t) + features = table.select_features(method='mrmr', k=10, fold_indices=fold_indices) + + feature_vector = table.final_df[features] + FP = FeaturePreprocessing(feature_vector, table.selected_d) + feature_vector = FP.create_features(feature_vector) + + config = {'lr': 0.001, + 'momentum': 0, + 'weight_decay': 0.001, + 'layers': {'number': 3, + 'layer1': 40, + 'layer2': 20 + + }, + 'dropout': 0, + 'classification': True, + 'out_classes': 2} + + config_graph = { + 'Age': True, + 'beta_Age': 4, + 'Sex': True, + 'pre-mRS': False, + 'beta_mRS': 1, + 'NIHSS': False, + 'beta_NIHSS': 1 + + } + metrics_FCN = {} + metrics_LR = {} + metrics_RF = {} + metrics_MLP = {} + metrics_XGB = {} + metrics_Graph = {} + for f in range(5): + # GCN + + dataloader_fold = MyDataLoader(feature_vector, output_vector, fold_indices[f], + table.selected_d, load_images=False, meta=meta_vector, one_hot=True) + dl_graph = dataloader_fold.build_graph(config_graph) + torch.manual_seed(0) + + result, _ = train_model_graph(config, loaders=dl_graph, indices=fold_indices[f]) + metrics_Graph['Fold {0}'.format(f)] = result['test_metric'] + + + # FCN + ("Training FCN of fold {}".format(f)) + + dl = dataloader_fold.get_loaders() + torch.manual_seed(0) + + result, model = train_model(config, loaders=dl) + metrics_FCN['Fold {0}'.format(f)] = result['test_metric'] + + # LR + print('-----------------Training LR--------------------- ') + _, result_LR = apply_LR(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_LR['Fold {0}'.format(f)] = result_LR[2] + print("AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_LR[0]['auc'], + result_LR[1]['auc'], + result_LR[2][ + 'auc'])) + print('-------------------------------------- ') + + # RF + print('-----------------Training RF--------------------- ') + + _, result_RF = apply_random_forest(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_RF['Fold {0}'.format(f)] = result_RF[2] + print("AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_RF[0]['auc'], + result_RF[1]['auc'], + result_RF[2][ + 'auc'])) + print('-------------------------------------- ') + # MLP + print('-----------------Training MLP--------------------- ') + + _, result_MLP = apply_mlp(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_MLP['Fold {0}'.format(f)] = result_MLP[2] + print( + "AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_MLP[0]['auc'], + result_MLP[1]['auc'], + result_MLP[2]['auc'])) + print('-------------------------------------- ') + # XGB Boost + print('-----------------Training XGB Boost--------------------- ') + + _, result_XGB = apply_xgbBoost(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_XGB['Fold {0}'.format(f)] = result_XGB[2] + print( + "AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_XGB[0]['auc'], + result_XGB[1]['auc'], + result_XGB[2]['auc'])) + print('-------------------------------------- ') + + Result_c.update('FCN', '_'.join(t), metrics_FCN) + Result_c.update('LR', '_'.join(t), metrics_LR) + Result_c.update('RF', '_'.join(t), metrics_RF) + Result_c.update('MLP', '_'.join(t), metrics_MLP) + Result_c.update('XGB', '_'.join(t), metrics_XGB) + Result_c.update('Graph', '_'.join(t), metrics_Graph) + +output_dir = 'C:/Users/martinca1/PhD/Projects/AI_Stroke/out/results/SR_results' +Result_c.save(output_dir=output_dir, name='Timepoints_K10_mortality') \ No newline at end of file diff --git a/Experiments/5-Uncertainty.py b/Experiments/5-Uncertainty.py new file mode 100644 index 0000000000000000000000000000000000000000..5940be756ab10a5b6e4f11b8361a71aaa1b7a082 --- /dev/null +++ b/Experiments/5-Uncertainty.py @@ -0,0 +1,100 @@ +from IO_utils.clean_table import clean_table +from IO_utils.List_Reader import TableReader +from IO_utils.split_utils import split_data_cv +from IO_utils.FeaturePreprocessing import FeaturePreprocessing +from IO_utils.Dataloader import MyDataLoader + +from evaluate_model import test, get_metrics_unc, plot_selectedsamples_metrics, plot_uncetainties +from train import train_model +import torch +import os +import numpy as np +# %% DATALOADIND + +## Clean original table +excel_dir = "../../data/TheList_anonymous_mv.xlsx" +clean_df = clean_table(excel_dir=excel_dir, pre_mRS=2) + +# Given a clean table get features and labels +table = TableReader(input_df=clean_df, tables=['all_timepoints'], data_dictionaries='timepoints', mv_strategy='median', + output_feature=['dmRS']) + +output_vector = table.output_vector + +fold_indices = split_data_cv(output_vector, seed=5, cv=5) + +features = table.select_features(method='mrmr', k=10, fold_indices=fold_indices) + +feature_vector = table.final_df[features] +FP = FeaturePreprocessing(feature_vector, table.selected_d) +feature_vector = FP.create_features(feature_vector) + +config = {'lr': 0.01, + 'momentum': 0, + 'weight_decay': 0.001, + 'layers': {'number': 3, + 'layer1': 40, + 'layer2': 20, + + }, + 'dropout': 0, + 'classification': True, + 'out_classes': 2} + +######## +mean_preds = [] +combined = [] +epistemic = [] +cls = [] +results_pred= {} +results_epis= {} +for p in[1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2,0.1]: + results_pred[p]={} + results_epis[p] = {} + +for k in range(5): + dataloader_fold = MyDataLoader(feature_vector, output_vector, fold_indices[k], + table.selected_d, one_hot=True) + dl = dataloader_fold.get_loaders() + fold_name = "C:/Users/martinca1/PhD/Projects/AI_Stroke/out/models/FCNEnsemble" + + torch.manual_seed(0) + for i in range(10): + + if not os.path.exists(fold_name): + os.mkdir(fold_name) + + path_model = os.path.join(fold_name, "model_{}_fold_{}.pt".format(i, k)) + if os.path.isfile(path_model): + continue + else: + print("Training model {} of fold {}".format(i, k)) + _, model = train_model(config, loaders=dl) + torch.save(model, path_model) + + state_dict_paths = [os.path.join(fold_name, "model_{}_fold_{}.pt".format(i, k)) for i in range(10)] + + pred, unc, epistemic_unc, y = test(config, dl[2], state_dict_paths) + mean_preds.extend(pred.tolist()) + combined.extend(unc.tolist()) + epistemic.extend(epistemic_unc.tolist()) + cls.extend(y.tolist()) + +################### Test +p = np.array(mean_preds) +y = np.array(cls) +c = np.array(combined) +e = np.array(epistemic) + +# with pd.ExcelWriter("C:/Users/martinca1/PhD/Projects/AI_Stroke/out/uncertainty/predictive_uncertainty.xlsx") as writer: +for per in [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]: + results_pred[per] = get_metrics_unc(c, p, y, per) + +plot_selectedsamples_metrics(c, p, y, uncertainty='Predictive') +plot_selectedsamples_metrics(e, p, y, uncertainty='Epistemic') +plot_uncetainties(p, y, c, e) +import matplotlib.pyplot as plt +plt.show() + + + diff --git a/Experiments/6-Graph_constr.py b/Experiments/6-Graph_constr.py new file mode 100644 index 0000000000000000000000000000000000000000..a37da69da6140a5198c232eaee670978069ddc71 --- /dev/null +++ b/Experiments/6-Graph_constr.py @@ -0,0 +1,116 @@ +from IO_utils.clean_table import clean_table +from IO_utils.List_Reader import TableReader +from IO_utils.split_utils import split_data_cv +from IO_utils.FeaturePreprocessing import FeaturePreprocessing +from IO_utils.Dataloader import MyDataLoader +from _utils.Result_container import Result_container +from train_graph import train_model_graph +from sklearn.model_selection import ParameterGrid +from architectures.ML_algorithms import apply_LR, apply_mlp, apply_random_forest, apply_xgbBoost +# %% DATALOADIND +import torch +import os + +## Clean original table +excel_dir = "../../data/TheList_anonymous_mv.xlsx" +clean_df = clean_table(excel_dir=excel_dir, pre_mRS=2) + +# Given a clean table get features and labels +tables_modalities = ['all_timepoints'] +Result_c = Result_container(target_metrics=['auc', 'accuracy', 'balanced_accuracy', 'f1', 'cm'], + output=['Graph']) +#### From all variables evaluate different variable selection methods + + + +table = TableReader(input_df=clean_df, tables=['all_timepoints'], data_dictionaries='timepoints', + mv_strategy='median', + output_feature=['dmRS']) + +output_vector = table.output_vector +meta_vector = table.meta_df +fold_indices = split_data_cv(output_vector, seed=5, cv=5) +features = table.select_features(method='mrmr', k=10, fold_indices=fold_indices) +feature_vector = table.final_df[features] + +FP = FeaturePreprocessing(feature_vector, table.selected_d) +feature_vector = FP.create_features(feature_vector) + +config = {'lr': 0.001, + 'momentum': 0, + 'weight_decay': 0.001, + 'layers': {'number': 3, + 'layer1': 40, + 'layer2': 20 + + }, + 'dropout': 0, + 'classification': True, + 'out_classes': 2} +metrics = {} + + +grid_config_graph = [ + +{ + 'Age': [True], + 'beta_Age': [1, 2, 3, 4, 5], + 'Sex': [True, False], + 'pre-mRS': [True], + 'beta_mRS': [1, 2], + 'NIHSS': [False] +}, + +{ + 'Age': [True], + 'beta_Age': [1, 2, 3, 4, 5], + 'Sex': [True, False], + 'pre-mRS': [False], + 'NIHSS': [False]}, + +{ + 'Age': [False], + 'Sex': [True, False], + 'pre-mRS': [True], + 'beta_mRS': [1, 2], + 'NIHSS': [False]}, + { + 'Age': [False], + 'Sex': [True], + 'pre-mRS': [False], + 'NIHSS': [False]}, + +] + +"""grid_config_graph = { + 'Age': [True, False], + 'beta_Age': [1, 2, 3, 4, 5], + 'Sex': [True, False], + 'pre-mRS': [True, False], + 'beta_mRS': [1, 2], + 'NIHSS': [True, False], + 'beta_NIHSS': [1, 3, 5, 7, 10] +}""" +grid = ParameterGrid(grid_config_graph) + +for config_graph in grid: + print(config_graph) + for f in range(5): + # FCN + ("Training FCN of fold {}".format(f)) + dataloader_fold = MyDataLoader(feature_vector, output_vector, fold_indices[f], + table.selected_d,load_images=False, meta= meta_vector, one_hot=True) + + loader = dataloader_fold.build_graph(config_graph) + + torch.manual_seed(0) + result, model = train_model_graph(config, loaders=loader, indices=fold_indices[f]) + metrics['Fold {0}'.format(f)] = result['val_metrics'] + # LR + + print('-------------------------------------- ') + import json + Result_c.update('Graph', json.dumps(config_graph), metrics) + + output_dir = 'C:/Users/martinca1/PhD/Projects/AI_Stroke/out/results/preliminary results' + Result_c.save(output_dir=output_dir, name='Graph_grid_validation_new') diff --git a/Experiments/7-Graph_model.py b/Experiments/7-Graph_model.py new file mode 100644 index 0000000000000000000000000000000000000000..a3fa88cfb31e18960ca6e6105d899fac14c88e81 --- /dev/null +++ b/Experiments/7-Graph_model.py @@ -0,0 +1,82 @@ +from IO_utils.clean_table import clean_table +from IO_utils.List_Reader import TableReader +from IO_utils.split_utils import split_data_cv +from IO_utils.FeaturePreprocessing import FeaturePreprocessing +from IO_utils.Dataloader import MyDataLoader +from _utils.Result_container import Result_container +from train_graph import train_model_graph +from sklearn.model_selection import ParameterGrid +from architectures.ML_algorithms import apply_LR, apply_mlp, apply_random_forest, apply_xgbBoost +# %% DATALOADIND +import torch +import os + +## Clean original table +excel_dir = "../../data/TheList_anonymous_mv.xlsx" +clean_df = clean_table(excel_dir=excel_dir, pre_mRS=2) + +# Given a clean table get features and labels +tables_modalities = ['all_timepoints'] +Result_c = Result_container(target_metrics=['auc', 'accuracy', 'balanced_accuracy', 'f1', 'cm'], + output=['Graph']) +#### From all variables evaluate different variable selection methods + + +output_feature = 'dmRS' +table = TableReader(input_df=clean_df, tables=['all_timepoints'], data_dictionaries='timepoints', + mv_strategy='median', + output_feature=[output_feature]) + +output_vector = table.output_vector +meta_vector = table.meta_df +fold_indices = split_data_cv(output_vector, seed=5, cv=5) +features = table.select_features(method='mrmr', k=10, fold_indices=fold_indices) +feature_vector = table.final_df[features] + +FP = FeaturePreprocessing(feature_vector, table.selected_d) +feature_vector = FP.create_features(feature_vector) + +config = {'lr': 0.001, + 'momentum': 0, + 'weight_decay': 0.001, + 'layers': {'number': 3, + 'layer1': 40, + 'layer2': 20 + + }, + 'dropout': 0, + 'classification': True, + 'out_classes': 2} +metrics = {} + +config_graph = { + 'Age': True, + 'beta_Age': 3, + 'Sex': False, + 'pre-mRS': True, + 'beta_mRS': 1, + 'NIHSS': False, + 'beta_NIHSS': 1 + + } + + +for f in range(5): + # FCN + ("Training FCN of fold {}".format(f)) + dataloader_fold = MyDataLoader(feature_vector, output_vector, fold_indices[f], + table.selected_d,load_images=False, meta= meta_vector, + one_hot=True) + + loader = dataloader_fold.build_graph(config_graph) + + torch.manual_seed(0) + result, model = train_model_graph(config, loaders=loader, indices=fold_indices[f]) + metrics['Fold {0}'.format(f)] = result['test_metric'] + # LR + + print('-------------------------------------- ') +Result_c.update('Graph', 'Results', metrics) + +output_dir = 'C:/Users/martinca1/PhD/Projects/AI_Stroke/out/results/SR_results' +Result_c.save(output_dir=output_dir, name='Graph_{}'.format(output_feature)) \ No newline at end of file diff --git a/Experiments/8-Uncertainty_graph.py b/Experiments/8-Uncertainty_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..4635eca6fe8440473c84ed6404b440494e4b49b4 --- /dev/null +++ b/Experiments/8-Uncertainty_graph.py @@ -0,0 +1,145 @@ +from IO_utils.clean_table import clean_table +from IO_utils.List_Reader import TableReader +from IO_utils.split_utils import split_data_cv +from IO_utils.FeaturePreprocessing import FeaturePreprocessing +from IO_utils.Dataloader import MyDataLoader + +from evaluate_model import test_graph, get_metrics_unc, plot_selectedsamples_metrics, plot_uncetainties, \ + plot_age_uncertainty, plot_boxplots +from train_graph import train_model_graph +import torch +import os +import numpy as np +import matplotlib.pyplot as plt +from sklearn.model_selection import ParameterGrid + +# %% DATALOADIND + +## Clean original table +excel_dir = "../../data/TheList_anonymous_mv.xlsx" +clean_df = clean_table(excel_dir=excel_dir, pre_mRS=2) + +# Given a clean table get features and labels +table = TableReader(input_df=clean_df, tables=['all_timepoints'], data_dictionaries='timepoints', mv_strategy='median', + output_feature=['mortality']) + +output_vector = table.output_vector + +fold_indices = split_data_cv(output_vector, seed=5, cv=5) + +features = table.select_features(method='mrmr', k=10, fold_indices=fold_indices) + +feature_vector = table.final_df[features] +metadata_vector = table.meta_df +FP = FeaturePreprocessing(feature_vector, table.selected_d) +feature_vector = FP.create_features(feature_vector) + +"""config = {'lr': 0.001, + 'momentum': 0, + 'weight_decay': 0.001, + 'layers': {'number': 5, + 'layer1': 96, + 'layer2': 32, + 'layer3': 8, + 'layer4': 4 + + }, + 'dropout': 0.2, + 'classification': True, + 'out_classes': 2}""" +config = {'lr': 0.001, + 'momentum': 0, + 'weight_decay': 0.001, + 'layers': {'number': 3, + 'layer1': 40, + 'layer2': 20 + + }, + 'dropout': 0, + 'classification': True, + 'out_classes': 2} +######## +mean_preds = [] +combined = [] +epistemic = [] +cls = [] +ages = [] +results_pred = {} +results_epis = {} +for p in [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]: + results_pred[p] = {} + results_epis[p] = {} + +for k in range(5): + + dataloader_fold = MyDataLoader(feature_vector, output_vector, fold_indices[k], + table.selected_d, load_images=False, meta=metadata_vector, + one_hot=True) + ages_fold = dataloader_fold.meta['Age'].values[fold_indices[k][2]] + + """config_graph = { + 'Age': True, + 'beta_Age': 3, + 'Sex': False, + 'pre-mRS': True, + 'beta_mRS': 1, + 'NIHSS': False, + 'beta_NIHSS': 3 + + }""" + config_graph = { + 'Age': True, + 'beta_Age': 4, + 'Sex': True, + 'pre-mRS': False, + 'beta_mRS': 1, + 'NIHSS': False, + 'beta_NIHSS': 3 + + } + fold_name = "C:/Users/martinca1/PhD/Projects/AI_Stroke/out/models/EdgeDropout3_mortality" + loader = dataloader_fold.build_graph(config_graph=config_graph) + torch.manual_seed(0) + for i in range(10): + + if not os.path.exists(fold_name): + os.mkdir(fold_name) + + path_model = os.path.join(fold_name, "model_{}_fold_{}.pt".format(i, k)) + if os.path.isfile(path_model): + continue + else: + print("Training model {} of fold {}".format(i, k)) + _, model = train_model_graph(config, loaders=loader, indices=fold_indices[k]) + torch.save(model, path_model) + + state_dict_paths = [os.path.join(fold_name, "model_{}_fold_{}.pt".format(i, k)) for i in range(10)] + + pred, unc, epistemic_unc, y = test_graph(config, loader, fold_indices[k][2], state_dict_paths) + + ages.extend(ages_fold.tolist()) + mean_preds.extend(pred.tolist()) + combined.extend(unc.tolist()) + epistemic.extend(epistemic_unc.tolist()) + cls.extend(y.tolist()) + +################### Test +p = np.array(mean_preds) +y = np.array(cls) +c = np.array(combined) +e = np.array(epistemic) +a = np.array(ages) + +# with pd.ExcelWriter("C:/Users/martinca1/PhD/Projects/AI_Stroke/out/uncertainty/predictive_uncertainty.xlsx") as writer: +for th in [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1,0]: + results_pred[th] = get_metrics_unc(c, p, y, th) + + + +plot_boxplots(p,y,uncertainty=c) +#plot_selectedsamples_metrics(c, p, y, uncertainty='Predictive') +#plot_selectedsamples_metrics(e, p, y, uncertainty='Epistemic') +#plot_uncetainties(p, y, c, e) +#plot_age_uncertainty(p, y, c, a) + +plt.show() diff --git a/Experiments/9-Images.py b/Experiments/9-Images.py new file mode 100644 index 0000000000000000000000000000000000000000..677d4a6f88dd2ccea4f016eb50972d14fb449573 --- /dev/null +++ b/Experiments/9-Images.py @@ -0,0 +1,55 @@ +from IO_utils.clean_table import clean_table +from IO_utils.List_Reader import TableReader +from IO_utils.split_utils import split_data_cv +from IO_utils.FeaturePreprocessing import FeaturePreprocessing +from IO_utils.Dataloader import MyDataLoader +from _utils.Result_container import Result_container +from train import train_model +from train_graph import train_model_graph +from architectures.ML_algorithms import apply_LR, apply_mlp, apply_random_forest, apply_xgbBoost +# %% DATALOADIND +import torch +import os + +## Clean original table +excel_dir = "../../data/TheList_anonymous_mv.xlsx" +clean_df = clean_table(excel_dir=excel_dir, pre_mRS=6) + +# Given a clean table get features and labels +table = TableReader(input_df=clean_df, tables=['all_timepoints'], data_dictionaries='timepoints', mv_strategy='median', + output_feature=['mortality']) + +output_vector = table.output_vector +meta_vector = table.meta_df +ids_vector = table.patient_ids + + +fold_indices = split_data_cv(output_vector, seed=5, cv=5) +## Load the images given by fold indices + +Result_c = Result_container(target_metrics=['auc', 'accuracy', 'balanced_accuracy', 'f1', 'cm'], + output=['3DCNN']) + +#### From all variables evaluate different variable selection methods + +for f in range(5): + # FCN + print("Training FCN of fold {}".format(f)) + """ + #dataloader_fold = MyDataLoader(feature_vector, output_vector, fold_indices[f], + # table.selected_d,meta_vector, one_hot=True) + + dl = dataloader_fold.get_loaders() + dl_graph = dataloader_fold.build_graph(config_graph) + torch.manual_seed(0) + + + result, _ = train_model_graph(config, loaders=dl_graph, indices=fold_indices[f]) + print('-------------------------------------- ') + + + Result_c.update('Graph', method + '_' + str(k), metrics_Graph) +""" + +output_dir = 'C:/Users/martinca1/PhD/Projects/AI_Stroke/out/results/preliminary results' +Result_c.save(output_dir=output_dir, name='Variable_selection_Graph_mort_all') diff --git a/Experiments/_t_test.py b/Experiments/_t_test.py new file mode 100644 index 0000000000000000000000000000000000000000..90bf206aca5bee2210df7e5dfc1a0fd707ac54c0 --- /dev/null +++ b/Experiments/_t_test.py @@ -0,0 +1,62 @@ +import scipy.stats +import numpy as np + + + + +def compute_t_value(mean_base, sd_base, means, sds, samples ): + for i in range(len(means)): + mean_2 = means[i] + sd_2 = sds[i] + t1 = (mean_base - mean_2) + t2 = (np.power(sd_base, 2) + np.power(sd_2, 2)) * (1 / samples) + t2 = np.sqrt(t2) + + t =- t1 / t2 + + p = 2 * min(scipy.stats.t.cdf(t, samples), scipy.stats.t.cdf(-t, samples)) + print('Paired t-test of {}+-{} against {}+-{} : p_value: {}'.format(mean_base, sd_base, + mean_2, sd_2, + p)) + + + + +samples = [160, 220, 274, 249] + + +######### Calcaneus +mean_base = 8.46 +means = [9.12, 8.69] +sd_base = 0.63 +sds = [1.03, 1.23] + +print('Calceaneus ') +compute_t_value(mean_base, sd_base, means, sds, samples[0]) + +############## Ankle +mean_base = 6.32 +means = [9.02, 5.86] +sd_base = 0.25 +sds = [1.59, 0.40] + +print('Ankle') +compute_t_value(mean_base, sd_base, means, sds, samples[1]) + +############ Knee +mean_base = 6.8 +means = [7.79, 6.49] +sd_base = 0.55 +sds = [0.53, 1.05] + +print('Knee') +compute_t_value(mean_base, sd_base, means, sds, samples[2]) + +########### Wrist +mean_base = 7.85 +means = [9.59, 9.93] +sd_base = 0.94 +sds = [2.15, 1.41] + +print('Wrist') +compute_t_value(mean_base, sd_base, means, sds, samples[3]) \ No newline at end of file diff --git a/IO_utils/Dataloader.py b/IO_utils/Dataloader.py new file mode 100644 index 0000000000000000000000000000000000000000..18339b7faeab4bd3d29616f6207b60ecebc3c191 --- /dev/null +++ b/IO_utils/Dataloader.py @@ -0,0 +1,63 @@ +from torch.utils.data import DataLoader +from torch_geometric.loader import DataLoader as graph_Dataloader +from IO_utils.Datasets import MyDataset, Graph_Dataset + + +class MyDataLoader: + + def __init__(self, feature, output, fold_indices, selected_d, load_images=False, patient_ids=None, + meta=None, batch_size=None, one_hot=False): + train_indices, val_indices, test_indices = fold_indices + + self.features = feature + self.output = output + self.meta = meta + self.one_hot = one_hot + self.patient_ids = patient_ids + self.load_images = load_images + + self.batch_size = batch_size if batch_size is not None else int(0.8 * feature.shape[0]) + + train_dataset = MyDataset(feature, output, train_indices, + data_dictionaries=selected_d, one_hot=one_hot) + + self.train_loader = DataLoader(dataset=train_dataset, + batch_size=self.batch_size, + shuffle=True, + num_workers=0) + + val_dataset = MyDataset(feature, output, val_indices, + data_dictionaries=selected_d, one_hot=one_hot) + + self.val_loader = DataLoader(dataset=val_dataset, + batch_size=self.batch_size, + num_workers=0) + + test_dataset = MyDataset(feature, output, test_indices, + data_dictionaries=selected_d, one_hot=one_hot) + + self.test_loader = DataLoader(dataset=test_dataset, + batch_size=self.batch_size, + num_workers=0) + + self.train_features, self.train_output = feature[train_indices, :], output[train_indices].squeeze() + self.test_features, self.test_outout = feature[test_indices, :], output[test_indices].squeeze() + self.val_features, self.val_outout = feature[val_indices, :], output[val_indices].squeeze() + + def get_loaders(self): + return self.train_loader, self.val_loader, self.test_loader + + def build_graph(self, config_graph): + # print(df_feature) + train_dataset = Graph_Dataset(self.meta, + self.features, self.output, config_graph, self.one_hot) + + train_loader = graph_Dataloader(dataset=train_dataset.get(), + batch_size=self.batch_size, + num_workers=0) + + return train_loader + + + + diff --git a/IO_utils/Datasets.py b/IO_utils/Datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..29ac784292687f45b572fdc30a401ad4b4851dd1 --- /dev/null +++ b/IO_utils/Datasets.py @@ -0,0 +1,140 @@ +import torch +import numpy as np +from torch_geometric.data import Data +from scipy.spatial.distance import correlation + +class MyDataset: + + def __init__(self, features, labels, indices, data_dictionaries, one_hot=False, + images= False, patient_ids=None): + + self.features = features[indices, :] + self.labels = labels[indices] + self.patient_indices = patient_ids[indices, 0] if patient_ids is not None else None + self.data_dictionaries = data_dictionaries + self.one_hot = one_hot + self.images = images + self.patients_ids = patient_ids + + self.volumes = {} + if self.images: + + self.load_images() + + def load_images (self): + #Load images + for i in range(len(self.patient_indices)): + #Load volume + self.volumes[i]= {} + for modality in ['NCCT', 'CTP']: + #Load modality + self.volumes[i][modality]=None + + + def __len__(self): + return self.features.shape[0] + + def __getitem__(self, i): + # Get features + features = self.features[i, :] + x = torch.from_numpy(features.squeeze()).float() + if self.one_hot: + label = np.zeros((np.max(self.labels + 1))) + label[self.labels[i]] = 1 + y = torch.from_numpy(label) + else: + y = torch.from_numpy(np.array(self.labels[i])).float() + data = {'x': x, + 'y': y + } + + if self.images: + for m in ['NCCT', 'CTP']: + data[m]= self.volumes[i][m] + + + + return data + +class Graph_Dataset: + + def __init__(self, meta, features, labels, config_graph, one_hot): + self.meta = meta + self.features = features + self.labels = labels + + x = torch.tensor(features, dtype=torch.float) + + if one_hot: + labels_onehot = np.zeros((labels.shape[0], len(np.unique(labels)))) + for i in range(labels.shape[0]): + labels_onehot[i, labels[i]] = 1 + y = torch.tensor(labels_onehot, dtype=torch.float) + else: + y = torch.tensor(labels, dtype=torch.float) + + self.edge_index, self.weights = self.create_graph(config_graph) + + self.data = Data(x=x, y=y, edge_index=self.edge_index, weights=self.weights, labels=labels) + print("Number of nodes ", self.data.num_nodes) + print("Number of features ", self.data.num_node_features) + print("Number of edges ", self.data.num_edges) + print("Graph contain isolated nodes ", self.data.contains_isolated_nodes()) + # self.plot_graph() + + def len(self): + return self.meta.shape[0] + + def get(self): + + return [self.data] + + def create_graph(self, config_graph): + + v1 = [] + v2 = [] + + nodes = self.meta.shape[0] + weights = [] + + for i in range(nodes): + for ii in range(nodes): + if i != ii: + condition_Age = np.abs(self.meta['Age'].values[i] - self.meta['Age'].values[ii]) < \ + config_graph['beta_Age'] if config_graph['Age'] else True + condition_Sex = self.meta['Sex'].values[i] == self.meta['Sex'].values[ii] \ + if config_graph['Sex'] else True + condition_mRS = (self.meta['pre-mRS'].values[i] - self.meta['pre-mRS'].values[ii]) < \ + config_graph['beta_mRS'] if config_graph['pre-mRS'] else True + condition_NIHSS = (self.meta['NIHSS'].values[i] - self.meta['NIHSS'].values[ii]) < \ + config_graph['beta_NIHSS'] if config_graph['NIHSS'] else True + + if condition_Age and condition_Sex and condition_mRS and condition_NIHSS: + v1.append(i) + v2.append(ii) + dist = correlation(self.features[i, :], self.features[ii, :]) + if np.isnan(dist): + dist = 1.0 + weights.extend([dist]) + for i in range(nodes): + if i not in v1: + #print('Node {0} not connected'.format(i)) + #print('Age {0}, Sex {1}, pre-mRS {2} and NIHSS {3} '.format(self.meta['Age'].values[i], + # self.meta['Sex'].values[i], + # self.meta['pre-mRS'].values[i], + # self.meta['NIHSS'].values[i])) + # + corr = [correlation(self.meta.iloc[i, :], self.meta.iloc[ii, :]) for ii in range(nodes)] + sorted_corr = np.argsort(corr) + for j in range(1): + v1.append(i) + v2.append(sorted_corr[j+1]) + weights.extend([corr[sorted_corr[j+1]]]) + v2.append(i) + v1.append(sorted_corr[j+1]) + weights.extend([corr[sorted_corr[j+1]]]) + + edge_index = torch.tensor([v1, v2], dtype=torch.long) + weight_vector = torch.tensor(weights, dtype=torch.float) + + return edge_index, weight_vector diff --git a/IO_utils/FeaturePreprocessing.py b/IO_utils/FeaturePreprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..d31fbbd509a46e31c4be2c4df8f161dc8e9ba42a --- /dev/null +++ b/IO_utils/FeaturePreprocessing.py @@ -0,0 +1,109 @@ +import copy +import numpy as np + +class FeaturePreprocessing: + """ + Normalize numerical values and transform to one-hot vector encoding categorical values + :param df: Input data frame + :param data_dictionaries: dictionaries with information about the features + :param exclude: columns that have to be excluded from the feature vector + :return: + """ + + def __init__(self, df, data_dictionaries, exclude=[]): + + exclude_vector = copy.copy(exclude) + exclude_vector.extend(['Id']) + feature_vector = [] + + self.features = [i for i in df.columns if i not in exclude_vector] + self.min = [df[column].min() for column in df.columns if column != 'Id'] + self.max = [df[column].max() for column in df.columns if column != 'Id'] + self.data_dictionaries = data_dictionaries + # print('### Reading table(s)') + # print('### {} Features: {}'.format(len(features), features)) + + def create_features(self, df): + # Iterate over columns of dataframe and process them according to its data + feature_vector = [] + self.feature_vector_names = [] + for count, column in enumerate(self.features): + + d = self.data_dictionaries[column] + if d['type'] in ['cat', 'ord']: + + ordered = True if d['type'] == 'ord' else False + output = cat_to_one_hot(df[column], d, ordered=ordered) + for k in list(d['description'].values()): + self.feature_vector_names.extend([column + '#' + k]) + elif d['type'] in ['int', 'float']: + + output = df[column] * 0 if self.max[count] == self.min[count] else \ + (df[column] - self.min[count]) / (self.max[count] - self.min[count]) + output = np.reshape(output.to_numpy(), (output.shape[0], 1)) + + self.feature_vector_names.extend([column]) + + else: + raise ValueError('Column {} not included in the features'.format(column)) + + # print(feature_vector_names) + feature_vector.extend([output]) + + final_feature_vector = np.concatenate(feature_vector, axis=1) + + # Saved all the features to an excel table + # features_df = pd.DataFrame(data=final_feature_vector, index = df['Id'], columns=feature_vector_names) + # new_dir = '../out/test/Features.xlsx' + # features_df.to_excel(new_dir, columns=feature_vector_names, index=True) + return final_feature_vector + + def get_feature_names(self): + return self.feature_vector_names + +def cat_to_one_hot(features, dictionary, ordered=False): + """ + Method that transform categorical variables into one-hot vector + + :param features: Feature vector size n x 1 + :param dictionary: Dictionary with fields 'info', 'categories', and 'description' + :return: one-hot feature vector of size nxc where m are the number of classes + """ + feature_vector = copy.copy(features) + # Number of patients + patients = feature_vector.shape[0] + # Number of Categories + categories = int(dictionary['categories']) + assert isinstance(categories, int), 'Categories in data dictionary should be integer' + # Description of the categories + description = dictionary['description'] + + # Create the feature vector size patients x categories + one_hot_vector = np.zeros((patients, categories)) + + # Normal case where the feature categories are given + if description != 'None': + + # Check that the number of categories matches + assert categories == len( + description.keys()), '{}: categories and its description does not match in data dictionary' + + # If the keys given in the description are not [0, 1, ..., n] replace the values in the feature vector + expected_keys = list(range(categories)) + k = list(map(int, list(description.keys()))) + if expected_keys != k: + for i in range(categories): + feature_vector = feature_vector.replace(k[i], expected_keys[i]) + + # Get one hot vector for each patient + for count, i in enumerate(feature_vector.index): + try: + if ordered: + one_hot_vector[count, :int(feature_vector[i])] = 1 + else: + one_hot_vector[count, int(feature_vector[i])] = 1 + + except ValueError: + print('{} cannot be converted to int'.format(feature_vector[i])) + + return one_hot_vector \ No newline at end of file diff --git a/IO_utils/List_Reader.py b/IO_utils/List_Reader.py new file mode 100644 index 0000000000000000000000000000000000000000..f77e6cc5f9616121671f70521ccf9ad3e5d31d16 --- /dev/null +++ b/IO_utils/List_Reader.py @@ -0,0 +1,125 @@ +import copy +import yaml +import os + +from IO_utils.List_reader_utils import cross_check_dictionary, remove_features, treat_missing_values, \ + get_all_dict_types, save_df +from IO_utils.mv_strategies import select_features_pvalue, select_features_RF, select_features_RFE, \ + combined_selection, select_features_MRMR +from IO_utils.statistics_utils import get_pvalue, compute_basic_statistics + + +class TableReader(object): + + def __init__(self, input_df, tables, data_dictionaries='timepoints', mv_strategy='median', + output_feature=None): + + + ROOT_DIR = "C:/Users/martinca1/PhD/Projects/AI_Stroke/AIS_Regress" + dir_data_dicts = "dictionaries/dictionary_modalities.yml" if data_dictionaries == 'modalities' else \ + "dictionaries/dictionary_timepoints.yml" + + dir_data_dicts = os.path.join(ROOT_DIR, dir_data_dicts) + self.data_dicts = yaml.load(open(dir_data_dicts), Loader=yaml.Loader) + self.output_feature = copy.copy(output_feature) + + # Check that the output feature is one of the possible options + self.output_features = ['mRS90d', 'shift_mRS', 'dmRS', 'mortality'] + assert self.output_feature[0] in self.output_features, 'Output feature should be one of: ' \ + 'mRS90d, shift_mRS, dmRS, mortality and is' \ + ' {} '.format(self.output_feature[0]) + + if tables[0] == 'all_timepoints': + self.tables = ['Admission', 'Pre-EVT', 'Post-EVT', 'After24h'] + + else: + self.tables = tables + assert len(tables) == len(set(tables)), 'Tables list contains repeated element' + + # --------------------------------- 1- Select the tables ---------------------------------------------- + # Check that all elements from the selected tables are in the data frame and retrieve the corresponding + # data dictionaries and the indices from all the tables including the output indices + # All the possible outputs are included + self.selected_d, all_keys = cross_check_dictionary(input_df, self.data_dicts, self.tables, + self.output_features) + + # Select columns given by tables and and reorder dataframe to match data dictionaries order + self.reordered_df = input_df.reindex(columns=all_keys) + + # ----------------------- 2- Remove features with more than 10% missing values----------------------- + clean_df, missing_values, clean_keys = remove_features(self.reordered_df, p=0.1, exclude=self.output_features) + + # Remove data dictionaries of removed features + keys = [i for c, i in enumerate(all_keys) if not clean_keys[c]] + for k in keys: + self.selected_d.pop(k) + + # ----------------------------- 3 Handle missing values ------------------------------------------------ + # Apply the missing value strategy + self.final_df = treat_missing_values(clean_df, method=mv_strategy) + self.meta_df = input_df.loc[self.final_df.index, ['Age', 'Sex', 'pre-mRS', 'NIHSS']] + + self.patient_ids = input_df.loc[self.final_df.index, ['Id']] + # ---------------------------- 5 Set output -------------------------------------- + # This is used for fold split + self.output_vector = self.final_df[self.output_feature[0]].to_numpy(dtype=int).squeeze() + + + def select_features(self, method='all', k=10, fold_indices=None): + + # ----------------------------- Remove output features ------------------------------------------------ + features_df = copy.copy(self.final_df) + features_df = features_df.drop(columns=self.output_features) + output_vector = copy.copy(self.final_df[self.output_feature[0]]) + + if method == 'all': + selected_features = features_df.columns.tolist() + + elif method == 'p_value': + selected_features = select_features_pvalue(features_df, self.selected_d, output_vector, fold_indices, + k=k) + elif method == 'random_forest': + selected_features = select_features_RF(features_df, self.selected_d, output_vector, fold_indices, k=k) + + elif method == 'rfe': + selected_features = select_features_RFE(features_df, self.selected_d, output_vector, fold_indices, k=k) + + elif method == 'combined': + selected_features = combined_selection(features_df, self.selected_d, output_vector, fold_indices, k=k) + + elif method == 'mrmr': + selected_features = select_features_MRMR(features_df, self.selected_d, output_vector, fold_indices, k=k) + else: + raise ValueError("Not implemented") + + return selected_features + + def get_statistics(self, output_dir='../out/data_exploration/statistics', output_name='all_statistics.xlsx'): + + # Reorder dataframe according to data dictionary + reordered_df = self.reordered_df.drop(columns=['Id']) + types = [] + all_df, all_missing_values, clean_keys = remove_features(reordered_df, p=1, exclude=[]) + + tables = ['Admission', 'Pre-EVT', 'Post-EVT', 'After24h'] + for t in tables: + types.extend(get_all_dict_types(self.data_dicts[t], types=[])) + + # Compute statistics of the selected tables + statistics = compute_basic_statistics(all_df) + p_dmRS, methods = get_pvalue(all_df, all_df['dmRS'], types) + p_mortality, _ = get_pvalue(all_df, all_df['mortality'], types) + # p_shiftmRS, _ = get_pvalue(all_df, input_df['shift_mRS'], types) + + statistics['p_dmRS'] = p_dmRS + statistics['p_mortality'] = p_mortality + # statistics['shift_mRS'] = p_shiftmRS + statistics['method'] = methods + + statistics['missing_values'] = all_missing_values + statistics['Percentage (%)'] = (statistics['missing_values'] * 100. / all_df.shape[0]).to_list() + statistics['types'] = types + + save_df(statistics, output_dir=output_dir, name=output_name, overwrite=True) + + return statistics diff --git a/IO_utils/List_reader_utils.py b/IO_utils/List_reader_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..57599c412af9a7e3a4933023887c125449468e64 --- /dev/null +++ b/IO_utils/List_reader_utils.py @@ -0,0 +1,108 @@ +import numpy as np +import os +import pandas as pd +import sklearn +from sklearn.experimental import enable_iterative_imputer +from sklearn.impute import KNNImputer, IterativeImputer +def cross_check_dictionary(input_df, data_dictionary, tables, output): + d = {'Id': {}} + all_indices = ['Id'] + for t in tables: + indices = list(data_dictionary[t].keys()) + for i in indices: + d[i] = data_dictionary[t][i] + all_indices.extend(indices) + + # Check that all the features from the data dictionary are in the table + columns_df = input_df.columns + for index in all_indices: + if index not in columns_df and index != 'Id': + raise ValueError('Feature {} from data dictionary is not in data frame '.format(index)) + + for o in output: + if o not in all_indices: + all_indices.extend([o]) + + return d, all_indices + +def treat_missing_values(df, method): + if method == 'drop': + + final_df = df.dropna() + elif method == 'median': + indices = df.loc[df['mRS90d'].isnull()].index + final_df = df.drop(indices) + final_df = final_df.fillna(df.median()) + + elif method == 'knn': + imputer =KNNImputer(n_neighbors=5, weights="uniform") + indices = df.loc[df['mRS90d'].isnull()].index + final_df = df.drop(indices) + imputer.fit(final_df) + final_df[final_df.columns] = imputer.transform(final_df.to_numpy()) + + elif method == 'mice': + imputer = IterativeImputer(random_state=0, + estimator=sklearn.tree.DecisionTreeRegressor(max_features='sqrt', + random_state=0)) + indices = df.loc[df['mRS90d'].isnull()].index + final_df = df.drop(indices) + imputer.fit(final_df) + final_df[final_df.columns] = imputer.transform(final_df.to_numpy()) + else: + raise ValueError("{} mv strategy not implemented ") + + return final_df + +def remove_features(df, p, exclude): + """ + Remove columns from a dataframe with more than p*100% missing values + :param df: Pandas dataframe + :param p: Percentage of missing values + :param exclude: Columns to exclude + :return: Clean dataframe, number of missing values of the original df and the indices of the selected columns + """ + + # print('### Removing features with more than {}% of missing values'.format(p * 100)) + patients = df.shape[0] + + # Get indices of features with more than 10% of missing values + missing_values = np.array(df.isnull().sum(axis=0)) + selected_features_idx = (missing_values <= (p * patients)) + # Get indices of features to exclude + idx_exclude = [df.columns.get_loc(i) for i in exclude] + selected_features_idx[idx_exclude] = True + # Select the features from the original df + final_df = df[df.columns[selected_features_idx]] + + # Print information about the removed features + removed_columns = df.columns[[not i for i in selected_features_idx]] + removed_number = missing_values[[not i for i in selected_features_idx]] + for i in range(removed_columns.shape[0]): + print("### {} - Feature {} removed with {} missing ({:.2f} %)".format(i, removed_columns[i], + removed_number[i], + (removed_number[i] / patients) * 100)) + return final_df, missing_values, selected_features_idx + + +def get_all_dict_types(nested_dictionary, types=[]): + for key, value in nested_dictionary.items(): + + if type(value) is dict: + get_all_dict_types(value, types=types) + if key == 'type': + types.extend([value]) + else: + continue + + return types + +def save_df(df, output_dir, name='output.xlsx', overwrite=False, file_format='Excel'): + file_path = os.path.join(output_dir, name) + if not overwrite: + assert os.path.isfile(file_path), 'File already exists' + + if file_format == 'Excel': + with pd.ExcelWriter(file_path) as writer: + df.to_excel(writer, sheet_name='Sheet1') + diff --git a/IO_utils/__pycache__/Dataloader.cpython-37.pyc b/IO_utils/__pycache__/Dataloader.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e09a70de9bc92e145ca06daa4644ac277fbf28f Binary files /dev/null and b/IO_utils/__pycache__/Dataloader.cpython-37.pyc differ diff --git a/IO_utils/__pycache__/Datasets.cpython-37.pyc b/IO_utils/__pycache__/Datasets.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..00eb6a5eee63c0a4b4d6db939d63add7517a2947 Binary files /dev/null and b/IO_utils/__pycache__/Datasets.cpython-37.pyc differ diff --git a/IO_utils/__pycache__/FeaturePreprocessing.cpython-37.pyc b/IO_utils/__pycache__/FeaturePreprocessing.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..89e3f62800b5a6305c10bb2c37cf7c1b8079f7b8 Binary files /dev/null and b/IO_utils/__pycache__/FeaturePreprocessing.cpython-37.pyc differ diff --git a/IO_utils/__pycache__/List_Reader.cpython-37.pyc b/IO_utils/__pycache__/List_Reader.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..163df555b090f376793b1f44a6ed072895a93f63 Binary files /dev/null and b/IO_utils/__pycache__/List_Reader.cpython-37.pyc differ diff --git a/IO_utils/__pycache__/List_reader_utils.cpython-37.pyc b/IO_utils/__pycache__/List_reader_utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e354aae7e8356f0dd5e32343905b3449e000d779 Binary files /dev/null and b/IO_utils/__pycache__/List_reader_utils.cpython-37.pyc differ diff --git a/IO_utils/__pycache__/clean_table.cpython-37.pyc b/IO_utils/__pycache__/clean_table.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a0b33d4af8f0f7e35599a5f2b3e483631a1cb92 Binary files /dev/null and b/IO_utils/__pycache__/clean_table.cpython-37.pyc differ diff --git a/IO_utils/__pycache__/mv_strategies.cpython-37.pyc b/IO_utils/__pycache__/mv_strategies.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c082ebb7d9f04c45bba16f774987e8536198cb6f Binary files /dev/null and b/IO_utils/__pycache__/mv_strategies.cpython-37.pyc differ diff --git a/IO_utils/__pycache__/split_utils.cpython-37.pyc b/IO_utils/__pycache__/split_utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a492db74b2ce1f97daca711e86495299dd7ca9f Binary files /dev/null and b/IO_utils/__pycache__/split_utils.cpython-37.pyc differ diff --git a/IO_utils/__pycache__/statistics_utils.cpython-37.pyc b/IO_utils/__pycache__/statistics_utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c4e069e116665318b8fc45e73187d04699eba9aa Binary files /dev/null and b/IO_utils/__pycache__/statistics_utils.cpython-37.pyc differ diff --git a/IO_utils/clean_table.py b/IO_utils/clean_table.py new file mode 100644 index 0000000000000000000000000000000000000000..cf39a3040a8e82d06d9d5371c470c133b141d712 --- /dev/null +++ b/IO_utils/clean_table.py @@ -0,0 +1,321 @@ +import pandas as pd +import numpy as np + + +def clean_table(excel_dir, pre_mRS=2): + """ + This method creates a clean list processing parameter by parameter in the original list to create + a list that can be directly processed and save it + :param excel_dir: path of the excel list + :return: + """ + print('###############################################################') + print('##############Cleaning table ##################################') + print('###############################################################') + + input_df = pd.read_excel(excel_dir) + # Remove first row (info about notation) and last row (empty) + input_df = input_df.drop([0, input_df.shape[0] - 1], axis='rows') + print('######## Initial table with {} patients and {} feature'.format( + input_df.shape[0], input_df.shape[1])) + # Middle cerebral stroke selection + Territoium = input_df['Territoium'] + list_territoium = [Territoium[Territoium == i].count() for i in [0, 1, 2]] + print( + '######## Anterior circulation: {}, Middle circulation {}, Posterior circulation: {}'.format(*list_territoium)) + mc_df = input_df[input_df['Territoium'] == 1] + mc_df = mc_df.drop(['Territoium'], axis=1) + + print('######## Select {} patients with middle circulation stroke'.format(mc_df.shape[0])) + + mc_df = mc_df[mc_df['mRS90d'].notna()] + print('######## Select {} patients with valid output'.format(mc_df.shape[0])) + # Remove patients with shift in mRS <0 + shift = mc_df['mRS90d'] - mc_df['pre-mRS'] + mc_df = mc_df[shift >= 0] + + print('######## Select {} patients with non-negative shift'.format(mc_df.shape[0])) + # Remove patients with pre-mRS > 2 + mc_df = mc_df[mc_df['pre-mRS'] <= pre_mRS] + + print('######## Select {} patients with mRS < {}'.format(mc_df.shape[0], pre_mRS)) + # Remove patients not treated with either trhombectomy or stenting + #rm_indices = mc_df[(mc_df['Stenting '] == 0) & (mc_df['Thrombektomie'] == 0)] + #print('######## Remove {} patients not treated with either stenting or thrombectomy'.format(rm_indices.shape[0])) + mc_df = mc_df[(mc_df['Stenting '] == 1) | (mc_df['Thrombektomie'] == 1)] + # Meta data: id, age and sex + print('######################################## 1- Meta data preprocessing ...') + mc_df = clean_meta_data(mc_df) + + # NCCT values preprocessing + print('######################################## 2 - NCCT values preprocessing ...') + mc_df = clean_ncct_data(mc_df) + + # CTP values preprocessing + print('######################################## 3 - CTP values preprocessing ...') + mc_df = clean_ctp_data(mc_df) + + ##### CTA preprocessing + print('######################################## 4 - CTA values preprocessing ...') + mc_df = clean_cta_data(mc_df) + + ###### Treatment preprocessing + print('######################################## 5 - Treatment values preprocessing ...') + mc_df = clean_treatment_data(mc_df) + + #### Control CT preprocessing + print('######################################## 6 - Control CT values preprocessing ...') + mc_df = clean_ControlCT_data(mc_df) + + #### Preprocessing clinical data + print('######################################## 7 - Clinical values preprocessing ...') + mc_df = clean_clinical_data(mc_df) + + #### Remove dates + print('######################################## 8 - Remove dates ...') + mc_df = remove_dates(mc_df) + print('######## Clean table with {} patients and {} features'.format(mc_df.shape[0], mc_df.shape[1])) + + # mc_df.to_excel(out_dir, columns=mc_df.columns, index=False) + + print('###############################################################') + print('###############################################################') + + return mc_df + + +def remove_dates(mc_df): + mc_df = mc_df.drop(['Geburtsdatum', 'Aufnahmedatum UKER', 'Beginn Angio', 'Zeitpunkt Rekanalisation', + 'Puncture-to-Rekan', 'Zeitpunkt Verlaufs-CT \n(post 24h)', 'Zeit von Rekan bis Kontrolle', + 'Puncture-to-Rekan', 'Zeit von Bildgebung bis Rekan', 'Zeitpunkt 1. Bildgebung', + 'Symptom onset'], axis=1) + return mc_df + + +def clean_meta_data(mc_df): + # Rename columns names + mc_df = mc_df.rename(columns={'Nummer': 'Id', 'Sex (0=männlich, 1=weiblich': 'Sex', 'Alter': 'Age'}) + + # Check that ages are correct + assert np.floor(mc_df['Age']).equals( + np.floor((mc_df['Aufnahmedatum UKER'] - mc_df['Geburtsdatum']) / np.timedelta64(1, 'Y'))) + + # Replace float Age by int Age + print('#### 1.1- Convert Age to int') + mc_df['Age'] = np.floor(mc_df['Age']) + + # Remove modality of first image (all are CT) + mc_df = mc_df.drop(['1. Bildgebung UKER', '1. Bildgebung CT'], axis=1) + + return mc_df + + +def clean_ncct_data(mc_df): + mc_df = mc_df.rename(columns={'e-ASPECTS bzw pc-ASPECTS': 'e-ASPECTS', + 'Volumen e-ASPECTS': 'Volume e-ASPECTS', + 'IVCgesund': 'IVC gesund', + 'ICV-Index': 'ICV Index', + 'Ort Gefäßverschluss \nbei Aufnahme (proximalster)': 'Vessel Occlusion Location Admission', + 'Seite Gefäßverschluss \nbei Aufnahme': 'Vessel Occlusion Side Admission', + 'Symptomatische/vorgeschaltete Gefäßstenose bei Aufnahme': 'Vessel Stenosis', + 'Ort symptomatische/vorgeschaltete Gefäßstenose bei Aufnahme': 'Vessel Stenosis Location', + 'Gefäßdissektion bei Aufnahme': 'Arterial Dissection', + 'Ort Gefäßdissektion': 'Arterial Dissection Location', + 'ASPECTS oberfl/tief': 'ASPECTS oberfl_tief' + }) + + ASPECT_areas = ['C_br', 'IC_br', 'INS_br', 'L_br', 'M1_br', 'M2_br', 'M3_br', 'M4_br', 'M5_br', 'M6_br'] + for area in ASPECT_areas: + mc_df[area] = replace_values(mc_df[area], [9, 'n/a'], [0, np.nan]) + + deep_areas = ['C_br', 'IC_br', 'L_br'] + superficial_areas = ['INS_br', 'M1_br', 'M2_br', 'M3_br', 'M4_br', 'M5_br', 'M6_br'] + for index, patient in mc_df.iterrows(): + deep_stroke = patient[deep_areas].values + superficial_stroke = patient[superficial_areas].values + if any(deep_stroke) == 1: + if any(superficial_stroke) == 1: + a = 2 + else: + a = 1 + + elif any(superficial_stroke) == 1: + if any(deep_stroke) == 1: + a = 2 + else: + a = 0 + else: + a = 3 + + mc_df.loc[index, 'ASPECTS oberfl_tief'] = a + + mc_df['e-ASPECTS'] = replace_values(mc_df['e-ASPECTS'], [11], [np.nan]) + + mc_df['pc-ASPECTS'] = replace_values(mc_df['pc-ASPECTS'], [0], [np.nan]) + mc_df['Volume e-ASPECTS'] = replace_values(mc_df['Volume e-ASPECTS'], ['n/a'], [np.nan]) + + return mc_df + + +def clean_ctp_data(mc_df): + # Two patient with missing CTP, + # One -only patient with a image missing+ missing mRS+ a lot of data (posterior circulation) + # Second also posterior circulation + # mc_df['1. Bildgebung CT-P'] = replace_values(mc_df['1. Bildgebung CT-P'], ['n/a', 0], [np.nan, np.nan]) + # mc_df = mc_df.drop(mc_df[mc_df['1. Bildgebung CT-P'] == np.nan].index) + # Remove CTP - if using posterior circulation I have to modify this + mc_df = mc_df.drop(['1. Bildgebung CT-P'], axis=1) + + mc_df = mc_df.rename(columns={'Mismatch Volumen nach RAPID': 'Mismatch Volume', + 'Mismatch Ratio nach RAPID': 'Mismatch Ratio', + 'HypoperfusionIndex (Tmax10s/&max6.5s': 'Hypoperfusion Index', + 'CBV Index (rCBV in Tmax>6': 'CBV Index', + 'CBF <30% lesion volume': 'CBF_lower30_volume', + 'Tmax >6s lesion volume': 'Tmax_greater6s_volume' + }) + + list_pct_features = ['CBF_lower30_volume', 'Tmax_greater6s_volume', 'Mismatch Volume', + 'Hypoperfusion Index', 'CBV Index'] + + for feature in list_pct_features: + mc_df[feature] = replace_values(mc_df[feature], ['n/a'], [np.nan]) + + # Infinite value is not replaced + # mc_df['Mismatch Ratio'] = replace_values(mc_df['Mismatch Ratio'], ['n/a', 'none'], [np.nan, np.nan]) + print('#### 3.1 - Calculate inverse mismatch ratio - avoid infinite') + mc_df['Inverse Mismatch Ratio'] = mc_df['CBF_lower30_volume'] / replace_values(mc_df['Tmax_greater6s_volume'], + [0], [0.001]) + + print('#### 3.2 - Remove Mismatch Ratio ') + mc_df = mc_df.drop(['Mismatch Ratio'], axis=1) + # Delete values from Brainomix, only in patient from 2018 + print('#### 3.3 - Remove features obtained with Brainomix (only in patients from 2018) ') + mc_df = mc_df.drop(['Braino CBF<30%', 'Braino Tmax >6s', 'Braino Mismatch vol', 'Braino Hypoperfindex'], axis=1) + + return mc_df + + +def clean_cta_data(mc_df): + # All patients have CTA - remove + mc_df = mc_df.drop(['Akute DSA '], axis=1) + + mc_df = mc_df.rename(columns={'Gefäßverschluss DSA ': 'Vessel Occlusion CTA' + }) + + # Remove empty column + mc_df = mc_df.drop(['TTP lesion volume'], axis=1) + print('#### 4.1 - Remove tandem stenosis feature- no patient with it') + # Remove empty column + mc_df = mc_df.drop(['Tandemstenose'], axis=1) + + # Replace n/a + for feature in ['Tan Score', 'Coves Score', 'BATMAN', 'Clot Burden Score']: + mc_df[feature] = replace_values(mc_df[feature], ['n/a'], [np.nan]) + + return mc_df + + +def clean_treatment_data(mc_df): + mc_df = mc_df.rename(columns={'Thrombektomie': 'Thrombectomy', + 'ggf. weiteres Device': 'PTA', + 'Anzahl der Manöver': 'Number Maneuver', + 'Puncture-to-Rekan in min.': 'Time_Puncture_to_Recan.', + 'frustrane Rekanalisation': 'Frustrated Recanalization', + 'Gefäßverschluss nach Rekanalisation': 'Vessel Occlusion after Recan.', + 'TICI ': 'TICI', + 'Lyse i.a.': 'Lysis i.a.', + 'Lysemenge': 'Lysis quantity', + 'Gefäßverschluss in neuem Versorgungsgebiet \n(während/nach Intervention)': 'Vessel Occlusion new SupplyArea', + 'Lokalisation Gefäßverschluss': 'Vessel Occlusion new SupplyArea Location', + 'Behandlung (des neuen Gefäßverschlusses)': 'Vessel Occlusion new SupplyArea Treatment', + 'Infarkt in neuem Versorgungsgebiet': 'Infarct new SupplyArea', + 'Stenting ': 'Stenting' + }) + + mc_df['Device'] = replace_values(mc_df['Device'], [0], [np.nan]) + mc_df['PTA'] = replace_values(mc_df['PTA'], [np.nan], [0]) + mc_df['Time_Puncture_to_Recan.'] = replace_values(mc_df['Time_Puncture_to_Recan.'], ['#ZAHL!'], [np.nan]) + mc_df['Lysis i.a.'] = replace_values(mc_df['Lysis i.a.'], [9], [0]) + mc_df['Lysis quantity'] = replace_values(mc_df['Lysis quantity'], [np.nan, 'n/a'], [0, 0]) + mc_df['Infarct new SupplyArea'] = replace_values(mc_df['Infarct new SupplyArea'], ['n/a'], [np.nan]) + + return mc_df + + +def clean_ControlCT_data(mc_df): + # Missing in 5 patients: 4 died? during recanalization, 1 because new recanalization was performed + # Remove empty column + + mc_df = mc_df.rename(columns={'Zeit von Rekan bis Kontrolle in min.': 'Time_Recan_to_Control', + 'Zeit von Bildgebung bis Rekan in min': 'Time_CT_to_Angio.', + 'lakuärer Infarkt (keine Kortexbeteiligung und Infarkt ≤1,5cm ' + '(≤2,0cm falls mittels DWI in MRT gemessen) im größten Durchmesser f' + 'alls ausschließlich subkortikal)': 'Lacunar Infarct', + 'Infarktvolumen Verlaufs-CT': 'Infarct Volume ControlCT', + 'Hyperdense Mediazeichen': 'Hyperdense Media Sign', + 'pc-Aspect verlaufss-CT': 'pc-Aspect ControlCT', + 'Aspect Verlaufs-CT': 'Aspect ControlCT', + }) + + mc_df = mc_df.drop(['Dauer der Rekan'], axis=1) + mc_df['Time_CT_to_Angio.'] = replace_values(mc_df['Time_CT_to_Angio.'], ['#WERT!'], [np.nan]) + + mc_df['Infarct Volume ControlCT'] = replace_values(mc_df['Infarct Volume ControlCT'], ['n/a'], [np.nan]) + mc_df['pc-Aspect ControlCT'] = replace_values(mc_df['pc-Aspect ControlCT'], ['n/a', 0], [np.nan, np.nan, ]) + mc_df['Aspect ControlCT'] = replace_values(mc_df['Aspect ControlCT'], ['n/a'], [np.nan]) + + return mc_df + + +def clean_clinical_data(mc_df): + # + mc_df = mc_df.rename(columns={'Symptom onset / LSN': 'Symptom onset', + 'unknown onset(1=onset unbekannt, LSN angegeben)': 'Unknown Onset' + }) + + # + + # Create dichotomized functional outome + + print('#### 7.1 - Add dichotomized mRS') + mc_df['dmRS'] = [1 if i > 2 else (np.nan if np.isnan(i) else 0) for i in mc_df['mRS90d']] + print('#### 7.2 - Add mRS shift') + mc_df['shift_mRS'] = mc_df['mRS90d'] - mc_df['pre-mRS'] + print('#### 7.3 - Add mortality') + mc_df['mortality'] = [1 if i == 6 else (np.nan if np.isnan(i) else 0) for i in mc_df['mRS90d']] + + mc_df['Unknown Onset'] = replace_values(mc_df['Unknown Onset'], [np.nan], [1]) + mc_df['Zeitpunkt 1. Bildgebung'] = pd.to_datetime(mc_df['Zeitpunkt 1. Bildgebung']) + mc_df['Time_Onset_to_Admission'] = (mc_df['Zeitpunkt 1. Bildgebung'] - mc_df['Symptom onset']).dt.seconds / 60 + #mc_df.loc[ mc_df['Unknown Onset']==1,'Time_Onset_to_Admission'] = -1 + + # Handle unknown times + # u = (mc_df['Unknown Onset'] == 1).values + # mean = np.nanmedian(mc_df['Time_Onset_to_Admission'][u]) + # mc_df['Time_Onset_to_Admission'] = replace_values(mc_df['Time_Onset_to_Admission'], [np.nan], [mean]) + + return mc_df + + +def replace_values(column, original_values=[], target_values=[]): + assert len(original_values) == len(target_values), 'Length of both lists have to be the same' + + for (i, j) in zip(original_values, target_values): + column.replace(i, j, inplace=True) + + return column + + +def get_values(column): + unique_values = column.unique() + values = [] + number = [] + + for i in unique_values: + values.extend([i]) + if pd.isna(i): + number.extend([column.isna().sum()]) + else: + number.extend([column[column == i].count()]) + + print(values, number) diff --git a/IO_utils/mv_strategies.py b/IO_utils/mv_strategies.py new file mode 100644 index 0000000000000000000000000000000000000000..ac266c8690073b1023fae5a1abfa4edce7f7d398 --- /dev/null +++ b/IO_utils/mv_strategies.py @@ -0,0 +1,258 @@ +import copy +from mrmr import mrmr_classif +import numpy as np +import pandas as pd +from sklearn.feature_selection import RFE +from sklearn.ensemble import RandomForestClassifier + +from IO_utils.statistics_utils import get_pvalue +from IO_utils.FeaturePreprocessing import FeaturePreprocessing + + +def select_features_pvalue(df, data_dictionaries, output_vector, fold_indices, k=10): + # Get types of features -- different methods to compute p value + types = [] + for t in data_dictionaries.keys(): + if t not in ['Id']: + types.extend([data_dictionaries[t]['type']]) + all_p_values = np.zeros((df.shape[1] - 1)) + for indices in fold_indices: + _, _, test_indices = indices + drop_indices = df.index[test_indices] + df_copy = copy.copy(df.drop(drop_indices)) + + p_values, _ = get_pvalue(df_copy.drop(columns=['Id']), output_vector, types) + all_p_values += np.array(p_values) + + #sorted_indices = np.argsort(all_p_values) + #sorted_values = np.sort(all_p_values) + #for i, v in zip(sorted_indices, sorted_values): + # print(df.columns[i + 1], 'P_value: {} '.format(v)) + + smaller_indices = np.argsort(all_p_values)[:k] + indices = [True if a in smaller_indices else False for a in range(df.shape[1] - 1)] + indices = [False] + indices + selected_features = df.columns[indices] + + print("Features selected (p-value): {} ".format(selected_features.values)) + + return selected_features + + +def select_features_RF(df, data_dictionaries, output_vector, fold_indices, k=10): + global all_featuress_imp + FP = FeaturePreprocessing(df, data_dictionaries) + all_feature = FP.create_features(df) + all_output = output_vector.to_numpy(dtype=int).squeeze() + + all_features_imp = np.zeros((all_feature.shape[1])) + + for indices in fold_indices: + train_indices, val_indices, test_indices = indices + + features = copy.copy(all_feature)[train_indices + val_indices, :] + output = copy.copy(all_output)[train_indices + val_indices] + + clf = RandomForestClassifier(n_estimators=100, random_state=0) + clf.fit(features, output) + all_features_imp = all_features_imp + clf.feature_importances_ + + # plt.figure(num=None, figsize=(10, 8), dpi=80, facecolor='w', edgecolor='k') + names = FP.get_feature_names() + feat_imp = pd.Series(all_features_imp, index=names) + + # Get original features + features = [] + sorted_value = feat_imp.sort_values(ascending=False) + for next_feature in sorted_value.index: + + feature = next_feature.split('#')[0] + if feature not in features: + features.extend([feature]) + if len(features) == k: + break + + # plt.show() + print("Features selected (Ranfom-forest): {} ".format(features)) + + return features + + +def select_features_RFE(df, data_dictionaries, output_vector, fold_indices, k=10): + global all_featuress_imp + FP = FeaturePreprocessing(df, data_dictionaries) + all_feature = FP.create_features(df) + all_output = output_vector.to_numpy(dtype=int).squeeze() + + all_features_ranking = np.zeros((all_feature.shape[1])) + + for indices in fold_indices: + train_indices, val_indices, test_indices = indices + + features = copy.copy(all_feature)[train_indices + val_indices, :] + output = copy.copy(all_output)[train_indices + val_indices] + + clf = RandomForestClassifier(n_estimators=100, random_state=0) + rfe = RFE(estimator=clf, n_features_to_select=k, step=1) + rfe.fit(features, output) + all_features_ranking = all_features_ranking + rfe.ranking_ + + names = FP.get_feature_names() + feat_imp = pd.Series(all_features_ranking, index=names) + + # Get original features + features = [] + sorted_value = feat_imp.sort_values(ascending=True) + for next_feature in sorted_value.index: + + feature = next_feature.split('#')[0] + if feature not in features: + features.extend([feature]) + if len(features) == k: + break + + # Reorder features + features = [f for f in df.columns if f in features] + + # plt.show() + print("Features selected (RFE): {} ".format(features)) + + return features + + +def combined_selection(df, data_dictionaries, output_vector, fold_indices, k=10): + # Get types of features -- different methods to compute p value + + ### 1 - Select k*2 features by p value + types = [] + for t in data_dictionaries.keys(): + if t not in ['Id']: + types.extend([data_dictionaries[t]['type']]) + all_p_values = np.zeros((df.shape[1] - 1)) + for indices in fold_indices: + _, _, test_indices = indices + drop_indices = df.index[test_indices] + df_copy = copy.copy(df.drop(drop_indices)) + + p_values, _ = get_pvalue(df_copy.drop(columns=['Id']), output_vector, types) + all_p_values += np.array(p_values) + + sorted_indices = np.argsort(all_p_values) + sorted_values = np.sort(all_p_values) + for i, v in zip(sorted_indices, sorted_values): + print(df.columns[i + 1], 'P_value: {} '.format(v)) + + smaller_indices = np.argsort(all_p_values)[:k * 2] + indices = [True if a in smaller_indices else False for a in range(df.shape[1] - 1)] + indices = [False] + indices + selected_features = df.columns[indices] + + print("Features selected (p-value): {} ".format(selected_features.values)) + + global all_featuress_imp + df_reduced = df[selected_features] + FP = FeaturePreprocessing(df_reduced, data_dictionaries) + all_feature = FP.create_features(df) + all_output = output_vector.to_numpy(dtype=int).squeeze() + + all_features_ranking = np.zeros((all_feature.shape[1])) + + for indices in fold_indices: + train_indices, val_indices, test_indices = indices + + features = copy.copy(all_feature)[train_indices + val_indices, :] + output = copy.copy(all_output)[train_indices + val_indices] + + clf = RandomForestClassifier(n_estimators=5, max_depth=5, random_state=0) + rfe = RFE(estimator=clf, n_features_to_select=k, step=1) + rfe.fit(features, output) + all_features_ranking = all_features_ranking + rfe.ranking_ + + names = FP.get_feature_names() + feat_imp = pd.Series(all_features_ranking, index=names) + + # Get original features + features = [] + sorted_value = feat_imp.sort_values(ascending=True) + for next_feature in sorted_value.index: + + feature = next_feature.split('#')[0] + if feature not in features: + features.extend([feature]) + if len(features) == k: + break + + # Reorder features + features = [f for f in df.columns if f in features] + + # plt.show() + print("Features selected (RFE): {} ".format(features)) + + return features + + +def select_features_MRMR(df, data_dictionaries, output_vector, fold_indices, k=10): + global all_featuress_imp + FP = FeaturePreprocessing(df, data_dictionaries) + all_feature = FP.create_features(df) + all_output = output_vector.to_numpy(dtype=int).squeeze() + + all_features_ranking = np.zeros((all_feature.shape[1])) + + for indices in fold_indices: + train_indices, val_indices, test_indices = indices + + features = copy.copy(all_feature)[train_indices + val_indices, :] + output = copy.copy(all_output)[train_indices + val_indices] + + selected_features = mrmr_classif(features, output, K=k) + for counter, i in enumerate(selected_features): + all_features_ranking[i] += len(selected_features) - counter + # + # all_features_ranking = all_features_ranking + rfe.ranking_ + + names = FP.get_feature_names() + feat_imp = pd.Series(all_features_ranking, index=names) + + # Get original features + features = [] + sorted_value = feat_imp.sort_values(ascending=False) + for next_feature, next_value in zip(sorted_value.index, sorted_value.values): + + feature = next_feature.split('#')[0] + # print(feature, next_value) + if feature not in features: + features.extend([feature]) + if len(features) == k: + break + + # Reorder features + features = [f for f in df.columns if f in features] + # plt.show() + print("Features selected (MRMR): {} ".format(features)) + + return features + + +def select_features_MRMR_fold(df, data_dictionaries, output_vector, fold_indices, k=10): + global all_featuress_imp + FP = FeaturePreprocessing(df, data_dictionaries) + all_feature = FP.create_features(df) + all_output = output_vector.to_numpy(dtype=int).squeeze() + all_features = [] + + for i, indices in enumerate(fold_indices): + train_indices, val_indices, test_indices = indices + + features = copy.copy(all_feature)[train_indices + val_indices, :] + output = copy.copy(all_output)[train_indices + val_indices] + + selected_features = mrmr_classif(features, output, K=k) + names = FP.get_feature_names() + selecte_features_names = [names[j].split('#')[0] for j in selected_features] + s = [f for f in df.columns if f in selecte_features_names] + all_features.append(s) + + print("Features selected in fold {} (MRMR): {} ".format(i, s)) + + return all_features diff --git a/IO_utils/split_utils.py b/IO_utils/split_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..bcd345d7c96964e38e012b83e1afdde2c56ff9ea --- /dev/null +++ b/IO_utils/split_utils.py @@ -0,0 +1,63 @@ +import copy +import random +import numpy as np + +def split_data_cv(output, seed, cv=5, p=1): + np.random.seed(seed) + random.seed(seed) + classes = np.unique(output) + indices_per_class = [np.where(output == c)[0] for c in classes] + + # Make train and test splits with same number of classes + p_split = np.ones(cv) * 1 / cv + all_indices = split_stratified(p_split, classes, indices_per_class) + folds_indices = [] + + for fold in range(cv): + cv_test_indices = copy.copy(all_indices[fold]) + cv_train_folds = copy.copy(all_indices) + cv_train_folds.pop(fold) + + + cv_train_val_indices = [item for sublist in cv_train_folds for item in sublist] + cv_train_val_indices_array = np.array(cv_train_val_indices) + + output_train_val = output[cv_train_val_indices] + indices_train_val_per_class = [cv_train_val_indices_array[np.where(output_train_val == c)[0].tolist()] for c in classes] + train_val_all_indices = split_stratified((0.8, 0.2), classes, indices_train_val_per_class) + + #random.shuffle(cv_train_val_indices) + #sp = int(0.8 * len(cv_train_val_indices)) + #cv_train_indices = cv_train_val_indices[:sp] + #cv_train_indices = cv_train_indices[:(int(p * len(cv_train_indices)))] + #cv_val_indices = cv_train_val_indices[sp:] + + cv_train_indices = copy.copy(train_val_all_indices[0]) + cv_val_indices = copy.copy(train_val_all_indices[1]) + + folds_indices.append([cv_train_indices, cv_val_indices, cv_test_indices]) + + return folds_indices + +def split_stratified(p_split, classes, indices_per_class): + t = [0] * len(classes) + indices_split = [] + + for c in (range(len(classes))): + random.shuffle(indices_per_class[c]) + + for count, sp in enumerate(p_split): + + indices = [] + + for c in range(len(classes)): + indices_c = indices_per_class[c] + t0 = t[c] + t1 = np.int(t0 + (indices_c.shape[0] * sp) + 0.5) if count != len(p_split) - 1 else indices_c.shape[0] + a = list(indices_c[t0:t1]) + indices.extend(a) + t[c] = t1 + random.shuffle(indices) + indices_split.append(indices) + + return indices_split diff --git a/IO_utils/statistics_utils.py b/IO_utils/statistics_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e7e58a96b46a93bdfda515412cdaaa409144c81e --- /dev/null +++ b/IO_utils/statistics_utils.py @@ -0,0 +1,159 @@ +import numpy as np +import pandas as pd +import rpy2.robjects as robjects +from rpy2.robjects import numpy2ri +import scipy + + +def compute_basic_statistics(df): + output = pd.DataFrame(index=df.columns) + + means = [] + medians = [] + stds = [] + ci = [] + + for i, column in enumerate(list(df)): + rmeans = robjects.r['mean'] + meanr = rmeans(robjects.FloatVector(df[column]))[0] + + mean = np.nanmean(df[column].astype('float32')) + means.extend([mean]) + median = np.nanmedian(df[column].astype('float32')) + medians.extend([median]) + std = np.nanstd(df[column].astype('float32'), ddof=1) + stds.extend([std]) + if mean != 0 and std != 0: + ci_low, ci_high = scipy.stats.norm.interval(0.95, loc=mean, scale=std) + else: + ci_low, ci_high = 0, 0 + ci.extend([[ci_low, ci_high]]) + + output['mean'] = means + output['std'] = stds + output['0.95 confidence interval'] = ci + output['median'] = medians + + return output + + +def compute_bivariate_statistics(df, dmRS, mortality, categories_list): + + output_bivartiate = pd.DataFrame(index=df.columns) + + means1 = [] + means2 = [] + means3 = [] + means4 = [] + + stds1 = [] + stds2 = [] + stds3 = [] + stds4 = [] + + for i, column in enumerate(list(df)): + + if categories_list[i] == 'cat': + + count1 = len(df[(dmRS == 0) & (df[column]==0)]) + means1.extend([count1]) + stds1.extend([0]) + + count2 = len(df[(dmRS == 1) & (df[column] == 0)]) + means2.extend([count2]) + stds2.extend([0]) + + count3 = len(df[(mortality == 0) & (df[column] == 0)]) + means3.extend([count3]) + stds3.extend([0]) + + count4 = len(df[(mortality == 1) & (df[column]==0)]) + means4.extend([count4]) + stds4.extend([0]) + + else: + + mean1 = np.nanmean(df[dmRS == 0][column].astype('float32')) + means1.extend([mean1]) + std1 = np.nanstd(df[dmRS == 0][column].astype('float32'), ddof=1) + stds1.extend([std1]) + + mean2 = np.nanmean(df[dmRS == 1][column].astype('float32')) + means2.extend([mean2]) + std2 = np.nanstd(df[dmRS == 1][column].astype('float32'), ddof=1) + stds2.extend([std2]) + + mean3 = np.nanmean(df[mortality == 0][column].astype('float32')) + means3.extend([mean3]) + std3 = np.nanstd(df[mortality== 0][column].astype('float32'), ddof=1) + stds3.extend([std3]) + + mean4 = np.nanmean(df[mortality== 1][column].astype('float32')) + means4.extend([mean4]) + std4 = np.nanstd(df[mortality == 1][column].astype('float32'), ddof=1) + stds4.extend([std4]) + + output_bivartiate['mean_mrs0'] = means1 + output_bivartiate['mean_mrs1'] = means2 + output_bivartiate['mean_mortality0'] = means3 + output_bivartiate['mean_mortality1'] = means4 + + output_bivartiate['std_mrs0'] = stds1 + output_bivartiate['std_mrs1'] = stds2 + output_bivartiate['std_mortality0'] = stds3 + output_bivartiate['std_mortality1'] = stds4 + + return output_bivartiate + + +def get_pvalue(df, target, categories_list): + methods = [] + p_values = [] + + for i, column in enumerate(list(df)): + # print(column) + if df[column].isnull().all(): + methods.extend(['None']) + p_values.extend([1]) + continue + + if categories_list[i] == 'cat': + contingency_table = pd.crosstab(index=df[column], columns=target) + # This test should only be used if the observed and expected frequencies in each cell are at least 5 + + if np.min(contingency_table.values) < 5: + if contingency_table.shape[1] > 1 and contingency_table.shape[0] > 1: + # oddsratio, p = scipy.stats.fisher_exact(contingency_table) + # print(contingency_table) + ## R Code + FisherTestR = robjects.r['fisher.test'] + numpy2ri.activate() + p = float(np.array(FisherTestR(contingency_table.to_numpy(), workspace=2e8)[0])[0]) + + method = 'Fisher test' + else: + p = 1 + method = 'None' + + else: + chi2, p, dof, expected = scipy.stats.chi2_contingency(contingency_table) + method = 'Chi-squared' + + elif categories_list[i] in ['int', 'float', 'ord']: + + a = df[column][target == 0] + b = df[column][target == 1] + stat, p = scipy.stats.ttest_ind(a.dropna(), b.dropna()) + method = 'T-test' + + else: + if column == 'Id': + p = 0 + method = 'None' + else: + raise ValueError(column, 'Statistical test for', categories_list[i], 'not implemented') + + methods.extend([method]) + p_values.extend([p]) + + return p_values, methods diff --git a/Loss/Loss_uncertainty.py b/Loss/Loss_uncertainty.py new file mode 100644 index 0000000000000000000000000000000000000000..5c749b78d6206270f7117d97139b01ff8621612b --- /dev/null +++ b/Loss/Loss_uncertainty.py @@ -0,0 +1,67 @@ +import torch + + +def old_loss(y, f, s): + y = torch.squeeze(y) + + """" + a1= torch.exp(-sigma) + a2= torch.log(f) + a3= torch.mul(y, a2) + a = torch.mul(a1, a3) + b = torch.mul(sigma, y) + loss = torch.div((-a+b),2) + loss = torch.sum(loss, dim=1) + """ + ce1 = torch.log(f) + ce2 = torch.mul(ce1, y) + cross_entropy_error = torch.mean(ce2, dim=1) + sigma = torch.exp(-s) + term1 = -torch.mul(cross_entropy_error, sigma) + term2 = s / 2 + + loss = torch.div(term1 + term2, 2) + + return loss + + +class loss_uncertainty(object): + + def __init__(self, weights): + self.total_loss = torch.zeros([1]) + self.sum = torch.zeros([1]) + self.elements = 0 + self.weights = weights + + + def get_loss(self, f, y): + y = torch.squeeze(y) + assert y.shape[1] == self.weights.shape[0] + + weights_vector = self.weights.expand([f.shape[0], -1]) + weights_vector = torch.mul(weights_vector, y) + weights_vector = torch.sum(weights_vector, dim=1) + + + a = torch.mul(f, y) + ce2 = -torch.sum(a, dim=1) + class_weighted_ce2 = torch.mul(weights_vector, ce2) + + self.update_total_loss(class_weighted_ce2, f.shape[0]) + + total_loss = torch.mean(class_weighted_ce2) + + return total_loss + + def update_total_loss(self, loss, elements): + self.sum = torch.sum(loss).detach().cpu().numpy() + self.elements += elements + self.total_loss = self.sum / self.elements + + def get_total_loss(self): + return self.total_loss + + def clear(self): + self.total_loss = torch.zeros([1]) + self.sum = torch.zeros([1]) + self.elements = 0 diff --git a/Loss/__pycache__/Loss_uncertainty.cpython-37.pyc b/Loss/__pycache__/Loss_uncertainty.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7655bee12f0561e92ed7baebd3a93b4325afb6dd Binary files /dev/null and b/Loss/__pycache__/Loss_uncertainty.cpython-37.pyc differ diff --git a/Metrics/ClassificationMetrics.py b/Metrics/ClassificationMetrics.py new file mode 100644 index 0000000000000000000000000000000000000000..7e40387b359c62c6172d96187c74e9f082e3a738 --- /dev/null +++ b/Metrics/ClassificationMetrics.py @@ -0,0 +1,126 @@ +import numpy as np +from sklearn.metrics import roc_auc_score +import torch + + +class ClassificationMetrics(object): + + def __init__(self, classes): + self.classes = classes + self.cm = np.zeros((self.classes, self.classes)) + self.accuracy = 0 + self.balanced_accuracy = 0 + self.recall = 0 + self.precision = 0 + self.f1 = 0 + self.auc = 0 + self.nll = 0 + self.pred = [] + self.y = [] + + def compute_metrics(self, pred, y): + """ + + :param pred: + :param y: + :return: + """ + + self.pred.extend([pred]) + self.y.extend([y]) + + self.cm = confusion_matrix(self.cm, pred, y) + self.accuracy = np.sum(np.diagonal(self.cm) / np.sum(np.sum(self.cm))) + self.nll = -np.mean(np.sum(np.multiply(self.y[0], np.log(self.pred[0])), axis=1)) + epsilon = np.finfo(float).eps + self.precision = [self.cm[i, i] / (np.sum(self.cm[:, i] + epsilon)) for i in range(self.classes)] + self.recall = [self.cm[i, i] / (np.sum(self.cm[i, :] + epsilon)) for i in range(self.classes)] + self.f1 = [(2 * self.precision[i] * self.recall[i]) / (self.precision[i] + self.recall[i] + epsilon) + for i in range(self.classes)] + self.balanced_accuracy = np.mean(self.recall) + + try: + if len(self.y[0].squeeze().shape) > 1: + a = np.argmax(self.y[0], axis=1) + else: + a = np.array(self.y[0]).squeeze() + + # self.auc = roc_auc_score(a, b, multi_class=mc) + self.auc = self.custom_auc(self.pred[0], a) + + except: + self.auc = 0.5 + + metrics = {'cm': self.cm, + 'accuracy': self.accuracy, + 'precision': self.precision, + 'recall': self.recall, + 'f1': self.f1, + 'balanced_accuracy': self.balanced_accuracy, + 'auc': self.auc, + 'nll': self.nll} + + return metrics + + def clear(self): + self.cm = np.zeros((self.classes, self.classes)) + self.accuracy = 0 + self.balanced_accuracy = 0 + self.recall = 0 + self.precision = 0 + self.f1 = 0 + self.auc = 0 + self.pred = [] + self.y = [] + + def multiclass_to_binary(self, pred): + + binary_pred = np.zeros((pred.shape[0], 2)) + + prob_classs0 = np.divide(np.sum(pred[:, 0:2], axis=1), 3) + prob_classs1 = np.divide(np.sum(pred[:, 2:6], axis=1), 4) + + binary_pred[:, 0] = prob_classs0 + binary_pred[:, 1] = prob_classs1 + + return binary_pred + + def custom_auc(self, prob, labels): + + unique_labels = np.unique(labels) + all_auc = 0 + + for l, label in enumerate(unique_labels): + + prob_l = prob[:, l] + class_indices = [i for i in range(len(labels)) if labels[i] == label] + rest_indices = [i for i in range(len(labels)) if labels[i] != label] + + suma = 0 + auc = 0 + for i in class_indices: + for j in rest_indices: + auc += (int(prob_l[i] < prob_l[j])) + (0.5 * int(prob_l[i] == prob_l[j])) + suma += 1 + + all_auc += 1 - (auc / suma) + + return all_auc / unique_labels.shape[0] + + +def confusion_matrix(cm, pred, y): + # assert (pred.shape == y.shape), "Input size does not match target size" + for i in range(pred.shape[0]): + if len(pred.squeeze().shape) > 1: + a = np.argmax(pred[i, :]) + else: + a = int(np.round(pred[i])) + + if len(y.squeeze().shape) > 1: + b = np.argmax(y[i, :]) + else: + b = int(y[i]) + + cm[a][b] += 1 + + return cm diff --git a/Metrics/RegressionMetrics.py b/Metrics/RegressionMetrics.py new file mode 100644 index 0000000000000000000000000000000000000000..4b11007ab0889279ff2d44773c71ba1a8830af1c --- /dev/null +++ b/Metrics/RegressionMetrics.py @@ -0,0 +1,55 @@ +import numpy as np +from Metrics._utils import int_to_binary +from Metrics.ClassificationMetrics import ClassificationMetrics + + +class RegressionMetrics(object): + + def __init__(self): + self.dif = [] + self.mse = 0 + self.mae = 0 + self.median_ae = 0 + + self.pred = [] + self.y = [] + + self.cm = ClassificationMetrics(classes=2) + + def compute_metrics(self, pred, y): + """ + + :param pred: + :param y: + :return: + """ + + self.pred.extend([pred[:, 0]]) + self.y.extend([y]) + + self.dif = np.abs(self.pred[0].squeeze() - self.y[0]) + self.mae = np.mean(self.dif) + self.median_ae = np.median(self.dif) + self.mse = np.mean(np.power(self.dif, 2)) + + metrics = {'mae': self.mae, + 'mse': self.mse, + 'median_ae': self.median_ae + } + + binary_y = int_to_binary(y, threshold=2) + binary_pred = int_to_binary(pred[:, 0], threshold=2) + clas_metrics = self.cm.compute_metrics(binary_pred, binary_y) + metrics.update(clas_metrics) + # Convert to binary prediction + + return metrics + + def clear(self): + self.mae = 0 + self.mse = 0 + self.median_ae = 0 + + self.pred = [] + self.y = [] + self.cm.clear() diff --git a/Metrics/__pycache__/ClassificationMetrics.cpython-37.pyc b/Metrics/__pycache__/ClassificationMetrics.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c52322bd8b9f23331bd9504604d94943e5fea4f4 Binary files /dev/null and b/Metrics/__pycache__/ClassificationMetrics.cpython-37.pyc differ diff --git a/Metrics/__pycache__/RegressionMetrics.cpython-37.pyc b/Metrics/__pycache__/RegressionMetrics.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2df34842674b3cb390c2ab9f89223d4a7582fef2 Binary files /dev/null and b/Metrics/__pycache__/RegressionMetrics.cpython-37.pyc differ diff --git a/Metrics/__pycache__/_utils.cpython-37.pyc b/Metrics/__pycache__/_utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1210ca5be0dcb0a67b7f8deba0ec3accdefa8cc2 Binary files /dev/null and b/Metrics/__pycache__/_utils.cpython-37.pyc differ diff --git a/Metrics/_utils.py b/Metrics/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c8b3432c6ab665565d6ecf8566c712fedd18d020 --- /dev/null +++ b/Metrics/_utils.py @@ -0,0 +1,7 @@ +import numpy as np + + +def int_to_binary(vector, threshold): + a = [1 if np.floor(i*6) > threshold else 0 for i in vector] + + return np.array(a) diff --git a/README.md b/README.md deleted file mode 100644 index 0074788a857f219f7b06573fcfacfb2dc9647cd7..0000000000000000000000000000000000000000 --- a/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# AIS_Regress - diff --git a/__pycache__/evaluate_model.cpython-37.pyc b/__pycache__/evaluate_model.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..322b92ce507f58b1850b31118dd901c41cf6da33 Binary files /dev/null and b/__pycache__/evaluate_model.cpython-37.pyc differ diff --git a/__pycache__/train.cpython-37.pyc b/__pycache__/train.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e05fa6688d18b45eec70987ff4883af0f7ac0409 Binary files /dev/null and b/__pycache__/train.cpython-37.pyc differ diff --git a/__pycache__/train_graph.cpython-37.pyc b/__pycache__/train_graph.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..966d938bfed7c0d6f1b8803f3d022ec82c7a5032 Binary files /dev/null and b/__pycache__/train_graph.cpython-37.pyc differ diff --git a/_utils/Result_container.py b/_utils/Result_container.py new file mode 100644 index 0000000000000000000000000000000000000000..0e95deeced1960ec25562a723503368028c48b35 --- /dev/null +++ b/_utils/Result_container.py @@ -0,0 +1,54 @@ +import numpy as np +import os +import pandas as pd +import pprint + +def mean_dicts(list_dicts): + number_dicts = len(list_dicts) + keys_dicts = list_dicts['Fold 0'].keys() + + mean_dict = {} + for k in keys_dicts: + + values = [list_dicts['Fold {}'.format(l)][k] for l in range(number_dicts)] + + mean_value = values[number_dicts - 1] + for l in range(number_dicts - 1): + mean_value = np.add(mean_value, values[l]) + if k != 'cm': + mean_value /= number_dicts + mean_dict[k] = mean_value + return mean_dict + +class Result_container(object): + + def __init__(self, target_metrics, output): + + self.target_metrics = {} + self.results = {} + for o in output: + self.results[o]= {} + for m in target_metrics: + self.results[o][m] = {} + + def update(self,output, method, metrics): + + + mean_m = mean_dicts(metrics) + #pprint.pprint(mean_m) + for k in self.results[output].keys(): + self.results[output][k][method] = mean_m[k] + + def save(self, output_dir, name=None): + dir_name = os.path.join(output_dir, (name+'.xlsx')) + writer = pd.ExcelWriter(path=dir_name) + + for o in self.results.keys(): + + # Write each dataframe containing a metric for all methods to a different worksheet + #items = self.results.items() + sheet_metric = pd.DataFrame.from_dict(self.results[o]) + sheet_metric.to_excel(writer, sheet_name=o) + + # Close the Pandas Excel writer and output the Excel file. + writer.save() diff --git a/_utils/__pycache__/Result_container.cpython-37.pyc b/_utils/__pycache__/Result_container.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb327b26fa5271aa851b3f12d0161b29996ea1dd Binary files /dev/null and b/_utils/__pycache__/Result_container.cpython-37.pyc differ diff --git a/_utils/__pycache__/plot_utils.cpython-37.pyc b/_utils/__pycache__/plot_utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08e282b35f76714334f984cdfdfdb75929a6f35c Binary files /dev/null and b/_utils/__pycache__/plot_utils.cpython-37.pyc differ diff --git a/_utils/plot_utils.py b/_utils/plot_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..73fd8e9042a1381d0184240972261cba10509d72 --- /dev/null +++ b/_utils/plot_utils.py @@ -0,0 +1,203 @@ +import matplotlib.pyplot as plt +import numpy as np +import os +import pandas as pd +import seaborn as sns + + +def plot_significant_values(df, value, th, out_dir): + df = df.sort_values(by=[value]) + labels = df.index + values = df[value].values + indices = np.where(values < th) + + df= pd.DataFrame(data=values[indices], index=labels[indices]) + file_path = os.path.join(out_dir, '{}_{}.xlsx'.format(value, th)) + with pd.ExcelWriter(file_path) as writer: + df.to_excel(writer, sheet_name='Sheet1') + + + + inv_values = 1 / values + + fig, ax = plt.subplots(figsize=(20, 12)) + my_cmap = plt.cm.get_cmap('YlGnBu') + colors = my_cmap(np.log(inv_values) / np.max(np.log(inv_values))) + rect1 = ax.bar(labels[indices], inv_values[indices], log=True, color=colors) + for i, rect in enumerate(rect1): + height = rect.get_height() + ax.annotate('{}'.format(format(values[i], '.2e')), + xy=(rect.get_x() + (rect.get_width() / 2), height + 1), + xytext=(0, 1), + textcoords="offset points", + ha='center', va='bottom') + + # ax.plot([0, len(indices[0])], [th, th], "k--") + plt.xticks(rotation=15, ha='right') + # plt.subplots_adjust(bottom=0.5) + plt.ylim(0.1, 1.3*np.max(inv_values)) + fig_path = os.path.join(out_dir, '{}_{}.png'.format(value,th)) + plt.savefig(fig_path) + plt.close() + + +def plot_mv(mv, th, out_dir): + th = int(th) + + labels = mv.index + values = mv.values + indices = np.where(values > 0) + above_th = np.maximum(values - th, 0) + below_th = np.minimum(values, th) + + fig, ax = plt.subplots(figsize=(20, 5)) + rect1 = ax.bar(labels[indices], below_th[indices], log=True, color='g') + rect2 = ax.bar(labels[indices], above_th[indices], log=True, color='r', + bottom=below_th[indices]) + + for rect_below, rect_above in zip(rect1, rect2): + height = (rect_below.get_height() + rect_above.get_height()) + ax.annotate('{}'.format(height), + xy=(rect_above.get_x() + (rect_above.get_width() / 2), height + 1), + xytext=(0, 1), + textcoords="offset points", + ha='center', va='bottom') + + ax.plot([0, len(indices[0])], [th, th], "k--") + plt.xticks(rotation=45, ha='right') + plt.subplots_adjust(bottom=0.5) + fig_path = os.path.join(out_dir, 'missing_values.png') + plt.savefig(fig_path) + + +def plot_correlation_features(df, out_dir): + corr = df.corr() + mask = np.triu(np.ones_like(corr, dtype=bool)) + f, ax = plt.subplots() + sns.heatmap(data=corr, mask=mask, center=0, + cmap=sns.diverging_palette(230, 20), square=True) + + ax.margins(0.01) + plt.xticks(rotation=45, ha='right') + plt.tight_layout() + plt.savefig(out_dir) + + +def plot_p_values(statistics): + a = 0 + + +def plot_distribution_categorical(df, variable, table, title='', out=False, out_dir='', p_values=[]): + # + present_values = sorted(list(np.unique(df[variable]))) + expected_values = list(table[variable]['description'].keys()) + variables_names = list(table[variable]['description'].values()) + + variables_names = [variables_names[count] for count, v in enumerate(expected_values) if v in present_values] + + data = df[variable].value_counts().sort_index().values.tolist() + colors = plt.cm.get_cmap('Set3')(np.linspace(0., 1, num=len(variables_names))) + + fig, axs = plt.subplots(figsize=(15, 6), nrows=1, ncols=2) + + bars = axs[0].bar(variables_names, data, color=colors) + for bar in bars: + axs[0].annotate('{}'.format(bar.get_height()), + xy=(bar.get_x() + (bar.get_width() / 2), bar.get_height() + 0.5), + xytext=(0, 1), + textcoords="offset points", + ha='center', va='bottom' + ) + axs[0].tick_params(labelrotation=45) + + def func(pct, allvals): + absolute = int(round(pct / 100. * np.sum(allvals))) + return "{:.1f}%\n({:d} patients)".format(pct, absolute) + + wedges, texts, autotext = axs[1].pie(x=data, + autopct=lambda pct: func(pct, data), + textprops=dict(color='k'), + counterclock=False, + colors=colors) + y = np.asarray(data) + porcent = 100. * y / y.sum() + labels = ['{0} - {1:1.2f}%'.format(i, j) for i, j in zip(variables_names, porcent)] + axs[1].legend(wedges, labels, + loc='center left', bbox_to_anchor=(1, 0, 0.5, 1)) + + plt.setp(autotext, size=10) + fig.suptitle(title, size=20) + plt.tight_layout() + fig_path = os.path.join(out_dir, '{0}_distribution.png'.format(variable)) + plt.savefig(fig_path) + plt.close() + if not out: + fig, ax = plt.subplots(2, 2, figsize=(12, 8)) + + labels = {'mRS90d': ['mRS 0', 'mRS 1', 'mRS 2', 'mRS 3', 'mRS 4', 'mRS 5', 'mRS 6'], + 'dmRS': ['Good outcome', 'Bad outcome (>2mRS)'], + 'shift_mRS': ['shift of 0', 'shift of 1', 'shift of 2', 'shift of 3', + 'shift of 4', 'shift of 5', 'shift of 6'], + 'mortality': ['Alive after 90d', 'Dead after 90 d']} + + for count, output in enumerate(['mRS90d', 'shift_mRS', 'dmRS', 'mortality']): + x = int(np.floor(count / 2)) + y = int(count % 2) + sns.countplot(ax=ax[x, y], x=output, hue=variable, data=df, palette="pastel") + ax[x, y].set_xticklabels(labels[output]) + ax[x, y].get_legend().remove() + if count > 1: + p = p_values[count - 2] + color = 'k' if p > 0.05 else 'r' + ax[x, y].text(0.2, 0.95, 'P value: {:.6f}'.format(p_values[count - 2]), + ha='center', va='center', transform=ax[x, y].transAxes, fontsize=9, color=color) + + fig.legend(title=variable, labels=variables_names) + fig.suptitle(title, size=20) + plt.tight_layout() + fig_path = os.path.join(out_dir, '{0}_distribution_target.png'.format(variable)) + plt.savefig(fig_path) + plt.close() + + +def plot_distribution_numerical(df, variable, title='', out_dir='', p_values=[]): + fig, ax = plt.subplots(1, 2, figsize=(15, 6)) + df[variable] = df[variable].astype('float32') + sns.histplot(ax=ax[0], data=df, x=variable, palette='pastel', kde=True) + # sns.kdeplot(ax=ax[0], data=df, x=variable, palette='pastel') + + sns.boxplot(ax=ax[1], y=variable, data=df, palette="pastel") + sns.swarmplot(ax=ax[1], y=variable, color="k", size=3, data=df) + fig.suptitle(title) + plt.tight_layout() + fig_path = os.path.join(out_dir, '{0}_distribution.png'.format(variable)) + plt.savefig(fig_path) + plt.close() + + fig, ax = plt.subplots(2, 2, figsize=(12, 8)) + + labels = {'mRS90d': ['mRS 0', 'mRS 1', 'mRS 2', 'mRS 3', 'mRS 4', 'mRS 5', 'mRS 6'], + 'dmRS': ['Good outcome', 'Bad outcome (>2mRS)'], + 'shift_mRS': ['shift of 0', 'shift of 1', 'shift of 2', 'shift of 3', + 'shift of 4', 'shift of 5', 'shift of 6'], + 'mortality': ['Alive after 90d', 'Dead after 90 d']} + + for count, output in enumerate(['mRS90d', 'shift_mRS', 'dmRS', 'mortality']): + x = int(np.floor(count / 2)) + y = int(count % 2) + sns.boxplot(ax=ax[x, y], x=output, y=variable, data=df, palette="pastel") + sns.swarmplot(ax=ax[x, y], x=output, y=variable, color="k", size=3, data=df) + # ax[x,y].legend(title=variable, labels= variables_names) + ax[x, y].set_xticklabels(labels[output]) + # ax[x, y].get_legend().remove() + if count > 0: + p = p_values[count - 2] + color = 'k' if p > 0.05 else 'r' + ax[x, y].text(0.2, 0.95, 'P value: {:.6f}'.format(p_values[count - 2]), + ha='center', va='center', transform=ax[x, y].transAxes, fontsize=9, color=color) + + fig.suptitle(title, size=20) + plt.tight_layout() + fig_path = os.path.join(out_dir, '{0}_distribution_target.png'.format(variable)) + plt.savefig(fig_path) + plt.close() diff --git a/architectures/3D_CNN.py b/architectures/3D_CNN.py new file mode 100644 index 0000000000000000000000000000000000000000..21a40e996f3b3b67441b0bbb9f45e23778e07698 --- /dev/null +++ b/architectures/3D_CNN.py @@ -0,0 +1,64 @@ +import torch.nn as nn +import torch.nn.functional as F +import torch + + +class _3D_CNN(nn.Module): + def __init__(self, num_output): + super(_3D_CNN, self).__init__() + self.conv1 = nn.Conv3d(1, 8, kernel_size=3, stride=1, padding=1) + self.conv2 = nn.Conv3d(8, 16, kernel_size=3, stride=1, padding=1) + self.conv3 = nn.Conv3d(16, 32, kernel_size=3, stride=1, padding=1) + self.conv4 = nn.Conv3d(32, 64, kernel_size=3, stride=1, padding=1) + self.conv5 = nn.Conv3d(64, 128, kernel_size=3, stride=1, padding=1) + + self.BN1 = nn.BatchNorm3d(num_features=8) + self.BN2 = nn.BatchNorm3d(num_features=16) + self.BN3 = nn.BatchNorm3d(num_features=32) + self.BN4 = nn.BatchNorm3d(num_features=64) + self.BN5 = nn.BatchNorm3d(num_features=128) + + self.pool1 = nn.AdaptiveAvgPool3d((64, 64, 64)) + self.pool2 = nn.AdaptiveAvgPool3d((32, 32, 32)) + self.pool3 = nn.AdaptiveAvgPool3d((16, 16, 16)) + self.pool4 = nn.AdaptiveAvgPool3d((8,8,8)) + self.pool5 = nn.AdaptiveAvgPool3d((4,4,4)) + + self.fc1 = nn.Linear(10244, 1300) + self.fc2 = nn.Linear(1300, 50) + self.fc3 = nn.Linear(50, num_output) + + for m in self.modules(): + if isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight.data) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Conv3d): + nn.init.kaiming_normal_(m.weight, mode='fan_out',nonlinearity='relu') + nn.init.constant_(m.bias, 0) + + + + def forward(self, x): + + x = F.relu(self.BN1(self.conv1(x))) + x = self.pool1(x) + + x = F.relu(self.BN2(self.conv2(x))) + x = self.pool2(x) + + x = F.relu(self.BN3(self.conv3(x))) + x = self.pool3(x) + + x = F.relu(self.BN4(self.conv4(x))) + x = self.pool4(x) + + x = F.relu(self.BN5(self.conv5(x))) + x = self.pool5(x) + + x = x.view(x.size(0), -1) + + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = torch.log_softmax(self.fc3(x), dim=1) + + return x diff --git a/architectures/Edge_GCN.py b/architectures/Edge_GCN.py new file mode 100644 index 0000000000000000000000000000000000000000..bfa90f026594c82b98f7f661100a66d010af4f73 --- /dev/null +++ b/architectures/Edge_GCN.py @@ -0,0 +1,25 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch_geometric.nn import ChebConv, EdgeConv +from torch_geometric.utils import dropout_adj + +class Edge_GCN(nn.Module): + def __init__(self, nfeat, nclass, dropout=0): + super(Edge_GCN, self).__init__() + + self.dropout = dropout + + self.gc1 = EdgeConv(nn.Sequential(nn.Linear(nfeat, int(nfeat/2)), nn.ReLU())) + self.gc2 = EdgeConv(nn.Sequential(nn.Linear(int(nfeat/2), nclass), nn.ReLU())) + + + def forward(self, data, edge_index, weigths): + print(data.shape, edge_index.shape) + x = self.gc1(data, edge_index) + edge_index_drop, _ = dropout_adj(edge_index, p= self.dropout) + x = self.gc2(x, edge_index_drop) + x2= torch.log_softmax(x, dim=1) + + return x2 + diff --git a/architectures/FCN.py b/architectures/FCN.py new file mode 100644 index 0000000000000000000000000000000000000000..f2de6cbb3ad50185d48c0bf0dabd1d01a62a32c1 --- /dev/null +++ b/architectures/FCN.py @@ -0,0 +1,52 @@ +import torch +import torch.nn as nn + + +class Basic_FCN(nn.Module): + + def __init__(self, in_features, layers, out_features, dropout_rate): + super(Basic_FCN, self).__init__() + + self.layers = [] + self.n_layers = layers['number'] + l = nn.Linear(in_features, layers['layer1']) + self.layers.extend([l]) + + if self.n_layers > 2: + for i in range(self.n_layers - 2): + name0 = 'layer{}'.format(i + 1) + name1 = 'layer{}'.format(i + 2) + l = nn.Linear(layers[name0], layers[name1]) + self.layers.extend([l]) + + if self.n_layers >= 2: + name0 = 'layer{}'.format(self.n_layers - 1) + l = nn.Linear(layers[name0], out_features) + self.layers.extend([l]) + self.layers = nn.ModuleList(self.layers) + + self.dropout = nn.Dropout(p=dropout_rate) + + # self.logsoftmax = nn.LogSoftmax(dim=1) + + for m in self.modules(): + if isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight.data) + nn.init.constant_(m.bias.data, 0.03) + + def forward(self, data): + + if len(data.shape) == 1: + data = torch.reshape(data, (data.shape[0], 1)) + x = torch.relu(self.layers[0](data)) + + if self.n_layers > 2: + for i in range(1, self.n_layers - 1): + x = torch.relu(self.layers[i](x)) + x = self.dropout(x) + # x = torch.sigmoid(self.layers[self.n_layers - 1](x)) + # + #x = self.layers[self.n_layers - 1](x) + x = self.layers[self.n_layers - 1](x) + + return torch.log_softmax(x, dim=1) diff --git a/architectures/GCN.py b/architectures/GCN.py new file mode 100644 index 0000000000000000000000000000000000000000..239fd36784d042150387c3c5bcc49578ab3591a6 --- /dev/null +++ b/architectures/GCN.py @@ -0,0 +1,38 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch_geometric.nn import ChebConv, EdgeConv +from torch_geometric.utils import dropout_adj + + +class GCN(nn.Module): + def __init__(self, nfeat, nclass, dropout=0): + super(GCN, self).__init__() + + self.dropout = dropout + + self.gc1 = ChebConv(nfeat, 200, 3) + self.gc2 = ChebConv(200, 50, 3) + self.gc3 = ChebConv(50, nclass, 3) + #self.gc1 = EdgeConv(nn.Sequential(nn.Linear(nfeat, nfeat/2), nn.ReLU())) + #self.gc2 = EdgeConv(nn.Sequential(nn.Linear(nfeat/2, nclass), nn.ReLU())) + + def forward(self, data, edge_index, weigths): + + #x = self.gc1(data, edge_index, weigths) + x = self.gc1(data, edge_index) + x = F.relu(x) + x = F.dropout(x, p=self.dropout) + #edge_index_drop, _ = dropout_adj(edge_index, p= self.dropout) + #x = self.gc2(x, edge_index, weigths) + x = self.gc2(x, edge_index) + x = F.dropout(x, p=self.dropout) + #edge_index_drop, _ = dropout_adj(edge_index, p=self.dropout) + x = F.relu(x) + #x = self.gc3(x, edge_index, weigths) + x = self.gc3(x, edge_index) + #x1 = F.sigmoid(x) + x2= torch.log_softmax(x, dim=1) + + return x2 + diff --git a/architectures/ML_algorithms.py b/architectures/ML_algorithms.py new file mode 100644 index 0000000000000000000000000000000000000000..075f073f6e6d694c6e9fd37998d5a91c15ad86ad --- /dev/null +++ b/architectures/ML_algorithms.py @@ -0,0 +1,146 @@ +from Metrics.ClassificationMetrics import ClassificationMetrics + +import numpy as np +from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV +from sklearn.neural_network import MLPClassifier +import xgboost as xgb +from scipy.stats import uniform, randint + +def apply_LR(x_train, x_val, x_test, y_train, y_val, y_test): + # Number of classes + c = np.unique(y_train).shape[0] + + # Parameters for hyperparamenter optimization + solvers = ['newton-cg', 'liblinear'] + penalty = ['l2'] + c_values = [1000, 100, 10] + + # Define grid search + grid = dict(solver=solvers, penalty=penalty, C=c_values) + scoring = 'roc_auc' if c < 3 else 'f1_micro' + multiclass = 'auto' if c < 3 else 'ovr' + try: + grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=grid, n_jobs=-1, cv=5, + scoring=scoring, error_score=0) + grid_result = grid_search.fit(x_val, y_val) + + # Summarize results + print("Best dictionaries: %f using %s" % (grid_result.best_score_, grid_result.best_params_)) + + # Use selected hyperparameters to train the model with the best dictionaries + clf = LogisticRegression(C=grid_result.best_params_['C'], solver=grid_result.best_params_['solver'], + random_state=0, max_iter=500, multi_class='ovr').fit(x_train, y_train) + except: + clf = LogisticRegression(random_state=0, max_iter=500, multi_class='ovr').fit(x_train, y_train) + + metrics_tensor = compute_metrics(clf, splits=[(x_train, y_train), (x_val, y_val), (x_test, y_test)]) + + return clf, metrics_tensor + + +def apply_random_forest(x_train, x_val, x_test, y_train, y_val, y_test, save=False): + # Parameters for hyperparamenter optimization + # Number of trees in random forest + n_estimators = [2, 5, 10, 100] + # Number of features to consider at every split + + max_features = ['log2', 'sqrt'] + [a for a in (2,5,10, 20) if a<=x_train.shape[1]] + + # Maximum number of levels in tree + max_depth = [5, 10, None] + bootstrap = [True, False] + criterion = ['gini', 'entropy'] + + # Define grid search + random_grid = {'n_estimators': n_estimators, + 'max_features': max_features, + 'max_depth': max_depth, + 'criterion': criterion, + 'bootstrap': bootstrap} + + rf = RandomForestClassifier(random_state=True) + grid_search = GridSearchCV(estimator=rf, param_grid=random_grid, n_jobs=-1, cv=2, + scoring='roc_auc', error_score=0) + grid_result = grid_search.fit(x_val, y_val) + + # Summarize results + print("Best dictionaries: %f using %s" % (grid_result.best_score_, grid_result.best_params_)) + + # Use selected hyperparameters to train the model with the best dictionaries + clf = rf.fit(x_train, y_train) + metrics_tensor = compute_metrics(clf, splits=[(x_train, y_train), (x_val, y_val), (x_test, y_test)]) + + return clf, metrics_tensor + + +def apply_mlp(x_train, x_val, x_test, y_train, y_val, y_test): + # Number of classes + + mlp_gs = MLPClassifier(max_iter=300, verbose=False) + parameter_space = { + #'hidden_layer_sizes': [(10, 30, 10), (20,), (50, 20), (20, 50, 20)], + 'hidden_layer_sizes': [ (20,), (64, 32, 16), (128, 32, 16, 8), (40, 20)], + 'activation': ['relu'], + 'solver': ['sgd', 'adam'], + 'alpha': [0.0001, 0.05], + 'learning_rate': ['constant', 'adaptive'] + } + + clf = GridSearchCV(mlp_gs, param_grid=parameter_space, n_jobs=-1, cv=2, + scoring='roc_auc', error_score=0) + clf.fit(x_val, y_val) # X is train samples and y is the corresponding labels + print('Best %f using %s '% (clf.best_score_, clf.best_params_)) + + # Use selected hyperparameters to train the model with the best dictionaries + clf = mlp_gs.fit(x_train, y_train) + metrics_tensor = compute_metrics(clf, splits=[(x_train, y_train), (x_val, y_val), (x_test, y_test)]) + + return clf, metrics_tensor + +def apply_xgbBoost(x_train, x_val, x_test, y_train, y_val, y_test): + + params = { + 'gamma': [0.5, 1, 1.5, 2,5], + 'subsample': [0.6, 0.8, 1.0], + 'colsample_bytree': [0.4, 0.6, 0.8, 1.0], + 'max_depth': [2, 4, 6], + + + } + xgb_model = xgb.XGBClassifier( random_state=4, use_label_encoder=False, eval_metric='mlogloss') + grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, n_jobs=-1, cv=2, + scoring='roc_auc', error_score=0) + clf = grid_search.fit(x_val, y_val) + print('Best %f using %s ' % (clf.best_score_, clf.best_params_)) + best_model = xgb_model.fit(x_train, y_train) + + metrics_tensor = [] + metrics_tensor = compute_metrics(best_model, splits=[(x_train, y_train), (x_val, y_val), (x_test, y_test)] ) + splits = [(x_train, y_train), (x_val, y_val), (x_test, y_test)] + """for x, y in splits: + prob = xgb_model.predict_proba(x) + output_one_hot = np.zeros((y.shape[0], len(np.unique(y)))) + for i in range(output_one_hot.shape[0]): + output_one_hot[i, y[i]] = 1 + CM = ClassificationMetrics(classes=2) + metrics = CM.compute_metrics(prob.squeeze(), output_one_hot) + metrics_tensor.extend([metrics])""" + + return best_model, metrics_tensor + + + +def compute_metrics(clf, splits): + metrics_tensor = [] + for x, y in splits: + prob = clf.predict_proba(x) + output_one_hot = np.zeros((y.shape[0], len(np.unique(y)))) + for i in range(output_one_hot.shape[0]): + output_one_hot[i, y[i]] = 1 + CM = ClassificationMetrics(classes=2) + metrics = CM.compute_metrics(prob.squeeze(), output_one_hot) + metrics_tensor.extend([metrics]) + + return metrics_tensor diff --git a/architectures/__pycache__/Edge_GCN.cpython-37.pyc b/architectures/__pycache__/Edge_GCN.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e84e8b4313f41fe71b4b9ef5bca63c59c83a4320 Binary files /dev/null and b/architectures/__pycache__/Edge_GCN.cpython-37.pyc differ diff --git a/architectures/__pycache__/FCN.cpython-37.pyc b/architectures/__pycache__/FCN.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48d5168787abc3f17c28ddf1a6e4d22b47176aad Binary files /dev/null and b/architectures/__pycache__/FCN.cpython-37.pyc differ diff --git a/architectures/__pycache__/GCN.cpython-37.pyc b/architectures/__pycache__/GCN.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fcd4d2fcac2fd81b246c8c8cdd10b29546684431 Binary files /dev/null and b/architectures/__pycache__/GCN.cpython-37.pyc differ diff --git a/architectures/__pycache__/ML_algorithms.cpython-37.pyc b/architectures/__pycache__/ML_algorithms.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fbb131a9cdf60971f8ccc94d907cf2f5a60ecd32 Binary files /dev/null and b/architectures/__pycache__/ML_algorithms.cpython-37.pyc differ diff --git a/dictionaries/dictionary_modalities.yml b/dictionaries/dictionary_modalities.yml new file mode 100644 index 0000000000000000000000000000000000000000..b22e741092c66e6db082bf5dbf036bf3ef39f168 --- /dev/null +++ b/dictionaries/dictionary_modalities.yml @@ -0,0 +1,764 @@ +#### Data divided into +# 0. Output +# 1.Meta data +# 2.Clinical data +# 3.NCCT data +# 4.CTP data +# 5.CTA data +# 6.Treatment +# 7.Treatment output +# 7.Control CT data +# 8.Temporal + +# Before-Imaging: Metadata, NIHSS, Unkown_Onset, Time_Onset_to_Admission +# Pre-treatment: Metadata, NCCT, CTP, CTA (+NIHSS, Time_Onset_to_Admission, Time_CT_to_Angio) +# Post-treatment: Metadata, NCCT, CTP, CTA, Treatment, Treatment out (+NIHSS, Time_Onset_to_Admission, Time_CT_to_Angio, Time_Puncture_to_Recan ) +# Post-treatment 24 h: Metadata, NCCT, CTP, CTA, Treatment, Treatment out, Control CT (+NIHSS) + +Ouput: + + mRS90d: + type: 'ord' + info: 'Functional outcome at 90 days' + categories: 7 + description: + 0: 'mRS 0' + 1: 'mRS 1' + 2: 'mRS 2' + 3: 'mRS 3' + 4: 'mRS 4' + 5: 'mRS 5' + 6: 'mRS 6' + + dmRS: + type: 'cat' + info: 'Binary functional outcome at 90 days' + categories: 2 + description: + 0: 'Good outcome' + 1: 'Bad outcome' + + mortality: + type: 'cat' + info: 'Mortality at 90 days' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + shift_mRS: + type: 'ord' + info: 'Shift in mRS' + categories: 7 + description: + + 0: 'Shift of 0' + 1: 'Shift of 1' + 2: 'Shift of 2' + 3: 'Shift of 3' + 4: 'Shift of 4' + 5: 'Shift of 5' + 6: 'Shift of 6' + +Metadata: + + Sex: + type: 'cat' + info: 'Patient sex' + categories: 2 + description: + 0: 'man' + 1: 'woman' + + Age: + type: 'int' + info: 'Patient age' + +Clinical: + + NIHSS: + type: 'int' + info: 'National Institute of Health Stroke Scale' + categories: 43 + description: None + + pre-mRS: + type: 'ord' + info: 'mRS previous to stroke' + categories: 6 + description: + 0: 'mRS 0' + 1: 'mRS 1' + 2: 'mRS 2' + 3: 'mRS 3' + 4: 'mRS 4' + 5: 'mRS 5' + + aHT: + type: 'cat' + info: 'arterial hypertension' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + HLP: + type: 'cat' + info: 'hyperlipidemia' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + DM: + type: 'cat' + info: 'Diabetes Mellitus' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + aFib: + type: 'cat' + info: 'atrial fibrillation' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + s.p. stroke: + type: 'cat' + info: 'previous stroke' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + TAH: + type: 'cat' + info: 'anti-platelet drug' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + TAH mono: + type: 'cat' + info: ' mono anti-platelet drug' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + TAH duo: + type: 'cat' + info: 'dual anti-platelet drug' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + OAK: + type: 'cat' + info: 'oral anticoagulant' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + VKA: + type: 'cat' + info: 'vitamin K antagonist' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + DOAC: + type: 'cat' + info: 'direct oral anticoagulant' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + +NCCT: + C_br: + type: 'cat' + info: 'Caudate region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + IC_br: + type: 'cat' + info: 'Intenal capsule region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + INS_br: + type: 'cat' + info: 'Insular ribbon affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + L_br: + type: 'cat' + info: 'Lentiform nucleus region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M1_br: + type: 'cat' + info: 'Anterior MCA cortex region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M2_br: + type: 'cat' + info: 'MCA cortex lateral to the insular ribbon region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M3_br: + type: 'cat' + info: 'Posterior MCA Cortex region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M4_br: + type: 'cat' + info: 'Anterior cortex immediately rostal to M1 region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M5_br: + type: 'cat' + info: 'Lateral cortex immediately rostal to M3 region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M6_br: + type: 'cat' + info: 'Posterior cortex immediately rostal to M3 region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + e-ASPECTS: + type: 'ord' + info: 'electronic ASPECTS' + categories: 11 + description: + 0: 'ASPECTS score of 0' + 1: 'ASPECTS score of 1' + 2: 'ASPECTS score of 2' + 3: 'ASPECTS score of 3' + 4: 'ASPECTS score of 4' + 5: 'ASPECTS score of 5' + 6: 'ASPECTS score of 6' + 7: 'ASPECTS score of 7' + 8: 'ASPECTS score of 8' + 9: 'ASPECTS score of 9' + 10: 'ASPECTS score of 10' + + + ASPECTS oberfl_tief: + type: 'cat' + info: 'depth of regions affected' + categories: 4 + description: + 0: 'superficial' + 1: 'deep' + 2: 'both' + 3: 'ASPECTS 10' + + pc-ASPECTS: + type: 'int' + info: 'posterior ASPECTS' + + + Volume e-ASPECTS: + type: 'float' + info: 'automatic volume on e-ASPECTS' + + ICV krank: + type: 'int' + info: 'Internal cerebral vein intensity, ipsilateral (HU)' + + IVC gesund: + type: 'int' + info: 'Internal cerebral vein intensity, contralateral (HU)' + + ICV Index: + type: 'float' + info: 'ICV krank/ICV gesund' + + Vessel Occlusion Location Admission: + type: 'cat' + info: 'Location of vessel occlusion at admission' + categories: 11 + description: + 0: 'No occlussion' + 1: 'ACI' + 2: 'Carotis-T' + 3: 'M1' + 4: 'M2' + 5: 'M3' + 6: 'M4' + 7: 'PCA' + 8: 'ACA' + 9: 'VA' + 10: 'BA' + Vessel Occlusion Side Admission: + type: 'cat' + info: 'side of occlussion at admission' + categories: 3 + description: + 1: 'left' + 2: 'right' + 3: 'both' + + Tan Score: + type: 'ord' + info: 'collaterals score' + categories: 4 + description: + 0: '0%' + 1: '0-50%' + 2: '50-100%' + 3: '100%' + + Coves Score: + type: 'ord' + info: 'cortical vein opacification score (score of 0(absence), 1 (moderate) or 2(full) opacification + assigned to three veins in the affected hemisphere' + categories: 7 + description: + 0: 'Coves Score 0' + 1: 'Coves Score 1' + 2: 'Coves Score 2' + 3: 'Coves Score 3' + 4: 'Coves Score 4' + 5: 'Coves Score 5' + 6: 'Coves Score 6' + + + BATMAN: + type: 'int' + info: 'Basilar artery on CTA score' + + + Clot Burden Score: + type: 'ord' + info: 'Evaluates the extent of ipsilateral thrombus' + categories: 11 + description: + 0: 'Clot Burden Score 0' + 1: 'Clot Burden Score 1' + 2: 'Clot Burden Score 2' + 3: 'Clot Burden Score 3' + 4: 'Clot Burden Score 4' + 5: 'Clot Burden Score 5' + 6: 'Clot Burden Score 6' + 7: 'Clot Burden Score 7' + 8: 'Clot Burden Score 8' + 9: 'Clot Burden Score 9' + 10: 'Clot Burden Score 10' + + Vessel Stenosis: + type: 'int' + info: 'Presence of arterial stenosis' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Vessel Stenosis Location: + type: 'cat' + info: 'Location of arterial stenosis' + categories: 8 + description: + 0: 'No stenosis' + 1: 'ACI' + 2: 'Carotis-T' + 3: 'MCA' + 4: 'PCA' + 5: 'VA' + 6: 'BA' + 7: 'ACA' + + + Arterial Dissection: + type: 'cat' + info: 'Presence of arterial dissection' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Arterial Dissection Location: + type: 'cat' + info: 'Location of arterial dissection' + categories: 4 + description: + + 0: 'No dissection' + 1: 'ACI' + 2: 'VA' + 3: 'other' + +CTP: + 'CBF_lower30_volume': + type: 'int' + info: 'Volume in which the CBF is lower that 30% of the CBF of the contraleteral side -- core --' + 'Tmax_greater6s_volume': + type: 'int' + info: 'Volume in which the Tmax is greater than 6s -- penumbra + core --' + Mismatch Volume: + type: 'int' + info: 'penumbra volume' + Inverse Mismatch Ratio: + type: 'float' + info: 'volume core/total volume' + Hypoperfusion Index: + type: 'float' + info: 'Volume Tmax>10s/volume Tmax>6s' + CBV Index: + type: 'float' + info: 'Unknown' + +CTA: + + Vessel Occlusion CTA: + + type: 'cat' + info: 'Location of vessel occlusion on CTA' + categories: 11 + description: + 0: 'No occlussion' + 1: 'ACI' + 2: 'Carotis-T' + 3: 'M1' + 4: 'M2' + 5: 'M3' + 6: 'M4' + 7: 'PCA' + 8: 'ACA' + 9: 'VA' + 10: 'BA' + +Treatment: + + Stenting: + type: 'cat' + info: 'Use of stenting' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Thrombectomy: + type: 'cat' + info: 'Use of thrombectomy' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Device: + type: 'cat' + info: 'Device employed for thrombectomy' + categories: 3 + description: + + 1: 'Stent retriever' + 2: 'Aspiration' + 3: 'Both' + + PTA: + type: 'cat' + info: 'Use of percutaneous transluminal angioplasty' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Number Maneuver: + type: 'int' + info: 'number of maneuver that were necessary' + + Lysis i.a.: + type: 'cat' + info: 'Use of intraarterial lysis' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Lysis quantity: + type: 'int' + info: 'quantity of lysis used' + +Treatment_out: + + Frustrated Recanalization: + + type: 'cat' + info: 'Whether the recanalization was frustrated' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Vessel Occlusion after Recan.: + type: 'cat' + info: 'Location of vessel occlusion after recanalization' + categories: 11 + description: + 0: 'No occlusion' + 1: 'ACI' + 2: 'Carotid-T' + 3: 'M1' + 4: 'M2' + 5: 'M3' + 6: 'M4' + 7: 'PCA' + 8: 'ACA' + 9: 'VA' + 10: 'BA' + + TICI: + type: 'cat' + info: 'Reperfusion score' + categories: 5 + description: + 0: '0, No reperfusion' + 1: '1, Minimal reperfusion' + 2: '2a, Partial reperfusion (<50%)' + 3: '2b Partial reperfusion(>50%)' + 4: '3, Total reperfusion' + + + SAE: + type: 'cat' + info: 'Subarachnoid hemorrhage encephalitis' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Vessel Occlusion new SupplyArea: + type: 'cat' + info: 'Vessel occlusion in a new Supply Area' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Vessel Occlusion new SupplyArea Location: + type: 'cat' + info: 'Location of vessel occlusion after recanalization in a new supply area' + categories: 11 + description: + 0: 'No occlusion' + 1: 'ACI' + 2: 'Carotid-T' + 3: 'M1' + 4: 'M2' + 5: 'M3' + 6: 'M4' + 7: 'PCA' + 8: 'ACA' + 9: 'VA' + 10: 'BA' + + Vessel Occlusion new SupplyArea Treatment: + type: 'cat' + info: 'Whether new vessel occlusion was treated' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Infarct new SupplyArea: + type: 'cat' + info: 'Whether there is an infarct in the new supply area' + categories: 2 + description: + 0: 'no' + 1: 'yes' + +Control CT: + Lacunar Infarct: + type: 'cat' + info: 'Presence of lacunar infarct' + categories: 3 + description: + 0: 'no infarct' + 1: 'Lacunar infarct' + 2: 'not a lacunar infarct' + + Infarct Volume ControlCT: + type: 'int' + info: 'Volume in ml/cm³ of the infarct in the Control CT' + + Hyperdense Media Sign: + type: 'cat' + info: 'Presence of hyperdense Media Sign' + categories: 2 + description: + 0: 'no' + 1: 'yes' + pc-Aspect ControlCT: + type: 'int' + info: 'posterior ASPECTS in Control CT' + + + Aspect ControlCT: + type: 'ord' + info: ' ASPECTS in Control CT' + categories: 11 + description: + 0: 'ASPECTS score of 0' + 1: 'ASPECTS score of 1' + 2: 'ASPECTS score of 2' + 3: 'ASPECTS score of 3' + 4: 'ASPECTS score of 4' + 5: 'ASPECTS score of 5' + 6: 'ASPECTS score of 6' + 7: 'ASPECTS score of 7' + 8: 'ASPECTS score of 8' + 9: 'ASPECTS score of 9' + 10: 'ASPECTS score of 10' + + C_br2: + type: 'cat' + info: 'Caudate region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + IC_br2: + type: 'cat' + info: 'Intenal capsule region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + INS_br2: + type: 'cat' + info: 'Insular ribbon affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + L_br2: + type: 'cat' + info: 'Lentiform nucleus region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M1_br2: + type: 'cat' + info: 'Anterior MCA cortex region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M2_br2: + type: 'cat' + info: 'MCA cortex lateral to the insular ribbon region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M3_br2: + type: 'cat' + info: 'Posterior MCA Cortex region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M4_br2: + type: 'cat' + info: 'Anterior cortex immediately rostal to M1 region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M5_br2: + type: 'cat' + info: 'Lateral cortex immediately rostal to M3 region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M6_br2: + type: 'cat' + info: 'Posterior cortex immediately rostal to M3 region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + +Times: + Unknown Onset: + type: 'cat' + info: 'Whether onset time of symptoms is known' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Time_Onset_to_Admission: + type: 'int' + info: 'Time from symptom onset to admission in minutes' + + Time_CT_to_Angio.: + type: 'int' + info: 'Time from CT imaging to CTA imaging in minutes' + + Time_Puncture_to_Recan.: + type: 'int' + info: 'Time from CTA to Recanalization in minutes' + + Time_Recan_to_Control: + type: 'int' + info: 'Time from Recanalization to Control CT imaging in minutes' + + + + + + + + + + diff --git a/dictionaries/dictionary_timepoints.yml b/dictionaries/dictionary_timepoints.yml new file mode 100644 index 0000000000000000000000000000000000000000..f8b8e9f74053cf23b59c45bd481597cf3903cdb2 --- /dev/null +++ b/dictionaries/dictionary_timepoints.yml @@ -0,0 +1,808 @@ +#### Data divided into +# 1.Admission: Metadata, Clinical, Unkonwn onset, Time_Onset_Admission +# 2.PostImaging: NECT, CTP, CTA +# 3.PostEVT: Treatment and Treatment output +# 4.After24h: CCT +# 5.Output + +Admission: + + Sex: + type: 'cat' + info: 'Patient sex' + categories: 2 + description: + 0: 'man' + 1: 'woman' + + Age: + type: 'int' + info: 'Patient age' + + NIHSS: + type: 'int' + info: 'National Institute of Health Stroke Scale' + categories: 43 + description: None + + s.p. stroke: + type: 'cat' + info: 'previous stroke' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + Unknown Onset: + type: 'cat' + info: 'Whether onset time of symptoms is known' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Time_Onset_to_Admission: + type: 'int' + info: 'Time from symptom onset to admission in minutes' + + #pre-mRS: + #type: 'int' + #info: 'mRS previous to stroke' + + pre-mRS: + type: 'ord' + info: 'mRS previous to stroke' + categories: 6 + description: + 0: 'mRS 0' + 1: 'mRS 1' + 2: 'mRS 2' + 3: 'mRS 3' + 4: 'mRS 4' + 5: 'mRS 5' + + + aHT: + type: 'cat' + info: 'arterial hypertension' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + HLP: + type: 'cat' + info: 'hyperlipidemia' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + DM: + type: 'cat' + info: 'Diabetes Mellitus' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + aFib: + type: 'cat' + info: 'atrial fibrillation' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + TAH: + type: 'cat' + info: 'anti-platelet drug' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + TAH mono: + type: 'cat' + info: ' mono anti-platelet drug' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + TAH duo: + type: 'cat' + info: 'dual anti-platelet drug' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + OAK: + type: 'cat' + info: 'oral anticoagulant' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + VKA: + type: 'cat' + info: 'vitamin K antagonist' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + DOAC: + type: 'cat' + info: 'direct oral anticoagulant' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + +Pre-EVT: + C_br: + type: 'cat' + info: 'Caudate region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + IC_br: + type: 'cat' + info: 'Intenal capsule region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + INS_br: + type: 'cat' + info: 'Insular ribbon affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + L_br: + type: 'cat' + info: 'Lentiform nucleus region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M1_br: + type: 'cat' + info: 'Anterior MCA cortex region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M2_br: + type: 'cat' + info: 'MCA cortex lateral to the insular ribbon region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M3_br: + type: 'cat' + info: 'Posterior MCA Cortex region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M4_br: + type: 'cat' + info: 'Anterior cortex immediately rostal to M1 region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M5_br: + type: 'cat' + info: 'Lateral cortex immediately rostal to M3 region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M6_br: + type: 'cat' + info: 'Posterior cortex immediately rostal to M3 region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + #e-ASPECTS: + # type: 'int' + # info: 'electronic ASPECTS' + e-ASPECTS: + type: 'ord' + info: 'electronic ASPECTS' + categories: 11 + description: + 0: 'ASPECTS score of 0' + 1: 'ASPECTS score of 1' + 2: 'ASPECTS score of 2' + 3: 'ASPECTS score of 3' + 4: 'ASPECTS score of 4' + 5: 'ASPECTS score of 5' + 6: 'ASPECTS score of 6' + 7: 'ASPECTS score of 7' + 8: 'ASPECTS score of 8' + 9: 'ASPECTS score of 9' + 10: 'ASPECTS score of 10' + + ASPECTS oberfl_tief: + type: 'cat' + info: 'depth of regions affected' + categories: 4 + description: + 0: 'superficial' + 1: 'deep' + 2: 'both' + 3: 'ASPECTS 10' + + pc-ASPECTS: + type: 'cat' + info: 'posterior ASPECTS' + categories: 11 + description: + 0: 'ASPECTS score of 0' + 1: 'ASPECTS score of 1' + 2: 'ASPECTS score of 2' + 3: 'ASPECTS score of 3' + 4: 'ASPECTS score of 4' + 5: 'ASPECTS score of 5' + 6: 'ASPECTS score of 6' + 7: 'ASPECTS score of 7' + 8: 'ASPECTS score of 8' + 9: 'ASPECTS score of 9' + 10: 'ASPECTS score of 10' + + Volume e-ASPECTS: + type: 'float' + info: 'automatic volume on e-ASPECTS' + + ICV krank: + type: 'int' + info: 'Internal cerebral vein intensity, ipsilateral (HU)' + + IVC gesund: + type: 'int' + info: 'Internal cerebral vein intensity, contralateral (HU)' + + ICV Index: + type: 'float' + info: 'ICV krank/ICV gesund' + + Vessel Occlusion Location Admission: + type: 'cat' + info: 'Location of vessel occlusion at admission' + categories: 11 + description: + 0: 'No occlussion' + 1: 'ACI' + 2: 'Carotis-T' + 3: 'M1' + 4: 'M2' + 5: 'M3' + 6: 'M4' + 7: 'PCA' + 8: 'ACA' + 9: 'VA' + 10: 'BA' + + Vessel Occlusion Side Admission: + type: 'cat' + info: 'side of occlussion at admission' + categories: 3 + description: + 1: 'left' + 2: 'right' + 3: 'both' + + #Tan Score: + # type: 'int' + # info: 'collaterals score' + + Tan Score: + type: 'ord' + info: 'collaterals score' + categories: 4 + description: + 0: '0%' + 1: '0-50%' + 2: '50-100%' + 3: '100%' + + #Coves Score: + # type: 'int' + # info: 'cortical vein opacification score (score of 0(absence), 1 (moderate) or 2(full) opacification + # assigned to three veins in the affected hemisphere' + + Coves Score: + type: 'ord' + info: 'cortical vein opacification score (score of 0(absence), 1 (moderate) or 2(full) opacification + assigned to three veins in the affected hemisphere' + categories: 7 + description: + 0: 'Coves Score 0' + 1: 'Coves Score 1' + 2: 'Coves Score 2' + 3: 'Coves Score 3' + 4: 'Coves Score 4' + 5: 'Coves Score 5' + 6: 'Coves Score 6' + + BATMAN: + type: 'cat' + info: 'Basilar artery on CTA score' + categories: 11 + description: + 0: 'BATMAN Score 0' + 1: 'BATMAN Score 1' + 2: 'BATMAN Score 2' + 3: 'BATMAN Score 3' + 4: 'BATMAN Score 4' + 5: 'BATMAN Score 5' + 6: 'BATMAN Score 6' + 7: 'BATMAN Score 7' + 8: 'BATMAN Score 8' + 9: 'BATMAN Score 9' + 10: 'BATMAN Score 10' + + #Clot Burden Score: + # type: 'int' + # info: 'Evaluates the extent of ipsilateral thrombus' + + Clot Burden Score: + type: 'ord' + info: 'Evaluates the extent of ipsilateral thrombus' + categories: 11 + description: + 0: 'Clot Burden Score 0' + 1: 'Clot Burden Score 1' + 2: 'Clot Burden Score 2' + 3: 'Clot Burden Score 3' + 4: 'Clot Burden Score 4' + 5: 'Clot Burden Score 5' + 6: 'Clot Burden Score 6' + 7: 'Clot Burden Score 7' + 8: 'Clot Burden Score 8' + 9: 'Clot Burden Score 9' + 10: 'Clot Burden Score 10' + + Vessel Stenosis: + type: 'cat' + info: 'Presence of arterial stenosis' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Vessel Stenosis Location: + type: 'cat' + info: 'Location of arterial stenosis' + categories: 8 + description: + 0: 'No stenosis' + 1: 'ACI' + 2: 'Carotis-T' + 3: 'MCA' + 4: 'PCA' + 5: 'VA' + 6: 'BA' + 7: 'ACA' + + Arterial Dissection: + type: 'cat' + info: 'Presence of arterial dissection' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Arterial Dissection Location: + type: 'cat' + info: 'Location of arterial dissection' + categories: 4 + description: + + 0: 'No dissection' + 1: 'ACI' + 2: 'VA' + 3: 'other' + + 'CBF_lower30_volume': + type: 'int' + info: 'Volume in which the CBF is lower that 30% of the CBF of the contraleteral side -- core --' + + 'Tmax_greater6s_volume': + type: 'int' + info: 'Volume in which the Tmax is greater than 6s -- penumbra + core --' + + Mismatch Volume: + type: 'int' + info: 'penumbra volume' + + Inverse Mismatch Ratio: + type: 'float' + info: 'volume core/total volume' + + Hypoperfusion Index: + type: 'float' + info: 'Volume Tmax>10s/volume Tmax>6s' + + CBV Index: + type: 'float' + info: 'Unknown' + + Vessel Occlusion CTA: + + type: 'cat' + info: 'Location of vessel occlusion on CTA' + categories: 11 + description: + 0: 'No occlussion' + 1: 'ACI' + 2: 'Carotis-T' + 3: 'M1' + 4: 'M2' + 5: 'M3' + 6: 'M4' + 7: 'PCA' + 8: 'ACA' + 9: 'VA' + 10: 'BA' + + Time_CT_to_Angio.: + type: 'int' + info: 'Time from CT imaging to CTA imaging in minutes' + +Post-EVT: + + Stenting: + type: 'cat' + info: 'Use of stenting' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Thrombectomy: + type: 'cat' + info: 'Use of thrombectomy' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Device: + type: 'cat' + info: 'Device employed for thrombectomy' + categories: 4 + description: + 0: 'No device - no thrombectomy' + 1: 'Stent retriever' + 2: 'Aspiration' + 3: 'Both' + + PTA: + type: 'cat' + info: 'Use of percutaneous transluminal angioplasty' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Number Maneuver: + type: 'int' + info: 'number of maneuver that were necessary' + + Lysis i.a.: + type: 'cat' + info: 'Use of intraarterial lysis' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Lysis quantity: + type: 'int' + info: 'quantity of lysis used' + + Frustrated Recanalization: + + type: 'cat' + info: 'Whether the recanalization was frustrated' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Vessel Occlusion after Recan.: + type: 'cat' + info: 'Location of vessel occlusion after recanalization' + categories: 11 + description: + 0: 'No occlusion' + 1: 'ACI' + 2: 'Carotid-T' + 3: 'M1' + 4: 'M2' + 5: 'M3' + 6: 'M4' + 7: 'PCA' + 8: 'ACA' + 9: 'VA' + 10: 'BA' +# + #TICI: + # type: 'int' + # info: 'Reperfusion score' + TICI: + type: 'cat' + info: 'Reperfusion score' + categories: 5 + description: + 0: '0, No reperfusion' + 1: '1, Minimal reperfusion' + 2: '2a, Partial reperfusion (<50%)' + 3: '2b Partial reperfusion(>50%)' + 4: '3, Total reperfusion' + + SAE: + type: 'cat' + info: 'Subarachnoid hemorrhage encephalitis' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Vessel Occlusion new SupplyArea: + type: 'cat' + info: 'Vessel occlusion in a new Supply Area' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Vessel Occlusion new SupplyArea Location: + type: 'cat' + info: 'Location of vessel occlusion after recanalization in a new supply area' + categories: 11 + description: + 0: 'No occlusion' + 1: 'ACI' + 2: 'Carotid-T' + 3: 'M1' + 4: 'M2' + 5: 'M3' + 6: 'M4' + 7: 'PCA' + 8: 'ACA' + 9: 'VA' + 10: 'BA' + + Vessel Occlusion new SupplyArea Treatment: + type: 'cat' + info: 'Whether new vessel occlusion was treated' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Infarct new SupplyArea: + type: 'cat' + info: 'Whether there is an infarct in the new supply area' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Time_Puncture_to_Recan.: + type: 'int' + info: 'Time from CTA to Recanalization in minutes' + +After24h: + Lacunar Infarct: + type: 'cat' + info: 'Presence of lacunar infarct' + categories: 3 + description: + 0: 'no infarct' + 1: 'Lacunar infarct' + 2: 'not a lacunar infarct' + + Infarct Volume ControlCT: + type: 'int' + info: 'Volume in ml/cm³ of the infarct in the Control CT' + + Hyperdense Media Sign: + type: 'cat' + info: 'Presence of hyperdense Media Sign' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + pc-Aspect ControlCT: + type: 'cat' + info: 'posterior ASPECTS in Control CT' + categories: 11 + description: + 0: 'ASPECTS score of 0' + 1: 'ASPECTS score of 1' + 2: 'ASPECTS score of 2' + 3: 'ASPECTS score of 3' + 4: 'ASPECTS score of 4' + 5: 'ASPECTS score of 5' + 6: 'ASPECTS score of 6' + 7: 'ASPECTS score of 7' + 8: 'ASPECTS score of 8' + 9: 'ASPECTS score of 9' + 10: 'ASPECTS score of 10' + + #Aspect ControlCT: + # type: 'int' + # info: ' ASPECTS in Control CT' + + Aspect ControlCT: + type: 'ord' + info: ' ASPECTS in Control CT' + categories: 11 + description: + 0: 'ASPECTS score of 0' + 1: 'ASPECTS score of 1' + 2: 'ASPECTS score of 2' + 3: 'ASPECTS score of 3' + 4: 'ASPECTS score of 4' + 5: 'ASPECTS score of 5' + 6: 'ASPECTS score of 6' + 7: 'ASPECTS score of 7' + 8: 'ASPECTS score of 8' + 9: 'ASPECTS score of 9' + 10: 'ASPECTS score of 10' + + C_br2: + type: 'cat' + info: 'Caudate region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + IC_br2: + type: 'cat' + info: 'Intenal capsule region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + INS_br2: + type: 'cat' + info: 'Insular ribbon affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + L_br2: + type: 'cat' + info: 'Lentiform nucleus region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M1_br2: + type: 'cat' + info: 'Anterior MCA cortex region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M2_br2: + type: 'cat' + info: 'MCA cortex lateral to the insular ribbon region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M3_br2: + type: 'cat' + info: 'Posterior MCA Cortex region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M4_br2: + type: 'cat' + info: 'Anterior cortex immediately rostal to M1 region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M5_br2: + type: 'cat' + info: 'Lateral cortex immediately rostal to M3 region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + M6_br2: + type: 'cat' + info: 'Posterior cortex immediately rostal to M3 region affected' + categories: 2 + description: + 0: 'no' + 1: 'yes' + + Time_Recan_to_Control: + type: 'int' + info: 'Time from Recanalization to Control CT imaging in minutes' + +Output: + + mRS90d: + type: 'ord' + info: 'Functional outcome at 90 days' + categories: 7 + description: + 0: 'mRS 0' + 1: 'mRS 1' + 2: 'mRS 2' + 3: 'mRS 3' + 4: 'mRS 4' + 5: 'mRS 5' + 6: 'mRS 6' + + dmRS: + type: 'cat' + info: 'Binary functional outcome at 90 days' + categories: 2 + description: + 0: 'Good outcome' + 1: 'Bad outcome' + + mortality: + type: 'cat' + info: 'Mortality at 90 days' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + shift_mRS: + type: 'ord' + info: 'Shift in mRS' + categories: 7 + description: + + 0: 'Shift of 0' + 1: 'Shift of 1' + 2: 'Shift of 2' + 3: 'Shift of 3' + 4: 'Shift of 4' + 5: 'Shift of 5' + 6: 'Shift of 6' + + + diff --git a/evaluate_model.py b/evaluate_model.py new file mode 100644 index 0000000000000000000000000000000000000000..6ae1de8975a354db8770dd01b7973a476e3079d6 --- /dev/null +++ b/evaluate_model.py @@ -0,0 +1,433 @@ +from architectures.FCN import Basic_FCN +from architectures.GCN import GCN +from Metrics.RegressionMetrics import ClassificationMetrics +from IO_utils.FeaturePreprocessing import FeaturePreprocessing +from IO_utils.clean_table import clean_table +from IO_utils.split_utils import split_data_cv +from IO_utils.Dataloader import MyDataLoader +from IO_utils.List_Reader import TableReader + +import matplotlib.pyplot as plt +import seaborn as sns +import numpy as np +import pandas as pd +import torch +import pprint + + +def get_metrics_unc(uncertainty_array, mean_preds_array, label_array, p): + + #order = np.argsort(uncertainty_array) + + metrics = ClassificationMetrics(classes=2) + + indices= uncertainty_array<p + m = metrics.compute_metrics(mean_preds_array[indices], label_array[indices]) + print('####') + print('Metrics using >{}'.format(p)) + print(np.count_nonzero(indices)) + pprint.pprint(m) + print('####') + + return m + +def plot_boxplots(p, y, uncertainty ): + + order = np.argsort(uncertainty) + samples = [] + auc = [] + ths = [] + n_samples = uncertainty.shape[0] + + # Minimum 10% of samples + metrics = ClassificationMetrics(classes=2) + + for i in range(order.shape[0] - int(0.2 * n_samples)): + number_samples = order.shape[0] - i + metrics.clear() + kn = order[:(number_samples - 1)] + m = metrics.compute_metrics(p[kn], y[kn]) + + samples.extend([number_samples]) + auc.extend([m['auc']]) + ths.extend([uncertainty[kn[-1]]]) + + true_positive= [] + true_negative = [] + false_positive = [] + false_negative = [] + + for i in range(p.shape[0]): + pred = np.argmax(p[i, :]) + label = np.argmax(y[i, :]) + if pred == label: + if label == 0: + true_positive.extend([uncertainty[i]]) + else: + true_negative.extend([uncertainty[i]]) + else: + if label == 0: + false_negative.extend([uncertainty[i]]) + else: + false_positive.extend([uncertainty[i]]) + + data= [true_positive, true_negative, false_positive, false_negative] + data2 = [true_positive+true_negative, false_positive+false_negative] + palette = ['lightgreen', 'darkgreen', 'salmon', 'darkred'] + labels = ['True Positive', 'True Negative', 'False Positive', 'False Negative'] + xs = [] + print(np.mean(data2[0]), np.mean(data2[1])) + print(np.std(data2[0]), np.std(data2[1])) + for i, col in enumerate(data): + xs.append(np.random.normal(np.round((i+1.1)/2) , 0.04, len(data[i]))) + + fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(13,6)) + bplot1 = ax1.boxplot(data2, labels = ['Correct predictions', 'Missclassifications'], showfliers=False) + for x, val, c, l in zip(xs, data, palette,labels): + sct1 = ax1.scatter(x, val, alpha=0.4, color=c, label=l) + + ax1.legend() + + ax2.set_xlabel('Uncertainty threshold') + ax2.set_ylabel('AUC') + lns1 = ax2.plot(ths, auc, 'k-', markersize=3, label="AUC") + ax2.invert_xaxis() + + ax3 = ax2.twinx() + ax3.set_ylabel('# of Samples') + lns2 = ax3.plot(ths, samples, 'k--', markersize=3, label="# Samples") + ax3.invert_xaxis() + + # added these three lines + lns = lns1 + lns2 + labs = [l.get_label() for l in lns] + ax2.legend(lns, labs, loc=0) + + #plt.xlabel('{} uncertainty'.format(uncertainty)) + + + + + + +def plot_selectedsamples_metrics(uncertainty_array, mean_preds_array, label_array, uncertainty=""): + order = np.argsort(uncertainty_array) + + samples = [] + auc = [] + acc = [] + b_acc = [] + th_entr = [] + + n_samples = uncertainty_array.shape[0] + + # Minimum 10% of samples + metrics = ClassificationMetrics(classes=2) + + for i in range(order.shape[0] - int(0.2 * n_samples)): + number_samples = order.shape[0] - i + metrics.clear() + kn = order[:(number_samples - 1)] + m = metrics.compute_metrics(mean_preds_array[kn], label_array[kn]) + + # print('###### ') + # print('Number of samples {}'.format(number_samples)) + # print(m['cm']) + # print(m['auc']) + # print('###### ') + + samples.extend([number_samples]) + auc.extend([m['auc']]) + acc.extend([m['accuracy']]) + b_acc.extend([m['balanced_accuracy']]) + th_entr.extend([uncertainty_array[kn[-1]]]) + + plt.figure() + + plt.plot(th_entr, acc, 'bo-', markersize=3, label="Accuracy") + plt.plot(th_entr, b_acc, 'gx-', markersize=3, label='Balanced accuracy') + plt.plot(th_entr, auc, 'rx-', markersize=3, label='AUC') + plt.xlabel('{} uncertainty'.format(uncertainty)) + + plt.figure() + plt.plot(samples, acc, 'bo-', markersize=3, label='Accuracy') + plt.plot(samples, b_acc, 'gx-', markersize=3, label='Balanced accuracy') + plt.plot(samples, auc, 'rx-', markersize=3, label='AUC') + plt.text(26, plt.gca().get_ylim()[0] + 0.05, + 'AUC acc {} \n AUC bacc {} \n AUC auc {} '.format(np.round(np.trapz(acc) / len(acc), 2), + np.round(np.trapz(b_acc) / len(b_acc), 2), + np.round(np.trapz(auc) / len(auc), 2))) + plt.xlabel("Number of samples") + plt.ylabel("Performance") + plt.ylim(0.7, 1) + plt.legend() + plt.title("Metrics ") + # plt.show() + + +def plot_uncetainties(p, y, c, e): + true_pred = [] + for i in range(p.shape[0]): + pred = np.argmax(p[i, :]) + label = np.argmax(y[i, :]) + if pred == label: + if label == 0: + true_pred.extend(['lightgreen']) + else: + true_pred.extend(['darkgreen']) + else: + if label == 0: + true_pred.extend(['salmon']) + else: + true_pred.extend(['darkred']) + + plt.figure() + for g in ['lightgreen', 'darkgreen', 'salmon', 'darkred']: + i = [i for i in range(len(true_pred)) if true_pred[i] == g] + plt.scatter(c[i], e[i], c=g, label='test', s=10) + # sc =plt.scatter(c, e, c=true_pred,) + plt.legend(['TP', 'TN', 'FN', 'FP']) + plt.xlabel('Predictive uncertainty') + plt.ylabel('Epistemic uncertainty') + +def plot_age_uncertainty(p, y, c, age): + true_pred = [] + for i in range(p.shape[0]): + pred = np.argmax(p[i, :]) + label = np.argmax(y[i, :]) + if pred == label: + if label == 0: + true_pred.extend(['lightgreen']) + else: + true_pred.extend(['darkgreen']) + else: + if label == 0: + true_pred.extend(['salmon']) + else: + true_pred.extend(['darkred']) + + plt.figure() + for g in ['lightgreen', 'darkgreen', 'salmon', 'darkred']: + i = [i for i in range(len(true_pred)) if true_pred[i] == g] + plt.scatter(age[i], c[i], c=g, label='test', s=10) + # sc =plt.scatter(c, e, c=true_pred,) + plt.legend(['TP', 'TN', 'FN', 'FP']) + plt.xlabel('Age') + plt.ylabel('Uncertainty') + + +def test_graph(config, loader, test_indices, state_dicts): + models = [] + + for m in state_dicts: + model = GCN(nfeat=loader.dataset[0].num_features, nclass=2, dropout=config['dropout']) + + model.load_state_dict(torch.load(m)) + models.append(model) + + n_models = 10 + n_droput = 100 + + + for i, batch in enumerate(loader): + + samples = len(test_indices) + predictions_y = np.zeros([samples, 2, n_models, n_droput]) + + x = batch['x'] + edge_index = batch['edge_index'] + edge_weight = batch['weights'] + y = batch['y'][test_indices] + + for m in range(n_models): + for k in range(n_droput): + model = models[m] + model.train() + test_out = models[m].forward(x, edge_index, edge_weight) + f = torch.softmax(test_out, dim=1) + f_cloned = f.clone() + f_cloned[f_cloned == 0] = 1e-30 + + predictions_y[:, :, m, k] = f_cloned[test_indices].detach().cpu().numpy() + + mean_preds = np.mean(np.mean(predictions_y, axis=2), axis=2) + predictive_uncertainty = np.sum(-np.multiply(np.log(mean_preds), mean_preds), axis=1) + + expect_data_uncertainty = np.mean( + np.mean(np.sum(-np.multiply(np.log(predictions_y), predictions_y), axis=1), axis=1), axis=1) + epistemic_uncertainty = predictive_uncertainty - expect_data_uncertainty + + """for sample in range(samples): + print('#####') + print('Sample {}'.format(sample)) + print(' True value: {}, Prediction mean: {}'.format(y[sample], mean_preds[sample])) + print(' Predictive {}, Epistemic {}'.format(predictive_uncertainty[sample], epistemic_uncertainty[sample])) + print('#####')""" + + print('Mean predictive uncertainty ', np.mean(predictive_uncertainty)) + print('Mean epistemic uncertainty', np.mean(epistemic_uncertainty)) + + return mean_preds, predictive_uncertainty, epistemic_uncertainty, y.detach().cpu().numpy() + + +def test(config, test_loader, state_dict): + models = [] + + for m in state_dict: + model = Basic_FCN(in_features=test_loader.dataset.features.shape[1], layers=config['layers'] + , out_features=2, + dropout_rate=config['dropout']) + + model.load_state_dict(torch.load(m)) + models.append(model) + + n_models = 5 + n_droput = 1 + + for batch in test_loader: + + samples = batch['x'].shape[0] + predictions_y = np.zeros([samples, 2, n_models, n_droput]) + + x = batch['x'] + y = batch['y'] + + for m in range(n_models): + for k in range(n_droput): + model = models[m] + model.train() + test_out = models[m].forward(x) + f = torch.softmax(test_out, dim=1) + + predictions_y[:, :, m, k] = f.detach().cpu().numpy() + + mean_preds = np.mean(np.mean(predictions_y, axis=2), axis=2) + predictive_uncertainty = np.sum(-np.multiply(np.log(mean_preds), mean_preds), axis=1) + + expect_data_uncertainty = np.mean( + np.mean(np.sum(-np.multiply(np.log(predictions_y), predictions_y), axis=1), axis=1), axis=1) + epistemic_uncertainty = predictive_uncertainty - expect_data_uncertainty + + """for sample in range(samples): + print('#####') + print('Sample {}'.format(sample)) + print(' True value: {}, Prediction mean: {}'.format(y[sample], mean_preds[sample])) + print(' Predictive {}, Epistemic {}'.format(predictive_uncertainty[sample], epistemic_uncertainty[sample])) + print('#####')""" + + print('Mean predictive uncertainty ', np.mean(predictive_uncertainty)) + print('Mean epistemic uncertainty', np.mean(epistemic_uncertainty)) + + return mean_preds, predictive_uncertainty, epistemic_uncertainty, y.detach().cpu().numpy() + + +if __name__ == "__main__": + + # %% DATALOADIND + + ## Clean original table + excel_dir = "../data/TheList_anonymous_mv.xlsx" + clean_df = clean_table(excel_dir=excel_dir, pre_mRS=2) + + # Given a clean table get features and labels + table = TableReader(input_df=clean_df, tables=['all_timepoints'], data_dictionaries='timepoints', + mv_strategy='median', + output_feature=['dmRS']) + + output_vector = table.output_vector + + fold_indices = split_data_cv(output_vector, seed=5, cv=5) + + results_pred = {} + results_epis = {} + for p in [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]: + results_pred[p] = {} + results_epis[p] = {} + + # for f in [5, 10, 15, 20, 25, 30, 40, 50]: + for f in [15]: + features = table.select_features(method='mrmr', k=f, fold_indices=fold_indices) + + feature_vector = table.final_df[features] + FP = FeaturePreprocessing(feature_vector, table.selected_d) + feature_vector = FP.create_features(feature_vector) + config = { + 'layers': {'number': 3, + 'layer1': 40, + 'layer2': 20, + + }, + + 'dropout': 0, + 'out_classes': 2} + + ######## + mean_preds = [] + combined = [] + epistemic = [] + cls = [] + + for k in range(5): + dataloader_fold = MyDataLoader(feature_vector, output_vector, fold_indices[k], table.selected_d, + one_hot=True) + dl = dataloader_fold.get_loaders() + state_dict_paths = ["C:/Users/martinca1/PhD/Projects/AI_Stroke/out/models/features_{}/" + "model_{}_fold_{}.pt".format(f, i, k) for i in range(5)] + pred, unc, epistemic_unc, y = test(config, dl[2], state_dict_paths) + + # _, _, _, _ = test(config, dl[0], models) + + mean_preds.extend(pred.tolist()) + combined.extend(unc.tolist()) + epistemic.extend(epistemic_unc.tolist()) + cls.extend(y.tolist()) + + p = np.array(mean_preds) + y = np.array(cls) + c = np.array(combined) + e = np.array(epistemic) + + # with pd.ExcelWriter("C:/Users/martinca1/PhD/Projects/AI_Stroke/out/uncertainty/predictive_uncertainty.xlsx") as writer: + for per in [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]: + results_pred[per][f] = get_metrics_unc(c, p, y, per) + # pd_per = pd.DataFrame(results_pred[per]).T + # pd_per.to_excel(writer, sheet_name=str(per)) + + # with pd.ExcelWriter("C:/Users/martinca1/PhD/Projects/AI_Stroke/out/uncertainty/epistimic_uncertainty.xlsx") as writer: + # for per in [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2,0.1]: + # results_epis[per][f] = get_metrics_unc(e, p, y, per) + # pd_per = pd.DataFrame(results_epis[per]).T + # pd_per.to_excel(writer, sheet_name=str(per)) + + true_pred = [] + for i in range(p.shape[0]): + pred = np.argmax(p[i, :]) + label = np.argmax(y[i, :]) + if pred == label: + if label == 0: + true_pred.extend(['lightgreen']) + else: + true_pred.extend(['darkgreen']) + else: + if label == 0: + true_pred.extend(['salmon']) + else: + true_pred.extend(['darkred']) + + metrics = ClassificationMetrics(classes=2) + m = metrics.compute_metrics(p, y) + print('Combined metrics') + print(m) + + plot_selectedsamples_metrics(c, p, y, uncertainty='Predictive') + plot_selectedsamples_metrics(e, p, y, uncertainty='Epistemic') + + plt.figure() + for g in ['lightgreen', 'darkgreen', 'salmon', 'darkred']: + i = [i for i in range(len(true_pred)) if true_pred[i] == g] + plt.scatter(c[i], e[i], c=g, label='test') + # sc =plt.scatter(c, e, c=true_pred,) + plt.legend(['TP', 'TN', 'FN', 'FP']) + plt.xlabel('Predictive uncertainty') + plt.ylabel('Epistemic uncertainty') + plt.show() diff --git a/test.py b/test.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/train.py b/train.py new file mode 100644 index 0000000000000000000000000000000000000000..056072ababd351a39c31531f26029a4728e64d66 --- /dev/null +++ b/train.py @@ -0,0 +1,204 @@ +from architectures.FCN import Basic_FCN +from Metrics.RegressionMetrics import ClassificationMetrics +from Loss.Loss_uncertainty import loss_uncertainty + +import copy +from hyperopt import STATUS_OK +from ignite.engine import Engine, Events +import neptune +import pprint +import time +import tqdm +import torch + + +def train_model(config, loaders, save_metrics=False): + # torch.manual_seed(0) + + train_loader, val_loader, test_loader = loaders + device = torch.device('cuda' if torch.cuda.is_available() else False) + model = Basic_FCN(in_features=train_loader.dataset.features.shape[1], layers=config['layers'] + , out_features=2, + dropout_rate=config['dropout']) + + p = sum(p.numel() for p in model.parameters() if p.requires_grad) + print('Model parameters', p) + model.to(device) + + # METRICS + train_metrics = ClassificationMetrics(classes=2) + val_metrics = ClassificationMetrics(classes=2) + test_metrics = ClassificationMetrics(classes=2) + + # OPTIMIZER + optimizer = torch.optim.Adam(model.parameters(), + lr=config['lr'], + #momentum=config['momentum'], + weight_decay=config['weight_decay']) + + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer) + + # LOSS + import numpy as np + values, counts = np.unique(train_loader.dataset.labels, return_counts=True) + weights = torch.FloatTensor(1- counts / np.sum(counts)).to('cuda') + #weights = torch.FloatTensor((0.5, 0.5)).to('cuda') + loss_train = loss_uncertainty(weights=weights) + loss_val = loss_uncertainty(weights=weights) + loss_test = loss_uncertainty(weights=weights) + + def train(Engine, batch): + + model.train() + optimizer.zero_grad() + data = batch['x'].clone().to(device) + y_train = batch['y'].clone().to(device) + + out = model.forward(data) + + Loss = loss_train.get_loss(out, y_train.unsqueeze(1)) + m_train = train_metrics.compute_metrics(out.detach().cpu().numpy(), batch['y'].detach().cpu().numpy()) + + Loss.backward() + optimizer.step() + + Engine.state.loss = loss_train.get_total_loss() + Engine.state.metrics = train_metrics + Engine.state.m = m_train + Engine.state.out = out + Engine.state.y = y_train.detach().cpu().numpy() + + def validate(Engine, batch): + model.eval() + with torch.no_grad(): + data = batch['x'].clone().to(device) + y_val = batch['y'].clone().to(device) + + out = model.forward(data) + + lv = loss_val.get_loss(out, y_val.unsqueeze(1)) + scheduler.step(lv) + #print("Epoch {}, lr {}".format(trainer_engine.state.epoch, optimizer.param_groups[0]['lr'])) + m_val = val_metrics.compute_metrics(out.detach().cpu().numpy(), batch['y'].detach().cpu().numpy()) + + Engine.state.loss = loss_val.get_total_loss() + Engine.state.metrics = val_metrics + Engine.state.m = m_val + + def test(Engine, batch): + model.eval() + with torch.no_grad(): + data = batch['x'].clone().to(device) + y_test = batch['y'].clone().to(device) + + out = model.forward(data) + f = torch.softmax(out, dim=1) + _ = loss_test.get_loss(f, y_test.unsqueeze(1)) + m_test = test_metrics.compute_metrics(f.detach().cpu().numpy(), batch['y'].detach().cpu().numpy()) + + Engine.state.loss = loss_test.get_total_loss() + Engine.state.metrics = test_metrics + Engine.state.m = m_test + + trainer_engine = Engine(train) + validator_engine = Engine(validate) + test_engine = Engine(test) + + @trainer_engine.on(Events.STARTED) + def training_bootup(engine): + + # print("Training started") + # print("Train_loader,: iterations/epoch: ", len(train_loader), ", total number of samples", + # len(train_loader.sampler)) + # print("Validation_loader,: iterations/epoch: ", len(val_loader), ", total number of samples", + # len(val_loader.sampler)) + # print("Test_loader,: iterations/epoch: ", len(test_loader), ", total number of samples", + # len(test_loader.sampler)) + + time.sleep(0.001) + engine.pbar = tqdm.tqdm(total=300, desc='Training progress') + validator_engine.state.loss = 0 + engine.state.best_epoch = 0 + engine.state.min_loss = 1000 + engine.state.best_train_metrics = {} + engine.state.best_val_metrics = {} + engine.state.best_test_metrics = {} + + engine.count = 0 + + @trainer_engine.on(Events.EPOCH_COMPLETED) + def run_validation(engine): + validator_engine.run(val_loader, max_epochs=1) + engine.pbar.update(1) + engine.pbar.set_postfix({'loss': validator_engine.state.loss, + 'loss_train': trainer_engine.state.loss, + 'accuracy_train': trainer_engine.state.m['accuracy']}, refresh=True) + + test_engine.run(test_loader, max_epochs=1) + + if validator_engine.state.loss < engine.state.min_loss: + engine.count = 0 + engine.state.min_loss = validator_engine.state.loss + # + else: + engine.count += 1 + # print('Strike ', engine.count) + if engine.count > 40: + trainer_engine.terminate() + if True: + engine.state.best_epoch = engine.state.epoch + engine.state.best_train_metrics = trainer_engine.state.m + engine.state.best_val_metrics = validator_engine.state.m + engine.state.best_test_metrics = test_engine.state.m + engine.state.best_model = model.state_dict() + + train_metrics.clear() + val_metrics.clear() + test_metrics.clear() + + loss_train.clear() + loss_val.clear() + loss_test.clear() + + @trainer_engine.on(Events.EPOCH_COMPLETED) + def write_neptune(): + + if save_metrics: + neptune.log_metric('loss_train', trainer_engine.state.loss.detach().cpu().numpy()) + neptune.log_metric('loss_val', validator_engine.state.loss.detach().cpu().numpy()) + neptune.log_metric('loss_test', test_engine.state.loss.detach().cpu().numpy()) + neptune.log_metric('train_AUC', trainer_engine.state.m['auc']) + neptune.log_metric('train_accuracy', trainer_engine.state.m['accuracy']) + neptune.log_metric('val_AUC', validator_engine.state.m['auc']) + neptune.log_metric('val_accuracy', validator_engine.state.m['accuracy']) + neptune.log_metric('test_AUC', test_engine.state.m['auc']) + neptune.log_metric('test_accuracy', test_engine.state.m['accuracy']) + + @trainer_engine.on(Events.COMPLETED) + def run_testing(engine): + + # print(time.ctime(), "Running testing") + # test_engine.run(test_loader, max_epochs=1) + + engine.pbar.close() + # time.sleep(0.001) + + trainer_engine.run(train_loader, max_epochs=300) + print( + "AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f}".format(trainer_engine.state.m['auc'], + validator_engine.state.m[ + 'auc'], + test_engine.state.m['auc'])) + # visualize(h=trainer_engine.state.best_out, color=trainer_engine.state.best_y) + result = {'loss': trainer_engine.state.min_loss, + 'epoch': trainer_engine.state.best_epoch, + 'train_metrics': trainer_engine.state.best_train_metrics, + 'val_metrics': trainer_engine.state.best_val_metrics, + 'test_metric': trainer_engine.state.best_test_metrics, + 'status': STATUS_OK} + + #pprint.pprint(trainer_engine.state.best_train_metrics) + #pprint.pprint(trainer_engine.state.best_val_metrics) + #pprint.pprint(trainer_engine.state.best_test_metrics) + + return result, trainer_engine.state.best_model diff --git a/train_graph.py b/train_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..7346ef0fed266344f5f5b0454dfeafbeff96e14b --- /dev/null +++ b/train_graph.py @@ -0,0 +1,223 @@ +from architectures.GCN import GCN + +from Metrics.RegressionMetrics import ClassificationMetrics +from Loss.Loss_uncertainty import loss_uncertainty + +import copy +from hyperopt import STATUS_OK +from ignite.engine import Engine, Events +import neptune +import pprint +import time +import tqdm +import torch + + +def train_model_graph(config, loaders, indices, save_metrics=False): + # torch.manual_seed(0) + + train_indices, val_indices, test_indices = indices + + device = torch.device('cuda' if torch.cuda.is_available() else False) + model = GCN(nfeat=loaders.dataset[0].num_features, nclass=2, dropout=config['dropout']) + + p = sum(p.numel() for p in model.parameters() if p.requires_grad) + print('Model parameters', p) + model.to(device) + + # METRICS + train_metrics = ClassificationMetrics(classes=2) + val_metrics = ClassificationMetrics(classes=2) + test_metrics = ClassificationMetrics(classes=2) + + # OPTIMIZER + optimizer = torch.optim.Adam(model.parameters(), + lr=config['lr'], + #momentum=config['momentum'], + weight_decay=config['weight_decay']) + + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.7) + + # LOSS + import numpy as np + values, counts = np.unique(loaders.dataset[0].labels.data, return_counts=True) + weights = torch.FloatTensor(1- counts / np.sum(counts)).to('cuda') + #weights = torch.FloatTensor((0.5, 0.5)).to('cuda') + loss_train = loss_uncertainty(weights=weights) + loss_val = loss_uncertainty(weights=weights) + loss_test = loss_uncertainty(weights=weights) + + def train(Engine, batch): + + model.train() + optimizer.zero_grad() + data = batch['x'].clone().to(device) + edge_index = batch['edge_index'].clone().to(device) + edge_weight = batch['weights'].clone().to(device) + y_train = batch['y'].clone().to(device) + + out = model.forward(data, edge_index, edge_weight) + # f = torch.softmax(out, dim=1) + f_cloned = out.clone() + #f_cloned[f_cloned == 0] = 1e-30 + + + Loss = loss_train.get_loss(f_cloned[train_indices], y_train[train_indices].unsqueeze(1)) + + m_train = train_metrics.compute_metrics(f_cloned[train_indices].detach().cpu().numpy(), + batch['y'][train_indices].detach().cpu().numpy()) + + Loss.backward() + optimizer.step() + + Engine.state.loss = loss_train.get_total_loss() + Engine.state.metrics = train_metrics + Engine.state.m = m_train + Engine.state.out = out + Engine.state.y = y_train.detach().cpu().numpy() + + def validate(Engine, batch): + model.eval() + with torch.no_grad(): + data = batch['x'].clone().to(device) + edge_index = batch['edge_index'].clone().to(device) + edge_weight = batch['weights'].clone().to(device) + y_val = batch['y'].clone().to(device) + + out = model.forward(data, edge_index, edge_weight) + #f = torch.softmax(out, dim=1) + f_cloned = out.clone() + f_cloned[f_cloned == 0] = 1e-30 + + lv = loss_val.get_loss(f_cloned[val_indices], y_val[val_indices].unsqueeze(1)) + scheduler.step(lv) + #print("Epoch {}, lr {}".format(trainer_engine.state.epoch, optimizer.param_groups[0]['lr'])) + m_val = val_metrics.compute_metrics(f_cloned[val_indices].detach().cpu().numpy(), + batch['y'][val_indices].detach().cpu().numpy()) + + Engine.state.loss = loss_val.get_total_loss() + Engine.state.metrics = val_metrics + Engine.state.m = m_val + + def test(Engine, batch): + model.eval() + with torch.no_grad(): + data = batch['x'].clone().to(device) + edge_index = batch['edge_index'].clone().to(device) + edge_weight = batch['weights'].clone().to(device) + y_test = batch['y'].clone().to(device) + + out = model.forward(data, edge_index, edge_weight) + # f = torch.softmax(out, dim=1) + f_cloned = out.clone() + f_cloned[f_cloned == 0] = 1e-30 + + _ = loss_test.get_loss(f_cloned[test_indices], y_test[test_indices].unsqueeze(1)) + m_test = test_metrics.compute_metrics(f_cloned[test_indices].detach().cpu().numpy(), + batch['y'][test_indices].detach().cpu().numpy()) + + Engine.state.loss = loss_test.get_total_loss() + Engine.state.metrics = test_metrics + Engine.state.m = m_test + + trainer_engine = Engine(train) + validator_engine = Engine(validate) + test_engine = Engine(test) + + @trainer_engine.on(Events.STARTED) + def training_bootup(engine): + + # print("Training started") + # print("Train_loader,: iterations/epoch: ", len(train_loader), ", total number of samples", + # len(train_loader.sampler)) + # print("Validation_loader,: iterations/epoch: ", len(val_loader), ", total number of samples", + # len(val_loader.sampler)) + # print("Test_loader,: iterations/epoch: ", len(test_loader), ", total number of samples", + # len(test_loader.sampler)) + + time.sleep(0.001) + engine.pbar = tqdm.tqdm(total=300, desc='Training progress') + validator_engine.state.loss = 0 + engine.state.best_epoch = 0 + engine.state.min_loss = 1000 + engine.state.best_train_metrics = {} + engine.state.best_val_metrics = {} + engine.state.best_test_metrics = {} + + engine.count = 0 + + @trainer_engine.on(Events.EPOCH_COMPLETED) + def run_validation(engine): + validator_engine.run(loaders, max_epochs=1) + engine.pbar.update(1) + engine.pbar.set_postfix({'loss': validator_engine.state.loss, + 'loss_train': trainer_engine.state.loss, + 'accuracy_train': trainer_engine.state.m['accuracy']}, refresh=True) + + test_engine.run(loaders, max_epochs=1) + + if validator_engine.state.loss < engine.state.min_loss: + engine.count = 0 + engine.state.min_loss = validator_engine.state.loss + # + else: + engine.count += 1 + if engine.count > 20: + trainer_engine.terminate() + if True: + engine.state.best_epoch = engine.state.epoch + engine.state.best_train_metrics = trainer_engine.state.m + engine.state.best_val_metrics = validator_engine.state.m + engine.state.best_test_metrics = test_engine.state.m + engine.state.best_model = model.state_dict() + + train_metrics.clear() + val_metrics.clear() + test_metrics.clear() + + loss_train.clear() + loss_val.clear() + loss_test.clear() + + @trainer_engine.on(Events.EPOCH_COMPLETED) + def write_neptune(): + + if save_metrics: + neptune.log_metric('loss_train', trainer_engine.state.loss.detach().cpu().numpy()) + neptune.log_metric('loss_val', validator_engine.state.loss.detach().cpu().numpy()) + neptune.log_metric('loss_test', test_engine.state.loss.detach().cpu().numpy()) + neptune.log_metric('train_AUC', trainer_engine.state.m['auc']) + neptune.log_metric('train_accuracy', trainer_engine.state.m['accuracy']) + neptune.log_metric('val_AUC', validator_engine.state.m['auc']) + neptune.log_metric('val_accuracy', validator_engine.state.m['accuracy']) + neptune.log_metric('test_AUC', test_engine.state.m['auc']) + neptune.log_metric('test_accuracy', test_engine.state.m['accuracy']) + + @trainer_engine.on(Events.COMPLETED) + def run_testing(engine): + + # print(time.ctime(), "Running testing") + # test_engine.run(test_loader, max_epochs=1) + + engine.pbar.close() + # time.sleep(0.001) + + trainer_engine.run(loaders, max_epochs=300) + print( + "AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f}".format(trainer_engine.state.m['auc'], + validator_engine.state.m[ + 'auc'], + test_engine.state.m['auc'])) + # visualize(h=trainer_engine.state.best_out, color=trainer_engine.state.best_y) + result = {'loss': trainer_engine.state.min_loss, + 'epoch': trainer_engine.state.best_epoch, + 'train_metrics': trainer_engine.state.best_train_metrics, + 'val_metrics': trainer_engine.state.best_val_metrics, + 'test_metric': trainer_engine.state.best_test_metrics, + 'status': STATUS_OK} + + #pprint.pprint(trainer_engine.state.best_train_metrics) + #pprint.pprint(trainer_engine.state.best_val_metrics) + #pprint.pprint(trainer_engine.state.best_test_metrics) + + return result, trainer_engine.state.best_model