From ccfbd1a93258168d55348abebd8b830f04174a03 Mon Sep 17 00:00:00 2001 From: Celia Martin Vicario <celia.martin@fau.de> Date: Thu, 19 Jan 2023 10:55:15 +0100 Subject: [PATCH] Initial commit --- .idea/AIS_Regress.iml | 2 +- .idea/misc.xml | 5 +- Experiments/1-VariablesEvaluation.py | 200 +++++ Experiments/2-VariableSelection.py | 162 ++++ Experiments/3-MVStrategy.py | 150 ++++ Experiments/4-Timepoints.py | 164 ++++ Experiments/5-Uncertainty.py | 100 +++ Experiments/6-Graph_constr.py | 116 +++ Experiments/7-Graph_model.py | 82 ++ Experiments/8-Uncertainty_graph.py | 145 ++++ Experiments/9-Images.py | 55 ++ Experiments/_t_test.py | 62 ++ IO_utils/Dataloader.py | 63 ++ IO_utils/Datasets.py | 140 +++ IO_utils/FeaturePreprocessing.py | 109 +++ IO_utils/List_Reader.py | 125 +++ IO_utils/List_reader_utils.py | 108 +++ .../__pycache__/Dataloader.cpython-37.pyc | Bin 0 -> 1850 bytes IO_utils/__pycache__/Datasets.cpython-37.pyc | Bin 0 -> 4138 bytes .../FeaturePreprocessing.cpython-37.pyc | Bin 0 -> 3546 bytes .../__pycache__/List_Reader.cpython-37.pyc | Bin 0 -> 3883 bytes .../List_reader_utils.cpython-37.pyc | Bin 0 -> 3532 bytes .../__pycache__/clean_table.cpython-37.pyc | Bin 0 -> 9996 bytes .../__pycache__/mv_strategies.cpython-37.pyc | Bin 0 -> 6790 bytes .../__pycache__/split_utils.cpython-37.pyc | Bin 0 -> 1944 bytes .../statistics_utils.cpython-37.pyc | Bin 0 -> 3419 bytes IO_utils/clean_table.py | 321 +++++++ IO_utils/mv_strategies.py | 258 ++++++ IO_utils/split_utils.py | 63 ++ IO_utils/statistics_utils.py | 159 ++++ Loss/Loss_uncertainty.py | 67 ++ .../Loss_uncertainty.cpython-37.pyc | Bin 0 -> 1917 bytes Metrics/ClassificationMetrics.py | 126 +++ Metrics/RegressionMetrics.py | 55 ++ .../ClassificationMetrics.cpython-37.pyc | Bin 0 -> 4067 bytes .../RegressionMetrics.cpython-37.pyc | Bin 0 -> 1563 bytes Metrics/__pycache__/_utils.cpython-37.pyc | Bin 0 -> 518 bytes Metrics/_utils.py | 7 + README.md | 2 - __pycache__/evaluate_model.cpython-37.pyc | Bin 0 -> 10401 bytes __pycache__/train.cpython-37.pyc | Bin 0 -> 5576 bytes __pycache__/train_graph.cpython-37.pyc | Bin 0 -> 5978 bytes _utils/Result_container.py | 54 ++ .../Result_container.cpython-37.pyc | Bin 0 -> 1794 bytes _utils/__pycache__/plot_utils.cpython-37.pyc | Bin 0 -> 7536 bytes _utils/plot_utils.py | 203 +++++ architectures/3D_CNN.py | 64 ++ architectures/Edge_GCN.py | 25 + architectures/FCN.py | 52 ++ architectures/GCN.py | 38 + architectures/ML_algorithms.py | 146 ++++ .../__pycache__/Edge_GCN.cpython-37.pyc | Bin 0 -> 1235 bytes architectures/__pycache__/FCN.cpython-37.pyc | Bin 0 -> 1531 bytes architectures/__pycache__/GCN.cpython-37.pyc | Bin 0 -> 1173 bytes .../__pycache__/ML_algorithms.cpython-37.pyc | Bin 0 -> 4090 bytes dictionaries/dictionary_modalities.yml | 764 +++++++++++++++++ dictionaries/dictionary_timepoints.yml | 808 ++++++++++++++++++ evaluate_model.py | 433 ++++++++++ test.py | 0 train.py | 204 +++++ train_graph.py | 223 +++++ 61 files changed, 5856 insertions(+), 4 deletions(-) create mode 100644 Experiments/1-VariablesEvaluation.py create mode 100644 Experiments/2-VariableSelection.py create mode 100644 Experiments/3-MVStrategy.py create mode 100644 Experiments/4-Timepoints.py create mode 100644 Experiments/5-Uncertainty.py create mode 100644 Experiments/6-Graph_constr.py create mode 100644 Experiments/7-Graph_model.py create mode 100644 Experiments/8-Uncertainty_graph.py create mode 100644 Experiments/9-Images.py create mode 100644 Experiments/_t_test.py create mode 100644 IO_utils/Dataloader.py create 
mode 100644 IO_utils/Datasets.py create mode 100644 IO_utils/FeaturePreprocessing.py create mode 100644 IO_utils/List_Reader.py create mode 100644 IO_utils/List_reader_utils.py create mode 100644 IO_utils/__pycache__/Dataloader.cpython-37.pyc create mode 100644 IO_utils/__pycache__/Datasets.cpython-37.pyc create mode 100644 IO_utils/__pycache__/FeaturePreprocessing.cpython-37.pyc create mode 100644 IO_utils/__pycache__/List_Reader.cpython-37.pyc create mode 100644 IO_utils/__pycache__/List_reader_utils.cpython-37.pyc create mode 100644 IO_utils/__pycache__/clean_table.cpython-37.pyc create mode 100644 IO_utils/__pycache__/mv_strategies.cpython-37.pyc create mode 100644 IO_utils/__pycache__/split_utils.cpython-37.pyc create mode 100644 IO_utils/__pycache__/statistics_utils.cpython-37.pyc create mode 100644 IO_utils/clean_table.py create mode 100644 IO_utils/mv_strategies.py create mode 100644 IO_utils/split_utils.py create mode 100644 IO_utils/statistics_utils.py create mode 100644 Loss/Loss_uncertainty.py create mode 100644 Loss/__pycache__/Loss_uncertainty.cpython-37.pyc create mode 100644 Metrics/ClassificationMetrics.py create mode 100644 Metrics/RegressionMetrics.py create mode 100644 Metrics/__pycache__/ClassificationMetrics.cpython-37.pyc create mode 100644 Metrics/__pycache__/RegressionMetrics.cpython-37.pyc create mode 100644 Metrics/__pycache__/_utils.cpython-37.pyc create mode 100644 Metrics/_utils.py delete mode 100644 README.md create mode 100644 __pycache__/evaluate_model.cpython-37.pyc create mode 100644 __pycache__/train.cpython-37.pyc create mode 100644 __pycache__/train_graph.cpython-37.pyc create mode 100644 _utils/Result_container.py create mode 100644 _utils/__pycache__/Result_container.cpython-37.pyc create mode 100644 _utils/__pycache__/plot_utils.cpython-37.pyc create mode 100644 _utils/plot_utils.py create mode 100644 architectures/3D_CNN.py create mode 100644 architectures/Edge_GCN.py create mode 100644 architectures/FCN.py create mode 100644 architectures/GCN.py create mode 100644 architectures/ML_algorithms.py create mode 100644 architectures/__pycache__/Edge_GCN.cpython-37.pyc create mode 100644 architectures/__pycache__/FCN.cpython-37.pyc create mode 100644 architectures/__pycache__/GCN.cpython-37.pyc create mode 100644 architectures/__pycache__/ML_algorithms.cpython-37.pyc create mode 100644 dictionaries/dictionary_modalities.yml create mode 100644 dictionaries/dictionary_timepoints.yml create mode 100644 evaluate_model.py delete mode 100644 test.py create mode 100644 train.py create mode 100644 train_graph.py diff --git a/.idea/AIS_Regress.iml b/.idea/AIS_Regress.iml index 8dc09e5..74e2033 100644 --- a/.idea/AIS_Regress.iml +++ b/.idea/AIS_Regress.iml @@ -2,7 +2,7 @@ <module type="PYTHON_MODULE" version="4"> <component name="NewModuleRootManager"> <content url="file://$MODULE_DIR$" /> - <orderEntry type="inheritedJdk" /> + <orderEntry type="jdk" jdkName="Python 3.7 (BaseEnv)" jdkType="Python SDK" /> <orderEntry type="sourceFolder" forTests="false" /> </component> <component name="TestRunnerService"> diff --git a/.idea/misc.xml b/.idea/misc.xml index d1e22ec..e5f7f46 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,7 @@ <?xml version="1.0" encoding="UTF-8"?> <project version="4"> - <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" /> + <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (BaseEnv)" project-jdk-type="Python SDK" /> + <component 
name="PyCharmProfessionalAdvertiser"> + <option name="shown" value="true" /> + </component> </project> \ No newline at end of file diff --git a/Experiments/1-VariablesEvaluation.py b/Experiments/1-VariablesEvaluation.py new file mode 100644 index 0000000..4a57dcd --- /dev/null +++ b/Experiments/1-VariablesEvaluation.py @@ -0,0 +1,200 @@ +from IO_utils.clean_table import clean_table +from IO_utils.statistics_utils import get_pvalue, compute_basic_statistics, compute_bivariate_statistics +from IO_utils.List_reader_utils import cross_check_dictionary, remove_features, get_all_dict_types, treat_missing_values +from _utils.plot_utils import plot_mv, plot_distribution_categorical, plot_distribution_numerical, \ + plot_significant_values +from IO_utils.FeaturePreprocessing import FeaturePreprocessing + +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd +import os +import yaml +from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV + + +# %% DATALOADIND +def get_statistics(input_df, data_dictionary, output_dir='../../out/data_exploration/statistics'): + tables = ['Admission', 'Pre-EVT', 'Post-EVT', 'After24h'] + + d, all_indices = cross_check_dictionary(input_df, data_dictionary, tables, output=[]) + + # Reorder dataframe according to data dictionary + reordered_df = input_df.reindex(columns=all_indices).drop(columns=['Id']) + + types = [] + + all_df, all_missing_values, clean_keys = remove_features(reordered_df, p=1, exclude=[]) + + for t in tables: + types.extend(get_all_dict_types(data_dictionary[t], types=[])) + + # Compute statistics of the selected tables + statistics = compute_basic_statistics(all_df) + statistics_bivariate = compute_bivariate_statistics(all_df, input_df['dmRS'], input_df['mortality'], types) + + p_dmRS, methods = get_pvalue(all_df, input_df['dmRS'], types) + p_mortality, _ = get_pvalue(all_df, input_df['mortality'], types) + # p_shiftmRS, _ = get_pvalue(all_df, input_df['shift_mRS'], types) + + statistics['p_dmRS'] = p_dmRS + statistics['p_mortality'] = p_mortality + # statistics['shift_mRS'] = p_shiftmRS + statistics['method'] = methods + + statistics['missing_values'] = all_missing_values + statistics['Percentage (%)'] = (statistics['missing_values'] * 100. 
/ input_df.shape[0]).to_list() + statistics['types'] = types + + statistics_bivariate['p_dmRS'] = p_dmRS + statistics_bivariate['p_mortality'] = p_mortality + + #plot_significant_values(statistics, value='p_mortality', th=0.01, out_dir=output_dir) + #plot_significant_values(statistics, value='p_dmRS', th=0.01, out_dir=output_dir) + + #plot_significant_values(statistics, value='p_mortality', th=0.05, out_dir=output_dir) + #plot_significant_values(statistics, value='p_dmRS', th=0.05, out_dir=output_dir) + + # Save file + file_path = os.path.join(output_dir, 'output.xlsx') + # assert os.path.isfile(file_path), 'File already exists' + with pd.ExcelWriter(file_path) as writer: + statistics.to_excel(writer, sheet_name='Sheet1') + + # Save file + file_path_bivariate = os.path.join(output_dir, 'output_bivariate.xlsx') + # assert os.path.isfile(file_path), 'File already exists' + with pd.ExcelWriter(file_path_bivariate) as writer: + statistics_bivariate.to_excel(writer, sheet_name='Sheet1') + + return statistics + + +## Clean original table +excel_dir = "../../data/TheList_anonymous_mv.xlsx" +clean_df = clean_table(excel_dir=excel_dir, pre_mRS=2) +output_features = ['dmRS', 'mRS90d', 'shift_mRS', 'mortality'] +tables = ['Admission', 'Pre-EVT', 'Post-EVT', 'After24h'] +data_dicts = yaml.load(open("../dictionaries/dictionary_timepoints.yml"), Loader=yaml.Loader) +dir_out = "C:/Users/martinca1/PhD/Projects/AI_Stroke/out/results/SR_results/data_exploration" + +# Output paths +missing_values_path = os.path.join(dir_out, 'missing_values') +statistics_path = os.path.join(dir_out, 'statistics') +features_path = os.path.join(dir_out, 'features') + +# Get statistics without removing missing values etc +statistics = get_statistics(clean_df, data_dicts, + output_dir=statistics_path) + +#### Steps from List Reader +selected_d, all_keys = cross_check_dictionary(clean_df, data_dicts, tables, output_features) +reordered_df = clean_df.reindex(columns=all_keys) +clean_df, _, clean_keys = remove_features(reordered_df, p=0.1, exclude=output_features) +keys = [i for c, i in enumerate(all_keys) if not clean_keys[c]] +for k in keys: + selected_d.pop(k) +final_df = treat_missing_values(clean_df, method='median') +FP = FeaturePreprocessing(final_df.drop(columns=['dmRS', 'mRS90d', 'shift_mRS', 'mortality']), selected_d) + +# Remove data dictionaries of removed features +# Get more important features from lr and random forest + +for output_feature in ['dmRS', 'mortality']: + output_vector = final_df[output_feature].to_numpy(dtype=int).squeeze() + feature_vector = FP.create_features(final_df.drop(columns=['dmRS', 'mRS90d', 'shift_mRS', 'mortality'])) + names = FP.get_feature_names() + + rf = RandomForestClassifier(random_state=True, max_depth=5, n_estimators=100) + f = rf.fit(feature_vector, output_vector) + from sklearn.inspection import permutation_importance + result = permutation_importance( + f, feature_vector, output_vector, n_repeats=10, random_state=42, n_jobs=2 + ) + importance = result.importances_mean + importance_indices = np.argsort(importance)[::-1] + + plt.figure(figsize=(15, 6)) + x =[names[x] for x in importance_indices[0:20]] + plt.bar(x, importance[importance_indices[0:20]], yerr=result.importances_std[importance_indices[0:20]]) + plt.xticks(rotation=15, ha='right') + + plt.show() + + +## Missing values +missing_values = statistics['missing_values'] +plot_mv(missing_values, th=0.1 * clean_df.shape[0], out_dir=missing_values_path) + +with pd.ExcelWriter(os.path.join(missing_values_path, 
'missing_values.xlsx')) as writer: + statistics[['missing_values', 'Percentage (%)']].to_excel(writer, float_format="%0.1f") + +if not os.path.isdir(os.path.join(features_path, 'Target')): + os.mkdir(os.path.join(features_path, 'Target')) +## Target +plot_distribution_categorical(clean_df, 'mRS90d', table=data_dicts['Output'], + title='Distribution of mRS at 90 days', + out=True, + out_dir=os.path.join(features_path, 'Target')) + +plot_distribution_categorical(clean_df, 'dmRS', table=data_dicts['Output'], + title='Distribution of functional outcome at 90 days', + out=True, + out_dir=os.path.join(features_path, 'Target')) + +plot_distribution_categorical(clean_df, 'shift_mRS', table=data_dicts['Output'], + title='Distribution shift in mRS at 90 days', + out=True, + out_dir=os.path.join(features_path, 'Target')) + +plot_distribution_categorical(clean_df, 'mortality', table=data_dicts['Output'], + title='Distribution of mortality at 90 days', + out=True, + out_dir=os.path.join(features_path, 'Target')) + +#### Table details +tables = ['Admission', 'Pre-EVT', 'Post-EVT', 'After24h'] +for t in tables: + keys_table = list(data_dicts[t].keys()) + if not os.path.isdir(os.path.join(features_path, t)): + os.mkdir(os.path.join(features_path, t)) + + # Save individual statistics of each table + # with pd.ExcelWriter(os.path.join(statistics_path, 'statistics_{}.xlsx'.format(t))) as writer: + # statistics_table = statistics.loc[keys_table, :] + # statistics_table.to_excel(writer, float_format="%0.5f") + + # Add target values to k for visualization and check that they are on the clean table + keys_table = keys_table + ['mRS90d', 'dmRS', 'shift_mRS', 'mortality'] + keys_table = list(set(keys_table)) + keys_table = [a for a in keys_table if a in clean_df.columns] + df_table = clean_df[keys_table] + + # plot_correlation_features(df_table, os.path.join(statistics_path, + # 'Correlation_{}_table.png'.format(t))) + + for k in keys_table: + print(k) + if k in ['mRS90d', 'dmRS', 'shift_mRS', 'mortality']: + continue + p_values = [statistics.loc[k, target] for target in ['p_dmRS', 'p_mortality']] + type_k = data_dicts[t][k]['type'] + if type_k in ['cat', 'ord']: + plot_distribution_categorical(clean_df, k, + table=data_dicts[t], title='Distribution {}'.format(k), + out_dir='C:/Users/martinca1/PhD/Projects/AI_Stroke/out/results/SR_results/data_exploration/features/{}'. 
+ format(t), + p_values=p_values) + + elif type_k in ['int', 'float']: + print('int') + + plot_distribution_numerical(clean_df, k, + title='Distribution {}'.format(k), + out_dir='C:/Users/martinca1/PhD/Projects/AI_Stroke/out/results/SR_results/data_exploration/features/{}' + .format(t), + p_values=p_values) + +plt.close() diff --git a/Experiments/2-VariableSelection.py b/Experiments/2-VariableSelection.py new file mode 100644 index 0000000..1205afa --- /dev/null +++ b/Experiments/2-VariableSelection.py @@ -0,0 +1,162 @@ +from IO_utils.clean_table import clean_table +from IO_utils.List_Reader import TableReader +from IO_utils.split_utils import split_data_cv +from IO_utils.FeaturePreprocessing import FeaturePreprocessing +from IO_utils.Dataloader import MyDataLoader +from _utils.Result_container import Result_container +from train import train_model +from train_graph import train_model_graph +from architectures.ML_algorithms import apply_LR, apply_mlp, apply_random_forest, apply_xgbBoost +# %% DATALOADIND +import torch +import os + +## Clean original table +excel_dir = "../../data/TheList_anonymous_mv.xlsx" +clean_df = clean_table(excel_dir=excel_dir, pre_mRS=2) + +# Given a clean table get features and labels +table = TableReader(input_df=clean_df, tables=['all_timepoints'], data_dictionaries='timepoints', mv_strategy='median', + output_feature=['dmRS']) + +output_vector = table.output_vector +meta_vector = table.meta_df + +fold_indices = split_data_cv(output_vector, seed=5, cv=5) + +methods = ['all', 'p_value', 'random_forest', 'mrmr'] +#methods = ['mrmr'] +Result_c = Result_container(target_metrics=['auc', 'accuracy', 'balanced_accuracy', 'f1', 'cm'], + output=['FCN', 'LR', 'RF', 'MLP', 'XGB', 'Graph']) +#### From all variables evaluate different variable selection methods + +for method in methods: + + for k in [3, 5, 10, 15, 20, 30, 50]: + #for k in [5,10]: + if method == 'all' and k > 3: + continue + + features = table.select_features(method=method, k=k, fold_indices=fold_indices) + + feature_vector = table.final_df[features] + FP = FeaturePreprocessing(feature_vector, table.selected_d) + feature_vector = FP.create_features(feature_vector) + + config = {'lr': 0.001, + 'momentum': 0, + 'weight_decay': 0.001, + 'layers': {'number': 3, + 'layer1': 80, + 'layer2': 20 + + }, + 'dropout': 0, + 'classification': True, + 'out_classes': 2} + + config_graph = { + 'Age': True, + 'beta_Age': 2, + 'Sex': False, + 'pre-mRS': True, + 'beta_mRS': 1, + 'NIHSS': False, + 'beta_NIHSS': 1 + + } + metrics_FCN = {} + metrics_LR = {} + metrics_RF = {} + metrics_MLP = {} + metrics_XGB = {} + metrics_Graph = {} + for f in range(5): + # FCN + ("Training FCN of fold {}".format(f)) + dataloader_fold = MyDataLoader(feature_vector, output_vector, fold_indices[f], + table.selected_d,load_images=False, meta=meta_vector, one_hot=True) + + dl = dataloader_fold.get_loaders() + dl_graph = dataloader_fold.build_graph(config_graph) + + torch.manual_seed(0) + print('-----------------Training FCN--------------------- ') + + result, model = train_model(config, loaders=dl) + metrics_FCN['Fold {0}'.format(f)] = result['val_metrics'] + print('-------------------------------------- ') + + # LR + print('-----------------Training LR--------------------- ') + _, result_LR = apply_LR(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_LR['Fold {0}'.format(f)] = result_LR[1] + print("AUC of 
training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_LR[0]['auc'], + result_LR[1]['auc'], + result_LR[2][ + 'auc'])) + print('-------------------------------------- ') + + # RF + print('-----------------Training RF--------------------- ') + + _, result_RF = apply_random_forest(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_RF['Fold {0}'.format(f)] = result_RF[1] + print("AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_RF[0]['auc'], + result_RF[1]['auc'], + result_RF[2][ + 'auc'])) + print('-------------------------------------- ') + # MLP + print('-----------------Training MLP--------------------- ') + + _, result_MLP = apply_mlp(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_MLP['Fold {0}'.format(f)] = result_MLP[1] + print( + "AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_MLP[0]['auc'], + result_MLP[1]['auc'], + result_MLP[2]['auc'])) + print('-------------------------------------- ') + # XGB Boost + print('-----------------Training XGB Boost--------------------- ') + + _, result_XGB = apply_xgbBoost(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_XGB['Fold {0}'.format(f)] = result_XGB[1] + print( + "AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_XGB[0]['auc'], + result_XGB[1]['auc'], + result_XGB[2]['auc'])) + print('-------------------------------------- ') + print('-----------------Training Graph--------------------- ') + + result, _ = train_model_graph(config, loaders=dl_graph, indices=fold_indices[f]) + metrics_Graph['Fold {0}'.format(f)] = result['val_metrics'] + print('-------------------------------------- ') + + if method == 'all': + Result_c.update('FCN', method, metrics_FCN) + Result_c.update('LR', method, metrics_LR) + Result_c.update('RF', method, metrics_RF) + Result_c.update('MLP', method, metrics_MLP) + Result_c.update('XGB', method, metrics_XGB) + Result_c.update('Graph', method, metrics_Graph) + + else: + Result_c.update('FCN', method + '_' + str(k), metrics_FCN) + Result_c.update('LR', method + '_' + str(k), metrics_LR) + Result_c.update('RF', method + '_' + str(k), metrics_RF) + Result_c.update('MLP', method + '_' + str(k), metrics_MLP) + Result_c.update('XGB', method + '_' + str(k), metrics_XGB) + Result_c.update('Graph', method + '_' + str(k), metrics_Graph) + + #Saves results on validation set + output_dir = 'C:/Users/martinca1/PhD/Projects/AI_Stroke/out/results/preliminary results' + Result_c.save(output_dir=output_dir, name='Variable_selection_val') diff --git a/Experiments/3-MVStrategy.py b/Experiments/3-MVStrategy.py new file mode 100644 index 0000000..9b960ee --- /dev/null +++ b/Experiments/3-MVStrategy.py @@ -0,0 +1,150 @@ +from IO_utils.clean_table import clean_table +from IO_utils.List_Reader import TableReader +from IO_utils.split_utils import split_data_cv +from IO_utils.FeaturePreprocessing import FeaturePreprocessing +from IO_utils.Dataloader import MyDataLoader +from _utils.Result_container import Result_container +from train import train_model +from train_graph import 
train_model_graph +from architectures.ML_algorithms import apply_LR, apply_mlp, apply_random_forest, apply_xgbBoost +# %% DATALOADIND +import torch +import os + +## Clean original table +excel_dir = "../../data/TheList_anonymous_mv.xlsx" +clean_df = clean_table(excel_dir=excel_dir, pre_mRS=2) + +# Given a clean table get features and labels +strategies = [ 'median', 'knn', 'mice'] +Result_c = Result_container(target_metrics=['auc', 'accuracy', 'balanced_accuracy', 'f1', 'cm'], + output=['FCN', 'LR', 'RF', 'MLP', 'XGB', 'Graph']) +#### From all variables evaluate different variable selection methods + +for s in strategies: + + table = TableReader(input_df=clean_df, tables=['all_timepoints'], data_dictionaries='timepoints', + mv_strategy=s, + output_feature=['dmRS']) + + output_vector = table.output_vector + meta_vector = table.meta_df + + fold_indices = split_data_cv(output_vector, seed=5, cv=5) + features = table.select_features(method='mrmr', k=10, fold_indices=fold_indices) + + feature_vector = table.final_df[features] + FP = FeaturePreprocessing(feature_vector, table.selected_d) + feature_vector = FP.create_features(feature_vector) + + config = {'lr': 0.001, + 'momentum': 0, + 'weight_decay': 0.001, + 'layers': {'number': 3, + 'layer1': 40, + 'layer2': 20 + + }, + 'dropout': 0, + 'classification': True, + 'out_classes': 2} + + metrics_FCN = {} + metrics_LR = {} + metrics_RF = {} + metrics_MLP = {} + metrics_XGB = {} + metrics_Graph = {} + + config_graph = { + 'Age': True, + 'beta_Age': 3, + 'Sex': False, + 'pre-mRS': True, + 'beta_mRS': 1, + 'NIHSS': False, + 'beta_NIHSS': 1 + + } + for f in range(5): + ("Training fold {}".format(f)) + + # GCN + print('-----------------Training GCN--------------------- ') + + dataloader_fold = MyDataLoader(feature_vector, output_vector, fold_indices[f], + table.selected_d, load_images=False,meta =meta_vector, one_hot=True) + dl_graph = dataloader_fold.build_graph(config_graph) + torch.manual_seed(0) + + result, _ = train_model_graph(config, loaders=dl_graph, indices=fold_indices[f]) + metrics_Graph['Fold {0}'.format(f)] = result['test_metric'] + + # FCN + print('-----------------Training LR--------------------- ') + + + dl = dataloader_fold.get_loaders() + torch.manual_seed(0) + + result, model = train_model(config, loaders=dl) + metrics_FCN['Fold {0}'.format(f)] = result['test_metric'] + # LR + print('-----------------Training LR--------------------- ') + _, result_LR = apply_LR(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_LR['Fold {0}'.format(f)] = result_LR[2] + print("AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_LR[0]['auc'], + result_LR[1]['auc'], + result_LR[2][ + 'auc'])) + print('-------------------------------------- ') + + # RF + print('-----------------Training RF--------------------- ') + + _, result_RF = apply_random_forest(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_RF['Fold {0}'.format(f)] = result_RF[2] + print("AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_RF[0]['auc'], + result_RF[1]['auc'], + result_RF[2][ + 'auc'])) + print('-------------------------------------- ') + # MLP + print('-----------------Training MLP--------------------- ') + + _, result_MLP = 
apply_mlp(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_MLP['Fold {0}'.format(f)] = result_MLP[2] + print( + "AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_MLP[0]['auc'], + result_MLP[1]['auc'], + result_MLP[2]['auc'])) + print('-------------------------------------- ') + # XGB Boost + print('-----------------Training XGB Boost--------------------- ') + + _, result_XGB = apply_xgbBoost(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_XGB['Fold {0}'.format(f)] = result_XGB[2] + print( + "AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_XGB[0]['auc'], + result_XGB[1]['auc'], + result_XGB[2]['auc'])) + print('-------------------------------------- ') + + Result_c.update('FCN', s, metrics_FCN) + Result_c.update('LR', s, metrics_LR) + Result_c.update('RF', s, metrics_RF) + Result_c.update('MLP', s, metrics_MLP) + Result_c.update('XGB', s, metrics_XGB) + + Result_c.update('Graph', s, metrics_Graph) + +output_dir = 'C:/Users/martinca1/PhD/Projects/AI_Stroke/out/results/preliminary results' +Result_c.save(output_dir=output_dir, name='MVStrategy_selection_test_mrmr10') diff --git a/Experiments/4-Timepoints.py b/Experiments/4-Timepoints.py new file mode 100644 index 0000000..c6d15f9 --- /dev/null +++ b/Experiments/4-Timepoints.py @@ -0,0 +1,164 @@ +from IO_utils.clean_table import clean_table +from IO_utils.List_Reader import TableReader +from IO_utils.split_utils import split_data_cv +from IO_utils.FeaturePreprocessing import FeaturePreprocessing +from IO_utils.Dataloader import MyDataLoader +from _utils.Result_container import Result_container +from train import train_model +from train_graph import train_model_graph +from architectures.ML_algorithms import apply_LR, apply_mlp, apply_random_forest, apply_xgbBoost +# %% DATALOADIND +import torch +import os + +## Clean original table +excel_dir = "../../data/TheList_anonymous_mv.xlsx" +clean_df = clean_table(excel_dir=excel_dir, pre_mRS=2) + +# Given a clean table get features and labels +tables_modalities = [ + #['NCCT', 'CTP', 'CTA'], + #['NCCT', 'CTP', 'CTA', 'Treatment', 'Treatment_out'], + #['NCCT', 'CTP', 'CTA', 'Treatment', 'Treatment_out', 'Control CT'], + + #['Metadata', 'NCCT', 'CTP', 'CTA'], + #['Metadata', 'NCCT', 'CTP', 'CTA', 'Treatment', 'Treatment_out'], + #['Metadata', 'NCCT', 'CTP', 'CTA', 'Treatment', 'Treatment_out', 'Control CT'], + + #['Metadata'], + #['Clinical'], + #['Metadata', 'Clinical'], + #['Treatment', 'Treatment_out', 'Metadata', 'Clinical']] + ['NCCT', 'CTP', 'CTA', 'Metadata', 'Clinical'], + ['NCCT', 'CTP', 'CTA', 'Treatment', 'Treatment_out', 'Metadata', 'Clinical'], + ['NCCT', 'CTP', 'CTA', 'Treatment', 'Treatment_out', 'Control CT', 'Metadata', 'Clinical']] + + +Result_c = Result_container(target_metrics=['auc', 'accuracy', 'balanced_accuracy', 'f1', 'cm'], + output=['FCN', 'LR', 'RF', 'MLP', 'XGB', 'Graph']) +#### From all variables evaluate different variable selection methods + +for t in tables_modalities: + + table = TableReader(input_df=clean_df, tables=t, data_dictionaries='modalities', + mv_strategy='median', + output_feature=['mortality']) + + output_vector = table.output_vector + meta_vector = table.meta_df + + fold_indices = 
split_data_cv(output_vector, seed=5, cv=5) + print(t) + features = table.select_features(method='mrmr', k=10, fold_indices=fold_indices) + + feature_vector = table.final_df[features] + FP = FeaturePreprocessing(feature_vector, table.selected_d) + feature_vector = FP.create_features(feature_vector) + + config = {'lr': 0.001, + 'momentum': 0, + 'weight_decay': 0.001, + 'layers': {'number': 3, + 'layer1': 40, + 'layer2': 20 + + }, + 'dropout': 0, + 'classification': True, + 'out_classes': 2} + + config_graph = { + 'Age': True, + 'beta_Age': 4, + 'Sex': True, + 'pre-mRS': False, + 'beta_mRS': 1, + 'NIHSS': False, + 'beta_NIHSS': 1 + + } + metrics_FCN = {} + metrics_LR = {} + metrics_RF = {} + metrics_MLP = {} + metrics_XGB = {} + metrics_Graph = {} + for f in range(5): + # GCN + + dataloader_fold = MyDataLoader(feature_vector, output_vector, fold_indices[f], + table.selected_d, load_images=False, meta=meta_vector, one_hot=True) + dl_graph = dataloader_fold.build_graph(config_graph) + torch.manual_seed(0) + + result, _ = train_model_graph(config, loaders=dl_graph, indices=fold_indices[f]) + metrics_Graph['Fold {0}'.format(f)] = result['test_metric'] + + + # FCN + ("Training FCN of fold {}".format(f)) + + dl = dataloader_fold.get_loaders() + torch.manual_seed(0) + + result, model = train_model(config, loaders=dl) + metrics_FCN['Fold {0}'.format(f)] = result['test_metric'] + + # LR + print('-----------------Training LR--------------------- ') + _, result_LR = apply_LR(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_LR['Fold {0}'.format(f)] = result_LR[2] + print("AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_LR[0]['auc'], + result_LR[1]['auc'], + result_LR[2][ + 'auc'])) + print('-------------------------------------- ') + + # RF + print('-----------------Training RF--------------------- ') + + _, result_RF = apply_random_forest(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_RF['Fold {0}'.format(f)] = result_RF[2] + print("AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_RF[0]['auc'], + result_RF[1]['auc'], + result_RF[2][ + 'auc'])) + print('-------------------------------------- ') + # MLP + print('-----------------Training MLP--------------------- ') + + _, result_MLP = apply_mlp(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_MLP['Fold {0}'.format(f)] = result_MLP[2] + print( + "AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_MLP[0]['auc'], + result_MLP[1]['auc'], + result_MLP[2]['auc'])) + print('-------------------------------------- ') + # XGB Boost + print('-----------------Training XGB Boost--------------------- ') + + _, result_XGB = apply_xgbBoost(dataloader_fold.train_features, dataloader_fold.val_features, + dataloader_fold.test_features, dataloader_fold.train_output, + dataloader_fold.val_outout, dataloader_fold.test_outout) + metrics_XGB['Fold {0}'.format(f)] = result_XGB[2] + print( + "AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f} ".format(result_XGB[0]['auc'], + result_XGB[1]['auc'], + 
result_XGB[2]['auc'])
+        print('-------------------------------------- ')
+
+    Result_c.update('FCN', '_'.join(t), metrics_FCN)
+    Result_c.update('LR', '_'.join(t), metrics_LR)
+    Result_c.update('RF', '_'.join(t), metrics_RF)
+    Result_c.update('MLP', '_'.join(t), metrics_MLP)
+    Result_c.update('XGB', '_'.join(t), metrics_XGB)
+    Result_c.update('Graph', '_'.join(t), metrics_Graph)
+
+output_dir = 'C:/Users/martinca1/PhD/Projects/AI_Stroke/out/results/SR_results'
+Result_c.save(output_dir=output_dir, name='Timepoints_K10_mortality')
\ No newline at end of file
diff --git a/Experiments/5-Uncertainty.py b/Experiments/5-Uncertainty.py
new file mode 100644
index 0000000..5940be7
--- /dev/null
+++ b/Experiments/5-Uncertainty.py
@@ -0,0 +1,100 @@
+from IO_utils.clean_table import clean_table
+from IO_utils.List_Reader import TableReader
+from IO_utils.split_utils import split_data_cv
+from IO_utils.FeaturePreprocessing import FeaturePreprocessing
+from IO_utils.Dataloader import MyDataLoader
+
+from evaluate_model import test, get_metrics_unc, plot_selectedsamples_metrics, plot_uncetainties
+from train import train_model
+import torch
+import os
+import numpy as np
+# %% DATALOADING
+
+## Clean original table
+excel_dir = "../../data/TheList_anonymous_mv.xlsx"
+clean_df = clean_table(excel_dir=excel_dir, pre_mRS=2)
+
+# Given a clean table get features and labels
+table = TableReader(input_df=clean_df, tables=['all_timepoints'], data_dictionaries='timepoints', mv_strategy='median',
+                    output_feature=['dmRS'])
+
+output_vector = table.output_vector
+
+fold_indices = split_data_cv(output_vector, seed=5, cv=5)
+
+features = table.select_features(method='mrmr', k=10, fold_indices=fold_indices)
+
+feature_vector = table.final_df[features]
+FP = FeaturePreprocessing(feature_vector, table.selected_d)
+feature_vector = FP.create_features(feature_vector)
+
+config = {'lr': 0.01,
+          'momentum': 0,
+          'weight_decay': 0.001,
+          'layers': {'number': 3,
+                     'layer1': 40,
+                     'layer2': 20,
+
+                     },
+          'dropout': 0,
+          'classification': True,
+          'out_classes': 2}
+
+########
+mean_preds = []
+combined = []
+epistemic = []
+cls = []
+results_pred = {}
+results_epis = {}
+for p in [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]:
+    results_pred[p] = {}
+    results_epis[p] = {}
+
+for k in range(5):
+    dataloader_fold = MyDataLoader(feature_vector, output_vector, fold_indices[k],
+                                   table.selected_d, one_hot=True)
+    dl = dataloader_fold.get_loaders()
+    fold_name = "C:/Users/martinca1/PhD/Projects/AI_Stroke/out/models/FCNEnsemble"
+
+    torch.manual_seed(0)
+    for i in range(10):
+
+        if not os.path.exists(fold_name):
+            os.mkdir(fold_name)
+
+        path_model = os.path.join(fold_name, "model_{}_fold_{}.pt".format(i, k))
+        if os.path.isfile(path_model):
+            continue
+        else:
+            print("Training model {} of fold {}".format(i, k))
+            _, model = train_model(config, loaders=dl)
+            torch.save(model, path_model)
+
+    state_dict_paths = [os.path.join(fold_name, "model_{}_fold_{}.pt".format(i, k)) for i in range(10)]
+
+    pred, unc, epistemic_unc, y = test(config, dl[2], state_dict_paths)
+    mean_preds.extend(pred.tolist())
+    combined.extend(unc.tolist())
+    epistemic.extend(epistemic_unc.tolist())
+    cls.extend(y.tolist())
+
+################### Test
+p = np.array(mean_preds)
+y = np.array(cls)
+c = np.array(combined)
+e = np.array(epistemic)
+
+# with pd.ExcelWriter("C:/Users/martinca1/PhD/Projects/AI_Stroke/out/uncertainty/predictive_uncertainty.xlsx") as writer:
+for per in [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]:
+    results_pred[per] = get_metrics_unc(c, p,
y, per) + +plot_selectedsamples_metrics(c, p, y, uncertainty='Predictive') +plot_selectedsamples_metrics(e, p, y, uncertainty='Epistemic') +plot_uncetainties(p, y, c, e) +import matplotlib.pyplot as plt +plt.show() + + + diff --git a/Experiments/6-Graph_constr.py b/Experiments/6-Graph_constr.py new file mode 100644 index 0000000..a37da69 --- /dev/null +++ b/Experiments/6-Graph_constr.py @@ -0,0 +1,116 @@ +from IO_utils.clean_table import clean_table +from IO_utils.List_Reader import TableReader +from IO_utils.split_utils import split_data_cv +from IO_utils.FeaturePreprocessing import FeaturePreprocessing +from IO_utils.Dataloader import MyDataLoader +from _utils.Result_container import Result_container +from train_graph import train_model_graph +from sklearn.model_selection import ParameterGrid +from architectures.ML_algorithms import apply_LR, apply_mlp, apply_random_forest, apply_xgbBoost +# %% DATALOADIND +import torch +import os + +## Clean original table +excel_dir = "../../data/TheList_anonymous_mv.xlsx" +clean_df = clean_table(excel_dir=excel_dir, pre_mRS=2) + +# Given a clean table get features and labels +tables_modalities = ['all_timepoints'] +Result_c = Result_container(target_metrics=['auc', 'accuracy', 'balanced_accuracy', 'f1', 'cm'], + output=['Graph']) +#### From all variables evaluate different variable selection methods + + + +table = TableReader(input_df=clean_df, tables=['all_timepoints'], data_dictionaries='timepoints', + mv_strategy='median', + output_feature=['dmRS']) + +output_vector = table.output_vector +meta_vector = table.meta_df +fold_indices = split_data_cv(output_vector, seed=5, cv=5) +features = table.select_features(method='mrmr', k=10, fold_indices=fold_indices) +feature_vector = table.final_df[features] + +FP = FeaturePreprocessing(feature_vector, table.selected_d) +feature_vector = FP.create_features(feature_vector) + +config = {'lr': 0.001, + 'momentum': 0, + 'weight_decay': 0.001, + 'layers': {'number': 3, + 'layer1': 40, + 'layer2': 20 + + }, + 'dropout': 0, + 'classification': True, + 'out_classes': 2} +metrics = {} + + +grid_config_graph = [ + +{ + 'Age': [True], + 'beta_Age': [1, 2, 3, 4, 5], + 'Sex': [True, False], + 'pre-mRS': [True], + 'beta_mRS': [1, 2], + 'NIHSS': [False] +}, + +{ + 'Age': [True], + 'beta_Age': [1, 2, 3, 4, 5], + 'Sex': [True, False], + 'pre-mRS': [False], + 'NIHSS': [False]}, + +{ + 'Age': [False], + 'Sex': [True, False], + 'pre-mRS': [True], + 'beta_mRS': [1, 2], + 'NIHSS': [False]}, + { + 'Age': [False], + 'Sex': [True], + 'pre-mRS': [False], + 'NIHSS': [False]}, + +] + +"""grid_config_graph = { + 'Age': [True, False], + 'beta_Age': [1, 2, 3, 4, 5], + 'Sex': [True, False], + 'pre-mRS': [True, False], + 'beta_mRS': [1, 2], + 'NIHSS': [True, False], + 'beta_NIHSS': [1, 3, 5, 7, 10] +}""" +grid = ParameterGrid(grid_config_graph) + +for config_graph in grid: + print(config_graph) + for f in range(5): + # FCN + ("Training FCN of fold {}".format(f)) + dataloader_fold = MyDataLoader(feature_vector, output_vector, fold_indices[f], + table.selected_d,load_images=False, meta= meta_vector, one_hot=True) + + loader = dataloader_fold.build_graph(config_graph) + + torch.manual_seed(0) + result, model = train_model_graph(config, loaders=loader, indices=fold_indices[f]) + metrics['Fold {0}'.format(f)] = result['val_metrics'] + # LR + + print('-------------------------------------- ') + import json + Result_c.update('Graph', json.dumps(config_graph), metrics) + + output_dir = 
'C:/Users/martinca1/PhD/Projects/AI_Stroke/out/results/preliminary results' + Result_c.save(output_dir=output_dir, name='Graph_grid_validation_new') diff --git a/Experiments/7-Graph_model.py b/Experiments/7-Graph_model.py new file mode 100644 index 0000000..a3fa88c --- /dev/null +++ b/Experiments/7-Graph_model.py @@ -0,0 +1,82 @@ +from IO_utils.clean_table import clean_table +from IO_utils.List_Reader import TableReader +from IO_utils.split_utils import split_data_cv +from IO_utils.FeaturePreprocessing import FeaturePreprocessing +from IO_utils.Dataloader import MyDataLoader +from _utils.Result_container import Result_container +from train_graph import train_model_graph +from sklearn.model_selection import ParameterGrid +from architectures.ML_algorithms import apply_LR, apply_mlp, apply_random_forest, apply_xgbBoost +# %% DATALOADIND +import torch +import os + +## Clean original table +excel_dir = "../../data/TheList_anonymous_mv.xlsx" +clean_df = clean_table(excel_dir=excel_dir, pre_mRS=2) + +# Given a clean table get features and labels +tables_modalities = ['all_timepoints'] +Result_c = Result_container(target_metrics=['auc', 'accuracy', 'balanced_accuracy', 'f1', 'cm'], + output=['Graph']) +#### From all variables evaluate different variable selection methods + + +output_feature = 'dmRS' +table = TableReader(input_df=clean_df, tables=['all_timepoints'], data_dictionaries='timepoints', + mv_strategy='median', + output_feature=[output_feature]) + +output_vector = table.output_vector +meta_vector = table.meta_df +fold_indices = split_data_cv(output_vector, seed=5, cv=5) +features = table.select_features(method='mrmr', k=10, fold_indices=fold_indices) +feature_vector = table.final_df[features] + +FP = FeaturePreprocessing(feature_vector, table.selected_d) +feature_vector = FP.create_features(feature_vector) + +config = {'lr': 0.001, + 'momentum': 0, + 'weight_decay': 0.001, + 'layers': {'number': 3, + 'layer1': 40, + 'layer2': 20 + + }, + 'dropout': 0, + 'classification': True, + 'out_classes': 2} +metrics = {} + +config_graph = { + 'Age': True, + 'beta_Age': 3, + 'Sex': False, + 'pre-mRS': True, + 'beta_mRS': 1, + 'NIHSS': False, + 'beta_NIHSS': 1 + + } + + +for f in range(5): + # FCN + ("Training FCN of fold {}".format(f)) + dataloader_fold = MyDataLoader(feature_vector, output_vector, fold_indices[f], + table.selected_d,load_images=False, meta= meta_vector, + one_hot=True) + + loader = dataloader_fold.build_graph(config_graph) + + torch.manual_seed(0) + result, model = train_model_graph(config, loaders=loader, indices=fold_indices[f]) + metrics['Fold {0}'.format(f)] = result['test_metric'] + # LR + + print('-------------------------------------- ') +Result_c.update('Graph', 'Results', metrics) + +output_dir = 'C:/Users/martinca1/PhD/Projects/AI_Stroke/out/results/SR_results' +Result_c.save(output_dir=output_dir, name='Graph_{}'.format(output_feature)) \ No newline at end of file diff --git a/Experiments/8-Uncertainty_graph.py b/Experiments/8-Uncertainty_graph.py new file mode 100644 index 0000000..4635eca --- /dev/null +++ b/Experiments/8-Uncertainty_graph.py @@ -0,0 +1,145 @@ +from IO_utils.clean_table import clean_table +from IO_utils.List_Reader import TableReader +from IO_utils.split_utils import split_data_cv +from IO_utils.FeaturePreprocessing import FeaturePreprocessing +from IO_utils.Dataloader import MyDataLoader + +from evaluate_model import test_graph, get_metrics_unc, plot_selectedsamples_metrics, plot_uncetainties, \ + plot_age_uncertainty, plot_boxplots +from train_graph 
import train_model_graph +import torch +import os +import numpy as np +import matplotlib.pyplot as plt +from sklearn.model_selection import ParameterGrid + +# %% DATALOADIND + +## Clean original table +excel_dir = "../../data/TheList_anonymous_mv.xlsx" +clean_df = clean_table(excel_dir=excel_dir, pre_mRS=2) + +# Given a clean table get features and labels +table = TableReader(input_df=clean_df, tables=['all_timepoints'], data_dictionaries='timepoints', mv_strategy='median', + output_feature=['mortality']) + +output_vector = table.output_vector + +fold_indices = split_data_cv(output_vector, seed=5, cv=5) + +features = table.select_features(method='mrmr', k=10, fold_indices=fold_indices) + +feature_vector = table.final_df[features] +metadata_vector = table.meta_df +FP = FeaturePreprocessing(feature_vector, table.selected_d) +feature_vector = FP.create_features(feature_vector) + +"""config = {'lr': 0.001, + 'momentum': 0, + 'weight_decay': 0.001, + 'layers': {'number': 5, + 'layer1': 96, + 'layer2': 32, + 'layer3': 8, + 'layer4': 4 + + }, + 'dropout': 0.2, + 'classification': True, + 'out_classes': 2}""" +config = {'lr': 0.001, + 'momentum': 0, + 'weight_decay': 0.001, + 'layers': {'number': 3, + 'layer1': 40, + 'layer2': 20 + + }, + 'dropout': 0, + 'classification': True, + 'out_classes': 2} +######## +mean_preds = [] +combined = [] +epistemic = [] +cls = [] +ages = [] +results_pred = {} +results_epis = {} +for p in [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]: + results_pred[p] = {} + results_epis[p] = {} + +for k in range(5): + + dataloader_fold = MyDataLoader(feature_vector, output_vector, fold_indices[k], + table.selected_d, load_images=False, meta=metadata_vector, + one_hot=True) + ages_fold = dataloader_fold.meta['Age'].values[fold_indices[k][2]] + + """config_graph = { + 'Age': True, + 'beta_Age': 3, + 'Sex': False, + 'pre-mRS': True, + 'beta_mRS': 1, + 'NIHSS': False, + 'beta_NIHSS': 3 + + }""" + config_graph = { + 'Age': True, + 'beta_Age': 4, + 'Sex': True, + 'pre-mRS': False, + 'beta_mRS': 1, + 'NIHSS': False, + 'beta_NIHSS': 3 + + } + fold_name = "C:/Users/martinca1/PhD/Projects/AI_Stroke/out/models/EdgeDropout3_mortality" + loader = dataloader_fold.build_graph(config_graph=config_graph) + torch.manual_seed(0) + for i in range(10): + + if not os.path.exists(fold_name): + os.mkdir(fold_name) + + path_model = os.path.join(fold_name, "model_{}_fold_{}.pt".format(i, k)) + if os.path.isfile(path_model): + continue + else: + print("Training model {} of fold {}".format(i, k)) + _, model = train_model_graph(config, loaders=loader, indices=fold_indices[k]) + torch.save(model, path_model) + + state_dict_paths = [os.path.join(fold_name, "model_{}_fold_{}.pt".format(i, k)) for i in range(10)] + + pred, unc, epistemic_unc, y = test_graph(config, loader, fold_indices[k][2], state_dict_paths) + + ages.extend(ages_fold.tolist()) + mean_preds.extend(pred.tolist()) + combined.extend(unc.tolist()) + epistemic.extend(epistemic_unc.tolist()) + cls.extend(y.tolist()) + +################### Test +p = np.array(mean_preds) +y = np.array(cls) +c = np.array(combined) +e = np.array(epistemic) +a = np.array(ages) + +# with pd.ExcelWriter("C:/Users/martinca1/PhD/Projects/AI_Stroke/out/uncertainty/predictive_uncertainty.xlsx") as writer: +for th in [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1,0]: + results_pred[th] = get_metrics_unc(c, p, y, th) + + + +plot_boxplots(p,y,uncertainty=c) +#plot_selectedsamples_metrics(c, p, y, uncertainty='Predictive') +#plot_selectedsamples_metrics(e, p, y, 
uncertainty='Epistemic')
+#plot_uncetainties(p, y, c, e)
+#plot_age_uncertainty(p, y, c, a)
+
+plt.show()
diff --git a/Experiments/9-Images.py b/Experiments/9-Images.py
new file mode 100644
index 0000000..677d4a6
--- /dev/null
+++ b/Experiments/9-Images.py
@@ -0,0 +1,55 @@
+from IO_utils.clean_table import clean_table
+from IO_utils.List_Reader import TableReader
+from IO_utils.split_utils import split_data_cv
+from IO_utils.FeaturePreprocessing import FeaturePreprocessing
+from IO_utils.Dataloader import MyDataLoader
+from _utils.Result_container import Result_container
+from train import train_model
+from train_graph import train_model_graph
+from architectures.ML_algorithms import apply_LR, apply_mlp, apply_random_forest, apply_xgbBoost
+# %% DATALOADING
+import torch
+import os
+
+## Clean original table
+excel_dir = "../../data/TheList_anonymous_mv.xlsx"
+clean_df = clean_table(excel_dir=excel_dir, pre_mRS=6)
+
+# Given a clean table get features and labels
+table = TableReader(input_df=clean_df, tables=['all_timepoints'], data_dictionaries='timepoints', mv_strategy='median',
+                    output_feature=['mortality'])
+
+output_vector = table.output_vector
+meta_vector = table.meta_df
+ids_vector = table.patient_ids
+
+
+fold_indices = split_data_cv(output_vector, seed=5, cv=5)
+## Load the images given by fold indices
+
+Result_c = Result_container(target_metrics=['auc', 'accuracy', 'balanced_accuracy', 'f1', 'cm'],
+                            output=['3DCNN'])
+
+#### From all variables evaluate different variable selection methods
+
+for f in range(5):
+    # FCN
+    print("Training FCN of fold {}".format(f))
+    """
+    #dataloader_fold = MyDataLoader(feature_vector, output_vector, fold_indices[f],
+    #                               table.selected_d,meta_vector, one_hot=True)
+
+    dl = dataloader_fold.get_loaders()
+    dl_graph = dataloader_fold.build_graph(config_graph)
+    torch.manual_seed(0)
+
+
+    result, _ = train_model_graph(config, loaders=dl_graph, indices=fold_indices[f])
+    print('-------------------------------------- ')
+
+
+    Result_c.update('Graph', method + '_' + str(k), metrics_Graph)
+"""
+
+output_dir = 'C:/Users/martinca1/PhD/Projects/AI_Stroke/out/results/preliminary results'
+Result_c.save(output_dir=output_dir, name='Variable_selection_Graph_mort_all')
diff --git a/Experiments/_t_test.py b/Experiments/_t_test.py
new file mode 100644
index 0000000..90bf206
--- /dev/null
+++ b/Experiments/_t_test.py
@@ -0,0 +1,62 @@
+import scipy.stats
+import numpy as np
+
+
+
+
+def compute_t_value(mean_base, sd_base, means, sds, samples):
+    # Pooled two-sample t statistic for equal group sizes:
+    # t = (mean_1 - mean_2) / sqrt((sd_1^2 + sd_2^2) / n)
+    for i in range(len(means)):
+        mean_2 = means[i]
+        sd_2 = sds[i]
+        t1 = (mean_base - mean_2)
+        t2 = (np.power(sd_base, 2) + np.power(sd_2, 2)) * (1 / samples)
+        t2 = np.sqrt(t2)
+
+        t = -t1 / t2
+
+        p = 2 * min(scipy.stats.t.cdf(t, samples), scipy.stats.t.cdf(-t, samples))
+        print('Two-sample t-test of {}+-{} against {}+-{} : p_value: {}'.format(mean_base, sd_base,
+                                                                                mean_2, sd_2,
+                                                                                p))
+
+
+
+
+samples = [160, 220, 274, 249]
+
+
+######### Calcaneus
+mean_base = 8.46
+means = [9.12, 8.69]
+sd_base = 0.63
+sds = [1.03, 1.23]
+
+print('Calcaneus')
+compute_t_value(mean_base, sd_base, means, sds, samples[0])
+
+############## Ankle
+mean_base = 6.32
+means = [9.02, 5.86]
+sd_base = 0.25
+sds = [1.59, 0.40]
+
+print('Ankle')
+compute_t_value(mean_base, sd_base, means, sds, samples[1])
+
+############ Knee
+mean_base = 6.8
+means = [7.79, 6.49]
+sd_base = 0.55
+sds = [0.53, 1.05]
+
+print('Knee')
+compute_t_value(mean_base, sd_base, means, sds, samples[2])
+
+########### Wrist
+mean_base = 7.85
+means = [9.59, 9.93]
+sd_base = 0.94
+sds = [2.15, 1.41]
+
+print('Wrist')
+compute_t_value(mean_base, sd_base, means, sds, samples[3])
\ No newline at end of file
diff --git a/IO_utils/Dataloader.py b/IO_utils/Dataloader.py
new file mode 100644
index 0000000..18339b7
--- /dev/null
+++ b/IO_utils/Dataloader.py
@@ -0,0 +1,63 @@
+from torch.utils.data import DataLoader
+from torch_geometric.loader import DataLoader as graph_Dataloader
+from IO_utils.Datasets import MyDataset, Graph_Dataset
+
+
+class MyDataLoader:
+
+    def __init__(self, feature, output, fold_indices, selected_d, load_images=False, patient_ids=None,
+                 meta=None, batch_size=None, one_hot=False):
+        train_indices, val_indices, test_indices = fold_indices
+
+        self.features = feature
+        self.output = output
+        self.meta = meta
+        self.one_hot = one_hot
+        self.patient_ids = patient_ids
+        self.load_images = load_images
+
+        # Default batch size: 80% of the data set, i.e. the whole training split in one batch
+        self.batch_size = batch_size if batch_size is not None else int(0.8 * feature.shape[0])
+
+        train_dataset = MyDataset(feature, output, train_indices,
+                                  data_dictionaries=selected_d, one_hot=one_hot)
+
+        self.train_loader = DataLoader(dataset=train_dataset,
+                                       batch_size=self.batch_size,
+                                       shuffle=True,
+                                       num_workers=0)
+
+        val_dataset = MyDataset(feature, output, val_indices,
+                                data_dictionaries=selected_d, one_hot=one_hot)
+
+        self.val_loader = DataLoader(dataset=val_dataset,
+                                     batch_size=self.batch_size,
+                                     num_workers=0)
+
+        test_dataset = MyDataset(feature, output, test_indices,
+                                 data_dictionaries=selected_d, one_hot=one_hot)
+
+        self.test_loader = DataLoader(dataset=test_dataset,
+                                      batch_size=self.batch_size,
+                                      num_workers=0)
+
+        self.train_features, self.train_output = feature[train_indices, :], output[train_indices].squeeze()
+        self.test_features, self.test_outout = feature[test_indices, :], output[test_indices].squeeze()
+        self.val_features, self.val_outout = feature[val_indices, :], output[val_indices].squeeze()
+
+    def get_loaders(self):
+        return self.train_loader, self.val_loader, self.test_loader
+
+    def build_graph(self, config_graph):
+        # print(df_feature)
+        train_dataset = Graph_Dataset(self.meta,
+                                      self.features, self.output, config_graph, self.one_hot)
+
+        train_loader = graph_Dataloader(dataset=train_dataset.get(),
+                                        batch_size=self.batch_size,
+                                        num_workers=0)
+
+        return train_loader
+
+
+
diff --git a/IO_utils/Datasets.py b/IO_utils/Datasets.py
new file mode 100644
index 0000000..29ac784
--- /dev/null
+++ b/IO_utils/Datasets.py
@@ -0,0 +1,140 @@
+import torch
+import numpy as np
+from torch_geometric.data import Data
+from scipy.spatial.distance import correlation
+
+class MyDataset:
+
+    def __init__(self, features, labels, indices, data_dictionaries, one_hot=False,
+                 images=False, patient_ids=None):
+
+        self.features = features[indices, :]
+        self.labels = labels[indices]
+        self.patient_indices = patient_ids[indices, 0] if patient_ids is not None else None
+        self.data_dictionaries = data_dictionaries
+        self.one_hot = one_hot
+        self.images = images
+        self.patients_ids = patient_ids
+
+        self.volumes = {}
+        if self.images:
+
+            self.load_images()
+
+    def load_images(self):
+        # Load images (placeholder: the volumes are initialised to None)
+        for i in range(len(self.patient_indices)):
+            # Load volume
+            self.volumes[i] = {}
+            for modality in ['NCCT', 'CTP']:
+                # Load modality
+                self.volumes[i][modality] = None
+
+
+    def __len__(self):
+        return self.features.shape[0]
+
+    def __getitem__(self, i):
+        # Get features
+        features = self.features[i, :]
+        x = torch.from_numpy(features.squeeze()).float()
+        if self.one_hot:
+            label = np.zeros(np.max(self.labels) + 1)
+            label[self.labels[i]] = 1
y = torch.from_numpy(label) + else: + y = torch.from_numpy(np.array(self.labels[i])).float() + data = {'x': x, + 'y': y + } + + if self.images: + for m in ['NCCT', 'CTP']: + data[m]= self.volumes[i][m] + + + + return data + +class Graph_Dataset: + + def __init__(self, meta, features, labels, config_graph, one_hot): + self.meta = meta + self.features = features + self.labels = labels + + x = torch.tensor(features, dtype=torch.float) + + if one_hot: + labels_onehot = np.zeros((labels.shape[0], len(np.unique(labels)))) + for i in range(labels.shape[0]): + labels_onehot[i, labels[i]] = 1 + y = torch.tensor(labels_onehot, dtype=torch.float) + else: + y = torch.tensor(labels, dtype=torch.float) + + self.edge_index, self.weights = self.create_graph(config_graph) + + self.data = Data(x=x, y=y, edge_index=self.edge_index, weights=self.weights, labels=labels) + print("Number of nodes ", self.data.num_nodes) + print("Number of features ", self.data.num_node_features) + print("Number of edges ", self.data.num_edges) + print("Graph contain isolated nodes ", self.data.contains_isolated_nodes()) + # self.plot_graph() + + def len(self): + return self.meta.shape[0] + + def get(self): + + return [self.data] + + def create_graph(self, config_graph): + + v1 = [] + v2 = [] + + nodes = self.meta.shape[0] + weights = [] + + for i in range(nodes): + for ii in range(nodes): + if i != ii: + condition_Age = np.abs(self.meta['Age'].values[i] - self.meta['Age'].values[ii]) < \ + config_graph['beta_Age'] if config_graph['Age'] else True + condition_Sex = self.meta['Sex'].values[i] == self.meta['Sex'].values[ii] \ + if config_graph['Sex'] else True + condition_mRS = (self.meta['pre-mRS'].values[i] - self.meta['pre-mRS'].values[ii]) < \ + config_graph['beta_mRS'] if config_graph['pre-mRS'] else True + condition_NIHSS = (self.meta['NIHSS'].values[i] - self.meta['NIHSS'].values[ii]) < \ + config_graph['beta_NIHSS'] if config_graph['NIHSS'] else True + + if condition_Age and condition_Sex and condition_mRS and condition_NIHSS: + v1.append(i) + v2.append(ii) + dist = correlation(self.features[i, :], self.features[ii, :]) + if np.isnan(dist): + dist = 1.0 + weights.extend([dist]) + for i in range(nodes): + if i not in v1: + #print('Node {0} not connected'.format(i)) + #print('Age {0}, Sex {1}, pre-mRS {2} and NIHSS {3} '.format(self.meta['Age'].values[i], + # self.meta['Sex'].values[i], + # self.meta['pre-mRS'].values[i], + # self.meta['NIHSS'].values[i])) + # + corr = [correlation(self.meta.iloc[i, :], self.meta.iloc[ii, :]) for ii in range(nodes)] + sorted_corr = np.argsort(corr) + for j in range(1): + v1.append(i) + v2.append(sorted_corr[j+1]) + weights.extend([corr[sorted_corr[j+1]]]) + v2.append(i) + v1.append(sorted_corr[j+1]) + weights.extend([corr[sorted_corr[j+1]]]) + + edge_index = torch.tensor([v1, v2], dtype=torch.long) + weight_vector = torch.tensor(weights, dtype=torch.float) + + return edge_index, weight_vector diff --git a/IO_utils/FeaturePreprocessing.py b/IO_utils/FeaturePreprocessing.py new file mode 100644 index 0000000..d31fbbd --- /dev/null +++ b/IO_utils/FeaturePreprocessing.py @@ -0,0 +1,109 @@ +import copy +import numpy as np + +class FeaturePreprocessing: + """ + Normalize numerical values and transform to one-hot vector encoding categorical values + :param df: Input data frame + :param data_dictionaries: dictionaries with information about the features + :param exclude: columns that have to be excluded from the feature vector + :return: + """ + + def __init__(self, df, data_dictionaries, 
diff --git a/IO_utils/FeaturePreprocessing.py b/IO_utils/FeaturePreprocessing.py
new file mode 100644
index 0000000..d31fbbd
--- /dev/null
+++ b/IO_utils/FeaturePreprocessing.py
@@ -0,0 +1,109 @@
+import copy
+import numpy as np
+
+class FeaturePreprocessing:
+    """
+    Normalize numerical features and one-hot encode categorical/ordinal features
+    :param df: input data frame
+    :param data_dictionaries: dictionaries with information about the features
+    :param exclude: columns that have to be excluded from the feature vector
+    """
+
+    def __init__(self, df, data_dictionaries, exclude=[]):
+
+        exclude_vector = copy.copy(exclude)
+        exclude_vector.extend(['Id'])
+
+        self.features = [i for i in df.columns if i not in exclude_vector]
+        # Keep min/max aligned with self.features so that self.min[count] and
+        # self.max[count] refer to the column being normalized in create_features
+        self.min = [df[column].min() for column in self.features]
+        self.max = [df[column].max() for column in self.features]
+        self.data_dictionaries = data_dictionaries
+        # print('### Reading table(s)')
+        # print('### {} Features: {}'.format(len(features), features))
+
+    def create_features(self, df):
+        # Iterate over the columns of the dataframe and process each one according to its type
+        feature_vector = []
+        self.feature_vector_names = []
+        for count, column in enumerate(self.features):
+
+            d = self.data_dictionaries[column]
+            if d['type'] in ['cat', 'ord']:
+
+                ordered = True if d['type'] == 'ord' else False
+                output = cat_to_one_hot(df[column], d, ordered=ordered)
+                for k in list(d['description'].values()):
+                    self.feature_vector_names.extend([column + '#' + k])
+            elif d['type'] in ['int', 'float']:
+
+                # Min-max normalization; constant columns are mapped to 0
+                output = df[column] * 0 if self.max[count] == self.min[count] else \
+                    (df[column] - self.min[count]) / (self.max[count] - self.min[count])
+                output = np.reshape(output.to_numpy(), (output.shape[0], 1))
+
+                self.feature_vector_names.extend([column])
+
+            else:
+                raise ValueError('Column {} has an unsupported type in the data dictionary'.format(column))
+
+            # print(feature_vector_names)
+            feature_vector.extend([output])
+
+        final_feature_vector = np.concatenate(feature_vector, axis=1)
+
+        # Save all the features to an Excel table
+        # features_df = pd.DataFrame(data=final_feature_vector, index=df['Id'], columns=feature_vector_names)
+        # new_dir = '../out/test/Features.xlsx'
+        # features_df.to_excel(new_dir, columns=feature_vector_names, index=True)
+        return final_feature_vector
+
+    def get_feature_names(self):
+        return self.feature_vector_names
+
+def cat_to_one_hot(features, dictionary, ordered=False):
+    """
+    Transform a categorical variable into a one-hot vector
+
+    :param features: Feature vector of size n x 1
+    :param dictionary: Dictionary with fields 'info', 'categories', and 'description'
+    :return: one-hot feature vector of size n x c where c is the number of categories
+    """
+    feature_vector = copy.copy(features)
+    # Number of patients
+    patients = feature_vector.shape[0]
+    # Number of categories
+    categories = int(dictionary['categories'])
+    assert isinstance(categories, int), 'Categories in data dictionary should be integer'
+    # Description of the categories
+    description = dictionary['description']
+
+    # Create the feature vector of size patients x categories
+    one_hot_vector = np.zeros((patients, categories))
+
+    # Normal case where the feature categories are given
+    if description != 'None':
+
+        # Check that the number of categories matches
+        assert categories == len(
+            description.keys()), 'Categories and their description do not match in the data dictionary'
+
+        # If the keys given in the description are not [0, 1, ..., n] replace the values in the feature vector
+        expected_keys = list(range(categories))
+        k = list(map(int, list(description.keys())))
+        if expected_keys != k:
+            for i in range(categories):
+                feature_vector = feature_vector.replace(k[i], expected_keys[i])
+
+        # Get the one-hot (or, for ordered variables, thermometer) encoding for each patient
+        for count, i in enumerate(feature_vector.index):
+            try:
+                if ordered:
+                    one_hot_vector[count, :int(feature_vector[i])] = 1
+                else:
+                    one_hot_vector[count, int(feature_vector[i])] = 1
+
+            except ValueError:
+                print('{} cannot be converted to int'.format(feature_vector[i]))
+
+    return one_hot_vector
\ No newline at end of file
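A toy run of cat_to_one_hot (hypothetical dictionary entry in the style of dictionaries/*.yml): categorical values become 0/1 indicator columns, while ordered=True produces a cumulative ("thermometer") encoding, so an ordinal value of 0 maps to all zeros.

import pandas as pd
from IO_utils.FeaturePreprocessing import cat_to_one_hot

d = {'categories': 3, 'description': {0: 'low', 1: 'mid', 2: 'high'}}
col = pd.Series([0, 2, 1])

print(cat_to_one_hot(col, d))                # [[1,0,0], [0,0,1], [0,1,0]]
print(cat_to_one_hot(col, d, ordered=True))  # [[0,0,0], [1,1,0], [1,0,0]]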
diff --git a/IO_utils/List_Reader.py b/IO_utils/List_Reader.py
new file mode 100644
index 0000000..f77e6cc
--- /dev/null
+++ b/IO_utils/List_Reader.py
@@ -0,0 +1,125 @@
+import copy
+import yaml
+import os
+
+from IO_utils.List_reader_utils import cross_check_dictionary, remove_features, treat_missing_values, \
+    get_all_dict_types, save_df
+from IO_utils.mv_strategies import select_features_pvalue, select_features_RF, select_features_RFE, \
+    combined_selection, select_features_MRMR
+from IO_utils.statistics_utils import get_pvalue, compute_basic_statistics
+
+
+class TableReader(object):
+
+    def __init__(self, input_df, tables, data_dictionaries='timepoints', mv_strategy='median',
+                 output_feature=None):
+
+        ROOT_DIR = "C:/Users/martinca1/PhD/Projects/AI_Stroke/AIS_Regress"
+        dir_data_dicts = "dictionaries/dictionary_modalities.yml" if data_dictionaries == 'modalities' else \
+            "dictionaries/dictionary_timepoints.yml"
+
+        dir_data_dicts = os.path.join(ROOT_DIR, dir_data_dicts)
+        self.data_dicts = yaml.load(open(dir_data_dicts), Loader=yaml.Loader)
+        self.output_feature = copy.copy(output_feature)
+
+        # Check that the output feature is one of the possible options
+        self.output_features = ['mRS90d', 'shift_mRS', 'dmRS', 'mortality']
+        assert self.output_feature[0] in self.output_features, 'Output feature should be one of: ' \
+                                                               'mRS90d, shift_mRS, dmRS, mortality and is' \
+                                                               ' {} '.format(self.output_feature[0])
+
+        if tables[0] == 'all_timepoints':
+            self.tables = ['Admission', 'Pre-EVT', 'Post-EVT', 'After24h']
+
+        else:
+            self.tables = tables
+            assert len(tables) == len(set(tables)), 'Tables list contains repeated elements'
+
+        # --------------------------------- 1 - Select the tables ----------------------------------------------
+        # Check that all elements from the selected tables are in the data frame and retrieve the corresponding
+        # data dictionaries and the indices from all the tables, including the output indices
+        # All the possible outputs are included
+        self.selected_d, all_keys = cross_check_dictionary(input_df, self.data_dicts, self.tables,
+                                                           self.output_features)
+
+        # Select columns given by tables and reorder dataframe to match data dictionaries order
+        self.reordered_df = input_df.reindex(columns=all_keys)
+
+        # ----------------------- 2 - Remove features with more than 10% missing values -----------------------
+        clean_df, missing_values, clean_keys = remove_features(self.reordered_df, p=0.1, exclude=self.output_features)
+
+        # Remove data dictionaries of removed features
+        keys = [i for c, i in enumerate(all_keys) if not clean_keys[c]]
+        for k in keys:
+            self.selected_d.pop(k)
+
+        # ----------------------------- 3 - Handle missing values ------------------------------------------------
+        # Apply the missing value strategy
+        self.final_df = treat_missing_values(clean_df, method=mv_strategy)
+        self.meta_df = input_df.loc[self.final_df.index, ['Age', 'Sex', 'pre-mRS', 'NIHSS']]
+
+        self.patient_ids = input_df.loc[self.final_df.index, ['Id']]
+        # ---------------------------- 4 - Set output --------------------------------------
+        # This is used for the fold split
+        self.output_vector = self.final_df[self.output_feature[0]].to_numpy(dtype=int).squeeze()
+
+    def select_features(self, method='all', k=10, fold_indices=None):
+
+        # ----------------------------- Remove output features ------------------------------------------------
+        features_df = copy.copy(self.final_df)
+        features_df = features_df.drop(columns=self.output_features)
+        output_vector = copy.copy(self.final_df[self.output_feature[0]])
+
+        if method == 'all':
+            selected_features = features_df.columns.tolist()
+
+        elif method == 'p_value':
+            selected_features = select_features_pvalue(features_df, self.selected_d, output_vector, fold_indices,
+                                                       k=k)
+        elif method == 'random_forest':
+            selected_features = select_features_RF(features_df, self.selected_d, output_vector, fold_indices, k=k)
+
+        elif method == 'rfe':
+            selected_features = select_features_RFE(features_df, self.selected_d, output_vector, fold_indices, k=k)
+
+        elif method == 'combined':
+            selected_features = combined_selection(features_df, self.selected_d, output_vector, fold_indices, k=k)
+
+        elif method == 'mrmr':
+            selected_features = select_features_MRMR(features_df, self.selected_d, output_vector, fold_indices, k=k)
+        else:
+            raise ValueError("Feature selection method {} not implemented".format(method))
+
+        return selected_features
+
+    def get_statistics(self, output_dir='../out/data_exploration/statistics', output_name='all_statistics.xlsx'):
+
+        # Reorder dataframe according to data dictionary
+        reordered_df = self.reordered_df.drop(columns=['Id'])
+        types = []
+        all_df, all_missing_values, clean_keys = remove_features(reordered_df, p=1, exclude=[])
+
+        tables = ['Admission', 'Pre-EVT', 'Post-EVT', 'After24h']
+        for t in tables:
+            types.extend(get_all_dict_types(self.data_dicts[t], types=[]))
+
+        # Compute statistics of the selected tables
+        statistics = compute_basic_statistics(all_df)
+        p_dmRS, methods = get_pvalue(all_df, all_df['dmRS'], types)
+        p_mortality, _ = get_pvalue(all_df, all_df['mortality'], types)
+        # p_shiftmRS, _ = get_pvalue(all_df, input_df['shift_mRS'], types)
+
+        statistics['p_dmRS'] = p_dmRS
+        statistics['p_mortality'] = p_mortality
+        # statistics['shift_mRS'] = p_shiftmRS
+        statistics['method'] = methods
+
+        statistics['missing_values'] = all_missing_values
+        statistics['Percentage (%)'] = (statistics['missing_values'] * 100. / all_df.shape[0]).to_list()
+        statistics['types'] = types
+
+        save_df(statistics, output_dir=output_dir, name=output_name, overwrite=True)
+
+        return statistics
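Putting the pieces together, a usage sketch (hypothetical path; table names and strategy strings follow the conventions above, and 'Admission' is one of the four timepoint tables):

from IO_utils.clean_table import clean_table
from IO_utils.List_Reader import TableReader

input_df = clean_table('path/to/patient_list.xlsx')  # placeholder path
reader = TableReader(input_df, tables=['Admission'], mv_strategy='median',
                     output_feature=['dmRS'])
selected = reader.select_features(method='all')      # 'p_value', 'mrmr', ... need fold_indices
stats = reader.get_statistics()                      # also writes an Excel summary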
diff --git a/IO_utils/List_reader_utils.py b/IO_utils/List_reader_utils.py
new file mode 100644
index 0000000..57599c4
--- /dev/null
+++ b/IO_utils/List_reader_utils.py
@@ -0,0 +1,108 @@
+import numpy as np
+import os
+import pandas as pd
+import sklearn.tree
+from sklearn.experimental import enable_iterative_imputer  # noqa: enables IterativeImputer
+from sklearn.impute import KNNImputer, IterativeImputer
+def cross_check_dictionary(input_df, data_dictionary, tables, output):
+    d = {'Id': {}}
+    all_indices = ['Id']
+    for t in tables:
+        indices = list(data_dictionary[t].keys())
+        for i in indices:
+            d[i] = data_dictionary[t][i]
+        all_indices.extend(indices)
+
+    # Check that all the features from the data dictionary are in the table
+    columns_df = input_df.columns
+    for index in all_indices:
+        if index not in columns_df and index != 'Id':
+            raise ValueError('Feature {} from data dictionary is not in data frame '.format(index))
+
+    for o in output:
+        if o not in all_indices:
+            all_indices.extend([o])
+
+    return d, all_indices
+
+def treat_missing_values(df, method):
+    if method == 'drop':
+
+        final_df = df.dropna()
+    elif method == 'median':
+        indices = df.loc[df['mRS90d'].isnull()].index
+        final_df = df.drop(indices)
+        final_df = final_df.fillna(df.median())
+
+    elif method == 'knn':
+        imputer = KNNImputer(n_neighbors=5, weights="uniform")
+        indices = df.loc[df['mRS90d'].isnull()].index
+        final_df = df.drop(indices)
+        imputer.fit(final_df)
+        final_df[final_df.columns] = imputer.transform(final_df.to_numpy())
+
+    elif method == 'mice':
+        imputer = IterativeImputer(random_state=0,
+                                   estimator=sklearn.tree.DecisionTreeRegressor(max_features='sqrt',
+                                                                                random_state=0))
+        indices = df.loc[df['mRS90d'].isnull()].index
+        final_df = df.drop(indices)
+        imputer.fit(final_df)
+        final_df[final_df.columns] = imputer.transform(final_df.to_numpy())
+    else:
+        raise ValueError("{} mv strategy not implemented".format(method))
+
+    return final_df
+
+def remove_features(df, p, exclude):
+    """
+    Remove columns from a dataframe with more than p*100% missing values
+    :param df: Pandas dataframe
+    :param p: Fraction (0-1) of missing values allowed
+    :param exclude: Columns to exclude
+    :return: Clean dataframe, number of missing values of the original df and the indices of the selected columns
+    """
+
+    # print('### Removing features with more than {}% of missing values'.format(p * 100))
+    patients = df.shape[0]
+
+    # Get indices of features with more than p*100% of missing values
+    missing_values = np.array(df.isnull().sum(axis=0))
+    selected_features_idx = (missing_values <= (p * patients))
+    # Get indices of features to exclude
+    idx_exclude = [df.columns.get_loc(i) for i in exclude]
+    selected_features_idx[idx_exclude] = True
+    # Select the features from the original df
+    final_df = df[df.columns[selected_features_idx]]
+
+    # Print information about the removed features
+    removed_columns = df.columns[[not i for i in selected_features_idx]]
+    removed_number = missing_values[[not i for i in selected_features_idx]]
+    for i in range(removed_columns.shape[0]):
+        print("### {} - Feature {} removed with {} missing ({:.2f} %)".format(i, removed_columns[i],
+                                                                              removed_number[i],
+                                                                              (removed_number[i] / patients) * 100))
+    return final_df, missing_values, selected_features_idx
+
+
+def get_all_dict_types(nested_dictionary, types=[]):
+    for key, value in nested_dictionary.items():
+
+        if type(value) is dict:
+            get_all_dict_types(value, types=types)
+        if key == 'type':
+            types.extend([value])
+        else:
+            continue
+
+    return types
+
+def save_df(df, output_dir, name='output.xlsx', overwrite=False, file_format='Excel'):
+    file_path = os.path.join(output_dir, name)
+    if not overwrite:
+        assert not os.path.isfile(file_path), 'File already exists'
+
+    if file_format == 'Excel':
+        with pd.ExcelWriter(file_path) as writer:
+            df.to_excel(writer, sheet_name='Sheet1')
+
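A small toy run of the strategies above (invented values): rows with a missing 'mRS90d' endpoint are always dropped first, then the chosen imputer fills the remaining gaps.

import numpy as np
import pandas as pd
from IO_utils.List_reader_utils import treat_missing_values

df = pd.DataFrame({'mRS90d': [0, 3, np.nan, 6, 2, 1, 4, 5],
                   'Age': [71, np.nan, 80, 65, 77, 59, 83, 70],
                   'NIHSS': [4, 12, 9, np.nan, 16, 2, 20, 7]})

print(treat_missing_values(df, method='median'))  # NaN Age/NIHSS -> column medians
print(treat_missing_values(df, method='knn'))     # NaN Age/NIHSS -> 5-NN estimates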
diff --git a/IO_utils/__pycache__/Dataloader.cpython-37.pyc b/IO_utils/__pycache__/Dataloader.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1e09a70de9bc92e145ca06daa4644ac277fbf28f
Binary files /dev/null and b/IO_utils/__pycache__/Dataloader.cpython-37.pyc differ
diff --git a/IO_utils/__pycache__/Datasets.cpython-37.pyc b/IO_utils/__pycache__/Datasets.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..00eb6a5eee63c0a4b4d6db939d63add7517a2947
Binary files /dev/null and b/IO_utils/__pycache__/Datasets.cpython-37.pyc differ
diff --git a/IO_utils/__pycache__/FeaturePreprocessing.cpython-37.pyc b/IO_utils/__pycache__/FeaturePreprocessing.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..89e3f62800b5a6305c10bb2c37cf7c1b8079f7b8
Binary files /dev/null and b/IO_utils/__pycache__/FeaturePreprocessing.cpython-37.pyc differ
diff --git a/IO_utils/__pycache__/List_Reader.cpython-37.pyc b/IO_utils/__pycache__/List_Reader.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..163df555b090f376793b1f44a6ed072895a93f63
Binary files /dev/null and b/IO_utils/__pycache__/List_Reader.cpython-37.pyc differ
diff --git a/IO_utils/__pycache__/List_reader_utils.cpython-37.pyc b/IO_utils/__pycache__/List_reader_utils.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e354aae7e8356f0dd5e32343905b3449e000d779
Binary files /dev/null and b/IO_utils/__pycache__/List_reader_utils.cpython-37.pyc differ
diff --git a/IO_utils/__pycache__/clean_table.cpython-37.pyc b/IO_utils/__pycache__/clean_table.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4a0b33d4af8f0f7e35599a5f2b3e483631a1cb92
Binary files /dev/null and b/IO_utils/__pycache__/clean_table.cpython-37.pyc differ
diff --git a/IO_utils/__pycache__/mv_strategies.cpython-37.pyc b/IO_utils/__pycache__/mv_strategies.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c082ebb7d9f04c45bba16f774987e8536198cb6f
Binary files /dev/null and b/IO_utils/__pycache__/mv_strategies.cpython-37.pyc differ
diff --git a/IO_utils/__pycache__/split_utils.cpython-37.pyc b/IO_utils/__pycache__/split_utils.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2a492db74b2ce1f97daca711e86495299dd7ca9f
Binary files /dev/null and b/IO_utils/__pycache__/split_utils.cpython-37.pyc differ
diff --git a/IO_utils/__pycache__/statistics_utils.cpython-37.pyc b/IO_utils/__pycache__/statistics_utils.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c4e069e116665318b8fc45e73187d04699eba9aa
Binary files /dev/null and b/IO_utils/__pycache__/statistics_utils.cpython-37.pyc differ
znX#6?hIIj34L>At*KjUiOJRnvp?X-b=n<%4TfkN$08v-Ng{FrIWIBdDf;H?5*lGl! zdblz4V6#W4hGPL+jSy514?<6&T?zXF4w#=L!G2o6nZgP#G=#xE5{Bq1Vb9{irY?mg zg{_yVRs`hZ8WEUUsF^I;OlW3`f_*FsQwKE@O&!!sF;Ixqh{4o?%>;)DnwjF@5D|x| z1DlDV4r-=E4fg`J8VQ(MsF`T?FJ!Eh1c#U;OdZrrG<8rjrD}K<u+>OG)E3%>gNiYh z6m}bi8nzmi6b>7P8ul8N6iyq48jczka52YO!vZemxN2Cy#T<7HOA4<ILk&+2O9~&T zI1Fac<oDBLEMjJ0V7SGcoLB-bx{5$mM-eLn1A``G5jz6|Llh?{cgN=z6&oO!hEe<w z(fp#4#GK5MN&|EO!zhm8k`#zG7|Sq<4=jezgvvM6<Sya?6&$=Ef)7N1DzhR1unTii z^NPekV!|Lo1Vo5}2nmo|rHf=iida%|gMx2y=E7{a#h;v5lA4}hl$lx_4=P8a1R*6` zd{Sl^sMtv?NsVFyd8OD8N*h6GV<>GB#R@9)3?Y;egffOuCPfM$7bt>)i7h$5G_S-k z3PKw}X=5mD0x3!j!6m36gkuEe7(qD3V2&|_V*=)wfD1|pp$zh!47_*)IX*M7Bo(QM z(*PB537~R{2~os>3pz#?CMhO9CK*N^Mji~3j|C*I0F}X{1>l7txJYE=U@FoA6^fu# z1F}qzfq?;3x`JC9J&X(tu(rhn#)^s(h7v|t%VGj!Y+4LcEo&`X4MPo^B*Oxxg$xUr z7cvz2)G#k#NnwN}?HZN^EGe+sTcU=kh80o(7JUJ=NFX|E7*kk!nQGa=#R_{C=R(F5 z)@+8NTQ%%iT%a~f3R5;q(FHJ%4P30U6kRCc&f-a72eoHPco*<3WME_{;V)rdAW*|m z!@fXpAwvpB3TH1<3LB`610tIlYdKT6YPkwOr7+cUrZCoW<w=wXWeG13sbOEp7{gM_ zUCUX^Q_GveUCWhM0(B=(HroWoA}6>j*+4~8EgziE1m^RD%@M8PTOhWOp@ttsE@X;f zsuif^OyRBN%2TUhOW^}o1z{;{P`ian#B116_@VX)G_y1@f<g_}+5&}I4JRZ73njuq zZ5`$`reFq5LBCrpe))N+kX)?_$|+3X3||DwT`xh|9GvT`xYII=Gg6E6N>YnU(n0as za(%g@CSw$5d45rLaY15oY8AH|Scw8iNfozqMy76YVQFGfYKkWFE#}O;l3T3c*3vEJ z{Gya9wh&#Ax?4=1DODoDC5a{AsyZ<TtXm;1zo?2oFTX?~Gq)foH8(Y{BsE1-q6m@M z*)ogsN^^2<F(>BaKr%h3T)4%NT$Ep2T#}en1S$h=G3RFH-C`?C%qdMRzQtOckywyg z1WJ!ZLZDm;YJK10NK7utEK4j&Edtg4w^$R4iV`btag^l8gWJtTk|32*p!_S4oRMi1 zpPZjpl9`vDnwMM|#aROKLVRXk$}P5(qWprq#9LfpAj@2fit>vz1tCQaTS;P3dTL3L zCMbup=cblq<fjzh;wXrRxC^O-5JWOOz9cazCp8KhrXi`tB|*0s3qTD+kp3tZkYjE! zr{t&I;z+G1NKGzDO}WLGc#ARV77M6V11S_h?WiJf$-tGKS`uFXwlNn}4%mUB5Y)_J zl4247<$FdZCLTs1W&uVS7A{5(MkOXTCKg6EW)Vg<CO#$sP{W6jjhTy)g^`U(h>3?$ zfT_q0lvucLu|mQL9NugNiFqlB#o#hJinpks(nt^1DuskQD99l}3Tilk3kt9~NX|vL hnZqVGKczG$)eh9SE(WDXPzl4s$iv9N%%Q;H3IKP3VR!%l literal 0 HcmV?d00001 diff --git a/IO_utils/clean_table.py b/IO_utils/clean_table.py new file mode 100644 index 0000000..cf39a30 --- /dev/null +++ b/IO_utils/clean_table.py @@ -0,0 +1,321 @@ +import pandas as pd +import numpy as np + + +def clean_table(excel_dir, pre_mRS=2): + """ + This method creates a clean list processing parameter by parameter in the original list to create + a list that can be directly processed and save it + :param excel_dir: path of the excel list + :return: + """ + print('###############################################################') + print('##############Cleaning table ##################################') + print('###############################################################') + + input_df = pd.read_excel(excel_dir) + # Remove first row (info about notation) and last row (empty) + input_df = input_df.drop([0, input_df.shape[0] - 1], axis='rows') + print('######## Initial table with {} patients and {} feature'.format( + input_df.shape[0], input_df.shape[1])) + # Middle cerebral stroke selection + Territoium = input_df['Territoium'] + list_territoium = [Territoium[Territoium == i].count() for i in [0, 1, 2]] + print( + '######## Anterior circulation: {}, Middle circulation {}, Posterior circulation: {}'.format(*list_territoium)) + mc_df = input_df[input_df['Territoium'] == 1] + mc_df = mc_df.drop(['Territoium'], axis=1) + + print('######## Select {} patients with middle circulation stroke'.format(mc_df.shape[0])) + + mc_df = mc_df[mc_df['mRS90d'].notna()] + print('######## Select {} patients with valid output'.format(mc_df.shape[0])) + # Remove patients with shift in mRS <0 + shift = mc_df['mRS90d'] - mc_df['pre-mRS'] + mc_df = mc_df[shift 
+
+
+def remove_dates(mc_df):
+    mc_df = mc_df.drop(['Geburtsdatum', 'Aufnahmedatum UKER', 'Beginn Angio', 'Zeitpunkt Rekanalisation',
+                        'Puncture-to-Rekan', 'Zeitpunkt Verlaufs-CT \n(post 24h)', 'Zeit von Rekan bis Kontrolle',
+                        'Zeit von Bildgebung bis Rekan', 'Zeitpunkt 1. Bildgebung',
+                        'Symptom onset'], axis=1)
+    return mc_df
+
+
+def clean_meta_data(mc_df):
+    # Rename column names
+    mc_df = mc_df.rename(columns={'Nummer': 'Id', 'Sex (0=männlich, 1=weiblich': 'Sex', 'Alter': 'Age'})
+
+    # Check that ages are consistent with birth and admission dates
+    assert np.floor(mc_df['Age']).equals(
+        np.floor((mc_df['Aufnahmedatum UKER'] - mc_df['Geburtsdatum']) / np.timedelta64(1, 'Y')))
+
+    # Replace float Age by int Age
+    print('#### 1.1 - Convert Age to int')
+    mc_df['Age'] = np.floor(mc_df['Age'])
+
+    # Remove modality of first image (all are CT)
+    mc_df = mc_df.drop(['1. Bildgebung UKER', '1. Bildgebung CT'], axis=1)
+
+    return mc_df
+
+
+def clean_ncct_data(mc_df):
+    mc_df = mc_df.rename(columns={'e-ASPECTS bzw pc-ASPECTS': 'e-ASPECTS',
+                                  'Volumen e-ASPECTS': 'Volume e-ASPECTS',
+                                  'IVCgesund': 'IVC gesund',
+                                  'ICV-Index': 'ICV Index',
+                                  'Ort Gefäßverschluss \nbei Aufnahme (proximalster)': 'Vessel Occlusion Location Admission',
+                                  'Seite Gefäßverschluss \nbei Aufnahme': 'Vessel Occlusion Side Admission',
+                                  'Symptomatische/vorgeschaltete Gefäßstenose bei Aufnahme': 'Vessel Stenosis',
+                                  'Ort symptomatische/vorgeschaltete Gefäßstenose bei Aufnahme': 'Vessel Stenosis Location',
+                                  'Gefäßdissektion bei Aufnahme': 'Arterial Dissection',
+                                  'Ort Gefäßdissektion': 'Arterial Dissection Location',
+                                  'ASPECTS oberfl/tief': 'ASPECTS oberfl_tief'
+                                  })
+
+    ASPECT_areas = ['C_br', 'IC_br', 'INS_br', 'L_br', 'M1_br', 'M2_br', 'M3_br', 'M4_br', 'M5_br', 'M6_br']
+    for area in ASPECT_areas:
+        mc_df[area] = replace_values(mc_df[area], [9, 'n/a'], [0, np.nan])
+
+    # Encode whether the infarct involves deep regions, superficial regions, both or none
+    deep_areas = ['C_br', 'IC_br', 'L_br']
+    superficial_areas = ['INS_br', 'M1_br', 'M2_br', 'M3_br', 'M4_br', 'M5_br', 'M6_br']
+    for index, patient in mc_df.iterrows():
+        deep_stroke = patient[deep_areas].values
+        superficial_stroke = patient[superficial_areas].values
+        if any(deep_stroke) and any(superficial_stroke):
+            a = 2
+        elif any(deep_stroke):
+            a = 1
+        elif any(superficial_stroke):
+            a = 0
+        else:
+            a = 3
+
+        mc_df.loc[index, 'ASPECTS oberfl_tief'] = a
+
+    mc_df['e-ASPECTS'] = replace_values(mc_df['e-ASPECTS'], [11], [np.nan])
+
+    mc_df['pc-ASPECTS'] = replace_values(mc_df['pc-ASPECTS'], [0], [np.nan])
+    mc_df['Volume e-ASPECTS'] = replace_values(mc_df['Volume e-ASPECTS'], ['n/a'], [np.nan])
+
+    return mc_df
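The resulting 'ASPECTS oberfl_tief' code therefore takes four values: 0 = superficial regions only, 1 = deep regions only, 2 = both, 3 = none. A minimal restatement of the mapping:

def aspects_code(deep_hit, superficial_hit):
    # Same mapping as the loop in clean_ncct_data
    if deep_hit and superficial_hit:
        return 2
    if deep_hit:
        return 1
    if superficial_hit:
        return 0
    return 3

assert aspects_code(True, False) == 1   # deep infarct only
assert aspects_code(False, True) == 0   # superficial infarct only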
+
+
+def clean_ctp_data(mc_df):
+    # Two patients with missing CTP:
+    # one is the only patient with a missing image (plus missing mRS and many other values; posterior circulation),
+    # the second also has a posterior circulation stroke
+    # mc_df['1. Bildgebung CT-P'] = replace_values(mc_df['1. Bildgebung CT-P'], ['n/a', 0], [np.nan, np.nan])
+    # mc_df = mc_df.drop(mc_df[mc_df['1. Bildgebung CT-P'] == np.nan].index)
+    # Remove CTP; this has to be modified if posterior circulation is used
+    mc_df = mc_df.drop(['1. Bildgebung CT-P'], axis=1)
+
+    mc_df = mc_df.rename(columns={'Mismatch Volumen nach RAPID': 'Mismatch Volume',
+                                  'Mismatch Ratio nach RAPID': 'Mismatch Ratio',
+                                  'HypoperfusionIndex (Tmax10s/&max6.5s': 'Hypoperfusion Index',
+                                  'CBV Index (rCBV in Tmax>6': 'CBV Index',
+                                  'CBF <30% lesion volume': 'CBF_lower30_volume',
+                                  'Tmax >6s lesion volume': 'Tmax_greater6s_volume'
+                                  })
+
+    list_pct_features = ['CBF_lower30_volume', 'Tmax_greater6s_volume', 'Mismatch Volume',
+                         'Hypoperfusion Index', 'CBV Index']
+
+    for feature in list_pct_features:
+        mc_df[feature] = replace_values(mc_df[feature], ['n/a'], [np.nan])
+
+    # Infinite value is not replaced
+    # mc_df['Mismatch Ratio'] = replace_values(mc_df['Mismatch Ratio'], ['n/a', 'none'], [np.nan, np.nan])
+    print('#### 3.1 - Calculate inverse mismatch ratio - avoid infinite')
+    mc_df['Inverse Mismatch Ratio'] = mc_df['CBF_lower30_volume'] / replace_values(mc_df['Tmax_greater6s_volume'],
+                                                                                   [0], [0.001])
+
+    print('#### 3.2 - Remove Mismatch Ratio ')
+    mc_df = mc_df.drop(['Mismatch Ratio'], axis=1)
+    # Delete values from Brainomix, only in patients from 2018
+    print('#### 3.3 - Remove features obtained with Brainomix (only in patients from 2018) ')
+    mc_df = mc_df.drop(['Braino CBF<30%', 'Braino Tmax >6s', 'Braino Mismatch vol', 'Braino Hypoperfindex'], axis=1)
+
+    return mc_df
+
+
+def clean_cta_data(mc_df):
+    # All patients have CTA - remove
+    mc_df = mc_df.drop(['Akute DSA '], axis=1)
+
+    mc_df = mc_df.rename(columns={'Gefäßverschluss DSA ': 'Vessel Occlusion CTA'
+                                  })
+
+    # Remove empty column
+    mc_df = mc_df.drop(['TTP lesion volume'], axis=1)
+    print('#### 4.1 - Remove tandem stenosis feature - no patient with it')
+    # Remove empty column
+    mc_df = mc_df.drop(['Tandemstenose'], axis=1)
+
+    # Replace n/a
+    for feature in ['Tan Score', 'Coves Score', 'BATMAN', 'Clot Burden Score']:
+        mc_df[feature] = replace_values(mc_df[feature], ['n/a'], [np.nan])
+
+    return mc_df
+
+
+def clean_treatment_data(mc_df):
+    mc_df = mc_df.rename(columns={'Thrombektomie': 'Thrombectomy',
+                                  'ggf. weiteres Device': 'PTA',
+                                  'Anzahl der Manöver': 'Number Maneuver',
+                                  'Puncture-to-Rekan in min.': 'Time_Puncture_to_Recan.',
+                                  'frustrane Rekanalisation': 'Frustrated Recanalization',
+                                  'Gefäßverschluss nach Rekanalisation': 'Vessel Occlusion after Recan.',
+                                  'TICI ': 'TICI',
+                                  'Lyse i.a.': 'Lysis i.a.',
+                                  'Lysemenge': 'Lysis quantity',
+                                  'Gefäßverschluss in neuem Versorgungsgebiet \n(während/nach Intervention)': 'Vessel Occlusion new SupplyArea',
+                                  'Lokalisation Gefäßverschluss': 'Vessel Occlusion new SupplyArea Location',
+                                  'Behandlung (des neuen Gefäßverschlusses)': 'Vessel Occlusion new SupplyArea Treatment',
+                                  'Infarkt in neuem Versorgungsgebiet': 'Infarct new SupplyArea',
+                                  'Stenting ': 'Stenting'
+                                  })
+
+    mc_df['Device'] = replace_values(mc_df['Device'], [0], [np.nan])
+    mc_df['PTA'] = replace_values(mc_df['PTA'], [np.nan], [0])
+    mc_df['Time_Puncture_to_Recan.'] = replace_values(mc_df['Time_Puncture_to_Recan.'], ['#ZAHL!'], [np.nan])
+    mc_df['Lysis i.a.'] = replace_values(mc_df['Lysis i.a.'], [9], [0])
+    mc_df['Lysis quantity'] = replace_values(mc_df['Lysis quantity'], [np.nan, 'n/a'], [0, 0])
+    mc_df['Infarct new SupplyArea'] = replace_values(mc_df['Infarct new SupplyArea'], ['n/a'], [np.nan])
+
+    return mc_df
+
+
+def clean_ControlCT_data(mc_df):
+    # Missing in 5 patients: 4 died during recanalization, 1 because a new
+    # recanalization was performed
+    # Remove empty column
+
+    mc_df = mc_df.rename(columns={'Zeit von Rekan bis Kontrolle in min.': 'Time_Recan_to_Control',
+                                  'Zeit von Bildgebung bis Rekan in min': 'Time_CT_to_Angio.',
+                                  'lakuärer Infarkt (keine Kortexbeteiligung und Infarkt ≤1,5cm '
+                                  '(≤2,0cm falls mittels DWI in MRT gemessen) im größten Durchmesser f'
+                                  'alls ausschließlich subkortikal)': 'Lacunar Infarct',
+                                  'Infarktvolumen Verlaufs-CT': 'Infarct Volume ControlCT',
+                                  'Hyperdense Mediazeichen': 'Hyperdense Media Sign',
+                                  'pc-Aspect verlaufss-CT': 'pc-Aspect ControlCT',
+                                  'Aspect Verlaufs-CT': 'Aspect ControlCT',
+                                  })
+
+    mc_df = mc_df.drop(['Dauer der Rekan'], axis=1)
+    mc_df['Time_CT_to_Angio.'] = replace_values(mc_df['Time_CT_to_Angio.'], ['#WERT!'], [np.nan])
+
+    mc_df['Infarct Volume ControlCT'] = replace_values(mc_df['Infarct Volume ControlCT'], ['n/a'], [np.nan])
+    mc_df['pc-Aspect ControlCT'] = replace_values(mc_df['pc-Aspect ControlCT'], ['n/a', 0], [np.nan, np.nan])
+    mc_df['Aspect ControlCT'] = replace_values(mc_df['Aspect ControlCT'], ['n/a'], [np.nan])
+
+    return mc_df
+
+
+def clean_clinical_data(mc_df):
+    mc_df = mc_df.rename(columns={'Symptom onset / LSN': 'Symptom onset',
+                                  'unknown onset(1=onset unbekannt, LSN angegeben)': 'Unknown Onset'
+                                  })
+
+    # Create the dichotomized functional outcome
+    print('#### 7.1 - Add dichotomized mRS')
+    mc_df['dmRS'] = [1 if i > 2 else (np.nan if np.isnan(i) else 0) for i in mc_df['mRS90d']]
+    print('#### 7.2 - Add mRS shift')
+    mc_df['shift_mRS'] = mc_df['mRS90d'] - mc_df['pre-mRS']
+    print('#### 7.3 - Add mortality')
+    mc_df['mortality'] = [1 if i == 6 else (np.nan if np.isnan(i) else 0) for i in mc_df['mRS90d']]
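+
+    # Worked example of the derived outcomes (values are illustrative): a
+    # patient with pre-mRS 1 and mRS90d 4 gets dmRS = 1 (poor outcome, mRS > 2),
+    # shift_mRS = 3 and mortality = 0; mRS90d = 6 would give mortality = 1.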
+
+    mc_df['Unknown Onset'] = replace_values(mc_df['Unknown Onset'], [np.nan], [1])
+    mc_df['Zeitpunkt 1. Bildgebung'] = pd.to_datetime(mc_df['Zeitpunkt 1. Bildgebung'])
+    # total_seconds() instead of .seconds, so the interval does not wrap around
+    # at day boundaries
+    mc_df['Time_Onset_to_Admission'] = (mc_df['Zeitpunkt 1. Bildgebung']
+                                        - mc_df['Symptom onset']).dt.total_seconds() / 60
+    # mc_df.loc[mc_df['Unknown Onset'] == 1, 'Time_Onset_to_Admission'] = -1
+
+    # Handle unknown times
+    # u = (mc_df['Unknown Onset'] == 1).values
+    # mean = np.nanmedian(mc_df['Time_Onset_to_Admission'][u])
+    # mc_df['Time_Onset_to_Admission'] = replace_values(mc_df['Time_Onset_to_Admission'], [np.nan], [mean])
+
+    return mc_df
+
+
+def replace_values(column, original_values=(), target_values=()):
+    assert len(original_values) == len(target_values), 'Both lists have to have the same length'
+
+    for (i, j) in zip(original_values, target_values):
+        column.replace(i, j, inplace=True)
+
+    return column
+
+
+def get_values(column):
+    unique_values = column.unique()
+    values = []
+    number = []
+
+    for i in unique_values:
+        values.extend([i])
+        if pd.isna(i):
+            number.extend([column.isna().sum()])
+        else:
+            number.extend([column[column == i].count()])
+
+    print(values, number)
diff --git a/IO_utils/mv_strategies.py b/IO_utils/mv_strategies.py
new file mode 100644
index 0000000..ac266c8
--- /dev/null
+++ b/IO_utils/mv_strategies.py
@@ -0,0 +1,258 @@
+import copy
+from mrmr import mrmr_classif
+import numpy as np
+import pandas as pd
+from sklearn.feature_selection import RFE
+from sklearn.ensemble import RandomForestClassifier
+
+from IO_utils.statistics_utils import get_pvalue
+from IO_utils.FeaturePreprocessing import FeaturePreprocessing
+
+
+def select_features_pvalue(df, data_dictionaries, output_vector, fold_indices, k=10):
+    # Get the types of the features -- they determine the method used to
+    # compute the p-value
+    types = []
+    for t in data_dictionaries.keys():
+        if t not in ['Id']:
+            types.extend([data_dictionaries[t]['type']])
+    all_p_values = np.zeros((df.shape[1] - 1))
+    for indices in fold_indices:
+        _, _, test_indices = indices
+        drop_indices = df.index[test_indices]
+        df_copy = copy.copy(df.drop(drop_indices))
+
+        p_values, _ = get_pvalue(df_copy.drop(columns=['Id']), output_vector, types)
+        all_p_values += np.array(p_values)
+
+    smaller_indices = np.argsort(all_p_values)[:k]
+    indices = [a in smaller_indices for a in range(df.shape[1] - 1)]
+    indices = [False] + indices
+    selected_features = df.columns[indices]
+
+    print("Features selected (p-value): {} ".format(selected_features.values))
+
+    return selected_features
+
+
+def select_features_RF(df, data_dictionaries, output_vector, fold_indices, k=10):
+    FP = FeaturePreprocessing(df, data_dictionaries)
+    all_feature = FP.create_features(df)
+    all_output = output_vector.to_numpy(dtype=int).squeeze()
+
+    all_features_imp = np.zeros((all_feature.shape[1]))
+
+    for indices in fold_indices:
+        train_indices, val_indices, test_indices = indices
+
+        features = copy.copy(all_feature)[train_indices + val_indices, :]
+        output = copy.copy(all_output)[train_indices + val_indices]
+
+        clf = RandomForestClassifier(n_estimators=100, random_state=0)
+        clf.fit(features, output)
+        all_features_imp = all_features_imp + clf.feature_importances_
+
+    names = FP.get_feature_names()
+    feat_imp = pd.Series(all_features_imp, index=names)
+
+    # Map the one-hot encoded columns back to the original features
+    features = []
+    sorted_value = feat_imp.sort_values(ascending=False)
+    for next_feature in sorted_value.index:
+        feature = next_feature.split('#')[0]
+        if feature not in features:
+            features.extend([feature])
+        if len(features) == k:
+            break
+
+    print("Features selected (Random forest): {} ".format(features))
+
+    return features
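+
+
+# The selectors above and below share one convention: FeaturePreprocessing
+# expands categorical variables into one-hot columns named '<feature>#<level>'
+# (assumed naming scheme), so split('#')[0] collapses the per-level scores
+# back to the original variable. Hypothetical illustration:
+#   scores {'Sex#0': 0.02, 'Sex#1': 0.03, 'Age': 0.10}
+#   -> ranked and deduplicated: ['Age', 'Sex']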
+def select_features_RFE(df, data_dictionaries, output_vector, fold_indices, k=10):
+    FP = FeaturePreprocessing(df, data_dictionaries)
+    all_feature = FP.create_features(df)
+    all_output = output_vector.to_numpy(dtype=int).squeeze()
+
+    all_features_ranking = np.zeros((all_feature.shape[1]))
+
+    for indices in fold_indices:
+        train_indices, val_indices, test_indices = indices
+
+        features = copy.copy(all_feature)[train_indices + val_indices, :]
+        output = copy.copy(all_output)[train_indices + val_indices]
+
+        clf = RandomForestClassifier(n_estimators=100, random_state=0)
+        rfe = RFE(estimator=clf, n_features_to_select=k, step=1)
+        rfe.fit(features, output)
+        all_features_ranking = all_features_ranking + rfe.ranking_
+
+    names = FP.get_feature_names()
+    feat_imp = pd.Series(all_features_ranking, index=names)
+
+    # Map the one-hot encoded columns back to the original features
+    features = []
+    sorted_value = feat_imp.sort_values(ascending=True)
+    for next_feature in sorted_value.index:
+        feature = next_feature.split('#')[0]
+        if feature not in features:
+            features.extend([feature])
+        if len(features) == k:
+            break
+
+    # Restore the original column order of the dataframe
+    features = [f for f in df.columns if f in features]
+
+    print("Features selected (RFE): {} ".format(features))
+
+    return features
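+
+
+# combined_selection below chains the two ideas: a cheap univariate p-value
+# filter first shrinks the candidate set to 2*k features, then the (more
+# expensive) RFE wrapper refines that set down to k. The small forest used
+# inside the RFE (5 trees, depth 5) keeps the repeated fits affordable.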
+def combined_selection(df, data_dictionaries, output_vector, fold_indices, k=10):
+    # Get the types of the features -- they determine the method used to
+    # compute the p-value
+
+    # 1 - Select k*2 features by p-value
+    types = []
+    for t in data_dictionaries.keys():
+        if t not in ['Id']:
+            types.extend([data_dictionaries[t]['type']])
+    all_p_values = np.zeros((df.shape[1] - 1))
+    for indices in fold_indices:
+        _, _, test_indices = indices
+        drop_indices = df.index[test_indices]
+        df_copy = copy.copy(df.drop(drop_indices))
+
+        p_values, _ = get_pvalue(df_copy.drop(columns=['Id']), output_vector, types)
+        all_p_values += np.array(p_values)
+
+    sorted_indices = np.argsort(all_p_values)
+    sorted_values = np.sort(all_p_values)
+    for i, v in zip(sorted_indices, sorted_values):
+        print(df.columns[i + 1], 'P_value: {} '.format(v))
+
+    smaller_indices = np.argsort(all_p_values)[:k * 2]
+    indices = [a in smaller_indices for a in range(df.shape[1] - 1)]
+    indices = [False] + indices
+    selected_features = df.columns[indices]
+
+    print("Features selected (p-value): {} ".format(selected_features.values))
+
+    # 2 - Refine the preselection with RFE; the features are built from the
+    # reduced frame that FP was fitted on
+    df_reduced = df[selected_features]
+    FP = FeaturePreprocessing(df_reduced, data_dictionaries)
+    all_feature = FP.create_features(df_reduced)
+    all_output = output_vector.to_numpy(dtype=int).squeeze()
+
+    all_features_ranking = np.zeros((all_feature.shape[1]))
+
+    for indices in fold_indices:
+        train_indices, val_indices, test_indices = indices
+
+        features = copy.copy(all_feature)[train_indices + val_indices, :]
+        output = copy.copy(all_output)[train_indices + val_indices]
+
+        clf = RandomForestClassifier(n_estimators=5, max_depth=5, random_state=0)
+        rfe = RFE(estimator=clf, n_features_to_select=k, step=1)
+        rfe.fit(features, output)
+        all_features_ranking = all_features_ranking + rfe.ranking_
+
+    names = FP.get_feature_names()
+    feat_imp = pd.Series(all_features_ranking, index=names)
+
+    # Map the one-hot encoded columns back to the original features
+    features = []
+    sorted_value = feat_imp.sort_values(ascending=True)
+    for next_feature in sorted_value.index:
+        feature = next_feature.split('#')[0]
+        if feature not in features:
+            features.extend([feature])
+        if len(features) == k:
+            break
+
+    # Restore the original column order of the dataframe
+    features = [f for f in df.columns if f in features]
+
+    print("Features selected (RFE): {} ".format(features))
+
+    return features
+
+
+def select_features_MRMR(df, data_dictionaries, output_vector, fold_indices, k=10):
+    FP = FeaturePreprocessing(df, data_dictionaries)
+    all_feature = FP.create_features(df)
+    all_output = output_vector.to_numpy(dtype=int).squeeze()
+
+    all_features_ranking = np.zeros((all_feature.shape[1]))
+
+    for indices in fold_indices:
+        train_indices, val_indices, test_indices = indices
+
+        features = copy.copy(all_feature)[train_indices + val_indices, :]
+        output = copy.copy(all_output)[train_indices + val_indices]
+
+        # mrmr_classif returns the selected columns ordered by relevance; turn
+        # that order into scores (k for the best, 1 for the last) and sum them
+        # over the folds
+        selected_features = mrmr_classif(features, output, K=k)
+        for counter, i in enumerate(selected_features):
+            all_features_ranking[i] += len(selected_features) - counter
+
+    names = FP.get_feature_names()
+    feat_imp = pd.Series(all_features_ranking, index=names)
+
+    # Map the one-hot encoded columns back to the original features
+    features = []
+    sorted_value = feat_imp.sort_values(ascending=False)
+    for next_feature, next_value in zip(sorted_value.index, sorted_value.values):
+        feature = next_feature.split('#')[0]
+        if feature not in features:
+            features.extend([feature])
+        if len(features) == k:
+            break
+
+    # Restore the original column order of the dataframe
+    features = [f for f in df.columns if f in features]
+    print("Features selected (MRMR): {} ".format(features))
+
+    return features
+
+
+def select_features_MRMR_fold(df, data_dictionaries, output_vector, fold_indices, k=10):
+    FP = FeaturePreprocessing(df, data_dictionaries)
+    all_feature = FP.create_features(df)
+    all_output = output_vector.to_numpy(dtype=int).squeeze()
+    all_features = []
+
+    for i, indices in enumerate(fold_indices):
+        train_indices, val_indices, test_indices = indices
+
+        features = copy.copy(all_feature)[train_indices + val_indices, :]
+        output = copy.copy(all_output)[train_indices + val_indices]
+
+        selected_features = mrmr_classif(features, output, K=k)
+        names = FP.get_feature_names()
+        selected_features_names = [names[j].split('#')[0] for j in selected_features]
+        s = [f for f in df.columns if f in selected_features_names]
+        all_features.append(s)
+
+        print("Features selected in fold {} (MRMR): {} ".format(i, s))
+
+    return all_features
diff --git a/IO_utils/split_utils.py b/IO_utils/split_utils.py
new file mode 100644
index 0000000..bcd345d
--- /dev/null
+++ b/IO_utils/split_utils.py
@@ -0,0 +1,63 @@
+import copy
+import random
+import numpy as np
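+
+# split_data_cv below builds a stratified cross-validation: for each of the cv
+# folds, one cv-th of every class becomes the test set and the remaining
+# samples are split 80/20 (again per class) into training and validation.
+# Hypothetical shape: with 100 samples and cv=5, each fold holds roughly
+# 20 test, 64 train and 16 validation samples, with class ratios preserved.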
+
+def split_data_cv(output, seed, cv=5, p=1):
+    np.random.seed(seed)
+    random.seed(seed)
+    classes = np.unique(output)
+    indices_per_class = [np.where(output == c)[0] for c in classes]
+
+    # Make train and test splits with the same class proportions
+    p_split = np.ones(cv) * 1 / cv
+    all_indices = split_stratified(p_split, classes, indices_per_class)
+    folds_indices = []
+
+    for fold in range(cv):
+        cv_test_indices = copy.copy(all_indices[fold])
+        cv_train_folds = copy.copy(all_indices)
+        cv_train_folds.pop(fold)
+
+        cv_train_val_indices = [item for sublist in cv_train_folds for item in sublist]
+        cv_train_val_indices_array = np.array(cv_train_val_indices)
+
+        output_train_val = output[cv_train_val_indices]
+        indices_train_val_per_class = [cv_train_val_indices_array[np.where(output_train_val == c)[0].tolist()]
+                                       for c in classes]
+        train_val_all_indices = split_stratified((0.8, 0.2), classes, indices_train_val_per_class)
+
+        cv_train_indices = copy.copy(train_val_all_indices[0])
+        cv_val_indices = copy.copy(train_val_all_indices[1])
+
+        folds_indices.append([cv_train_indices, cv_val_indices, cv_test_indices])
+
+    return folds_indices
+
+
+def split_stratified(p_split, classes, indices_per_class):
+    t = [0] * len(classes)
+    indices_split = []
+
+    for c in range(len(classes)):
+        random.shuffle(indices_per_class[c])
+
+    for count, sp in enumerate(p_split):
+        indices = []
+
+        for c in range(len(classes)):
+            indices_c = indices_per_class[c]
+            t0 = t[c]
+            # int() instead of the deprecated np.int; the last chunk takes the rest
+            t1 = int(t0 + (indices_c.shape[0] * sp) + 0.5) if count != len(p_split) - 1 else indices_c.shape[0]
+            a = list(indices_c[t0:t1])
+            indices.extend(a)
+            t[c] = t1
+        random.shuffle(indices)
+        indices_split.append(indices)
+
+    return indices_split
diff --git a/IO_utils/statistics_utils.py b/IO_utils/statistics_utils.py
new file mode 100644
index 0000000..e7e58a9
--- /dev/null
+++ b/IO_utils/statistics_utils.py
@@ -0,0 +1,159 @@
+import numpy as np
+import pandas as pd
+import rpy2.robjects as robjects
+from rpy2.robjects import numpy2ri
+import scipy.stats
+
+
+def compute_basic_statistics(df):
+    output = pd.DataFrame(index=df.columns)
+
+    means = []
+    medians = []
+    stds = []
+    ci = []
+
+    for i, column in enumerate(list(df)):
+        mean = np.nanmean(df[column].astype('float32'))
+        means.extend([mean])
+        median = np.nanmedian(df[column].astype('float32'))
+        medians.extend([median])
+        std = np.nanstd(df[column].astype('float32'), ddof=1)
+        stds.extend([std])
+        if mean != 0 and std != 0:
+            ci_low, ci_high = scipy.stats.norm.interval(0.95, loc=mean, scale=std)
+        else:
+            ci_low, ci_high = 0, 0
+        ci.extend([[ci_low, ci_high]])
+
+    output['mean'] = means
+    output['std'] = stds
+    output['0.95 confidence interval'] = ci
+    output['median'] = medians
+
+    return output
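+
+# The '0.95 confidence interval' above is mean +/- 1.96*std of the raw values,
+# i.e. a normal reference range for individual observations. A 95% CI of the
+# mean itself would shrink with the sample size:
+#   CI_mean = mean +/- 1.96 * std / sqrt(n)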
+
+
+def compute_bivariate_statistics(df, dmRS, mortality, categories_list):
+    output_bivariate = pd.DataFrame(index=df.columns)
+
+    means1 = []
+    means2 = []
+    means3 = []
+    means4 = []
+
+    stds1 = []
+    stds2 = []
+    stds3 = []
+    stds4 = []
+
+    for i, column in enumerate(list(df)):
+
+        if categories_list[i] == 'cat':
+            count1 = len(df[(dmRS == 0) & (df[column] == 0)])
+            means1.extend([count1])
+            stds1.extend([0])
+
+            count2 = len(df[(dmRS == 1) & (df[column] == 0)])
+            means2.extend([count2])
+            stds2.extend([0])
+
+            count3 = len(df[(mortality == 0) & (df[column] == 0)])
+            means3.extend([count3])
+            stds3.extend([0])
+
+            count4 = len(df[(mortality == 1) & (df[column] == 0)])
+            means4.extend([count4])
+            stds4.extend([0])
+
+        else:
+            mean1 = np.nanmean(df[dmRS == 0][column].astype('float32'))
+            means1.extend([mean1])
+            std1 = np.nanstd(df[dmRS == 0][column].astype('float32'), ddof=1)
+            stds1.extend([std1])
+
+            mean2 = np.nanmean(df[dmRS == 1][column].astype('float32'))
+            means2.extend([mean2])
+            std2 = np.nanstd(df[dmRS == 1][column].astype('float32'), ddof=1)
+            stds2.extend([std2])
+
+            mean3 = np.nanmean(df[mortality == 0][column].astype('float32'))
+            means3.extend([mean3])
+            std3 = np.nanstd(df[mortality == 0][column].astype('float32'), ddof=1)
+            stds3.extend([std3])
+
+            mean4 = np.nanmean(df[mortality == 1][column].astype('float32'))
+            means4.extend([mean4])
+            std4 = np.nanstd(df[mortality == 1][column].astype('float32'), ddof=1)
+            stds4.extend([std4])
+
+    output_bivariate['mean_mrs0'] = means1
+    output_bivariate['mean_mrs1'] = means2
+    output_bivariate['mean_mortality0'] = means3
+    output_bivariate['mean_mortality1'] = means4
+
+    output_bivariate['std_mrs0'] = stds1
+    output_bivariate['std_mrs1'] = stds2
+    output_bivariate['std_mortality0'] = stds3
+    output_bivariate['std_mortality1'] = stds4
+
+    return output_bivariate
+
+
+def get_pvalue(df, target, categories_list):
+    methods = []
+    p_values = []
+
+    for i, column in enumerate(list(df)):
+        if df[column].isnull().all():
+            methods.extend(['None'])
+            p_values.extend([1])
+            continue
+
+        if categories_list[i] == 'cat':
+            contingency_table = pd.crosstab(index=df[column], columns=target)
+            # The chi-squared test should only be used if the observed and
+            # expected frequencies in each cell are at least 5; otherwise fall
+            # back to Fisher's exact test (via R, which also handles tables
+            # larger than 2x2)
+            if np.min(contingency_table.values) < 5:
+                if contingency_table.shape[1] > 1 and contingency_table.shape[0] > 1:
+                    FisherTestR = robjects.r['fisher.test']
+                    numpy2ri.activate()
+                    p = float(np.array(FisherTestR(contingency_table.to_numpy(), workspace=2e8)[0])[0])
+
+                    method = 'Fisher test'
+                else:
+                    p = 1
+                    method = 'None'
+
+            else:
+                chi2, p, dof, expected = scipy.stats.chi2_contingency(contingency_table)
+                method = 'Chi-squared'
+
+        elif categories_list[i] in ['int', 'float', 'ord']:
+            a = df[column][target == 0]
+            b = df[column][target == 1]
+            stat, p = scipy.stats.ttest_ind(a.dropna(), b.dropna())
+            method = 'T-test'
+
+        else:
+            if column == 'Id':
+                p = 0
+                method = 'None'
+            else:
+                raise ValueError(column, 'Statistical test for', categories_list[i], 'not implemented')
+
+        methods.extend([method])
+        p_values.extend([p])
+
+    return p_values, methods
diff --git a/Loss/Loss_uncertainty.py b/Loss/Loss_uncertainty.py
new file mode 100644
index 0000000..5c749b7
--- /dev/null
+++ b/Loss/Loss_uncertainty.py
@@ -0,0 +1,67 @@
+import torch
+
+
+def old_loss(y, f, s):
+    y = torch.squeeze(y)
+
+    # Previous formulation, kept for reference:
+    #   a1 = torch.exp(-sigma)
+    #   a2 = torch.log(f)
+    #   a3 = torch.mul(y, a2)
+    #   a = torch.mul(a1, a3)
+    #   b = torch.mul(sigma, y)
+    #   loss = torch.div((-a + b), 2)
+    #   loss = torch.sum(loss, dim=1)
+    ce1 = torch.log(f)
+    ce2 = torch.mul(ce1, y)
+    cross_entropy_error = torch.mean(ce2, dim=1)
+    sigma = torch.exp(-s)
+    term1 = -torch.mul(cross_entropy_error, sigma)
+    term2 = s / 2
+
+    loss = torch.div(term1 + term2, 2)
+
+    return loss
+
+
+class loss_uncertainty(object):
+
+    def __init__(self, weights):
+        self.total_loss = torch.zeros([1])
+        self.sum = torch.zeros([1])
+        self.elements = 0
+        self.weights = weights
+
+    def get_loss(self, f, y):
+        y = torch.squeeze(y)
+        assert y.shape[1] == self.weights.shape[0]
+
+        weights_vector = self.weights.expand([f.shape[0], -1])
+        weights_vector = torch.mul(weights_vector, y)
+        weights_vector = torch.sum(weights_vector, dim=1)
+
+        a = torch.mul(f, y)
+        ce2 = -torch.sum(a, dim=1)
+        class_weighted_ce2 = torch.mul(weights_vector, ce2)
+
+        self.update_total_loss(class_weighted_ce2, f.shape[0])
+
+        total_loss = torch.mean(class_weighted_ce2)
+
+        return total_loss
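+
+    # get_loss() computes a class-weighted cross-entropy: per sample,
+    #   L_i = -w_{c(i)} * sum_c y_{i,c} * f_{i,c}
+    # with w_{c(i)} the weight of the true class. This is the usual weighted
+    # NLL under the assumption (not enforced here) that f already holds
+    # log-probabilities, e.g. log_softmax outputs.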
+    def update_total_loss(self, loss, elements):
+        # Accumulate across batches so that get_total_loss() returns the mean
+        # over everything seen since the last clear(); a plain assignment here
+        # would keep only the most recent batch in the numerator
+        self.sum = self.sum + torch.sum(loss).detach().cpu().numpy()
+        self.elements += elements
+        self.total_loss = self.sum / self.elements
+
+    def get_total_loss(self):
+        return self.total_loss
+
+    def clear(self):
+        self.total_loss = torch.zeros([1])
+        self.sum = torch.zeros([1])
+        self.elements = 0
diff --git a/Loss/__pycache__/Loss_uncertainty.cpython-37.pyc b/Loss/__pycache__/Loss_uncertainty.cpython-37.pyc
new file mode 100644
Binary files /dev/null and b/Loss/__pycache__/Loss_uncertainty.cpython-37.pyc differ
diff --git a/Metrics/ClassificationMetrics.py b/Metrics/ClassificationMetrics.py
new file mode 100644
index 0000000..7e40387
--- /dev/null
+++ b/Metrics/ClassificationMetrics.py
@@ -0,0 +1,126 @@
+import numpy as np
+from sklearn.metrics import roc_auc_score
+import torch
+
+
+class ClassificationMetrics(object):
+
+    def __init__(self, classes):
+        self.classes = classes
+        self.cm = np.zeros((self.classes, self.classes))
+        self.accuracy = 0
+        self.balanced_accuracy = 0
+        self.recall = 0
+        self.precision = 0
+        self.f1 = 0
+        self.auc = 0
+        self.nll = 0
+        self.pred = []
+        self.y = []
+
+    def compute_metrics(self, pred, y):
+        """
+        :param pred: predicted class probabilities, shape (n_samples, n_classes)
+        :param y: one-hot encoded (or integer) ground-truth labels
+        :return: dictionary with the confusion matrix, accuracy, precision,
+                 recall, f1, balanced accuracy, AUC and NLL
+        """
+        self.pred.extend([pred])
+        self.y.extend([y])
+
+        self.cm = confusion_matrix(self.cm, pred, y)
+        self.accuracy = np.sum(np.diagonal(self.cm) / np.sum(np.sum(self.cm)))
+        self.nll = -np.mean(np.sum(np.multiply(self.y[0], np.log(self.pred[0])), axis=1))
+        epsilon = np.finfo(float).eps
+        # cm is indexed cm[predicted][true] (see confusion_matrix below), so
+        # precision normalizes over the row (everything predicted as class i)
+        # and recall over the column (everything that truly is class i)
+        self.precision = [self.cm[i, i] / (np.sum(self.cm[i, :] + epsilon)) for i in range(self.classes)]
+        self.recall = [self.cm[i, i] / (np.sum(self.cm[:, i] + epsilon)) for i in range(self.classes)]
+        self.f1 = [(2 * self.precision[i] * self.recall[i]) / (self.precision[i] + self.recall[i] + epsilon)
+                   for i in range(self.classes)]
+        self.balanced_accuracy = np.mean(self.recall)
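+
+        # The AUC is undefined when the accumulated batch contains only one
+        # class, so the except branch below falls back to the chance level 0.5.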
+        try:
+            if len(self.y[0].squeeze().shape) > 1:
+                a = np.argmax(self.y[0], axis=1)
+            else:
+                a = np.array(self.y[0]).squeeze()
+
+            # self.auc = roc_auc_score(a, b, multi_class=mc)
+            self.auc = self.custom_auc(self.pred[0], a)
+
+        except Exception:
+            self.auc = 0.5
+
+        metrics = {'cm': self.cm,
+                   'accuracy': self.accuracy,
+                   'precision': self.precision,
+                   'recall': self.recall,
+                   'f1': self.f1,
+                   'balanced_accuracy': self.balanced_accuracy,
+                   'auc': self.auc,
+                   'nll': self.nll}
+
+        return metrics
+
+    def clear(self):
+        self.cm = np.zeros((self.classes, self.classes))
+        self.accuracy = 0
+        self.balanced_accuracy = 0
+        self.recall = 0
+        self.precision = 0
+        self.f1 = 0
+        self.auc = 0
+        self.pred = []
+        self.y = []
+
+    def multiclass_to_binary(self, pred):
+        binary_pred = np.zeros((pred.shape[0], 2))
+
+        prob_classs0 = np.divide(np.sum(pred[:, 0:2], axis=1), 3)
+        prob_classs1 = np.divide(np.sum(pred[:, 2:6], axis=1), 4)
+
+        binary_pred[:, 0] = prob_classs0
+        binary_pred[:, 1] = prob_classs1
+
+        return binary_pred
+
+    def custom_auc(self, prob, labels):
+        # Rank-based (Mann-Whitney) one-vs-rest AUC, averaged over the
+        # classes; ties contribute 0.5
+        unique_labels = np.unique(labels)
+        all_auc = 0
+
+        for l, label in enumerate(unique_labels):
+            prob_l = prob[:, l]
+            class_indices = [i for i in range(len(labels)) if labels[i] == label]
+            rest_indices = [i for i in range(len(labels)) if labels[i] != label]
+
+            suma = 0
+            auc = 0
+            for i in class_indices:
+                for j in rest_indices:
+                    auc += (int(prob_l[i] < prob_l[j])) + (0.5 * int(prob_l[i] == prob_l[j]))
+                    suma += 1
+
+            all_auc += 1 - (auc / suma)
+
+        return all_auc / unique_labels.shape[0]
+
+
+def confusion_matrix(cm, pred, y):
+    # cm[a][b] counts samples predicted as class a whose true class is b
+    for i in range(pred.shape[0]):
+        if len(pred.squeeze().shape) > 1:
+            a = np.argmax(pred[i, :])
+        else:
+            a = int(np.round(pred[i]))
+
+        if len(y.squeeze().shape) > 1:
+            b = np.argmax(y[i, :])
+        else:
+            b = int(y[i])
+
+        cm[a][b] += 1
+
+    return cm
diff --git a/Metrics/RegressionMetrics.py b/Metrics/RegressionMetrics.py
new file mode 100644
index 0000000..4b11007
--- /dev/null
+++ b/Metrics/RegressionMetrics.py
@@ -0,0 +1,55 @@
+import numpy as np
+from Metrics._utils import int_to_binary
+from Metrics.ClassificationMetrics import ClassificationMetrics
+
+
+class RegressionMetrics(object):
+
+    def __init__(self):
+        self.dif = []
+        self.mse = 0
+        self.mae = 0
+        self.median_ae = 0
+
+        self.pred = []
+        self.y = []
+
+        self.cm = ClassificationMetrics(classes=2)
+
+    def compute_metrics(self, pred, y):
+        """
+        :param pred: continuous predictions, shape (n_samples, 1)
+        :param y: ground-truth values
+        :return: dictionary with mae, mse and median_ae, plus the
+                 classification metrics of the dichotomized prediction
+        """
+        self.pred.extend([pred[:, 0]])
+        self.y.extend([y])
+
+        self.dif = np.abs(self.pred[0].squeeze() - self.y[0])
+        self.mae = np.mean(self.dif)
+        self.median_ae = np.median(self.dif)
+        self.mse = np.mean(np.power(self.dif, 2))
+
+        metrics = {'mae': self.mae,
+                   'mse': self.mse,
+                   'median_ae': self.median_ae
+                   }
+
+        # Convert to a binary prediction and reuse the classification metrics
+        binary_y = int_to_binary(y, threshold=2)
+        binary_pred = int_to_binary(pred[:, 0], threshold=2)
+        clas_metrics = self.cm.compute_metrics(binary_pred, binary_y)
+        metrics.update(clas_metrics)
+
+        return metrics
+
+    def clear(self):
+        self.mae = 0
+        self.mse = 0
+        self.median_ae = 0
+
+        self.pred = []
+        self.y = []
+        self.cm.clear()
diff --git a/Metrics/__pycache__/ClassificationMetrics.cpython-37.pyc b/Metrics/__pycache__/ClassificationMetrics.cpython-37.pyc
new file mode 100644
Binary files /dev/null and b/Metrics/__pycache__/ClassificationMetrics.cpython-37.pyc differ
diff --git a/Metrics/__pycache__/RegressionMetrics.cpython-37.pyc b/Metrics/__pycache__/RegressionMetrics.cpython-37.pyc
new file mode 100644
Binary files /dev/null and b/Metrics/__pycache__/RegressionMetrics.cpython-37.pyc differ
diff --git a/Metrics/__pycache__/_utils.cpython-37.pyc b/Metrics/__pycache__/_utils.cpython-37.pyc
new file mode 100644
Binary files /dev/null and b/Metrics/__pycache__/_utils.cpython-37.pyc differ
diff --git a/Metrics/_utils.py b/Metrics/_utils.py
new file mode 100644
index 0000000..c8b3432
--- /dev/null
+++ b/Metrics/_utils.py
@@ -0,0 +1,7 @@
+import numpy as np
+
+
+def int_to_binary(vector, threshold):
+    # The regression targets are apparently mRS values scaled to [0, 1]
+    # (divided by 6), so i*6 recovers the 0-6 scale before thresholding
+    a = [1 if np.floor(i * 6) > threshold else 0 for i in vector]
+
+    return np.array(a)
diff --git a/README.md b/README.md
deleted mode 100644
index 0074788..0000000
--- a/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-# AIS_Regress
-
diff --git a/__pycache__/evaluate_model.cpython-37.pyc b/__pycache__/evaluate_model.cpython-37.pyc
new file mode 100644
Binary files /dev/null and b/__pycache__/evaluate_model.cpython-37.pyc differ
diff --git a/__pycache__/train.cpython-37.pyc b/__pycache__/train.cpython-37.pyc
new file mode 100644
Binary files /dev/null and b/__pycache__/train.cpython-37.pyc differ
diff --git a/__pycache__/train_graph.cpython-37.pyc b/__pycache__/train_graph.cpython-37.pyc
new file mode 100644
Binary files /dev/null and b/__pycache__/train_graph.cpython-37.pyc differ
diff --git a/_utils/Result_container.py b/_utils/Result_container.py
new file mode 100644
index 0000000..0e95dee
--- /dev/null
+++ b/_utils/Result_container.py
@@ -0,0 +1,54 @@
+import numpy as np
+import os
+import pandas as pd
+import pprint
+
+
+def mean_dicts(list_dicts):
+    # Average the per-fold metric dictionaries ({'Fold 0': {...}, ...});
+    # confusion matrices ('cm') are summed instead of averaged
+    number_dicts = len(list_dicts)
+    keys_dicts = list_dicts['Fold 0'].keys()
+
+    mean_dict = {}
+    for k in keys_dicts:
+        values = [list_dicts['Fold {}'.format(l)][k] for l in range(number_dicts)]
+
+        mean_value = values[number_dicts - 1]
+        for l in range(number_dicts - 1):
+            mean_value = np.add(mean_value, values[l])
+        if k != 'cm':
+            mean_value /= number_dicts
+        mean_dict[k] = mean_value
+    return mean_dict
+
+
+class Result_container(object):
+
+    def __init__(self, target_metrics, output):
+        self.target_metrics = {}
+        self.results = {}
+        for o in output:
+            self.results[o] = {}
+            for m in target_metrics:
+                self.results[o][m] = {}
+
+    def update(self, output, method, metrics):
+        mean_m = mean_dicts(metrics)
+        # pprint.pprint(mean_m)
+        for k in self.results[output].keys():
+            self.results[output][k][method] = mean_m[k]
+
+    def save(self, output_dir, name=None):
+        dir_name = os.path.join(output_dir, (name + '.xlsx'))
+        writer = pd.ExcelWriter(path=dir_name)
+
+        for o in self.results.keys():
+            # Write each dataframe containing a metric for all methods to a
+            # different worksheet
+            sheet_metric = pd.DataFrame.from_dict(self.results[o])
+            sheet_metric.to_excel(writer, sheet_name=o)
+
+        # Close the Pandas Excel writer and output the Excel file.
+ writer.save() diff --git a/_utils/__pycache__/Result_container.cpython-37.pyc b/_utils/__pycache__/Result_container.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb327b26fa5271aa851b3f12d0161b29996ea1dd GIT binary patch literal 1794 zcmZ?b<>g{vU|`t9+nHFx#=!6x#DQTJ1_lNP1_p*=8wLi36owSW9EK=HFwGRj1g4pz zSim%E6l)4o3UdoX6q`Fk3QG!W3quNPDtj|?6h{hUFoP!BOOTm<$&4V1fq{V?ikU&C z_b@OplrVHKHZ#^Tb}*(eN-`{9T*y$%l*v%bT#;A9<iZe}*2&P$(9W2~1h$2vgsFzH znbC!znX#6ohNXrvg++ovlA)HhLIorP;(=siMQYhfm>00ru-35Dut_p3WU6H?3<+oG zU`S!jW-h9!VXa}5WvFGXVXa~AV6b5*lnZC5VFtO@uZqnrKPN@O0PbKW1_lN;1_lOU zkb~tI7#PwSY8Yb0zz$|uz)-`ukWrGMgF%vEB2yttFoPyz6+763>RL_4TWo3hMY)M3 znoPHt^bBq>=4dkAV$8n9m6KUq5}%TpTvEJ};a7mORZM7cYEf}aZemeMW?pilVN5`V zOH4pfepYI7NpXy$XMAu;QGRx6jH738d{An7QEG8<OnhlcW=?TTP-<~$PDy-neqKpp zW?pKMUP0wAE*p>q$@#ejcDx{0L%dKWmz$cH2eDSqCMQ2RF{jv0525rWD2i?|CFg3g z-eS&4&AY{tomyFZi?t{*FFo}ZQ(nO>=ERf~P4*&CXhiYkmF6a;7D0@Q;sPm!vN_?_ z++r(B%qdMRj^Y9ffjLE>q*Vk;1)5AntPBhcVB?B77#J9CL6m{R1!PCD1Srigaxn5R zu`qHmNigv+3NeZ>a)3k_i-Z^$7?ME>U>KC5z-d;Efq@|voL-}tQW#U1S{S03Q<zg& zS{R~OQaDoBQrKG<qgYc|gBdhAZwa6WU^2)YkeMLIfv__u2wfN$7%~}Z7~&x$V}%ho zb!yczgG93!YnW{qDx_){T^M48YFS`nk_<I0HVlPo;S7aB;S4DZpyc7F$#jdaB(W$x zwIn__wWKIBx%d`)5jYl$HCb=56sP8-6^Vd?gDt<bq@c9q7GwS`#$0faK?rbIam2@G z=4F<|$4i2o#{+UO6B{ENBNrG}Nuqm852ijD6agURpdbdZ!Tz%X=foOjP`;|rsbR=s ztO4alriBbH46#zRpnSwC$p90t0kN5E7z!m2zG4Q)NRcQgN<pbkldVV$#1;p8B{#Js zBR}O9JJiQfY~U1;3rPW>)PUqQw$g%>#FA7ckjFrdWMJlC5@D<o$M6zFogX~LK<2<> zEDr3WEXD;)HH<Y(DU9NvQh>RHIg6!+Ifbd0v6iL6sD>enbpadLk1DmSCCpju3phYx zApb37tYxcV%i^qIsbNiF?qy<RDAWpPs9{;awUB`k5|=D~RjhgyImH#4jJH?{5=$~P z8E<hFXQZZ<#OEdErfPEEV#+Tr0+kuJShDgn^KLN}q}<|mtw>JI2`|blNi9Nz)-6t# z#F9j}qQu<PTbyY{`MKcCbc>@TKRy+t{uWDdVp*ytJ2*^laY0f>d`f1~Ef$bfQ5-3m zMPN&Bv6VxN24$Wo9*Aoop$-XRP#}S`Tah{g1A`VQkin^kQHYU;Nq|L&QHZ%p6eFm? zwln%^vfbi{2ip@Le~T+VJ~uz5G$%Da{uWPsd|_!~4#+L>@kLUgh?fNsAUlxkMOY3h zPC-ed7!<=COdOmX97P~8P0m}ad8N4pl|}p@{R-gJQ;?XKl2}{>N)fl%3JQuc^Gd*} l?G{^p5~z#>>%?ZwEe;!q6YW5STQR7@0~H?}j2uiHf&dOSt7!lL literal 0 HcmV?d00001 diff --git a/_utils/__pycache__/plot_utils.cpython-37.pyc b/_utils/__pycache__/plot_utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08e282b35f76714334f984cdfdfdb75929a6f35c GIT binary patch literal 7536 zcmZ?b<>g{vU|?u7ZB6{Dz`*br#DQTJ1_lNP1_p-WA_fMA6owSW9EM!RC?-Y-n>mUF zEXEqe3Z~hj*ugYM6lV%k3UdoX6jusM3Tq2P6n6?+3VRDf6i*6A3TF#L6mJSw3U>-u zIzt*u3Qr1e3u6>t3Ky8qoWhsF-@+KhpTZc-pegVY<PyJRkUKyuVFm^UP6h@BXONp( z7#SF97_t~@7}FV37<(COK|ID<mKuf{MoES&rdrk#<^?P@tTjy83=0`+*-BWm*lJj7 z*d!ThSU_Y7Q!i63LkW8p#{$k8<`m`yTs4d}ObeM7GS#w|urA=PVPDADFH*}<!;r;O z!vXS1FB2m}I713UAVUyC1Vaj24XZRmEoTXD7GDZ`3P&$v3{x#vEq4iT7XJc)6wZZ= zwLCRECCm#1Yd9A&mM||6g0UAeN;5Pw*7DYHX9+{xQo{*$5myaw3U@C{Enh`ciAW7! zGouSbtX~XMEq|>*jlcrYg$%WVHQZTZDLf0fO1NrRYxpG@Qh1x07BbcdED&GFP$RHF zVj)8cpEQFwLycexn>a%nQwo2IK#E|BP>OIb8zV#Eqi}{2-Ym%!krdGsF_3>sco#^e zh^I(^+y`=}2*W}qMntGcB87@j3GV{w8li=Zj0`2b3uG2DfYL!QgQk?9CgUxpq)JVu zTP!JwC5g9KGxJhXE2_AvYvZeH^(u0TD{ip`XQZZ<7-}-!;wlEQ;`0)7Q(rPNFfeE` zy%b?!V0g&`Vl&=iPs>a%&a6s}VvEdi&vPmb(PX;CoRgn^i#0hvC%>qQsk*j`S<fi- zB@;+JQxV9BB6bD_hAQ6tw6x;X5`}{N%)FA~TWraxc_pbux7d>MOG@%{HCb;lRaD+$ ztEenVtth$01tOF4^NUi7Z!u*g-eM|Ce96ziz;KJTC^J2yM3d<jM^S!BVo7FxUJ)qO zrO%oP0z2)~&z=E+FZNZO5bqV_rEALG;x5iFDv2*k%qdMRE&?UjTWk={EvCGJTdd_7 zsYR)`m<m#Eak?ayB)SzP=BD0a$}hgfQjl1Zaf>A@KQr$ZTUvfmZeqzTZr6(B)SU35 zOpx<AO7i1VLHt|H1vw?RIEqV?3UcyGif=I`=icH-Pc4a0&P^;R;s&{$IXAK57IRW! 
z(Jju@ywcp%qQsKaTU;PD8L1#&-Qq~h%LDl;^%g5gwBi;gh$_!aDap9SR#B3foLzj2 zr7|Zo_ZE9`Vp(chCMei*@{3b7g>NyXq}^f#`{@=_NyaVq{L+&6l+2=AY&nTZsX4_( zk{}nbXXd43CZ`tP;!MlTNsR}4j;$Q*lPIptyfSDA-C_nA&6HRX#hzOU_P{MRaP$`6 zVl7HdE-}2tn0bo@#Ja@>b9!24I@r9G3`L;q|I5qSDkiizwWv5IH?gQBGcP&OFeV_w zB_^OKKPxr4q&UXWGd{SaC_g(j#?do4J}5Q4D7CmaCcd;JGp9HP6owF{UP0w8aWJ<y zGd(XeEi*YW4;GUgpmN3tRPr$LF|sjoG4e5qF^VzrFiNm+FbOd7Fv>AXF$ytqFp4os zF)}eSG4e4(RB$mCxiT;?Br}8bGB7X*AQxN~j0_AV3^k0+jJ1q43|Wk|%r&6miUnL; zv7|6&vn*r;7hbG2EH#W03@MBYnQGZem=~~t_~7D+J&OZW#Da?)j#|zd&MeLv<{DN> zh8lJd2`a2o*m_xNxsYVopfc=GH5|RHwcHg^C0sRJHQdciE)20gF-)~QwY)Vv3%D0D z)bN5xafVvH8qO@96wU>_HGB&hYj_v%f#vuYGNf>U3mU!@Zg4@vlfnxsyi)|g1<jOj zh8oTV0x66of;FrujFJq^jOk1@j5UnuOeum3nZZSj5U5y65djxN>{-I#A_-Kuh-8T_ z5UXKI5nsqu%U{C2K)i;3AtSV~k?;d&c}=DwP~^O11(gDfMW8Ay<Q8MPCKEU(6oCqp zTZ~1T%m{7~D0_p7k0M?M1_p4Ep^3ZrDB=St0%_4?Eds?}kswGw2t<I&gCbCwRmGgG ztNT(HBqj!O2UC$ahz*iYheZ4Xdrd}&#X`B6#l@L<>ChaY2P!rsZZT)(m4J&2aP}#Z z0%?~9XQbT3ip<>7+*|CqnRy^akqSso6+~!)6fjq178GfNxH=$05k!Dpq6d;?0VU{L z%sHufMfwa33{m{BVk15=C9AZ!q(}~=ULHg!fCxj7^961(<(3thg7{`2!W=|^vUQOK z$PA9er2Mkf_>zn$j-=F_{POscj3R505*rX<2eKBNd5xmDKsh%atO&{mYb|mFDRcr6 zpd1X&#zih5ee59T#^;uSyiyDz7#KO2*cjOu`4|Nld6)zkr5HsRd6+~%JXlU+l4Im! z<YDAvED8eUG*EQ_3Uf9F1_n^d0NDU;w$v~zU|7gd%T&U+fT@Hri@AoWge8TsmuVqm zEprL$0=9(=F-)~AwXC3qKn+t3a|%-l`vT4s<`kBNOew4>Y`v_E3^l9^xKh|bX|RMf ziyPF4N#O#gTGj<TplYy$bpdY;BS=1lA(%mv8&n~(rj%3`q=FOc%V!`zgIpD&$yx*| z6K=8OCKhLd6WJ{mPyv36t+=o>u_!gYT4bI3!>ShhB2SPW^kk*UcZ(%Czo-bDQf{%7 z6lIp);>^!WEsoF0%uc<<l9ZpHgGed2n2Ym@qu4W26H9Uv3!(&5GRsnn(m{!^ATcMk zq$Kqgdu}2~pctG=Zt;|WYKi!q#LE2Ak|J=LU@I~PxdWUYB0#KAP<%0_6@eoi6v#-? zC<%^6ko!_|z*T2_T54iRX;Er1$TP*D%)`LQ0g6gSE=D0HE=E2^4#uJcQ1pSSaZvPu zq7lRfnOqDCu@uHyMsT1qK{7trI!&fqT*aW;xws@Vx%d`iB3KkcfD;ri*d+zf%oYYJ z#z7`CFcu|)+|S^a3~DlfECgjaP~{1VmEtgFP{~}vSi-b`8Pbeg$k@!-%-G3P!dSxu zYF2_uvIQ&)LCrUI6fstiSSNEkQ#*4S6R2!%;V5CNVFxwgni)HxrY&G!$gqH8Aww4H z0?vgDoeU*hS=<YFQdkx;mM~@Uf>K)vUk7tDV+wmOb2DQtM+sLJe+owmTMB0iS1)r6 zQ!QsLR}EJRw<JTBKnHUNLk&j?PcL&VcZE+4w+ln8ODzvnRxpKkflv+40^x-W3mI#8 z7KkilsNn%MAX7NOWj0tQg%4bA^P@BncZD-R%@6}M5_>`Au2_nA3qvPk4Oa@ABtw>X z2SYnUJ7XGCibRTJ3rC4Wic|_uFJp>y4M&PhFKY~QEpIJfEq@8q0?7_g!&@Llwnjh% z)Hsz|$RN#7E0`h&>Kv3vcQDrog6f@S##$k$*)lb}H9{%!DGDiyy)2BN@R6<IPf<!y z2IaOI&Mdha<`fkOuY_xXJUH(uWGOCCs$or0T_6O?gA18zg-f^=DAx#s@?#0t0u_*| z8kPyn#a`g><V@jAQA<&WhNno1Mv7*NR*H6tPKs`dUOIb<eu_alQ;K1VQHpVjNs4KT zS&Df&dx}MhWja%e)f|>up^7IZY$?_$HYv6#cIhmj7>>OV!&EC;D^{Xf!n8oGMidmv zpp;W9UZM)(iGk9KK#6*mMvX{~c#S|BlO#ipSPf$hLyCQh1Gt2Ni!acu5vmcBWB|Dy zp?ZPVLWTv}V4sMluw}DMU@9`HVFQ^h!63;{D^Vkn;s_Q!k|LGIl;Tt?iO`#+lj5A> zlH$5R2%%#kV~SvkAd(v06t^156!%^ZMurJYg(jfdJ4>d9v4c6qgNU&A#25D9?Ca%s zi#sK?IJqdZ0Mw{X24!+k{sZMZP<t6vX%>U(qI8BDhFFOhrdq~YrW&RWhHSPXmJUWx z-kZo&$Px@`{eiQ(Ci5*OJp)k3pfs=K7Gs$v(=EP&qSWG4c>C=Ze_3KtW@1uKYB9LO zQ4H1xu5*5I+2mvvmn7%s7TAGGpkh$@z{XHz2rfBNGK))!GLuR{PK{4aEJ;nzFUm|# z%+a&S$xlwqDYnx?=#F9uPAxHpR`{@%1>-H|ywY4vrXo>L-v(4Eftwwg%;2WVE!MoE z{PN;kta-`#ImO^QLX#2PXaO})phi8gr?h<nu0u2#qj<qhbXb2T8RSz~(1TnS3@RSg z7$8Mo4MPf}Gy}K*WL(H7!T>7%YMD})7J&NIObeNU88n&wpe~Gbs1mQX(lbn}RprvC zwo0j0C`c^HOwB7P*3@Jz@&c8#tVQ{yc`1<Uin+Knw+Pfw(PX~GT##H6#h#dxQ<j)h ze2XJ7sW?BUv?LW0KcK=N97(rW(n|A^3qkG!r4a^3DaI;oA|nW-6Q0UIsST9Uz^M$> zSnOb|Vd!9J26>0k4{WO@V-aYSLz4+yK^K*PDm#QjIBaqfbCXgM?aCM!7(RnSuS%at zhro2+V$62aWWB{$5yhTZT9OY608UUZw4f-zpg4*rIlmOtUro-*PtGo~Wnf@{2Qu6j zARoXZOa`o^h9Oo2l2=j~7BD~tFd+Fvld+1w+MrfJSE1U_%23ZJtyWc&5gPnOX&~R( zgCdAA3+#Ic0jfKTvKc_*Dhv#;9Kj9gNGs%|rj=;2fE&)>6bp6V1ADL#q??``#g~+n zUlCuDAD@_)oRMGj5){~)jJH@oeR^<{y-K9opf(;kgMr(^Ohur9otL00UlY=vuVT#& z3RW<HP=*l72tpY{C=&=}3ZcxZc--^zQxx(`LAg4$O4un8&QZ{?Gs+DL)~w<x&d5wF 
zQOHkIFhF7(BC(B-*v3d~6C}1N65Fhb-!Uh%EL9;ftt7Qb!O|e5ir*zQF$K<7NYP}u z#g-csY-x~kizOvDDEJm9*cI_$W^R5_Nn%cBNo7$TC{h|gL_LUT1Vs;9L1J-9YK|r= zq`_4I5@XINO$8^?TkOzw9CUPIrG1eHC{;klDo)y`r-8vc`zrnb1#s8hN}<|H&n&I> zB`B~pS;0da;I7Oq&XS_UykgJ@#4V1r{Jauye1W=AnoPG?OEODxQj0+SfhuYA*o!Yo zEJ{x;0Y{?VEjG|NNNUP07Es{|$@Xled6|W!skd0NQ!63E1VvgPZ}5O!6c0}K#kaUX z8sotON4MBY^1<4{T^>-uRHP2lz>$-gS6q;ooLb}$3Lki1s0h?!0H<t7hX~#cisA+h zX~Y*K7A58u-(oJvOufaDl9^m`i#@S8v8X7qvItbP6#0S-XUj=VPt8k-Vku57DF6+# z6hJ&32eJxFZ>b10EC4Oz(sJ_ii;7}Eog_}McR(e>E&k%vl6df-3b^<!jslOwK&-jN zR+O5XUzU1{1ylk+1K+VCwOCUM+~B^&0js!fv6g`9uc8i+Bbh<%a!_0R76+_t8^xbm zQIMKkl9~c-?1IY<aK|i)IkBQRiX|zrsJIB!XN+PiPfbZrExyGHva~pg15^xwn70@! zqu2}bi$ED29Lq)EP8zt056b1Y*z!wD3Q9|gz{!uX09;dnOAJjGa6*KXmYJZS$^sFf za^#jQC=^gKI&w`K1uE1FKm{q27^4WI2$KYp7^4D{2$KjS7o!NH9Ar3!Nq~uig^Q7g zkpnci!pOxa0O>t2@qzmeAhjHfY>XmI0*q{o9E=={JWQYg86HL%Mv$o>okENXj0#BV zixz|W8=!FmkmaBrKB)N*D(Q>Gm_T)47Gnw{r~#kC0&XI**0Mpm0t=W@SQj#a*ftC$ z%vmfoY@p^ahz}b5P2mJJ<4TybSZmloodj^>jy;77+>B+;VuOjMaHsIpFr@H;RM)UB zV6R~UHS|ir-2z6263#5H1>7~vDSQifpp9Af5{3o5ps{F1h7tzQ;0uV(0=W=0whMP7 ztPu}tw5N!rvxCNT)0t8vQ>0R)Q)E(PQ{+<Q)7e1{>vX0Rr8z9M92Kuh_*0ZqR8mw^ z)S%7SYmhERi9m_q0-+kt6eei~a5I*>L;%DCH)DAqeyriH;Q=>eL8ImjDeNigARm>0 zeTpmx^{)nKP^g9tCbK}ehNFfH+Q{VuHE4_e)Ubf;05@lOYj{&MvsorE7EMXv&SOf^ zs^vrI%@Tn$))(*~bSz{{(M!=oQX`t8U&EJT0BX(_{t0KOVa;N%0XOPFjaiJ)HpUm) z;O47|AGnr-G~ytwx}sdrpb4a|y2YNBlb={(Yy_?ui<%f17(iWpa1H|1+Tfg1)B?`K z*(s@-ERf=S0VuP8<v~(K<)F4Ahzl$3!SbL&xTprE5^wccR0|piA4c^iq)r4^hv144 zR3}2jz+A8dNDLNWpaQB09BQCo1P8b#8zig|Qm6q+a@|=39vW-~MLtpmTGR%Lp>_}f zHXquM0mU2sI#Ud3yoN2Y7(8eMu1YvEGK)(<W#}#Tr2GmHBZ{-QJh3Pj#4MT$vI$$| z1ZrS|$8I1s6sW2NS5x51xM&(k+jI~y6GY4c`GOx@<rK{Z34p4ZB2W!eG!Mi@jPHSK z1W*g02vj2!Ed<FTf^-px3-&jtAO%+qkOsUo$g!xE0C;8s)R_Vm#KjJvQkYSMQHW83 zS&UJFkqbOP#Kp+Lgso7QV-jQHVB}#073y+Kpy4A>6;QMi<PCvag1L#{0L#ft(krM0 z*?fx)LW9Hj7HeK<Zb2m^AV5|6Ew+Nhyp+V^B2Zg8ioG~BF)6<&4_uLhhws57i{SAo x@URiM<^bCT@gSO4Ic#$CQ%ZAE?LeK4V$k?7sM-QmN*v4_Djc#Lyc{e-o&ZAVU_Sr= literal 0 HcmV?d00001 diff --git a/_utils/plot_utils.py b/_utils/plot_utils.py new file mode 100644 index 0000000..73fd8e9 --- /dev/null +++ b/_utils/plot_utils.py @@ -0,0 +1,203 @@ +import matplotlib.pyplot as plt +import numpy as np +import os +import pandas as pd +import seaborn as sns + + +def plot_significant_values(df, value, th, out_dir): + df = df.sort_values(by=[value]) + labels = df.index + values = df[value].values + indices = np.where(values < th) + + df= pd.DataFrame(data=values[indices], index=labels[indices]) + file_path = os.path.join(out_dir, '{}_{}.xlsx'.format(value, th)) + with pd.ExcelWriter(file_path) as writer: + df.to_excel(writer, sheet_name='Sheet1') + + + + inv_values = 1 / values + + fig, ax = plt.subplots(figsize=(20, 12)) + my_cmap = plt.cm.get_cmap('YlGnBu') + colors = my_cmap(np.log(inv_values) / np.max(np.log(inv_values))) + rect1 = ax.bar(labels[indices], inv_values[indices], log=True, color=colors) + for i, rect in enumerate(rect1): + height = rect.get_height() + ax.annotate('{}'.format(format(values[i], '.2e')), + xy=(rect.get_x() + (rect.get_width() / 2), height + 1), + xytext=(0, 1), + textcoords="offset points", + ha='center', va='bottom') + + # ax.plot([0, len(indices[0])], [th, th], "k--") + plt.xticks(rotation=15, ha='right') + # plt.subplots_adjust(bottom=0.5) + plt.ylim(0.1, 1.3*np.max(inv_values)) + fig_path = os.path.join(out_dir, '{}_{}.png'.format(value,th)) + plt.savefig(fig_path) + plt.close() + + +def plot_mv(mv, th, out_dir): + th = int(th) + + labels = mv.index + values = mv.values + indices = np.where(values > 0) + above_th = 
+
+
+def plot_mv(mv, th, out_dir):
+    th = int(th)
+
+    labels = mv.index
+    values = mv.values
+    indices = np.where(values > 0)
+    # split each bar at the missing-value threshold: green below, red above
+    above_th = np.maximum(values - th, 0)
+    below_th = np.minimum(values, th)
+
+    fig, ax = plt.subplots(figsize=(20, 5))
+    rect1 = ax.bar(labels[indices], below_th[indices], log=True, color='g')
+    rect2 = ax.bar(labels[indices], above_th[indices], log=True, color='r',
+                   bottom=below_th[indices])
+
+    for rect_below, rect_above in zip(rect1, rect2):
+        height = (rect_below.get_height() + rect_above.get_height())
+        ax.annotate('{}'.format(height),
+                    xy=(rect_above.get_x() + (rect_above.get_width() / 2), height + 1),
+                    xytext=(0, 1),
+                    textcoords="offset points",
+                    ha='center', va='bottom')
+
+    ax.plot([0, len(indices[0])], [th, th], "k--")
+    plt.xticks(rotation=45, ha='right')
+    plt.subplots_adjust(bottom=0.5)
+    fig_path = os.path.join(out_dir, 'missing_values.png')
+    plt.savefig(fig_path)
+    plt.close()
+
+
+def plot_correlation_features(df, out_dir):
+    # note: out_dir is used directly as the output image path
+    corr = df.corr()
+    mask = np.triu(np.ones_like(corr, dtype=bool))
+    f, ax = plt.subplots()
+    sns.heatmap(data=corr, mask=mask, center=0,
+                cmap=sns.diverging_palette(230, 20), square=True)
+
+    ax.margins(0.01)
+    plt.xticks(rotation=45, ha='right')
+    plt.tight_layout()
+    plt.savefig(out_dir)
+
+
+def plot_p_values(statistics):
+    # placeholder, not implemented yet
+    pass
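Note the asymmetry in the helpers above: plot_mv and the distribution plots treat out_dir as a directory, while plot_correlation_features treats it as the full target file name. A sketch with made-up numeric columns:

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    df_num = pd.DataFrame(rng.normal(size=(50, 3)),
                          columns=['Age', 'NIHSS', 'ICV Index'])  # assumed numeric features
    plot_correlation_features(df_num, out_dir='results/correlation.png')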
+
+
+def plot_distribution_categorical(df, variable, table, title='', out=False, out_dir='', p_values=[]):
+    present_values = sorted(list(np.unique(df[variable])))
+    expected_values = list(table[variable]['description'].keys())
+    variables_names = list(table[variable]['description'].values())
+
+    # keep only the labels of categories actually present in the data
+    variables_names = [variables_names[count] for count, v in enumerate(expected_values) if v in present_values]
+
+    data = df[variable].value_counts().sort_index().values.tolist()
+    colors = plt.cm.get_cmap('Set3')(np.linspace(0., 1, num=len(variables_names)))
+
+    fig, axs = plt.subplots(figsize=(15, 6), nrows=1, ncols=2)
+
+    bars = axs[0].bar(variables_names, data, color=colors)
+    for bar in bars:
+        axs[0].annotate('{}'.format(bar.get_height()),
+                        xy=(bar.get_x() + (bar.get_width() / 2), bar.get_height() + 0.5),
+                        xytext=(0, 1),
+                        textcoords="offset points",
+                        ha='center', va='bottom')
+    axs[0].tick_params(labelrotation=45)
+
+    def func(pct, allvals):
+        absolute = int(round(pct / 100. * np.sum(allvals)))
+        return "{:.1f}%\n({:d} patients)".format(pct, absolute)
+
+    wedges, texts, autotext = axs[1].pie(x=data,
+                                         autopct=lambda pct: func(pct, data),
+                                         textprops=dict(color='k'),
+                                         counterclock=False,
+                                         colors=colors)
+    y = np.asarray(data)
+    percent = 100. * y / y.sum()
+    labels = ['{0} - {1:1.2f}%'.format(i, j) for i, j in zip(variables_names, percent)]
+    axs[1].legend(wedges, labels,
+                  loc='center left', bbox_to_anchor=(1, 0, 0.5, 1))
+
+    plt.setp(autotext, size=10)
+    fig.suptitle(title, size=20)
+    plt.tight_layout()
+    fig_path = os.path.join(out_dir, '{0}_distribution.png'.format(variable))
+    plt.savefig(fig_path)
+    plt.close()
+    if not out:
+        fig, ax = plt.subplots(2, 2, figsize=(12, 8))
+
+        labels = {'mRS90d': ['mRS 0', 'mRS 1', 'mRS 2', 'mRS 3', 'mRS 4', 'mRS 5', 'mRS 6'],
+                  'dmRS': ['Good outcome', 'Bad outcome (>2mRS)'],
+                  'shift_mRS': ['shift of 0', 'shift of 1', 'shift of 2', 'shift of 3',
+                                'shift of 4', 'shift of 5', 'shift of 6'],
+                  'mortality': ['Alive after 90d', 'Dead after 90 d']}
+
+        for count, output in enumerate(['mRS90d', 'shift_mRS', 'dmRS', 'mortality']):
+            x = int(np.floor(count / 2))
+            y = int(count % 2)
+            sns.countplot(ax=ax[x, y], x=output, hue=variable, data=df, palette="pastel")
+            ax[x, y].set_xticklabels(labels[output])
+            ax[x, y].get_legend().remove()
+            # p-values are provided for the dmRS and mortality panels only
+            if count > 1:
+                p = p_values[count - 2]
+                color = 'k' if p > 0.05 else 'r'
+                ax[x, y].text(0.2, 0.95, 'P value: {:.6f}'.format(p),
+                              ha='center', va='center', transform=ax[x, y].transAxes, fontsize=9, color=color)
+
+        fig.legend(title=variable, labels=variables_names)
+        fig.suptitle(title, size=20)
+        plt.tight_layout()
+        fig_path = os.path.join(out_dir, '{0}_distribution_target.png'.format(variable))
+        plt.savefig(fig_path)
+        plt.close()
+
+
+def plot_distribution_numerical(df, variable, title='', out_dir='', p_values=[]):
+    fig, ax = plt.subplots(1, 2, figsize=(15, 6))
+    df[variable] = df[variable].astype('float32')
+    sns.histplot(ax=ax[0], data=df, x=variable, palette='pastel', kde=True)
+
+    sns.boxplot(ax=ax[1], y=variable, data=df, palette="pastel")
+    sns.swarmplot(ax=ax[1], y=variable, color="k", size=3, data=df)
+    fig.suptitle(title)
+    plt.tight_layout()
+    fig_path = os.path.join(out_dir, '{0}_distribution.png'.format(variable))
+    plt.savefig(fig_path)
+    plt.close()
+
+    fig, ax = plt.subplots(2, 2, figsize=(12, 8))
+
+    labels = {'mRS90d': ['mRS 0', 'mRS 1', 'mRS 2', 'mRS 3', 'mRS 4', 'mRS 5', 'mRS 6'],
+              'dmRS': ['Good outcome', 'Bad outcome (>2mRS)'],
+              'shift_mRS': ['shift of 0', 'shift of 1', 'shift of 2', 'shift of 3',
+                            'shift of 4', 'shift of 5', 'shift of 6'],
+              'mortality': ['Alive after 90d', 'Dead after 90 d']}
+
+    for count, output in enumerate(['mRS90d', 'shift_mRS', 'dmRS', 'mortality']):
+        x = int(np.floor(count / 2))
+        y = int(count % 2)
+        sns.boxplot(ax=ax[x, y], x=output, y=variable, data=df, palette="pastel")
+        sns.swarmplot(ax=ax[x, y], x=output, y=variable, color="k", size=3, data=df)
+        ax[x, y].set_xticklabels(labels[output])
+        # as above, p-values exist only for the dmRS and mortality panels
+        if count > 1:
+            p = p_values[count - 2]
+            color = 'k' if p > 0.05 else 'r'
+            ax[x, y].text(0.2, 0.95, 'P value: {:.6f}'.format(p),
+                          ha='center', va='center', transform=ax[x, y].transAxes, fontsize=9, color=color)
+
+    fig.suptitle(title, size=20)
+    plt.tight_layout()
+    fig_path = os.path.join(out_dir, '{0}_distribution_target.png'.format(variable))
+    plt.savefig(fig_path)
+    plt.close()
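The `table` argument of plot_distribution_categorical is expected to follow the schema of the YAML dictionaries added later in this commit (type/info/categories/description). A toy call under that assumption; the frame must also carry the four outcome columns the second figure iterates over, each with all of its categories present:

    import pandas as pd

    table = {'Sex': {'type': 'cat', 'info': 'Patient sex', 'categories': 2,
                     'description': {0: 'man', 1: 'woman'}}}
    df = pd.DataFrame({'Sex':       [0, 1, 0, 1, 0, 1, 0],
                       'mRS90d':    [0, 1, 2, 3, 4, 5, 6],
                       'shift_mRS': [0, 1, 2, 3, 4, 5, 6],
                       'dmRS':      [0, 0, 0, 1, 1, 1, 1],
                       'mortality': [0, 0, 0, 0, 0, 1, 1]})
    # p_values feed the dmRS and mortality panels, in that order
    plot_distribution_categorical(df, 'Sex', table, title='Sex distribution',
                                  out=False, out_dir='results', p_values=[0.8, 0.4])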
diff --git a/architectures/3D_CNN.py b/architectures/3D_CNN.py
new file mode 100644
index 0000000..21a40e9
--- /dev/null
+++ b/architectures/3D_CNN.py
@@ -0,0 +1,64 @@
+import torch.nn as nn
+import torch.nn.functional as F
+import torch
+
+
+class _3D_CNN(nn.Module):
+    def __init__(self, num_output):
+        super(_3D_CNN, self).__init__()
+        self.conv1 = nn.Conv3d(1, 8, kernel_size=3, stride=1, padding=1)
+        self.conv2 = nn.Conv3d(8, 16, kernel_size=3, stride=1, padding=1)
+        self.conv3 = nn.Conv3d(16, 32, kernel_size=3, stride=1, padding=1)
+        self.conv4 = nn.Conv3d(32, 64, kernel_size=3, stride=1, padding=1)
+        self.conv5 = nn.Conv3d(64, 128, kernel_size=3, stride=1, padding=1)
+
+        self.BN1 = nn.BatchNorm3d(num_features=8)
+        self.BN2 = nn.BatchNorm3d(num_features=16)
+        self.BN3 = nn.BatchNorm3d(num_features=32)
+        self.BN4 = nn.BatchNorm3d(num_features=64)
+        self.BN5 = nn.BatchNorm3d(num_features=128)
+
+        self.pool1 = nn.AdaptiveAvgPool3d((64, 64, 64))
+        self.pool2 = nn.AdaptiveAvgPool3d((32, 32, 32))
+        self.pool3 = nn.AdaptiveAvgPool3d((16, 16, 16))
+        self.pool4 = nn.AdaptiveAvgPool3d((8, 8, 8))
+        self.pool5 = nn.AdaptiveAvgPool3d((4, 4, 4))
+
+        # 128 feature maps of size 4x4x4 after pool5 -> 8192 inputs to fc1
+        self.fc1 = nn.Linear(8192, 1300)
+        self.fc2 = nn.Linear(1300, 50)
+        self.fc3 = nn.Linear(50, num_output)
+
+        for m in self.modules():
+            if isinstance(m, nn.Linear):
+                nn.init.xavier_uniform_(m.weight.data)
+                nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.Conv3d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+                nn.init.constant_(m.bias, 0)
+
+    def forward(self, x):
+        x = F.relu(self.BN1(self.conv1(x)))
+        x = self.pool1(x)
+
+        x = F.relu(self.BN2(self.conv2(x)))
+        x = self.pool2(x)
+
+        x = F.relu(self.BN3(self.conv3(x)))
+        x = self.pool3(x)
+
+        x = F.relu(self.BN4(self.conv4(x)))
+        x = self.pool4(x)
+
+        x = F.relu(self.BN5(self.conv5(x)))
+        x = self.pool5(x)
+
+        x = x.view(x.size(0), -1)
+
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = torch.log_softmax(self.fc3(x), dim=1)
+
+        return x
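A quick shape check for _3D_CNN, confirming the 8192-feature flatten that fc1 now expects; the input volume size is an arbitrary example, since every pooling stage is adaptive:

    import torch

    model = _3D_CNN(num_output=2)
    vol = torch.randn(1, 1, 97, 115, 97)  # (batch, channel, depth, height, width)
    out = model(vol)                      # log-probabilities of shape (1, 2)
    print(out.shape, out.exp().sum())     # the two class probabilities sum to ~1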
diff --git a/architectures/Edge_GCN.py b/architectures/Edge_GCN.py
new file mode 100644
index 0000000..bfa90f0
--- /dev/null
+++ b/architectures/Edge_GCN.py
@@ -0,0 +1,25 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch_geometric.nn import ChebConv, EdgeConv
+from torch_geometric.utils import dropout_adj
+
+
+class Edge_GCN(nn.Module):
+    def __init__(self, nfeat, nclass, dropout=0):
+        super(Edge_GCN, self).__init__()
+
+        self.dropout = dropout
+
+        # EdgeConv concatenates [x_i, x_j - x_i], so its MLP takes 2 * in_features inputs
+        self.gc1 = EdgeConv(nn.Sequential(nn.Linear(2 * nfeat, int(nfeat / 2)), nn.ReLU()))
+        self.gc2 = EdgeConv(nn.Sequential(nn.Linear(2 * int(nfeat / 2), nclass), nn.ReLU()))
+
+    def forward(self, data, edge_index, weights):
+        x = self.gc1(data, edge_index)
+        # drop a fraction of edges instead of features between the two convolutions
+        edge_index_drop, _ = dropout_adj(edge_index, p=self.dropout)
+        x = self.gc2(x, edge_index_drop)
+        x2 = torch.log_softmax(x, dim=1)
+
+        return x2
diff --git a/architectures/FCN.py b/architectures/FCN.py
new file mode 100644
index 0000000..f2de6cb
--- /dev/null
+++ b/architectures/FCN.py
@@ -0,0 +1,52 @@
+import torch
+import torch.nn as nn
+
+
+class Basic_FCN(nn.Module):
+
+    def __init__(self, in_features, layers, out_features, dropout_rate):
+        super(Basic_FCN, self).__init__()
+
+        self.layers = []
+        self.n_layers = layers['number']
+        l = nn.Linear(in_features, layers['layer1'])
+        self.layers.extend([l])
+
+        if self.n_layers > 2:
+            for i in range(self.n_layers - 2):
+                name0 = 'layer{}'.format(i + 1)
+                name1 = 'layer{}'.format(i + 2)
+                l = nn.Linear(layers[name0], layers[name1])
+                self.layers.extend([l])
+
+        if self.n_layers >= 2:
+            name0 = 'layer{}'.format(self.n_layers - 1)
+            l = nn.Linear(layers[name0], out_features)
+            self.layers.extend([l])
+        self.layers = nn.ModuleList(self.layers)
+
+        self.dropout = nn.Dropout(p=dropout_rate)
+
+        for m in self.modules():
+            if isinstance(m, nn.Linear):
+                nn.init.xavier_uniform_(m.weight.data)
+                nn.init.constant_(m.bias.data, 0.03)
+
+    def forward(self, data):
+        # accept 1-D inputs by reshaping them into a single feature column
+        if len(data.shape) == 1:
+            data = torch.reshape(data, (data.shape[0], 1))
+        x = torch.relu(self.layers[0](data))
+
+        if self.n_layers > 2:
+            for i in range(1, self.n_layers - 1):
+                x = torch.relu(self.layers[i](x))
+                x = self.dropout(x)
+        # x = torch.sigmoid(self.layers[self.n_layers - 1](x))
+        x = self.layers[self.n_layers - 1](x)
+
+        return torch.log_softmax(x, dim=1)
diff --git a/architectures/GCN.py b/architectures/GCN.py
new file mode 100644
index 0000000..239fd36
--- /dev/null
+++ b/architectures/GCN.py
@@ -0,0 +1,38 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch_geometric.nn import ChebConv, EdgeConv
+from torch_geometric.utils import dropout_adj
+
+
+class GCN(nn.Module):
+    def __init__(self, nfeat, nclass, dropout=0):
+        super(GCN, self).__init__()
+
+        self.dropout = dropout
+
+        self.gc1 = ChebConv(nfeat, 200, 3)
+        self.gc2 = ChebConv(200, 50, 3)
+        self.gc3 = ChebConv(50, nclass, 3)
+
+    def forward(self, data, edge_index, weights):
+        # weighted variant x = self.gc1(data, edge_index, weights) left disabled
+        x = self.gc1(data, edge_index)
+        x = F.relu(x)
+        # dropout is active only in train mode; evaluate_model keeps the
+        # network in train mode on purpose for Monte Carlo dropout sampling
+        x = F.dropout(x, p=self.dropout, training=self.training)
+        x = self.gc2(x, edge_index)
+        x = F.dropout(x, p=self.dropout, training=self.training)
+        x = F.relu(x)
+        x = self.gc3(x, edge_index)
+        x2 = torch.log_softmax(x, dim=1)
+
+        return x2
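Basic_FCN assembles its stack from a small config dict; judging from the constructor, 'number' counts the linear layers and 'layer{i}' gives each hidden width. An assumed example:

    import torch

    layers_cfg = {'number': 3, 'layer1': 64, 'layer2': 32}  # 20 -> 64 -> 32 -> 2
    model = Basic_FCN(in_features=20, layers=layers_cfg,
                      out_features=2, dropout_rate=0.3)
    log_probs = model(torch.randn(8, 20))  # shape (8, 2)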
diff --git a/architectures/ML_algorithms.py b/architectures/ML_algorithms.py
new file mode 100644
index 0000000..075f073
--- /dev/null
+++ b/architectures/ML_algorithms.py
@@ -0,0 +1,146 @@
+from Metrics.ClassificationMetrics import ClassificationMetrics
+
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import GridSearchCV
+from sklearn.neural_network import MLPClassifier
+import xgboost as xgb
+
+
+def apply_LR(x_train, x_val, x_test, y_train, y_val, y_test):
+    # Number of classes
+    c = np.unique(y_train).shape[0]
+
+    # Parameters for hyperparameter optimization
+    solvers = ['newton-cg', 'liblinear']
+    penalty = ['l2']
+    c_values = [1000, 100, 10]
+
+    # Define grid search
+    grid = dict(solver=solvers, penalty=penalty, C=c_values)
+    scoring = 'roc_auc' if c < 3 else 'f1_micro'
+    multiclass = 'auto' if c < 3 else 'ovr'
+    try:
+        grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=grid, n_jobs=-1, cv=5,
+                                   scoring=scoring, error_score=0)
+        grid_result = grid_search.fit(x_val, y_val)
+
+        # Summarize results
+        print("Best parameters: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
+
+        # Retrain on the training split with the selected hyperparameters
+        clf = LogisticRegression(C=grid_result.best_params_['C'], solver=grid_result.best_params_['solver'],
+                                 random_state=0, max_iter=500, multi_class=multiclass).fit(x_train, y_train)
+    except Exception:
+        # fall back to default hyperparameters if the grid search fails
+        clf = LogisticRegression(random_state=0, max_iter=500, multi_class='ovr').fit(x_train, y_train)
+
+    metrics_tensor = compute_metrics(clf, splits=[(x_train, y_train), (x_val, y_val), (x_test, y_test)])
+
+    return clf, metrics_tensor
+
+
+def apply_random_forest(x_train, x_val, x_test, y_train, y_val, y_test, save=False):
+    # Parameters for hyperparameter optimization
+    # Number of trees in random forest
+    n_estimators = [2, 5, 10, 100]
+    # Number of features to consider at every split
+    max_features = ['log2', 'sqrt'] + [a for a in (2, 5, 10, 20) if a <= x_train.shape[1]]
+    # Maximum number of levels in tree
+    max_depth = [5, 10, None]
+    bootstrap = [True, False]
+    criterion = ['gini', 'entropy']
+
+    # Define grid search
+    random_grid = {'n_estimators': n_estimators,
+                   'max_features': max_features,
+                   'max_depth': max_depth,
+                   'criterion': criterion,
+                   'bootstrap': bootstrap}
+
+    rf = RandomForestClassifier(random_state=0)
+    grid_search = GridSearchCV(estimator=rf, param_grid=random_grid, n_jobs=-1, cv=2,
+                               scoring='roc_auc', error_score=0)
+    grid_result = grid_search.fit(x_val, y_val)
+
+    # Summarize results
+    print("Best parameters: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
+
+    # Retrain on the training split with the selected hyperparameters
+    clf = RandomForestClassifier(random_state=0, **grid_result.best_params_).fit(x_train, y_train)
+    metrics_tensor = compute_metrics(clf, splits=[(x_train, y_train), (x_val, y_val), (x_test, y_test)])
+
+    return clf, metrics_tensor
+
+
+def apply_mlp(x_train, x_val, x_test, y_train, y_val, y_test):
+    mlp_gs = MLPClassifier(max_iter=300, verbose=False)
+    parameter_space = {
+        # 'hidden_layer_sizes': [(10, 30, 10), (20,), (50, 20), (20, 50, 20)],
+        'hidden_layer_sizes': [(20,), (64, 32, 16), (128, 32, 16, 8), (40, 20)],
+        'activation': ['relu'],
+        'solver': ['sgd', 'adam'],
+        'alpha': [0.0001, 0.05],
+        'learning_rate': ['constant', 'adaptive']
+    }
+
+    grid_search = GridSearchCV(mlp_gs, param_grid=parameter_space, n_jobs=-1, cv=2,
+                               scoring='roc_auc', error_score=0)
+    grid_search.fit(x_val, y_val)  # hyperparameters are selected on the validation split
+    print('Best %f using %s ' % (grid_search.best_score_, grid_search.best_params_))
+
+    # Retrain on the training split with the selected hyperparameters
+    clf = MLPClassifier(max_iter=300, verbose=False, **grid_search.best_params_).fit(x_train, y_train)
+    metrics_tensor = compute_metrics(clf, splits=[(x_train, y_train), (x_val, y_val), (x_test, y_test)])
+
+    return clf, metrics_tensor
+
+
+def apply_xgbBoost(x_train, x_val, x_test, y_train, y_val, y_test):
+    params = {
+        'gamma': [0.5, 1, 1.5, 2, 5],
+        'subsample': [0.6, 0.8, 1.0],
+        'colsample_bytree': [0.4, 0.6, 0.8, 1.0],
+        'max_depth': [2, 4, 6],
+    }
+    xgb_model = xgb.XGBClassifier(random_state=4, use_label_encoder=False, eval_metric='mlogloss')
+    grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, n_jobs=-1, cv=2,
+                               scoring='roc_auc', error_score=0)
+    clf = grid_search.fit(x_val, y_val)
+    print('Best %f using %s ' % (clf.best_score_, clf.best_params_))
+
+    # Retrain on the training split with the selected hyperparameters
+    best_model = xgb.XGBClassifier(random_state=4, use_label_encoder=False, eval_metric='mlogloss',
+                                   **clf.best_params_).fit(x_train, y_train)
+
+    metrics_tensor = compute_metrics(best_model, splits=[(x_train, y_train), (x_val, y_val), (x_test, y_test)])
+
+    return best_model, metrics_tensor
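All four helpers share one calling convention: hyperparameters are tuned on the validation split, the winning configuration is refit on the training split, and metrics come back for train/val/test. A synthetic end-to-end run for apply_LR (sizes are arbitrary; this assumes the repo's ClassificationMetrics is importable):

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=300, n_features=10, random_state=0)
    x_tmp, x_test, y_tmp, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    x_train, x_val, y_train, y_val = train_test_split(x_tmp, y_tmp, test_size=0.25, random_state=0)

    clf, metrics = apply_LR(x_train, x_val, x_test, y_train, y_val, y_test)
    # metrics holds one metrics dict per split: train, validation, test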
+
+
+def compute_metrics(clf, splits):
+    metrics_tensor = []
+    for x, y in splits:
+        prob = clf.predict_proba(x)
+        # one-hot encode the integer labels to match the probability matrix
+        output_one_hot = np.zeros((y.shape[0], len(np.unique(y))))
+        for i in range(output_one_hot.shape[0]):
+            output_one_hot[i, y[i]] = 1
+        # binary setting: two-column probabilities and one-hot labels
+        CM = ClassificationMetrics(classes=2)
+        metrics = CM.compute_metrics(prob.squeeze(), output_one_hot)
+        metrics_tensor.extend([metrics])
+
+    return metrics_tensor
diff --git a/architectures/__pycache__/Edge_GCN.cpython-37.pyc b/architectures/__pycache__/Edge_GCN.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e84e8b4313f41fe71b4b9ef5bca63c59c83a4320
GIT binary patch literal 1235 (binary data omitted)
diff --git a/architectures/__pycache__/FCN.cpython-37.pyc b/architectures/__pycache__/FCN.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48d5168787abc3f17c28ddf1a6e4d22b47176aad
GIT binary patch literal 1531 (binary data omitted)
diff --git a/architectures/__pycache__/GCN.cpython-37.pyc b/architectures/__pycache__/GCN.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fcd4d2fcac2fd81b246c8c8cdd10b29546684431
GIT binary patch literal 1173 (binary data omitted)
diff --git a/architectures/__pycache__/ML_algorithms.cpython-37.pyc b/architectures/__pycache__/ML_algorithms.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fbb131a9cdf60971f8ccc94d907cf2f5a60ecd32
GIT binary patch literal 4090 (binary data omitted)
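Both dictionaries added below share one entry schema: a 'type' ('cat', 'ord', 'int' or 'float'), an 'info' string, and, for discrete variables, 'categories' plus an integer-to-label 'description' map. A sketch of how such an entry can be consumed (the yaml import and the relative path are assumptions, not code from this commit):

    import yaml

    with open('dictionaries/dictionary_modalities.yml') as f:
        modalities = yaml.safe_load(f)

    entry = modalities['Metadata']['Sex']
    if entry['type'] == 'cat':
        # map the stored integer codes to readable labels
        labels = [entry['description'][c] for c in range(entry['categories'])]
        print(entry['info'], labels)  # -> Patient sex ['man', 'woman']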
Output +# 1.Meta data +# 2.Clinical data +# 3.NCCT data +# 4.CTP data +# 5.CTA data +# 6.Treatment +# 7.Treatment output +# 7.Control CT data +# 8.Temporal + +# Before-Imaging: Metadata, NIHSS, Unkown_Onset, Time_Onset_to_Admission +# Pre-treatment: Metadata, NCCT, CTP, CTA (+NIHSS, Time_Onset_to_Admission, Time_CT_to_Angio) +# Post-treatment: Metadata, NCCT, CTP, CTA, Treatment, Treatment out (+NIHSS, Time_Onset_to_Admission, Time_CT_to_Angio, Time_Puncture_to_Recan ) +# Post-treatment 24 h: Metadata, NCCT, CTP, CTA, Treatment, Treatment out, Control CT (+NIHSS) + +Ouput: + + mRS90d: + type: 'ord' + info: 'Functional outcome at 90 days' + categories: 7 + description: + 0: 'mRS 0' + 1: 'mRS 1' + 2: 'mRS 2' + 3: 'mRS 3' + 4: 'mRS 4' + 5: 'mRS 5' + 6: 'mRS 6' + + dmRS: + type: 'cat' + info: 'Binary functional outcome at 90 days' + categories: 2 + description: + 0: 'Good outcome' + 1: 'Bad outcome' + + mortality: + type: 'cat' + info: 'Mortality at 90 days' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + shift_mRS: + type: 'ord' + info: 'Shift in mRS' + categories: 7 + description: + + 0: 'Shift of 0' + 1: 'Shift of 1' + 2: 'Shift of 2' + 3: 'Shift of 3' + 4: 'Shift of 4' + 5: 'Shift of 5' + 6: 'Shift of 6' + +Metadata: + + Sex: + type: 'cat' + info: 'Patient sex' + categories: 2 + description: + 0: 'man' + 1: 'woman' + + Age: + type: 'int' + info: 'Patient age' + +Clinical: + + NIHSS: + type: 'int' + info: 'National Institute of Health Stroke Scale' + categories: 43 + description: None + + pre-mRS: + type: 'ord' + info: 'mRS previous to stroke' + categories: 6 + description: + 0: 'mRS 0' + 1: 'mRS 1' + 2: 'mRS 2' + 3: 'mRS 3' + 4: 'mRS 4' + 5: 'mRS 5' + + aHT: + type: 'cat' + info: 'arterial hypertension' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + HLP: + type: 'cat' + info: 'hyperlipidemia' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + DM: + type: 'cat' + info: 'Diabetes Mellitus' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + aFib: + type: 'cat' + info: 'atrial fibrillation' + categories: 2 + description: + 0: 'No' + 1: 'Yes' + + s.p. 
+
+  s.p. stroke:
+    type: 'cat'
+    info: 'previous stroke'
+    categories: 2
+    description:
+      0: 'No'
+      1: 'Yes'
+
+  TAH:
+    type: 'cat'
+    info: 'anti-platelet drug'
+    categories: 2
+    description:
+      0: 'No'
+      1: 'Yes'
+
+  TAH mono:
+    type: 'cat'
+    info: 'mono anti-platelet drug'
+    categories: 2
+    description:
+      0: 'No'
+      1: 'Yes'
+
+  TAH duo:
+    type: 'cat'
+    info: 'dual anti-platelet drug'
+    categories: 2
+    description:
+      0: 'No'
+      1: 'Yes'
+
+  OAK:
+    type: 'cat'
+    info: 'oral anticoagulant'
+    categories: 2
+    description:
+      0: 'No'
+      1: 'Yes'
+
+  VKA:
+    type: 'cat'
+    info: 'vitamin K antagonist'
+    categories: 2
+    description:
+      0: 'No'
+      1: 'Yes'
+
+  DOAC:
+    type: 'cat'
+    info: 'direct oral anticoagulant'
+    categories: 2
+    description:
+      0: 'No'
+      1: 'Yes'
+
+NCCT:
+  C_br:
+    type: 'cat'
+    info: 'Caudate region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  IC_br:
+    type: 'cat'
+    info: 'Internal capsule region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  INS_br:
+    type: 'cat'
+    info: 'Insular ribbon affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  L_br:
+    type: 'cat'
+    info: 'Lentiform nucleus region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M1_br:
+    type: 'cat'
+    info: 'Anterior MCA cortex region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M2_br:
+    type: 'cat'
+    info: 'MCA cortex lateral to the insular ribbon region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M3_br:
+    type: 'cat'
+    info: 'Posterior MCA Cortex region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M4_br:
+    type: 'cat'
+    info: 'Anterior cortex immediately rostral to M1 region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M5_br:
+    type: 'cat'
+    info: 'Lateral cortex immediately rostral to M3 region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M6_br:
+    type: 'cat'
+    info: 'Posterior cortex immediately rostral to M3 region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  e-ASPECTS:
+    type: 'ord'
+    info: 'electronic ASPECTS'
+    categories: 11
+    description:
+      0: 'ASPECTS score of 0'
+      1: 'ASPECTS score of 1'
+      2: 'ASPECTS score of 2'
+      3: 'ASPECTS score of 3'
+      4: 'ASPECTS score of 4'
+      5: 'ASPECTS score of 5'
+      6: 'ASPECTS score of 6'
+      7: 'ASPECTS score of 7'
+      8: 'ASPECTS score of 8'
+      9: 'ASPECTS score of 9'
+      10: 'ASPECTS score of 10'
+
+  ASPECTS oberfl_tief:
+    type: 'cat'
+    info: 'depth of regions affected'
+    categories: 4
+    description:
+      0: 'superficial'
+      1: 'deep'
+      2: 'both'
+      3: 'ASPECTS 10'
+
+  pc-ASPECTS:
+    type: 'int'
+    info: 'posterior ASPECTS'
+
+  Volume e-ASPECTS:
+    type: 'float'
+    info: 'automatic volume on e-ASPECTS'
+
+  ICV krank:
+    type: 'int'
+    info: 'Internal cerebral vein intensity, ipsilateral (HU)'
+
+  IVC gesund:
+    type: 'int'
+    info: 'Internal cerebral vein intensity, contralateral (HU)'
+
+  ICV Index:
+    type: 'float'
+    info: 'ICV krank/ICV gesund'
+
+  Vessel Occlusion Location Admission:
+    type: 'cat'
+    info: 'Location of vessel occlusion at admission'
+    categories: 11
+    description:
+      0: 'No occlusion'
+      1: 'ACI'
+      2: 'Carotis-T'
+      3: 'M1'
+      4: 'M2'
+      5: 'M3'
+      6: 'M4'
+      7: 'PCA'
+      8: 'ACA'
+      9: 'VA'
+      10: 'BA'
+
+  Vessel Occlusion Side Admission:
+    type: 'cat'
+    info: 'side of occlusion at admission'
+    categories: 3
+    description:
+      1: 'left'
+      2: 'right'
+      3: 'both'
+
+  Tan Score:
+    type: 'ord'
+    info: 'collaterals score'
+    categories: 4
+    description:
+      0: '0%'
+      1: '0-50%'
+      2: '50-100%'
+      3: '100%'
+
+  Coves Score:
+    type: 'ord'
+    info: 'cortical vein opacification score (a score of 0 (absence), 1 (moderate) or 2 (full) opacification assigned to three veins in the affected hemisphere)'
+    categories: 7
+    description:
+      0: 'Coves Score 0'
+      1: 'Coves Score 1'
+      2: 'Coves Score 2'
+      3: 'Coves Score 3'
+      4: 'Coves Score 4'
+      5: 'Coves Score 5'
+      6: 'Coves Score 6'
+
+  BATMAN:
+    type: 'int'
+    info: 'Basilar artery on CTA score'
+
+  Clot Burden Score:
+    type: 'ord'
+    info: 'Evaluates the extent of ipsilateral thrombus'
+    categories: 11
+    description:
+      0: 'Clot Burden Score 0'
+      1: 'Clot Burden Score 1'
+      2: 'Clot Burden Score 2'
+      3: 'Clot Burden Score 3'
+      4: 'Clot Burden Score 4'
+      5: 'Clot Burden Score 5'
+      6: 'Clot Burden Score 6'
+      7: 'Clot Burden Score 7'
+      8: 'Clot Burden Score 8'
+      9: 'Clot Burden Score 9'
+      10: 'Clot Burden Score 10'
+
+  Vessel Stenosis:
+    type: 'cat'
+    info: 'Presence of arterial stenosis'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Vessel Stenosis Location:
+    type: 'cat'
+    info: 'Location of arterial stenosis'
+    categories: 8
+    description:
+      0: 'No stenosis'
+      1: 'ACI'
+      2: 'Carotis-T'
+      3: 'MCA'
+      4: 'PCA'
+      5: 'VA'
+      6: 'BA'
+      7: 'ACA'
+
+  Arterial Dissection:
+    type: 'cat'
+    info: 'Presence of arterial dissection'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Arterial Dissection Location:
+    type: 'cat'
+    info: 'Location of arterial dissection'
+    categories: 4
+    description:
+      0: 'No dissection'
+      1: 'ACI'
+      2: 'VA'
+      3: 'other'
+
+CTP:
+  'CBF_lower30_volume':
+    type: 'int'
+    info: 'Volume in which the CBF is lower than 30% of the CBF of the contralateral side -- core --'
+  'Tmax_greater6s_volume':
+    type: 'int'
+    info: 'Volume in which the Tmax is greater than 6s -- penumbra + core --'
+  Mismatch Volume:
+    type: 'int'
+    info: 'penumbra volume'
+  Inverse Mismatch Ratio:
+    type: 'float'
+    info: 'volume core/total volume'
+  Hypoperfusion Index:
+    type: 'float'
+    info: 'Volume Tmax>10s/volume Tmax>6s'
+  CBV Index:
+    type: 'float'
+    info: 'Unknown'
+
+CTA:
+
+  Vessel Occlusion CTA:
+    type: 'cat'
+    info: 'Location of vessel occlusion on CTA'
+    categories: 11
+    description:
+      0: 'No occlusion'
+      1: 'ACI'
+      2: 'Carotis-T'
+      3: 'M1'
+      4: 'M2'
+      5: 'M3'
+      6: 'M4'
+      7: 'PCA'
+      8: 'ACA'
+      9: 'VA'
+      10: 'BA'
+
+Treatment:
+
+  Stenting:
+    type: 'cat'
+    info: 'Use of stenting'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Thrombectomy:
+    type: 'cat'
+    info: 'Use of thrombectomy'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Device:
+    type: 'cat'
+    info: 'Device employed for thrombectomy'
+    categories: 3
+    description:
+      1: 'Stent retriever'
+      2: 'Aspiration'
+      3: 'Both'
+
+  PTA:
+    type: 'cat'
+    info: 'Use of percutaneous transluminal angioplasty'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Number Maneuver:
+    type: 'int'
+    info: 'number of maneuvers that were necessary'
+
+  Lysis i.a.:
+    type: 'cat'
+    info: 'Use of intraarterial lysis'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Lysis quantity:
+    type: 'int'
+    info: 'quantity of lysis used'
+
+Treatment_out:
+
+  Frustrated Recanalization:
+    type: 'cat'
+    info: 'Whether the recanalization was unsuccessful (frustrated)'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Vessel Occlusion after Recan.:
+    type: 'cat'
+    info: 'Location of vessel occlusion after recanalization'
+    categories: 11
+    description:
+      0: 'No occlusion'
+      1: 'ACI'
+      2: 'Carotid-T'
+      3: 'M1'
+      4: 'M2'
+      5: 'M3'
+      6: 'M4'
+      7: 'PCA'
+      8: 'ACA'
+      9: 'VA'
+      10: 'BA'
+
+  TICI:
+    type: 'cat'
+    info: 'Reperfusion score'
+    categories: 5
+    description:
+      0: '0, No reperfusion'
+      1: '1, Minimal reperfusion'
+      2: '2a, Partial reperfusion (<50%)'
+      3: '2b, Partial reperfusion (>50%)'
+      4: '3, Total reperfusion'
+
+  SAE:
+    type: 'cat'
+    info: 'Subarachnoid hemorrhage encephalitis'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Vessel Occlusion new SupplyArea:
+    type: 'cat'
+    info: 'Vessel occlusion in a new Supply Area'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Vessel Occlusion new SupplyArea Location:
+    type: 'cat'
+    info: 'Location of vessel occlusion after recanalization in a new supply area'
+    categories: 11
+    description:
+      0: 'No occlusion'
+      1: 'ACI'
+      2: 'Carotid-T'
+      3: 'M1'
+      4: 'M2'
+      5: 'M3'
+      6: 'M4'
+      7: 'PCA'
+      8: 'ACA'
+      9: 'VA'
+      10: 'BA'
+
+  Vessel Occlusion new SupplyArea Treatment:
+    type: 'cat'
+    info: 'Whether new vessel occlusion was treated'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Infarct new SupplyArea:
+    type: 'cat'
+    info: 'Whether there is an infarct in the new supply area'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+Control CT:
+  Lacunar Infarct:
+    type: 'cat'
+    info: 'Presence of lacunar infarct'
+    categories: 3
+    description:
+      0: 'no infarct'
+      1: 'Lacunar infarct'
+      2: 'not a lacunar infarct'
+
+  Infarct Volume ControlCT:
+    type: 'int'
+    info: 'Volume in ml (cm³) of the infarct in the Control CT'
+
+  Hyperdense Media Sign:
+    type: 'cat'
+    info: 'Presence of hyperdense Media Sign'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  pc-Aspect ControlCT:
+    type: 'int'
+    info: 'posterior ASPECTS in Control CT'
+
+  Aspect ControlCT:
+    type: 'ord'
+    info: 'ASPECTS in Control CT'
+    categories: 11
+    description:
+      0: 'ASPECTS score of 0'
+      1: 'ASPECTS score of 1'
+      2: 'ASPECTS score of 2'
+      3: 'ASPECTS score of 3'
+      4: 'ASPECTS score of 4'
+      5: 'ASPECTS score of 5'
+      6: 'ASPECTS score of 6'
+      7: 'ASPECTS score of 7'
+      8: 'ASPECTS score of 8'
+      9: 'ASPECTS score of 9'
+      10: 'ASPECTS score of 10'
+
+  C_br2:
+    type: 'cat'
+    info: 'Caudate region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  IC_br2:
+    type: 'cat'
+    info: 'Internal capsule region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  INS_br2:
+    type: 'cat'
+    info: 'Insular ribbon affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  L_br2:
+    type: 'cat'
+    info: 'Lentiform nucleus region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M1_br2:
+    type: 'cat'
+    info: 'Anterior MCA cortex region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M2_br2:
+    type: 'cat'
+    info: 'MCA cortex lateral to the insular ribbon region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M3_br2:
+    type: 'cat'
+    info: 'Posterior MCA Cortex region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M4_br2:
+    type: 'cat'
+    info: 'Anterior cortex immediately rostral to M1 region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M5_br2:
+    type: 'cat'
+    info: 'Lateral cortex immediately rostral to M3 region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M6_br2:
+    type: 'cat'
+    info: 'Posterior cortex immediately rostral to M3 region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+Times:
+  Unknown Onset:
+    type: 'cat'
+    info: 'Whether onset time of symptoms is known'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Time_Onset_to_Admission:
+    type: 'int'
+    info: 'Time from symptom onset to admission in minutes'
+
+  Time_CT_to_Angio.:
+    type: 'int'
+    info: 'Time from CT imaging to CTA imaging in minutes'
+
+  Time_Puncture_to_Recan.:
+    type: 'int'
+    info: 'Time from CTA to Recanalization in minutes'
+
+  Time_Recan_to_Control:
+    type: 'int'
+    info: 'Time from Recanalization to Control CT imaging in minutes'
diff --git a/dictionaries/dictionary_timepoints.yml b/dictionaries/dictionary_timepoints.yml
new file mode 100644
index 0000000..f8b8e9f
--- /dev/null
+++ b/dictionaries/dictionary_timepoints.yml
@@ -0,0 +1,808 @@
+#### Data divided into
+# 1. Admission: Metadata, Clinical, Unknown onset, Time_Onset_Admission
+# 2. PostImaging: NCCT, CTP, CTA
+# 3. PostEVT: Treatment and Treatment output
+# 4. After24h: CCT
+# 5. Output
+
+Admission:
+
+  Sex:
+    type: 'cat'
+    info: 'Patient sex'
+    categories: 2
+    description:
+      0: 'man'
+      1: 'woman'
+
+  Age:
+    type: 'int'
+    info: 'Patient age'
+
+  NIHSS:
+    type: 'int'
+    info: 'National Institute of Health Stroke Scale'
+    categories: 43
+    description: None
+
+  s.p. stroke:
+    type: 'cat'
+    info: 'previous stroke'
+    categories: 2
+    description:
+      0: 'No'
+      1: 'Yes'
+
+  Unknown Onset:
+    type: 'cat'
+    info: 'Whether onset time of symptoms is known'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Time_Onset_to_Admission:
+    type: 'int'
+    info: 'Time from symptom onset to admission in minutes'
+
+  # pre-mRS:
+  #   type: 'int'
+  #   info: 'mRS previous to stroke'
+
+  pre-mRS:
+    type: 'ord'
+    info: 'mRS previous to stroke'
+    categories: 6
+    description:
+      0: 'mRS 0'
+      1: 'mRS 1'
+      2: 'mRS 2'
+      3: 'mRS 3'
+      4: 'mRS 4'
+      5: 'mRS 5'
+
+  aHT:
+    type: 'cat'
+    info: 'arterial hypertension'
+    categories: 2
+    description:
+      0: 'No'
+      1: 'Yes'
+
+  HLP:
+    type: 'cat'
+    info: 'hyperlipidemia'
+    categories: 2
+    description:
+      0: 'No'
+      1: 'Yes'
+
+  DM:
+    type: 'cat'
+    info: 'Diabetes Mellitus'
+    categories: 2
+    description:
+      0: 'No'
+      1: 'Yes'
+
+  aFib:
+    type: 'cat'
+    info: 'atrial fibrillation'
+    categories: 2
+    description:
+      0: 'No'
+      1: 'Yes'
+
+  TAH:
+    type: 'cat'
+    info: 'anti-platelet drug'
+    categories: 2
+    description:
+      0: 'No'
+      1: 'Yes'
+
+  TAH mono:
+    type: 'cat'
+    info: 'mono anti-platelet drug'
+    categories: 2
+    description:
+      0: 'No'
+      1: 'Yes'
+
+  TAH duo:
+    type: 'cat'
+    info: 'dual anti-platelet drug'
+    categories: 2
+    description:
+      0: 'No'
+      1: 'Yes'
+
+  OAK:
+    type: 'cat'
+    info: 'oral anticoagulant'
+    categories: 2
+    description:
+      0: 'No'
+      1: 'Yes'
+
+  VKA:
+    type: 'cat'
+    info: 'vitamin K antagonist'
+    categories: 2
+    description:
+      0: 'No'
+      1: 'Yes'
+
+  DOAC:
+    type: 'cat'
+    info: 'direct oral anticoagulant'
+    categories: 2
+    description:
+      0: 'No'
+      1: 'Yes'
+
+Pre-EVT:
+  C_br:
+    type: 'cat'
+    info: 'Caudate region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  IC_br:
+    type: 'cat'
+    info: 'Internal capsule region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  INS_br:
+    type: 'cat'
+    info: 'Insular ribbon affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  L_br:
+    type: 'cat'
+    info: 'Lentiform nucleus region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M1_br:
+    type: 'cat'
+    info: 'Anterior MCA cortex region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M2_br:
+    type: 'cat'
+    info: 'MCA cortex lateral to the insular ribbon region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M3_br:
+    type: 'cat'
+    info: 'Posterior MCA Cortex region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M4_br:
+    type: 'cat'
+    info: 'Anterior cortex immediately rostral to M1 region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M5_br:
+    type: 'cat'
+    info: 'Lateral cortex immediately rostral to M3 region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M6_br:
+    type: 'cat'
+    info: 'Posterior cortex immediately rostral to M3 region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  # e-ASPECTS:
+  #   type: 'int'
+  #   info: 'electronic ASPECTS'
+  e-ASPECTS:
+    type: 'ord'
+    info: 'electronic ASPECTS'
+    categories: 11
+    description:
+      0: 'ASPECTS score of 0'
+      1: 'ASPECTS score of 1'
+      2: 'ASPECTS score of 2'
+      3: 'ASPECTS score of 3'
+      4: 'ASPECTS score of 4'
+      5: 'ASPECTS score of 5'
+      6: 'ASPECTS score of 6'
+      7: 'ASPECTS score of 7'
+      8: 'ASPECTS score of 8'
+      9: 'ASPECTS score of 9'
+      10: 'ASPECTS score of 10'
+
+  ASPECTS oberfl_tief:
+    type: 'cat'
+    info: 'depth of regions affected'
+    categories: 4
+    description:
+      0: 'superficial'
+      1: 'deep'
+      2: 'both'
+      3: 'ASPECTS 10'
+
+  pc-ASPECTS:
+    type: 'cat'
+    info: 'posterior ASPECTS'
+    categories: 11
+    description:
+      0: 'ASPECTS score of 0'
+      1: 'ASPECTS score of 1'
+      2: 'ASPECTS score of 2'
+      3: 'ASPECTS score of 3'
+      4: 'ASPECTS score of 4'
+      5: 'ASPECTS score of 5'
+      6: 'ASPECTS score of 6'
+      7: 'ASPECTS score of 7'
+      8: 'ASPECTS score of 8'
+      9: 'ASPECTS score of 9'
+      10: 'ASPECTS score of 10'
+
+  Volume e-ASPECTS:
+    type: 'float'
+    info: 'automatic volume on e-ASPECTS'
+
+  ICV krank:
+    type: 'int'
+    info: 'Internal cerebral vein intensity, ipsilateral (HU)'
+
+  IVC gesund:
+    type: 'int'
+    info: 'Internal cerebral vein intensity, contralateral (HU)'
+
+  ICV Index:
+    type: 'float'
+    info: 'ICV krank/ICV gesund'
+
+  Vessel Occlusion Location Admission:
+    type: 'cat'
+    info: 'Location of vessel occlusion at admission'
+    categories: 11
+    description:
+      0: 'No occlusion'
+      1: 'ACI'
+      2: 'Carotis-T'
+      3: 'M1'
+      4: 'M2'
+      5: 'M3'
+      6: 'M4'
+      7: 'PCA'
+      8: 'ACA'
+      9: 'VA'
+      10: 'BA'
+
+  Vessel Occlusion Side Admission:
+    type: 'cat'
+    info: 'side of occlusion at admission'
+    categories: 3
+    description:
+      1: 'left'
+      2: 'right'
+      3: 'both'
+
+  # Tan Score:
+  #   type: 'int'
+  #   info: 'collaterals score'
+
+  Tan Score:
+    type: 'ord'
+    info: 'collaterals score'
+    categories: 4
+    description:
+      0: '0%'
+      1: '0-50%'
+      2: '50-100%'
+      3: '100%'
+
+  # Coves Score:
+  #   type: 'int'
+  #   info: 'cortical vein opacification score'
+
+  Coves Score:
+    type: 'ord'
+    info: 'cortical vein opacification score (a score of 0 (absence), 1 (moderate) or 2 (full) opacification assigned to three veins in the affected hemisphere)'
+    categories: 7
+    description:
+      0: 'Coves Score 0'
+      1: 'Coves Score 1'
+      2: 'Coves Score 2'
+      3: 'Coves Score 3'
+      4: 'Coves Score 4'
+      5: 'Coves Score 5'
+      6: 'Coves Score 6'
+
+  BATMAN:
+    type: 'cat'
+    info: 'Basilar artery on CTA score'
+    categories: 11
+    description:
+      0: 'BATMAN Score 0'
+      1: 'BATMAN Score 1'
+      2: 'BATMAN Score 2'
+      3: 'BATMAN Score 3'
+      4: 'BATMAN Score 4'
+      5: 'BATMAN Score 5'
+      6: 'BATMAN Score 6'
+      7: 'BATMAN Score 7'
+      8: 'BATMAN Score 8'
+      9: 'BATMAN Score 9'
+      10: 'BATMAN Score 10'
+
+  # Clot Burden Score:
+  #   type: 'int'
+  #   info: 'Evaluates the extent of ipsilateral thrombus'
+
+  Clot Burden Score:
+    type: 'ord'
+    info: 'Evaluates the extent of ipsilateral thrombus'
+    categories: 11
+    description:
+      0: 'Clot Burden Score 0'
+      1: 'Clot Burden Score 1'
+      2: 'Clot Burden Score 2'
+      3: 'Clot Burden Score 3'
+      4: 'Clot Burden Score 4'
+      5: 'Clot Burden Score 5'
+      6: 'Clot Burden Score 6'
+      7: 'Clot Burden Score 7'
+      8: 'Clot Burden Score 8'
+      9: 'Clot Burden Score 9'
+      10: 'Clot Burden Score 10'
+
+  Vessel Stenosis:
+    type: 'cat'
+    info: 'Presence of arterial stenosis'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Vessel Stenosis Location:
+    type: 'cat'
+    info: 'Location of arterial stenosis'
+    categories: 8
+    description:
+      0: 'No stenosis'
+      1: 'ACI'
+      2: 'Carotis-T'
+      3: 'MCA'
+      4: 'PCA'
+      5: 'VA'
+      6: 'BA'
+      7: 'ACA'
+
+  Arterial Dissection:
+    type: 'cat'
+    info: 'Presence of arterial dissection'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Arterial Dissection Location:
+    type: 'cat'
+    info: 'Location of arterial dissection'
+    categories: 4
+    description:
+      0: 'No dissection'
+      1: 'ACI'
+      2: 'VA'
+      3: 'other'
+
+  'CBF_lower30_volume':
+    type: 'int'
+    info: 'Volume in which the CBF is lower than 30% of the CBF of the contralateral side -- core --'
+
+  'Tmax_greater6s_volume':
+    type: 'int'
+    info: 'Volume in which the Tmax is greater than 6s -- penumbra + core --'
+
+  Mismatch Volume:
+    type: 'int'
+    info: 'penumbra volume'
+
+  Inverse Mismatch Ratio:
+    type: 'float'
+    info: 'volume core/total volume'
+
+  Hypoperfusion Index:
+    type: 'float'
+    info: 'Volume Tmax>10s/volume Tmax>6s'
+
+  CBV Index:
+    type: 'float'
+    info: 'Unknown'
+
+  Vessel Occlusion CTA:
+    type: 'cat'
+    info: 'Location of vessel occlusion on CTA'
+    categories: 11
+    description:
+      0: 'No occlusion'
+      1: 'ACI'
+      2: 'Carotis-T'
+      3: 'M1'
+      4: 'M2'
+      5: 'M3'
+      6: 'M4'
+      7: 'PCA'
+      8: 'ACA'
+      9: 'VA'
+      10: 'BA'
+
+  Time_CT_to_Angio.:
+    type: 'int'
+    info: 'Time from CT imaging to CTA imaging in minutes'
+
+Post-EVT:
+
+  Stenting:
+    type: 'cat'
+    info: 'Use of stenting'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Thrombectomy:
+    type: 'cat'
+    info: 'Use of thrombectomy'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Device:
+    type: 'cat'
+    info: 'Device employed for thrombectomy'
+    categories: 4
+    description:
+      0: 'No device - no thrombectomy'
+      1: 'Stent retriever'
+      2: 'Aspiration'
+      3: 'Both'
+
+  PTA:
+    type: 'cat'
+    info: 'Use of percutaneous transluminal angioplasty'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Number Maneuver:
+    type: 'int'
+    info: 'number of maneuvers that were necessary'
+
+  Lysis i.a.:
+    type: 'cat'
+    info: 'Use of intraarterial lysis'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Lysis quantity:
+    type: 'int'
+    info: 'quantity of lysis used'
+
+  Frustrated Recanalization:
+    type: 'cat'
+    info: 'Whether the recanalization was unsuccessful (frustrated)'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Vessel Occlusion after Recan.:
+    type: 'cat'
+    info: 'Location of vessel occlusion after recanalization'
+    categories: 11
+    description:
+      0: 'No occlusion'
+      1: 'ACI'
+      2: 'Carotid-T'
+      3: 'M1'
+      4: 'M2'
+      5: 'M3'
+      6: 'M4'
+      7: 'PCA'
+      8: 'ACA'
+      9: 'VA'
+      10: 'BA'
+
+  # TICI:
+  #   type: 'int'
+  #   info: 'Reperfusion score'
+  TICI:
+    type: 'cat'
+    info: 'Reperfusion score'
+    categories: 5
+    description:
+      0: '0, No reperfusion'
+      1: '1, Minimal reperfusion'
+      2: '2a, Partial reperfusion (<50%)'
+      3: '2b, Partial reperfusion (>50%)'
+      4: '3, Total reperfusion'
+
+  SAE:
+    type: 'cat'
+    info: 'Subarachnoid hemorrhage encephalitis'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Vessel Occlusion new SupplyArea:
+    type: 'cat'
+    info: 'Vessel occlusion in a new Supply Area'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Vessel Occlusion new SupplyArea Location:
+    type: 'cat'
+    info: 'Location of vessel occlusion after recanalization in a new supply area'
+    categories: 11
+    description:
+      0: 'No occlusion'
+      1: 'ACI'
+      2: 'Carotid-T'
+      3: 'M1'
+      4: 'M2'
+      5: 'M3'
+      6: 'M4'
+      7: 'PCA'
+      8: 'ACA'
+      9: 'VA'
+      10: 'BA'
+
+  Vessel Occlusion new SupplyArea Treatment:
+    type: 'cat'
+    info: 'Whether new vessel occlusion was treated'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Infarct new SupplyArea:
+    type: 'cat'
+    info: 'Whether there is an infarct in the new supply area'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Time_Puncture_to_Recan.:
+    type: 'int'
+    info: 'Time from CTA to Recanalization in minutes'
+
+After24h:
+  Lacunar Infarct:
+    type: 'cat'
+    info: 'Presence of lacunar infarct'
+    categories: 3
+    description:
+      0: 'no infarct'
+      1: 'Lacunar infarct'
+      2: 'not a lacunar infarct'
+
+  Infarct Volume ControlCT:
+    type: 'int'
+    info: 'Volume in ml (cm³) of the infarct in the Control CT'
+
+  Hyperdense Media Sign:
+    type: 'cat'
+    info: 'Presence of hyperdense Media Sign'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  pc-Aspect ControlCT:
+    type: 'cat'
+    info: 'posterior ASPECTS in Control CT'
+    categories: 11
+    description:
+      0: 'ASPECTS score of 0'
+      1: 'ASPECTS score of 1'
+      2: 'ASPECTS score of 2'
+      3: 'ASPECTS score of 3'
+      4: 'ASPECTS score of 4'
+      5: 'ASPECTS score of 5'
+      6: 'ASPECTS score of 6'
+      7: 'ASPECTS score of 7'
+      8: 'ASPECTS score of 8'
+      9: 'ASPECTS score of 9'
+      10: 'ASPECTS score of 10'
+
+  # Aspect ControlCT:
+  #   type: 'int'
+  #   info: 'ASPECTS in Control CT'
+
+  Aspect ControlCT:
+    type: 'ord'
+    info: 'ASPECTS in Control CT'
+    categories: 11
+    description:
+      0: 'ASPECTS score of 0'
+      1: 'ASPECTS score of 1'
+      2: 'ASPECTS score of 2'
+      3: 'ASPECTS score of 3'
+      4: 'ASPECTS score of 4'
+      5: 'ASPECTS score of 5'
+      6: 'ASPECTS score of 6'
+      7: 'ASPECTS score of 7'
+      8: 'ASPECTS score of 8'
+      9: 'ASPECTS score of 9'
+      10: 'ASPECTS score of 10'
+
+  C_br2:
+    type: 'cat'
+    info: 'Caudate region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  IC_br2:
+    type: 'cat'
+    info: 'Internal capsule region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  INS_br2:
+    type: 'cat'
+    info: 'Insular ribbon affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  L_br2:
+    type: 'cat'
+    info: 'Lentiform nucleus region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M1_br2:
+    type: 'cat'
+    info: 'Anterior MCA cortex region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M2_br2:
+    type: 'cat'
+    info: 'MCA cortex lateral to the insular ribbon region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M3_br2:
+    type: 'cat'
+    info: 'Posterior MCA Cortex region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M4_br2:
+    type: 'cat'
+    info: 'Anterior cortex immediately rostral to M1 region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M5_br2:
+    type: 'cat'
+    info: 'Lateral cortex immediately rostral to M3 region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  M6_br2:
+    type: 'cat'
+    info: 'Posterior cortex immediately rostral to M3 region affected'
+    categories: 2
+    description:
+      0: 'no'
+      1: 'yes'
+
+  Time_Recan_to_Control:
+    type: 'int'
+    info: 'Time from Recanalization to Control CT imaging in minutes'
+
+Output:
+
+  mRS90d:
+    type: 'ord'
+    info: 'Functional outcome at 90 days'
+    categories: 7
+    description:
+      0: 'mRS 0'
+      1: 'mRS 1'
+      2: 'mRS 2'
+      3: 'mRS 3'
+      4: 'mRS 4'
+      5: 'mRS 5'
+      6: 'mRS 6'
+
+  dmRS:
+    type: 'cat'
+    info: 'Binary functional outcome at 90 days'
+    categories: 2
+    description:
+      0: 'Good outcome'
+      1: 'Bad outcome'
+
+  mortality:
+    type: 'cat'
+    info: 'Mortality at 90 days'
+    categories: 2
+    description:
+      0: 'No'
+      1: 'Yes'
+
+  shift_mRS:
+    type: 'ord'
+    info: 'Shift in mRS'
+    categories: 7
+    description:
+      0: 'Shift of 0'
+      1: 'Shift of 1'
+      2: 'Shift of 2'
+      3: 'Shift of 3'
+      4: 'Shift of 4'
+      5: 'Shift of 5'
+      6: 'Shift of 6'
diff --git a/evaluate_model.py b/evaluate_model.py
new file mode 100644
index 0000000..6ae1de8
--- /dev/null
+++ b/evaluate_model.py
@@ -0,0 +1,433 @@
+from architectures.FCN import Basic_FCN
+from architectures.GCN import GCN
+from Metrics.RegressionMetrics import ClassificationMetrics
+from IO_utils.FeaturePreprocessing import FeaturePreprocessing
+from IO_utils.clean_table import clean_table
+from IO_utils.split_utils import split_data_cv
+from IO_utils.Dataloader import MyDataLoader
+from IO_utils.List_Reader import TableReader
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+import pandas as pd
+import torch
+import pprint
+
+
+def get_metrics_unc(uncertainty_array, mean_preds_array, label_array, p):
+    metrics = ClassificationMetrics(classes=2)
+
+    # keep only the samples whose uncertainty lies below the threshold p
+    indices = uncertainty_array < p
+    m = metrics.compute_metrics(mean_preds_array[indices], label_array[indices])
+    print('####')
+    print('Metrics for samples with uncertainty < {}'.format(p))
+    print(np.count_nonzero(indices))
+    pprint.pprint(m)
+    print('####')
+
+    return m
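get_metrics_unc implements a simple selective-prediction check: discard every sample whose uncertainty is at or above p and re-score the rest. A dummy call in the two-class one-hot convention used throughout this file:

    import numpy as np

    preds = np.array([[0.9, 0.1], [0.4, 0.6], [0.55, 0.45]])  # mean softmax outputs
    labels = np.array([[1, 0], [0, 1], [1, 0]])               # one-hot ground truth
    unc = np.array([0.2, 0.3, 0.9])                           # predictive entropies
    m = get_metrics_unc(unc, preds, labels, p=0.8)            # drops the most uncertain sample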
ax3.plot(ths, samples, 'k--', markersize=3, label="# Samples") + ax3.invert_xaxis() + + # merge the line handles from both y-axes into a single legend + lns = lns1 + lns2 + labs = [l.get_label() for l in lns] + ax2.legend(lns, labs, loc=0) + + #plt.xlabel('{} uncertainty'.format(uncertainty)) + + +def plot_selectedsamples_metrics(uncertainty_array, mean_preds_array, label_array, uncertainty=""): + order = np.argsort(uncertainty_array) + + samples = [] + auc = [] + acc = [] + b_acc = [] + th_entr = [] + + n_samples = uncertainty_array.shape[0] + + # keep at least 20% of the samples + metrics = ClassificationMetrics(classes=2) + + for i in range(order.shape[0] - int(0.2 * n_samples)): + number_samples = order.shape[0] - i + metrics.clear() + kn = order[:(number_samples - 1)] + m = metrics.compute_metrics(mean_preds_array[kn], label_array[kn]) + + # print('###### ') + # print('Number of samples {}'.format(number_samples)) + # print(m['cm']) + # print(m['auc']) + # print('###### ') + + samples.extend([number_samples]) + auc.extend([m['auc']]) + acc.extend([m['accuracy']]) + b_acc.extend([m['balanced_accuracy']]) + th_entr.extend([uncertainty_array[kn[-1]]]) + + plt.figure() + + plt.plot(th_entr, acc, 'bo-', markersize=3, label="Accuracy") + plt.plot(th_entr, b_acc, 'gx-', markersize=3, label='Balanced accuracy') + plt.plot(th_entr, auc, 'rx-', markersize=3, label='AUC') + plt.xlabel('{} uncertainty'.format(uncertainty)) + + plt.figure() + plt.plot(samples, acc, 'bo-', markersize=3, label='Accuracy') + plt.plot(samples, b_acc, 'gx-', markersize=3, label='Balanced accuracy') + plt.plot(samples, auc, 'rx-', markersize=3, label='AUC') + plt.text(26, plt.gca().get_ylim()[0] + 0.05, + 'AUC acc {} \n AUC bacc {} \n AUC auc {} '.format(np.round(np.trapz(acc) / len(acc), 2), + np.round(np.trapz(b_acc) / len(b_acc), 2), + np.round(np.trapz(auc) / len(auc), 2))) + plt.xlabel("Number of samples") + plt.ylabel("Performance") + plt.ylim(0.7, 1) + plt.legend() + plt.title("Metrics") + # plt.show() + + +def plot_uncetainties(p, y, c, e): + true_pred = [] + for i in range(p.shape[0]): + pred = np.argmax(p[i, :]) + label = np.argmax(y[i, :]) + if pred == label: + if label == 0: + true_pred.extend(['lightgreen']) + else: + true_pred.extend(['darkgreen']) + else: + if label == 0: + true_pred.extend(['salmon']) + else: + true_pred.extend(['darkred']) + + plt.figure() + for g in ['lightgreen', 'darkgreen', 'salmon', 'darkred']: + i = [i for i in range(len(true_pred)) if true_pred[i] == g] + plt.scatter(c[i], e[i], c=g, s=10) + # sc =plt.scatter(c, e, c=true_pred,) + plt.legend(['TP', 'TN', 'FN', 'FP']) + plt.xlabel('Predictive uncertainty') + plt.ylabel('Epistemic uncertainty') + +def plot_age_uncertainty(p, y, c, age): + true_pred = [] + for i in range(p.shape[0]): + pred = np.argmax(p[i, :]) + label = np.argmax(y[i, :]) + if pred == label: + if label == 0: + true_pred.extend(['lightgreen']) + else: + true_pred.extend(['darkgreen']) + else: + if label == 0: + true_pred.extend(['salmon']) + else: + true_pred.extend(['darkred']) + + plt.figure() + for g in ['lightgreen', 'darkgreen', 'salmon', 'darkred']: + i = [i for i in range(len(true_pred)) if true_pred[i] == g] + plt.scatter(age[i], c[i], c=g, s=10) + # sc =plt.scatter(c, e, c=true_pred,) + plt.legend(['TP', 'TN', 'FN', 'FP']) + plt.xlabel('Age') + plt.ylabel('Uncertainty') + + +def test_graph(config, loader, test_indices, state_dicts): + models = [] + + for m in state_dicts: + model = GCN(nfeat=loader.dataset[0].num_features, nclass=2, dropout=config['dropout']) + 
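# MC-dropout ensemble: each saved fold checkpoint is loaded into a fresh GCN; dropout is deliberately left active at inference (model.train() below), so the repeated stochastic forward passes act as approximate posterior samples + 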
model.load_state_dict(torch.load(m)) + models.append(model) + + n_models = 10 + n_dropout = 100 + + for batch in loader: + + samples = len(test_indices) + predictions_y = np.zeros([samples, 2, n_models, n_dropout]) + + x = batch['x'] + edge_index = batch['edge_index'] + edge_weight = batch['weights'] + y = batch['y'][test_indices] + + for m in range(n_models): + for k in range(n_dropout): + model = models[m] + model.train() + test_out = models[m].forward(x, edge_index, edge_weight) + f = torch.softmax(test_out, dim=1) + f_cloned = f.clone() + f_cloned[f_cloned == 0] = 1e-30 + + predictions_y[:, :, m, k] = f_cloned[test_indices].detach().cpu().numpy() + + mean_preds = np.mean(np.mean(predictions_y, axis=2), axis=2) + predictive_uncertainty = np.sum(-np.multiply(np.log(mean_preds), mean_preds), axis=1) + + expect_data_uncertainty = np.mean( + np.mean(np.sum(-np.multiply(np.log(predictions_y), predictions_y), axis=1), axis=1), axis=1) + epistemic_uncertainty = predictive_uncertainty - expect_data_uncertainty + + """for sample in range(samples): + print('#####') + print('Sample {}'.format(sample)) + print(' True value: {}, Prediction mean: {}'.format(y[sample], mean_preds[sample])) + print(' Predictive {}, Epistemic {}'.format(predictive_uncertainty[sample], epistemic_uncertainty[sample])) + print('#####')""" + + print('Mean predictive uncertainty', np.mean(predictive_uncertainty)) + print('Mean epistemic uncertainty', np.mean(epistemic_uncertainty)) + + return mean_preds, predictive_uncertainty, epistemic_uncertainty, y.detach().cpu().numpy() + + +def test(config, test_loader, state_dict): + models = [] + + for m in state_dict: + model = Basic_FCN(in_features=test_loader.dataset.features.shape[1], layers=config['layers'], + out_features=2, + dropout_rate=config['dropout']) + + model.load_state_dict(torch.load(m)) + models.append(model) + + n_models = 5 + n_dropout = 1 + + for batch in test_loader: + + samples = batch['x'].shape[0] + predictions_y = np.zeros([samples, 2, n_models, n_dropout]) + + x = batch['x'] + y = batch['y'] + + for m in range(n_models): + for k in range(n_dropout): + model = models[m] + model.train() + test_out = models[m].forward(x) + f = torch.softmax(test_out, dim=1) + f = torch.clamp(f, min=1e-30) # guard against log(0) below, mirroring the clamping in test_graph + + predictions_y[:, :, m, k] = f.detach().cpu().numpy() + + mean_preds = np.mean(np.mean(predictions_y, axis=2), axis=2) + predictive_uncertainty = np.sum(-np.multiply(np.log(mean_preds), mean_preds), axis=1) + + expect_data_uncertainty = np.mean( + np.mean(np.sum(-np.multiply(np.log(predictions_y), predictions_y), axis=1), axis=1), axis=1) + epistemic_uncertainty = predictive_uncertainty - expect_data_uncertainty + + """for sample in range(samples): + print('#####') + print('Sample {}'.format(sample)) + print(' True value: {}, Prediction mean: {}'.format(y[sample], mean_preds[sample])) + print(' Predictive {}, Epistemic {}'.format(predictive_uncertainty[sample], epistemic_uncertainty[sample])) + print('#####')""" + + print('Mean predictive uncertainty', np.mean(predictive_uncertainty)) + print('Mean epistemic uncertainty', np.mean(epistemic_uncertainty)) + + return mean_preds, predictive_uncertainty, epistemic_uncertainty, y.detach().cpu().numpy() + + +if __name__ == "__main__": + + # %% DATALOADING + + ## Clean the original table + excel_dir = "../data/TheList_anonymous_mv.xlsx" + clean_df = clean_table(excel_dir=excel_dir, pre_mRS=2) + + # Given a clean table, extract feature and label vectors + table = TableReader(input_df=clean_df, tables=['all_timepoints'], data_dictionaries='timepoints', + 
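# mv_strategy='median' imputes missing values with per-feature medians (see IO_utils/mv_strategies.py) + 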
mv_strategy='median', + output_feature=['dmRS']) + + output_vector = table.output_vector + + fold_indices = split_data_cv(output_vector, seed=5, cv=5) + + results_pred = {} + results_epis = {} + for p in [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]: + results_pred[p] = {} + results_epis[p] = {} + + # for f in [5, 10, 15, 20, 25, 30, 40, 50]: + for f in [15]: + features = table.select_features(method='mrmr', k=f, fold_indices=fold_indices) + + feature_vector = table.final_df[features] + FP = FeaturePreprocessing(feature_vector, table.selected_d) + feature_vector = FP.create_features(feature_vector) + config = { + 'layers': {'number': 3, + 'layer1': 40, + 'layer2': 20, + + }, + + 'dropout': 0, + 'out_classes': 2} + + ######## + mean_preds = [] + combined = [] + epistemic = [] + cls = [] + + for k in range(5): + dataloader_fold = MyDataLoader(feature_vector, output_vector, fold_indices[k], table.selected_d, + one_hot=True) + dl = dataloader_fold.get_loaders() + state_dict_paths = ["C:/Users/martinca1/PhD/Projects/AI_Stroke/out/models/features_{}/" + "model_{}_fold_{}.pt".format(f, i, k) for i in range(5)] + pred, unc, epistemic_unc, y = test(config, dl[2], state_dict_paths) + + # _, _, _, _ = test(config, dl[0], models) + + mean_preds.extend(pred.tolist()) + combined.extend(unc.tolist()) + epistemic.extend(epistemic_unc.tolist()) + cls.extend(y.tolist()) + + p = np.array(mean_preds) + y = np.array(cls) + c = np.array(combined) + e = np.array(epistemic) + + # with pd.ExcelWriter("C:/Users/martinca1/PhD/Projects/AI_Stroke/out/uncertainty/predictive_uncertainty.xlsx") as writer: + for per in [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]: + results_pred[per][f] = get_metrics_unc(c, p, y, per) + # pd_per = pd.DataFrame(results_pred[per]).T + # pd_per.to_excel(writer, sheet_name=str(per)) + + # with pd.ExcelWriter("C:/Users/martinca1/PhD/Projects/AI_Stroke/out/uncertainty/epistimic_uncertainty.xlsx") as writer: + # for per in [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2,0.1]: + # results_epis[per][f] = get_metrics_unc(e, p, y, per) + # pd_per = pd.DataFrame(results_epis[per]).T + # pd_per.to_excel(writer, sheet_name=str(per)) + + true_pred = [] + for i in range(p.shape[0]): + pred = np.argmax(p[i, :]) + label = np.argmax(y[i, :]) + if pred == label: + if label == 0: + true_pred.extend(['lightgreen']) + else: + true_pred.extend(['darkgreen']) + else: + if label == 0: + true_pred.extend(['salmon']) + else: + true_pred.extend(['darkred']) + + metrics = ClassificationMetrics(classes=2) + m = metrics.compute_metrics(p, y) + print('Combined metrics') + print(m) + + plot_selectedsamples_metrics(c, p, y, uncertainty='Predictive') + plot_selectedsamples_metrics(e, p, y, uncertainty='Epistemic') + + plt.figure() + for g in ['lightgreen', 'darkgreen', 'salmon', 'darkred']: + i = [i for i in range(len(true_pred)) if true_pred[i] == g] + plt.scatter(c[i], e[i], c=g, label='test') + # sc =plt.scatter(c, e, c=true_pred,) + plt.legend(['TP', 'TN', 'FN', 'FP']) + plt.xlabel('Predictive uncertainty') + plt.ylabel('Epistemic uncertainty') + plt.show() diff --git a/test.py b/test.py deleted file mode 100644 index e69de29..0000000 diff --git a/train.py b/train.py new file mode 100644 index 0000000..056072a --- /dev/null +++ b/train.py @@ -0,0 +1,204 @@ +from architectures.FCN import Basic_FCN +from Metrics.RegressionMetrics import ClassificationMetrics +from Loss.Loss_uncertainty import loss_uncertainty + +import copy +from hyperopt import STATUS_OK +from ignite.engine import Engine, Events +import neptune +import 
pprint +import time +import tqdm +import torch + + +def train_model(config, loaders, save_metrics=False): + # torch.manual_seed(0) + + train_loader, val_loader, test_loader = loaders + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model = Basic_FCN(in_features=train_loader.dataset.features.shape[1], layers=config['layers'], + out_features=2, + dropout_rate=config['dropout']) + + n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print('Model parameters', n_params) + model.to(device) + + # METRICS + train_metrics = ClassificationMetrics(classes=2) + val_metrics = ClassificationMetrics(classes=2) + test_metrics = ClassificationMetrics(classes=2) + + # OPTIMIZER + optimizer = torch.optim.Adam(model.parameters(), + lr=config['lr'], + #momentum=config['momentum'], + weight_decay=config['weight_decay']) + + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer) + + # LOSS + import numpy as np + values, counts = np.unique(train_loader.dataset.labels, return_counts=True) + # inverse-frequency class weights to counter class imbalance + weights = torch.FloatTensor(1 - counts / np.sum(counts)).to(device) + #weights = torch.FloatTensor((0.5, 0.5)).to(device) + loss_train = loss_uncertainty(weights=weights) + loss_val = loss_uncertainty(weights=weights) + loss_test = loss_uncertainty(weights=weights) + + def train(Engine, batch): + + model.train() + optimizer.zero_grad() + data = batch['x'].clone().to(device) + y_train = batch['y'].clone().to(device) + + out = model.forward(data) + + Loss = loss_train.get_loss(out, y_train.unsqueeze(1)) + m_train = train_metrics.compute_metrics(out.detach().cpu().numpy(), batch['y'].detach().cpu().numpy()) + + Loss.backward() + optimizer.step() + + Engine.state.loss = loss_train.get_total_loss() + Engine.state.metrics = train_metrics + Engine.state.m = m_train + Engine.state.out = out + Engine.state.y = y_train.detach().cpu().numpy() + + def validate(Engine, batch): + model.eval() + with torch.no_grad(): + data = batch['x'].clone().to(device) + y_val = batch['y'].clone().to(device) + + out = model.forward(data) + + lv = loss_val.get_loss(out, y_val.unsqueeze(1)) + scheduler.step(lv) + #print("Epoch {}, lr {}".format(trainer_engine.state.epoch, optimizer.param_groups[0]['lr'])) + m_val = val_metrics.compute_metrics(out.detach().cpu().numpy(), batch['y'].detach().cpu().numpy()) + + Engine.state.loss = loss_val.get_total_loss() + Engine.state.metrics = val_metrics + Engine.state.m = m_val + + def test(Engine, batch): + model.eval() + with torch.no_grad(): + data = batch['x'].clone().to(device) + y_test = batch['y'].clone().to(device) + + out = model.forward(data) + f = torch.softmax(out, dim=1) + _ = loss_test.get_loss(f, y_test.unsqueeze(1)) + m_test = test_metrics.compute_metrics(f.detach().cpu().numpy(), batch['y'].detach().cpu().numpy()) + + Engine.state.loss = loss_test.get_total_loss() + Engine.state.metrics = test_metrics + Engine.state.m = m_test + + trainer_engine = Engine(train) + validator_engine = Engine(validate) + test_engine = Engine(test) + + @trainer_engine.on(Events.STARTED) + def training_bootup(engine): + + # print("Training started") + # print("Train_loader,: iterations/epoch: ", len(train_loader), ", total number of samples", + # len(train_loader.sampler)) + # print("Validation_loader,: iterations/epoch: ", len(val_loader), ", total number of samples", + # len(val_loader.sampler)) + # print("Test_loader,: iterations/epoch: ", len(test_loader), ", total number of samples", + # len(test_loader.sampler)) + + time.sleep(0.001) + engine.pbar = tqdm.tqdm(total=300, 
desc='Training progress') + validator_engine.state.loss = 0 + engine.state.best_epoch = 0 + engine.state.min_loss = 1000 + engine.state.best_train_metrics = {} + engine.state.best_val_metrics = {} + engine.state.best_test_metrics = {} + + engine.count = 0 + + @trainer_engine.on(Events.EPOCH_COMPLETED) + def run_validation(engine): + validator_engine.run(val_loader, max_epochs=1) + engine.pbar.update(1) + engine.pbar.set_postfix({'loss': validator_engine.state.loss, + 'loss_train': trainer_engine.state.loss, + 'accuracy_train': trainer_engine.state.m['accuracy']}, refresh=True) + + test_engine.run(test_loader, max_epochs=1) + + if validator_engine.state.loss < engine.state.min_loss: + engine.count = 0 + engine.state.min_loss = validator_engine.state.loss + # + else: + engine.count += 1 + # print('Strike ', engine.count) + if engine.count > 40: + trainer_engine.terminate() + if True: + engine.state.best_epoch = engine.state.epoch + engine.state.best_train_metrics = trainer_engine.state.m + engine.state.best_val_metrics = validator_engine.state.m + engine.state.best_test_metrics = test_engine.state.m + engine.state.best_model = model.state_dict() + + train_metrics.clear() + val_metrics.clear() + test_metrics.clear() + + loss_train.clear() + loss_val.clear() + loss_test.clear() + + @trainer_engine.on(Events.EPOCH_COMPLETED) + def write_neptune(): + + if save_metrics: + neptune.log_metric('loss_train', trainer_engine.state.loss.detach().cpu().numpy()) + neptune.log_metric('loss_val', validator_engine.state.loss.detach().cpu().numpy()) + neptune.log_metric('loss_test', test_engine.state.loss.detach().cpu().numpy()) + neptune.log_metric('train_AUC', trainer_engine.state.m['auc']) + neptune.log_metric('train_accuracy', trainer_engine.state.m['accuracy']) + neptune.log_metric('val_AUC', validator_engine.state.m['auc']) + neptune.log_metric('val_accuracy', validator_engine.state.m['accuracy']) + neptune.log_metric('test_AUC', test_engine.state.m['auc']) + neptune.log_metric('test_accuracy', test_engine.state.m['accuracy']) + + @trainer_engine.on(Events.COMPLETED) + def run_testing(engine): + + # print(time.ctime(), "Running testing") + # test_engine.run(test_loader, max_epochs=1) + + engine.pbar.close() + # time.sleep(0.001) + + trainer_engine.run(train_loader, max_epochs=300) + print( + "AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f}".format(trainer_engine.state.m['auc'], + validator_engine.state.m[ + 'auc'], + test_engine.state.m['auc'])) + # visualize(h=trainer_engine.state.best_out, color=trainer_engine.state.best_y) + result = {'loss': trainer_engine.state.min_loss, + 'epoch': trainer_engine.state.best_epoch, + 'train_metrics': trainer_engine.state.best_train_metrics, + 'val_metrics': trainer_engine.state.best_val_metrics, + 'test_metric': trainer_engine.state.best_test_metrics, + 'status': STATUS_OK} + + #pprint.pprint(trainer_engine.state.best_train_metrics) + #pprint.pprint(trainer_engine.state.best_val_metrics) + #pprint.pprint(trainer_engine.state.best_test_metrics) + + return result, trainer_engine.state.best_model diff --git a/train_graph.py b/train_graph.py new file mode 100644 index 0000000..7346ef0 --- /dev/null +++ b/train_graph.py @@ -0,0 +1,223 @@ +from architectures.GCN import GCN + +from Metrics.RegressionMetrics import ClassificationMetrics +from Loss.Loss_uncertainty import loss_uncertainty + +import copy +from hyperopt import STATUS_OK +from ignite.engine import Engine, Events +import neptune +import pprint +import time +import tqdm +import torch + + 
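+# Transductive setup: all patients live in one shared population graph, and the
+# train/val/test splits are index masks over its nodes (train_indices, val_indices,
+# test_indices below) rather than separate data loaders.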
+def train_model_graph(config, loaders, indices, save_metrics=False): + # torch.manual_seed(0) + + train_indices, val_indices, test_indices = indices + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model = GCN(nfeat=loaders.dataset[0].num_features, nclass=2, dropout=config['dropout']) + + n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print('Model parameters', n_params) + model.to(device) + + # METRICS + train_metrics = ClassificationMetrics(classes=2) + val_metrics = ClassificationMetrics(classes=2) + test_metrics = ClassificationMetrics(classes=2) + + # OPTIMIZER + optimizer = torch.optim.Adam(model.parameters(), + lr=config['lr'], + #momentum=config['momentum'], + weight_decay=config['weight_decay']) + + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.7) + + # LOSS + import numpy as np + values, counts = np.unique(loaders.dataset[0].labels.data, return_counts=True) + # inverse-frequency class weights to counter class imbalance + weights = torch.FloatTensor(1 - counts / np.sum(counts)).to(device) + #weights = torch.FloatTensor((0.5, 0.5)).to(device) + loss_train = loss_uncertainty(weights=weights) + loss_val = loss_uncertainty(weights=weights) + loss_test = loss_uncertainty(weights=weights) + + def train(Engine, batch): + + model.train() + optimizer.zero_grad() + data = batch['x'].clone().to(device) + edge_index = batch['edge_index'].clone().to(device) + edge_weight = batch['weights'].clone().to(device) + y_train = batch['y'].clone().to(device) + + out = model.forward(data, edge_index, edge_weight) + # f = torch.softmax(out, dim=1) + f_cloned = out.clone() + #f_cloned[f_cloned == 0] = 1e-30 + + Loss = loss_train.get_loss(f_cloned[train_indices], y_train[train_indices].unsqueeze(1)) + + m_train = train_metrics.compute_metrics(f_cloned[train_indices].detach().cpu().numpy(), + batch['y'][train_indices].detach().cpu().numpy()) + + Loss.backward() + optimizer.step() + + Engine.state.loss = loss_train.get_total_loss() + Engine.state.metrics = train_metrics + Engine.state.m = m_train + Engine.state.out = out + Engine.state.y = y_train.detach().cpu().numpy() + + def validate(Engine, batch): + model.eval() + with torch.no_grad(): + data = batch['x'].clone().to(device) + edge_index = batch['edge_index'].clone().to(device) + edge_weight = batch['weights'].clone().to(device) + y_val = batch['y'].clone().to(device) + + out = model.forward(data, edge_index, edge_weight) + #f = torch.softmax(out, dim=1) + f_cloned = out.clone() + f_cloned[f_cloned == 0] = 1e-30 + + lv = loss_val.get_loss(f_cloned[val_indices], y_val[val_indices].unsqueeze(1)) + scheduler.step(lv) + #print("Epoch {}, lr {}".format(trainer_engine.state.epoch, optimizer.param_groups[0]['lr'])) + m_val = val_metrics.compute_metrics(f_cloned[val_indices].detach().cpu().numpy(), + batch['y'][val_indices].detach().cpu().numpy()) + + Engine.state.loss = loss_val.get_total_loss() + Engine.state.metrics = val_metrics + Engine.state.m = m_val + + def test(Engine, batch): + model.eval() + with torch.no_grad(): + data = batch['x'].clone().to(device) + edge_index = batch['edge_index'].clone().to(device) + edge_weight = batch['weights'].clone().to(device) + y_test = batch['y'].clone().to(device) + + out = model.forward(data, edge_index, edge_weight) + # f = torch.softmax(out, dim=1) + f_cloned = out.clone() + f_cloned[f_cloned == 0] = 1e-30 + + _ = loss_test.get_loss(f_cloned[test_indices], y_test[test_indices].unsqueeze(1)) + m_test = test_metrics.compute_metrics(f_cloned[test_indices].detach().cpu().numpy(), + 
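# metrics are computed on the held-out test nodes only; the forward pass above ran on the full graph + 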
batch['y'][test_indices].detach().cpu().numpy()) + + Engine.state.loss = loss_test.get_total_loss() + Engine.state.metrics = test_metrics + Engine.state.m = m_test + + trainer_engine = Engine(train) + validator_engine = Engine(validate) + test_engine = Engine(test) + + @trainer_engine.on(Events.STARTED) + def training_bootup(engine): + + # print("Training started") + # print("Train_loader,: iterations/epoch: ", len(train_loader), ", total number of samples", + # len(train_loader.sampler)) + # print("Validation_loader,: iterations/epoch: ", len(val_loader), ", total number of samples", + # len(val_loader.sampler)) + # print("Test_loader,: iterations/epoch: ", len(test_loader), ", total number of samples", + # len(test_loader.sampler)) + + time.sleep(0.001) + engine.pbar = tqdm.tqdm(total=300, desc='Training progress') + validator_engine.state.loss = 0 + engine.state.best_epoch = 0 + engine.state.min_loss = 1000 + engine.state.best_train_metrics = {} + engine.state.best_val_metrics = {} + engine.state.best_test_metrics = {} + + engine.count = 0 + + @trainer_engine.on(Events.EPOCH_COMPLETED) + def run_validation(engine): + validator_engine.run(loaders, max_epochs=1) + engine.pbar.update(1) + engine.pbar.set_postfix({'loss': validator_engine.state.loss, + 'loss_train': trainer_engine.state.loss, + 'accuracy_train': trainer_engine.state.m['accuracy']}, refresh=True) + + test_engine.run(loaders, max_epochs=1) + + if validator_engine.state.loss < engine.state.min_loss: + engine.count = 0 + engine.state.min_loss = validator_engine.state.loss + # + else: + engine.count += 1 + if engine.count > 20: + trainer_engine.terminate() + if True: + engine.state.best_epoch = engine.state.epoch + engine.state.best_train_metrics = trainer_engine.state.m + engine.state.best_val_metrics = validator_engine.state.m + engine.state.best_test_metrics = test_engine.state.m + engine.state.best_model = model.state_dict() + + train_metrics.clear() + val_metrics.clear() + test_metrics.clear() + + loss_train.clear() + loss_val.clear() + loss_test.clear() + + @trainer_engine.on(Events.EPOCH_COMPLETED) + def write_neptune(): + + if save_metrics: + neptune.log_metric('loss_train', trainer_engine.state.loss.detach().cpu().numpy()) + neptune.log_metric('loss_val', validator_engine.state.loss.detach().cpu().numpy()) + neptune.log_metric('loss_test', test_engine.state.loss.detach().cpu().numpy()) + neptune.log_metric('train_AUC', trainer_engine.state.m['auc']) + neptune.log_metric('train_accuracy', trainer_engine.state.m['accuracy']) + neptune.log_metric('val_AUC', validator_engine.state.m['auc']) + neptune.log_metric('val_accuracy', validator_engine.state.m['accuracy']) + neptune.log_metric('test_AUC', test_engine.state.m['auc']) + neptune.log_metric('test_accuracy', test_engine.state.m['accuracy']) + + @trainer_engine.on(Events.COMPLETED) + def run_testing(engine): + + # print(time.ctime(), "Running testing") + # test_engine.run(test_loader, max_epochs=1) + + engine.pbar.close() + # time.sleep(0.001) + + trainer_engine.run(loaders, max_epochs=300) + print( + "AUC of training set: {:.2f}, validation set {:.2f}, and test set {:.2f}".format(trainer_engine.state.m['auc'], + validator_engine.state.m[ + 'auc'], + test_engine.state.m['auc'])) + # visualize(h=trainer_engine.state.best_out, color=trainer_engine.state.best_y) + result = {'loss': trainer_engine.state.min_loss, + 'epoch': trainer_engine.state.best_epoch, + 'train_metrics': trainer_engine.state.best_train_metrics, + 'val_metrics': trainer_engine.state.best_val_metrics, + 
'test_metric': trainer_engine.state.best_test_metrics, + 'status': STATUS_OK} + + #pprint.pprint(trainer_engine.state.best_train_metrics) + #pprint.pprint(trainer_engine.state.best_val_metrics) + #pprint.pprint(trainer_engine.state.best_test_metrics) + + return result, trainer_engine.state.best_model -- GitLab
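
For reference, the ensemble-plus-dropout uncertainty decomposition that both test() and test_graph() in evaluate_model.py implement can be condensed into a few NumPy lines. The sketch below is illustrative only and not part of the commit; the function name decompose_uncertainty and the array shapes are assumptions chosen to mirror the code above (softmax outputs stacked over ensemble members and dropout passes). Predictive uncertainty is the entropy of the averaged probabilities, expected data (aleatoric) uncertainty is the average entropy of the individual predictions, and the epistemic part is their difference.

import numpy as np

def decompose_uncertainty(preds, eps=1e-30):
    # preds: softmax outputs, shape [n_samples, n_classes, n_models, n_dropout]
    preds = np.clip(preds, eps, 1.0)       # guard against log(0), as evaluate_model.py does
    mean_preds = preds.mean(axis=(2, 3))   # average over ensemble members and dropout draws
    # predictive (total) uncertainty: entropy of the averaged distribution
    predictive = -(mean_preds * np.log(mean_preds)).sum(axis=1)
    # expected data (aleatoric) uncertainty: mean entropy of each individual prediction
    data = (-(preds * np.log(preds)).sum(axis=1)).mean(axis=(1, 2))
    # epistemic uncertainty: the remainder, i.e. disagreement between members
    return mean_preds, predictive, data, predictive - data

# toy usage; 10 models x 100 dropout passes mirrors test_graph()
rng = np.random.default_rng(0)
logits = rng.normal(size=(4, 2, 10, 100))
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
mean_preds, pred_u, data_u, epi_u = decompose_uncertainty(probs)
print(pred_u, data_u, epi_u)

The epistemic term equals the mutual information between the prediction and the sampled model, so it shrinks toward zero when every ensemble member and dropout draw agrees, while the data term stays high for genuinely ambiguous samples.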