From 910af1def1e3789d2aaf354cbb409f54901ad660 Mon Sep 17 00:00:00 2001 From: Israel Figueroa Date: Tue, 14 Oct 2025 00:08:57 -0300 Subject: [PATCH] fit --- load_dataset.py | 10 ++++----- train.py | 55 +++++++++++++++++++++++++++++++++++-------------- trainer.py | 10 ++++----- 3 files changed, 50 insertions(+), 25 deletions(-) diff --git a/load_dataset.py b/load_dataset.py index 28b6dfb..611e7a1 100644 --- a/load_dataset.py +++ b/load_dataset.py @@ -10,7 +10,7 @@ def safe_float(x): -def analisis_univariado(dfi, target=None, continuas=[], discretas=[]): +def univariate_analysis(dfi, target=None, continous=[], discrete=[]): if target is None: raise ValueError("No target variable provided") @@ -33,8 +33,8 @@ def analisis_univariado(dfi, target=None, continuas=[], discretas=[]): results = [] resultsmody = [] - # Análisis de variables continuas - for var in continuas: + # Analysis of continuous variables + for var in continous: Xvar = dfi[var].T group1_values = data_group1[var].T group2_values = data_group2[var].T @@ -82,8 +82,8 @@ def analisis_univariado(dfi, target=None, continuas=[], discretas=[]): f"{mw_pval:.3f}", ("*" if mw_pval < 0.05 else "NS"), f"{mediang:.1f} ({qlg:.1f} - {qrg:.1f})" ]) - # Análisis de variables discretas - for var in discretas: + # Analysis of discrete variables + for var in discrete: freq_table = dfi.groupby([target, var]).size().unstack(fill_value=0) percentages = freq_table.div(freq_table.sum(axis=1), axis=0) * 100 diff --git a/train.py b/train.py index 899b75d..aaebd24 100644 --- a/train.py +++ b/train.py @@ -1,28 +1,53 @@ -from load_dataset import load_data, analisis_univariado +from load_dataset import load_data, univariate_analysis -#from trainer import BinaryTuner +from trainer import BinaryTuner import pandas as pd import numpy as np import warnings warnings.filterwarnings("ignore") +# Load data from HC.xslc raw original data file and extract MODY data into filtered dataframes +# filtered data is stored as `MODY_data.xlsx` for screening. 
+# this returns a tuple (mody1, mody2, mody3, mody5) of DataFrames, one per sheet in the filtered record _, dms2, dms3, _ = load_data() - -with pd.ExcelWriter("Univariado2.xlsx", engine='xlsxwriter') as xls: - mody2 = analisis_univariado(dms2.dropna(), target="MODY2_label", continuas=['edad diag', 'IMC', 'glu ayu', 'glu 120','A1c'], discretas=['sexo', 'diabetes_familia']) +# Univariate analysis of the original data +with pd.ExcelWriter("UnivariateODY.xlsx", engine='xlsxwriter') as xls: + # continuous variables are Shapiro-tested for normality; group metrics are computed, then t-test or Mann-Whitney U for difference between groups + # binary variables use Fisher exact tests and frequency tables + mody2 = univariate_analysis(dms2.dropna(), target="MODY2_label", continous=['edad diag', 'IMC', 'glu ayu', 'glu 120','A1c'], discrete=['sexo', 'diabetes_familia']) mody2.to_excel(xls, sheet_name='MODY2', index=False) - mody3 = analisis_univariado(dms3.dropna(), target="MODY3_label", continuas=['edad diag', 'IMC', 'glu ayu', 'glu 120','A1c'], discretas=['sexo', 'diabetes_familia']) + mody3 = univariate_analysis(dms3.dropna(), target="MODY3_label", continous=['edad diag', 'IMC', 'glu ayu', 'glu 120','A1c'], discrete=['sexo', 'diabetes_familia']) mody3.to_excel(xls, sheet_name='MODY3', index=False) +# For reproducibility, seeds from the original research, set to None to get random +seeds = [231964, 48928, 132268, 113986, 574626, 130068, 226585, 446306, 535997, 685636, 779992, 600946, 231614, 1027776, 747054, 546372, 885843, 536202, 852539, 848580, 997648, 440679, 118304, 49131, 861767] +# seeds = None +# or +# import numpy as np +# seeds = [np.random.randint(1, 2**20) for _ in range(n_seeds)] -# mody2 = BinaryTuner(dms2, 'MODY2_label', seeds=[231964], test_size=0.2) -# mody2.fit() -# mody2.explain_model('GaussianNB', 'fulldataset-oversampled-mice', 231964) -# mody2.wrap_and_save() -# -# -# mody3 = BinaryTuner(dms3, 'MODY3_label', seeds=[536202], test_size=0.2) -# mody3.fit() -# mody3.explain_model('RandomForestClassifier', 
'fulldataset-original-mice', 536202) -# mody3.wrap_and_save() +# The BinaryTuner does a Grid Search over multiple hyperparameter spaces and +# saves test-split metrics for the best combinations of ML hyperparameters +# * Hyperparameters GridSearch for each ML Model for up to 60 different combinations +# * 10 different Machine Learning Models capable of Binary Classification +# * Models trained on data with no missing values, and with MICE and KNN imputation +# * Different random train and test splits, for given test_size ratio +# A folder is created with the label name with all the state and run data +mody2 = BinaryTuner(dms2, 'MODY2_label', seeds=seeds, test_size=0.2) + +# Checkpoint: during the search all trained models are saved inside the label folder +# including the scalers required for inference and testing +mody2.fit() + +# Generate SHAP explainer for (Model, dataset, seed). See folder structure. +mody2.explain_model('GaussianNB', 'fulldataset-oversampled-mice', 231964) + +# Create summary of metrics and zip the folder for easy portability +mody2.wrap_and_save() + +# Repeat process for next dataset +mody3 = BinaryTuner(dms3, 'MODY3_label', seeds=seeds, test_size=0.2) +mody3.fit() +mody3.explain_model('RandomForestClassifier', 'fulldataset-original-mice', 536202) +mody3.wrap_and_save() diff --git a/trainer.py b/trainer.py index d94e091..f772d20 100644 --- a/trainer.py +++ b/trainer.py @@ -599,8 +599,8 @@ class BinaryTuner: self.saveCheckPoint() self.bar.close() - def get_best_models(self): - return self.ledger.groupby(["Dataset", "Model"])["ROC_AUC"].agg(['mean', 'std']) + def get_best_models(self, metric="ROC_AUC"): + return self.ledger.groupby(["Dataset", "Model"])[metric].agg(['mean', 'std']) def explain_model(self, modelname=None, dataset=None, seed=None): self.logger.info("{:=^60}".format(' Begin SHAP Explainer: {} {} {} '.format(modelname, dataset, seed))) @@ -743,9 +743,9 @@ class BinaryTuner: def wrap_and_save(self): self.logger.info("{:=^60}".format(' Saving Summary 
and Wrap the output in a ZipFile ')) - - with pd.ExcelWriter('{}/Summary.xlsx'.format(self.name) , engine='xlsxwriter') as xls: - self.get_best_models().to_excel(xls, sheet_name='Results') + for metric in ["ROC_AUC", "NPV", "PPV", "Brier", "sensitivity", "specificity"]: + with pd.ExcelWriter('{}/Summary-{}.xlsx'.format(self.name, metric) , engine='xlsxwriter') as xls: + self.get_best_models(metric).to_excel(xls, sheet_name='Results') with zipfile.ZipFile('{}-{}.zip'.format(self.name, self.start), 'w', zipfile.ZIP_DEFLATED) as zipf: for root, dirs, files in os.walk(self.name):