# mody_2024/train.py


from load_dataset import load_data, univariate_analysis
from trainer import BinaryTuner
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
# Load data from the HC.xlsx raw original data file and extract the MODY records into filtered DataFrames.
# The filtered data is stored as `MODY_data.xlsx` for screening.
# load_data() returns a tuple of (mody1, mody2, mody3, mody5) DataFrames, one per sheet in the filtered record.
_, dms2, dms3, _ = load_data()
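# Optional sanity check (not part of the original pipeline): confirm the filtered
# cohorts and their label columns loaded as expected before running the analysis.
# print(dms2.shape, dms2['MODY2_label'].value_counts(dropna=False))
# print(dms3.shape, dms3['MODY3_label'].value_counts(dropna=False))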
# Univariate analysis of the original data
with pd.ExcelWriter("UnivariateODY.xlsx", engine='xlsxwriter') as xls:
    # Continuous variables are tested with Shapiro-Wilk for normality; group metrics are computed,
    # then compared with a t-test or Mann-Whitney U test for differences between groups.
    # Binary variables use Fisher's exact test and frequency tables.
    mody2 = univariate_analysis(dms2.dropna(), target="MODY2_label", continous=['edad diag', 'IMC', 'glu ayu', 'glu 120', 'A1c'], discrete=['sexo', 'diabetes_familia'])
    mody2.to_excel(xls, sheet_name='MODY2', index=False)
    mody3 = univariate_analysis(dms3.dropna(), target="MODY3_label", continous=['edad diag', 'IMC', 'glu ayu', 'glu 120', 'A1c'], discrete=['sexo', 'diabetes_familia'])
    mody3.to_excel(xls, sheet_name='MODY3', index=False)
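# Optional: read the workbook back to verify that both sheets were written
# (pandas returns a dict of DataFrames keyed by sheet name when sheet_name=None).
# tables = pd.read_excel("UnivariateODY.xlsx", sheet_name=None)
# print(tables.keys())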
# For reproducibility, these are the seeds from the original research; set to None to use random seeds.
seeds = [231964, 48928, 132268, 113986, 574626, 130068, 226585, 446306, 535997, 685636, 779992, 600946, 231614, 1027776, 747054, 546372, 885843, 536202, 852539, 848580, 997648, 440679, 118304, 49131, 861767]
# seeds = None
# or generate new ones (numpy is already imported as np):
# n_seeds = 25
# seeds = [np.random.randint(1, 2**20) for _ in range(n_seeds)]
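# Sanity check (optional; assumes one seed per repeated train/test split):
# assert seeds is None or len(seeds) == 25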
# BinaryTuner runs a grid search over multiple hyperparameter spaces and
# saves test-split metrics for the best hyperparameter combinations:
# * Hyperparameter grid search for each ML model, with up to 60 different combinations
# * 10 different machine learning models capable of binary classification
# * Models trained on the data with no missing values, and on MICE- and KNN-imputed data
# * Different random train/test splits for a given test_size ratio
# A folder named after the label is created with all the state and run data.
mody2 = BinaryTuner(dms2, 'MODY2_label', seeds=seeds, test_size=0.2)
# Checkpointing: during the search, all trained models are saved inside the label folder,
# including the scalers required for inference and testing.
mody2.fit()
# Generate a SHAP explainer for the (model, dataset, seed) combination. See the folder structure.
mody2.explain_model('GaussianNB', 'fulldataset-oversampled-mice', 231964)
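# Note: the dataset key above ('fulldataset-oversampled-mice') appears to encode the
# dataset variant, sampling strategy, and imputation method; this reading is inferred
# from the calls in this script, not from BinaryTuner itself.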
# Create a summary of metrics and zip the folder for easy portability
mody2.wrap_and_save()
# Repeat process for next dataset
mody3 = BinaryTuner(dms3, 'MODY3_label', seeds=seeds, test_size=0.2)
mody3.fit()
mody3.explain_model('RandomForestClassifier', 'fulldataset-original-mice', 536202)
mody3.wrap_and_save()