commit 910af1def1 ("fit"), parent c736bc0bbe

load_dataset.py
@@ -10,7 +10,7 @@ def safe_float(x):
 
 
 
-def analisis_univariado(dfi, target=None, continuas=[], discretas=[]):
+def univariate_analysis(dfi, target=None, continous=[], discrete=[]):
     if target is None:
         raise ValueError("No target variable provided")
 
@@ -33,8 +33,8 @@ def analisis_univariado(dfi, target=None, continuas=[], discretas=[]):
     results = []
     resultsmody = []
 
-    # Análisis de variables continuas
-    for var in continuas:
+    # Analysis of continuous variables
+    for var in continous:
         Xvar = dfi[var].T
         group1_values = data_group1[var].T
         group2_values = data_group2[var].T
@@ -82,8 +82,8 @@ def analisis_univariado(dfi, target=None, continuas=[], discretas=[]):
             f"{mw_pval:.3f}", ("*" if mw_pval < 0.05 else "NS"), f"{mediang:.1f} ({qlg:.1f} - {qrg:.1f})"
         ])
 
-    # Análisis de variables discretas
-    for var in discretas:
+    # Analysis of discrete variables
+    for var in discrete:
         freq_table = dfi.groupby([target, var]).size().unstack(fill_value=0)
         percentages = freq_table.div(freq_table.sum(axis=1), axis=0) * 100
 
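The discrete branch above builds its frequency and percentage tables with a `groupby().size().unstack()` chain; what that produces is easy to verify on a toy frame. A minimal sketch (the column names are hypothetical, not taken from the repo):

```python
import pandas as pd

# Toy data mirroring the discrete-variable branch; names are hypothetical.
dfi = pd.DataFrame({
    "MODY2_label": [0, 0, 0, 1, 1, 1],
    "sexo":        ["F", "M", "F", "M", "M", "F"],
})

# Counts of each category per target class (rows: target, columns: categories).
freq_table = dfi.groupby(["MODY2_label", "sexo"]).size().unstack(fill_value=0)

# Row-wise percentages: each target class sums to 100.
percentages = freq_table.div(freq_table.sum(axis=1), axis=0) * 100
print(freq_table)
print(percentages)
```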

train.py (55 changed lines)
@@ -1,28 +1,53 @@
-from load_dataset import load_data, analisis_univariado
-#from trainer import BinaryTuner
+from load_dataset import load_data, univariate_analysis
+from trainer import BinaryTuner
 import pandas as pd
 import numpy as np
 import warnings
 warnings.filterwarnings("ignore")
 
+# Load data from the HC.xslc raw original data file and extract MODY data into filtered dataframes.
+# The filtered data is stored as `MODY_data.xlsx` for screening.
+# This returns a tuple of (mody1, mody2, mody3, mody5) DataFrames, one per sheet in the filtered record.
 _, dms2, dms3, _ = load_data()
 
+# Univariate analysis of the original data
with pd.ExcelWriter("Univariado2.xlsx", engine='xlsxwriter') as xls:
|
with pd.ExcelWriter("UnivariateODY.xlsx", engine='xlsxwriter') as xls:
|
||||||
mody2 = analisis_univariado(dms2.dropna(), target="MODY2_label", continuas=['edad diag', 'IMC', 'glu ayu', 'glu 120','A1c'], discretas=['sexo', 'diabetes_familia'])
|
# continous are tested shapiro for normality. metrics for groups are computed, then ttested or mannwhitneyu for difference in groups
|
||||||
|
# binary use fisher and frecuency tables
|
||||||
|
mody2 = univariate_analysis(dms2.dropna(), target="MODY2_label", continous=['edad diag', 'IMC', 'glu ayu', 'glu 120','A1c'], discrete=['sexo', 'diabetes_familia'])
|
||||||
mody2.to_excel(xls, sheet_name='MODY2', index=False)
|
mody2.to_excel(xls, sheet_name='MODY2', index=False)
|
||||||
|
|
||||||
mody3 = analisis_univariado(dms3.dropna(), target="MODY3_label", continuas=['edad diag', 'IMC', 'glu ayu', 'glu 120','A1c'], discretas=['sexo', 'diabetes_familia'])
|
mody3 = univariate_analysis(dms3.dropna(), target="MODY3_label", continous=['edad diag', 'IMC', 'glu ayu', 'glu 120','A1c'], discrete=['sexo', 'diabetes_familia'])
|
||||||
mody3.to_excel(xls, sheet_name='MODY3', index=False)
|
mody3.to_excel(xls, sheet_name='MODY3', index=False)
|
||||||
|
|
 
+# For reproducibility, seeds from the original research; set to None to get random seeds
+seeds = [231964, 48928, 132268, 113986, 574626, 130068, 226585, 446306, 535997, 685636, 779992, 600946, 231614, 1027776, 747054, 546372, 885843, 536202, 852539, 848580, 997648, 440679, 118304, 49131, 861767]
+# seeds = None
+# or:
+# seeds = [np.random.randint(1, 2**20) for _ in range(n_seeds)]
 
-# mody2 = BinaryTuner(dms2, 'MODY2_label', seeds=[231964], test_size=0.2)
-# mody2.fit()
-# mody2.explain_model('GaussianNB', 'fulldataset-oversampled-mice', 231964)
-# mody2.wrap_and_save()
-#
-#
-# mody3 = BinaryTuner(dms3, 'MODY3_label', seeds=[536202], test_size=0.2)
-# mody3.fit()
-# mody3.explain_model('RandomForestClassifier', 'fulldataset-original-mice', 536202)
-# mody3.wrap_and_save()
+# The BinaryTuner runs a grid search over multiple hyperparameter spaces and
+# saves test-split metrics for the best combinations of ML hyperparameters:
+# * a hyperparameter grid search for each ML model, up to 60 different combinations
+# * 10 different machine learning models capable of binary classification
+# * models trained on data with no missing values, and on MICE- and KNN-imputed data
+# * different random train/test splits for a given test_size ratio
+# A folder named after the label is created with all the state and run data.
+mody2 = BinaryTuner(dms2, 'MODY2_label', seeds=seeds, test_size=0.2)
+
+# Checkpointing: during the search, all trained models are saved inside the label folder,
+# including the scalers required for inference and testing.
+mody2.fit()
+
+# Generate a SHAP explainer for (model, dataset, seed). See the folder structure.
+mody2.explain_model('GaussianNB', 'fulldataset-oversampled-mice', 231964)
+
+# Create a summary of metrics and zip the folder for easy portability.
+mody2.wrap_and_save()
+
+# Repeat the process for the next dataset
+mody3 = BinaryTuner(dms3, 'MODY3_label', seeds=seeds, test_size=0.2)
+mody3.fit()
+mody3.explain_model('RandomForestClassifier', 'fulldataset-original-mice', 536202)
+mody3.wrap_and_save()
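The new comments in this file summarize the statistical protocol behind `univariate_analysis`: Shapiro-Wilk decides between a t-test and Mann-Whitney U for continuous variables, and binary variables get Fisher's exact test on frequency tables. A minimal sketch of that test selection, assuming two groups split by the target (the `compare_groups` helper and the sample data are hypothetical; the scipy calls are real):

```python
import numpy as np
from scipy import stats

def compare_groups(group1_values, group2_values, alpha=0.05):
    """Pick the comparison test based on Shapiro-Wilk normality, as the comments describe."""
    normal = (stats.shapiro(group1_values).pvalue > alpha
              and stats.shapiro(group2_values).pvalue > alpha)
    if normal:
        # Both groups look normal: independent-samples t-test.
        _, pval = stats.ttest_ind(group1_values, group2_values)
    else:
        # Otherwise fall back to the non-parametric Mann-Whitney U.
        _, pval = stats.mannwhitneyu(group1_values, group2_values)
    # Binary variables would instead get a 2x2 frequency table and stats.fisher_exact.
    return pval

rng = np.random.default_rng(0)
print(compare_groups(rng.normal(0, 1, 30), rng.normal(1, 1, 30)))
```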

trainer.py (10 changed lines)
@@ -599,8 +599,8 @@ class BinaryTuner:
         self.saveCheckPoint()
         self.bar.close()
 
-    def get_best_models(self):
-        return self.ledger.groupby(["Dataset", "Model"])["ROC_AUC"].agg(['mean', 'std'])
+    def get_best_models(self, metric="ROC_AUC"):
+        return self.ledger.groupby(["Dataset", "Model"])[metric].agg(['mean', 'std'])
 
     def explain_model(self, modelname=None, dataset=None, seed=None):
         self.logger.info("{:=^60}".format(' Begin SHAP Explainer: {} {} {} '.format(modelname, dataset, seed)))
@@ -743,9 +743,9 @@ class BinaryTuner:
 
     def wrap_and_save(self):
         self.logger.info("{:=^60}".format(' Saving Summary and Wrap the output in a ZipFile '))
-        with pd.ExcelWriter('{}/Summary.xlsx'.format(self.name), engine='xlsxwriter') as xls:
-            self.get_best_models().to_excel(xls, sheet_name='Results')
+        for metric in ["ROC_AUC", "NPV", "PPV", "Brier", "sensitivity", "specificity"]:
+            with pd.ExcelWriter('{}/Summary-{}.xlsx'.format(self.name, metric), engine='xlsxwriter') as xls:
+                self.get_best_models(metric).to_excel(xls, sheet_name='Results')
 
         with zipfile.ZipFile('{}-{}.zip'.format(self.name, self.start), 'w', zipfile.ZIP_DEFLATED) as zipf:
             for root, dirs, files in os.walk(self.name):
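With the `metric` parameter, `get_best_models` aggregates whichever ledger column is requested, so `wrap_and_save` can emit one Summary workbook per metric. A toy reproduction of that groupby (the ledger rows and values are made up; the column names come from the diff):

```python
import pandas as pd

# Minimal stand-in for self.ledger: one row per (Dataset, Model, seed) run.
ledger = pd.DataFrame({
    "Dataset": ["fulldataset-original-mice"] * 4,
    "Model":   ["GaussianNB", "GaussianNB",
                "RandomForestClassifier", "RandomForestClassifier"],
    "ROC_AUC": [0.81, 0.85, 0.90, 0.88],
    "Brier":   [0.12, 0.10, 0.08, 0.09],
})

def get_best_models(ledger, metric="ROC_AUC"):
    # Mean and spread of the chosen metric across seeds, per dataset/model pair.
    return ledger.groupby(["Dataset", "Model"])[metric].agg(['mean', 'std'])

print(get_best_models(ledger, "Brier"))
```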