def analisis_univariado(dfi, target=None, continuas=None, discretas=None):
    """Run a univariate comparison of each variable between two target groups.

    The rows of ``dfi`` are split into two groups by the two distinct values
    of ``dfi[target]``. "Grupo 1" is the value that appears first in the
    column and "Grupo 2" the other one (order of ``Series.unique()``).

    * Continuous variables: a Shapiro-Wilk test on the pooled values decides
      between mean / standard deviation / Welch t-test (p >= 0.05) and
      median / interquartile range / Mann-Whitney U (otherwise).
    * Discrete variables: a target-by-variable contingency table is built;
      Fisher's exact test is used for a 2x2 table, chi-square otherwise.

    Missing values are excluded per variable before any statistic is
    computed, and rows whose target is missing belong to neither group.

    Args:
        dfi: pandas DataFrame holding ``target`` and every listed variable.
        target: name of the binary column that defines the two groups.
        continuas: iterable of continuous column names (default: none).
        discretas: iterable of discrete column names (default: none).

    Returns:
        pandas.DataFrame with one row per analysed variable and the columns
        "Variable", "Tipo", "Distribución", "Medidas descriptivas",
        "Estadísticas" and "Resultados Prueba".

    Raises:
        ValueError: if ``target`` is None or the target column does not have
            exactly two distinct non-missing values.
    """
    if target is None:
        raise ValueError("No target variable provided")

    # Local imports, matching how the rest of this module imports pandas.
    import pandas as pd
    from scipy.stats import (
        chi2_contingency,
        fisher_exact,
        mannwhitneyu,
        shapiro,
        ttest_ind,
    )

    # None sentinels instead of mutable list defaults shared across calls.
    continuas = [] if continuas is None else list(continuas)
    discretas = [] if discretas is None else list(discretas)

    # A missing label is not a class: drop those rows before grouping so NaN
    # is never counted as one of the two groups.
    data = dfi[dfi[target].notna()]
    groups = data[target].unique()
    if len(groups) != 2:
        raise ValueError("Target variable must have exactly two unique values")

    group1, group2 = groups
    data_group1 = data[data[target] == group1]
    data_group2 = data[data[target] == group2]

    results = []

    # Continuous variables.
    for var in continuas:
        # scipy propagates NaN into the p-value, so drop missing values first.
        pooled_values = data[var].dropna()
        group1_values = data_group1[var].dropna()
        group2_values = data_group2[var].dropna()

        # Normality is assessed on the pooled sample (Shapiro-Wilk).
        _, normality_pval = shapiro(pooled_values)

        if normality_pval >= 0.05:
            # Normal distribution: mean, standard deviation and Welch t-test.
            mean1, std1 = group1_values.mean(), group1_values.std()
            mean2, std2 = group2_values.mean(), group2_values.std()
            _, t_pval = ttest_ind(group1_values, group2_values, equal_var=False)
            results.append([
                var, "Continua", "Normal",
                f"Media: {mean1:.2f} (Grupo 1), {mean2:.2f} (Grupo 2)",
                f"Desviación Est.: {std1:.2f} (Grupo 1), {std2:.2f} (Grupo 2)",
                f"Test t: p={t_pval:.3f}",
            ])
        else:
            # Non-normal distribution: median, IQR and Mann-Whitney U.
            median1 = group1_values.median()
            iqr1 = group1_values.quantile(0.75) - group1_values.quantile(0.25)
            median2 = group2_values.median()
            iqr2 = group2_values.quantile(0.75) - group2_values.quantile(0.25)
            # Request the two-sided test explicitly; the default for
            # `alternative` has changed between SciPy releases.
            _, mw_pval = mannwhitneyu(
                group1_values, group2_values, alternative="two-sided"
            )
            results.append([
                var, "Continua", "No Normal",
                f"Mediana: {median1:.2f} (Grupo 1), {median2:.2f} (Grupo 2)",
                f"RIC: {iqr1:.2f} (Grupo 1), {iqr2:.2f} (Grupo 2)",
                f"Mann-Whitney: p={mw_pval:.3f}",
            ])

    # Discrete variables.
    for var in discretas:
        # Rows = target value, columns = category of `var`, cells = counts.
        freq_table = data.groupby([target, var]).size().unstack(fill_value=0)
        # Row-wise percentages: share of each category within a target group.
        percentages = freq_table.div(freq_table.sum(axis=1), axis=0) * 100

        # fisher_exact only accepts a 2x2 table, so check both dimensions.
        if freq_table.shape == (2, 2):
            _, fisher_pval = fisher_exact(freq_table.values)
            test_result = f"Fisher Exact: p={fisher_pval:.3f}"
        else:
            _, chi2_pval, _, _ = chi2_contingency(freq_table)
            test_result = f"Chi2: p={chi2_pval:.3f}"

        results.append([
            var, "Discreta", "N/A",
            f"Frecuencias: {freq_table.to_dict()}",
            f"Porcentajes: {percentages.to_dict()}",
            test_result,
        ])

    # Assemble the per-variable rows into the result table.
    return pd.DataFrame(results, columns=[
        "Variable", "Tipo", "Distribución", "Medidas descriptivas",
        "Estadísticas", "Resultados Prueba",
    ])
discretas=['sexo', 'hist fam']) +print(resultados) -mody3 = BinaryTuner(dms3, 'MODY3_label', seeds=[536202], test_size=0.2) -mody3.fit() -mody3.explain_model('RandomForestClassifier', 'fulldataset-original-mice', 536202) -mody3.wrap_and_save() +# mody2 = BinaryTuner(dms2, 'MODY2_label', seeds=[231964], test_size=0.2) +# mody2.fit() +# mody2.explain_model('GaussianNB', 'fulldataset-oversampled-mice', 231964) +# mody2.wrap_and_save() +# +# +# mody3 = BinaryTuner(dms3, 'MODY3_label', seeds=[536202], test_size=0.2) +# mody3.fit() +# mody3.explain_model('RandomForestClassifier', 'fulldataset-original-mice', 536202) +# mody3.wrap_and_save()