main
ifiguero 2025-01-15 21:09:00 -03:00
parent 85571895de
commit 03981630e5
3 changed files with 102 additions and 9 deletions

2
.gitignore vendored 100644
View File

@ -0,0 +1,2 @@
old/**
*zip

View File

@ -9,6 +9,92 @@ def safe_float(x):
return np.nan return np.nan
def analisis_univariado(dfi, target=None, continuas=None, discretas=None):
    """Run a univariate comparison of each variable between two target groups.

    The rows of ``dfi`` are split into two groups according to the two unique
    values of the ``target`` column ("Grupo 1" is the value that appears first
    in the column, "Grupo 2" the other one).

    For every continuous variable the pooled values are tested for normality
    with Shapiro-Wilk (alpha = 0.05):

    * normal     -> mean and standard deviation per group, Welch's t-test;
    * not normal -> median and interquartile range per group, two-sided
      Mann-Whitney U test.

    For every discrete variable a target-by-level frequency table and row
    percentages are built; Fisher's exact test is used when the variable has
    exactly two levels (2x2 table), the chi-squared test otherwise.

    Missing values are dropped per variable before the statistical tests so
    the tests see the same observations as the (NaN-skipping) pandas
    descriptive statistics.

    Args:
        dfi: pandas DataFrame holding ``target`` and every listed variable.
        target: name of the binary column that defines the two groups.
        continuas: iterable of continuous column names (default: none).
        discretas: iterable of discrete column names (default: none).

    Returns:
        A pandas DataFrame with one row per analysed variable and the columns
        "Variable", "Tipo", "Distribución", "Medidas descriptivas",
        "Estadísticas" and "Resultados Prueba".

    Raises:
        ValueError: if ``target`` is None or the target column does not have
            exactly two unique values.
    """
    if target is None:
        raise ValueError("No target variable provided")

    import pandas as pd
    from scipy.stats import shapiro, ttest_ind, mannwhitneyu, chi2_contingency, fisher_exact

    # None instead of [] as default: a list default is shared across calls.
    continuas = [] if continuas is None else list(continuas)
    discretas = [] if discretas is None else list(discretas)

    # Split the target into its two groups.
    groups = dfi[target].unique()
    if len(groups) != 2:
        raise ValueError("Target variable must have exactly two unique values")
    group1, group2 = groups
    in_group1 = dfi[target] == group1
    in_group2 = dfi[target] == group2

    results = []

    # Continuous variables.
    for var in continuas:
        # Drop missing values: the SciPy tests below do not skip NaN by
        # default, whereas the pandas summaries do.
        pooled_values = dfi[var].dropna()
        group1_values = dfi.loc[in_group1, var].dropna()
        group2_values = dfi.loc[in_group2, var].dropna()

        # Normality test (Shapiro-Wilk) on the pooled sample.
        _, p = shapiro(pooled_values)
        normal = p >= 0.05

        if normal:
            # Normal distribution: mean, standard deviation and Welch's t-test.
            mean1, std1 = group1_values.mean(), group1_values.std()
            mean2, std2 = group2_values.mean(), group2_values.std()
            _, t_pval = ttest_ind(group1_values, group2_values, equal_var=False)
            results.append([
                var, "Continua", "Normal",
                f"Media: {mean1:.2f} (Grupo 1), {mean2:.2f} (Grupo 2)",
                f"Desviación Est.: {std1:.2f} (Grupo 1), {std2:.2f} (Grupo 2)",
                f"Test t: p={t_pval:.3f}"
            ])
        else:
            # Non-normal distribution: median, interquartile range and
            # Mann-Whitney U test.
            median1 = group1_values.median()
            iqr1 = group1_values.quantile(0.75) - group1_values.quantile(0.25)
            median2 = group2_values.median()
            iqr2 = group2_values.quantile(0.75) - group2_values.quantile(0.25)
            # Explicit two-sided alternative so the p-value does not depend on
            # the SciPy version's default and matches the two-sided t-test.
            _, mw_pval = mannwhitneyu(
                group1_values, group2_values, alternative="two-sided"
            )
            results.append([
                var, "Continua", "No Normal",
                f"Mediana: {median1:.2f} (Grupo 1), {median2:.2f} (Grupo 2)",
                f"RIC: {iqr1:.2f} (Grupo 1), {iqr2:.2f} (Grupo 2)",
                f"Mann-Whitney: p={mw_pval:.3f}"
            ])

    # Discrete variables.
    for var in discretas:
        # Rows: target groups; columns: levels of the variable.
        freq_table = dfi.groupby([target, var]).size().unstack(fill_value=0)
        percentages = freq_table.div(freq_table.sum(axis=1), axis=0) * 100

        # Statistical test.
        if freq_table.shape[1] == 2:
            # Two levels x two groups: Fisher's exact test.
            _, fisher_pval = fisher_exact(freq_table.values)
            test_result = f"Fisher Exact: p={fisher_pval:.3f}"
        else:
            # Any other number of levels: chi-squared test.
            _, chi2_pval, _, _ = chi2_contingency(freq_table)
            test_result = f"Chi2: p={chi2_pval:.3f}"

        results.append([
            var, "Discreta", "N/A",
            f"Frecuencias: {freq_table.to_dict()}",
            f"Porcentajes: {percentages.to_dict()}",
            test_result
        ])

    # Collect everything into a DataFrame.
    results_df = pd.DataFrame(results, columns=[
        "Variable", "Tipo", "Distribución", "Medidas descriptivas", "Estadísticas", "Resultados Prueba"
    ])
    return results_df
def load_data(Reload=False): def load_data(Reload=False):
if os.path.isfile('MODY_data.xlsx'): if os.path.isfile('MODY_data.xlsx'):
import pandas as pd import pandas as pd

View File

@ -1,4 +1,4 @@
from load_dataset import load_data from load_dataset import load_data, analisis_univariado
from trainer import BinaryTuner from trainer import BinaryTuner
import pandas as pd import pandas as pd
import numpy as np import numpy as np
@ -7,13 +7,18 @@ warnings.filterwarnings("ignore")
_, dms2, dms3, _ = load_data() _, dms2, dms3, _ = load_data()
mody2 = BinaryTuner(dms2, 'MODY2_label', seeds=[231964], test_size=0.2)
mody2.fit() resultados = analisis_univariado(dms2, target="MODY2_label", continuas=['edad diag', 'IMC', 'glu ayu', 'glu 120','A1c'], discretas=['sexo', 'hist fam'])
mody2.explain_model('GaussianNB', 'fulldataset-oversampled-mice', 231964) print(resultados)
mody2.wrap_and_save()
mody3 = BinaryTuner(dms3, 'MODY3_label', seeds=[536202], test_size=0.2) # mody2 = BinaryTuner(dms2, 'MODY2_label', seeds=[231964], test_size=0.2)
mody3.fit() # mody2.fit()
mody3.explain_model('RandomForestClassifier', 'fulldataset-original-mice', 536202) # mody2.explain_model('GaussianNB', 'fulldataset-oversampled-mice', 231964)
mody3.wrap_and_save() # mody2.wrap_and_save()
#
#
# mody3 = BinaryTuner(dms3, 'MODY3_label', seeds=[536202], test_size=0.2)
# mody3.fit()
# mody3.explain_model('RandomForestClassifier', 'fulldataset-original-mice', 536202)
# mody3.wrap_and_save()