# mody_2024/load_dataset.py
import numpy as np
import os
import xlsxwriter
def safe_float(x):
try:
return float(x.replace(',', '.'))
except ValueError:
return np.nan
# 2025-01-15 21:09:00 -03:00
def analisis_univariado(dfi, target=None, continuas=[], discretas=[]):
if target is None:
raise ValueError("No target variable provided")
import pandas as pd
from scipy.stats import shapiro, ttest_ind, mannwhitneyu, chi2_contingency, fisher_exact
# label_columns = ['sexo', 'hist fam', 'edad diag', 'IMC', 'glu ayu', 'glu 120','A1c']
label_columns = dfi.drop(target, axis=1).columns
# Separar el target en dos grupos: N positivo y N negativo
2025-01-17 01:29:05 -03:00
groups = sorted(dfi[target].unique())
2025-01-15 21:09:00 -03:00
if len(groups) != 2:
raise ValueError("Target variable must have exactly two unique values")
2025-01-16 13:54:21 -03:00
glabel = {1: 'MODY Pos', 0: 'MODY Neg'}
2025-01-15 21:09:00 -03:00
group1, group2 = groups
data_group1 = dfi[dfi[target] == group1][label_columns]
data_group2 = dfi[dfi[target] == group2][label_columns]
results = []
2025-01-18 23:14:21 -03:00
resultsmody = []
2025-01-15 21:09:00 -03:00
# Análisis de variables continuas
for var in continuas:
2025-01-17 00:27:48 -03:00
Xvar = dfi[var].T
group1_values = data_group1[var].T
group2_values = data_group2[var].T
2025-01-15 21:09:00 -03:00
# Test de normalidad (Shapiro-Wilk)
2025-01-16 14:41:59 -03:00
stat, p = shapiro(Xvar, nan_policy='raise')
2025-01-17 00:27:48 -03:00
normal = p >= 0.05
2025-01-15 21:09:00 -03:00
if normal:
# Distribución normal: media, desviación estándar, y test t
2025-01-18 23:14:21 -03:00
meang, stdg = dfi[var].mean(), dfi[var].std()
2025-01-15 21:09:00 -03:00
mean1, std1 = group1_values.mean(), group1_values.std()
mean2, std2 = group2_values.mean(), group2_values.std()
t_stat, t_pval = ttest_ind(group1_values, group2_values, equal_var=False)
results.append([
2025-01-16 14:45:21 -03:00
var, "Continua", f"Normal (p={p:.7f})",
2025-01-17 01:05:34 -03:00
f"mean: {mean1:.2f}, stdev: {std1:.2f}",
f"mean: {mean2:.2f}, stdev: {std2:.2f}",
2025-01-17 01:25:28 -03:00
f"t Student: p={t_pval:.3f} " + ("Dif Significativa" if t_pval < 0.05 else "Dif No-Significativa")
2025-01-15 21:09:00 -03:00
])
2025-01-18 23:14:21 -03:00
resultsmody.append([
var, " ", f"{mean1:.1f} ± {std1:.1f}", f"{mean2:.1f} ± {std2:.1f}",
f"{t_pval:.3f}", ("*" if t_pval < 0.05 else "NS"), f"{meang:.1f} ± {stdg:.1f}"
])
2025-01-15 21:09:00 -03:00
else:
# Distribución no normal: mediana, rango intercuartil, y test Mann-Whitney
2025-01-19 00:07:58 -03:00
mediang, qrg, qlg = dfi[var].median(), dfi[var].quantile(0.75), dfi[var].quantile(0.25)
2025-01-18 23:14:21 -03:00
qr1, ql1 = group1_values.quantile(0.75), group1_values.quantile(0.25)
qr2, ql2 = group2_values.quantile(0.75), group2_values.quantile(0.25)
2025-01-15 21:09:00 -03:00
median1, iqr1 = group1_values.median(), group1_values.quantile(0.75) - group1_values.quantile(0.25)
median2, iqr2 = group2_values.median(), group2_values.quantile(0.75) - group2_values.quantile(0.25)
mw_stat, mw_pval = mannwhitneyu(group1_values, group2_values)
results.append([
2025-01-16 14:45:21 -03:00
var, "Continua", f"No Normal (p={p:.7f})",
2025-01-17 01:05:34 -03:00
f"Mediana: {median1:.2f}, RIC: {iqr1:.2f}",
f"Mediana: {median2:.2f}, RIC: {iqr2:.2f}",
2025-01-17 01:25:28 -03:00
f"Mann-Whitney: p={mw_pval:.3f} " + ("Dif Significativa" if mw_pval < 0.05 else "Dif No-Significativa")
2025-01-15 21:09:00 -03:00
])
2025-01-18 23:14:21 -03:00
resultsmody.append([
2025-01-19 00:00:03 -03:00
var, " ", f"{median1:.1f} ({ql1:.1f} - {qr1:.1f})", f"{median2:.1f} ({ql2:.1f} - {qr2:.1f})",
2025-01-18 23:31:59 -03:00
f"{mw_pval:.3f}", ("*" if mw_pval < 0.05 else "NS"), f"{mediang:.1f} ({qlg:.1f} - {qrg:.1f})"
2025-01-18 23:14:21 -03:00
])
2025-01-15 21:09:00 -03:00
# Análisis de variables discretas
for var in discretas:
freq_table = dfi.groupby([target, var]).size().unstack(fill_value=0)
percentages = freq_table.div(freq_table.sum(axis=1), axis=0) * 100
# Pruebas estadísticas
if freq_table.shape[1] == 2:
# Test exacto de Fisher
_, fisher_pval = fisher_exact(freq_table.values)
2025-01-17 01:25:28 -03:00
test_result = f"Fisher Exact: p={fisher_pval:.3f} " + ("Dif Significativa" if fisher_pval < 0.05 else "Dif No-Significativa")
2025-01-15 21:09:00 -03:00
else:
# Test Chi cuadrado
chi2_stat, chi2_pval, _, _ = chi2_contingency(freq_table)
test_result = f"Chi2: p={chi2_pval:.3f}"
results.append([
var, "Discreta", "N/A",
2025-01-17 00:35:13 -03:00
f"Frecuencias: {freq_table.values}",
2025-01-17 01:06:27 -03:00
f"Porcentajes: {percentages.values.round(1)}",
2025-01-15 21:09:00 -03:00
test_result
])
2025-01-18 23:39:03 -03:00
freq_matrix = freq_table.values
2025-01-18 23:40:41 -03:00
percentages_matrix = percentages.values
2025-01-18 23:39:03 -03:00
tot = freq_matrix[0][1] + freq_matrix[1][1]
2025-01-18 23:14:21 -03:00
totf = 100 * tot / len(dfi[var])
resultsmody.append([
2025-01-18 23:52:06 -03:00
var, " ", f"{percentages_matrix[0][1]:.1f} ({freq_matrix[0][1]}/{len(data_group1)})", f"{percentages_matrix[1][1]:.1f} ({freq_matrix[1][1]}/{len(data_group2)})",
2025-01-18 23:31:59 -03:00
f"{fisher_pval:.3f}", ("*" if fisher_pval < 0.05 else "NS"), f"{totf:.1f} ({tot}/{len(dfi[var])})"
2025-01-18 23:14:21 -03:00
])
2025-01-15 21:09:00 -03:00
# Crear DataFrame con los resultados
2025-01-18 23:14:21 -03:00
results_df = pd.DataFrame(resultsmody, columns=[
2025-01-19 00:01:21 -03:00
"Variable", "Unidad", "{}\n n={}".format(glabel[group1], len(data_group1)),"{}\n n={}".format(glabel[group2], len(data_group2)), "Pvalue", " ", "Total"
2025-01-15 21:09:00 -03:00
])
return results_df
# 2024-12-06 19:08:51 -03:00
def load_data(Reload=False):
if os.path.isfile('MODY_data.xlsx'):
import pandas as pd
with pd.ExcelFile("MODY_data.xlsx") as xls:
dsm1_complete = pd.read_excel(xls, sheet_name='Dataset MODY1')
dsm2_complete = pd.read_excel(xls, sheet_name='Dataset MODY2')
dsm3_complete = pd.read_excel(xls, sheet_name='Dataset MODY3')
dsm5_complete = pd.read_excel(xls, sheet_name='Dataset MODY5')
else:
print("========================================================================================")
if not os.path.isfile('HC.xlsx'):
raise 'NoDatasetToLoad'
import pandas as pd
with pd.ExcelFile("HC.xlsx") as xls:
raw_data = pd.read_excel(xls, header=0)
# pd.read_excel('HC.xlsx', header=0)
# Retiramos las columnas que no son de interes
drop_columns=['HC', 'probando', 'procedencia','apellido','fecha ingreso','edad','pago','factura','monto','Pendiente','método','Referencias','Analisis','aclar_pagos','tratamiento','notas','nro de familia', 'resultado']
raw_data.drop(columns=drop_columns, inplace=True)
for index, var in raw_data.iterrows():
if not pd.isna(var['IMC']) and isinstance(var['IMC'], str):
raw_data.loc[index, 'IMC'] = safe_float(var['IMC'])
if not pd.isna(var['A1c']) and isinstance(var['A1c'], str):
raw_data.loc[index, 'A1c'] = safe_float(var['A1c'])
if not pd.isna(var['edad diag']) and isinstance(var['edad diag'], str):
raw_data.loc[index, 'edad diag'] = round(safe_float(var['edad diag']),0)
if not pd.isna(var['glu ayu']) and isinstance(var['glu ayu'], str):
raw_data.loc[index, 'glu ayu'] = round(safe_float(var['glu ayu']),0)
if not pd.isna(var['glu 120']) and isinstance(var['glu 120'], str):
raw_data.loc[index, 'glu 120'] = round(safe_float(var['glu 120']),0)
raw_data['IMC'] = raw_data['IMC'].astype(np.float64)
raw_data['A1c'] = raw_data['A1c'].astype(np.float64)
raw_data['edad diag'] = raw_data['edad diag'].astype(np.float64)
raw_data['glu ayu'] = raw_data['glu ayu'].astype(np.float64)
raw_data['glu 120'] = raw_data['glu 120'].astype(np.float64)
diagnosticos = []
for index, var in raw_data.iterrows():
if var['sospecha MODY'] == '2':
diagnosticos.append(var['diagnostico'])
print("Total elementos en el dataset con sospecha MODY2:\t{}".format(len(diagnosticos)))
print("Diagnosticos del grupo:")
diagnosticos = list(set(diagnosticos))
for diagnostico in diagnosticos:
print("- '{}'".format(diagnostico))
print("========================================================================================")
diagnosticos = []
for index, var in raw_data.iterrows():
if var['sospecha MODY'] == '3':
diagnosticos.append(var['diagnostico'])
print("Total elementos en el dataset con sospecha MODY3:\t{}".format(len(diagnosticos)))
print("Diagnosticos del grupo:")
diagnosticos = list(set(diagnosticos))
for diagnostico in diagnosticos:
print("- '{}'".format(diagnostico))
print("========================================================================================")
diagnosticos = []
for index, var in raw_data.iterrows():
if var['sospecha MODY'] not in ['2', '3']:
diagnosticos.append(var['diagnostico'])
print("Total elementos en el dataset con sospechas diferentes a 2 o 3:\t{}".format(len(diagnosticos)))
print("Diagnosticos del grupo:")
diagnosticos = list(set(diagnosticos))
for diagnostico in diagnosticos:
print("- '{}'".format(diagnostico))
## generación de las clases en base a la confirmación de la sospecha
raw_data['MODY1_pos'] = False
raw_data['MODY1_neg'] = False
raw_data['MODY2_pos'] = False
raw_data['MODY2_neg'] = False
raw_data['MODY3_pos'] = False
raw_data['MODY3_neg'] = False
raw_data['MODY5_pos'] = False
raw_data['MODY5_neg'] = False
raw_data['SiEntiqueta'] = False
raw_data['Normal'] = False
raw_data.loc[ (raw_data['diagnostico'].str.contains('Diagnóstico MODY1', case=False, na=False)), 'MODY1_pos'] = True
raw_data.loc[ (raw_data['diagnostico'].str.contains('Diagnóstico MODY2', case=False, na=False)), 'MODY2_pos'] = True
raw_data.loc[ (raw_data['diagnostico'].str.contains('Diagnóstico MODY3', case=False, na=False)), 'MODY3_pos'] = True
raw_data.loc[ (raw_data['diagnostico'].str.contains('Diagnóstico MODY5', case=False, na=False)), 'MODY5_pos'] = True
raw_data.loc[ (raw_data['sospecha MODY'] == '1') & (raw_data['diagnostico'].str.contains('No se confirma', case=False, na=False)), 'MODY1_neg'] = True
raw_data.loc[ (raw_data['sospecha MODY'] == '2') & (raw_data['diagnostico'].str.contains('No se confirma', case=False, na=False)), 'MODY2_neg'] = True
raw_data.loc[ (raw_data['sospecha MODY'] == '3') & (raw_data['diagnostico'].str.contains('No se confirma', case=False, na=False)), 'MODY3_neg'] = True
raw_data.loc[ (raw_data['sospecha MODY'] == '5') & (raw_data['diagnostico'].str.contains('No se confirma', case=False, na=False)), 'MODY5_neg'] = True
raw_data.loc[ (raw_data['diagnostico'].str.contains('Normal', case=False, na=False)), 'Normal' ] = True
raw_data.loc[ (raw_data['diagnostico'].str.contains('No se hace', case=False, na=False)), 'SiEntiqueta'] = True
raw_data.loc[ (raw_data['diagnostico'].str.contains('Sin diagnóstico', case=False, na=False)), 'SiEntiqueta'] = True
raw_data.loc[ (raw_data['diagnostico'].str.contains('Otros', case=False, na=False)), 'SiEntiqueta'] = True
raw_data.loc[ (raw_data['diagnostico'].str.contains('No es MODY', case=False, na=False)), 'SiEntiqueta'] = True
raw_data.loc[ (raw_data['diagnostico'].str.contains('Falta definir', case=False, na=False)), 'SiEntiqueta'] = True
raw_data.loc[ (~raw_data['sospecha MODY'].isin(['1', '2', '3', '5'])) & (raw_data['diagnostico'].str.contains('No se confirma', case=False, na=False)), 'SiEntiqueta'] = True
raw_data.loc[ pd.isna(raw_data['diagnostico']), 'SiEntiqueta'] = True
print("================== Datos sin confirmar/descartar ningún MODY ===========================")
tipos = ['MODY1_pos', 'MODY1_neg', 'MODY2_pos', 'MODY2_neg', 'MODY3_pos', 'MODY3_neg','MODY5_pos', 'MODY5_neg','Normal','SiEntiqueta']
sinconfirmar = 0
## Datos que no cumplen con el criterio
for index, var in raw_data.iterrows(): # imprime los registros que no pertenecen a ninguna categoria:
if not any(var[col] for col in tipos):
print("sujeto: {} \t| sospecha: {} \t| diagnostico: {:18} \t | historial: {} ".format(var['protocolo'],var['sospecha MODY'], var['diagnostico'], var['historial']))
sinconfirmar += 1
print("====================== Diagnosticos confirmados/descartados ==========================")
contador = {}
for tipo in tipos:
contador[tipo] = 0
for index, var in raw_data.iterrows():
for tipo in tipos:
if var[tipo]:
contador[tipo] += 1
for tipo in tipos:
print("{:20} \t {} ({}%)".format(tipo, contador[tipo], round((contador[tipo]/len(raw_data))*100, 2)))
print("=========================== ==================== ==================================")
label_vars = ['protocolo', 'nombre', 'edad diag', 'IMC', 'antecedentes fam', 'glu ayu', 'glu 120', 'A1c','MODY1_pos', 'MODY1_neg', 'MODY2_pos', 'MODY2_neg', 'MODY3_pos', 'MODY3_neg','MODY5_pos', 'MODY5_neg','Normal']
pre_labeled_data = raw_data[raw_data['SiEntiqueta'] == False][label_vars]
pre_labeled_data.head()
"""## 2.2. Antecedentes familiares
Se genera el campo a partir del comentario del grupo familiar
"""
pre_labeled_data['diabetes_familia'] = np.nan
## -1 == no hay antecedentes familiares de diabetes
pre_labeled_data.loc[pre_labeled_data['antecedentes fam'].str.lower().str.startswith('no', na=False), 'diabetes_familia'] = -1.0
pre_labeled_data.loc[ pre_labeled_data['antecedentes fam'].str.contains('no dm', case=False, na=False), 'diabetes_familia'] = -1.0
## 1 == si hay antecedentes familiares de diabetes
pre_labeled_data.loc[ pre_labeled_data['antecedentes fam'].str.lower().str.startswith('si', na=False), 'diabetes_familia'] = 1.0
pre_labeled_data.loc[ pre_labeled_data['antecedentes fam'].str.lower().str.startswith('her', na=False), 'diabetes_familia'] = 1.0 #hermana o hermano
pre_labeled_data.loc[ pre_labeled_data['antecedentes fam'].str.lower().str.startswith('pad', na=False), 'diabetes_familia'] = 1.0
pre_labeled_data.loc[ pre_labeled_data['antecedentes fam'].str.lower().str.startswith('mad', na=False), 'diabetes_familia'] = 1.0
pre_labeled_data.loc[ pre_labeled_data['antecedentes fam'].str.lower().str.startswith('amb', na=False), 'diabetes_familia'] = 1.0
pre_labeled_data.loc[ pre_labeled_data['antecedentes fam'].str.lower().str.startswith('hij', na=False), 'diabetes_familia'] = 1.0 #hija o hijo
pre_labeled_data.loc[ pre_labeled_data['antecedentes fam'].str.lower().str.startswith('multi', na=False), 'diabetes_familia'] = 1.0
pre_labeled_data.loc[ pre_labeled_data['antecedentes fam'].str.lower().str.startswith('ti', na=False), 'diabetes_familia'] = 1.0 #tia o tio
pre_labeled_data.loc[ pre_labeled_data['antecedentes fam'].str.lower().str.startswith('abu', na=False), 'diabetes_familia'] = 1.0 #abuela o abuelo
pre_labeled_data.loc[ pre_labeled_data['antecedentes fam'].str.lower().str.startswith('diab', na=False), 'diabetes_familia'] = 1.0
## 0 == no se sabe: sin información (Ej: adoptado)
# clean_data.loc[ clean_data['diabetes_familia'] == 0, 'antecedentes fam'].unique() #muestra los valores que no tienen match con lo indicado anteriormente
pre_labeled_data.loc[ pre_labeled_data['antecedentes fam'].str.lower().str.startswith('mare', na=False), 'diabetes_familia'] = 1.0 #anomalía, madre mal escrito
print("==================================== Clasificados =============================================")
for value, count in pre_labeled_data[~pre_labeled_data['diabetes_familia'].isna()]['diabetes_familia'].value_counts(dropna=False).items():
print(f"Value: {value}, Count: {count}")
print("==================================== No se pudo Clasificar =============================================")
for value, count in pre_labeled_data[pre_labeled_data['diabetes_familia'].isna()]['antecedentes fam'].value_counts(dropna=False).items():
print(f"Value: {value}, Count: {count}")
"""## 2.3. Sexo
Se infiere el sexo a partir de los nombres
"""
pre_labeled_data['sexo'] = np.nan
## 1 == Mujer
nombres_f = ['andrea', 'agustina', 'antonella', 'angelica', 'alicia', 'alejandra', 'ariana', 'ayelen', 'ayleen', 'belen', 'bianca',
'camila', 'carolina', 'catalina', 'claudia', 'delfina', 'eliana', 'estefania', 'eva', 'karina', 'florencia', 'gabriela',
'georgina', 'geraldine', 'guillermina', 'jazmin', 'jessica', 'julieta', 'karen', 'laura', 'lidia', 'lucia', 'magali', 'mina',
'mabel', 'malena', 'malena', 'mariana', 'marina', 'martina', 'micaela', 'micalela', 'milagros', 'milena',
'miriam', 'morena', 'natalia', 'noemi', 'nayla', 'rocio', 'rosa', 'sandra', 'sara', 'sasha', 'silvia', 'silvana',
'sofia', 'solange', 'soledad', 'valentina', 'victoria', 'vanina', 'vanesa', 'virginia', 'yanina', 'zamira',
'abril', 'adriana', 'ailen', 'aixa', 'ambar', 'ana', 'ana esmerlada', 'ana iris', 'anahi', 'analia', 'aylen', 'barbara',
'brenda', 'brisa', 'candela', 'carmela (carmen)', 'chiara', 'elizabeth', 'ema', 'emilia', 'emma', 'eugenia', 'fiorella',
'flavia', 'franca', 'francesca', 'graciela', 'helena', 'isabela', 'isabella', 'jacinta', 'jesica', 'jorgelina', 'julia', 'lorena',
'lucila', 'lucía', 'magdalena', 'maricruz', 'mariel', 'mariela', 'marilina', 'marixa', 'martha', 'maría emilia', 'maría verónica',
'melany', 'mercedes', 'monica', 'nancy rosa alba', 'nerina', 'oriana', 'paola', 'patricia', 'paula', 'pilar', 'priscila', 'renata',
'romina', 'roxana', 'ruth', 'shirley', 'tamara', 'valeria' ]
nombres_f.append('zahirah') # dejo los nombres que me hacen duda en forma individual
nombres_f.append('antu')
nombres_f.append('tali')
nombres_f.append('ma laura')
nombres_f.append('qian') # nombre femenino de origen chino
nombres_f.append('maria')
for nombre_f in nombres_f:
pre_labeled_data.loc[ pre_labeled_data['sexo'].isna() & (pre_labeled_data['nombre'].str.lower().str.startswith(nombre_f, na=False)), 'sexo'] = 1.0
## -1 == Hombre
nombres_h = ['agustin', 'alejandro', 'alvaro', 'augusto', 'benjamin', 'bruno', 'camilo', 'cristian', 'damian', 'dario', 'daniel', 'dante',
'david', 'diego', 'emiliano', 'elian', 'enzo', 'ezequiel', 'facundo', 'federico', 'felipe', 'fernando', 'felix', 'franco', 'german',
'gonzalo', 'gustavo', 'guillermo', 'ignacio', 'ian','joaquin', 'juan', 'julian', 'leandro', 'lorenzo', 'lucas', 'luka', 'marcelo',
'marcos', 'martin', 'martin', 'maximiliano', 'mateo', 'matias', 'pablo', 'nehemias', 'nicolas', 'ramiro', 'rogelio', 'rodrigo',
'santiago', 'santino', 'sebastian', 'thiago', 'tomas',
'alan', 'alfredo', 'antonio', 'axel', 'benicio', 'carlos', 'carlos gonzalo', 'claudio', 'dylan', 'eduardo', 'emanuel', 'ernesto',
'fabian', 'farid', 'fidel', 'francisco', 'gabriel facundo', 'gael', 'gerardo', 'gerónimo', 'hernan', 'ivan', 'javier', 'jorge',
'julio', 'mauricio', 'miguel angel', 'oscar', 'pedro', 'raul', 'rene', 'ricardo', 'roberto', 'sergio', 'teo', 'tiago', 'tobias', 'walter']
nombres_h.append('agustín')
for nombre_h in nombres_h:
pre_labeled_data.loc[ pre_labeled_data['sexo'].isna() & (pre_labeled_data['nombre'].str.lower().str.startswith(nombre_h, na=False)), 'sexo'] = -1.0
print("==================================== Clasificados =============================================")
for value, count in pre_labeled_data[~pre_labeled_data['sexo'].isna()]['sexo'].value_counts(dropna=False).items():
print(f"Value: {value}, Count: {count}")
listnames = []
print("==================================== No se pudo Clasificar =============================================")
for value, count in pre_labeled_data[pre_labeled_data['sexo'].isna()]['nombre'].value_counts(dropna=False).items():
print(f"Value: {value}, Count: {count}")
listnames.append(value)
print(sorted([x for x in listnames if isinstance(x, str)]))
"""## 2.1. Registros incompletos
Se desplegan información sobre valores faltantes en las variables de interés, sujetos sin datos y se genera una versión que solo incluye los registros que contienen toda la información para poder ser usados en el entrenamiento.
"""
import pandas as pd
variables = ['sexo', 'diabetes_familia','edad diag', 'IMC', 'glu ayu', 'glu 120', 'A1c']
print("========================================================================================")
print("Total registros en el dataset etiquetado:\t{}".format(pre_labeled_data.shape[0]))
print("Variables:\t{}".format(str(variables)))
print("==================== Desglose por N de variables faltantes ==============================")
for num in range(len(variables)+1):
nrows = len(pre_labeled_data[pre_labeled_data[variables].isnull().sum(axis=1) == num])
print("Le faltan {}/{} variables:\t{}\t({}%)".format(num, len(variables), nrows, round(nrows*100/pre_labeled_data.shape[0], 2)))
print("============================ Desglose por variables =====000=============================")
for var in variables:
nrows = pre_labeled_data[var].isna().astype(int).sum()
print("Variable {} ausente en \t\t {} ({}%) registros ".format(var, nrows, round(nrows*100/pre_labeled_data.shape[0], 2)))
pre_labeled_data['MODY1_label'] = np.nan
pre_labeled_data.loc[pre_labeled_data['MODY1_pos'], 'MODY1_label'] = 1
pre_labeled_data.loc[pre_labeled_data['MODY1_neg'], 'MODY1_label'] = 0#-1
pre_labeled_data['MODY2_label'] = np.nan
pre_labeled_data.loc[pre_labeled_data['MODY2_pos'], 'MODY2_label'] = 1
pre_labeled_data.loc[pre_labeled_data['MODY2_neg'], 'MODY2_label'] = 0#-1
pre_labeled_data['MODY3_label'] = np.nan
pre_labeled_data.loc[pre_labeled_data['MODY3_pos'], 'MODY3_label'] = 1
pre_labeled_data.loc[pre_labeled_data['MODY3_neg'], 'MODY3_label'] = 0#-1
pre_labeled_data['MODY5_label'] = np.nan
pre_labeled_data.loc[pre_labeled_data['MODY5_pos'], 'MODY5_label'] = 1
pre_labeled_data.loc[pre_labeled_data['MODY5_neg'], 'MODY5_label'] = 0#-1
"""# 3. Datos iniciales"""
dsm1_complete = pre_labeled_data[~pre_labeled_data['MODY1_label'].isna()][variables+['MODY1_label']]
dsm2_complete = pre_labeled_data[~pre_labeled_data['MODY2_label'].isna()][variables+['MODY2_label']]
dsm3_complete = pre_labeled_data[~pre_labeled_data['MODY3_label'].isna()][variables+['MODY3_label']]
dsm5_complete = pre_labeled_data[~pre_labeled_data['MODY5_label'].isna()][variables+['MODY5_label']]
dsnormal_complete = pre_labeled_data[pre_labeled_data['Normal']][variables]
"""# 4. Salida intermedia de los datos para verificación manual
Guarda los dataframes en un excel para verificación
"""
with pd.ExcelWriter("MODY_data.xlsx", engine='xlsxwriter') as xls:
raw_data.to_excel(xls, sheet_name='HC Original', index=False)
pre_labeled_data.to_excel(xls, sheet_name='Datos etiquetados', index=False)
raw_data[raw_data['SiEntiqueta'] == True].to_excel(xls, sheet_name='Datos excluídos', index=False)
dsm1_complete.to_excel(xls, sheet_name='Dataset MODY1', index=False)
dsm1_complete.dropna().to_excel(xls, sheet_name='Dataset MODY1 sin ausentes', index=False)
dsm2_complete.to_excel(xls, sheet_name='Dataset MODY2', index=False)
dsm2_complete.dropna().to_excel(xls, sheet_name='Dataset MODY2 sin ausentes', index=False)
dsm3_complete.to_excel(xls, sheet_name='Dataset MODY3', index=False)
dsm3_complete.dropna().to_excel(xls, sheet_name='Dataset MODY3 sin ausentes', index=False)
dsm5_complete.to_excel(xls, sheet_name='Dataset MODY5', index=False)
dsm5_complete.dropna().to_excel(xls, sheet_name='Dataset MODY5 sin ausentes', index=False)
dsnormal_complete.to_excel(xls, sheet_name='Sin Diabetes', index=False)
return dsm1_complete, dsm2_complete, dsm3_complete, dsm5_complete