409 lines
23 KiB
Python
409 lines
23 KiB
Python
import numpy as np
|
|
import os
|
|
import xlsxwriter
|
|
|
|
def safe_float(x):
    """Parse a value with a comma decimal separator into a float.

    The source spreadsheet stores numbers like "12,5"; this converts them
    to 12.5. Non-string numeric inputs are converted directly (the original
    raised AttributeError on them), and anything unparsable yields
    ``np.nan`` instead of raising.
    """
    try:
        return float(x.replace(',', '.'))
    except AttributeError:
        # x is not a string (already numeric, None, ...): try a plain cast.
        try:
            return float(x)
        except (TypeError, ValueError):
            return np.nan
    except ValueError:
        # Numeric-looking string that still fails to parse -> missing value.
        return np.nan
|
|
|
|
|
|
|
|
def analisis_univariado(dfi, target=None, continuas=(), discretas=()):
    """Univariate comparison of variables across the two classes of *target*.

    Each continuous variable is first checked for normality with the
    Shapiro-Wilk test over the pooled column; normal variables are
    summarised with mean/std and compared with Welch's t-test, non-normal
    ones with median/IQR and the Mann-Whitney U test. Each discrete
    variable gets a frequency/percentage table plus Fisher's exact test
    (2x2 tables) or a chi-squared test (wider tables).

    Parameters
    ----------
    dfi : pandas.DataFrame
        Data containing the target column and all analysed variables.
    target : str
        Name of the binary class column; must hold exactly two unique values.
    continuas : iterable of str, optional
        Column names treated as continuous. (Defaults changed from mutable
        ``[]`` to an immutable tuple; behaviour is unchanged.)
    discretas : iterable of str, optional
        Column names treated as discrete/categorical.

    Returns
    -------
    pandas.DataFrame
        One row per variable: type, distribution, descriptive measures and
        the test result with its p-value.

    Raises
    ------
    ValueError
        If *target* is None or does not have exactly two unique values.
    """
    if target is None:
        raise ValueError("No target variable provided")

    import pandas as pd
    from scipy.stats import shapiro, ttest_ind, mannwhitneyu, chi2_contingency, fisher_exact

    # Every non-target column (kept for group slicing below).
    label_columns = dfi.drop(target, axis=1).columns

    # Split the data into the two target groups.
    groups = dfi[target].unique()
    if len(groups) != 2:
        raise ValueError("Target variable must have exactly two unique values")

    group1, group2 = groups
    data_group1 = dfi[dfi[target] == group1][label_columns]
    data_group2 = dfi[dfi[target] == group2][label_columns]

    results = []

    # --- Continuous variables ---------------------------------------------
    for var in continuas:
        Xvar = dfi[var]
        group1_values = data_group1[var]
        group2_values = data_group2[var]

        # Normality test (Shapiro-Wilk) on the pooled column.
        # NOTE(review): NaNs in the column propagate into the tests below —
        # assumes callers pass pre-cleaned data; confirm upstream.
        _stat, p = shapiro(Xvar)
        normal = p >= 0.05

        if normal:
            # Normal distribution: mean, std, and Welch's t-test
            # (equal_var=False does not assume equal variances).
            mean1, std1 = group1_values.mean(), group1_values.std()
            mean2, std2 = group2_values.mean(), group2_values.std()
            _t_stat, t_pval = ttest_ind(group1_values, group2_values, equal_var=False)
            results.append([
                var, "Continua", "Normal",
                f"Media: {mean1:.2f} (Grupo 1), {mean2:.2f} (Grupo 2)",
                f"Desviación Est.: {std1:.2f} (Grupo 1), {std2:.2f} (Grupo 2)",
                f"Test t: p={t_pval:.3f}"
            ])
        else:
            # Non-normal distribution: median, interquartile range, and
            # the Mann-Whitney U test.
            median1, iqr1 = group1_values.median(), group1_values.quantile(0.75) - group1_values.quantile(0.25)
            median2, iqr2 = group2_values.median(), group2_values.quantile(0.75) - group2_values.quantile(0.25)
            _mw_stat, mw_pval = mannwhitneyu(group1_values, group2_values)
            results.append([
                var, "Continua", "No Normal",
                f"Mediana: {median1:.2f} (Grupo 1), {median2:.2f} (Grupo 2)",
                f"RIC: {iqr1:.2f} (Grupo 1), {iqr2:.2f} (Grupo 2)",
                f"Mann-Whitney: p={mw_pval:.3f}"
            ])

    # --- Discrete variables -----------------------------------------------
    for var in discretas:
        # Contingency table: rows = target classes, columns = categories.
        freq_table = dfi.groupby([target, var]).size().unstack(fill_value=0)
        percentages = freq_table.div(freq_table.sum(axis=1), axis=0) * 100

        if freq_table.shape[1] == 2:
            # 2x2 table: Fisher's exact test.
            _, fisher_pval = fisher_exact(freq_table.values)
            test_result = f"Fisher Exact: p={fisher_pval:.3f}"
        else:
            # Larger table: chi-squared test of independence.
            _chi2_stat, chi2_pval, _, _ = chi2_contingency(freq_table)
            test_result = f"Chi2: p={chi2_pval:.3f}"

        results.append([
            var, "Discreta", "N/A",
            f"Frecuencias: {freq_table.to_dict()}",
            f"Porcentajes: {percentages.to_dict()}",
            test_result
        ])

    # Assemble one summary row per variable.
    results_df = pd.DataFrame(results, columns=[
        "Variable", "Tipo", "Distribución", "Medidas descriptivas", "Estadísticas", "Resultados Prueba"
    ])
    return results_df
|
|
|
|
|
|
|
|
|
|
def load_data(Reload=False):
    """Load (or rebuild) the per-subtype MODY datasets.

    If the intermediate workbook ``MODY_data.xlsx`` already exists and
    ``Reload`` is False, the four prepared datasets are read straight from
    it. Otherwise the raw clinical records in ``HC.xlsx`` are cleaned,
    labelled, summarised on stdout, and the intermediate results are
    written back to ``MODY_data.xlsx`` for manual verification.

    Parameters
    ----------
    Reload : bool, optional
        When True, ignore any cached ``MODY_data.xlsx`` and rebuild from
        ``HC.xlsx``. (Bug fix: the original accepted but ignored this flag.)

    Returns
    -------
    tuple of pandas.DataFrame
        ``(dsm1_complete, dsm2_complete, dsm3_complete, dsm5_complete)`` —
        one labelled dataset per MODY subtype (1, 2, 3 and 5).

    Raises
    ------
    FileNotFoundError
        If neither ``MODY_data.xlsx`` nor ``HC.xlsx`` is available.
        (Bug fix: the original did ``raise 'NoDatasetToLoad'``, which is a
        TypeError in Python 3 — strings cannot be raised.)
    """
    import pandas as pd

    if os.path.isfile('MODY_data.xlsx') and not Reload:
        # Cached intermediate output: read the prepared sheets directly.
        with pd.ExcelFile("MODY_data.xlsx") as xls:
            dsm1_complete = pd.read_excel(xls, sheet_name='Dataset MODY1')
            dsm2_complete = pd.read_excel(xls, sheet_name='Dataset MODY2')
            dsm3_complete = pd.read_excel(xls, sheet_name='Dataset MODY3')
            dsm5_complete = pd.read_excel(xls, sheet_name='Dataset MODY5')
        return dsm1_complete, dsm2_complete, dsm3_complete, dsm5_complete

    print("========================================================================================")
    if not os.path.isfile('HC.xlsx'):
        raise FileNotFoundError('NoDatasetToLoad')

    with pd.ExcelFile("HC.xlsx") as xls:
        raw_data = pd.read_excel(xls, header=0)

    # Drop administrative columns that are of no interest for modelling.
    drop_columns = ['HC', 'probando', 'procedencia', 'apellido', 'fecha ingreso', 'edad',
                    'pago', 'factura', 'monto', 'Pendiente', 'método', 'Referencias',
                    'Analisis', 'aclar_pagos', 'tratamiento', 'notas', 'nro de familia',
                    'resultado']
    raw_data.drop(columns=drop_columns, inplace=True)

    # Numeric columns sometimes arrive as strings with a comma decimal
    # separator; coerce them cell by cell ('edad diag' and the glucose
    # columns are additionally rounded to whole numbers, as before).
    for index, var in raw_data.iterrows():
        if not pd.isna(var['IMC']) and isinstance(var['IMC'], str):
            raw_data.loc[index, 'IMC'] = safe_float(var['IMC'])
        if not pd.isna(var['A1c']) and isinstance(var['A1c'], str):
            raw_data.loc[index, 'A1c'] = safe_float(var['A1c'])
        if not pd.isna(var['edad diag']) and isinstance(var['edad diag'], str):
            raw_data.loc[index, 'edad diag'] = round(safe_float(var['edad diag']), 0)
        if not pd.isna(var['glu ayu']) and isinstance(var['glu ayu'], str):
            raw_data.loc[index, 'glu ayu'] = round(safe_float(var['glu ayu']), 0)
        if not pd.isna(var['glu 120']) and isinstance(var['glu 120'], str):
            raw_data.loc[index, 'glu 120'] = round(safe_float(var['glu 120']), 0)

    for col in ['IMC', 'A1c', 'edad diag', 'glu ayu', 'glu 120']:
        raw_data[col] = raw_data[col].astype(np.float64)

    def _report_diagnosticos(predicado, titulo):
        # Print how many rows satisfy *predicado* and the distinct
        # 'diagnostico' values found among them.
        diagnosticos = []
        for index, var in raw_data.iterrows():
            if predicado(var):
                diagnosticos.append(var['diagnostico'])
        print(titulo.format(len(diagnosticos)))
        print("Diagnosticos del grupo:")
        for diagnostico in list(set(diagnosticos)):
            print("- '{}'".format(diagnostico))

    _report_diagnosticos(lambda v: v['sospecha MODY'] == '2',
                         "Total elementos en el dataset con sospecha MODY2:\t{}")
    print("========================================================================================")
    _report_diagnosticos(lambda v: v['sospecha MODY'] == '3',
                         "Total elementos en el dataset con sospecha MODY3:\t{}")
    print("========================================================================================")
    _report_diagnosticos(lambda v: v['sospecha MODY'] not in ['2', '3'],
                         "Total elementos en el dataset con sospechas diferentes a 2 o 3:\t{}")

    ## Class generation: flag each record according to whether the MODY
    ## suspicion was confirmed ('_pos') or ruled out ('_neg').
    confirm_neg = raw_data['diagnostico'].str.contains('No se confirma', case=False, na=False)
    for num in ('1', '2', '3', '5'):
        raw_data['MODY{}_pos'.format(num)] = raw_data['diagnostico'].str.contains(
            'Diagnóstico MODY{}'.format(num), case=False, na=False)
        raw_data['MODY{}_neg'.format(num)] = (raw_data['sospecha MODY'] == num) & confirm_neg

    # 'SiEntiqueta' (typo kept for compatibility with the saved workbook)
    # marks records excluded from labelling; created before 'Normal' to
    # preserve the original column order in the written Excel file.
    raw_data['SiEntiqueta'] = False
    raw_data['Normal'] = raw_data['diagnostico'].str.contains('Normal', case=False, na=False)

    sin_etiqueta = pd.Series(False, index=raw_data.index)
    for patron in ('No se hace', 'Sin diagnóstico', 'Otros', 'No es MODY', 'Falta definir'):
        sin_etiqueta |= raw_data['diagnostico'].str.contains(patron, case=False, na=False)
    sin_etiqueta |= (~raw_data['sospecha MODY'].isin(['1', '2', '3', '5'])) & confirm_neg
    sin_etiqueta |= pd.isna(raw_data['diagnostico'])
    raw_data['SiEntiqueta'] = sin_etiqueta

    print("================== Datos sin confirmar/descartar ningún MODY ===========================")
    tipos = ['MODY1_pos', 'MODY1_neg', 'MODY2_pos', 'MODY2_neg', 'MODY3_pos', 'MODY3_neg', 'MODY5_pos', 'MODY5_neg', 'Normal', 'SiEntiqueta']
    sinconfirmar = 0
    # Records that fall into no category at all.
    for index, var in raw_data.iterrows():
        if not any(var[col] for col in tipos):
            print("sujeto: {} \t| sospecha: {} \t| diagnostico: {:18} \t | historial: {} ".format(var['protocolo'], var['sospecha MODY'], var['diagnostico'], var['historial']))
            sinconfirmar += 1

    print("====================== Diagnosticos confirmados/descartados ==========================")
    contador = {tipo: 0 for tipo in tipos}
    for index, var in raw_data.iterrows():
        for tipo in tipos:
            if var[tipo]:
                contador[tipo] += 1
    for tipo in tipos:
        print("{:20} \t {} ({}%)".format(tipo, contador[tipo], round((contador[tipo] / len(raw_data)) * 100, 2)))

    print("=========================== ==================== ==================================")
    label_vars = ['protocolo', 'nombre', 'edad diag', 'IMC', 'antecedentes fam', 'glu ayu', 'glu 120', 'A1c', 'MODY1_pos', 'MODY1_neg', 'MODY2_pos', 'MODY2_neg', 'MODY3_pos', 'MODY3_neg', 'MODY5_pos', 'MODY5_neg', 'Normal']
    # .copy() avoids SettingWithCopy hazards with the column assignments below
    # (the original assigned into a slice of raw_data).
    pre_labeled_data = raw_data[raw_data['SiEntiqueta'] == False][label_vars].copy()

    ## 2.2. Family history: derive 'diabetes_familia' from the free-text
    ## family-group comment.
    pre_labeled_data['diabetes_familia'] = np.nan
    antecedentes = pre_labeled_data['antecedentes fam'].str.lower()
    ## -1 == no family history of diabetes
    pre_labeled_data.loc[antecedentes.str.startswith('no', na=False), 'diabetes_familia'] = -1.0
    pre_labeled_data.loc[pre_labeled_data['antecedentes fam'].str.contains('no dm', case=False, na=False), 'diabetes_familia'] = -1.0
    ## 1 == family history of diabetes present. Prefixes name relatives:
    ## 'her' = hermana/o, 'pad' = padre, 'mad' = madre, 'amb' = ambos,
    ## 'hij' = hija/o, 'ti' = tia/o, 'abu' = abuela/o; 'mare' is a
    ## misspelling of 'madre' found in the data.
    prefijos_si = ('si', 'her', 'pad', 'mad', 'amb', 'hij', 'multi', 'ti', 'abu', 'diab', 'mare')
    pre_labeled_data.loc[antecedentes.str.startswith(prefijos_si, na=False), 'diabetes_familia'] = 1.0
    ## NaN == unknown: no usable information (e.g. adopted).

    print("==================================== Clasificados =============================================")
    for value, count in pre_labeled_data[~pre_labeled_data['diabetes_familia'].isna()]['diabetes_familia'].value_counts(dropna=False).items():
        print(f"Value: {value}, Count: {count}")
    print("==================================== No se pudo Clasificar =============================================")
    for value, count in pre_labeled_data[pre_labeled_data['diabetes_familia'].isna()]['antecedentes fam'].value_counts(dropna=False).items():
        print(f"Value: {value}, Count: {count}")

    ## 2.3. Sex: inferred from the given name (female names are matched
    ## first, so overlapping prefixes such as 'martina'/'martin' resolve
    ## to female, as in the original).
    pre_labeled_data['sexo'] = np.nan
    nombres = pre_labeled_data['nombre'].str.lower()

    ## 1 == female
    nombres_f = ['andrea', 'agustina', 'antonella', 'angelica', 'alicia', 'alejandra', 'ariana', 'ayelen', 'ayleen', 'belen', 'bianca',
                 'camila', 'carolina', 'catalina', 'claudia', 'delfina', 'eliana', 'estefania', 'eva', 'karina', 'florencia', 'gabriela',
                 'georgina', 'geraldine', 'guillermina', 'jazmin', 'jessica', 'julieta', 'karen', 'laura', 'lidia', 'lucia', 'magali', 'mina',
                 'mabel', 'malena', 'malena', 'mariana', 'marina', 'martina', 'micaela', 'micalela', 'milagros', 'milena',
                 'miriam', 'morena', 'natalia', 'noemi', 'nayla', 'rocio', 'rosa', 'sandra', 'sara', 'sasha', 'silvia', 'silvana',
                 'sofia', 'solange', 'soledad', 'valentina', 'victoria', 'vanina', 'vanesa', 'virginia', 'yanina', 'zamira',
                 'abril', 'adriana', 'ailen', 'aixa', 'ambar', 'ana', 'ana esmerlada', 'ana iris', 'anahi', 'analia', 'aylen', 'barbara',
                 'brenda', 'brisa', 'candela', 'carmela (carmen)', 'chiara', 'elizabeth', 'ema', 'emilia', 'emma', 'eugenia', 'fiorella',
                 'flavia', 'franca', 'francesca', 'graciela', 'helena', 'isabela', 'isabella', 'jacinta', 'jesica', 'jorgelina', 'julia', 'lorena',
                 'lucila', 'lucía', 'magdalena', 'maricruz', 'mariel', 'mariela', 'marilina', 'marixa', 'martha', 'maría emilia', 'maría verónica',
                 'melany', 'mercedes', 'monica', 'nancy rosa alba', 'nerina', 'oriana', 'paola', 'patricia', 'paula', 'pilar', 'priscila', 'renata',
                 'romina', 'roxana', 'ruth', 'shirley', 'tamara', 'valeria']
    # Names the original author was unsure about, kept individually
    # ('qian' is a female name of Chinese origin).
    nombres_f += ['zahirah', 'antu', 'tali', 'ma laura', 'qian', 'maria']

    pre_labeled_data.loc[pre_labeled_data['sexo'].isna() & nombres.str.startswith(tuple(nombres_f), na=False), 'sexo'] = 1.0

    ## -1 == male
    nombres_h = ['agustin', 'alejandro', 'alvaro', 'augusto', 'benjamin', 'bruno', 'camilo', 'cristian', 'damian', 'dario', 'daniel', 'dante',
                 'david', 'diego', 'emiliano', 'elian', 'enzo', 'ezequiel', 'facundo', 'federico', 'felipe', 'fernando', 'felix', 'franco', 'german',
                 'gonzalo', 'gustavo', 'guillermo', 'ignacio', 'ian', 'joaquin', 'juan', 'julian', 'leandro', 'lorenzo', 'lucas', 'luka', 'marcelo',
                 'marcos', 'martin', 'martin', 'maximiliano', 'mateo', 'matias', 'pablo', 'nehemias', 'nicolas', 'ramiro', 'rogelio', 'rodrigo',
                 'santiago', 'santino', 'sebastian', 'thiago', 'tomas',
                 'alan', 'alfredo', 'antonio', 'axel', 'benicio', 'carlos', 'carlos gonzalo', 'claudio', 'dylan', 'eduardo', 'emanuel', 'ernesto',
                 'fabian', 'farid', 'fidel', 'francisco', 'gabriel facundo', 'gael', 'gerardo', 'gerónimo', 'hernan', 'ivan', 'javier', 'jorge',
                 'julio', 'mauricio', 'miguel angel', 'oscar', 'pedro', 'raul', 'rene', 'ricardo', 'roberto', 'sergio', 'teo', 'tiago', 'tobias', 'walter']
    nombres_h.append('agustín')

    pre_labeled_data.loc[pre_labeled_data['sexo'].isna() & nombres.str.startswith(tuple(nombres_h), na=False), 'sexo'] = -1.0

    print("==================================== Clasificados =============================================")
    for value, count in pre_labeled_data[~pre_labeled_data['sexo'].isna()]['sexo'].value_counts(dropna=False).items():
        print(f"Value: {value}, Count: {count}")

    listnames = []
    print("==================================== No se pudo Clasificar =============================================")
    for value, count in pre_labeled_data[pre_labeled_data['sexo'].isna()]['nombre'].value_counts(dropna=False).items():
        print(f"Value: {value}, Count: {count}")
        listnames.append(value)

    print(sorted([x for x in listnames if isinstance(x, str)]))

    ## 2.1. Incomplete records: report missing values over the variables of
    ## interest, per record and per variable.
    variables = ['sexo', 'diabetes_familia', 'edad diag', 'IMC', 'glu ayu', 'glu 120', 'A1c']

    print("========================================================================================")
    print("Total registros en el dataset etiquetado:\t{}".format(pre_labeled_data.shape[0]))
    print("Variables:\t{}".format(str(variables)))
    print("==================== Desglose por N de variables faltantes ==============================")
    for num in range(len(variables) + 1):
        nrows = len(pre_labeled_data[pre_labeled_data[variables].isnull().sum(axis=1) == num])
        print("Le faltan {}/{} variables:\t{}\t({}%)".format(num, len(variables), nrows, round(nrows * 100 / pre_labeled_data.shape[0], 2)))

    print("============================ Desglose por variables =====000=============================")
    for var in variables:
        nrows = pre_labeled_data[var].isna().astype(int).sum()
        print("Variable {} ausente en \t\t {} ({}%) registros ".format(var, nrows, round(nrows * 100 / pre_labeled_data.shape[0], 2)))

    # Binary label per subtype: 1 = confirmed, 0 = suspected but ruled out,
    # NaN = neither confirmed nor ruled out.
    for num in ('1', '2', '3', '5'):
        label = 'MODY{}_label'.format(num)
        pre_labeled_data[label] = np.nan
        pre_labeled_data.loc[pre_labeled_data['MODY{}_pos'.format(num)], label] = 1
        pre_labeled_data.loc[pre_labeled_data['MODY{}_neg'.format(num)], label] = 0

    ## 3. Initial datasets: keep only the rows with a defined label for
    ## each subtype (plus a 'Normal' reference set, written but not returned).
    dsm1_complete = pre_labeled_data[~pre_labeled_data['MODY1_label'].isna()][variables + ['MODY1_label']]
    dsm2_complete = pre_labeled_data[~pre_labeled_data['MODY2_label'].isna()][variables + ['MODY2_label']]
    dsm3_complete = pre_labeled_data[~pre_labeled_data['MODY3_label'].isna()][variables + ['MODY3_label']]
    dsm5_complete = pre_labeled_data[~pre_labeled_data['MODY5_label'].isna()][variables + ['MODY5_label']]
    dsnormal_complete = pre_labeled_data[pre_labeled_data['Normal']][variables]

    ## 4. Intermediate output for manual verification.
    with pd.ExcelWriter("MODY_data.xlsx", engine='xlsxwriter') as xls:
        raw_data.to_excel(xls, sheet_name='HC Original', index=False)
        pre_labeled_data.to_excel(xls, sheet_name='Datos etiquetados', index=False)
        raw_data[raw_data['SiEntiqueta'] == True].to_excel(xls, sheet_name='Datos excluídos', index=False)

        dsm1_complete.to_excel(xls, sheet_name='Dataset MODY1', index=False)
        dsm1_complete.dropna().to_excel(xls, sheet_name='Dataset MODY1 sin ausentes', index=False)

        dsm2_complete.to_excel(xls, sheet_name='Dataset MODY2', index=False)
        dsm2_complete.dropna().to_excel(xls, sheet_name='Dataset MODY2 sin ausentes', index=False)

        dsm3_complete.to_excel(xls, sheet_name='Dataset MODY3', index=False)
        dsm3_complete.dropna().to_excel(xls, sheet_name='Dataset MODY3 sin ausentes', index=False)

        dsm5_complete.to_excel(xls, sheet_name='Dataset MODY5', index=False)
        dsm5_complete.dropna().to_excel(xls, sheet_name='Dataset MODY5 sin ausentes', index=False)

        dsnormal_complete.to_excel(xls, sheet_name='Sin Diabetes', index=False)

    return dsm1_complete, dsm2_complete, dsm3_complete, dsm5_complete
|