# 2024-12-06 19:08:51 -03:00
import numpy as np
import os
import xlsxwriter
def safe_float(x):
    """Convert a decimal-comma numeric string (e.g. '3,14') to float.

    Returns np.nan when the value cannot be parsed: non-numeric text, or a
    non-string object (such as None) that has no .replace method.
    """
    try:
        # Spreadsheet exports use ',' as the decimal separator; normalize to '.'.
        return float(x.replace(',', '.'))
    except (ValueError, AttributeError):
        # ValueError: non-numeric text. AttributeError: x is not a string
        # (bug fix — the original only caught ValueError, so None or numeric
        # cells crashed a function named "safe_float").
        return np.nan
2025-01-15 21:09:00 -03:00
def analisis_univariado ( dfi , target = None , continuas = [ ] , discretas = [ ] ) :
if target is None :
raise ValueError ( " No target variable provided " )
import pandas as pd
from scipy . stats import shapiro , ttest_ind , mannwhitneyu , chi2_contingency , fisher_exact
# label_columns = ['sexo', 'hist fam', 'edad diag', 'IMC', 'glu ayu', 'glu 120','A1c']
label_columns = dfi . drop ( target , axis = 1 ) . columns
# Separar el target en dos grupos: N positivo y N negativo
2025-01-17 01:29:05 -03:00
groups = sorted ( dfi [ target ] . unique ( ) )
2025-01-15 21:09:00 -03:00
if len ( groups ) != 2 :
raise ValueError ( " Target variable must have exactly two unique values " )
2025-01-16 13:54:21 -03:00
glabel = { 1 : ' MODY Pos ' , 0 : ' MODY Neg ' }
2025-01-15 21:09:00 -03:00
group1 , group2 = groups
data_group1 = dfi [ dfi [ target ] == group1 ] [ label_columns ]
data_group2 = dfi [ dfi [ target ] == group2 ] [ label_columns ]
results = [ ]
2025-01-18 23:14:21 -03:00
resultsmody = [ ]
2025-01-15 21:09:00 -03:00
# Análisis de variables continuas
for var in continuas :
2025-01-17 00:27:48 -03:00
Xvar = dfi [ var ] . T
group1_values = data_group1 [ var ] . T
group2_values = data_group2 [ var ] . T
2025-01-15 21:09:00 -03:00
# Test de normalidad (Shapiro-Wilk)
2025-01-16 14:41:59 -03:00
stat , p = shapiro ( Xvar , nan_policy = ' raise ' )
2025-01-17 00:27:48 -03:00
normal = p > = 0.05
2025-01-15 21:09:00 -03:00
if normal :
# Distribución normal: media, desviación estándar, y test t
2025-01-18 23:14:21 -03:00
meang , stdg = dfi [ var ] . mean ( ) , dfi [ var ] . std ( )
2025-01-15 21:09:00 -03:00
mean1 , std1 = group1_values . mean ( ) , group1_values . std ( )
mean2 , std2 = group2_values . mean ( ) , group2_values . std ( )
t_stat , t_pval = ttest_ind ( group1_values , group2_values , equal_var = False )
results . append ( [
2025-01-16 14:45:21 -03:00
var , " Continua " , f " Normal (p= { p : .7f } ) " ,
2025-01-17 01:05:34 -03:00
f " mean: { mean1 : .2f } , stdev: { std1 : .2f } " ,
f " mean: { mean2 : .2f } , stdev: { std2 : .2f } " ,
2025-01-17 01:25:28 -03:00
f " t Student: p= { t_pval : .3f } " + ( " Dif Significativa " if t_pval < 0.05 else " Dif No-Significativa " )
2025-01-15 21:09:00 -03:00
] )
2025-01-18 23:14:21 -03:00
resultsmody . append ( [
var , " " , f " { mean1 : .1f } ± { std1 : .1f } " , f " { mean2 : .1f } ± { std2 : .1f } " ,
f " { t_pval : .3f } " , ( " * " if t_pval < 0.05 else " NS " ) , f " { meang : .1f } ± { stdg : .1f } "
] )
2025-01-15 21:09:00 -03:00
else :
# Distribución no normal: mediana, rango intercuartil, y test Mann-Whitney
2025-01-19 00:07:58 -03:00
mediang , qrg , qlg = dfi [ var ] . median ( ) , dfi [ var ] . quantile ( 0.75 ) , dfi [ var ] . quantile ( 0.25 )
2025-01-18 23:14:21 -03:00
qr1 , ql1 = group1_values . quantile ( 0.75 ) , group1_values . quantile ( 0.25 )
qr2 , ql2 = group2_values . quantile ( 0.75 ) , group2_values . quantile ( 0.25 )
2025-01-15 21:09:00 -03:00
median1 , iqr1 = group1_values . median ( ) , group1_values . quantile ( 0.75 ) - group1_values . quantile ( 0.25 )
median2 , iqr2 = group2_values . median ( ) , group2_values . quantile ( 0.75 ) - group2_values . quantile ( 0.25 )
mw_stat , mw_pval = mannwhitneyu ( group1_values , group2_values )
results . append ( [
2025-01-16 14:45:21 -03:00
var , " Continua " , f " No Normal (p= { p : .7f } ) " ,
2025-01-17 01:05:34 -03:00
f " Mediana: { median1 : .2f } , RIC: { iqr1 : .2f } " ,
f " Mediana: { median2 : .2f } , RIC: { iqr2 : .2f } " ,
2025-01-17 01:25:28 -03:00
f " Mann-Whitney: p= { mw_pval : .3f } " + ( " Dif Significativa " if mw_pval < 0.05 else " Dif No-Significativa " )
2025-01-15 21:09:00 -03:00
] )
2025-01-18 23:14:21 -03:00
resultsmody . append ( [
2025-01-19 00:00:03 -03:00
var , " " , f " { median1 : .1f } ( { ql1 : .1f } - { qr1 : .1f } ) " , f " { median2 : .1f } ( { ql2 : .1f } - { qr2 : .1f } ) " ,
2025-01-18 23:31:59 -03:00
f " { mw_pval : .3f } " , ( " * " if mw_pval < 0.05 else " NS " ) , f " { mediang : .1f } ( { qlg : .1f } - { qrg : .1f } ) "
2025-01-18 23:14:21 -03:00
] )
2025-01-15 21:09:00 -03:00
# Análisis de variables discretas
for var in discretas :
freq_table = dfi . groupby ( [ target , var ] ) . size ( ) . unstack ( fill_value = 0 )
percentages = freq_table . div ( freq_table . sum ( axis = 1 ) , axis = 0 ) * 100
# Pruebas estadísticas
if freq_table . shape [ 1 ] == 2 :
# Test exacto de Fisher
_ , fisher_pval = fisher_exact ( freq_table . values )
2025-01-17 01:25:28 -03:00
test_result = f " Fisher Exact: p= { fisher_pval : .3f } " + ( " Dif Significativa " if fisher_pval < 0.05 else " Dif No-Significativa " )
2025-01-15 21:09:00 -03:00
else :
# Test Chi cuadrado
chi2_stat , chi2_pval , _ , _ = chi2_contingency ( freq_table )
test_result = f " Chi2: p= { chi2_pval : .3f } "
results . append ( [
var , " Discreta " , " N/A " ,
2025-01-17 00:35:13 -03:00
f " Frecuencias: { freq_table . values } " ,
2025-01-17 01:06:27 -03:00
f " Porcentajes: { percentages . values . round ( 1 ) } " ,
2025-01-15 21:09:00 -03:00
test_result
] )
2025-01-18 23:39:03 -03:00
freq_matrix = freq_table . values
2025-01-18 23:40:41 -03:00
percentages_matrix = percentages . values
2025-01-18 23:39:03 -03:00
tot = freq_matrix [ 0 ] [ 1 ] + freq_matrix [ 1 ] [ 1 ]
2025-01-18 23:14:21 -03:00
totf = 100 * tot / len ( dfi [ var ] )
resultsmody . append ( [
2025-01-18 23:52:06 -03:00
var , " " , f " { percentages_matrix [ 0 ] [ 1 ] : .1f } ( { freq_matrix [ 0 ] [ 1 ] } / { len ( data_group1 ) } ) " , f " { percentages_matrix [ 1 ] [ 1 ] : .1f } ( { freq_matrix [ 1 ] [ 1 ] } / { len ( data_group2 ) } ) " ,
2025-01-18 23:31:59 -03:00
f " { fisher_pval : .3f } " , ( " * " if fisher_pval < 0.05 else " NS " ) , f " { totf : .1f } ( { tot } / { len ( dfi [ var ] ) } ) "
2025-01-18 23:14:21 -03:00
] )
2025-01-15 21:09:00 -03:00
# Crear DataFrame con los resultados
2025-01-18 23:14:21 -03:00
results_df = pd . DataFrame ( resultsmody , columns = [
2025-01-19 00:01:21 -03:00
" Variable " , " Unidad " , " {} \n n= {} " . format ( glabel [ group1 ] , len ( data_group1 ) ) , " {} \n n= {} " . format ( glabel [ group2 ] , len ( data_group2 ) ) , " Pvalue " , " " , " Total "
2025-01-15 21:09:00 -03:00
] )
return results_df
def load_data(Reload=False):
    """Load (or build from 'HC.xlsx') the labelled per-subtype MODY datasets.

    If the intermediate workbook 'MODY_data.xlsx' exists, the four datasets
    are read straight from it. Otherwise the raw clinical sheet 'HC.xlsx' is
    cleaned (decimal-comma fixes, column drops), class flags are derived from
    the diagnosis text, and family-history and sex features are inferred;
    the intermediate workbook is then written out for manual verification.

    NOTE(review): the `Reload` parameter is currently unused — presumably it
    was meant to force a rebuild even when 'MODY_data.xlsx' exists; confirm
    with the author before wiring it in.

    Returns
    -------
    tuple of pandas.DataFrame
        (dsm1_complete, dsm2_complete, dsm3_complete, dsm5_complete), one per
        MODY subtype, each with the feature columns plus its label column.

    Raises
    ------
    FileNotFoundError
        If neither 'MODY_data.xlsx' nor 'HC.xlsx' is present.
    """
    if os.path.isfile('MODY_data.xlsx'):
        import pandas as pd
        with pd.ExcelFile("MODY_data.xlsx") as xls:
            dsm1_complete = pd.read_excel(xls, sheet_name='Dataset MODY1')
            dsm2_complete = pd.read_excel(xls, sheet_name='Dataset MODY2')
            dsm3_complete = pd.read_excel(xls, sheet_name='Dataset MODY3')
            dsm5_complete = pd.read_excel(xls, sheet_name='Dataset MODY5')
    else:
        print("========================================================================================")
        if not os.path.isfile('HC.xlsx'):
            # Bug fix: the original `raise 'NoDatasetToLoad'` raised a string,
            # which is a TypeError in Python 3; raise a real exception instead.
            raise FileNotFoundError('NoDatasetToLoad')
        import pandas as pd
        with pd.ExcelFile("HC.xlsx") as xls:
            raw_data = pd.read_excel(xls, header=0)
        # pd.read_excel('HC.xlsx', header=0)
        # Drop administrative columns of no analytical interest.
        drop_columns = ['HC', 'probando', 'procedencia', 'apellido', 'fecha ingreso', 'edad', 'pago', 'factura', 'monto', 'Pendiente', 'método', 'Referencias', 'Analisis', 'aclar_pagos', 'tratamiento', 'notas', 'nro de familia', 'resultado']
        raw_data.drop(columns=drop_columns, inplace=True)
        # Normalize decimal-comma strings in the numeric columns, rounding the
        # clinically integer-valued ones (age, glucose) to whole numbers.
        for index, var in raw_data.iterrows():
            if not pd.isna(var['IMC']) and isinstance(var['IMC'], str):
                raw_data.loc[index, 'IMC'] = safe_float(var['IMC'])
            if not pd.isna(var['A1c']) and isinstance(var['A1c'], str):
                raw_data.loc[index, 'A1c'] = safe_float(var['A1c'])
            if not pd.isna(var['edad diag']) and isinstance(var['edad diag'], str):
                raw_data.loc[index, 'edad diag'] = round(safe_float(var['edad diag']), 0)
            if not pd.isna(var['glu ayu']) and isinstance(var['glu ayu'], str):
                raw_data.loc[index, 'glu ayu'] = round(safe_float(var['glu ayu']), 0)
            if not pd.isna(var['glu 120']) and isinstance(var['glu 120'], str):
                raw_data.loc[index, 'glu 120'] = round(safe_float(var['glu 120']), 0)
        raw_data['IMC'] = raw_data['IMC'].astype(np.float64)
        raw_data['A1c'] = raw_data['A1c'].astype(np.float64)
        raw_data['edad diag'] = raw_data['edad diag'].astype(np.float64)
        raw_data['glu ayu'] = raw_data['glu ayu'].astype(np.float64)
        raw_data['glu 120'] = raw_data['glu 120'].astype(np.float64)
        # Report the diagnoses seen within each suspicion group (MODY2, MODY3, other).
        diagnosticos = []
        for index, var in raw_data.iterrows():
            if var['sospecha MODY'] == '2':
                diagnosticos.append(var['diagnostico'])
        print("Total elementos en el dataset con sospecha MODY2:\t{}".format(len(diagnosticos)))
        print("Diagnosticos del grupo:")
        diagnosticos = list(set(diagnosticos))
        for diagnostico in diagnosticos:
            print("- '{}'".format(diagnostico))
        print("========================================================================================")
        diagnosticos = []
        for index, var in raw_data.iterrows():
            if var['sospecha MODY'] == '3':
                diagnosticos.append(var['diagnostico'])
        print("Total elementos en el dataset con sospecha MODY3:\t{}".format(len(diagnosticos)))
        print("Diagnosticos del grupo:")
        diagnosticos = list(set(diagnosticos))
        for diagnostico in diagnosticos:
            print("- '{}'".format(diagnostico))
        print("========================================================================================")
        diagnosticos = []
        for index, var in raw_data.iterrows():
            if var['sospecha MODY'] not in ['2', '3']:
                diagnosticos.append(var['diagnostico'])
        print("Total elementos en el dataset con sospechas diferentes a 2 o 3:\t{}".format(len(diagnosticos)))
        print("Diagnosticos del grupo:")
        diagnosticos = list(set(diagnosticos))
        for diagnostico in diagnosticos:
            print("- '{}'".format(diagnostico))
        ## Build the class flags from whether the suspicion was confirmed.
        raw_data['MODY1_pos'] = False
        raw_data['MODY1_neg'] = False
        raw_data['MODY2_pos'] = False
        raw_data['MODY2_neg'] = False
        raw_data['MODY3_pos'] = False
        raw_data['MODY3_neg'] = False
        raw_data['MODY5_pos'] = False
        raw_data['MODY5_neg'] = False
        raw_data['SiEntiqueta'] = False  # exclusion flag (sic, kept as in the workbook)
        raw_data['Normal'] = False
        raw_data.loc[(raw_data['diagnostico'].str.contains('Diagnóstico MODY1', case=False, na=False)), 'MODY1_pos'] = True
        raw_data.loc[(raw_data['diagnostico'].str.contains('Diagnóstico MODY2', case=False, na=False)), 'MODY2_pos'] = True
        raw_data.loc[(raw_data['diagnostico'].str.contains('Diagnóstico MODY3', case=False, na=False)), 'MODY3_pos'] = True
        raw_data.loc[(raw_data['diagnostico'].str.contains('Diagnóstico MODY5', case=False, na=False)), 'MODY5_pos'] = True
        raw_data.loc[(raw_data['sospecha MODY'] == '1') & (raw_data['diagnostico'].str.contains('No se confirma', case=False, na=False)), 'MODY1_neg'] = True
        raw_data.loc[(raw_data['sospecha MODY'] == '2') & (raw_data['diagnostico'].str.contains('No se confirma', case=False, na=False)), 'MODY2_neg'] = True
        raw_data.loc[(raw_data['sospecha MODY'] == '3') & (raw_data['diagnostico'].str.contains('No se confirma', case=False, na=False)), 'MODY3_neg'] = True
        raw_data.loc[(raw_data['sospecha MODY'] == '5') & (raw_data['diagnostico'].str.contains('No se confirma', case=False, na=False)), 'MODY5_neg'] = True
        raw_data.loc[(raw_data['diagnostico'].str.contains('Normal', case=False, na=False)), 'Normal'] = True
        # Everything below marks records to exclude from the labelled set.
        raw_data.loc[(raw_data['diagnostico'].str.contains('No se hace', case=False, na=False)), 'SiEntiqueta'] = True
        raw_data.loc[(raw_data['diagnostico'].str.contains('Sin diagnóstico', case=False, na=False)), 'SiEntiqueta'] = True
        raw_data.loc[(raw_data['diagnostico'].str.contains('Otros', case=False, na=False)), 'SiEntiqueta'] = True
        raw_data.loc[(raw_data['diagnostico'].str.contains('No es MODY', case=False, na=False)), 'SiEntiqueta'] = True
        raw_data.loc[(raw_data['diagnostico'].str.contains('Falta definir', case=False, na=False)), 'SiEntiqueta'] = True
        raw_data.loc[(~raw_data['sospecha MODY'].isin(['1', '2', '3', '5'])) & (raw_data['diagnostico'].str.contains('No se confirma', case=False, na=False)), 'SiEntiqueta'] = True
        raw_data.loc[pd.isna(raw_data['diagnostico']), 'SiEntiqueta'] = True
        print("================== Datos sin confirmar/descartar ningún MODY ===========================")
        tipos = ['MODY1_pos', 'MODY1_neg', 'MODY2_pos', 'MODY2_neg', 'MODY3_pos', 'MODY3_neg', 'MODY5_pos', 'MODY5_neg', 'Normal', 'SiEntiqueta']
        sinconfirmar = 0
        ## Records that match no category at all.
        for index, var in raw_data.iterrows():
            if not any(var[col] for col in tipos):
                print("sujeto: {}\t| sospecha: {}\t| diagnostico: {:18}\t| historial: {}".format(var['protocolo'], var['sospecha MODY'], var['diagnostico'], var['historial']))
                sinconfirmar += 1
        print("====================== Diagnosticos confirmados/descartados ==========================")
        contador = {}
        for tipo in tipos:
            contador[tipo] = 0
        for index, var in raw_data.iterrows():
            for tipo in tipos:
                if var[tipo]:
                    contador[tipo] += 1
        for tipo in tipos:
            print("{:20}\t{} ({}%)".format(tipo, contador[tipo], round((contador[tipo] / len(raw_data)) * 100, 2)))
        print("=========================== ==================== ==================================")
        label_vars = ['protocolo', 'nombre', 'edad diag', 'IMC', 'antecedentes fam', 'glu ayu', 'glu 120', 'A1c', 'MODY1_pos', 'MODY1_neg', 'MODY2_pos', 'MODY2_neg', 'MODY3_pos', 'MODY3_neg', 'MODY5_pos', 'MODY5_neg', 'Normal']
        # .copy() makes the slice an independent frame so the feature columns
        # added below never hit pandas' SettingWithCopy ambiguity (bug fix).
        pre_labeled_data = raw_data[raw_data['SiEntiqueta'] == False][label_vars].copy()
        pre_labeled_data.head()
        """## 2.2. Family history
        The field is derived from the free-text family-group comment.
        """
        pre_labeled_data['diabetes_familia'] = np.nan
        ## -1 == no family history of diabetes
        pre_labeled_data.loc[pre_labeled_data['antecedentes fam'].str.lower().str.startswith('no', na=False), 'diabetes_familia'] = -1.0
        pre_labeled_data.loc[pre_labeled_data['antecedentes fam'].str.contains('no dm', case=False, na=False), 'diabetes_familia'] = -1.0
        ## 1 == there is family history of diabetes
        pre_labeled_data.loc[pre_labeled_data['antecedentes fam'].str.lower().str.startswith('si', na=False), 'diabetes_familia'] = 1.0
        pre_labeled_data.loc[pre_labeled_data['antecedentes fam'].str.lower().str.startswith('her', na=False), 'diabetes_familia'] = 1.0  # hermana/hermano (sister/brother)
        pre_labeled_data.loc[pre_labeled_data['antecedentes fam'].str.lower().str.startswith('pad', na=False), 'diabetes_familia'] = 1.0
        pre_labeled_data.loc[pre_labeled_data['antecedentes fam'].str.lower().str.startswith('mad', na=False), 'diabetes_familia'] = 1.0
        pre_labeled_data.loc[pre_labeled_data['antecedentes fam'].str.lower().str.startswith('amb', na=False), 'diabetes_familia'] = 1.0
        pre_labeled_data.loc[pre_labeled_data['antecedentes fam'].str.lower().str.startswith('hij', na=False), 'diabetes_familia'] = 1.0  # hija/hijo (daughter/son)
        pre_labeled_data.loc[pre_labeled_data['antecedentes fam'].str.lower().str.startswith('multi', na=False), 'diabetes_familia'] = 1.0
        pre_labeled_data.loc[pre_labeled_data['antecedentes fam'].str.lower().str.startswith('ti', na=False), 'diabetes_familia'] = 1.0  # tia/tio (aunt/uncle)
        pre_labeled_data.loc[pre_labeled_data['antecedentes fam'].str.lower().str.startswith('abu', na=False), 'diabetes_familia'] = 1.0  # abuela/abuelo (grandparent)
        pre_labeled_data.loc[pre_labeled_data['antecedentes fam'].str.lower().str.startswith('diab', na=False), 'diabetes_familia'] = 1.0
        ## 0 == unknown: no information (e.g. adopted)
        # clean_data.loc[clean_data['diabetes_familia'] == 0, 'antecedentes fam'].unique()  # shows the values with no match above
        pre_labeled_data.loc[pre_labeled_data['antecedentes fam'].str.lower().str.startswith('mare', na=False), 'diabetes_familia'] = 1.0  # anomaly: misspelled "madre"
        print("==================================== Clasificados =============================================")
        for value, count in pre_labeled_data[~pre_labeled_data['diabetes_familia'].isna()]['diabetes_familia'].value_counts(dropna=False).items():
            print(f"Value: {value}, Count: {count}")
        print("==================================== No se pudo Clasificar =============================================")
        for value, count in pre_labeled_data[pre_labeled_data['diabetes_familia'].isna()]['antecedentes fam'].value_counts(dropna=False).items():
            print(f"Value: {value}, Count: {count}")
        """## 2.3. Sex
        Sex is inferred from the first names.
        """
        pre_labeled_data['sexo'] = np.nan
        ## 1 == female
        nombres_f = ['andrea', 'agustina', 'antonella', 'angelica', 'alicia', 'alejandra', 'ariana', 'ayelen', 'ayleen', 'belen', 'bianca',
                     'camila', 'carolina', 'catalina', 'claudia', 'delfina', 'eliana', 'estefania', 'eva', 'karina', 'florencia', 'gabriela',
                     'georgina', 'geraldine', 'guillermina', 'jazmin', 'jessica', 'julieta', 'karen', 'laura', 'lidia', 'lucia', 'magali', 'mina',
                     'mabel', 'malena', 'malena', 'mariana', 'marina', 'martina', 'micaela', 'micalela', 'milagros', 'milena',
                     'miriam', 'morena', 'natalia', 'noemi', 'nayla', 'rocio', 'rosa', 'sandra', 'sara', 'sasha', 'silvia', 'silvana',
                     'sofia', 'solange', 'soledad', 'valentina', 'victoria', 'vanina', 'vanesa', 'virginia', 'yanina', 'zamira',
                     'abril', 'adriana', 'ailen', 'aixa', 'ambar', 'ana', 'ana esmerlada', 'ana iris', 'anahi', 'analia', 'aylen', 'barbara',
                     'brenda', 'brisa', 'candela', 'carmela (carmen)', 'chiara', 'elizabeth', 'ema', 'emilia', 'emma', 'eugenia', 'fiorella',
                     'flavia', 'franca', 'francesca', 'graciela', 'helena', 'isabela', 'isabella', 'jacinta', 'jesica', 'jorgelina', 'julia', 'lorena',
                     'lucila', 'lucía', 'magdalena', 'maricruz', 'mariel', 'mariela', 'marilina', 'marixa', 'martha', 'maría emilia', 'maría verónica',
                     'melany', 'mercedes', 'monica', 'nancy rosa alba', 'nerina', 'oriana', 'paola', 'patricia', 'paula', 'pilar', 'priscila', 'renata',
                     'romina', 'roxana', 'ruth', 'shirley', 'tamara', 'valeria']
        nombres_f.append('zahirah')  # names I am unsure about are kept individually
        nombres_f.append('antu')
        nombres_f.append('tali')
        nombres_f.append('ma laura')
        nombres_f.append('qian')  # female name of Chinese origin
        nombres_f.append('maria')
        for nombre_f in nombres_f:
            pre_labeled_data.loc[pre_labeled_data['sexo'].isna() & (pre_labeled_data['nombre'].str.lower().str.startswith(nombre_f, na=False)), 'sexo'] = 1.0
        ## -1 == male
        nombres_h = ['agustin', 'alejandro', 'alvaro', 'augusto', 'benjamin', 'bruno', 'camilo', 'cristian', 'damian', 'dario', 'daniel', 'dante',
                     'david', 'diego', 'emiliano', 'elian', 'enzo', 'ezequiel', 'facundo', 'federico', 'felipe', 'fernando', 'felix', 'franco', 'german',
                     'gonzalo', 'gustavo', 'guillermo', 'ignacio', 'ian', 'joaquin', 'juan', 'julian', 'leandro', 'lorenzo', 'lucas', 'luka', 'marcelo',
                     'marcos', 'martin', 'martin', 'maximiliano', 'mateo', 'matias', 'pablo', 'nehemias', 'nicolas', 'ramiro', 'rogelio', 'rodrigo',
                     'santiago', 'santino', 'sebastian', 'thiago', 'tomas',
                     'alan', 'alfredo', 'antonio', 'axel', 'benicio', 'carlos', 'carlos gonzalo', 'claudio', 'dylan', 'eduardo', 'emanuel', 'ernesto',
                     'fabian', 'farid', 'fidel', 'francisco', 'gabriel facundo', 'gael', 'gerardo', 'gerónimo', 'hernan', 'ivan', 'javier', 'jorge',
                     'julio', 'mauricio', 'miguel angel', 'oscar', 'pedro', 'raul', 'rene', 'ricardo', 'roberto', 'sergio', 'teo', 'tiago', 'tobias', 'walter']
        nombres_h.append('agustín')
        for nombre_h in nombres_h:
            pre_labeled_data.loc[pre_labeled_data['sexo'].isna() & (pre_labeled_data['nombre'].str.lower().str.startswith(nombre_h, na=False)), 'sexo'] = -1.0
        print("==================================== Clasificados =============================================")
        for value, count in pre_labeled_data[~pre_labeled_data['sexo'].isna()]['sexo'].value_counts(dropna=False).items():
            print(f"Value: {value}, Count: {count}")
        listnames = []
        print("==================================== No se pudo Clasificar =============================================")
        for value, count in pre_labeled_data[pre_labeled_data['sexo'].isna()]['nombre'].value_counts(dropna=False).items():
            print(f"Value: {value}, Count: {count}")
            listnames.append(value)
        print(sorted([x for x in listnames if isinstance(x, str)]))
        """## 2.1. Incomplete records
        Report missing values per variable and per record; only records with all
        fields present can later be used for training.
        """
        import pandas as pd
        variables = ['sexo', 'diabetes_familia', 'edad diag', 'IMC', 'glu ayu', 'glu 120', 'A1c']
        print("========================================================================================")
        print("Total registros en el dataset etiquetado:\t{}".format(pre_labeled_data.shape[0]))
        print("Variables:\t{}".format(str(variables)))
        print("==================== Desglose por N de variables faltantes ==============================")
        for num in range(len(variables) + 1):
            nrows = len(pre_labeled_data[pre_labeled_data[variables].isnull().sum(axis=1) == num])
            print("Le faltan {}/{} variables:\t{}\t({}%)".format(num, len(variables), nrows, round(nrows * 100 / pre_labeled_data.shape[0], 2)))
        print("============================ Desglose por variables =====000=============================")
        for var in variables:
            nrows = pre_labeled_data[var].isna().astype(int).sum()
            print("Variable {} ausente en \t\t{} ({}%) registros".format(var, nrows, round(nrows * 100 / pre_labeled_data.shape[0], 2)))
        # Collapse the pos/neg flag pairs into one numeric label per subtype
        # (1 = confirmed, 0 = ruled out, NaN = neither).
        pre_labeled_data['MODY1_label'] = np.nan
        pre_labeled_data.loc[pre_labeled_data['MODY1_pos'], 'MODY1_label'] = 1
        pre_labeled_data.loc[pre_labeled_data['MODY1_neg'], 'MODY1_label'] = 0  # -1
        pre_labeled_data['MODY2_label'] = np.nan
        pre_labeled_data.loc[pre_labeled_data['MODY2_pos'], 'MODY2_label'] = 1
        pre_labeled_data.loc[pre_labeled_data['MODY2_neg'], 'MODY2_label'] = 0  # -1
        pre_labeled_data['MODY3_label'] = np.nan
        pre_labeled_data.loc[pre_labeled_data['MODY3_pos'], 'MODY3_label'] = 1
        pre_labeled_data.loc[pre_labeled_data['MODY3_neg'], 'MODY3_label'] = 0  # -1
        pre_labeled_data['MODY5_label'] = np.nan
        pre_labeled_data.loc[pre_labeled_data['MODY5_pos'], 'MODY5_label'] = 1
        pre_labeled_data.loc[pre_labeled_data['MODY5_neg'], 'MODY5_label'] = 0  # -1
        """# 3. Initial datasets"""
        dsm1_complete = pre_labeled_data[~pre_labeled_data['MODY1_label'].isna()][variables + ['MODY1_label']]
        dsm2_complete = pre_labeled_data[~pre_labeled_data['MODY2_label'].isna()][variables + ['MODY2_label']]
        dsm3_complete = pre_labeled_data[~pre_labeled_data['MODY3_label'].isna()][variables + ['MODY3_label']]
        dsm5_complete = pre_labeled_data[~pre_labeled_data['MODY5_label'].isna()][variables + ['MODY5_label']]
        dsnormal_complete = pre_labeled_data[pre_labeled_data['Normal']][variables]
        """# 4. Intermediate output for manual verification
        Save the dataframes to an Excel workbook for review.
        """
        with pd.ExcelWriter("MODY_data.xlsx", engine='xlsxwriter') as xls:
            raw_data.to_excel(xls, sheet_name='HC Original', index=False)
            pre_labeled_data.to_excel(xls, sheet_name='Datos etiquetados', index=False)
            raw_data[raw_data['SiEntiqueta'] == True].to_excel(xls, sheet_name='Datos excluídos', index=False)
            dsm1_complete.to_excel(xls, sheet_name='Dataset MODY1', index=False)
            dsm1_complete.dropna().to_excel(xls, sheet_name='Dataset MODY1 sin ausentes', index=False)
            dsm2_complete.to_excel(xls, sheet_name='Dataset MODY2', index=False)
            dsm2_complete.dropna().to_excel(xls, sheet_name='Dataset MODY2 sin ausentes', index=False)
            dsm3_complete.to_excel(xls, sheet_name='Dataset MODY3', index=False)
            dsm3_complete.dropna().to_excel(xls, sheet_name='Dataset MODY3 sin ausentes', index=False)
            dsm5_complete.to_excel(xls, sheet_name='Dataset MODY5', index=False)
            dsm5_complete.dropna().to_excel(xls, sheet_name='Dataset MODY5 sin ausentes', index=False)
            dsnormal_complete.to_excel(xls, sheet_name='Sin Diabetes', index=False)
    return dsm1_complete, dsm2_complete, dsm3_complete, dsm5_complete