import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['text.usetex'] = True

from sklearn.model_selection import GroupShuffleSplit, ShuffleSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier  # referenced in get_tunable_params()

from xgboost import XGBRegressor

# Optional dependencies for the currently disabled Ray/Keras pipeline.
# model_A() and train_and_score_model_keras() below require the Keras and
# keras-tuner imports (plus keras.metrics.AUC) to run.
# from ray import tune
# import ray
# from keras.callbacks import TensorBoard
# from keras.models import Sequential
# from keras.callbacks import EarlyStopping
# from keras.layers import Dense, BatchNormalization, Dropout
# from kerastuner.tuners import RandomSearch, Hyperband, GridSearch

from datetime import datetime
import enlighten
import logging
import zipfile
import random
import joblib
import pickle
import time
import json
import os


def get_seed():
    return random.randint(0, 2**32 - 1)

class eNoseTrainer:
|
|
def __init__(self, loader, splits=1, test_size=0.2, debug=False):
|
|
self.ledger = pd.DataFrame(columns=["node", "ts", "Dataset", "Samples", "Target", "Train Size", "Train Ratio", "Model", "Params", "Ratio", "Train mse", "mse", "mae", "rmse"])
|
|
self.loader = loader
|
|
self.splits = splits
|
|
self.name = self.loader.label_file
|
|
self.target = '_'.join(self.loader.target_list)
|
|
self.state = dict()
|
|
|
|
os.makedirs(self.name, exist_ok=True)
|
|
self.start = int(time.time())
|
|
|
|
log_format = '%(asctime)s | %(levelname)-8s | %(name)-15s | %(message)s'
|
|
date_format = '%Y-%m-%d %H:%M:%S'
|
|
logging.basicConfig(format=log_format, datefmt=date_format)
|
|
|
|
target_log = '{}/load-{}.log'.format(self.name, self.start)
|
|
fh = logging.FileHandler(target_log)
|
|
|
|
self.debug = debug
|
|
|
|
self.logger = logging.getLogger("eNoseTrainer")
|
|
if self.debug:
|
|
self.logger.setLevel(logging.DEBUG)
|
|
fh.setLevel(logging.DEBUG)
|
|
else:
|
|
self.logger.setLevel(logging.INFO)
|
|
fh.setLevel(logging.INFO)
|
|
self.logger.addHandler(fh)
|
|
|
|
self.ratio = test_size
|
|
|
|
self.loader.stats()
|
|
self.loadCheckPoint()
|
|
|
|
def loadCheckPoint(self):
|
|
if not os.path.isfile('{}/Simulaciones.xlsx'.format(self.name)):
|
|
self.saveCheckPoint()
|
|
|
|
with pd.ExcelFile('{}/Simulaciones.xlsx'.format(self.name)) as xls:
|
|
self.ledger = pd.read_excel(xls, sheet_name='Historial')
|
|
self.trained = self.ledger.shape[0]
|
|
|
|
with open('{}/vars.pickle'.format(self.name), 'rb') as pfile:
|
|
self.ratio, self.splits, self.state = pickle.load(pfile)
|
|
|
|
def saveCheckPoint(self):
|
|
with pd.ExcelWriter('{}/Simulaciones.xlsx'.format(self.name), engine='xlsxwriter') as xls:
|
|
self.ledger.to_excel(xls, sheet_name='Historial', index=False)
|
|
|
|
with open('{}/vars.pickle'.format(self.name), 'wb') as pfile:
|
|
pickle.dump((self.ratio, self.splits, self.state), pfile, protocol=pickle.HIGHEST_PROTOCOL)
|
|
|
|
self.trained = self.ledger.shape[0]
|
|
|
|

    def wrap_and_save(self):
        self.logger.info("{:=^60}".format(' Saving Summary and Wrap the output in a ZipFile '))

        with pd.ExcelWriter('{}/Summary.xlsx'.format(self.name), engine='xlsxwriter') as xls:
            self.get_best_models().to_excel(xls, sheet_name='Results')

        with zipfile.ZipFile('{}-{}.zip'.format(self.name, self.start), 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(self.name):
                for file in files:
                    zipf.write(os.path.join(root, file))

    def row_exists(self, dataset, model):
        return self.ledger[(self.ledger["Dataset"] == dataset) & (self.ledger["Target"] == self.target) & (self.ledger["Model"] == model) & (self.ledger["Ratio"] == self.ratio)].shape[0] > 0

    def model_A(self, hp):
        # Requires the Keras imports commented out above (Sequential, Dense,
        # BatchNormalization, Dropout) plus keras.metrics.AUC, and expects
        # self.nvars to hold the number of input features.
        model = Sequential()
        model.add(Dense(units=hp.Int('units_input', min_value=48, max_value=56, step=8), input_dim=self.nvars, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(rate=hp.Float('dropout_input', min_value=0.1, max_value=0.1, step=0.1)))

        model.add(Dense(units=hp.Int('units_hidden', min_value=32, max_value=48, step=8), activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(rate=hp.Float('dropout_hidden', min_value=0.4, max_value=0.4, step=0.1)))

        model.add(Dense(1, activation='sigmoid'))

        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', AUC()])

        return model

    def train_and_score_model_keras(self, X_train, X_test, y_train, y_test, seed, label):
        # Requires the commented-out keras-tuner imports above (RandomSearch,
        # TensorBoard, EarlyStopping) and a get_model_train_keras() hypermodel
        # builder on this class (e.g. model_A).
        # set_random_seed(seed)
        ntrials = 6
        tuner = RandomSearch(
            self.get_model_train_keras,
            objective='val_loss',
            # seed=seed,
            max_trials=ntrials,
            # executions_per_trial=1,  # number of executions per configuration
            directory=self.name,
            project_name='{}-{}'.format(label, seed))

        self.logger.info("{:~^60}".format(' {}-{} '.format(label, seed)))

        search_dir = "{}/keras-tuner-{}/".format(self.name, label)
        os.makedirs(search_dir, exist_ok=True)
        search_callback = TensorBoard(log_dir=search_dir)
        early_stopping_search = EarlyStopping(monitor='val_loss', patience=13, min_delta=0.005, start_from_epoch=7, restore_best_weights=True)
        tuner.search(X_train, y_train, epochs=150, batch_size=10, validation_data=(X_test, y_test), callbacks=[early_stopping_search, search_callback])
        best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

        # Assumption: the original body ended here with the returned values
        # (mse, mae, rmse, optimized_model, model_params) unbound. The sketch
        # below rebuilds and refits the best configuration so the function
        # returns defined regression metrics.
        optimized_model = tuner.hypermodel.build(best_hps)
        optimized_model.fit(X_train, y_train, epochs=150, batch_size=10,
                            validation_data=(X_test, y_test),
                            callbacks=[early_stopping_search])
        model_params = best_hps.values
        y_pred = optimized_model.predict(X_test).ravel()
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mse)

        self.trained += 1
        self.bar.update()
        return mse, mae, rmse, optimized_model, model_params

    def get_model_train(self):
        return [
            XGBRegressor(objective='reg:squarederror'),
        ]

    def get_tunable_params(self, model):
        if isinstance(model, XGBRegressor):
            return {
                "n_estimators": [800, 1000, 1200],
                "learning_rate": np.logspace(-1.5, -0.5, 3),
                'max_depth': [5, 7, 9],
                'subsample': [0.5, 0.75, 1.0],
                # 'colsample_bytree': [0.8, 0.9, 1.0],
                # 'gamma': [0, 0.1, 0.2],
                # 'min_child_weight': [1, 3, 5]
            }
        elif isinstance(model, RandomForestClassifier):
            return {
                "n_estimators": [50, 100, 200],
                "max_depth": [5, 10, 15],
                "max_features": [2, 5, 10]
                # other constructor params: 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease',
                # 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'monotonic_cst',
                # 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'
            }
        else:
            return {}

    def train_and_score_model(self, model, X_train, X_test, y_train, y_test):
        param_dist = self.get_tunable_params(model)

        # use at least one CV split even when self.splits == 1
        cv = ShuffleSplit(n_splits=max(1, self.splits//2), test_size=0.2, random_state=get_seed())
        grid_search = GridSearchCV(estimator=model, param_grid=param_dist, scoring='neg_mean_squared_error', cv=cv, verbose=10, n_jobs=-1)

        grid_search.fit(X_train, y_train)

        optimized_model = grid_search.best_estimator_
        model_params = grid_search.best_params_

        y_aux = optimized_model.predict(X_train)
        tmse = mean_squared_error(y_train, y_aux)

        y_pred = optimized_model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mse)

        return tmse, mse, mae, rmse, optimized_model, model_params

    def fit(self):
        total_train_queue = self.splits*len(self.get_model_train())
        self.logger.info("{:=^60}".format(f'Begin Fit {total_train_queue} Models'))
        self.trained = 0
        manager = enlighten.get_manager()
        self.bar = manager.counter(total=total_train_queue, count=self.trained, desc='Tuning', unit='Models',
                                   format='{desc}{desc_pad}{percentage:3.0f}%|{bar}| {count:{len_total}d}/{total:d} [{elapsed}<{eta}, {rate:.2f}{unit_pad}{unit}/s]'
                                   )

        node = os.uname()[1]
        X_xboost, Y_xboost, G_xboost = self.loader.load_dataset_xboost()

        gss = GroupShuffleSplit(n_splits=self.splits, test_size=self.ratio, random_state=get_seed())

        dataset = 'Tabular'
        os.makedirs('{}/{}/{}'.format(self.name, self.target, dataset), exist_ok=True)

        for i, (train_index, test_index) in enumerate(gss.split(X_xboost, Y_xboost, G_xboost)):
            X_train, X_test = X_xboost[train_index], X_xboost[test_index]
            y_train, y_test = Y_xboost[train_index], Y_xboost[test_index]

            for model in self.get_model_train():
                model_id = "{}_{}".format(type(model).__name__, i)
                self.trained += 1

                if self.row_exists(dataset, model_id):
                    self.bar.update()
                    continue

                model_file = '{}/{}/{}/{}'.format(self.name, self.target, dataset, model_id)

                tmse, mse, mae, rmse, optimized_model, model_params = self.train_and_score_model(model, X_train, X_test, y_train, y_test)

                ts = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
                joblib.dump(optimized_model, model_file)

                newrow = pd.DataFrame([{"node": node,
                                        "ts": ts,
                                        "Dataset": dataset,
                                        "Samples": Y_xboost.shape[0],
                                        "Target": self.target,
                                        "Train Size": y_train.shape[0],
                                        "Train Ratio": y_train.shape[0]/Y_xboost.shape[0],
                                        "Ratio": self.ratio,
                                        "Model": model_id,
                                        "Params": json.dumps(model_params),
                                        "Train mse": tmse,
                                        "mse": mse,
                                        "mae": mae,
                                        "rmse": rmse
                                        }])
                self.ledger = pd.concat([self.ledger, newrow], ignore_index=True)
                self.bar.update()

            self.saveCheckPoint()

            # if self.dnn:
            #     model_file = '{}/{}/DNN_{}'.format(self.name, label, seed)
            #     model_label = "{}".format(label)
            #
            #     accuracy, specificity, recall, f1, roc_auc, optimized_model, parms = self.train_and_score_model_keras(X_train, X_test, y_train, y_test, seed, model_label)
            #     ts = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
            #
            #     newrow = pd.DataFrame([{"node": node,
            #                             "ts": ts,
            #                             "Dataset": model_label,
            #                             "Model": 'DNN',
            #                             "Params": parms,
            #                             "Seed": seed,
            #                             "F1": f1,
            #                             "ROC_AUC": roc_auc
            #                             }])
            #     self.ledger = pd.concat([self.ledger, newrow], ignore_index=True)

        self.bar.close()
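

# Usage sketch (assumption): this file does not show how eNoseTrainer is driven.
# The loader below is hypothetical; it only assumes an object exposing
# label_file, target_list, stats() and load_dataset_xboost() (returning features,
# targets and group labels for GroupShuffleSplit), which is how the class uses
# the loader above.
#
# if __name__ == '__main__':
#     from enose_loader import eNoseLoader   # hypothetical loader module
#
#     loader = eNoseLoader(label_file='enose_dataset', target_list=['NH3'])
#     trainer = eNoseTrainer(loader, splits=5, test_size=0.2, debug=True)
#     trainer.fit()             # grid-search XGBRegressor on each group split, logging results to the ledger
#     trainer.wrap_and_save()   # write Summary.xlsx and zip the run directory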