import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib

matplotlib.rcParams['text.usetex'] = True

from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler
from sklearn.model_selection import StratifiedGroupKFold, StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor

# from ray import tune
# import ray

# from keras.callbacks import TensorBoard
# from keras.models import Sequential
# from keras.callbacks import EarlyStopping
# from keras.layers import Dense, BatchNormalization, Dropout
# from kerastuner.tuners import RandomSearch, Hyperband, GridSearch

from datetime import datetime

import enlighten
import logging
import zipfile
import random
import joblib
import pickle
import time
import json
import os


def get_seed():
    return random.randint(0, 2**32 - 1)


class eNoseTrainer:
    def __init__(self, loader, splits=1, test_size=0.2, debug=False):
        self.ledger = pd.DataFrame(columns=["node", "ts", "Dataset", "Samples", "Target",
                                            "Train Size", "Train Ratio", "Model", "Params",
                                            "Ratio", "Train mse", "mse", "mae", "rmse"])
        self.loader = loader
        self.splits = splits
        self.name = self.loader.label_file
        self.target = '_'.join(self.loader.target_list)
        self.state = dict()

        os.makedirs(self.name, exist_ok=True)
        self.start = int(time.time())

        log_format = '%(asctime)s | %(levelname)-8s | %(name)-15s | %(message)s'
        date_format = '%Y-%m-%d %H:%M:%S'
        logging.basicConfig(format=log_format, datefmt=date_format)

        target_log = '{}/load-{}.log'.format(self.name, self.start)
        fh = logging.FileHandler(target_log)
        self.debug = debug
        self.logger = logging.getLogger("eNoseTrainer")

        if self.debug:
            self.logger.setLevel(logging.DEBUG)
            fh.setLevel(logging.DEBUG)
        else:
            self.logger.setLevel(logging.INFO)
            fh.setLevel(logging.INFO)

        self.logger.addHandler(fh)

        self.ratio = test_size
        self.loader.stats()
        self.loadCheckPoint()

    def loadCheckPoint(self):
        # Create the checkpoint files on first run, then restore the ledger and state from them.
        if not os.path.isfile('{}/Simulaciones.xlsx'.format(self.name)):
            self.saveCheckPoint()

        with pd.ExcelFile('{}/Simulaciones.xlsx'.format(self.name)) as xls:
            self.ledger = pd.read_excel(xls, sheet_name='Historial')
            self.trained = self.ledger.shape[0]

        with open('{}/vars.pickle'.format(self.name), 'rb') as pfile:
            self.ratio, self.splits, self.state = pickle.load(pfile)

    def saveCheckPoint(self):
        with pd.ExcelWriter('{}/Simulaciones.xlsx'.format(self.name), engine='xlsxwriter') as xls:
            self.ledger.to_excel(xls, sheet_name='Historial', index=False)

        with open('{}/vars.pickle'.format(self.name), 'wb') as pfile:
            pickle.dump((self.ratio, self.splits, self.state), pfile, protocol=pickle.HIGHEST_PROTOCOL)

        self.trained = self.ledger.shape[0]

    def wrap_and_save(self):
        self.logger.info("{:=^60}".format(' Saving Summary and Wrap the output in a ZipFile '))

        with pd.ExcelWriter('{}/Summary.xlsx'.format(self.name), engine='xlsxwriter') as xls:
            self.get_best_models().to_excel(xls, sheet_name='Results')

        with zipfile.ZipFile('{}-{}.zip'.format(self.name, self.start), 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(self.name):
                for file in files:
                    zipf.write(os.path.join(root, file))

    def row_exists(self, dataset, model):
        return self.ledger[(self.ledger["Dataset"] == dataset) &
                           (self.ledger["Target"] == self.target) &
                           (self.ledger["Model"] == model) &
                           (self.ledger["Ratio"] == self.ratio)].shape[0] > 0
    # Keras hypermodel for keras-tuner; it relies on the Keras / Keras Tuner imports
    # that are commented out at the top of the module.
    def model_A(self, hp):
        model = Sequential()
        model.add(Dense(units=hp.Int('units_input', min_value=48, max_value=56, step=8),
                        input_dim=self.nvars, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(rate=hp.Float('dropout_input', min_value=0.1, max_value=0.1, step=0.1)))
        model.add(Dense(units=hp.Int('units_hidden', min_value=32, max_value=48, step=8), activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(rate=hp.Float('dropout_hidden', min_value=0.4, max_value=0.4, step=0.1)))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', tf.keras.metrics.AUC()])
        return model

    def train_and_score_model_keras(self, X_train, X_test, y_train, y_test, seed, label):
        # set_random_seed(seed)
        ntrials = 6
        tuner = RandomSearch(
            self.model_A,
            objective='val_loss',
            # seed=seed,
            max_trials=ntrials,
            # executions_per_trial=1,  # number of executions per configuration
            directory=self.name,
            project_name='{}-{}'.format(label, seed))

        self.logger.info("{:~^60}".format(' {}-{} '.format(label, seed)))

        search_dir = "{}/keras-tuner-{}/".format(self.name, label)
        os.makedirs(search_dir, exist_ok=True)
        search_callback = TensorBoard(log_dir=search_dir)
        early_stopping_search = EarlyStopping(monitor='val_loss', patience=13, min_delta=0.005,
                                              start_from_epoch=7, restore_best_weights=True)
        tuner.search(X_train, y_train, epochs=150, batch_size=10,
                     validation_data=(X_test, y_test),
                     callbacks=[early_stopping_search, search_callback])
        best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

        # Rebuild the best configuration, refit it, and score it on the held-out split.
        optimized_model = tuner.hypermodel.build(best_hps)
        optimized_model.fit(X_train, y_train, epochs=150, batch_size=10,
                            validation_data=(X_test, y_test),
                            callbacks=[early_stopping_search])
        model_params = best_hps.values

        y_pred = optimized_model.predict(X_test).ravel()
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mse)

        self.trained += 1
        self.bar.update()
        return mse, mae, rmse, optimized_model, model_params

    def get_model_train(self):
        return [
            XGBRegressor(objective='reg:squarederror'),
        ]

    def get_tunable_params(self, model):
        if isinstance(model, XGBRegressor):
            return {
                "n_estimators": [800, 1000, 1200],
                "learning_rate": np.logspace(-1.5, -0.5, 3),
                'max_depth': [5, 7, 9],
                'subsample': [0.5, 0.75, 1.0],
                # 'colsample_bytree': [0.8, 0.9, 1.0],
                # 'gamma': [0, 0.1, 0.2],
                # 'min_child_weight': [1, 3, 5]
            }
        elif isinstance(model, RandomForestClassifier):
            return {
                "n_estimators": [50, 100, 200],
                "max_depth": [5, 10, 15],
                "max_features": [2, 5, 10]
                # ['n', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples',
                #  'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split',
                #  'min_weight_fraction_leaf', 'monotonic_cst', 'n_estimators', 'n_jobs',
                #  'oob_score', 'random_state', 'verbose', 'warm_start']
            }
        else:
            return {}

    def train_and_score_model(self, model, X_train, X_test, y_train, y_test):
        param_dist = self.get_tunable_params(model)
        cv = StratifiedShuffleSplit(n_splits=int(1/(2*self.ratio))+1, test_size=self.ratio, random_state=get_seed())
        grid_search = GridSearchCV(estimator=model, param_grid=param_dist, scoring='neg_mean_squared_error',
                                   cv=cv, verbose=10, n_jobs=-1)
        grid_search.fit(X_train, y_train)

        optimized_model = grid_search.best_estimator_
        model_params = grid_search.best_params_

        y_aux = optimized_model.predict(X_train)
        tmse = mean_squared_error(y_train, y_aux)

        y_pred = optimized_model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mse)

        return tmse, mse, mae, rmse, optimized_model, model_params
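    # fit() drives the whole training loop:
    #   1. load the tabular dataset (features, continuous targets, sample groups),
    #   2. bin the continuous targets with KBinsDiscretizer so that
    #      StratifiedGroupKFold can build folds that are balanced over the target
    #      while never splitting a group across train and test,
    #   3. grid-search every model from get_model_train() on each fold,
    #   4. append the scores to the ledger and checkpoint after each trained model,
    #      so an interrupted run can resume (row_exists() skips finished entries).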
    def fit(self):
        total_train_queue = int(1/self.ratio)*len(self.get_model_train())
        self.logger.info("{:=^60}".format(f'Begin Fit {total_train_queue} Models'))
        self.trained = 0
        manager = enlighten.get_manager()
        self.bar = manager.counter(total=total_train_queue, count=self.trained, desc='Tuning', unit='Models',
                                   format='{desc}{desc_pad}{percentage:3.0f}%|{bar}| {count:{len_total}d}/{total:d} [{elapsed}<{eta}, {rate:.2f}{unit_pad}{unit}/s]')

        node = os.uname()[1]

        X_xboost, Y_xboost, G_xboost = self.loader.load_dataset_xboost()

        discretizer = KBinsDiscretizer(n_bins=50*Y_xboost.shape[1], encode='ordinal', strategy='uniform')
        Y_discrete = discretizer.fit_transform(Y_xboost)

        gss = StratifiedGroupKFold(n_splits=int(1/self.ratio), shuffle=True, random_state=get_seed())

        dataset = 'Tabular'
        os.makedirs('{}/{}/{}'.format(self.name, self.target, dataset), exist_ok=True)

        for i, (train_index, test_index) in enumerate(gss.split(X_xboost, Y_discrete, G_xboost)):
            X_train, X_test = X_xboost[train_index], X_xboost[test_index]
            y_train, y_test = Y_xboost[train_index], Y_xboost[test_index]

            for model in self.get_model_train():
                model_id = "{}_{}".format(type(model).__name__, i)
                self.trained += 1

                if self.row_exists(dataset, model_id):
                    self.bar.update()
                    continue

                model_file = '{}/{}/{}/{}'.format(self.name, self.target, dataset, model_id)

                tmse, mse, mae, rmse, optimized_model, model_params = self.train_and_score_model(model, X_train, X_test, y_train, y_test)
                ts = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
                joblib.dump(optimized_model, model_file)

                newrow = pd.DataFrame([{"node": node,
                                        "ts": ts,
                                        "Dataset": dataset,
                                        "Samples": Y_xboost.shape[0],
                                        "Target": self.target,
                                        "Train Size": y_train.shape[0],
                                        "Train Ratio": y_train.shape[0]/Y_xboost.shape[0],
                                        "Ratio": self.ratio,
                                        "Model": model_id,
                                        "Params": json.dumps(model_params),
                                        "Train mse": tmse,
                                        "mse": mse,
                                        "mae": mae,
                                        "rmse": rmse
                                        }])
                self.ledger = pd.concat([self.ledger, newrow], ignore_index=True)
                self.bar.update()
                self.saveCheckPoint()

            # if self.dnn:
            #     model_file = '{}/{}/DNN_{}'.format(self.name, label, seed)
            #     model_label = "{}".format(label)
            #
            #     accuracy, specificity, recall, f1, roc_auc, optimized_model, parms = self.train_and_score_model_keras(X_train, X_test, y_train, y_test, seed, model_label)
            #     ts = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
            #
            #     newrow = pd.DataFrame([{"node": node,
            #                             "ts": ts,
            #                             "Dataset": model_label,
            #                             "Model": 'DNN',
            #                             "Params": parms,
            #                             "Seed": seed,
            #                             "F1": f1,
            #                             "ROC_AUC": roc_auc
            #                             }])
            #     self.ledger = pd.concat([self.ledger, newrow], ignore_index=True)

        self.bar.close()
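
# Illustrative usage sketch (kept as comments so importing this module has no side
# effects). `ENoseLoader` is a hypothetical placeholder: the actual loader object
# must provide the attributes and methods used above (`label_file`, `target_list`,
# `stats()` and `load_dataset_xboost()` returning features, targets and groups).
#
# if __name__ == '__main__':
#     loader = ENoseLoader(label_file='eNose_dataset', target_list=['target_A'])
#     trainer = eNoseTrainer(loader, splits=1, test_size=0.2, debug=True)
#     trainer.fit()            # tune and evaluate the models over all folds
#     trainer.wrap_and_save()  # write Summary.xlsx and zip the output folder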