# enose_2025/TrainerClass.py

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['text.usetex'] = True

from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler
from sklearn.model_selection import StratifiedGroupKFold, StratifiedShuffleSplit, GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor

import keras
from keras import layers
import keras_tuner  # keras-tuner package; provides the RandomSearch tuner used by train_and_score_model_keras

import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler

from datetime import datetime
import enlighten
import logging
import zipfile
import random
import joblib
import pickle
import time
import json
import os


def get_seed():
    return random.randint(0, 2**32 - 1)


class eNoseTrainer:
    def __init__(self, loader, test_size=0.2, debug=False):
        self.ledger = pd.DataFrame(columns=["node", "ts", "Dataset", "Samples", "Target", "Train Size", "Train Ratio", "Model", "Params", "Ratio", "Train mse", "mse", "mae", "rmse"])
        self.loader = loader
        self.name = self.loader.label_file
        self.state = dict()
        os.makedirs(self.name, exist_ok=True)
        self.start = int(time.time())

        log_format = '%(asctime)s | %(levelname)-8s | %(name)-15s | %(message)s'
        date_format = '%Y-%m-%d %H:%M:%S'
        logging.basicConfig(format=log_format, datefmt=date_format)

        target_log = '{}/load-{}.log'.format(self.name, self.start)
        fh = logging.FileHandler(target_log)
        fh.setFormatter(logging.Formatter(log_format, datefmt=date_format))

        self.debug = debug
        self.logger = logging.getLogger("eNoseTrainer")
        if self.debug:
            self.logger.setLevel(logging.DEBUG)
            fh.setLevel(logging.DEBUG)
        else:
            self.logger.setLevel(logging.INFO)
            fh.setLevel(logging.INFO)
        self.logger.addHandler(fh)

        self.ratio = test_size
        self.loader.stats()
        self.loadCheckPoint()

    def loadCheckPoint(self):
        if not os.path.isfile('{}/Simulaciones.xlsx'.format(self.name)):
            self.saveCheckPoint()

        with pd.ExcelFile('{}/Simulaciones.xlsx'.format(self.name)) as xls:
            self.ledger = pd.read_excel(xls, sheet_name='Historial')
            self.trained = self.ledger.shape[0]

        # with open('{}/vars.pickle'.format(self.name), 'rb') as pfile:
        #     self.ratio, self.state = pickle.load(pfile)

    def saveCheckPoint(self):
        with pd.ExcelWriter('{}/Simulaciones.xlsx'.format(self.name), engine='xlsxwriter') as xls:
            self.ledger.to_excel(xls, sheet_name='Historial', index=False)

        # with open('{}/vars.pickle'.format(self.name), 'wb') as pfile:
        #     pickle.dump((self.ratio, self.state), pfile, protocol=pickle.HIGHEST_PROTOCOL)

        self.trained = self.ledger.shape[0]

    def wrap_and_save(self):
        self.logger.info("{:=^60}".format(' Saving Summary and Wrap the output in a ZipFile '))

        with zipfile.ZipFile('{}-{}.zip'.format(self.name, self.start), 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(self.name):
                for file in files:
                    zipf.write(os.path.join(root, file))

    def row_exists(self, dataset, model):
        search_result = self.ledger[(self.ledger["Dataset"]==dataset) & (self.ledger["Target"]==self.loader.target) & (self.ledger["Model"]==model) & (self.ledger["Ratio"]==self.ratio)].shape[0] > 0
        self.logger.debug(f'Looking for {dataset}, {model}, {self.loader.target}, {self.ratio} => {search_result} {self.ledger.shape}')
        return search_result

    def model_A(self, hp):
        # Legacy keras-tuner hypermodel (binary classifier); expects self.nvars to hold the input dimension.
        model = keras.Sequential()
        model.add(layers.Dense(units=hp.Int('units_input', min_value=48, max_value=56, step=8), input_dim=self.nvars, activation='relu'))
        model.add(layers.BatchNormalization())
        model.add(layers.Dropout(rate=hp.Float('dropout_input', min_value=0.1, max_value=0.1, step=0.1)))
        model.add(layers.Dense(units=hp.Int('units_hidden', min_value=32, max_value=48, step=8), activation='relu'))
        model.add(layers.BatchNormalization())
        model.add(layers.Dropout(rate=hp.Float('dropout_hidden', min_value=0.4, max_value=0.4, step=0.1)))
        model.add(layers.Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', keras.metrics.AUC()])
        return model

    def train_and_score_model_keras(self, X_train, X_test, Y_train, Y_test, seed, label):
        # set_random_seed(seed)
        # Legacy search path; self.model_A is assumed to be the intended hypermodel builder.
        ntrials = 6
        tuner = keras_tuner.RandomSearch(
            self.model_A,
            objective='val_loss',
            # seed=seed,
            max_trials=ntrials,
            directory=self.name,
            project_name='{}-{}'.format(label, seed))

        self.logger.info("{:~^60}".format(' {}-{} '.format(label, seed)))

        search_dir = "{}/keras-tuner-{}/".format(self.name, label)
        os.makedirs(search_dir, exist_ok=True)
        search_callback = keras.callbacks.TensorBoard(log_dir=search_dir)
        early_stopping_search = keras.callbacks.EarlyStopping(monitor='val_loss', patience=13, min_delta=0.005, start_from_epoch=7, restore_best_weights=True)

        tuner.search(X_train, Y_train, epochs=150, batch_size=10, validation_data=(X_test, Y_test), callbacks=[early_stopping_search, search_callback])

        best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
        model_params = best_hps.values

        # Rebuild and refit the best configuration, then score it on the held-out split.
        optimized_model = tuner.hypermodel.build(best_hps)
        optimized_model.fit(X_train, Y_train, epochs=150, batch_size=10, validation_data=(X_test, Y_test), callbacks=[early_stopping_search], verbose=0)

        y_pred = optimized_model.predict(X_test)
        mse = mean_squared_error(Y_test, y_pred)
        mae = mean_absolute_error(Y_test, y_pred)
        rmse = np.sqrt(mse)

        self.trained += 1
        self.bar.update()
        return mse, mae, rmse, optimized_model, model_params

    def get_model_train(self):
        return [
            XGBRegressor(objective='reg:squarederror'),
        ]

    def get_tunable_params(self, model):
        if isinstance(model, XGBRegressor):
            return {
                'tree_method': ["hist"],
                "n_estimators": [100, 128, 150],
                'max_depth': [6, 7, 8],
                'subsample': [0.5, 0.6, 0.7],
                'multi_strategy': ['one_output_per_tree', 'multi_output_tree']
            }
        elif isinstance(model, RandomForestClassifier):
            return {
                "n_estimators": [50, 100, 200],
                "max_depth": [5, 10, 15],
                "max_features": [2, 5, 10]
            }
        else:
            return {}

    def search_best_conv1D_v1(self, X_train_orig, X_test_orig, Y_train_orig, Y_test_orig, epochs=30, nsamples=0.1):
        ray.init(ignore_reinit_error=True, configure_logging=True, logging_level=logging.INFO)

        # Share the splits through the object store so every Ray trial reads the same copy.
        X_train_ref = ray.put(X_train_orig)
        Y_train_ref = ray.put(Y_train_orig)
        X_test_ref = ray.put(X_test_orig)
        Y_test_ref = ray.put(Y_test_orig)

        # Sizing notes for an 80-step input window:
        # l1 kernel min 2, max 5 => 40 | 16
        # maxpool 2 or 3         => 20 |
        # l2 kernel min 2, max 5 => 4  | 2/5
        # maxpool 2 or 3         => 2  | 2/15
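        # Worked example of that arithmetic (illustrative only, assuming kernel 5 and
        # pool 2): with causal padding and stride kernel//2 = 2, the first Conv1D
        # emits ceil(80/2) = 40 steps, MaxPooling1D(2) leaves 20, the second Conv1D
        # emits ceil(20/2) = 10, and the final MaxPooling1D(2) leaves 5 steps, so
        # Flatten() sees 5 * (2*filters) features.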

        def build_model_conv1D(config, input_shape, output_dim):
            model = keras.Sequential([
                layers.Conv1D(filters=config['filters'], kernel_size=config['kernel_l1'], strides=config['kernel_l1']//2, activation='relu', padding='causal', input_shape=input_shape),
                layers.MaxPooling1D(pool_size=config['pool_size']),
                layers.Conv1D(filters=config['filters'] * 2, kernel_size=config['kernel_l2'], strides=config['kernel_l2']//2, activation='relu', padding='causal'),
                layers.MaxPooling1D(pool_size=config['pool_size']),
                layers.Flatten(),
                layers.Dense(config['dense_units'], activation='relu'),
                layers.Dropout(config['dropout']),
                layers.Dense(output_dim)
            ])
            model.compile(optimizer=keras.optimizers.Adam(learning_rate=config['lr']), loss='mse')
            return model

        def train_model_conv1D(config):
            X_trainc1D = ray.get(X_train_ref)
            Y_trainc1D = ray.get(Y_train_ref)
            X_testc1D = ray.get(X_test_ref)
            Y_testc1D = ray.get(Y_test_ref)

            input_shape = X_trainc1D.shape[1:]
            output_dim = Y_trainc1D.shape[1]
            model = build_model_conv1D(config, input_shape, output_dim)

            early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
            model.fit(
                X_trainc1D, Y_trainc1D,
                validation_data=(X_testc1D, Y_testc1D),
                epochs=config['epochs'],
                batch_size=config['batch_size'],
                verbose=0,
                callbacks=[early_stopping]
            )

            Y_pred = model.predict(X_testc1D)
            mse = mean_squared_error(Y_testc1D, Y_pred)
            tune.report({'mse': mse})

        config_space = {
            'filters': tune.choice([16, 32, 64]),
            'kernel_l1': tune.choice([3, 5, 7]),
            'kernel_l2': tune.choice([3, 5, 7]),
            'pool_size': tune.choice([2, 3]),
            'dense_units': tune.choice([32, 64, 128, 256]),
            'dropout': tune.choice([0.05, 0.15, 0.3]),
            'lr': tune.choice([0.01, 0.005, 0.001]),
            'batch_size': tune.choice([16, 32, 64, 128]),
            'epochs': epochs
        }

        total_space = 3*3*3*2*4*3*3*4  # product of the choice cardinalities above
        self.logger.info(f"total_space: {total_space}, num_samples: {int(nsamples*total_space)}")

        scheduler = ASHAScheduler(metric='mse', mode='min', max_t=epochs, grace_period=5, reduction_factor=2)
        analysis = tune.run(tune.with_parameters(train_model_conv1D), config=config_space, num_samples=int(nsamples*total_space), scheduler=scheduler, max_concurrent_trials=8)

        best_config = analysis.get_best_config(metric='mse', mode='min')
        # Rebuilt untrained; the caller refits the winning architecture on its full split.
        best_model = build_model_conv1D(best_config, X_train_orig.shape[1:], Y_train_orig.shape[1])

        ray.internal.free([X_train_ref, Y_train_ref, X_test_ref, Y_test_ref])
        ray.shutdown()

        return best_model, analysis
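
    # Illustrative call (names are placeholders; X_* must have shape (n, window, channels)
    # and Y_* must be 2-D so Y.shape[1] gives the output dimension):
    #   best_model, analysis = trainer.search_best_conv1D_v1(X_tr, X_te, Y_tr, Y_te, epochs=30, nsamples=0.1)
    #   best_cfg = analysis.get_best_config(metric='mse', mode='min')
    #   best_model.fit(X_tr, Y_tr, epochs=50, batch_size=best_cfg['batch_size'])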

    def train_and_score_model(self, model, X_train, X_test, Y_train, Y_test):
        param_dist = self.get_tunable_params(model)

        cv = StratifiedShuffleSplit(n_splits=int(1/(2*self.ratio))+1, test_size=self.ratio, random_state=get_seed())
        grid_search = GridSearchCV(estimator=model, param_grid=param_dist, scoring='neg_mean_squared_error', cv=cv, verbose=2, n_jobs=-1)

        grid_search.fit(X_train, Y_train)

        optimized_model = grid_search.best_estimator_
        model_params = grid_search.best_params_

        y_aux = optimized_model.predict(X_train)
        tmse = mean_squared_error(Y_train, y_aux)

        y_pred = optimized_model.predict(X_test)
        mse = mean_squared_error(Y_test, y_pred)
        mae = mean_absolute_error(Y_test, y_pred)
        rmse = np.sqrt(mse)

        return tmse, mse, mae, rmse, optimized_model, model_params
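
    # With the default ratio of 0.2, train_and_score_model's StratifiedShuffleSplit
    # runs int(1/(2*0.2))+1 = 3 splits, each holding out 20% of the training fold.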

    def gen_plots(self, dataset, model_id, target=None):
        if isinstance(target, list):
            self.loader.target_list = target
        if isinstance(target, str):
            self.loader.target_list = [target]

        if dataset.endswith("-conv3"):
            self.loader.smooth = 'conv3'
        else:
            self.loader.smooth = None
        self.loader.reset()

        if not self.row_exists(dataset, model_id):
            self.logger.error(f'Simulation {dataset}, {model_id} not found')
            return

        model_file = '{}/{}/{}/{}'.format(self.name, self.loader.target, dataset, model_id)
        if not os.path.isfile(model_file):
            self.logger.error('Model file not found')
            return

        trained_model = joblib.load(model_file)

        pics_folder = '{}/{}/{}/plots'.format(self.name, self.loader.target, dataset)
        os.makedirs(pics_folder, exist_ok=True)

        df = self.loader.scaled_data
        Y_samples = np.zeros((len(df), len(self.loader.target_list)))
        for i, sample in enumerate(df):
            Y_samples[i] = np.array([[df[sample]['label'][key] for key in self.loader.target_list]])
        self.logger.debug(f"Y_samples.shape: {Y_samples.shape}")

        target_scaler = MinMaxScaler()
        Y_samples = target_scaler.fit_transform(Y_samples)

        cmapx = cm.get_cmap('winter', len(self.loader.source_channels))
        cmapy = cm.get_cmap('prism', Y_samples.shape[1])

        for measurement, (r, l) in self.loader.dataset['range'].items():
            plt.figure(figsize=(12, 6))
            plt.title(f"[{dataset}] {model_id}. Sample {measurement}")
            plt.xlabel("Sensor Readings")

            plt.vlines(x=r, ymin=0, ymax=1, colors='blue', linestyle='dashed')
            plt.vlines(x=l, ymin=0, ymax=1, colors='blue', linestyle='dashed')

            Y_value = np.zeros((1, len(self.loader.target_list)))
            Y_value[0] = np.array([[df[measurement]['label'][key] for key in self.loader.target_list]])
            self.logger.debug(f"Y_value.shape: {Y_value.shape}")
            self.logger.debug(f"Y_value: {Y_value}")
            Y_scaled = target_scaler.transform(Y_value).reshape(1, -1)
            self.logger.debug(f"Y_scaled.shape: {Y_scaled.shape}")
            self.logger.debug(f"Y_scaled: {Y_scaled}")

            y_pred = trained_model.predict(df[measurement]['data'].to_numpy())
            self.logger.debug(f"y_pred.shape: {y_pred.shape}")

            if y_pred.ndim == 2:
                plt.ylabel("Target dashed / Pred solid")
                for i, channel_name in enumerate(df[measurement]['data'].columns):
                    plt.plot(df[measurement]['data'][channel_name], linestyle='dotted', color=cmapx(i), alpha=0.2)
                for i in range(y_pred.shape[1]):
                    self.logger.debug(f"Y_scaled[0][i]: {Y_scaled[0][i]}")
                    plt.axhline(y=Y_scaled[0][i], color=cmapy(i), linestyle='dashed')
                    plt.plot(y_pred[:, i], color=cmapy(i), linestyle='solid')
            else:
                plt.ylabel("Samples dotted / Target dashed / Pred solid")
                for i, channel_name in enumerate(df[measurement]['data'].columns):
                    plt.plot(df[measurement]['data'][channel_name], linestyle='dotted', color=cmapx(i))
                plt.plot(y_pred, color=cmapy(0), linestyle='solid')
                plt.axhline(y=Y_scaled[0][0], color=cmapy(0), linestyle='dashed')

            filename = os.path.join(pics_folder, f"{measurement}_{model_id}.png")
            plt.savefig(filename, format='png')
            self.logger.info(f"Saved plot as {filename}")
            plt.close()

    def fit(self):
        windows = [128, 256, 384]
        total_train_queue = 2*int(1/self.ratio)*(len(self.get_model_train())+len(windows))
        self.logger.info("{:=^60}".format(f'Begin Fit {total_train_queue} Models'))

        self.trained = 0
        manager = enlighten.get_manager()
        self.bar = manager.counter(total=total_train_queue, count=self.trained, desc='Tunning', unit='Models',
                                   format='{desc}{desc_pad}{percentage:3.0f}%|{bar}| {count:{len_total}d}/{total:d} [{elapsed}<{eta}, {rate:.2f}{unit_pad}{unit}/s]'
                                   )

        discretizer = KBinsDiscretizer(n_bins=200, encode='ordinal', strategy='uniform')
        gss = StratifiedGroupKFold(n_splits=int(1/self.ratio), shuffle=True, random_state=get_seed())
        node = os.uname()[1]

        self.loader.smooth = None
        self.loader.reset()
        X_xboost, Y_xboost, G_xboost = self.loader.load_dataset_xboost()
        self.logger.debug(f"Y_xboost: {Y_xboost.shape}")

        # Discretize the continuous targets so StratifiedGroupKFold can stratify on them.
        Y_discrete = discretizer.fit_transform(Y_xboost)
        if Y_discrete.ndim == 2:
            Y_discrete = np.sum(Y_discrete, axis=1)

        dataset = 'Tabular'
        for i, (train_index, test_index) in enumerate(gss.split(X_xboost, Y_discrete, G_xboost)):
            self.logger.info("{:=^60}".format(f'CV {i+1}/{int(1/self.ratio)} {dataset}'))
            os.makedirs('{}/{}/{}'.format(self.name, self.loader.target, dataset), exist_ok=True)
            X_train, X_test = X_xboost[train_index], X_xboost[test_index]
            Y_train, Y_test = Y_xboost[train_index], Y_xboost[test_index]
            self.logger.debug(f"Y_train: {Y_train.shape}")
            self.logger.debug(f"Y_test: {Y_test.shape}")
            for model in self.get_model_train():
                model_id = "{}_{}".format(type(model).__name__, i)
                self.trained += 1
                if self.row_exists(dataset, model_id):
                    self.bar.update()
                    continue

                model_file = '{}/{}/{}/{}'.format(self.name, self.loader.target, dataset, model_id)

                tmse, mse, mae, rmse, optimized_model, model_params = self.train_and_score_model(model, X_train, X_test, Y_train, Y_test)
                ts = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
                joblib.dump(optimized_model, model_file)
                newrow = pd.DataFrame([{"node": node,
                                        "ts": ts,
                                        "Dataset": dataset,
                                        "Samples": Y_xboost.shape[0],
                                        "Target": self.loader.target,
                                        "Train Size": Y_train.shape[0],
                                        "Train Ratio": Y_train.shape[0]/Y_xboost.shape[0],
                                        "Ratio": self.ratio,
                                        "Model": model_id,
                                        "Params": json.dumps(model_params),
                                        "Train mse": tmse,
                                        "mse": mse,
                                        "mae": mae,
                                        "rmse": rmse
                                        }])
                self.ledger = pd.concat([self.ledger, newrow], ignore_index=True)
                self.bar.update()
                self.saveCheckPoint()

        self.loader.smooth = 'conv3'
        self.loader.reset()
        X_xboost, Y_xboost, G_xboost = self.loader.load_dataset_xboost()
        self.logger.debug(f"Y_xboost: {Y_xboost.shape}")

        Y_discrete = discretizer.fit_transform(Y_xboost)
        if Y_discrete.ndim == 2:
            Y_discrete = np.sum(Y_discrete, axis=1)

        dataset = 'Tabular-conv3'
        for i, (train_index, test_index) in enumerate(gss.split(X_xboost, Y_discrete, G_xboost)):
            self.logger.info("{:=^60}".format(f'CV {i+1}/{int(1/self.ratio)} {dataset}'))
            os.makedirs('{}/{}/{}'.format(self.name, self.loader.target, dataset), exist_ok=True)
            X_train, X_test = X_xboost[train_index], X_xboost[test_index]
            Y_train, Y_test = Y_xboost[train_index], Y_xboost[test_index]
            self.logger.debug(f"Y_train: {Y_train.shape}")
            self.logger.debug(f"Y_test: {Y_test.shape}")

            for model in self.get_model_train():
                model_id = "{}_{}".format(type(model).__name__, i)
                self.trained += 1
                if self.row_exists(dataset, model_id):
                    self.bar.update()
                    continue

                model_file = '{}/{}/{}/{}'.format(self.name, self.loader.target, dataset, model_id)

                tmse, mse, mae, rmse, optimized_model, model_params = self.train_and_score_model(model, X_train, X_test, Y_train, Y_test)
                ts = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
                joblib.dump(optimized_model, model_file)
                newrow = pd.DataFrame([{"node": node,
                                        "ts": ts,
                                        "Dataset": dataset,
                                        "Samples": Y_xboost.shape[0],
                                        "Target": self.loader.target,
                                        "Train Size": Y_train.shape[0],
                                        "Train Ratio": Y_train.shape[0]/Y_xboost.shape[0],
                                        "Ratio": self.ratio,
                                        "Model": model_id,
                                        "Params": json.dumps(model_params),
                                        "Train mse": tmse,
                                        "mse": mse,
                                        "mae": mae,
                                        "rmse": rmse
                                        }])
                self.ledger = pd.concat([self.ledger, newrow], ignore_index=True)
                self.bar.update()
                self.saveCheckPoint()

        sample_size = 50000
        epochs = 50

        self.loader.smooth = None
        self.loader.reset()
        for window in windows:
            X_conv1d, Y_conv1d, G_conv1d = self.loader.load_dataset_window(window)
            self.logger.debug(f"X_conv1d: {X_conv1d.shape}")
            self.logger.debug(f"Y_conv1d: {Y_conv1d.shape}")
            self.logger.debug(f"G_conv1d: {G_conv1d.shape}")

            Y_discrete = discretizer.fit_transform(Y_conv1d)
            if Y_discrete.ndim == 2:
                Y_discrete = np.sum(Y_discrete, axis=1)

            dataset = f'Conv1d-base-w{window}'
            for i, (train_index, test_index) in enumerate(gss.split(X_conv1d, Y_discrete, G_conv1d)):
                self.logger.info("{:=^60}".format(f'CV {i+1}/{int(1/self.ratio)} {dataset}'))
                os.makedirs('{}/{}/{}-w{}'.format(self.name, self.loader.target, dataset, window), exist_ok=True)
                X_train, X_test = X_conv1d[train_index], X_conv1d[test_index]
                Y_train, Y_test = Y_conv1d[train_index], Y_conv1d[test_index]
                G_train, G_test = G_conv1d[train_index], G_conv1d[test_index]

                model_id = "Conv1d-base_{}".format(i)
                self.trained += 1
                if self.row_exists(dataset, model_id):
                    self.bar.update()
                    continue

                model_file = '{}/{}/{}-w{}/{}'.format(self.name, self.loader.target, dataset, window, model_id)

                # Tune on stratified subsamples so the Ray search stays tractable.
                X_train_sample, _, Y_train_sample, _ = train_test_split(X_train, Y_train, stratify=G_train, train_size=0.8*sample_size / len(X_train), random_state=get_seed())
                X_test_sample, _, Y_test_sample, _ = train_test_split(X_test, Y_test, stratify=G_test, train_size=0.2*sample_size / len(X_test), random_state=get_seed())
                self.logger.debug(f"Y_train_sample: {Y_train_sample.shape}")
                self.logger.debug(f"Y_test_sample: {Y_test_sample.shape}")

                optimized_model, analysis = self.search_best_conv1D_v1(X_train_sample, X_test_sample, Y_train_sample, Y_test_sample, epochs=10)
                model_params = analysis.get_best_config(metric='mse', mode='min')
                analysis.results_df.to_excel(f"{model_file}.search.xlsx", index=False)

                self.logger.info(f"Training Model {model_id} with {model_params}")
                optimized_model.fit(X_train, Y_train, epochs=epochs, batch_size=model_params['batch_size'], verbose=1)

                Y_train_pred = optimized_model.predict(X_train)
                Y_test_pred = optimized_model.predict(X_test)

                mse_train = mean_squared_error(Y_train, Y_train_pred)
                mae_test = mean_absolute_error(Y_test, Y_test_pred)
                mse_test = mean_squared_error(Y_test, Y_test_pred)
                rmse_test = np.sqrt(mse_test)

                ts = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
                optimized_model.save(f"{model_file}.keras")  # Keras 3 requires an explicit .keras suffix
                optimized_model.save_weights(f"{model_file}.weights.h5")
                newrow = pd.DataFrame([{"node": node,
                                        "ts": ts,
                                        "Dataset": dataset,
                                        "Samples": Y_conv1d.shape[0],
                                        "Target": self.loader.target,
                                        "Train Size": Y_train.shape[0],
                                        "Train Ratio": Y_train.shape[0]/Y_conv1d.shape[0],
                                        "Ratio": self.ratio,
                                        "Model": model_id,
                                        "Params": json.dumps(model_params),
                                        "Train mse": mse_train,
                                        "mse": mse_test,
                                        "mae": mae_test,
                                        "rmse": rmse_test
                                        }])
                self.ledger = pd.concat([self.ledger, newrow], ignore_index=True)
                self.bar.update()
                self.saveCheckPoint()

        self.loader.smooth = 'conv3'
        self.loader.reset()
        for window in windows:
            X_conv1d, Y_conv1d, G_conv1d = self.loader.load_dataset_window(window)
            self.logger.debug(f"X_conv1d: {X_conv1d.shape}")
            self.logger.debug(f"Y_conv1d: {Y_conv1d.shape}")
            self.logger.debug(f"G_conv1d: {G_conv1d.shape}")

            Y_discrete = discretizer.fit_transform(Y_conv1d)
            if Y_discrete.ndim == 2:
                Y_discrete = np.sum(Y_discrete, axis=1)

            dataset = f'Conv1d-base-w{window}-conv3'
            for i, (train_index, test_index) in enumerate(gss.split(X_conv1d, Y_discrete, G_conv1d)):
                self.logger.info("{:=^60}".format(f'CV {i+1}/{int(1/self.ratio)} {dataset}'))
                os.makedirs('{}/{}/{}-w{}'.format(self.name, self.loader.target, dataset, window), exist_ok=True)
                X_train, X_test = X_conv1d[train_index], X_conv1d[test_index]
                Y_train, Y_test = Y_conv1d[train_index], Y_conv1d[test_index]
                G_train, G_test = G_conv1d[train_index], G_conv1d[test_index]
                self.logger.debug(f"Y_train: {Y_train.shape}")
                self.logger.debug(f"Y_test: {Y_test.shape}")

                model_id = "Conv1d-base_{}".format(i)
                self.trained += 1
                if self.row_exists(dataset, model_id):
                    self.bar.update()
                    continue

                model_file = '{}/{}/{}-w{}/{}'.format(self.name, self.loader.target, dataset, window, model_id)

                X_train_sample, _, Y_train_sample, _ = train_test_split(X_train, Y_train, stratify=G_train, train_size=0.8*sample_size / len(X_train), random_state=get_seed())
                X_test_sample, _, Y_test_sample, _ = train_test_split(X_test, Y_test, stratify=G_test, train_size=0.2*sample_size / len(X_test), random_state=get_seed())
                self.logger.debug(f"Y_train_sample: {Y_train_sample.shape}")
                self.logger.debug(f"Y_test_sample: {Y_test_sample.shape}")

                optimized_model, analysis = self.search_best_conv1D_v1(X_train_sample, X_test_sample, Y_train_sample, Y_test_sample, epochs=epochs//3)
                model_params = analysis.get_best_config(metric='mse', mode='min')
                analysis.results_df.to_excel(f"{model_file}.search.xlsx", index=False)

                self.logger.info(f"Training Model {model_id} with {model_params}")
                optimized_model.fit(X_train, Y_train, epochs=epochs, batch_size=model_params['batch_size'], verbose=1)

                Y_train_pred = optimized_model.predict(X_train)
                Y_test_pred = optimized_model.predict(X_test)

                mse_train = mean_squared_error(Y_train, Y_train_pred)
                mae_test = mean_absolute_error(Y_test, Y_test_pred)
                mse_test = mean_squared_error(Y_test, Y_test_pred)
                rmse_test = np.sqrt(mse_test)

                ts = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
                optimized_model.save(f"{model_file}.keras")  # Keras 3 requires an explicit .keras suffix
                optimized_model.save_weights(f"{model_file}.weights.h5")
                newrow = pd.DataFrame([{"node": node,
                                        "ts": ts,
                                        "Dataset": dataset,
                                        "Samples": Y_conv1d.shape[0],
                                        "Target": self.loader.target,
                                        "Train Size": Y_train.shape[0],
                                        "Train Ratio": Y_train.shape[0]/Y_conv1d.shape[0],
                                        "Ratio": self.ratio,
                                        "Model": model_id,
                                        "Params": json.dumps(model_params),
                                        "Train mse": mse_train,
                                        "mse": mse_test,
                                        "mae": mae_test,
                                        "rmse": rmse_test
                                        }])
                self.ledger = pd.concat([self.ledger, newrow], ignore_index=True)
                self.bar.update()
                self.saveCheckPoint()

        self.bar.close()
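

# Minimal usage sketch (illustrative; `SensorLoader` is a placeholder for whatever
# loader class supplies label_file, target, stats(), reset() and the load_dataset_*
# methods this trainer expects):
#   loader = SensorLoader('labels.xlsx')
#   trainer = eNoseTrainer(loader, test_size=0.2, debug=True)
#   trainer.fit()
#   trainer.wrap_and_save()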