main
ifiguero 2025-03-16 19:41:52 -03:00
parent 7695340e91
commit 15af3d3745
1 changed file with 120 additions and 90 deletions


@ -7,7 +7,7 @@ import matplotlib
matplotlib.rcParams['text.usetex'] = True
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import StratifiedGroupKFold, StratifiedShuffleSplit, GridSearchCV
from sklearn.model_selection import StratifiedGroupKFold, StratifiedShuffleSplit, GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
@ -40,6 +40,7 @@ import json
import os
def get_seed():
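# Draw an independent 32-bit seed per call; used as random_state for the stratified subsampling splits below.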
return random.randint(0, 2**32 - 1)
@ -153,80 +154,6 @@ class eNoseTrainer:
self.bar.update()
return mse, mae, rmse, optimized_model, model_params
def train_and_score_conv1D_v1(self, X_train, X_test, Y_train, Y_test, epochs=30, num_samples=25):
ray.init(ignore_reinit_error=True)
X_train_ref = ray.put(X_train)
Y_train_ref = ray.put(Y_train)
X_test_ref = ray.put(X_test)
Y_test_ref = ray.put(Y_test)
def build_model(config, input_shape, output_dim):
model = keras.Sequential([
layers.Conv1D(filters=config['filters'], kernel_size=config['kernel_size'], activation='relu', input_shape=input_shape),
layers.MaxPooling1D(pool_size=config['pool_size']),
layers.Conv1D(filters=config['filters'] * 2, kernel_size=config['kernel_size'], activation='relu'),
layers.MaxPooling1D(pool_size=config['pool_size']),
layers.Flatten(),
layers.Dense(config['dense_units'], activation='relu'),
layers.Dropout(config['dropout']),
layers.Dense(output_dim)
])
model.compile(optimizer=keras.optimizers.Adam(learning_rate=config['lr']), loss='mse')
return model
def train_model(config):
input_shape = X_train.shape[1:]
output_dim = Y_train.shape[1]
model = build_model(config, input_shape, output_dim)
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model.fit(
X_train, Y_train,
validation_data=(X_test, Y_test),
epochs=epochs,
batch_size=config['batch_size'],
verbose=0,
callbacks=[early_stopping]
)
Y_pred = model.predict(X_test)
mse = mean_squared_error(Y_test, Y_pred)
tune.report(mse=mse)
config_space = {
'filters': tune.choice([32, 64, 128]),
'kernel_size': tune.choice([3, 5]),
'pool_size': tune.choice([2, 3]),
'dense_units': tune.choice([32, 64, 128]),
'dropout': tune.choice([0.1, 0.2, 0.3]),
'lr': tune.choice([0.001, 0.0005, 0.0001]),
'batch_size': tune.choice([16, 32, 64])
}
scheduler = ASHAScheduler(metric='mse', mode='min', max_t=epochs, grace_period=5, reduction_factor=2)
# analysis = tune.run(train_model, config=config_space, num_samples=num_samples, scheduler=scheduler)
analysis = tune.run(
tune.with_parameters(train_model, X_train=ray.get(X_train_ref), Y_train=ray.get(Y_train_ref), X_test=ray.get(X_test_ref), Y_test=ray.get(Y_test_ref)),
config=config_space, num_samples=num_samples, scheduler=scheduler
)
best_config = analysis.get_best_config(metric='mse', mode='min')
best_model = build_model(best_config, X_train.shape[1:], Y_train.shape[1])
best_model.fit(X_train, Y_train, epochs=epochs, batch_size=best_config['batch_size'], verbose=0)
Y_train_pred = best_model.predict(X_train)
Y_test_pred = best_model.predict(X_test)
mse_train = mean_squared_error(Y_train, Y_train_pred)
mae_test = mean_absolute_error(Y_test, Y_test_pred)
mse_test = mean_squared_error(Y_test, Y_test_pred)
rmse_test = np.sqrt(mse_test)
# # Calculate evaluation metrics
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
return mse_train, mae_test, mse_test, rmse_test, best_model, best_config
def get_model_train(self):
return [
XGBRegressor(objective='reg:squarederror'),
@ -250,6 +177,76 @@ class eNoseTrainer:
else:
return {}
def train_and_score_conv1D_v1(self, X_train_orig, X_test_orig, Y_train_orig, Y_test_orig, epochs=30, num_samples=25):
ray.init(ignore_reinit_error=True)
X_train_ref = ray.put(X_train_orig)
Y_train_ref = ray.put(Y_train_orig)
X_test_ref = ray.put(X_test_orig)
Y_test_ref = ray.put(Y_test_orig)
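# The four ray.put calls above stage the arrays in the Ray object store once, so every Tune trial fetches them by reference instead of re-serializing the data per trial.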
def build_model_conv1D(config, input_shape, output_dim):
model = keras.Sequential([
layers.Conv1D(filters=config['filters'], kernel_size=config['kernel_size'], activation='relu', input_shape=input_shape),
layers.MaxPooling1D(pool_size=config['pool_size']),
layers.Conv1D(filters=config['filters'] * 2, kernel_size=config['kernel_size'], activation='relu'),
layers.MaxPooling1D(pool_size=config['pool_size']),
layers.Flatten(),
layers.Dense(config['dense_units'], activation='relu'),
layers.Dropout(config['dropout']),
layers.Dense(output_dim)
])
model.compile(optimizer=keras.optimizers.Adam(learning_rate=config['lr']), loss='mse')
return model
def train_model_conv1D(config):
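# Pull the shared datasets out of the object store inside the trial process.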
X_trainc1D = ray.get(X_train_ref)
Y_trainc1D = ray.get(Y_train_ref)
X_testc1D = ray.get(X_test_ref)
Y_testc1D = ray.get(Y_test_ref)
input_shape = X_trainc1D.shape[1:]
output_dim = Y_trainc1D.shape[1]
model = build_model_conv1D(config, input_shape, output_dim)
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model.fit(
X_trainc1D, Y_trainc1D,
validation_data=(X_testc1D, Y_testc1D),
epochs=config['epochs'],
batch_size=config['batch_size'],
verbose=0,
callbacks=[early_stopping]
)
Y_pred = model.predict(X_testc1D)
mse = mean_squared_error(Y_testc1D, Y_pred)
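# Dict-style report targets newer Ray Tune releases; the removed version above used keyword arguments (tune.report(mse=mse)).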
tune.report({'mse': mse})
config_space = {
'filters': tune.choice([32, 64, 128]),
'kernel_size': tune.choice([3, 5]),
'pool_size': tune.choice([2, 3]),
'dense_units': tune.choice([32, 64, 128]),
'dropout': tune.choice([0.1, 0.2, 0.3]),
'lr': tune.choice([0.001, 0.0005, 0.0001]),
'batch_size': tune.choice([16, 32, 64]),
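# 'epochs' is a fixed entry, not a search dimension; train_model_conv1D reads it back from the sampled config.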
'epochs': epochs
}
scheduler = ASHAScheduler(metric='mse', mode='min', max_t=epochs, grace_period=5, reduction_factor=2)
analysis = tune.run(
tune.with_parameters(train_model_conv1D),
config=config_space, num_samples=num_samples, scheduler=scheduler, max_concurrent_trials=3
)
best_config = analysis.get_best_config(metric='mse', mode='min')
best_model = build_model_conv1D(best_config, X_train_orig.shape[1:], Y_train_orig.shape[1])
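# Best-effort eager release of the staged arrays; ray.internal.free is a low-level API whose availability varies across Ray versions.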
ray.internal.free([X_train_ref, Y_train_ref, X_test_ref, Y_test_ref])
ray.shutdown()
return best_model, best_config
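A minimal call sketch, under assumed names (the real callers are the fit() loops below): the returned model is compiled but unfitted, so the caller refits it on the full split using the tuned epochs and batch size.
# Hedged sketch: Xtr_s/Xte_s/Ytr_s/Yte_s are hypothetical stratified subsamples; X_train, Y_train, X_test, Y_test the full split.
best_model, best_config = self.train_and_score_conv1D_v1(Xtr_s, Xte_s, Ytr_s, Yte_s)
best_model.fit(X_train, Y_train, epochs=best_config['epochs'], batch_size=best_config['batch_size'], verbose=0)
rmse = np.sqrt(mean_squared_error(Y_test, best_model.predict(X_test)))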
def train_and_score_model(self, model, X_train, X_test, Y_train, Y_test):
param_dist = self.get_tunable_params(model)
@ -357,8 +354,8 @@ class eNoseTrainer:
plt.close()
def fit(self):
windows = [16, 32, 64, 128]
total_train_queue = 2*int(1/self.ratio)*(len(self.get_model_train())+1)
windows = [32, 64, 128]
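# Budget: two dataset variants (base and conv3) x folds x (number of tabular models + one Conv1D per window).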
total_train_queue = 2*int(1/self.ratio)*(len(self.get_model_train())+len(windows))
self.logger.info("{:=^60}".format(f'Begin Fit {total_train_queue} Models'))
self.trained = 0
manager = enlighten.get_manager()
@ -385,7 +382,7 @@ class eNoseTrainer:
dataset = 'Tabular'
for i, (train_index, test_index) in enumerate(gss.split(X_xboost, Y_discrete, G_xboost)):
self.logger.info("{:=^60}".format(f'CV {i}/{int(1/self.ratio)} {dataset}'))
self.logger.info("{:=^60}".format(f'CV {i+1}/{int(1/self.ratio)} {dataset}'))
os.makedirs('{}/{}/{}'.format(self.name, self.loader.target, dataset), exist_ok=True)
X_train, X_test = X_xboost[train_index], X_xboost[test_index]
Y_train, Y_test = Y_xboost[train_index], Y_xboost[test_index]
@ -443,7 +440,7 @@ class eNoseTrainer:
dataset = 'Tabular-conv3'
for i, (train_index, test_index) in enumerate(gss.split(X_xboost, Y_discrete, G_xboost)):
self.logger.info("{:=^60}".format(f'CV {i}/{int(1/self.ratio)} {dataset}'))
self.logger.info("{:=^60}".format(f'CV {i+1}/{int(1/self.ratio)} {dataset}'))
os.makedirs('{}/{}/{}'.format(self.name, self.loader.target, dataset), exist_ok=True)
X_train, X_test = X_xboost[train_index], X_xboost[test_index]
Y_train, Y_test = Y_xboost[train_index], Y_xboost[test_index]
@ -502,15 +499,14 @@ class eNoseTrainer:
dataset = f'Conv1d-base-w{window}'
for i, (train_index, test_index) in enumerate(gss.split(X_conv1d, Y_discrete, G_conv1d)):
self.logger.info("{:=^60}".format(f'CV {i}/{int(1/self.ratio)} {dataset}'))
self.logger.info("{:=^60}".format(f'CV {i+1}/{int(1/self.ratio)} {dataset}'))
os.makedirs('{}/{}/{}-w{}'.format(self.name, self.loader.target, dataset, window), exist_ok=True)
X_train, X_test = X_conv1d[train_index], X_conv1d[test_index]
Y_train, Y_test = Y_conv1d[train_index], Y_conv1d[test_index]
G_train, G_test = G_conv1d[train_index], G_conv1d[test_index]
# self.logger.debug(f"X_train: {X_train.shape}")
# self.logger.debug(f"X_test: {X_test.shape}")
self.logger.debug(f"Y_train: {Y_train.shape}")
self.logger.debug(f"Y_test: {Y_test.shape}")
model_id = "Conv1d-base_{}".format(i)
self.trained += 1
@ -521,7 +517,25 @@ class eNoseTrainer:
model_file = '{}/{}/{}-w{}/{}'.format(self.name, self.loader.target, dataset, window, model_id )
tmse, mse, mae, rmse, optimized_model, model_params = self.train_and_score_conv1D_v1(X_train, X_test, Y_train, Y_test)
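# Tune on a stratified subsample (about 25k train rows, with a validation sample 20% of that size) so Ray Tune trials stay cheap; the best config is refit on the full split below.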
sample_size = 25000
X_train_sample, _, Y_train_sample, _ = train_test_split(X_train, Y_train, stratify=G_train, train_size=sample_size / len(X_train), random_state=get_seed())
X_test_sample, _, Y_test_sample, _ = train_test_split(X_test, Y_test, stratify=G_test, train_size=0.2*sample_size / len(X_test), random_state=get_seed())
self.logger.debug(f"Y_train_sample: {Y_train_sample.shape}")
self.logger.debug(f"Y_test_sample: {Y_test_sample.shape}")
optimized_model, model_params = self.train_and_score_conv1D_v1(X_train_sample, X_test_sample, Y_train_sample, Y_test_sample)
optimized_model.fit(X_train, Y_train, epochs=model_params['epochs'], batch_size=model_params['batch_size'], verbose=0)
Y_train_pred = optimized_model.predict(X_train)
Y_test_pred = optimized_model.predict(X_test)
mse_train = mean_squared_error(Y_train, Y_train_pred)
mae_test = mean_absolute_error(Y_test, Y_test_pred)
mse_test = mean_squared_error(Y_test, Y_test_pred)
rmse_test = np.sqrt(mse_test)
ts = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
optimized_model.save(model_file)
@ -537,10 +551,10 @@ class eNoseTrainer:
"Ratio": self.ratio,
"Model": model_id,
"Params": json.dumps(model_params),
"Train mse": tmse,
"mse": mse,
"mae": mae,
"rmse": rmse
"Train mse": mse_train,
"mse": mse_test,
"mae": mae_test,
"rmse": rmse_test
}] )
self.ledger = pd.concat([self.ledger, newrow], ignore_index=True)
self.bar.update()
@ -560,11 +574,12 @@ class eNoseTrainer:
dataset = f'Conv1d-base-w{window}-conv3'
for i, (train_index, test_index) in enumerate(gss.split(X_conv1d, Y_discrete, G_conv1d)):
self.logger.info("{:=^60}".format(f'CV {i}/{int(1/self.ratio)} {dataset}'))
self.logger.info("{:=^60}".format(f'CV {i+1}/{int(1/self.ratio)} {dataset}'))
os.makedirs('{}/{}/{}-w{}'.format(self.name, self.loader.target, dataset, window), exist_ok=True)
X_train, X_test = X_conv1d[train_index], X_conv1d[test_index]
Y_train, Y_test = Y_conv1d[train_index], Y_conv1d[test_index]
G_train, G_test = G_conv1d[train_index], G_conv1d[test_index]
# self.logger.debug(f"X_train: {X_train.shape}")
# self.logger.debug(f"X_test: {X_test.shape}")
self.logger.debug(f"Y_train: {Y_train.shape}")
@ -578,9 +593,24 @@ class eNoseTrainer:
self.bar.update()
continue
model_file = '{}/{}/{}-w{}/{}'.format(self.name, self.loader.target, dataset, window, model_id )
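# Same subsample-then-refit scheme as the base-window loop above.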
sample_size = 25000
X_train_sample, _, Y_train_sample, _ = train_test_split(X_train, Y_train, stratify=G_train, train_size=sample_size / len(X_train), random_state=get_seed())
X_test_sample, _, Y_test_sample, _ = train_test_split(X_test, Y_test, stratify=G_test, train_size=0.2*sample_size / len(X_test), random_state=get_seed())
tmse, mse, mae, rmse, optimized_model, model_params = self.train_and_score_conv1D_v1(X_train, X_test, Y_train, Y_test)
self.logger.debug(f"Y_train_sample: {Y_train_sample.shape}")
self.logger.debug(f"Y_test_sample: {Y_test_sample.shape}")
optimized_model, model_params = self.train_and_score_conv1D_v1(X_train_sample, X_test_sample, Y_train_sample, Y_test_sample)
optimized_model.fit(X_train, Y_train, epochs=model_params['epochs'], batch_size=model_params['batch_size'], verbose=0)
Y_train_pred = optimized_model.predict(X_train)
Y_test_pred = optimized_model.predict(X_test)
mse_train = mean_squared_error(Y_train, Y_train_pred)
mae_test = mean_absolute_error(Y_test, Y_test_pred)
mse_test = mean_squared_error(Y_test, Y_test_pred)
rmse_test = np.sqrt(mse_test)
ts = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
optimized_model.save(model_file)