diff --git a/TrainerClass.py b/TrainerClass.py
index b010479..a26e153 100644
--- a/TrainerClass.py
+++ b/TrainerClass.py
@@ -7,7 +7,7 @@ import matplotlib
 matplotlib.rcParams['text.usetex'] = True
 
 from sklearn.preprocessing import KBinsDiscretizer
-from sklearn.model_selection import StratifiedGroupKFold, StratifiedShuffleSplit, GridSearchCV
+from sklearn.model_selection import StratifiedGroupKFold, StratifiedShuffleSplit, GridSearchCV, train_test_split
 from sklearn.metrics import mean_squared_error, mean_absolute_error
 from sklearn.preprocessing import MinMaxScaler
 
@@ -40,6 +40,7 @@ import json
 import os
 
+
 def get_seed():
     return random.randint(0, 2**32 - 1)
 
@@ -153,80 +154,6 @@ class eNoseTrainer:
             self.bar.update()
 
         return mse, mae, rmse, optimized_model, model_params
-
-    def train_and_score_conv1D_v1(self, X_train, X_test, Y_train, Y_test, epochs=30, num_samples=25):
-        ray.init(ignore_reinit_error=True)
-        X_train_ref = ray.put(X_train)
-        Y_train_ref = ray.put(Y_train)
-        X_test_ref = ray.put(X_test)
-        Y_test_ref = ray.put(Y_test)
-
-        def build_model(config, input_shape, output_dim):
-            model = keras.Sequential([
-                layers.Conv1D(filters=config['filters'], kernel_size=config['kernel_size'], activation='relu', input_shape=input_shape),
-                layers.MaxPooling1D(pool_size=config['pool_size']),
-                layers.Conv1D(filters=config['filters'] * 2, kernel_size=config['kernel_size'], activation='relu'),
-                layers.MaxPooling1D(pool_size=config['pool_size']),
-                layers.Flatten(),
-                layers.Dense(config['dense_units'], activation='relu'),
-                layers.Dropout(config['dropout']),
-                layers.Dense(output_dim)
-            ])
-            model.compile(optimizer=keras.optimizers.Adam(learning_rate=config['lr']), loss='mse')
-            return model
-
-        def train_model(config):
-            input_shape = X_train.shape[1:]
-            output_dim = Y_train.shape[1]
-
-            model = build_model(config, input_shape, output_dim)
-            early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
-
-            model.fit(
-                X_train, Y_train,
-                validation_data=(X_test, Y_test),
-                epochs=epochs,
-                batch_size=config['batch_size'],
-                verbose=0,
-                callbacks=[early_stopping]
-            )
-
-            Y_pred = model.predict(X_test)
-            mse = mean_squared_error(Y_test, Y_pred)
-            tune.report(mse=mse)
-
-        config_space = {
-            'filters': tune.choice([32, 64, 128]),
-            'kernel_size': tune.choice([3, 5]),
-            'pool_size': tune.choice([2, 3]),
-            'dense_units': tune.choice([32, 64, 128]),
-            'dropout': tune.choice([0.1, 0.2, 0.3]),
-            'lr': tune.choice([0.001, 0.0005, 0.0001]),
-            'batch_size': tune.choice([16, 32, 64])
-        }
-
-        scheduler = ASHAScheduler(metric='mse', mode='min', max_t=epochs, grace_period=5, reduction_factor=2)
-        # analysis = tune.run(train_model, config=config_space, num_samples=num_samples, scheduler=scheduler)
-        analysis = tune.run(
-            tune.with_parameters(train_model, X_train=ray.get(X_train_ref), Y_train=ray.get(Y_train_ref), X_test=ray.get(X_test_ref), Y_test=ray.get(Y_test_ref)),
-            config=config_space, num_samples=num_samples, scheduler=scheduler
-        )
-        best_config = analysis.get_best_config(metric='mse', mode='min')
-        best_model = build_model(best_config, X_train.shape[1:], Y_train.shape[1])
-        best_model.fit(X_train, Y_train, epochs=epochs, batch_size=best_config['batch_size'], verbose=0)
-
-        Y_train_pred = best_model.predict(X_train)
-        Y_test_pred = best_model.predict(X_test)
-
-        mse_train = mean_squared_error(Y_train, Y_train_pred)
-        mae_test = mean_absolute_error(Y_test, Y_test_pred)
-        mse_test = mean_squared_error(Y_test, Y_test_pred)
-        rmse_test = np.sqrt(mse_test)
-
-# #  Calculate evaluation metrics
-        mse = mean_squared_error(Y_test, y_pred)
-        rmse = np.sqrt(mse)
-
-        return mse_train, mae_test, mse_test, rmse_test, best_model, best_config
 
     def get_model_train(self):
         return [
             XGBRegressor(objective='reg:squarederror'),
@@ -250,6 +177,76 @@ class eNoseTrainer:
         else:
             return {}
 
+    def train_and_score_conv1D_v1(self, X_train_orig, X_test_orig, Y_train_orig, Y_test_orig, epochs=30, num_samples=25):
+        ray.init(ignore_reinit_error=True)
+        # Stage the arrays in the Ray object store once, so every trial
+        # dereferences the shared copies instead of serializing the data.
+        X_train_ref = ray.put(X_train_orig)
+        Y_train_ref = ray.put(Y_train_orig)
+        X_test_ref = ray.put(X_test_orig)
+        Y_test_ref = ray.put(Y_test_orig)
+
+        def build_model_conv1D(config, input_shape, output_dim):
+            model = keras.Sequential([
+                layers.Conv1D(filters=config['filters'], kernel_size=config['kernel_size'], activation='relu', input_shape=input_shape),
+                layers.MaxPooling1D(pool_size=config['pool_size']),
+                layers.Conv1D(filters=config['filters'] * 2, kernel_size=config['kernel_size'], activation='relu'),
+                layers.MaxPooling1D(pool_size=config['pool_size']),
+                layers.Flatten(),
+                layers.Dense(config['dense_units'], activation='relu'),
+                layers.Dropout(config['dropout']),
+                layers.Dense(output_dim)
+            ])
+            model.compile(optimizer=keras.optimizers.Adam(learning_rate=config['lr']), loss='mse')
+            return model
+
+        def train_model_conv1D(config):
+            # Pull the datasets from the object store inside the trial.
+            X_trainc1D = ray.get(X_train_ref)
+            Y_trainc1D = ray.get(Y_train_ref)
+            X_testc1D = ray.get(X_test_ref)
+            Y_testc1D = ray.get(Y_test_ref)
+
+            input_shape = X_trainc1D.shape[1:]
+            output_dim = Y_trainc1D.shape[1]
+
+            model = build_model_conv1D(config, input_shape, output_dim)
+            early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
+
+            model.fit(
+                X_trainc1D, Y_trainc1D,
+                validation_data=(X_testc1D, Y_testc1D),
+                epochs=config['epochs'],
+                batch_size=config['batch_size'],
+                verbose=0,
+                callbacks=[early_stopping]
+            )
+
+            Y_pred = model.predict(X_testc1D)
+            mse = mean_squared_error(Y_testc1D, Y_pred)
+            tune.report({'mse': mse})
+
+        config_space = {
+            'filters': tune.choice([32, 64, 128]),
+            'kernel_size': tune.choice([3, 5]),
+            'pool_size': tune.choice([2, 3]),
+            'dense_units': tune.choice([32, 64, 128]),
+            'dropout': tune.choice([0.1, 0.2, 0.3]),
+            'lr': tune.choice([0.001, 0.0005, 0.0001]),
+            'batch_size': tune.choice([16, 32, 64]),
+            'epochs': epochs
+        }
+
+        scheduler = ASHAScheduler(metric='mse', mode='min', max_t=epochs, grace_period=5, reduction_factor=2)
+        analysis = tune.run(train_model_conv1D, config=config_space, num_samples=num_samples, scheduler=scheduler, max_concurrent_trials=3)
+        best_config = analysis.get_best_config(metric='mse', mode='min')
+        # Build from the original arrays: ObjectRefs have no .shape attribute.
+        best_model = build_model_conv1D(best_config, X_train_orig.shape[1:], Y_train_orig.shape[1])
+
+        # ray.shutdown() tears down the local object store, releasing the refs.
+        ray.shutdown()
+        return best_model, best_config
+
     def train_and_score_model(self, model, X_train, X_test, Y_train, Y_test):
         param_dist = self.get_tunable_params(model)
 
@@ -357,8 +354,8 @@ class eNoseTrainer:
         plt.close()
 
     def fit(self):
-        windows = [16, 32, 64, 128]
-        total_train_queue = 2*int(1/self.ratio)*(len(self.get_model_train())+1)
+        windows = [32, 64, 128]
+        total_train_queue = 2*int(1/self.ratio)*(len(self.get_model_train())+len(windows))
 
         self.logger.info("{:=^60}".format(f'Begin Fit {total_train_queue} Models'))
         self.trained = 0
         manager = enlighten.get_manager()
@@ -385,7 +382,7 @@ class eNoseTrainer:
 
         dataset = 'Tabular'
         for i, (train_index, test_index) in enumerate(gss.split(X_xboost, Y_discrete, G_xboost)):
-            self.logger.info("{:=^60}".format(f'CV {i}/{int(1/self.ratio)} {dataset}'))
+            self.logger.info("{:=^60}".format(f'CV {i+1}/{int(1/self.ratio)} {dataset}'))
             os.makedirs('{}/{}/{}'.format(self.name, self.loader.target, dataset), exist_ok=True)
 
             X_train, X_test = X_xboost[train_index], X_xboost[test_index]
             Y_train, Y_test = Y_xboost[train_index], Y_xboost[test_index]
 
@@ -443,7 +440,7 @@ class eNoseTrainer:
 
         dataset = 'Tabular-conv3'
         for i, (train_index, test_index) in enumerate(gss.split(X_xboost, Y_discrete, G_xboost)):
-            self.logger.info("{:=^60}".format(f'CV {i}/{int(1/self.ratio)} {dataset}'))
+            self.logger.info("{:=^60}".format(f'CV {i+1}/{int(1/self.ratio)} {dataset}'))
             os.makedirs('{}/{}/{}'.format(self.name, self.loader.target, dataset), exist_ok=True)
 
             X_train, X_test = X_xboost[train_index], X_xboost[test_index]
             Y_train, Y_test = Y_xboost[train_index], Y_xboost[test_index]
 
@@ -502,15 +499,14 @@ class eNoseTrainer:
 
             dataset = f'Conv1d-base-w{window}'
             for i, (train_index, test_index) in enumerate(gss.split(X_conv1d, Y_discrete, G_conv1d)):
-                self.logger.info("{:=^60}".format(f'CV {i}/{int(1/self.ratio)} {dataset}'))
+                self.logger.info("{:=^60}".format(f'CV {i+1}/{int(1/self.ratio)} {dataset}'))
                 os.makedirs('{}/{}/{}-w{}'.format(self.name, self.loader.target, dataset, window), exist_ok=True)
 
                 X_train, X_test = X_conv1d[train_index], X_conv1d[test_index]
                 Y_train, Y_test = Y_conv1d[train_index], Y_conv1d[test_index]
+                G_train, G_test = G_conv1d[train_index], G_conv1d[test_index]
 
                 # self.logger.debug(f"X_train: {X_train.shape}")
                 # self.logger.debug(f"X_test: {X_test.shape}")
-                self.logger.debug(f"Y_train: {Y_train.shape}")
-                self.logger.debug(f"Y_test: {Y_test.shape}")
 
                 model_id = "Conv1d-base_{}".format(i)
                 self.trained += 1
@@ -521,7 +517,25 @@ class eNoseTrainer:
 
                 model_file = '{}/{}/{}-w{}/{}'.format(self.name, self.loader.target, dataset, window, model_id )
 
-                tmse, mse, mae, rmse, optimized_model, model_params = self.train_and_score_conv1D_v1(X_train, X_test, Y_train, Y_test)
+                # Tune on a stratified subsample so the Ray search stays tractable.
+                sample_size = 25000
+                X_train_sample, _, Y_train_sample, _ = train_test_split(X_train, Y_train, stratify=G_train, train_size=sample_size / len(X_train), random_state=get_seed())
+                X_test_sample, _, Y_test_sample, _ = train_test_split(X_test, Y_test, stratify=G_test, train_size=0.2*sample_size / len(X_test), random_state=get_seed())
+
+                self.logger.debug(f"Y_train_sample: {Y_train_sample.shape}")
+                self.logger.debug(f"Y_test_sample: {Y_test_sample.shape}")
+
+                optimized_model, model_params = self.train_and_score_conv1D_v1(X_train_sample, X_test_sample, Y_train_sample, Y_test_sample)
+
+                # Refit the tuned model on the full fold before scoring.
+                optimized_model.fit(X_train, Y_train, epochs=model_params['epochs'], batch_size=model_params['batch_size'], verbose=0)
+
+                Y_train_pred = optimized_model.predict(X_train)
+                Y_test_pred = optimized_model.predict(X_test)
+
+                mse_train = mean_squared_error(Y_train, Y_train_pred)
+                mae_test = mean_absolute_error(Y_test, Y_test_pred)
+                mse_test = mean_squared_error(Y_test, Y_test_pred)
+                rmse_test = np.sqrt(mse_test)
 
                 ts = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
                 optimized_model.save(model_file)
@@ -537,10 +551,10 @@ class eNoseTrainer:
                                         "Ratio": self.ratio,
                                         "Model": model_id,
                                         "Params": json.dumps(model_params),
-                                        "Train mse": tmse,
-                                        "mse": mse,
-                                        "mae": mae,
-                                        "rmse": rmse
+                                        "Train mse": mse_train,
+                                        "mse": mse_test,
+                                        "mae": mae_test,
+                                        "rmse": rmse_test
                                     }]
                 )
                 self.ledger = pd.concat([self.ledger, newrow], ignore_index=True)
                 self.bar.update()
 
@@ -560,11 +574,12 @@ class eNoseTrainer:
 
             dataset = f'Conv1d-base-w{window}-conv3'
             for i, (train_index, test_index) in enumerate(gss.split(X_conv1d, Y_discrete, G_conv1d)):
-                self.logger.info("{:=^60}".format(f'CV {i}/{int(1/self.ratio)} {dataset}'))
+                self.logger.info("{:=^60}".format(f'CV {i+1}/{int(1/self.ratio)} {dataset}'))
                 os.makedirs('{}/{}/{}-w{}'.format(self.name, self.loader.target, dataset, window), exist_ok=True)
 
                 X_train, X_test = X_conv1d[train_index], X_conv1d[test_index]
                 Y_train, Y_test = Y_conv1d[train_index], Y_conv1d[test_index]
+                G_train, G_test = G_conv1d[train_index], G_conv1d[test_index]
 
                 # self.logger.debug(f"X_train: {X_train.shape}")
                 # self.logger.debug(f"X_test: {X_test.shape}")
                 self.logger.debug(f"Y_train: {Y_train.shape}")
@@ -578,9 +593,24 @@ class eNoseTrainer:
                     self.bar.update()
                     continue
 
                 model_file = '{}/{}/{}-w{}/{}'.format(self.name, self.loader.target, dataset, window, model_id )
 
-                tmse, mse, mae, rmse, optimized_model, model_params = self.train_and_score_conv1D_v1(X_train, X_test, Y_train, Y_test)
+                # Tune on a stratified subsample so the Ray search stays tractable.
+                sample_size = 25000
+                X_train_sample, _, Y_train_sample, _ = train_test_split(X_train, Y_train, stratify=G_train, train_size=sample_size / len(X_train), random_state=get_seed())
+                X_test_sample, _, Y_test_sample, _ = train_test_split(X_test, Y_test, stratify=G_test, train_size=0.2*sample_size / len(X_test), random_state=get_seed())
+
+                self.logger.debug(f"Y_train_sample: {Y_train_sample.shape}")
+                self.logger.debug(f"Y_test_sample: {Y_test_sample.shape}")
+
+                optimized_model, model_params = self.train_and_score_conv1D_v1(X_train_sample, X_test_sample, Y_train_sample, Y_test_sample)
+
+                # Refit the tuned model on the full fold before scoring.
+                optimized_model.fit(X_train, Y_train, epochs=model_params['epochs'], batch_size=model_params['batch_size'], verbose=0)
+
+                Y_train_pred = optimized_model.predict(X_train)
+                Y_test_pred = optimized_model.predict(X_test)
+
+                mse_train = mean_squared_error(Y_train, Y_train_pred)
+                mae_test = mean_absolute_error(Y_test, Y_test_pred)
+                mse_test = mean_squared_error(Y_test, Y_test_pred)
+                rmse_test = np.sqrt(mse_test)
 
                 ts = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
                 optimized_model.save(model_file)
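
Reviewer note (not part of the patch): below is a minimal, self-contained sketch of the two patterns this diff introduces, taking a stratified subsample before tuning and staging the arrays in the Ray object store so trials dereference shared copies. The ridge-regression stand-in and every name in it (`trainable`, `lam`, the synthetic `X`, `y`, `G`) are illustrative assumptions, not code from TrainerClass.py; it also assumes a recent Ray where `tune.report` accepts a metrics dict, matching the patched call.

```python
import numpy as np
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from sklearn.model_selection import train_test_split

if __name__ == "__main__":
    ray.init(ignore_reinit_error=True)

    # Synthetic stand-ins for a fold: features, targets, group labels.
    rng = np.random.default_rng(0)
    X = rng.random((10_000, 32), dtype=np.float32)
    y = (X @ rng.random(32, dtype=np.float32)).reshape(-1, 1)
    G = rng.integers(0, 5, size=len(X))

    # Stratified subsample, as the patch does before handing data to the tuner.
    X_s, _, y_s, _ = train_test_split(X, y, stratify=G, train_size=0.25, random_state=0)

    # Put the subsample in the object store once; trials dereference the
    # shared copies instead of re-serializing the arrays into every trial.
    X_ref, y_ref = ray.put(X_s), ray.put(y_s)

    def trainable(config):
        X_local, y_local = ray.get(X_ref), ray.get(y_ref)
        # Stand-in "model": ridge regression via the normal equations.
        lam = config["lam"]
        w = np.linalg.solve(
            X_local.T @ X_local + lam * np.eye(X_local.shape[1], dtype=np.float32),
            X_local.T @ y_local,
        )
        mse = float(np.mean((X_local @ w - y_local) ** 2))
        tune.report({"mse": mse})  # dict form, as in the patched trainable

    scheduler = ASHAScheduler(metric="mse", mode="min", max_t=1, grace_period=1)
    analysis = tune.run(
        trainable,
        config={"lam": tune.choice([0.01, 0.1, 1.0])},
        num_samples=4,
        scheduler=scheduler,
        max_concurrent_trials=2,
    )
    print(analysis.get_best_config(metric="mse", mode="min"))
    ray.shutdown()
```

Closing over the ObjectRefs rather than the arrays keeps each trial's payload to a few bytes, and `ray.get` inside the trainable is typically a zero-copy read for numpy arrays on the same node.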