diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6bc968a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+*xlsx
+pics
+*zip
+*log
+*pkl
+*pyc
+*_dataset
diff --git a/LoaderClass.py b/LoaderClass.py
new file mode 100644
index 0000000..582cc99
--- /dev/null
+++ b/LoaderClass.py
@@ -0,0 +1,434 @@
+import os
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import pickle
+import logging
+from sklearn.preprocessing import MinMaxScaler
+
+# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+class GasSensorDataLoader:
+    def __init__(self, label_file, force_overwrite=False, output_format="png", lower_limit=-0.01, threshold=0.7, target_list=['C2H2', 'CH4', 'C3H6', 'CO', 'C2H6', 'C3H8', 'C2H4', 'H2', 'O2'], source_channels=["MQ 8", "MQ 9", "MQ 135", "TGS 813", "TGS 821", "TGS 2600", "TGS 2602", "TGS 2611-0", "TGS 2612", "TGS 2620"], debug=False):
+        self.label_file = label_file
+        self.main_file = f"{self.label_file}/{self.label_file}.xlsx"
+        self.data_folder = os.path.splitext(label_file)[0]
+        self.state_file = f"{self.label_file}.pkl"
+        self.lower_limit = lower_limit
+        self.data = None
+        self.debug = debug
+        self.threshold = threshold
+        self.dataset = {}
+        self.dataset['threshold'] = self.threshold
+        self.dataset['range'] = {}
+
+        self.samples = {}
+        self.target_list = sorted(target_list)
+        self.target_len = len(self.target_list)
+        self.source_channels = sorted(source_channels)
+        self.force_overwrite = force_overwrite
+        self.output_format = output_format.lower()
+
+        # The logger must be configured before create_pics_folder(), which logs.
+        self.logger = logging.getLogger("GasSensorDataLoader")
+        if self.debug:
+            self.logger.setLevel(logging.DEBUG)
+        else:
+            self.logger.setLevel(logging.INFO)
+
+        self.pics_folder = self.create_pics_folder()
+
+        if self.output_format not in ["png", "pdf"]:
+            raise ValueError("Invalid output format. Choose either 'png' or 'pdf'.")
+
+        if not os.path.isdir(self.data_folder):
+            raise FileNotFoundError(f"Data folder '{self.data_folder}' does not exist.")
+
+        if not os.path.exists(self.main_file):
+            raise FileNotFoundError(f"Main Excel file '{self.main_file}' not found.")
+
+        if not isinstance(threshold, (int, float)) or not 0 <= threshold <= 1:
+            raise ValueError("threshold must be a number between 0 and 1")
+
+        if os.path.exists(self.state_file):
+            if not self.force_overwrite and not self._compare_state_with_main():
+                raise ValueError("State file differs from the main Excel file. Use 'force_overwrite=True' to overwrite.")
+            else:
+                self.load_state()
+        else:
+            self.logger.info("State file not found. Loading dataset.")
+            self.load_dataset()
+
+    def _compare_state_with_main(self):
+        try:
+            existing_labels = pd.read_excel(self.main_file)
+            with open(self.state_file, 'rb') as f:
+                saved_data = pickle.load(f)
+            saved_labels = pd.DataFrame([saved_data[key]['label'] for key in saved_data])
+            return existing_labels.equals(saved_labels)
+        except Exception as e:
+            self.logger.error(f"Error comparing state file: {e}")
+            return False
+
+    def load_dataset(self):
+        self.logger.info("Loading dataset from Excel files.")
+        labels = pd.read_excel(self.main_file)
+        data_dict = {}
+        samples, measurements = 0, 0
+
+        for _, row in labels.iterrows():
+            file_path = os.path.join(self.data_folder, "Raw_data", '{}{}'.format(row['Raw_data'], '.xlsx'))
+            if os.path.exists(file_path):
+                self.logger.info(f"Loading data from {file_path}.")
+                df = pd.read_excel(file_path, header=0, usecols=self.source_channels)  # Ensure the first row is used as column names
+                data_dict[row['Raw_data']] = {
+                    'label': row.to_dict(),
+                    'data': df,
+                    'sampleId': samples
+                }
+                samples += 1
+                measurements += df.shape[0]
+            else:
+                raise FileNotFoundError(f"Measurement file not found: {file_path}")
+
+        self.data = data_dict
+        self.save_state()
+        self.logger.info("Dataset loaded: {} samples, {} measurements in total.".format(samples, measurements))
+
+    def save_state(self):
+        with open(self.state_file, 'wb') as f:
+            pickle.dump(self.data, f)
+        self.logger.info("State saved.")
+
+    def load_state(self):
+        with open(self.state_file, 'rb') as f:
+            self.data = pickle.load(f)
+        self.logger.info("State loaded.")
+
+    def create_pics_folder(self):
+        pics_folder = os.path.join(self.data_folder, "pics")
+        if not os.path.exists(pics_folder):
+            os.makedirs(pics_folder)
+            self.logger.info(f"Created folder: {pics_folder}")
+        return pics_folder
+
+    def init_delta(self):
+        self.logger.info("Initializing dataset delta values.")
+        data_copy = {key: {'label': value['label'], 'sampleId': value['sampleId'], 'data': value['data'].copy()} for key, value in self.data.items()}
+        lower_limit = pd.concat([data_copy[key]['data'] for key in data_copy], axis=0).max() * self.lower_limit
+        self.logger.debug("Lower limit {}.".format(lower_limit))
+
+        for key in data_copy:
+            # Subtract each measurement's first row (baseline), then clip every
+            # channel from below at its per-channel floor.
+            data_copy[key]['data'] = data_copy[key]['data'] - data_copy[key]['data'].iloc[0]
+            for column in data_copy[key]['data'].columns:
+                data_copy[key]['data'][column] = data_copy[key]['data'][column].where(data_copy[key]['data'][column] >= lower_limit[column], other=lower_limit[column])
+        self.delta_data = data_copy
+
+    def init_minmax(self):
+        if not hasattr(self, 'delta_data'):
+            self.init_delta()
+
+        self.logger.info("Initializing dataset using MinMaxScaler.")
+        concatenated_data = pd.concat([self.delta_data[key]['data'] for key in self.delta_data], axis=0)
+        scaler = MinMaxScaler()
+        scaler.fit(concatenated_data)
+
+        self.scaled_data = {key: {'label': value['label'], 'sampleId': value['sampleId'], 'data': pd.DataFrame(scaler.transform(value['data']), columns=value['data'].columns)} for key, value in self.delta_data.items()}
+
f"{channel_name}_{tag}.{self.output_format}") + plt.savefig(filename, format=self.output_format) + self.logger.info(f"Saved plot as {filename}") + else: + plt.show() + + plt.close() + + def plot_measurament(self, data_instance, measurament_name, save=False, tag="", title="", limits=[]): + self.logger.debug(f"{title}All sensor readings for measurament: {measurament_name}.") + if measurament_name in data_instance: + data_instance[measurament_name]['data'].plot(figsize=(12, 6), title=f"{title} measurament: {measurament_name}") + plt.xlabel("Time") + plt.ylabel("Sensor Readings") + plt.legend(bbox_to_anchor=(0.95, 0.5), loc="center left") + for xvalue, txtcolor in limits: + plt.vlines(x=xvalue, ymin=0, ymax=1, colors=txtcolor) + + if save: + filename = os.path.join(self.pics_folder, f"{measurament_name}_{tag}.{self.output_format}") + plt.savefig(filename, format=self.output_format) + self.logger.info(f"Saved plot as {filename}") + else: + plt.show() + + plt.close() + + def plotRawdata(self, save=True): + self.logger.debug("Plotting raw data for all measuraments and channels.") + for measurament in self.data: + self.plot_measurament(self.data, measurament, save=save, tag="raw", title="[Original] ") + for channel in self.data[next(iter(self.data))]['data'].columns: + self.plot_channel(self.data, channel, save=save, tag="raw", title="[Original] ") + + def plotDeltadata(self, save=True): + if not hasattr(self, 'delta_data'): + self.init_delta() + + self.logger.debug("Plotting raw data for all measuraments and channels.") + for measurament in self.delta_data: + self.plot_measurament(self.delta_data, measurament, save=save, tag="delta", title="[$\Delta V$] ") + for channel in self.delta_data[next(iter(self.delta_data))]['data'].columns: + self.plot_channel(self.delta_data, channel, save=save, tag="delta", title="[$\delta V$] ") + + def plotScaleddata(self, save=True): + if not hasattr(self, 'scaled_data'): + self.init_minmax() + + self.logger.debug("Plotting raw data for all measuraments and channels.") + for measurament in self.scaled_data: + self.plot_measurament(self.scaled_data, measurament, save=save, tag="scaled", title="[Scaled] ") + for channel in self.scaled_data[next(iter(self.scaled_data))]['data'].columns: + self.plot_channel(self.scaled_data, channel, save=save, tag="scaled", title="[Scaled] ") + + def plotScaledBoundaries(self, save=True): + if not hasattr(self, 'scaled_data'): + self.init_minmax() + + self.logger.debug("Plotting raw data for all measuraments and channels.") + for measurament in self.scaled_data: + r, l, m = self.findIndicesAbovethreshold(self.scaled_data[measurament]['data']) + self.plot_measurament(self.scaled_data, measurament, save=save, tag="train", title=f"[Interval {self.threshold} max] ", limits=[(r, 'blue'), (l, 'blue'), (m, 'red')]) + + def findIndicesAbovethreshold(self, df): + if not isinstance(df, pd.DataFrame): + raise TypeError("Input must be a pandas DataFrame.") + + row_sums = df.sum(axis=1) + threshold = row_sums.max() * self.threshold + + above_threshold_indices = row_sums[row_sums > threshold].index + + if not above_threshold_indices.empty: + first_index = above_threshold_indices[0] + last_index = above_threshold_indices[-1] + return first_index, last_index, row_sums.idxmax() + else: + return None, None, None + + def load_dataset_window(self, ws): + + self.logger.info(f"Requested sample with threshold {self.threshold} and window size {ws}") + + if not hasattr(self, 'scaled_data'): + self.init_minmax() + + if not hasattr(self, 'dataset'): + 
self.logger.debug(f"Empty dataset") + self.dataset = {} + + if 'threshold' in self.dataset: + self.logger.debug(f"threshold in dataset") + if self.dataset['threshold'] != self.threshold: + self.logger.info(f"wrong threshold {self.dataset['threshold']} {self.threshold}") + self.dataset = {} + self.dataset['threshold'] = self.threshold + self.dataset['range'] = {} + self.stats() + else: + self.logger.debug(f"correct threshold {self.dataset['threshold']} {self.threshold}") + else: + self.dataset['threshold'] = self.threshold + self.dataset['range'] = {} + self.stats() + self.logger.debug(f"No threshold {self.dataset['threshold']} {self.threshold}") + + if ws in self.dataset: + return self.dataset[ws] + + g_output = np.empty((0, 1)) + y_output = np.empty((0, self.target_len)) + x_output = np.empty((0, ws, len(self.source_channels))) + sample_size = self.min_sample - ws + 1 + self.logger.info(f"Computing sample with threshold {self.threshold} and window size {ws}") + for measurament, (r, l) in self.dataset['range'].items(): + self.logger.debug('{} | {} | {}-{} | {}'.format(measurament, self.scaled_data[measurament]['data'].shape, r, l, self.scaled_data[measurament]['data'].iloc[r:l].shape)) + x_sample, y_sample, g_sample = self.create_conv1d_dataset(measurament, r, l, ws) + + g_output = np.concatenate((g_output, g_sample)) + x_output = np.concatenate((x_output, x_sample)) + y_output = np.concatenate((y_output, y_sample)) + + self.dataset[ws] = (x_output, y_output, g_output) + + return self.dataset[ws] + + def load_dataset_xboost(self): + + self.logger.info(f"Requested sample with threshold {self.threshold} for xboost") + + if not hasattr(self, 'scaled_data'): + self.init_minmax() + + if not hasattr(self, 'dataset'): + self.logger.debug(f"Empty dataset") + self.dataset = {} + + if 'threshold' in self.dataset: + self.logger.debug(f"threshold in dataset") + if self.dataset['threshold'] != self.threshold: + self.logger.info(f"wrong threshold {self.dataset['threshold']} {self.threshold}") + self.dataset = {} + self.dataset['threshold'] = self.threshold + self.dataset['range'] = {} + self.stats() + else: + self.logger.debug(f"correct threshold {self.dataset['threshold']} {self.threshold}") + else: + self.dataset['threshold'] = self.threshold + self.dataset['range'] = {} + self.stats() + self.logger.debug(f"No threshold {self.dataset['threshold']} {self.threshold}") + + if 'xboost' in self.dataset: + return self.dataset['xboost'] + + g_output = np.empty((0, 1)) + y_output = np.empty((0, self.target_len)) + x_output = np.empty((0, self.data_channels)) + + self.logger.info(f"Computing sample with threshold {self.threshold} for xboost") + for measurament, (r, l) in self.dataset['range'].items(): + self.logger.debug('{} | {} | {}-{} | {}'.format(measurament, self.scaled_data[measurament]['data'].shape, r, l, self.scaled_data[measurament]['data'].iloc[r:l].shape)) + x_sample, y_sample, g_sample = self.create_xboost_dataset(measurament, r, l) + + g_output = np.concatenate((g_output, g_sample)) + x_output = np.concatenate((x_output, x_sample)) + y_output = np.concatenate((y_output, y_sample)) + + self.dataset['xboost'] = (x_output, y_output, g_output) + + return self.dataset['xboost'] + + def create_xboost_dataset(self, measurament, r, l): + + X_data = self.scaled_data[measurament]['data'].iloc[r:l] + Y_value = np.array([[self.scaled_data[measurament]['label'][key] for key in self.target_list]]) + G_value = self.scaled_data[measurament]['sampleId'] + + total_samples = X_data.shape[0] + sample_size = 
+    def create_xgboost_dataset(self, measurement, r, l):
+        X_data = self.scaled_data[measurement]['data'].iloc[r:l]
+        Y_value = np.array([[self.scaled_data[measurement]['label'][key] for key in self.target_list]])
+        G_value = self.scaled_data[measurement]['sampleId']
+
+        total_samples = X_data.shape[0]
+        sample_size = self.min_sample
+
+        self.logger.debug(f"{measurement}: ({total_samples}) values, sampling ({sample_size}). (l-r {l-r}) (l {l}) (r {r}).")
+        if sample_size > total_samples:
+            self.logger.warning(f"sample_size ({sample_size}) exceeds available samples ({total_samples}). Using available samples.")
+            sample_size = total_samples
+
+        random_indices = np.random.choice(total_samples, size=sample_size, replace=False)
+        g_sample = np.zeros((sample_size, 1))
+        y_sample = np.zeros((sample_size, self.target_len))
+        x_samples = np.zeros((sample_size, self.data_channels))
+
+        for i, index in enumerate(random_indices):
+            x_samples[i] = X_data.iloc[index]
+            y_sample[i] = Y_value
+            g_sample[i] = G_value
+
+        return x_samples, y_sample, g_sample
+
+    def create_conv1d_dataset(self, measurement, r, l, window):
+        X_data = self.scaled_data[measurement]['data'].iloc[r:l]
+        Y_value = np.array([[self.scaled_data[measurement]['label'][key] for key in self.target_list]])
+        G_value = self.scaled_data[measurement]['sampleId']
+
+        total_samples = X_data.shape[0] - window + 1
+        sample_size = self.min_sample - window + 1
+
+        if sample_size > total_samples:
+            self.logger.warning(f"sample_size ({sample_size}) exceeds available samples ({total_samples}). Using available samples.")
+            sample_size = total_samples
+
+        random_indices = np.random.choice(total_samples, size=sample_size, replace=False)
+        g_sample = np.zeros((sample_size, 1))
+        y_sample = np.zeros((sample_size, self.target_len))
+        x_samples = np.zeros((sample_size, window, self.data_channels))
+
+        for i, index in enumerate(random_indices):
+            x_samples[i] = X_data.iloc[index:index + window].values
+            y_sample[i] = Y_value
+            g_sample[i] = G_value
+
+        return x_samples, y_sample, g_sample
+
+    def stats(self):
+        label_columns = {}
+        channel_columns = {}
+        for key in self.data:
+            for col in self.data[key]['data'].columns:
+                if col in channel_columns:
+                    channel_columns[col].append(key)
+                else:
+                    channel_columns[col] = [key]
+            for col in self.data[key]['label']:
+                if col in label_columns:
+                    label_columns[col].append(key)
+                else:
+                    label_columns[col] = [key]
+
+        self.data_channels = len(channel_columns)
+        sorted_labels = sorted(label_columns.items(), key=lambda x: len(x[1]), reverse=True)
+        sorted_channels = sorted(channel_columns.items(), key=lambda x: len(x[1]), reverse=True)
+
+        self.logger.debug("{:=^60}".format("LABELS"))
+        for i, (col, keys) in enumerate(sorted_labels):
+            self.logger.debug(f"{i} | {col}: {len(keys)}")
+
+        self.logger.debug("{:=^60}".format("CHANNELS"))
+        for i, (col, keys) in enumerate(sorted_channels):
+            self.logger.debug(f"{i} | {col}: {len(keys)}")
+
+        if not hasattr(self, 'scaled_data'):
+            self.init_minmax()
+
+        self.logger.debug("{:=^60}".format(f"DATASET SIZE for {self.threshold}"))
+        valid_dataset = []
+
+        for measurement in self.scaled_data:
+            r, l, m = self.findIndicesAbovethreshold(self.scaled_data[measurement]['data'])
+            self.logger.debug(f"{measurement}: {l} - {r} = {l-r}")
+            valid_dataset.append(l-r)
+            self.dataset['range'][measurement] = (r, l)
+
+        self.min_sample = np.min(valid_dataset)
+
+        self.logger.info("{:=^60}".format(f"DATASET STATS for {self.threshold}"))
+        self.logger.info(f"Min: {np.min(valid_dataset)}")
+        self.logger.info(f"Max: {np.max(valid_dataset)}")
+        self.logger.info(f"Mean: {np.mean(valid_dataset)}")
+        self.logger.info(f"Median: {np.median(valid_dataset)}")
+
+
+# Example usage:
+# loader = GasSensorDataLoader("enose_dataset", output_format="png", threshold=0.9)
+# loader.threshold = 0.8
+# loader.plotRawdata(save=True)
+# loader.plotDeltadata(save=True)
+# loader.plotScaledBoundaries(save=True)
+# for t in (0.90, 0.85, 0.80, 0.75):
+#     loader.threshold = t
+#     x, y, g = loader.load_dataset_window(128)
+#     print(x.shape)
diff --git a/README.md b/README.md
index b55b1d8..ed24c20 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,15 @@
-# enose_2025
+# e-nose training code
+A bio-inspired system for detecting gases from different sources.
+
+## Setting up the conda environment
+
+```
+./create_conda.sh env-name
+```
+
+## Running the simulations
+
+```
+(env-name)$ python train_sequence.py
+```
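
A minimal end-to-end sketch of the two classes added in this PR (the `enose_dataset` folder name matches the example usage above; the CH4 target and split settings mirror `train_sequence.py` below):

```python
from LoaderClass import GasSensorDataLoader
from TrainerClass import eNoseTrainer

# Load and preprocess the e-nose measurements for a single target gas.
loader = GasSensorDataLoader("enose_dataset", threshold=0.85, target_list=['CH4'])

# Grid-search XGBoost regressors over group-aware train/test splits.
trainer = eNoseTrainer(loader, splits=3, test_size=0.2)
trainer.fit()
# trainer.wrap_and_save()  # see the note in TrainerClass: requires get_best_models()
```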
diff --git a/TrainerClass.py b/TrainerClass.py
new file mode 100644
index 0000000..097da09
--- /dev/null
+++ b/TrainerClass.py
@@ -0,0 +1,270 @@
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+import matplotlib.pyplot as plt
+import matplotlib
+matplotlib.rcParams['text.usetex'] = True
+
+from sklearn.model_selection import GroupShuffleSplit, ShuffleSplit, GridSearchCV
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import mean_squared_error, mean_absolute_error
+
+from xgboost import XGBRegressor
+
+# from ray import tune
+# import ray
+# from keras.callbacks import TensorBoard
+# from keras.models import Sequential
+# from keras.callbacks import EarlyStopping
+# from keras.layers import Dense, BatchNormalization, Dropout
+# from kerastuner.tuners import RandomSearch, Hyperband, GridSearch
+
+from datetime import datetime
+import enlighten
+import logging
+import zipfile
+import random
+import joblib
+import pickle
+import time
+import json
+import os
+
+
+def get_seed():
+    return random.randint(0, 2**32 - 1)
+
+
+class eNoseTrainer:
+    def __init__(self, loader, splits=1, test_size=0.2, debug=False):
+        self.ledger = pd.DataFrame(columns=["node", "ts", "Dataset", "Samples", "Target", "Train Size", "Train Ratio", "Model", "Params", "Ratio", "Train mse", "mse", "mae", "rmse"])
+        self.loader = loader
+        self.splits = splits
+        self.name = self.loader.label_file
+        self.target = '_'.join(self.loader.target_list)
+        self.state = dict()
+
+        os.makedirs(self.name, exist_ok=True)
+        self.start = int(time.time())
+
+        log_format = '%(asctime)s | %(levelname)-8s | %(name)-15s | %(message)s'
+        date_format = '%Y-%m-%d %H:%M:%S'
+        logging.basicConfig(format=log_format, datefmt=date_format)
+
+        target_log = '{}/load-{}.log'.format(self.name, self.start)
+        fh = logging.FileHandler(target_log)
+
+        self.debug = debug
+
+        self.logger = logging.getLogger("eNoseTrainer")
+        if self.debug:
+            self.logger.setLevel(logging.DEBUG)
+            fh.setLevel(logging.DEBUG)
+        else:
+            self.logger.setLevel(logging.INFO)
+            fh.setLevel(logging.INFO)
+        self.logger.addHandler(fh)
+
+        self.ratio = test_size
+
+        self.loader.stats()
+        self.loadCheckPoint()
+
+    def loadCheckPoint(self):
+        if not os.path.isfile('{}/Simulaciones.xlsx'.format(self.name)):
+            self.saveCheckPoint()
+
+        with pd.ExcelFile('{}/Simulaciones.xlsx'.format(self.name)) as xls:
+            self.ledger = pd.read_excel(xls, sheet_name='Historial')
+        self.trained = self.ledger.shape[0]
+
+        with open('{}/vars.pickle'.format(self.name), 'rb') as pfile:
+            self.ratio, self.splits, self.state = pickle.load(pfile)
+
+    def saveCheckPoint(self):
+        with pd.ExcelWriter('{}/Simulaciones.xlsx'.format(self.name), engine='xlsxwriter') as xls:
+            self.ledger.to_excel(xls, sheet_name='Historial', index=False)
+
+        with open('{}/vars.pickle'.format(self.name), 'wb') as pfile:
+            pickle.dump((self.ratio, self.splits, self.state), pfile, protocol=pickle.HIGHEST_PROTOCOL)
+
+        self.trained = self.ledger.shape[0]
+
+    def wrap_and_save(self):
+        self.logger.info("{:=^60}".format(' Saving Summary and Wrap the output in a ZipFile '))
+
+        # NOTE: get_best_models() is not defined in this class yet;
+        # wrap_and_save() will fail until it is implemented.
+        with pd.ExcelWriter('{}/Summary.xlsx'.format(self.name), engine='xlsxwriter') as xls:
+            self.get_best_models().to_excel(xls, sheet_name='Results')
+
+        with zipfile.ZipFile('{}-{}.zip'.format(self.name, self.start), 'w', zipfile.ZIP_DEFLATED) as zipf:
+            for root, dirs, files in os.walk(self.name):
+                for file in files:
+                    zipf.write(os.path.join(root, file))
+
+    def row_exists(self, dataset, model):
+        return self.ledger[(self.ledger["Dataset"] == dataset) & (self.ledger["Target"] == self.target) & (self.ledger["Model"] == model) & (self.ledger["Ratio"] == self.ratio)].shape[0] > 0
+
+    # NOTE: model_A() and train_and_score_model_keras() depend on the
+    # commented-out keras imports above and are not called by fit(); they are
+    # kept as a starting point for the DNN experiments.
+    def model_A(self, hp):
+        model = Sequential()
+        model.add(Dense(units=hp.Int('units_input', min_value=48, max_value=56, step=8), input_dim=self.nvars, activation='relu'))
+        model.add(BatchNormalization())
+        model.add(Dropout(rate=hp.Float('dropout_input', min_value=0.1, max_value=0.1, step=0.1)))
+
+        model.add(Dense(units=hp.Int('units_hidden', min_value=32, max_value=48, step=8), activation='relu'))
+        model.add(BatchNormalization())
+        model.add(Dropout(rate=hp.Float('dropout_hidden', min_value=0.4, max_value=0.4, step=0.1)))
+
+        model.add(Dense(1, activation='sigmoid'))
+
+        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', AUC()])
+
+        return model
+
+    def train_and_score_model_keras(self, X_train, X_test, y_train, y_test, seed, label):
+        # set_random_seed(seed)
+        ntrials = 6
+        tuner = RandomSearch(
+            self.get_model_train_keras,
+            objective='val_loss',
+            # seed=seed,
+            max_trials=ntrials,
+            # executions_per_trial=1,  # executions per configuration
+            directory=self.name,
+            project_name='{}-{}'.format(label, seed))
+
+        self.logger.info("{:~^60}".format(' {}-{} '.format(label, seed)))
+
+        search_dir = "{}/keras-tuner-{}/".format(self.name, label)
+        os.makedirs(search_dir, exist_ok=True)
+        search_callback = TensorBoard(log_dir=search_dir)
+        early_stopping_search = EarlyStopping(monitor='val_loss', patience=13, min_delta=0.005, start_from_epoch=7, restore_best_weights=True)
+        tuner.search(X_train, y_train, epochs=150, batch_size=10, validation_data=(X_test, y_test), callbacks=[early_stopping_search, search_callback])
+        best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
+
+        self.trained += 1
+        self.bar.update()
+        # NOTE: mse, mae, rmse, optimized_model and model_params are never
+        # computed above; best_hps still needs to be used to build and
+        # evaluate a model before this return can work.
+        return mse, mae, rmse, optimized_model, model_params
+
+    def get_model_train(self):
+        return [
+            XGBRegressor(objective='reg:squarederror'),
+        ]
+
+    def get_tunable_params(self, model):
+        if isinstance(model, XGBRegressor):
+            return {
+                "n_estimators": [800, 1000, 1200],
+                "learning_rate": np.logspace(-1.5, -0.5, 3),
+                'max_depth': [5, 7, 9],
+                'subsample': [0.5, 0.75, 1.0],
+                # 'colsample_bytree': [0.8, 0.9, 1.0],
+                # 'gamma': [0, 0.1, 0.2],
+                # 'min_child_weight': [1, 3, 5]
+            }
+        elif isinstance(model, RandomForestClassifier):
+            return {
+                "n_estimators": [50, 100, 200],
+                "max_depth": [5, 10, 15],
+                "max_features": [2, 5, 10]
+            }
+        else:
+            return {}
+
+    def train_and_score_model(self, model, X_train, X_test, y_train, y_test):
+        param_dist = self.get_tunable_params(model)
+
+        cv = ShuffleSplit(n_splits=self.splits//2, test_size=0.2, random_state=get_seed())
+        grid_search = GridSearchCV(estimator=model, param_grid=param_dist, scoring='neg_mean_squared_error', cv=cv, verbose=10, n_jobs=-1)
+
+        grid_search.fit(X_train, y_train)
+
+        optimized_model = grid_search.best_estimator_
+        model_params = grid_search.best_params_
+
+        y_aux = optimized_model.predict(X_train)
+        tmse = mean_squared_error(y_train, y_aux)
+
+        y_pred = optimized_model.predict(X_test)
+        mse = mean_squared_error(y_test, y_pred)
+        mae = mean_absolute_error(y_test, y_pred)
+        rmse = np.sqrt(mse)
+
+        return tmse, mse, mae, rmse, optimized_model, model_params
+
+    def fit(self):
+        total_train_queue = self.splits * len(self.get_model_train())
+        self.logger.info("{:=^60}".format(f'Begin Fit {total_train_queue} Models'))
+        self.trained = 0
+        manager = enlighten.get_manager()
+        self.bar = manager.counter(total=total_train_queue, count=self.trained, desc='Tuning', unit='Models',
+                                   format='{desc}{desc_pad}{percentage:3.0f}%|{bar}| {count:{len_total}d}/{total:d} [{elapsed}<{eta}, {rate:.2f}{unit_pad}{unit}/s]'
+                                   )
+
+        node = os.uname()[1]
+        X_xgboost, Y_xgboost, G_xgboost = self.loader.load_dataset_xgboost()
+
+        gss = GroupShuffleSplit(n_splits=self.splits, test_size=self.ratio, random_state=get_seed())
+
+        dataset = 'Tabular'
+        os.makedirs('{}/{}/{}'.format(self.name, self.target, dataset), exist_ok=True)
+
+        for i, (train_index, test_index) in enumerate(gss.split(X_xgboost, Y_xgboost, G_xgboost)):
+            X_train, X_test = X_xgboost[train_index], X_xgboost[test_index]
+            y_train, y_test = Y_xgboost[train_index], Y_xgboost[test_index]
+
+            for model in self.get_model_train():
+                model_id = "{}_{}".format(type(model).__name__, i)
+                self.trained += 1
+
+                if self.row_exists(dataset, model_id):
+                    self.bar.update()
+                    continue
+
+                model_file = '{}/{}/{}/{}'.format(self.name, self.target, dataset, model_id)
+
+                tmse, mse, mae, rmse, optimized_model, model_params = self.train_and_score_model(model, X_train, X_test, y_train, y_test)
+
+                ts = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
+                joblib.dump(optimized_model, model_file)
+
+                newrow = pd.DataFrame([{"node": node,
+                                        "ts": ts,
+                                        "Dataset": dataset,
+                                        "Samples": Y_xgboost.shape[0],
+                                        "Target": self.target,
+                                        "Train Size": y_train.shape[0],
+                                        "Train Ratio": y_train.shape[0]/Y_xgboost.shape[0],
+                                        "Ratio": self.ratio,
+                                        "Model": model_id,
+                                        "Params": json.dumps(model_params),
+                                        "Train mse": tmse,
+                                        "mse": mse,
+                                        "mae": mae,
+                                        "rmse": rmse
+                                        }])
+                self.ledger = pd.concat([self.ledger, newrow], ignore_index=True)
+                self.bar.update()
+
+            self.saveCheckPoint()
+
+            # if self.dnn:
+            #     model_file = '{}/{}/DNN_{}'.format(self.name, label, seed)
+            #     model_label = "{}".format(label)
+            #
+            #     accuracy, specificity, recall, f1, roc_auc, optimized_model, parms = self.train_and_score_model_keras(X_train, X_test, y_train, y_test, seed, model_label)
+            #     ts = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
+            #
+            #     newrow = pd.DataFrame([{"node": node,
+            #                             "ts": ts,
+            #                             "Dataset": model_label,
+            #                             "Model": 'DNN',
+            #                             "Params": parms,
+            #                             "Seed": seed,
+            #                             "F1": f1,
+            #                             "ROC_AUC": roc_auc
+            #                             }])
+            #     self.ledger = pd.concat([self.ledger, newrow], ignore_index=True)
+
+        self.bar.close()
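
A short sketch of consuming a model saved by `fit()` above (the path follows the `'{name}/{target}/{dataset}/{model_id}'` pattern passed to `joblib.dump`; the concrete file name and input values here are illustrative):

```python
import joblib
import numpy as np

# Hypothetical path: dataset folder "enose_dataset", target "CH4", first split.
model = joblib.load("enose_dataset/CH4/Tabular/XGBRegressor_0")

# X_new must match training: one row per reading, one column per scaled channel.
X_new = np.random.rand(5, 10)
print(model.predict(X_new))
```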
diff --git a/create_conda.sh b/create_conda.sh
new file mode 100755
index 0000000..fee216c
--- /dev/null
+++ b/create_conda.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# prompt: create a bash script that checks whether a single argument was given
+# and installs these conda packages: python scikit-learn keras pandas numpy
+# matplotlib tensorflow openpyxl enlighten. If the user passes no argument, or
+# more than one, print a help message on how to use the script.
+
+if [ $# -ne 1 ]; then
+    echo "Usage: $0 <env-name>"
+    echo ""
+    echo "  Installs conda packages for a specified environment."
+    echo "  Example: $0 enose1"
+    exit 1
+fi
+
+env_name=$1
+
+conda create -n "$env_name" python scikit-learn==1.3.1 xgboost keras pandas numpy matplotlib tensorflow openpyxl xlsxwriter conda-forge::enlighten
+if [ $? -eq 0 ]; then
+    echo "Packages installed successfully in environment '$env_name'"
+else
+    echo "Aborted."
+fi
diff --git a/train_sequence.py b/train_sequence.py
new file mode 100644
index 0000000..f21ed93
--- /dev/null
+++ b/train_sequence.py
@@ -0,0 +1,21 @@
+from LoaderClass import GasSensorDataLoader
+from TrainerClass import eNoseTrainer
+
+import warnings
+warnings.filterwarnings("ignore")
+
+source_channels = ["MQ 8", "MQ 9", "MQ 135", "TGS 813", "TGS 821", "TGS 2600", "TGS 2602", "TGS 2611-0", "TGS 2612", "TGS 2620"]
+# target_variables = ['C2H2', 'CH4', 'C3H6', 'CO', 'C2H6', 'C3H8', 'C2H4', 'H2', 'O2']
+
+# target_variables = ['C2H2']
+# eNoseLoaderC2H2 = GasSensorDataLoader("enose_dataset", threshold=0.85, source_channels=source_channels, target_list=target_variables, debug=False)
+# eNoseC2H2 = eNoseTrainer(eNoseLoaderC2H2, splits=3, test_size=0.2, debug=True)
+# eNoseC2H2.fit()
+
+target_variables = ['CH4']
+eNoseLoaderCH4 = GasSensorDataLoader("enose_dataset", threshold=0.85, source_channels=source_channels, target_list=target_variables, debug=False)
+
+eNoseCH4 = eNoseTrainer(eNoseLoaderCH4, splits=3, test_size=0.2, debug=True)
+eNoseCH4.fit()
+
+# eNoseCH4.wrap_and_save()
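
A sketch for inspecting the results ledger that `fit()` writes (the sheet name `Historial` and the column set come from `TrainerClass` above; the folder name assumes the `enose_dataset` run in `train_sequence.py`):

```python
import pandas as pd

# Each grid-search run appends one row per trained model.
hist = pd.read_excel("enose_dataset/Simulaciones.xlsx", sheet_name="Historial")
print(hist[["Model", "Params", "Train mse", "mse", "mae", "rmse"]].sort_values("rmse"))
```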