diff --git a/LoaderClass.py b/LoaderClass.py index ba00248..df16625 100644 --- a/LoaderClass.py +++ b/LoaderClass.py @@ -15,6 +15,7 @@ class GasSensorDataLoader: self.data_folder = os.path.splitext(label_file)[0] self.state_file = f"{self.label_file}.pkl" self.lower_limit = lower_limit + self.smooth = None self.data = None self.debug = debug self.threshold = threshold @@ -24,6 +25,7 @@ class GasSensorDataLoader: self.samples = {} self.target_list = sorted(target_list) + self.target = '_'.join(self.target_list) self.target_len = len(self.target_list) self.source_channels = sorted(source_channels) self.force_overwrite = force_overwrite @@ -53,6 +55,7 @@ class GasSensorDataLoader: if False:#not self.force_overwrite and not self._compare_state_with_main(): raise ValueError("State file differs from the main Excel file. Use 'force_overwrite=True' to overwrite.") else: + self.logger.info(f"Init for {len(self.target_list)} targets => {self.target_list}") self.load_state() else: self.logger.info("State file not found. Loading dataset.") @@ -69,6 +72,27 @@ class GasSensorDataLoader: self.logger.error(f"Error comparing state file: {e}") return False + def reset(self): + self.dataset = {} + self.dataset['threshold'] = self.threshold + self.dataset['range'] = {} + + if isinstance(self.target_list, list): + self.target_list = sorted(self.target_list) + + elif isinstance(self.target_list, str): + self.target_list = list(self.target_list) + + self.target = '_'.join(self.target_list) + self.target_len = len(self.target_list) + + self.logger.info(f"Reset requested. Init for {len(self.target_list)} targets => {self.target}") + + delattr(self, "delta_data") + delattr(self, "scaled_data") + self.init_minmax() + self.stats() + def load_dataset(self): self.logger.info("Loading dataset from Excel files.") labels = pd.read_excel(self.main_file) @@ -114,6 +138,15 @@ class GasSensorDataLoader: def init_delta(self): self.logger.info("Initializing dataset delta values.") data_copy = {key: {'label': value['label'], 'sampleId': value['sampleId'], 'data': value['data'].copy()} for key, value in self.data.items()} + if self.smooth == 'conv3': + kernel = np.array([0.2, 0.6, 0.2]) + for key in data_copy: + tempdf = pd.DataFrame() + for col in data_copy[key]['data'].columns: + tempdf[col] = np.convolve(data_copy[key]['data'][col], kernel, mode='valid') + data_copy[key]['data'] = tempdf.copy() + + lower_limit = pd.concat([data_copy[key]['data'] for key in data_copy], axis=0).max() * self.lower_limit self.logger.debug("Lower limit {}.".format(lower_limit)) @@ -140,6 +173,7 @@ class GasSensorDataLoader: for key in data_instance: if channel_name in data_instance[key]['data'].columns: plt.plot(data_instance[key]['data'][channel_name]) + plt.xlabel("Time") plt.ylabel("Sensor Reading") plt.title(f"{title} Sensor Channel: {channel_name}") @@ -314,7 +348,8 @@ class GasSensorDataLoader: x_output = np.concatenate((x_output, x_sample)) y_output = np.concatenate((y_output, y_sample)) - + target_scaler = MinMaxScaler() + y_output = target_scaler.fit_transform(y_output) self.dataset['xboost'] = (x_output, y_output, g_output) return self.dataset['xboost'] @@ -426,7 +461,7 @@ class GasSensorDataLoader: # loader.plotRawdata(save=True) # loader.plotDeltadata(save=True) # loader.plotScaledBoundaries(save=True) -# # loader.threshold = 0.90 +# # loader.threshold = 0.90, smooth=None # print(loader.load_dataset_window(128).shape) # loader.threshold = 0.85 # print(loader.load_dataset_window(128).shape) diff --git a/TrainerClass.py b/TrainerClass.py index 1284bf0..446ba56 100644 --- a/TrainerClass.py +++ b/TrainerClass.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import tensorflow as tf +import matplotlib.cm as cm import matplotlib.pyplot as plt import matplotlib matplotlib.rcParams['text.usetex'] = True @@ -36,12 +37,10 @@ def get_seed(): return random.randint(0, 2**32 - 1) class eNoseTrainer: - def __init__(self, loader, splits=1, test_size=0.2, debug=False): + def __init__(self, loader, test_size=0.2, debug=False): self.ledger = pd.DataFrame(columns=["node", "ts", "Dataset", "Samples", "Target", "Train Size", "Train Ratio", "Model", "Params", "Ratio", "Train mse", "mse", "mae", "rmse"]) self.loader = loader - self.splits = splits self.name = self.loader.label_file - self.target = '_'.join(self.loader.target_list) self.state = dict() os.makedirs(self.name, exist_ok=True) @@ -78,15 +77,15 @@ class eNoseTrainer: self.ledger = pd.read_excel(xls, sheet_name='Historial') self.trained = self.ledger.shape[0] - with open('{}/vars.pickle'.format(self.name), 'rb') as pfile: - self.ratio, self.splits, self.state = pickle.load(pfile) + # with open('{}/vars.pickle'.format(self.name), 'rb') as pfile: + # self.ratio, self.state = pickle.load(pfile) def saveCheckPoint(self): with pd.ExcelWriter('{}/Simulaciones.xlsx'.format(self.name), engine='xlsxwriter') as xls: self.ledger.to_excel(xls, sheet_name='Historial', index=False) - with open('{}/vars.pickle'.format(self.name), 'wb') as pfile: - pickle.dump((self.ratio, self.splits, self.state), pfile, protocol=pickle.HIGHEST_PROTOCOL) + # with open('{}/vars.pickle'.format(self.name), 'wb') as pfile: + # pickle.dump((self.ratio, self.state), pfile, protocol=pickle.HIGHEST_PROTOCOL) self.trained = self.ledger.shape[0] @@ -102,7 +101,9 @@ class eNoseTrainer: zipf.write(os.path.join(root, file)) def row_exists(self, dataset, model): - return self.ledger[(self.ledger["Dataset"] == dataset) & (self.ledger["Target"] == self.target) & (self.ledger["Model"] == model) & (self.ledger["Ratio"] == self.ratio)].shape[0] > 0 + search_result = self.ledger[(self.ledger["Dataset"]==dataset) & (self.ledger["Target"]==self.loader.target) & (self.ledger["Model"]==model) & (self.ledger["Ratio"]==self.ratio)].shape[0] > 0 + self.logger.debug(f'Looking for {dataset}, {model}, {self.loader.target}, {self.ratio} => {search_result} {self.ledger.shape}') + return search_result def model_A(self, hp): @@ -155,13 +156,11 @@ class eNoseTrainer: def get_tunable_params(self, model): if isinstance(model, XGBRegressor): return { - "n_estimators": [800, 1000, 1200], - "learning_rate": np.logspace(-1.5, -0.5, 3), - 'max_depth': [5, 7, 9], - 'subsample': [0.5, 0.75, 1.0], -# 'colsample_bytree': [0.8, 0.9, 1.0], -# 'gamma': [0, 0.1, 0.2], -# 'min_child_weight': [1, 3, 5] + 'tree_method': ["hist"], + "n_estimators": [100, 128, 150], + 'max_depth': [6, 7, 8], + 'subsample': [0.5, 0.6, 0.7], + 'multi_strategy': ['one_output_per_tree', 'multi_output_tree'] } elif isinstance(model, RandomForestClassifier): return { @@ -193,6 +192,86 @@ class eNoseTrainer: return tmse, mse, mae, rmse, optimized_model, model_params + def gen_plots(self, dataset, model_id, target=None): + if isinstance(target, list): + self.loader.target_list=target + if isinstance(target, str): + self.loader.target_list= list(target) + + if dataset.endswith("-conv3"): + self.loader.smooth = 'conv3' + else: + self.loader.smooth = None + + self.loader.reset() + if not self.row_exists(dataset, model_id): + self.logger.error(f'No se encuentra la simulacion {dataset}, {model_id}') + return + + model_file = '{}/{}/{}/{}'.format(self.name, self.loader.target, dataset, model_id ) + if not os.path.isfile(model_file): + self.logger.error('No se encuentra el modelo') + return + + trained_model = joblib.load(model_file) + + pics_folder = '{}/{}/{}/plots'.format(self.name, self.loader.target, dataset) + os.makedirs(pics_folder, exist_ok=True) + + df = self.loader.scaled_data + + Y_samples = np.zeros((len(df), len(self.loader.target_list))) + for i, sample in enumerate(df): + Y_samples[i] = np.array([[df[sample]['label'][key] for key in self.loader.target_list]]) + + self.logger.debug(f"Y_samples.shape: {Y_samples.shape}") + + target_scaler = MinMaxScaler() + Y_samples = target_scaler.fit_transform(Y_samples) + + cmapx = cm.get_cmap('ocean', len(self.loader.source_channels)) + cmapy = cm.get_cmap('prism', Y_samples.shape[1]) + + for measurament, (r, l) in self.loader.dataset['range'].items(): + # df[measurament]['data'].plot(figsize=(12, 6), title=f"{measurament} Prediction") + plt.figure(figsize=(12, 6)) + plt.xlabel("Time") + plt.ylabel("Sensor Readings") + plt.legend(bbox_to_anchor=(0.95, 0.5), loc="center left") + + plt.vlines(x=r, ymin=0, ymax=1, colors='blue') + plt.vlines(x=l, ymin=0, ymax=1, colors='blue') + + for i, channel_name in enumerate(df[measurament]['data'].columns): + plt.plot(df[measurament]['data'][channel_name], linestyle = 'dotted', color=cmapx(i)) + + Y_value = np.zeros((1, len(self.loader.target_list))) + Y_value[0] = np.array([[df[measurament]['label'][key] for key in self.loader.target_list]]) + + self.logger.debug(f"Y_value.shape: {Y_value.shape}") + self.logger.debug(f"Y_value: {Y_value}") + + Y_scaled = target_scaler.transform(Y_value).reshape(1, -1) + self.logger.debug(f"Y_scaled.shape: {Y_scaled.shape}") + self.logger.debug(f"Y_scaled: {Y_scaled}") + + for i, value in enumerate(Y_scaled): + plt.axhline(y=value, xmin=0, xmax=df[measurament]['data'].shape[0], color=cmapy(i), linestyle='dashed') + + y_pred = trained_model.predict(df[measurament]['data'].to_numpy()) + + if y_pred.ndim == 2: + for i in range(y_pred.shape[0]): + plt.plot(y_pred[:, i], color=cmapy(i), linestyle='solid') + else: + plt.plot(y_pred, color=cmapy(0), linestyle='solid') + + filename = os.path.join(pics_folder, f"{measurament}_{model_id}.png") + plt.savefig(filename) + self.logger.info(f"Saved plot as {filename}") + + plt.close() + def fit(self): total_train_queue = 2*int(1/self.ratio)*len(self.get_model_train()) self.logger.info("{:=^60}".format(f'Begin Fit {total_train_queue} Models')) @@ -202,28 +281,30 @@ class eNoseTrainer: format='{desc}{desc_pad}{percentage:3.0f}%|{bar}| {count:{len_total}d}/{total:d} [{elapsed}<{eta}, {rate:.2f}{unit_pad}{unit}/s]' ) - - node = os.uname()[1] - X_xboost, Y_xboost, G_xboost = self.loader.load_dataset_xboost() - self.logger.debug(f"X_xboost: {X_xboost.shape}") - self.logger.debug(f"Y_xboost: {Y_xboost.shape}") - self.logger.debug(f"G_xboost: {G_xboost.shape}") - - discretizer = KBinsDiscretizer(n_bins=50*Y_xboost.shape[1], encode='ordinal', strategy='uniform') - discretizer.fit(Y_xboost) - Y_discrete = discretizer.transform(Y_xboost) - self.logger.debug(f"Y_discrete: {Y_discrete.shape}") - + discretizer = KBinsDiscretizer(n_bins=200, encode='ordinal', strategy='uniform') gss = StratifiedGroupKFold(n_splits=int(1/self.ratio), shuffle=True, random_state=get_seed()) + node = os.uname()[1] + self.loader.smooth = None + self.loader.reset() + + X_xboost, Y_xboost, G_xboost = self.loader.load_dataset_xboost() + # self.logger.debug(f"X_xboost: {X_xboost.shape}") + self.logger.debug(f"Y_xboost: {Y_xboost.shape}") + # self.logger.debug(f"G_xboost: {G_xboost.shape}") + + Y_discrete = discretizer.fit_transform(Y_xboost) + if Y_discrete.ndim == 2: + Y_discrete = np.sum(Y_discrete, axis=1) + # self.logger.debug(f"Y_discrete: {Y_discrete.shape}") for i, (train_index, test_index) in enumerate(gss.split(X_xboost, Y_discrete, G_xboost)): dataset = 'Tabular' - os.makedirs('{}/{}/{}'.format(self.name, self.target, dataset), exist_ok=True) + os.makedirs('{}/{}/{}'.format(self.name, self.loader.target, dataset), exist_ok=True) X_train, X_test = X_xboost[train_index], X_xboost[test_index] Y_train, Y_test = Y_xboost[train_index], Y_xboost[test_index] - self.logger.debug(f"X_train: {X_train.shape}") - self.logger.debug(f"X_test: {X_test.shape}") + # self.logger.debug(f"X_train: {X_train.shape}") + # self.logger.debug(f"X_test: {X_test.shape}") self.logger.debug(f"Y_train: {Y_train.shape}") self.logger.debug(f"Y_test: {Y_test.shape}") @@ -236,7 +317,7 @@ class eNoseTrainer: self.bar.update() continue - model_file = '{}/{}/{}/{}'.format(self.name, self.target, dataset, model_id ) + model_file = '{}/{}/{}/{}'.format(self.name, self.loader.target, dataset, model_id ) tmse, mse, mae, rmse, optimized_model, model_params = self.train_and_score_model(model, X_train, X_test, Y_train, Y_test) @@ -247,7 +328,7 @@ class eNoseTrainer: "ts": ts, "Dataset": dataset, "Samples": Y_xboost.shape[0], - "Target": self.target, + "Target": self.loader.target, "Train Size": Y_train.shape[0], "Train Ratio": Y_train.shape[0]/Y_xboost.shape[0], "Ratio": self.ratio, @@ -263,10 +344,26 @@ class eNoseTrainer: self.saveCheckPoint() - dataset = 'Tabular-s3' - os.makedirs('{}/{}/{}'.format(self.name, self.target, dataset), exist_ok=True) - X_xboost_no_noise = np.convolve(X_xboost, [0.2, 0.6, 0.2], mode='same') - X_train, X_test = X_xboost_no_noise[train_index], X_xboost_no_noise[test_index] + self.loader.smooth = 'conv3' + self.loader.reset() + X_xboost, Y_xboost, G_xboost = self.loader.load_dataset_xboost() + # self.logger.debug(f"X_xboost: {X_xboost.shape}") + self.logger.debug(f"Y_xboost: {Y_xboost.shape}") + # self.logger.debug(f"G_xboost: {G_xboost.shape}") + + Y_discrete = discretizer.fit_transform(Y_xboost) + if Y_discrete.ndim == 2: + Y_discrete = np.sum(Y_discrete, axis=1) + + for i, (train_index, test_index) in enumerate(gss.split(X_xboost, Y_discrete, G_xboost)): + dataset = 'Tabular-conv3' + os.makedirs('{}/{}/{}'.format(self.name, self.loader.target, dataset), exist_ok=True) + X_train, X_test = X_xboost[train_index], X_xboost[test_index] + Y_train, Y_test = Y_xboost[train_index], Y_xboost[test_index] + # self.logger.debug(f"X_train: {X_train.shape}") + # self.logger.debug(f"X_test: {X_test.shape}") + self.logger.debug(f"Y_train: {Y_train.shape}") + self.logger.debug(f"Y_test: {Y_test.shape}") for model in self.get_model_train(): model_id = "{}_{}".format(type(model).__name__, i) @@ -276,7 +373,7 @@ class eNoseTrainer: self.bar.update() continue - model_file = '{}/{}/{}/{}'.format(self.name, self.target, dataset, model_id ) + model_file = '{}/{}/{}/{}'.format(self.name, self.loader.target, dataset, model_id ) tmse, mse, mae, rmse, optimized_model, model_params = self.train_and_score_model(model, X_train, X_test, Y_train, Y_test) @@ -287,7 +384,7 @@ class eNoseTrainer: "ts": ts, "Dataset": dataset, "Samples": Y_xboost.shape[0], - "Target": self.target, + "Target": self.loader.target, "Train Size": Y_train.shape[0], "Train Ratio": Y_train.shape[0]/Y_xboost.shape[0], "Ratio": self.ratio, diff --git a/train_sequence.py b/train_sequence.py index b0a7b92..fd70f07 100644 --- a/train_sequence.py +++ b/train_sequence.py @@ -5,35 +5,37 @@ import warnings warnings.filterwarnings("ignore") source_channels=["MQ 8", "MQ 9", "MQ 135", "TGS 813", "TGS 821", "TGS 2600", "TGS 2602", "TGS 2611-0", "TGS 2612", "TGS 2620"] -#target_variables=['C2H2', 'CH4', 'C3H6', 'CO', 'C2H6', 'C3H8', 'C2H4', 'H2', 'O2'] +target_variables=['C2H2', 'CH4', 'C3H6', 'CO', 'C2H6', 'C3H8', 'C2H4', 'H2', 'O2'] -target_variables=['C2H2'] -eNoseLoaderC2H2 = GasSensorDataLoader("enose_dataset", threshold=0.85, source_channels=source_channels, target_list=target_variables, debug=False) -eNoseC2H2 = eNoseTrainer(eNoseLoaderC2H2, test_size=0.2, debug=True) -eNoseC2H2.fit() +eNoseLoader = GasSensorDataLoader("enose_dataset", threshold=0.85, source_channels=source_channels, target_list=target_variables, debug=False) +eNose = eNoseTrainer(eNoseLoader, test_size=0.5) +eNoseLoader.target_list=['C2H2',] +eNose.fit() +eNoseLoader.target_list=['CH4',] +eNose.fit() +eNoseLoader.target_list=['C3H6',] +eNose.fit() +eNoseLoader.target_list=['CO',] +eNose.fit() +eNoseLoader.target_list=['C2H6',] +eNose.fit() +eNoseLoader.target_list=['C3H8',] +eNose.fit() +eNoseLoader.target_list=['C2H2', 'CH4', 'C3H6', 'CO', 'C2H6',] +eNose.fit() +eNose.wrap_and_save() -target_variables=['CH4'] -eNoseLoaderCH4 = GasSensorDataLoader("enose_dataset", threshold=0.85, source_channels=source_channels, target_list=target_variables, debug=False) -eNoseCH4 = eNoseTrainer(eNoseLoaderCH4, test_size=0.2, debug=True) -eNoseCH4.fit() - -target_variables=['C3H6'] -eNoseLoaderC3H6 = GasSensorDataLoader("enose_dataset", threshold=0.85, source_channels=source_channels, target_list=target_variables, debug=False) -eNoseC3H6 = eNoseTrainer(eNoseLoaderC3H6, test_size=0.2, debug=True) -eNoseC3H6.fit() - - -target_variables=['C2H6'] -eNoseLoaderC2H6 = GasSensorDataLoader("enose_dataset", threshold=0.85, source_channels=source_channels, target_list=target_variables, debug=False) -eNoseC2H6 = eNoseTrainer(eNoseLoaderC2H6, test_size=0.2, debug=True) -eNoseC2H6.fit() - - -target_variables=['H2'] -eNoseLoaderH2 = GasSensorDataLoader("enose_dataset", threshold=0.85, source_channels=source_channels, target_list=target_variables, debug=False) -eNoseH2 = eNoseTrainer(eNoseLoaderH2, test_size=0.2, debug=True) -eNoseH2.fit() - - - -#eNose.wrap_and_save() +# eNoseLoader.target_list=['CH4'] +# eNose.fit() +# +# eNoseLoader.target_list=['C3H6'] +# eNose.fit() +# +# eNoseLoader.target_list=['C2H6'] +# eNose.fit() +# +# eNoseLoader.target_list=['H2'] +# eNose.fit() +# +# eNoseLoader.target_list=['C2H2', 'CH4', 'C3H6', 'C2H6', 'H2'] +# eNose.fit()