Source code for traveltimes_prediction.interface.training_manager

import logging
import datetime
import time
import os
import math

import pickle
import subprocess

from ..configs import sections_settings, all_models, real_tt_calculator_path, training_interval_seconds, \
    training_data_interval, dumped_features_path, sections_to_maintain
from ..data_processing import DataProcessor
from ..support_files.helpers import partition_interval, subtract_time_intervals
from ..support_files.exceptions import DataNotInDatabaseException

from ..models import params_converter, model_translator, params_grids_training
from ..data_processing import DBInterface

from test_suite import TestFramework

logger = logging.getLogger('traveltimes_training')


class TrainingManager:
    """ Class for training of the models for specified sections. """

    def __init__(self):
        """ Constructor. """
        self.db_interface = None
    def train(self, section, MODEL, model_params, X_Y_data):
        """
        Method for training of the model.

        :param string section: e.g. 'LNCE-KOCE'
        :param class MODEL: class of the model to be created - e.g. ClusterModel, TimeDomainModel...
        :param dict model_params: model parameters
        :param tuple X_Y_data: tuple of DataFrames - input data for training
        """
        logger.info("[%s] Training model -- %s -- ...", section, MODEL.name)
        model = None
        if X_Y_data is not None:
            X, Y = X_Y_data
            try:
                model = MODEL(**model_params).fit(X=X, Y=Y)  # Train model
                logger.info('[%s] Model -- %s -- fitted ...', section, MODEL.name)
            except Exception as e:
                X.to_pickle('dump_X_err_' + datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S'))
                Y.to_pickle('dump_Y_err_' + datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S'))
                logger.exception(e)
                logger.error('[%s] Model -- %s -- has not been fitted !!', section, MODEL.name)
        else:
            logger.error('[%s] Invalid data retrieved, model -- %s -- has not been fitted !!', section, MODEL.name)
        if model is not None:
            time_from = X.index[0]
            time_to = X.index[-1]
            self.db_interface.save_model(section=section, model=model.dump(), time_from=time_from,
                                         time_to=time_to, model_params=params_converter(model_params))
            logger.info('[%s] Model -- %s -- saved to DB ...', section, MODEL.name)
        else:
            logger.warning('[%s] Model -- %s -- has not been saved to DB !!!', section, MODEL.name)
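
    # A minimal usage sketch (hypothetical, not part of the original module): training one
    # model directly. `SomeModel` stands for any class from `all_models` exposing `.name`,
    # `.fit()` and `.dump()`; `X_df` / `Y_df` are already aligned, timestamp-indexed DataFrames:
    #
    #   manager = TrainingManager()
    #   manager._reconnect_to_db()
    #   manager.train(section='KOCE-LNCE', MODEL=SomeModel,
    #                 model_params={'some_param': 1}, X_Y_data=(X_df, Y_df))
    #   manager._close_db_connection()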
    def prepare_training_data(self, section, time_interval_list):
        """
        Method for retrieving the data from the database and preparing it for training.

        :param string section: e.g. 'KOCE-LNCE'
        :param list time_interval_list: list of dicts - {'from': datetime, 'to': datetime}
        :return: tuple - (X, Y) - data for training
        """
        logger.info('[%s] Retrieving data for training ...', section)
        DP = DataProcessor(section=section)
        ref_df = DP.get_referential_data(time_interval_list=time_interval_list)
        logger.info('[%s] Data for training retrieved ...', section)
        feature_df, _ = DP.get_features(time_interval_list=time_interval_list)
        logger.info('[%s] Features for training engineered ...', section)
        # Alignment is needed because the referential traveltimes may contain traveltimes
        # which are not present in the BCK traveltimes (and features)
        X_Y = DataProcessor.align_training_dataset(X=feature_df, Y=ref_df)  # Align X with Y
        return X_Y
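
    # Illustration (hypothetical timestamps): `time_interval_list` has the shape produced
    # by _recalculate_training_interval(), e.g.
    #
    #   time_interval_list = [
    #       {'from': datetime.datetime(2018, 1, 1), 'to': datetime.datetime(2018, 1, 2)},
    #       {'from': datetime.datetime(2018, 1, 2), 'to': datetime.datetime(2018, 1, 3)},
    #   ]
    #   X, Y = manager.prepare_training_data(section='KOCE-LNCE',
    #                                        time_interval_list=time_interval_list)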
    def training_loop(self, sections=sections_to_maintain, training_interval_s=training_interval_seconds,
                      train_for=None):
        """
        Method keeping the training in a loop, calls self.train().

        :param list sections: list of sections whose models are going to be trained.
        :param int training_interval_s: seconds, how often the models should be trained.
        :param datetime.datetime train_for: for TEST purposes, datetime for which the models should be trained.
        """
        while True:
            self._reconnect_to_db()
            tick = time.time()
            self._make_models_for_sections(sections, train_for=train_for)
            tock = time.time()
            self._close_db_connection()
            logger.info("Training loop finished in %.2f seconds ...", (tock - tick))
            if training_interval_s - (tock - tick) > 0:
                time.sleep(training_interval_s - (tock - tick))
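
    # Typical entry point, a sketch assuming the defaults from ..configs:
    #
    #   TrainingManager().training_loop()                    # all maintained sections, forever
    #   TrainingManager().training_loop(sections=['KOCE-LNCE'],
    #                                   training_interval_s=3600)  # one section, hourly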
    def _necessary_to_retrain(self, last_timestamp, days=5):
        """
        Method for calculation of the time interval from which the data will be used for training.

        :param datetime.datetime last_timestamp: timestamp of the most recent data in DB (tt3.output_traveltimes)
        :param int days: how many days should be used for the training
        :return: dict - {'from': datetime, 'to': datetime}
        """
        if last_timestamp is None:
            raise ValueError('Last timestamp cannot be equal to None !!')
        d = {'from': last_timestamp - datetime.timedelta(days=days), 'to': last_timestamp}
        return d

    def _calculate_referential_traveltimes(self, section, training_timespan):
        """
        Method for invoking the calculation of referential traveltimes by calling the calculator written in C#.

        :param string section: e.g. 'KOCE-LNCE'
        :param list training_timespan: list of dicts - {'from': datetime, 'to': datetime}
        :return: boolean - True if finished successfully, False otherwise
        """
        if training_timespan:
            _from = training_timespan[0]['from']
            _to = training_timespan[-1]['to']
            # Check the last occurrence calculated
            latest_calculated = self.db_interface.check_latest_referential_traveltime(section=section)
            if latest_calculated > _from:
                _from = latest_calculated
            if latest_calculated > _to:
                logger.info("[%s] Real traveltimes calculation skipped, the referential traveltimes "
                            "for the requested timespan are already calculated.", section)
                return True
            logger.info("[%s] Recalculation of real traveltimes started ...", section)
            dt = _from.strftime("%Y-%m-%d %H:%M:%S")
            hours = _to - _from
            hours = math.ceil(hours.total_seconds() / 3600)
            args = [real_tt_calculator_path, section, dt, str(hours)]
            ret = subprocess.run(args=args, stdout=subprocess.DEVNULL)
            if ret.returncode == 0:
                logger.info("[%s] Real traveltimes - ground truth for training calculated ...", section)
                return True
            logger.error("[%s] Real traveltimes have not been calculated !", section)
            return False
        return None

    def _has_preprocessed_data(self, section):
        """
        Method for looking up whether the section has some previously saved data.

        :param string section: e.g. 'KOCE-LNCE'
        :return: tuple - (dict - interval for training, tuple - data)
        """
        # The path must match the one used in _dump_data()
        f_name = os.path.join(dumped_features_path, section + ".pickle")
        try:
            with open(f_name, 'rb') as f:
                d = pickle.load(f)
        except FileNotFoundError:
            logger.info("[%s] Saved data not found ...", section)
            return None, None
        logger.info("[%s] Saved data found and loaded ...", section)
        return d['interval'], d['data']

    def _recalculate_training_interval(self, processed_interval, generated_interval):
        """
        Method for recalculation of the training data interval if previously saved data were found.

        :param dict processed_interval: {'from': dt, 'to': dt}, interval of the previously saved data
        :param dict generated_interval: {'from': dt, 'to': dt}, interval which should be used for training
        :return: list - the interval chunked into intervals of the defined length
        """
        interval = subtract_time_intervals(new=generated_interval, saved=processed_interval)
        if interval is not None:
            return partition_interval(time_interval=interval)
        return []
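
    # Worked example of the interval arithmetic above (hypothetical timestamps):
    #
    #   self._necessary_to_retrain(last_timestamp=datetime.datetime(2018, 6, 10, 12), days=5)
    #   # -> {'from': datetime.datetime(2018, 6, 5, 12), 'to': datetime.datetime(2018, 6, 10, 12)}
    #
    # If data up to 2018-06-08 are already dumped, _recalculate_training_interval() subtracts
    # the saved interval and partitions only the remaining 2018-06-08 .. 2018-06-10 span.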
    def _merge_data(self, preprocessed_data, data, preprocessed_data_interval, data_interval):
        """
        Method for merging of the previously used data and the newly generated data.

        :param tuple preprocessed_data: tuple(pd.DataFrame, pd.DataFrame) -- X, Y
        :param tuple data: tuple(pd.DataFrame, pd.DataFrame) -- X, Y
        :param dict preprocessed_data_interval: {'from': dt, 'to': dt}, interval of the previously saved data
        :param dict data_interval: {'from': dt, 'to': dt}, interval which should be used for training
        :return: tuple(tuple(X, Y), dict - interval)
        """
        if preprocessed_data is None:
            return data, data_interval
        elif data is None:
            return preprocessed_data, preprocessed_data_interval
        elif preprocessed_data_interval['to'] < data_interval['from']:
            return data, data_interval
        else:
            X1, Y1 = data
            X2, Y2 = preprocessed_data
            X = X2.append(X1, ignore_index=False).fillna(method='bfill').fillna(method='ffill')
            Y = Y2.append(Y1, ignore_index=False).fillna(method='bfill').fillna(method='ffill')
            return (X, Y), {'from': preprocessed_data_interval['from'], 'to': data_interval['to']}

    def _dump_data(self, data, section, time_interval):
        """
        Method for dumping of the data that has been used for the training.

        :param tuple data: (X, Y) of data used for training.
        :param string section: e.g. 'KOCE-LNCE'
        :param time_interval: dict - time interval used for training
        :return: tuple - (pandas.DataFrame, pandas.DataFrame)
        """
        if data is not None and (data[0] is not None and data[1] is not None):
            # Always save only the amount of data that is used for training
            # FIXME -- uncomment in the future - for now let's keep all processed data
            # end = data[0].index[-1]
            # start = data[0].index.searchsorted(end - datetime.timedelta(days=training_data_interval))
            # start = data[0].index[start+1]
            #
            # X = data[0].ix[start:end]
            # Y = data[1].ix[start:end]
            # data = (X, Y)
            f_name = os.path.join(dumped_features_path, section + ".pickle")
            with open(f_name, 'wb') as f:
                pickle.dump({'data': data, 'interval': time_interval}, f)
            logger.info("[%s] Processed features data dumped ...", section)
            return data
        else:
            logger.warning("[%s] Processed data not saved, invalid structure !", section)
            return None
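
    # Behaviour sketch of _merge_data() (hypothetical frames and timestamps): the old and new
    # (X, Y) pairs are concatenated on the time index and the gaps are back-/forward-filled:
    #
    #   merged, interval = manager._merge_data(
    #       preprocessed_data=(X_old, Y_old), data=(X_new, Y_new),
    #       preprocessed_data_interval={'from': t0, 'to': t1},
    #       data_interval={'from': t1, 'to': t2})
    #   # -> interval == {'from': t0, 'to': t2}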
    def _tune_model(self, X, Y, section, model_type):
        """
        Method for tuning of the model's hyperparameters.

        :param pd.DataFrame X:
        :param pd.Series Y:
        :param string section:
        :param string model_type:
        :return: dict - hyperparameters
        """
        # Check model occurrence in the database; if it has already been saved, load the params.
        logger.info("[%s] Checking whether it is necessary to tune the model -- %s ...", section, model_type)
        _model_type, _model_params = self.db_interface.get_model_params(section=section, model_type=model_type)
        TF = TestFramework(X_path=X, Y_path=Y)
        if _model_params is None or _model_type is None:
            execute_tuning = True
            logger.warning('[%s] Model -- %s -- and its params have not been found in DB !', section, model_type)
        else:
            execute_tuning = False
        if not execute_tuning:
            _model_type = model_translator(_model_type)
            _model_params = params_converter(_model_params)
            # Train using the params and check the performance - comparison of the BCK and
            # estimated differences from the real traveltime.
            Y_bck, estimated_Y, test_Y, _, _ = TF.train_predict_classifier(algorithm=_model_type,
                                                                           params=_model_params, cv_ratio=0.8,
                                                                           portion=1.0, valid_ratio=0.99)
            execute_tuning = execute_tuning or not TF.est_better_than_bck(bck=Y_bck, est=estimated_Y, ref=test_Y)
        if execute_tuning:
            logger.info('[%s] Model -- %s -- is being tuned ...', section, model_type)
            out_cluster = TF.regressor_evaluation_parallel(algorithm=_model_type,
                                                           param_grid=params_grids_training[_model_type.name],
                                                           parallel=False)
            out_cluster['combined_loss'] = out_cluster['loss'] + out_cluster['valid_loss']
            best_params = out_cluster.sort_values(by=['combined_loss'], ascending=True)['params'].iloc[0]
            return best_params
        logger.info('[%s] Model -- %s -- was not tuned, old params are sufficient ...', section, model_type)
        return _model_params

    def _make_models_for_sections(self, sections, train_for=None):
        """
        Private method for running one training.

        :param list sections: e.g. ['KOCE-LNCE', 'ZTCE-HZMBO', ...]
        :param datetime.datetime train_for: datetime for which the models should be created
        """
        for s in sections:
            # Models share features
            try:
                allowed_models = list(all_models - sections_settings[s]['forbidden_models'])
                logger.info("[%s] Computing training interval", s)
                training_interval = self._compute_training_interval(section=s, last_timestamp=train_for)
                logger.info("[%s] Looking for saved features data ...", s)
                preprocessed_data_interval, preprocessed_data = self._has_preprocessed_data(section=s)
                logger.info("[%s] Recalculation of training interval ...", s)
                recalculated_interval_list = self._recalculate_training_interval(
                    processed_interval=preprocessed_data_interval, generated_interval=training_interval)
                # self._calculate_referential_traveltimes(section=s, training_timespan=recalculated_interval_list)
                X_Y_data = self.prepare_training_data(section=s, time_interval_list=recalculated_interval_list)
                if X_Y_data is None:
                    logger.warning('[%s] There are no new data. Using old data for fitting of the model !', s)
                X_Y_data, training_interval = self._merge_data(preprocessed_data=preprocessed_data, data=X_Y_data,
                                                               preprocessed_data_interval=preprocessed_data_interval,
                                                               data_interval=training_interval)
                X_Y_data = self._dump_data(section=s, data=X_Y_data, time_interval=training_interval)
                for m in sections_settings[s]['models_config']:
                    if m['model'] in allowed_models:
                        model_params = None  # self._tune_model(X=X_Y_data[0], Y=X_Y_data[1], section=s, model_type=m['model'].name)
                        self.train(section=s, MODEL=m['model'],
                                   model_params=model_params if model_params is not None else m['model_params'],
                                   X_Y_data=X_Y_data)
            except Exception as e:
                logger.exception(e)
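
    # Assumed (hypothetical) shape of the per-section configuration consumed above; the real
    # structure lives in ..configs.sections_settings:
    #
    #   sections_settings = {
    #       'KOCE-LNCE': {
    #           'forbidden_models': {SomeModel},
    #           'models_config': [{'model': OtherModel, 'model_params': {'some_param': 1}}],
    #       },
    #   }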
    def _compute_training_interval(self, section, last_timestamp=None):
        """
        Method for computation of the training interval of data which should be used for the training.

        :param string section: e.g. 'KOCE-LNCE'
        :param datetime.datetime last_timestamp: for TEST purposes
        :return: dict - training interval - {'from': datetime, 'to': datetime}
        """
        # First get the timestamp of the last record in DB
        logger.debug('[%s] Retrieving last timestamp from DB ...', section)
        if last_timestamp is None:
            last_timestamp = self.db_interface.get_last_timestamp(section=section)
        if last_timestamp is None:
            logger.error('Unable to retrieve the last timestamp of data for section %s !!', section)
            raise DataNotInDatabaseException('Unable to retrieve last timestamp of data for section %s' % section)
        # Make the interval for training
        time_interval = self._necessary_to_retrain(last_timestamp=last_timestamp, days=training_data_interval)
        return time_interval

    def _reconnect_to_db(self):
        """
        Private method for reconnection to DB in case of an erroneous connection.

        :return: void
        """
        attempt = 1
        while attempt < 10:
            try:
                self.db_interface = DBInterface()
                break
            except Exception:
                logger.warning("Reconnecting to DB not successful - attempt %i !!", attempt)
                attempt += 1
                time.sleep(5)

    def _close_db_connection(self):
        """
        Private method for closing of the DB connection.

        :return: void
        """
        if self.db_interface is not None:
            self.db_interface._close_connection()
        else:
            raise Exception("Connection to DB has not been established !!!!")
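

# A minimal sketch of running the manager as a standalone process; the logging setup below
# is an assumption, only the training_loop() call itself comes from the class above.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    TrainingManager().training_loop()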