import logging
import datetime
import time
import os
import math
import pickle
import subprocess
from ..configs import sections_settings, all_models, real_tt_calculator_path, training_interval_seconds, \
training_data_interval, dumped_features_path, sections_to_maintain
from ..data_processing import DataProcessor
from ..support_files.helpers import partition_interval, subtract_time_intervals
from ..support_files.exceptions import DataNotInDatabaseException
from ..models import params_converter, model_translator, params_grids_training
from ..data_processing import DBInterface
from test_suite import TestFramework
# Module-level logger used by every TrainingManager method below.
logger = logging.getLogger('traveltimes_training')
class TrainingManager:
    """
    Class for training of the models for specified sections.

    One training run (see ``_make_models_for_sections``) computes the training interval,
    loads previously dumped features, prepares data for the not yet processed part of the
    interval, merges and dumps the data and finally fits and stores every allowed model.
    """

    def __init__(self):
        """
        Constructor.

        The DB interface is not created here, it is (re)created by ``_reconnect_to_db``.
        """
        self.db_interface = None

    def train(self, section, MODEL, model_params, X_Y_data):
        """
        Method for training of the model. A successfully fitted model is saved to DB.

        :param string section: e.g. 'LNCE-KOCE'
        :param class MODEL: class of the model to be created - e.g. ClusterModel, TimeDomainModel...
        :param dict model_params: model parameters
        :param tuple X_Y_data: tuple of DataFrames - input data for training, None if no data are available
        """
        logger.info("[%s] Training model -- %s -- ...", section, MODEL.name)
        model = None
        if X_Y_data is not None:
            X, Y = X_Y_data
            try:
                model = MODEL(**model_params).fit(X=X, Y=Y)  # Train model
                logger.info('[%s] Model -- %s -- fitted ... ', section, MODEL.name)
            except Exception as e:
                # Log first, so the root cause is recorded even if the debug dump below fails.
                logger.exception(e)
                logger.error('[%s] Model -- %s -- has not been fitted !!', section, MODEL.name)
                # Dump the offending inputs for later inspection; one timestamp keeps both files paired.
                stamp = datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')
                try:
                    X.to_pickle('dump_X_err_' + stamp)
                    Y.to_pickle('dump_Y_err_' + stamp)
                except Exception:
                    # The dump is only a debugging aid, it must not abort training of the other models.
                    logger.exception('[%s] Unable to dump the data of the failed training !', section)
        else:
            logger.error('[%s] Invalid data retrieved, model -- %s -- has not been fitted !!', section, MODEL.name)

        if model is None:
            logger.warning('[%s] Model -- %s -- has not been saved to DB !!!', section, MODEL.name)
            return

        # The first and the last index value of X delimit the time range the model was trained on.
        time_from = X.index[0]
        time_to = X.index[-1]
        self.db_interface.save_model(section=section, model=model.dump(),
                                     time_from=time_from, time_to=time_to,
                                     model_params=params_converter(model_params))
        logger.info('[%s] Model -- %s -- saved to DB ...', section, MODEL.name)

    def prepare_training_data(self, section, time_interval_list):
        """
        Method for retrieving the data from database and preparing them for training.

        :param string section: e.g. 'KOCE-LNCE'
        :param list time_interval_list: list of dicts - {'from: datetime, 'to': datetime}
        :return: tuple - (X, Y) - data for training (the result of DataProcessor.align_training_dataset)
        """
        logger.info('[%s] Retrieving data for training ...', section)
        DP = DataProcessor(section=section)
        ref_df = DP.get_referential_data(time_interval_list=time_interval_list)
        logger.info('[%s] Data for training retrieved ...', section)
        feature_df, _ = DP.get_features(time_interval_list=time_interval_list)
        logger.info('[%s] Features for training engineered ...', section)
        # Aligning is done due to possibility that ref traveltimes might contain traveltimes
        # which are not in BCK traveltimes (and features)
        X_Y = DataProcessor.align_training_dataset(X=feature_df, Y=ref_df)  # Align X with Y
        return X_Y

    def training_loop(self, sections=sections_to_maintain, training_interval_s=training_interval_seconds, train_for=None):
        """
        Method for the training to be kept in loop, calls self._make_models_for_sections().
        The loop never returns on its own.

        :param list sections: list of section whose models are gonna be trained.
        :param int training_interval_s: seconds, how often should be the models trained.
        :param datetime.datetime train_for: for TEST purposes, datetime for which should be trained.
        """
        while True:
            self._reconnect_to_db()
            tick = time.time()
            self._make_models_for_sections(sections, train_for=train_for)
            elapsed = time.time() - tick
            self._close_db_connection()
            logger.info("Training loop finished in %.2f seconds ...", elapsed)
            # Sleep only for the rest of the interval which was not consumed by the training itself.
            remaining = training_interval_s - elapsed
            if remaining > 0:
                time.sleep(remaining)

    def _necessary_to_retrain(self, last_timestamp, days=5):
        """
        Method for calculation of time interval, from which the data will be used for training.

        :param datetime.datetime last_timestamp: timestamp of the most recent data in DB (tt3.output_traveltimes)
        :param int days: how many days should be used for the training
        :return: dict - {'from': datetime, 'to': datetime}
        :raises ValueError: if last_timestamp is None
        """
        if last_timestamp is None:
            raise ValueError('Last timestamp cannot be equal to None !!')
        return {'from': last_timestamp - datetime.timedelta(days=days),
                'to': last_timestamp}

    def _calculate_referential_traveltimes(self, section, training_timespan):
        """
        Method for invoking the calculation of referential traveltimes by calling calculator written in C#.

        :param string section: e.g. 'KOCE-LNCE'
        :param list training_timespan: list of dicts - {'from': datetime, 'to': datetime}; only the 'from'
            of the first and the 'to' of the last item are used
        :return: boolean - True if finished successfully or nothing had to be calculated, False if the
            calculator failed; None if training_timespan is empty
        """
        if not training_timespan:
            return None

        _from = training_timespan[0]['from']
        _to = training_timespan[-1]['to']
        # Check the last occurrence calculated
        latest_calculated = self.db_interface.check_latest_referential_traveltime(section=section)
        # NOTE(review): None is presumably returned when nothing has been calculated yet - confirm.
        if latest_calculated is not None:
            if latest_calculated > _to:
                logger.info("[%s] Real traveltimes calculation skipped, the referential traveltimes for requested timespan are already calculated.", section)
                return True
            if latest_calculated > _from:
                # Do not recalculate the part which is already in DB.
                _from = latest_calculated

        logger.info("[%s] Recalculation of real traveltimes started ...", section)
        dt = _from.strftime("%Y-%m-%d %H:%M:%S")
        # The calculator gets the start and the number of (whole, rounded up) hours to process.
        hours = math.ceil((_to - _from).total_seconds() / 3600)
        args = [real_tt_calculator_path, section, dt, str(hours)]
        ret = subprocess.run(args=args, stdout=subprocess.DEVNULL)
        if ret.returncode == 0:
            logger.info("[%s] Real traveltimes - ground truth for training calculated ...", section)
            return True
        logger.error("[%s] Real traveltimes have not been calculated !", section)
        return False

    def _has_preprocessed_data(self, section):
        """
        Method for looking up if the section has some previously saved data.
        Reads the file written by ``_dump_data`` for the same section.

        :param string section: e.g. 'KOCE-LNCE'
        :return: tuple - (dict - interval for training, tuple - data), (None, None) if no file exists
        """
        # Must be the same path as the one used in _dump_data, otherwise the dump is never found.
        f_name = os.path.join(dumped_features_path, section + ".pickle")
        try:
            with open(f_name, 'rb') as f:
                # NOTE: pickle is safe only for trusted files; this one is produced by _dump_data.
                d = pickle.load(f)
        except FileNotFoundError:
            logger.info("[%s] Saved data not found ...", section)
            return None, None
        logger.info("[%s] Saved data found and loaded ...", section)
        return d['interval'], d['data']

    def _recalculate_training_interval(self, processed_interval, generated_interval):
        """
        Method for recalculation of the training data interval if previously saved data were found.

        :param dict processed_interval: {'from': dt, 'to': dt}, interval from which are previously saved data
        :param dict generated_interval: {'from': dt, 'to': dt}, interval which should be used for training
        :return: list - list of interval chunked into defined length intervals, empty list if
            subtract_time_intervals returns None
        """
        interval = subtract_time_intervals(new=generated_interval, saved=processed_interval)
        if interval is None:
            return []
        return partition_interval(time_interval=interval)

    def _merge_data(self, preprocessed_data, data, preprocessed_data_interval, data_interval):
        """
        Method for merging of the previously used data and newly generated data.

        :param tuple preprocessed_data: tuple(pd.DataFrame, pd.DataFrame) -- X, Y
        :param tuple data: tuple(pd.DataFrame, pd.DataFrame) -- X, Y
        :param dict preprocessed_data_interval: {'from': dt, 'to': dt}, interval from which are previously saved data
        :param dict data_interval: {'from': dt, 'to': dt}, interval which should be used for training
        :return: tuple(tuple(X, Y), dict - interval)
        """
        # Local import - pandas is not imported at the top of this module.
        import pandas as pd

        if preprocessed_data is None:
            return data, data_interval
        if data is None:
            return preprocessed_data, preprocessed_data_interval
        if preprocessed_data_interval['to'] < data_interval['from']:
            # Saved data end before the requested interval starts, only the new data are used.
            return data, data_interval

        X1, Y1 = data
        X2, Y2 = preprocessed_data
        # Saved data first, new data after them; pd.concat replaces the removed DataFrame.append.
        X = pd.concat([X2, X1], ignore_index=False).bfill().ffill()
        Y = pd.concat([Y2, Y1], ignore_index=False).bfill().ffill()
        return (X, Y), {'from': preprocessed_data_interval['from'],
                        'to': data_interval['to']}

    def _dump_data(self, data, section, time_interval):
        """
        Method for dumping of the data that has been used for the training.

        :param tuple data: (X, Y) of data used for training.
        :param string section: e.g. 'KOCE-LNCE'
        :param time_interval: dict - time interval used for training
        :return: tuple - the unchanged data (X, Y), None if the data were not saved
        """
        if data is None or data[0] is None or data[1] is None:
            logger.warning("[%s] Processed data not saved, invalid structure !", section)
            return None

        # Always save only the amount of data, that are used for training
        # FIXME -- uncomment in the future - now lets keep all processed data
        # end = data[0].index[-1]
        # start = data[0].index.searchsorted(end - datetime.timedelta(days=training_data_interval))
        # start = data[0].index[start+1]
        #
        # X = data[0].ix[start:end]
        # Y = data[1].ix[start:end]
        # data = (X, Y)
        f_name = os.path.join(dumped_features_path, section + ".pickle")
        with open(f_name, 'wb') as f:
            pickle.dump({'data': data, 'interval': time_interval}, f)
        logger.info("[%s] Processed features data dumped ...", section)
        return data

    def _tune_model(self, X, Y, section, model_type):
        """
        Method for tuning of the model`s hyper parameters.
        Params stored in DB are reused if TestFramework.est_better_than_bck accepts them,
        otherwise the params with the lowest combined loss of the params grid are returned.

        :param pd.DataFrame X:
        :param pd.Series Y:
        :param string section:
        :param string model_type:
        :return: dict - hyperparameters
        """
        # Check model occurrence in database, if it has been already saved, load the params.
        logger.info("[%s] Checking whether it is necessary to tune the model -- %s ...", section, model_type)
        _model_type, _model_params = self.db_interface.get_model_params(section=section, model_type=model_type)
        TF = TestFramework(X_path=X, Y_path=Y)

        if _model_params is None or _model_type is None:
            execute_tuning = True
            logger.warning('[%s] Model -- %s -- and its params have not been found in DB !', section, model_type)
            # Nothing usable in DB, so the requested model type has to be tuned.
            # NOTE(review): assumes model_translator accepts the requested model_type name - confirm.
            algorithm = model_translator(model_type)
        else:
            algorithm = model_translator(_model_type)
            _model_params = params_converter(_model_params)
            # Train using params, check performance - comparison of BCK and estimated differences from real traveltime.
            Y_bck, estimated_Y, test_Y, _, _ = TF.train_predict_classifier(algorithm=algorithm,
                                                                           params=_model_params,
                                                                           cv_ratio=0.8,
                                                                           portion=1.0,
                                                                           valid_ratio=0.99)
            execute_tuning = not TF.est_better_than_bck(bck=Y_bck, est=estimated_Y, ref=test_Y)

        if not execute_tuning:
            logger.info('[%s] Model -- %s -- was not tuned, old params are sufficient ...', section, model_type)
            return _model_params

        logger.info('[%s] Model -- %s -- is being tuned ...', section, model_type)
        out_cluster = TF.regressor_evaluation_parallel(algorithm=algorithm,
                                                       param_grid=params_grids_training[algorithm.name],
                                                       parallel=False)
        # The best params are those with the lowest sum of 'loss' and 'valid_loss'.
        out_cluster['combined_loss'] = out_cluster['loss'] + out_cluster['valid_loss']
        best_params = out_cluster.sort_values(by=['combined_loss'], ascending=True)['params'].iloc[0]
        return best_params

    def _make_models_for_sections(self, sections, train_for=None):
        """
        Private method for running one training.
        Any exception raised for a section is logged and the next section is processed.

        :param list sections: e.g. ['KOCE-LNCE', 'ZTCE-HZMBO', ...]
        :param datetime.datetime train_for: datetime, for which should be created model
        """
        for s in sections:  # Models share features
            try:
                allowed_models = list(all_models - sections_settings[s]['forbidden_models'])
                logger.info("[%s] Computing training interval", s)
                training_interval = self._compute_training_interval(section=s, last_timestamp=train_for)
                logger.info("[%s] Looking for saved features data...", s)
                preprocessed_data_interval, preprocessed_data = self._has_preprocessed_data(section=s)
                logger.info("[%s] Recalculation of training interval ...", s)
                recalculated_interval_list = self._recalculate_training_interval(
                    processed_interval=preprocessed_data_interval,
                    generated_interval=training_interval)
                # Calculation of referential traveltimes is currently disabled:
                # self._calculate_referential_traveltimes(section=s, training_timespan=recalculated_interval_list)
                X_Y_data = self.prepare_training_data(section=s, time_interval_list=recalculated_interval_list)
                if X_Y_data is None:
                    logger.warning('[%s] There are not any new data. Using old data for fitting of the model !', s)
                X_Y_data, training_interval = self._merge_data(preprocessed_data=preprocessed_data,
                                                               data=X_Y_data,
                                                               preprocessed_data_interval=preprocessed_data_interval,
                                                               data_interval=training_interval)
                X_Y_data = self._dump_data(section=s, data=X_Y_data, time_interval=training_interval)
                for m in sections_settings[s]['models_config']:
                    if m['model'] not in allowed_models:
                        continue
                    # Tuning is currently disabled, the configured params are always used:
                    # self._tune_model(X=X_Y_data[0], Y=X_Y_data[1], section=s, model_type=m['model'].name)
                    model_params = None
                    self.train(section=s,
                               MODEL=m['model'],
                               model_params=model_params if model_params is not None else m['model_params'],
                               X_Y_data=X_Y_data)
            except Exception as e:
                logger.exception(e)

    def _compute_training_interval(self, section, last_timestamp=None):
        """
        Method for computation of the training interval of data which should be used for the training.

        :param string section: e.g. 'KOCE-LNCE'
        :param datetime.datetime last_timestamp: For TEST, if None the last timestamp is read from DB
        :return: dict - training interval - {'from': datetime, 'to': datetime}
        :raises DataNotInDatabaseException: if the last timestamp cannot be retrieved from DB
        """
        # First get the timestamp of the last record in DB
        logger.debug('[%s] Retrieving last timestamp from DB ...', section)
        if last_timestamp is None:
            last_timestamp = self.db_interface.get_last_timestamp(section=section)
            if last_timestamp is None:
                logger.error('Unable to retrieve the last timestamp of data for section %s !!', section)
                raise DataNotInDatabaseException('Unable to retrieve last timestamp of data for section %s' % section)
        # Make interval for training
        return self._necessary_to_retrain(last_timestamp=last_timestamp, days=training_data_interval)

    def _reconnect_to_db(self):
        """
        Private method for reconnection to DB in case of erroneous connection.
        At most 9 attempts are made, with 5 seconds of sleep after each failed one.

        :return: void
        """
        # NOTE(review): if every attempt fails, self.db_interface keeps its previous value
        # (None or the interface from the previous loop) - confirm that this is intended.
        for attempt in range(1, 10):
            try:
                self.db_interface = DBInterface()
                break
            except Exception:
                # exc_info keeps the reason of the failed connection in the log.
                logger.warning("Reconnecting to DB not successful - attempt %i !!", attempt, exc_info=True)
                time.sleep(5)

    def _close_db_connection(self):
        """
        Private method for closing of the DB connection.

        :return: void
        :raises Exception: if no DB interface has been created
        """
        if self.db_interface is None:
            raise Exception("Connection to DB cannot be established !!!!")
        self.db_interface._close_connection()