Source code for traveltimes_prediction.data_processing.data_processor
import logging
import numpy as np
import pandas as pd
from ..support_files import ColumnNamesRawFiveMin, ColumnNames
from ..support_files.helpers import merge_inner_lists, merge_to_nearest
from .data_entities import DET1, DET2, BCK_TT, REF_TT, TIME
from ..configs.sections_settings import sections_settings, SENSOR_TYPE
# Module-wide logger shared by the travel-times data-processing pipeline.
logger = logging.getLogger('traveltimes_data_processor')
class DataProcessor:
    """
    Class for processing of the data - aggregation, feature engineering.
    """

    def __init__(self, section):
        """
        Constructor.

        :param string section: section identifier, e.g. 'KOCE-LNCE'
        """
        self.section = section
        sensor_types = sections_settings[section]['sensor_types']
        # Feature entities are assembled in a fixed order: optional detector
        # entities (only when the section is configured with that sensor type),
        # then BCK_TT and TIME, which are always present.
        self.feature_entities = \
            ([DET1] if SENSOR_TYPE.SENSOR_DET1 in sensor_types else []) + \
            ([DET2] if SENSOR_TYPE.SENSOR_DET2 in sensor_types else []) + \
            ([DETECTOR_TT] if SENSOR_TYPE.SENSOR_DETECTOR_TT in sensor_types else []) + \
            [BCK_TT] + \
            [TIME]

    # @profile
    def get_features(self, time_interval_list):
        """
        Method for retrieving data from DB, aggregating and feature engineering.

        :param list time_interval_list: list of dicts - intervals --> {'from': datetime, 'to': datetime}
        :return: tuple - pd.DataFrame of features (None when processing failed),
            int - confidence (0-100)
        """
        df_features = None
        confidences = []
        try:
            for cls in self.feature_entities:  # For all feature entities
                if cls is TIME:
                    # TIME needs the index of the other features; handled below.
                    continue
                obj = cls()
                obj.get_data(section=self.section, list_between_times=time_interval_list)
                aggregated_df_list, confidence = obj.process_data()
                # confidences.append(confidence)
                _df = obj.engineer_features()
                if _df is None:
                    # A missing entity invalidates the feature set - stop early.
                    break
                df_features = df_features.join(_df, how='outer') if df_features is not None else _df
            if TIME in self.feature_entities and df_features is not None:
                obj = TIME(time_index=df_features.index)
                obj.process_data()
                time_features = obj.engineer_features()
                df_features = df_features.join(time_features, how='outer')
            if df_features is not None:
                # Confidence = share of non-NaN cells, measured before gap filling.
                confidence_features = 1 - np.sum(pd.isnull(df_features.values)) / df_features.values.size
                # Interpolate interior gaps, then back-/forward-fill the edges.
                # (.bfill()/.ffill() replace the deprecated fillna(method=...).)
                df_features = df_features.interpolate().bfill().ffill()
                confidences.append(confidence_features)
        except Exception as e:
            logger.error('[%s] Feature processing unsuccessful !!' % self.section)
            logger.exception(e)
        # Confidence calculation - magic -- for now mean.
        # NOTE(review): filter(None, ...) also drops exact-zero confidences - confirm intended.
        # Guarding on the filtered list (not on `confidences`) avoids int(nan)
        # when every collected confidence is falsy; 0 (int) matches the
        # documented return type, unlike the previous 0.0.
        valid_confidences = list(filter(None, confidences))
        confidence = int(np.mean(merge_inner_lists(valid_confidences)) * 100) if valid_confidences else 0
        return df_features, confidence

    def get_referential_data(self, time_interval_list):
        """
        Method for retrieval of the referential data.

        :param list time_interval_list: list of dicts - intervals --> {'from': datetime, 'to': datetime}
        :return: pd.DataFrame - raw referential travel-time records as retrieved by REF_TT
        """
        obj = REF_TT()
        obj.get_data(section=self.section, list_between_times=time_interval_list)
        return obj.retrieved_data

    @staticmethod
    def align_training_dataset(X, Y, Y_bck=None):
        """
        Method for preparation of the data for training & visualization - aligning and leaving only the records with
        timestamp contained in both X and Y (and Y_bck).

        :param pd.DataFrame X: dataframe of features
        :param pd.DataFrame Y: dataframe of the true values of travel times
        :param pd.DataFrame Y_bck: dataframe of the backward predicted traveltimes - optional
        :return: tuple - pd.DataFrame of features, pd.Series of true values of travel times, optionally the
            backward-predicted travel-time column; None (single value, not a tuple) when X or Y is missing -
            callers must check before unpacking.
        """
        if not isinstance(Y, pd.DataFrame):
            Y = pd.DataFrame(Y)
        if X is not None and Y is not None:
            # Re-index Y on its calculation time when that column is present,
            # then snap each Y record to the nearest X timestamp.
            tmp = merge_to_nearest(
                X,
                Y.set_index(ColumnNamesRawFiveMin.CALC_TIME)
                if ColumnNamesRawFiveMin.CALC_TIME in Y.columns else Y)
            tmp_cols = list(tmp.columns)
            tmp_cols.remove(ColumnNamesRawFiveMin.TT_REAL)
            if Y_bck is None:
                return tmp[tmp_cols], tmp[ColumnNamesRawFiveMin.TT_REAL]
            else:
                # NOTE(review): assumes exactly one merged column contains
                # FEAT_TT_BCK - raises IndexError otherwise; confirm upstream.
                bck_col = [c for c in tmp.columns if ColumnNames.FEAT_TT_BCK in c][0]
                return tmp[tmp_cols], tmp[ColumnNamesRawFiveMin.TT_REAL], tmp[bck_col]
        else:
            logger.error("Invalid data received, aligning phase skipped !")
            return None