# Source code for traveltimes_prediction.data_processing.data_processor

import logging

import numpy as np
import pandas as pd

from ..configs.sections_settings import sections_settings, SENSOR_TYPE
from ..support_files import ColumnNamesRawFiveMin, ColumnNames
from ..support_files.helpers import merge_inner_lists, merge_to_nearest
from .data_entities import DET1, DET2, DETECTOR_TT, BCK_TT, REF_TT, TIME

logger = logging.getLogger('traveltimes_data_processor')


class DataProcessor:
    """ Class for processing of the data - aggregation, feature engineering. """

    def __init__(self, section):
        """
        Constructor.

        :param string section: section identifier, e.g. 'KOCE-LNCE'
        """
        self.section = section
        # Sensor-driven feature entities, included only when the section's
        # settings declare the matching sensor type; BCK_TT and TIME are
        # always part of the pipeline. Order is fixed and matches the
        # original concatenation order (DET1, DET2, DETECTOR_TT, BCK_TT, TIME).
        sensor_types = sections_settings[section]['sensor_types']
        sensor_entity_map = (
            (SENSOR_TYPE.SENSOR_DET1, DET1),
            (SENSOR_TYPE.SENSOR_DET2, DET2),
            (SENSOR_TYPE.SENSOR_DETECTOR_TT, DETECTOR_TT),
        )
        self.feature_entities = [entity for sensor, entity in sensor_entity_map
                                 if sensor in sensor_types] + [BCK_TT, TIME]

    # @profile
[docs] def get_features(self, time_interval_list): """ Method for retrieving data from DB, aggregating and feature engineering. :param list time_interval_list: list of dicts - intervals --> {'from': datetime, 'to': datetime} :return: tuple - pd.DataFrame features, int - confidence (0-100) """ df_features = None confidences = [] try: for cls in self.feature_entities: # For all feature entities if cls is TIME: continue obj = cls() obj.get_data(section=self.section, list_between_times=time_interval_list) aggregated_df_list, confidence = obj.process_data() # confidences.append(confidence) _df = obj.engineer_features() if _df is None: break df_features = df_features.join(_df, how='outer') if df_features is not None else _df if TIME in self.feature_entities and df_features is not None: obj = TIME(time_index=df_features.index) obj.process_data() time_features = obj.engineer_features() df_features = df_features.join(time_features, how='outer') if df_features is not None: confidence_features = 1 - np.sum(pd.isnull(df_features.values))/df_features.values.size df_features = df_features.interpolate().fillna(method='bfill').fillna(method='ffill') confidences.append(confidence_features) except Exception as e: logger.error('[%s] Feature processing unsuccessful !!' % self.section) logger.exception(e) # Confidence calculation - magic -- for now mean confidence = int(np.mean(merge_inner_lists(list(filter(None, confidences))))*100) if confidences else 0.0 return df_features, confidence
[docs] def get_referential_data(self, time_interval_list): """ Method for retrieval of the referential data. :param list time_interval_list: list of dicts. :return: pd.DataFrame """ obj = REF_TT() obj.get_data(section=self.section, list_between_times=time_interval_list) return obj.retrieved_data
@staticmethod
[docs] def align_training_dataset(X, Y, Y_bck=None): """ Method for preparation of the data for training & visualization - aligning and leaving only the records with timestamp contained in both X and Y (and Y_bck). :param pd.DataFrame X: dataframe of features :param pd.DataFrame Y: dataframe of the true values of travel times :param pd.DataFrame Y_bck: dataframe of the backward predicted traveltimes - optional :return: tuple - pd.DataFrame of features, pd.DataFrame of true values of travel times, optionally pd.DataFrame with the backward predicted traveltimes + dataframe of time """ if not isinstance(Y, pd.DataFrame): Y = pd.DataFrame(Y) if X is not None and Y is not None: tmp = merge_to_nearest(X, Y.set_index(ColumnNamesRawFiveMin.CALC_TIME) if ColumnNamesRawFiveMin.CALC_TIME in Y.columns else Y) tmp_cols = list(tmp.columns) tmp_cols.remove(ColumnNamesRawFiveMin.TT_REAL) if Y_bck is None: return tmp[tmp_cols], tmp[ColumnNamesRawFiveMin.TT_REAL] else: return tmp[tmp_cols], tmp[ColumnNamesRawFiveMin.TT_REAL], tmp[[x for x in tmp.columns if ColumnNames.FEAT_TT_BCK in x][0]] else: logger.error("Invalid data received, aligning phase skipped !") return None