Source code for traveltimes_prediction.data_processing.data_entities.detector_tt

import logging
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import pymysql

from ...configs.db_settings import connection_string_data_bs3
from ...configs.sections_settings import SENSOR_TYPE
from ..feature_engineering import features_to_extract
from ...support_files import ColumnNamesRaw, ColumnNames
from .data_entity import DataEntity


class DETECTOR_TT(DataEntity):
    """
    Class implementing the data retrieval & feature creation from the detections' data.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(**kwargs)
        # Connection to the data source - bck, sensors
        self.mysqlconn_data = pymysql.connect(**connection_string_data_bs3)
        # Datetime format that MySQL understands
        self.mysql_datetime_format = '%Y-%m-%d %H:%M:%S'
        self.logger = logging.getLogger('traveltimes_processing_detector_tt')
        self.data_type = SENSOR_TYPE.SENSOR_DETECTOR_TT
        self.features_definitions = features_to_extract[SENSOR_TYPE.SENSOR_DETECTOR_TT]
        self._sensors_order = None  # Filled in by _retrieve_partial_interval_data
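
    # Orientation note (added sketch, derived from the methods below): process_data()
    # drives the whole per-interval pipeline:
    #   self.retrieved_data --_reorganize()--> per-sensor slices, in self._sensors_order
    #   (slice_i, slice_i+1) --_aggregate()--> per-minute dicts of matched traveltimes
    #   per-minute dicts --_finalize_traveltime()--> one representative traveltime per minute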

    # @profile
    def process_data(self):
        """
        Method for processing the retrieved data so it is ready for feature engineering.

        :return: tuple - list of sensors' dataframes, confidence of data retrieved.
        """
        obj = self._reorganize(self.retrieved_data)
        if obj is None:
            return None, None
        output_list = []
        for idx, val in enumerate(obj):
            try:
                if idx != len(obj) - 1:
                    # Aggregate the sub-section between this sensor slice and the next one
                    l_tmp = self._aggregate((val['df'], obj[idx + 1]['df']))
                else:
                    if len(obj) > 2:
                        # Last slice: pair the first sensor with the last one
                        l_tmp = self._aggregate((obj[0]['df'], obj[-1]['df']))
                    else:
                        continue
                l_tmp = self._finalize_traveltime(l_tmp)
                output_list.append({'df': pd.DataFrame(l_tmp), 'sensor_name': val['sensor_name']})
                self.logger.debug('Sensor slice %s has been aggregated ...', val['sensor_name'])
            except Exception as e:
                self.logger.error('Sensor slice %s has not been aggregated !!', val['sensor_name'])
                self.logger.exception(e)
                output_list.append({'df': pd.DataFrame(), 'sensor_name': val['sensor_name']})
        self.aggregated_df_list = output_list
        return output_list, None  # Add some kind of confidence
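
    # A minimal standalone sketch of the matching idea used in _aggregate() below,
    # with made-up plate codes and timestamps (hypothetical values, plain Python):
    #
    #   entries = {'plate_a': 100.0, 'plate_b': 105.0}    # encoded plate -> entry time [s]
    #   exits = [('plate_b', 170.0), ('plate_x', 180.0)]  # (encoded plate, exit time [s])
    #   travel_times, unmatched = [], 0
    #   for plate, t_out in exits:
    #       try:
    #           travel_times.append(t_out - entries.pop(plate))  # O(1) lookup-and-delete
    #       except KeyError:
    #           unmatched += 1  # 'plate_x' was never seen at the entry sensor
    #   # travel_times == [65.0], unmatched == 1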

    # @profile
    def _aggregate(self, obj):
        """
        Method for aggregation of the data. Calculates traveltime values for each minute
        by finding matches between detections.

        :param tuple obj: input dataframe, output dataframe - for the given sub-section
                          (between two slices)
        :return: list of dicts - each dict represents the matches for one minute interval.
        """
        self.logger.debug('Aggregating ...')
        df_in, df_out = obj
        if df_out.empty or df_in.empty:
            raise ValueError('Aggregation is not possible due to empty dataframe/s !')
        df_in = df_in.sort_values(by=ColumnNamesRaw.LP_FULL_ENC, ascending=True).reset_index(drop=True)
        df_out = df_out.sort_values(by=ColumnNamesRaw.CALC_TIME, ascending=False).reset_index(drop=True)
        lps = df_in[ColumnNamesRaw.LP_FULL_ENC].values.tolist()
        time = df_in[ColumnNamesRaw.CALC_TIME].values.tolist()
        ddd = {x: t for x, t in zip(lps, time)}  # creation of dict for super-fast 'lp' lookup
        epoch_length = 60  # one-minute epochs
        l = []
        l_tmp = None
        epoch_stop_time = None
        for item in df_out.values.tolist():
            # item[1] is the detection timestamp, item[2] the encoded license plate
            if (epoch_stop_time is None) or (item[1] <= epoch_stop_time):
                if l_tmp is not None:
                    l.append(l_tmp)
                epoch_stop_time = item[1] - timedelta(seconds=epoch_length)
                _dt = datetime.utcfromtimestamp(int(item[1].value / 1e9))
                l_tmp = {ColumnNames.CALC_TIME: datetime(_dt.year, _dt.month, _dt.day, _dt.hour, _dt.minute),
                         'matches': [],
                         ColumnNamesRaw.TT_LP_UNMATCHED: 0}
            # Dictionary optimization - O(1) search & deletion
            try:
                matched_time = ddd.pop(item[2])
                tt = (item[1].value - matched_time) / 1e9
                l_tmp['matches'].append(tt)
            except KeyError:
                l_tmp[ColumnNamesRaw.TT_LP_UNMATCHED] += 1
            # Previous O(log2(n)) variant, kept for reference:
            # match = index(lps, item[2])
            # tt = (item[1].value - time[match]) / 1e9
            # l_tmp['matches'].append(tt)
            # del time[match]
            # del lps[match]
            # except ValueError:
            #     l_tmp[ColumnNamesRaw.TT_LP_UNMATCHED] += 1
        return l

    # @profile
    def _finalize_traveltime(self, obj):
        """
        Method for finalizing the traveltime value - calculating one representative
        descriptor per one-minute interval.

        :param list obj: list of dictionaries, where one dictionary contains traveltimes
                         of all cars that came to that sub-section in the given minute.
        :return: list of dicts - dicts contain descriptors of the minutes, as many dicts
                 as minutes in data.
        """
        def average(arr):
            _arr = arr['matches']
            if len(_arr) > 2:
                # Hanning-weighted average: weights the middle detections most heavily
                return np.average(_arr, weights=np.hanning(len(_arr)))
            elif len(_arr) == 2:
                # np.hanning(2) is all zeros, so fall back to a plain mean
                return np.mean(_arr)
            return np.nan

        res = list(map(average, obj))
        self.logger.debug('Finalizing instantaneous traveltime value ...')
        for x, _tt in zip(obj, res):
            x[ColumnNamesRaw.TT_LP_MATCH] = _tt
            x.pop('matches')
        return obj

    # @profile
    def _reorganize(self, df):
        """
        Method for grouping of the data according to the sensor.

        :param pd.DataFrame df: Data received from the DB.
        :return: list of dicts - dict -> {'sensor_name': string, 'df': pd.DataFrame of the sensor}
        """
        self.logger.debug('Reorganizing ...')
        if self._sensors_order is None:
            return None
        l = [None] * len(self._sensors_order)
        arr_sensors = df[ColumnNamesRaw.SENSOR_NAME].values
        for detector in np.unique(arr_sensors):
            _df = df[arr_sensors == detector]
            l[self._sensors_order.index(detector)] = {'df': _df, 'sensor_name': detector}
        # Take care of the case where a sensor is completely out, to keep its name in the list
        nones = [x is None for x in l]
        for idx, x in enumerate(nones):
            if x:
                l[idx] = {'df': pd.DataFrame(), 'sensor_name': self._sensors_order[idx]}
        return l
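
    # Why _finalize_traveltime() above special-cases len(_arr) == 2: np.hanning(n) is
    # zero at both endpoints, so for n == 2 every weight is zero and np.average would
    # raise ZeroDivisionError. A quick standalone check (plain NumPy):
    #
    #   >>> import numpy as np
    #   >>> np.hanning(2)
    #   array([0., 0.])
    #   >>> np.hanning(5)
    #   array([0. , 0.5, 1. , 0.5, 0. ])
    #   >>> np.average([10., 11., 12., 13., 14.], weights=np.hanning(5))
    #   12.0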

    def _retrieve_partial_interval_data(self, data_identifiers, between_time):
        """
        Method for retrieval of the data from the database.

        :param list data_identifiers: sensors' names
        :param dict between_time: time interval -> {'from': datetime, 'to': datetime}
        :return: pd.DataFrame -> data retrieved from the DB.
        """
        _sensors = tuple(data_identifiers)
        self._sensors_order = _sensors
        query_sensors_data = """
            SELECT detector as sensor_name, datetime_created as calculation_time, lp_full_enc
            FROM unicam_sensors.detectors_detections_tt
            WHERE detector in ({sensors}) AND datetime_created between %s and %s;
        """
        format_string = '%Y-%m-%d %H:%M:%S'
        # Build one '%s' placeholder per sensor for the IN (...) clause, e.g. '%s,%s,%s'
        query_sensors_data = query_sensors_data.format(
            sensors="".join(["%s," for _ in _sensors])[:-1])
        timespan = (between_time['from'].strftime(format_string),
                    between_time['to'].strftime(format_string))
        # Get the detailed data gathered by the sensors each minute
        df_sensor_data = pd.read_sql(sql=query_sensors_data, con=self.mysqlconn_data,
                                     params=_sensors + timespan)
        return df_sensor_data
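

# A hedged end-to-end usage sketch (not part of the original module; sensor names and
# the time window are hypothetical, and whatever kwargs DataEntity.__init__ expects
# would need to be supplied):
#
#   from datetime import datetime
#   entity = DETECTOR_TT()  # kwargs are forwarded to DataEntity.__init__
#   entity.retrieved_data = entity._retrieve_partial_interval_data(
#       data_identifiers=['det_A', 'det_B', 'det_C'],
#       between_time={'from': datetime(2018, 6, 1, 8, 0),
#                     'to': datetime(2018, 6, 1, 9, 0)})
#   per_sensor_dfs, confidence = entity.process_data()  # confidence is currently None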