import logging
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import pymysql

from ...configs.db_settings import connection_string_data_bs3
from ...configs.sections_settings import SENSOR_TYPE
from ..feature_engineering import features_to_extract
from ...support_files import ColumnNamesRaw, ColumnNames
from .data_entity import DataEntity
class DETECTOR_TT(DataEntity):
"""
    Class implementing data retrieval and feature creation from the detections' data.
"""
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)  # Forward positional args too, instead of silently dropping them
self.mysqlconn_data = pymysql.connect(**connection_string_data_bs3) # Connection to data source - bck, sensors
self.mysql_datetime_format = '%Y-%m-%d %H:%M:%S' # Datetime format that MySQL understands
self.logger = logging.getLogger('traveltimes_processing_detector_tt')
self.data_type = SENSOR_TYPE.SENSOR_DETECTOR_TT
self.features_definitions = features_to_extract[SENSOR_TYPE.SENSOR_DETECTOR_TT]
self._sensors_order = None
# @profile
    def process_data(self):
"""
        Method for processing the retrieved data so it is ready for feature engineering.
        :return: tuple - list of sensors' dataframes, confidence of the retrieved data.
"""
obj = self._reorganize(self.retrieved_data)
if obj is None:
return None, None
output_list = []
for idx, val in enumerate(obj):
try:
                if idx != len(obj) - 1:
                    l_tmp = self._aggregate((val['df'], obj[idx + 1]['df']))
                elif len(obj) > 2:
                    # Last slice: additionally close the section with the (first, last) pair
                    l_tmp = self._aggregate((obj[0]['df'], obj[-1]['df']))
                else:
                    continue
l_tmp = self._finalize_traveltime(l_tmp)
output_list.append({'df': pd.DataFrame(l_tmp), 'sensor_name': val['sensor_name']})
self.logger.debug('Sensor slice %s has been aggregated ...', val['sensor_name'])
except Exception as e:
self.logger.error('Sensor slice %s has not been aggregated !!', val['sensor_name'])
self.logger.exception(e)
output_list.append({'df': pd.DataFrame(), 'sensor_name': val['sensor_name']})
self.aggregated_df_list = output_list
        return output_list, None  # TODO: add a real confidence measure for the retrieved data
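    # Pairing scheme (illustrative): for slices [s0, s1, s2, s3] the loop above
    # aggregates the consecutive sub-sections (s0, s1), (s1, s2), (s2, s3) and,
    # because len(obj) > 2, finishes with the overall pair (s0, s3); for exactly
    # two slices only (s0, s1) is produced and the final iteration is skipped.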
# @profile
def _aggregate(self, obj):
"""
        Method for aggregation of the data. Calculates traveltime values for each minute by finding matches between detections.
        :param tuple obj: (input dataframe, output dataframe) for the given sub-section (between two slices).
        :return: list of dicts - each dict represents the matches for one minute interval.
"""
self.logger.debug('Aggregating ...')
df_in, df_out = obj
if df_out.empty or df_in.empty:
raise ValueError('Aggregation is not possible due to empty dataframe/s !')
df_in = df_in.sort_values(by=ColumnNamesRaw.LP_FULL_ENC, ascending=True).reset_index(drop=True)
df_out = df_out.sort_values(by=ColumnNamesRaw.CALC_TIME, ascending=False).reset_index(drop=True)
lps = df_in[ColumnNamesRaw.LP_FULL_ENC].values.tolist()
time = df_in[ColumnNamesRaw.CALC_TIME].values.tolist()
ddd = {x: t for x, t in zip(lps, time)} # creation of dict for super-fast 'lp' lookup
epoch_length = 60
l = []
l_tmp = None
epoch_stop_time = None
for item in df_out.values.tolist():
if (epoch_stop_time is None) or (item[1] <= epoch_stop_time):
if l_tmp is not None:
l.append(l_tmp)
epoch_stop_time = item[1] - timedelta(seconds=epoch_length)
_dt = datetime.utcfromtimestamp(int(item[1].value/1e9))
l_tmp = {ColumnNames.CALC_TIME: datetime(_dt.year, _dt.month, _dt.day, _dt.hour, _dt.minute),
'matches': [], ColumnNamesRaw.TT_LP_UNMATCHED: 0}
# Dictionary optimization - O(1) search&deletion
try:
matched_time = ddd.pop(item[2])
tt = (item[1].value - matched_time) / 1e9
l_tmp['matches'].append(tt)
except KeyError:
l_tmp[ColumnNamesRaw.TT_LP_UNMATCHED] += 1
            # A previous O(log n) variant (bisecting the sorted `lps` list and
            # deleting matched entries) was superseded by the O(1) dict lookup above.
        if l_tmp is not None:
            l.append(l_tmp)  # Flush the last (oldest) epoch, otherwise it would be silently dropped
        return l
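    # Illustrative trace of the epoch bucketing above (times are made up):
    # with epoch_length = 60 and df_out sorted newest-first, an exit detection
    # at 08:05:30 opens an epoch reaching back to 08:04:30; every following
    # (older) detection later than 08:04:30 lands in the same bucket, and the
    # first one at or before 08:04:30 flushes the bucket and opens the next one.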
# @profile
def _finalize_traveltime(self, obj):
"""
        Method for finalizing the traveltime value - calculating one representative descriptor per minute.
        :param list obj: list of dictionaries; each dictionary contains the traveltimes of all cars that passed the sub-section in a given minute.
        :return: list of dicts - each dict contains the descriptors of one minute; as many dicts as minutes in the data.
"""
        def average(arr):
            _arr = arr['matches']
            if len(_arr) > 2:
                # Hanning-weighted mean: the endpoints get zero weight, the centre dominates
                return np.average(_arr, weights=np.hanning(len(_arr)))
            elif len(_arr) == 2:
                # np.hanning(2) is all zeros, which would make the weights degenerate
                return np.mean(_arr)
            return np.nan
        self.logger.debug('Finalizing instantaneous traveltime value ...')
        res = list(map(average, obj))
for x, _tt in zip(obj, res):
x[ColumnNamesRaw.TT_LP_MATCH] = _tt
x.pop('matches')
return obj
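    # Worked example (illustrative values): for matches [30.0, 34.0, 31.0],
    # np.hanning(3) == [0., 1., 0.], so the descriptor is 34.0 - with few
    # samples the window keeps only the central value; np.hanning(5) ==
    # [0., 0.5, 1., 0.5, 0.] discards the two extremes and centre-weights the rest.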
# @profile
def _reorganize(self, df):
"""
        Method for grouping the data by sensor.
:param pd.DataFrame df: Data received from the DB.
:return: list of dicts - dict -> {'sensor_name': string, 'df': pd.Dataframe - dataframe of the sensor}
"""
self.logger.debug('Reorganizing ...')
if self._sensors_order is None:
return None
l = [None] * len(self._sensors_order)
arr_sensors = df[ColumnNamesRaw.SENSOR_NAME].values
for detector in np.unique(arr_sensors):
_df = df[arr_sensors == detector]
l[self._sensors_order.index(detector)] = {'df': _df, 'sensor_name': detector}
        # Handle the case where a sensor is completely missing, so that its name is kept in the list
nones = [x is None for x in l]
for idx, x in enumerate(nones):
if x:
l[idx] = {'df': pd.DataFrame(), 'sensor_name': self._sensors_order[idx]}
return l
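    # Example output (illustrative; 'det_A'/'det_B' are hypothetical names): with
    # self._sensors_order = ('det_A', 'det_B') and rows only for 'det_B', the
    # result is [{'df': <empty DataFrame>, 'sensor_name': 'det_A'},
    #            {'df': <det_B rows>, 'sensor_name': 'det_B'}], so the positional
    # pairing of consecutive slices in process_data() stays valid.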
def _retrieve_partial_interval_data(self, data_identifiers, between_time):
"""
        Method for retrieving the data from the database.
        :param list data_identifiers: sensors' names
:param dict between_time: time interval -> {'from': datetime, 'to': datetime}
:return: pd.DataFrame -> data retrieved from the DB.
"""
_sensors = tuple(data_identifiers)
self._sensors_order = _sensors
query_sensors_data = """
SELECT detector as sensor_name, datetime_created as calculation_time, lp_full_enc
FROM unicam_sensors.detectors_detections_tt
WHERE detector in ({sensors})
AND datetime_created between %s and %s;
"""
        format_string = self.mysql_datetime_format
        # Get detailed data gathered by sensors each minute
        query_sensors_data = query_sensors_data \
            .format(sensors=", ".join(["%s"] * len(_sensors)))  # One placeholder per sensor name
timespan = (between_time['from'].strftime(format_string),
between_time['to'].strftime(format_string))
df_sensor_data = pd.read_sql(sql=query_sensors_data, con=self.mysqlconn_data, params=_sensors + timespan)
return df_sensor_data
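
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not executed; sensor names and times are
# hypothetical, and DataEntity is assumed to be constructible without extra
# arguments):
#
#     entity = DETECTOR_TT()
#     entity.retrieved_data = entity._retrieve_partial_interval_data(
#         data_identifiers=['det_A', 'det_B'],
#         between_time={'from': datetime(2020, 1, 1, 8, 0),
#                       'to': datetime(2020, 1, 1, 9, 0)})
#     sensor_dfs, confidence = entity.process_data()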