from ...support_files import ColumnNames, ColumnNamesRawFiveMin
from ...support_files.helpers import impute
from ...configs.sections_settings import SENSOR_TYPE
from ...configs.db_settings import connection_string_data_bs3
from .data_entity import DataEntity
from ..feature_engineering import features_to_extract
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import logging
import pymysql
class RDET(DataEntity):
"""
Class providing interface to RDET data - access to DB & engineering of features.
"""
def __init__(self, *args, **kwargs):
super().__init__(**kwargs)
self.mysqlconn_data = pymysql.connect(**connection_string_data_bs3) # Connection to data source - bck, sensors
self.mysql_datetime_format = '%Y-%m-%d %H:%M:%S' # Datetime format that MySQL understands
self.logger = logging.getLogger('traveltimes_processing_rdet')
self.data_type = SENSOR_TYPE.SENSOR_RDET
self.features_definitions = features_to_extract[SENSOR_TYPE.SENSOR_RDET]
def _retrieve_partial_interval_data(self, data_identifiers, between_time):
"""
Function for retrieval of detailed data for specified slice-sensors.
:param data_identifiers - sensors` names
:param between_time: dict {'from': datetime, 'to': datetime}
:return: pd.DataFrame of the sensors` data
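
        Example call (``rdet`` is an instance of this class; the sensor names
        and times are hypothetical, for illustration only)::

            between = {'from': datetime(2020, 1, 1, 8, 0),
                       'to': datetime(2020, 1, 1, 9, 0)}
            df = rdet._retrieve_partial_interval_data(['A1-02-lane1', 'A1-02-lane2'], between)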
"""
_sensors = tuple(data_identifiers)
query_sensors_data = """
SELECT rdet as sensor_name, create_time as calculation_time,
velocity_avg, occupancy, detection_count
FROM unicam_sensors.rdets_fivemin_data
WHERE rdet in ({sensors})
AND create_time between %s and %s;
"""
        format_string = self.mysql_datetime_format  # Reuse the MySQL datetime format defined in __init__
        # Get the detailed five-minute data gathered by the sensors
        # Build one "%s" placeholder per sensor for the IN clause
        query_sensors_data = query_sensors_data.format(sensors=", ".join(["%s"] * len(_sensors)))
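        # e.g. three sensors expand the clause to "rdet in (%s, %s, %s)";
        # pymysql then substitutes and escapes each sensor name at execution time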
timespan = (between_time['from'].strftime(format_string),
between_time['to'].strftime(format_string))
df_sensor_data = pd.read_sql(sql=query_sensors_data, con=self.mysqlconn_data, params=_sensors + timespan)
        # Drop the seconds and round each timestamp to the nearest whole minute
        df_sensor_data[ColumnNamesRawFiveMin.CALC_TIME] = df_sensor_data[ColumnNamesRawFiveMin.CALC_TIME] \
            .apply(lambda x: datetime(year=x.year, month=x.month, day=x.day, hour=x.hour, minute=x.minute) +
                   (timedelta(minutes=1) if x.second >= 30 else timedelta(minutes=0)))
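        # e.g. 12:34:29 -> 12:34:00 and 12:34:30 -> 12:35:00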
return df_sensor_data
# @profile
def _aggregate(self, df):
"""
        Aggregate the information from sensors on multiple lanes of the road into a single road descriptor.

        :param df: pandas.DataFrame
        :return: tuple - (aggregated pandas.DataFrame of the sensors' data, or None; confidence of the data)
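
        Toy illustration of the per-timestamp aggregation rule, using the raw
        five-minute column names (velocity is averaged across lanes; occupancy
        and detection count are summed)::

            >>> import pandas as pd
            >>> lanes = pd.DataFrame({'velocity_avg': [80.0, 100.0],
            ...                       'occupancy': [0.25, 0.25],
            ...                       'detection_count': [5, 7]})
            >>> float(lanes['velocity_avg'].mean())
            90.0
            >>> float(lanes['occupancy'].sum())
            0.5
            >>> int(lanes['detection_count'].sum())
            12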
"""
aggregated_data = []
aggregated_columns = []
        arr = df.values  # Work on the underlying numpy array instead of pandas (much faster)
columns = list(df.columns)
# Retrieving of the indices of columns
i_CALC_TIME = columns.index(ColumnNamesRawFiveMin.CALC_TIME)
i_VELOCITY = columns.index(ColumnNamesRawFiveMin.VELOCITY)
i_OCCUPANCY = columns.index(ColumnNamesRawFiveMin.OCCUPANCY)
i_COUNT = columns.index(ColumnNamesRawFiveMin.COUNT)
        # The dataframe contains multiple rows per timestamp (one per lane/sensor)
        # Map each timestamp to an integer index - searching integers is much faster than searching strings
unique_timestamps, indices = np.unique(arr[:, i_CALC_TIME], return_inverse=True)
unique_indices = np.unique(indices).tolist()
        # Impute -1 values and compute the 'confidence' of the data - the percentage of values imputed
        value_columns = [i_OCCUPANCY, i_VELOCITY, i_COUNT]
        imputed_arr, confidence_data = impute(array=arr, columns=value_columns, invalid_val=-1)
        if imputed_arr is not None:
            arr[:, value_columns] = imputed_arr
first_run = True
for index in unique_indices:
            # Select only the rows belonging to this timestamp
time_query = indices == index
sub_arr = arr[time_query]
            if sub_arr.size == 0:  # Skip timestamps with no matching records
continue
            if first_run:  # On the first run, prepare the columns for the dataframe
aggregated_columns.extend([ColumnNames.CALC_TIME,
ColumnNames.AVG_VELOCITY,
ColumnNames.TOTAL_OCCUPANCY,
ColumnNames.NORM_COUNT])
first_run = False
# Aggregate the information from lanes to single road descriptor
aggregated_velocity = np.mean(sub_arr[:, i_VELOCITY])
aggregated_occupancy = np.sum(sub_arr[:, i_OCCUPANCY])
aggregated_count = np.sum(sub_arr[:, i_COUNT])
            # Guard against NaN aggregates; fall back to 0
            record = [unique_timestamps[index],
                      0 if np.isnan(aggregated_velocity) else aggregated_velocity,
                      0 if np.isnan(aggregated_occupancy) else aggregated_occupancy,
                      0 if np.isnan(aggregated_count) else aggregated_count]
aggregated_data.append(record)
else:
            self.logger.warning('Invalid data has been received and cannot be imputed due to a lack of valid data!')
# Creation of pd.DataFrame
if aggregated_data:
aggregated_data = np.array(aggregated_data)
aggregated_data = pd.DataFrame(aggregated_data, columns=aggregated_columns)
aggregated_data = aggregated_data.apply(lambda x: pd.to_numeric(x, errors='ignore'))
            aggregated_data[ColumnNamesRawFiveMin.CALC_TIME] = \
                pd.to_datetime(aggregated_data[ColumnNamesRawFiveMin.CALC_TIME])
aggregated_data.sort_values(by=[ColumnNamesRawFiveMin.CALC_TIME], ascending=[True], inplace=True)
else:
aggregated_data = None
return aggregated_data, confidence_data
# @profile
def _reorganize(self, obj):
"""
        Split the data for one sensor type into multiple groups according to the sensors' names.

        :param obj: pd.DataFrame - DataFrame of one group (type of sensors)
        :return: dict of pd.DataFrame - one per slice (sensors sharing the first two dash-separated parts of their names)
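
        Toy illustration of the grouping key (hypothetical sensor names)::

            >>> names = ['A1-02-lane1', 'A1-02-lane2', 'A1-03-lane1']
            >>> sorted({''.join(n.split('-')[:2]) for n in names})
            ['A102', 'A103']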
"""
sensor_names = obj[ColumnNamesRawFiveMin.SENSOR_NAME].values
        unique_names = list(np.unique(sensor_names))
identifiers = list(map(lambda x: tuple(x.split('-')[:2]), unique_names))
unique_identifiers = set(identifiers)
groups_dfs = dict()
        for identifier in unique_identifiers:
            member_names = [unique_names[idx] for idx, parts in enumerate(identifiers) if parts == identifier]
            s_name = "".join(identifier)
            groups_dfs[s_name] = obj[obj[ColumnNamesRawFiveMin.SENSOR_NAME].isin(member_names)] \
                .reset_index(drop=True)
return groups_dfs