Source code for traveltimes_prediction.data_processing.data_entities.det2

from ...support_files.helpers import impute
from ...support_files import ColumnNamesRawFiveMin
from ...configs.sections_settings import SENSOR_TYPE
from ...configs.db_settings import connection_string_data_det1
from .data_entity import DataEntity
from ..feature_engineering import features_to_extract
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import logging

import pymysql


class DET2(DataEntity):
    """ Class providing an interface to DET2 data - access to the DB & engineering of features. """

    def __init__(self, *args, **kwargs):
        super().__init__(**kwargs)
        self.mysqlconn_data = pymysql.connect(**connection_string_data_det1)  # Connection to the data source
        self.mysql_datetime_format = '%Y-%m-%d %H:%M:%S'  # Datetime format that MySQL understands
        self.logger = logging.getLogger('traveltimes_processing_detector')
        self.data_type = SENSOR_TYPE.SENSOR_DET2
        self.features_definitions = features_to_extract[SENSOR_TYPE.SENSOR_DET2]

    def _retrieve_partial_interval_data(self, data_identifiers, between_time):
        """
        Retrieve the detailed data for the specified slice-sensors.

        :param data_identifiers: sensors' names
        :param between_time: dict {'from': datetime, 'to': datetime}
        :return: pd.DataFrame of the sensors' data
        """
        _sensors = tuple(data_identifiers)
        query_sensors_data = """
            SELECT
                detector as sensor_name,
                lp_count,
                create_time as calculation_time
            FROM unicam_sensors.detectors_fivemin_data
            WHERE detector in ({sensors}) and create_time between %s and %s;
        """
        format_string = '%Y-%m-%d %H:%M:%S'

        # Get the detailed data gathered by the sensors each minute;
        # insert one placeholder per requested sensor into the query
        query_sensors_data = query_sensors_data.format(sensors=",".join("%s" for _ in _sensors))
        timespan = (between_time['from'].strftime(format_string),
                    between_time['to'].strftime(format_string))
        df_sensor_data = pd.read_sql(sql=query_sensors_data, con=self.mysqlconn_data,
                                     params=_sensors + timespan)

        # Cut off the seconds and round to the nearest minute
        df_sensor_data[ColumnNamesRawFiveMin.CALC_TIME] = df_sensor_data[ColumnNamesRawFiveMin.CALC_TIME] \
            .apply(lambda x: datetime(year=x.year, month=x.month, day=x.day, hour=x.hour, minute=x.minute)
                   + (timedelta(minutes=1) if round(x.second / 60) > 0 else timedelta(minutes=0)))

        return df_sensor_data

    # @profile
    def _aggregate(self, df):
        """
        Aggregate the information from the sensors on multiple lanes of the road into one descriptor.

        :param df: pandas.DataFrame - sensor_df_in or sensor_df_out
        :return: pandas.DataFrame - aggregated DataFrame of the sensor's data
        """
        aggregated_data = []
        aggregated_columns = []
        arr = df.values  # Work with a np.array instead of pandas (it is much faster)
        columns = list(df.columns)

        # Retrieve the indices of the columns
        i_CALC_TIME = columns.index(ColumnNamesRawFiveMin.CALC_TIME)
        i_LP_COUNT = columns.index(ColumnNamesRawFiveMin.LP_COUNT)

        # The dataframe contains multiple occurrences of the same time (one per lane - sensor).
        # Replace the string dates by integers - searching an array of integers is much faster
        # than searching an array of strings.
        unique_timestamps, indices = np.unique(arr[:, i_CALC_TIME], return_inverse=True)
        unique_indices = np.unique(indices).tolist()

        # Impute -1 values and calculate the 'confidence' of the data - the percentage of imputed data
        columns = [i_LP_COUNT]
        imputed_arr, confidence_data = impute(array=arr, columns=columns, invalid_val=-1)

        if imputed_arr is not None:
            arr[:, columns] = imputed_arr

            first_run = True
            for index in unique_indices:
                # Select only the rows belonging to the current timestamp
                time_query = indices == index
                sub_arr = arr[time_query]

                if sub_arr.size == 0:
                    # No records fulfil the search criteria
                    continue

                if first_run:
                    # During the first run prepare the columns for the dataframe
                    aggregated_columns.extend([ColumnNamesRawFiveMin.CALC_TIME,
                                               ColumnNamesRawFiveMin.LP_COUNT])
                    first_run = False

                # Aggregate the information from the lanes into a single road descriptor
                aggregated = np.sum(sub_arr[:, columns], axis=0)
                record = [unique_timestamps[index]] + aggregated.tolist()
                aggregated_data.append(record)
        else:
            self.logger.warning('Invalid data has been received and cannot be imputed due to a lack of valid data!')

        # Creation of the pd.DataFrame
        if aggregated_data:
            aggregated_data = np.array(aggregated_data)
            aggregated_data = pd.DataFrame(aggregated_data, columns=aggregated_columns)
            aggregated_data = aggregated_data.apply(lambda x: pd.to_numeric(x, errors='ignore'))
            aggregated_data[ColumnNamesRawFiveMin.CALC_TIME] = \
                aggregated_data[ColumnNamesRawFiveMin.CALC_TIME].apply(pd.to_datetime)
            aggregated_data.sort_values(by=[ColumnNamesRawFiveMin.CALC_TIME], ascending=[True], inplace=True)
        else:
            aggregated_data = None

        return aggregated_data, confidence_data

    # @profile
    def _reorganize(self, obj):
        """
        Split the data for one type of sensors into multiple groups according to their names.

        :param obj: pd.DataFrame - DataFrame of one group (type of sensors)
        :return: dict of pd.DataFrames - one per slice (sensors sharing the first two name identifiers)
        """
        sensor_names = obj[ColumnNamesRawFiveMin.SENSOR_NAME].values
        unique_names, indices = np.unique(sensor_names, return_inverse=True)
        unique_names = list(unique_names)

        # A slice is identified by the first two dash-separated parts of a sensor's name
        identifiers = list(map(lambda x: tuple(x.split('-')[:2]), unique_names))
        unique_identifiers = set(identifiers)

        groups_dfs = dict()
        for x in unique_identifiers:
            group = [idx for idx, xx in enumerate(identifiers) if xx == x]
            s_name = "".join(x)
            mask = obj[ColumnNamesRawFiveMin.SENSOR_NAME].isin([unique_names[i] for i in group])
            groups_dfs[s_name] = obj[mask].reset_index(drop=True)

        return groups_dfs
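
# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of the minute-rounding rule applied in
# _retrieve_partial_interval_data above. Because Python 3 rounds 0.5 to the
# nearest even integer, a reading at exactly 30 s rounds *down*; only 31 s
# and above round up. The function name is illustrative, not part of the
# module.
# ---------------------------------------------------------------------------
from datetime import datetime, timedelta

def round_to_minute(x):
    # Drop the seconds, then add one minute when they round up
    base = datetime(year=x.year, month=x.month, day=x.day, hour=x.hour, minute=x.minute)
    return base + (timedelta(minutes=1) if round(x.second / 60) > 0 else timedelta())

print(round_to_minute(datetime(2020, 1, 1, 8, 4, 29)))  # 2020-01-01 08:04:00
print(round_to_minute(datetime(2020, 1, 1, 8, 4, 30)))  # 2020-01-01 08:04:00 (round(0.5) == 0)
print(round_to_minute(datetime(2020, 1, 1, 8, 4, 31)))  # 2020-01-01 08:05:00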
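
# ---------------------------------------------------------------------------
# A self-contained sketch of the per-timestamp aggregation pattern used in
# _aggregate: np.unique(..., return_inverse=True) maps every row back to its
# unique timestamp, so all lanes at one time are selected with a single
# boolean mask and summed into one road descriptor. Column names and values
# are synthetic; the real method also imputes -1 sentinels via impute() and
# reports the imputed fraction as a confidence measure, which is omitted here.
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd

df_lanes = pd.DataFrame({
    'calculation_time': ['2020-01-01 08:00:00', '2020-01-01 08:00:00',
                         '2020-01-01 08:05:00', '2020-01-01 08:05:00'],
    'lp_count': [10, 12, 8, 9],  # two lanes reporting per timestamp
})

arr = df_lanes.values
i_time = list(df_lanes.columns).index('calculation_time')
i_count = list(df_lanes.columns).index('lp_count')

unique_timestamps, inverse = np.unique(arr[:, i_time], return_inverse=True)

records = []
for idx in np.unique(inverse):
    sub = arr[inverse == idx]        # all lanes for this timestamp
    total = np.sum(sub[:, i_count])  # aggregate the lanes into one value
    records.append((unique_timestamps[idx], total))

print(pd.DataFrame(records, columns=['calculation_time', 'lp_count']))
# Expected: 22 vehicles at 08:00 and 17 at 08:05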
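
# ---------------------------------------------------------------------------
# A sketch of the slice grouping performed by _reorganize: sensors that share
# the first two dash-separated parts of their name belong to the same slice.
# The sensor names below are hypothetical, and pandas groupby is used here
# instead of the numpy index bookkeeping of the original method.
# ---------------------------------------------------------------------------
import pandas as pd

df_sensors = pd.DataFrame({
    'sensor_name': ['A-1-left', 'A-1-right', 'A-2-left'],
    'lp_count': [10, 12, 7],
})

slice_key = df_sensors['sensor_name'].map(lambda n: "".join(n.split('-')[:2]))
groups_dfs = {key: group.reset_index(drop=True)
              for key, group in df_sensors.groupby(slice_key)}

print(sorted(groups_dfs))     # ['A1', 'A2']
print(len(groups_dfs['A1']))  # 2 - both lanes of slice A-1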