# Source code for traveltimes_prediction.data_processing.data_entities.data_entity

import abc
import traceback
import logging
import pandas as pd
import numpy as np

from ...support_files.helpers import check_params, merge_inner_lists, array_append
from ...support_files import ColumnNamesRawFiveMin, ColumnNames
from ...configs import sections_settings, sensors


class DataEntity:
    """Base class for retrieval, aggregation and feature engineering of sensor data.

    Concrete subclasses implement the abstract hooks (`_retrieve_partial_interval_data`,
    `_aggregate`, `_reorganize`) and are expected to set `data_type` and
    `features_definitions` after construction.
    """

    def __init__(self, *args, **kwargs):
        self.data_type = None            # key into the `sensors` config; set by subclasses
        self.features_definitions = []   # feature specs consumed by engineer_features()
        self.logger = logging.getLogger('')
        self.retrieved_data = None       # raw DataFrame, filled by get_data()
        self.aggregated_df_list = None   # list of {'df', 'sensor_name'} dicts, filled by process_data()

    ##################################################### RETRIEVAL ########################################################

    @check_params
    def get_data(self, section, list_between_times):
        """
        Function for sequential execution of sql queries for longer timespans.

        :param section: string - which section`s data should be loaded
        :param list_between_times: list of dicts of traveltimes - [{'from':..., 'to':...}, {...}, ...]
        :return: pd.DataFrame - raw data retrieved from DB, or None if nothing was retrieved
        """
        df = None
        used_sensors = sections_settings[section]['input_sensors'] + \
            merge_inner_lists(list(sections_settings[section]['inner_sensors'])) + \
            sections_settings[section]['output_sensors']
        # Keep only the sensors that provide this entity's data type
        _sensors = [x for x in used_sensors if x in sensors[self.data_type]]

        # Iterating through the entire list of time-spans (tuples)
        self.logger.debug("Downloading of chunks-days from DB started ...")
        for i, between_time in enumerate(list_between_times, start=1):
            try:
                _df = self._retrieve_partial_interval_data(data_identifiers=_sensors, between_time=between_time)
                if _df is not None and not _df.empty:
                    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
                    df = _df if df is None else pd.concat([df, _df], ignore_index=True)
            except Exception:
                # Best-effort download: log the failing chunk and continue with the rest.
                # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
                traceback.print_exc()
                self.logger.error("Error while processing data for time span: {} !!".format(between_time))
            self.logger.debug("Processed {:<5} of {:>5} ... ".format(i, len(list_between_times)))
        self.logger.debug("Finished...")
        self.retrieved_data = df
        return df

    @abc.abstractmethod
    def _retrieve_partial_interval_data(self, data_identifiers, between_time):
        """Retrieve raw data for a single time span; implemented by subclasses."""
        pass

    ##################################################### PROCESSING #######################################################
[docs] def process_data(self): """ Method for aggregating of raw data. :return: tuple """ d = [] c = [] if self.retrieved_data is not None and not self.retrieved_data.empty: # split data according the names of sensors to multiple groups taking into account first two identifiers for sensors_name, slice_df in self._reorganize(self.retrieved_data).items(): self.logger.debug('Aggregating slice %s', sensors_name) _df, confidence = self._aggregate(slice_df) # Select data and aggregate d.append({'df': _df, 'sensor_name': sensors_name}) c.append(confidence) self.aggregated_df_list = d self.logger.debug('Aggregation finished ...') else: self.logger.warning('No data retrieved !!! Aggregation skipped !!') return None, 0.0 return d, c
    @abc.abstractmethod
    def _aggregate(self, df):
        # Aggregate one slice of raw data. process_data() unpacks the result as
        # (aggregated_df, confidence) — subclasses must honour that contract.
        pass

    @abc.abstractmethod
    def _reorganize(self, obj):
        # Split retrieved raw data into groups; process_data() iterates the result
        # via .items(), so implementations return a {sensor_slice_name: DataFrame}
        # mapping (grouping key presumably the first two sensor-name identifiers —
        # confirm in subclasses).
        pass

    ################################################# FEATURE ENGINEERING ##################################################

    # @profile
    def engineer_features(self):
        """
        Method for engineering of the features.

        Builds one wide DataFrame by engineering features per sensor slice
        (from `self.aggregated_df_list`) and outer-joining the slices on the
        calc-time column. Returns None when no aggregated data is available.

        :return: pd.DataFrame - DataFrame of features
        """
        df_features = None
        if self.aggregated_df_list is not None:
            try:
                # Engineering from sensors` data
                for df_obj in self.aggregated_df_list:  # For each slice (group of sensors` names)
                    sensor_name = df_obj['sensor_name']
                    self.logger.debug("Feature engineering for sensor_slice %s started ...", sensor_name)
                    # Extract features for given slice
                    _df = self._engineer_features(df_obj['df'], features_for_extraction=self.features_definitions)
                    if _df is not None:
                        # Make features slice-specific by suffixing every column with "-<sensor_name>"
                        _df.columns = ["".join([x, "-", sensor_name]) for x in _df.columns]
                        # Restore the original CALC_TIME column name (it was suffixed above)
                        # so all slices share the same join key. Raises ValueError if the
                        # suffixed CALC_TIME column is missing.
                        c_list = list(_df.columns)
                        i_CALC = c_list.index("".join([ColumnNamesRawFiveMin.CALC_TIME, "-", sensor_name]))
                        c_list[i_CALC] = ColumnNamesRawFiveMin.CALC_TIME
                        _df.columns = c_list
                        # Using join on index instead of merge to avoid duplicate columns
                        df_features = df_features.join(_df.set_index(ColumnNamesRawFiveMin.CALC_TIME), how='outer') \
                            if df_features is not None else _df.set_index(ColumnNamesRawFiveMin.CALC_TIME)
                        self.logger.debug("Features for sensor_slice %s engineered ...", sensor_name)
                    else:
                        self.logger.warning("Features for %s /sensor_slice/ has not been engineered !!!", sensor_name)
                        # Add the invalid values to features .. — append the slice's
                        # expected feature columns as empty (all-NaN) columns so the
                        # output schema stays stable even when engineering failed.
                        _columns = []
                        _data = []
                        for x in self.features_definitions:
                            _columns.extend(x['c_name_feat'])
                            # NOTE(review): _data is built but never used below —
                            # the empty DataFrame carries only the column names.
                            _data.extend([np.nan for _ in x['c_name_feat']])
                        _columns = ["".join([x, "-", sensor_name]) for x in _columns]
                        _invalid_df = pd.DataFrame(columns=_columns)
                        df_features = pd.concat([df_features, _invalid_df], axis=1)
            except Exception as e:
                # Boundary handler: any failure is logged and a partial (or None)
                # feature frame is returned.
                self.logger.exception(e)
        return df_features
# @profile def _engineer_features(self, df, features_for_extraction): """ Method for calculating of the feature data - creation of the final DataFrame, which will be used for predictions. :param df: pd.DataFrame - features :param features_for_extraction: list of features (strings) that should be extracted :return: pandas.DataFrame of features. """ # One indexing per feature. Input -> numpy vectors features_df = None feature_array = columns = None self.logger.debug('Engineering of features started ... ') try: for feat in features_for_extraction: self.logger.debug('Engineering feature |%s| ...', feat['c_name_feat']) if df is None: continue # Due to lack of data for feature creation # If the features have not been processed correctly (and do not contain necessary columns), skip if not set(list(df.columns)).intersection(set(feat['c_name'])): continue # Gather all data necessary for the calculation of current feature data_array = df[feat['c_name']].values # Calculate feature if the transformation is described, copy otherwise if feat['f'] is None: feature_vector = np.array(data_array).astype(np.float32) elif feat['f'] == 'copy': feature_vector = data_array else: feature_vector = list(feat['f'](data_array)) feature_array = array_append(feature_array, feature_vector, stack='h') # Build array of features columns = merge_inner_lists([c['c_name_feat'] for c in features_for_extraction]) # Retrieve columns` names except Exception as e: self.logger.exception(e) try: if feature_array is not None: features_df = pd.DataFrame(feature_array, columns=columns) features_df[ColumnNames.CALC_TIME] = df[ColumnNames.CALC_TIME] self.logger.debug('All features engineered ... ') else: self.logger.error('Data for slice has not been engineered due to invalid values !!') features_df = None except Exception as e: self.logger.exception(e) return features_df