import abc
import traceback
import logging
import pandas as pd
import numpy as np
from ...support_files.helpers import check_params, merge_inner_lists, array_append
from ...support_files import ColumnNamesRawFiveMin, ColumnNames
from ...configs import sections_settings, sensors
class DataEntity:
    """
    Abstract base for one type of sensor data: retrieval from the DB,
    aggregation per sensor slice and feature engineering.

    Subclasses implement ``_retrieve_partial_interval_data``, ``_aggregate``
    and ``_reorganize``.
    """

    def __init__(self, *args, **kwargs):
        # Key into the ``sensors`` config; concrete subclasses are expected
        # to set this to their data type identifier.
        self.data_type = None
        # Feature definition dicts consumed by _engineer_features
        # ({'c_name': ..., 'c_name_feat': ..., 'f': ...} — see usage below).
        self.features_definitions = []
        # NOTE(review): '' selects the root logger — presumably intentional
        # so handlers configured by the application apply; confirm.
        self.logger = logging.getLogger('')
        # Raw DataFrame filled by get_data().
        self.retrieved_data = None
        # List of {'df': ..., 'sensor_name': ...} filled by process_data().
        self.aggregated_df_list = None
##################################################### RETRIEVAL ########################################################
@check_params
def get_data(self, section, list_between_times):
    """
    Sequentially execute the retrieval queries for longer timespans and
    concatenate the partial results.

    :param section: string - which section`s data should be loaded
    :param list_between_times: list of dicts of traveltimes -
        [{'from': ..., 'to': ...}, {...}, ...]
    :return: pd.DataFrame - raw data retrieved from DB, or None if nothing
        could be retrieved
    """
    # Sensors relevant for this section, filtered down to this entity's data type.
    used_sensors = sections_settings[section]['input_sensors'] + \
        merge_inner_lists(list(sections_settings[section]['inner_sensors'])) + \
        sections_settings[section]['output_sensors']
    _sensors = [x for x in used_sensors if x in sensors[self.data_type]]
    # Iterating through the entire list of time-spans (tuples)
    self.logger.debug("Downloading of chunks-days from DB started ...")
    frames = []
    total = len(list_between_times)
    for i, between_time in enumerate(list_between_times, start=1):
        try:
            _df = self._retrieve_partial_interval_data(data_identifiers=_sensors, between_time=between_time)
            if _df is not None and not _df.empty:
                frames.append(_df)
        except Exception:
            # Best-effort: log (with traceback) and continue with the remaining spans.
            self.logger.exception("Error while processing data for time span: {} !!".format(between_time))
        self.logger.debug("Processed {:<5} of {:>5} ... ".format(i, total))
    self.logger.debug("Finished...")
    # Single concat instead of repeated DataFrame.append (quadratic, and
    # removed in pandas 2.x).
    df = pd.concat(frames, ignore_index=True) if frames else None
    self.retrieved_data = df
    return df
@abc.abstractmethod
def _retrieve_partial_interval_data(self, data_identifiers, between_time):
    """
    Retrieve raw data for a single time span; implemented by subclasses.

    :param data_identifiers: list of sensor identifiers to query
    :param between_time: dict describing the span - {'from': ..., 'to': ...}
    :return: pd.DataFrame of raw data, or None (presumed from get_data's
        handling of the result - confirm against concrete subclasses)
    """
    pass
##################################################### PROCESSING #######################################################
def process_data(self):
    """
    Aggregate the previously retrieved raw data, one slice per sensor group.

    :return: tuple - (list of {'df': ..., 'sensor_name': ...} dicts,
        list of confidence values); (None, 0.0) when no raw data is
        available.  NOTE(review): the two shapes differ - callers must
        handle both.
    """
    d = []
    c = []
    if self.retrieved_data is not None and not self.retrieved_data.empty:
        # Split data according to the sensors' names into multiple groups,
        # taking into account the first two identifiers.
        for sensors_name, slice_df in self._reorganize(self.retrieved_data).items():
            self.logger.debug('Aggregating slice %s', sensors_name)
            _df, confidence = self._aggregate(slice_df)  # Select data and aggregate
            d.append({'df': _df, 'sensor_name': sensors_name})
            c.append(confidence)
        self.aggregated_df_list = d
        self.logger.debug('Aggregation finished ...')
    else:
        self.logger.warning('No data retrieved !!! Aggregation skipped !!')
        return None, 0.0
    return d, c
@abc.abstractmethod
def _aggregate(self, df):
    """
    Aggregate one raw-data slice; implemented by subclasses.

    :param df: pd.DataFrame - raw data for a single sensor slice
    :return: tuple of (aggregated pd.DataFrame, confidence) - presumed
        from process_data's unpacking of the result
    """
    pass
@abc.abstractmethod
def _reorganize(self, obj):
    """
    Split retrieved raw data into per-sensor-group slices; implemented by
    subclasses.

    :param obj: the retrieved raw data (pd.DataFrame in get_data's usage)
    :return: mapping of sensor-group name -> slice DataFrame - presumed
        from process_data calling ``.items()`` on the result
    """
    pass
################################################# FEATURE ENGINEERING ##################################################
# @profile
def engineer_features(self):
    """
    Build the feature DataFrame from the aggregated per-sensor slices.

    Column names are suffixed with the slice's sensor name (except the
    shared calc-time column used for joining).

    :return: pd.DataFrame of features joined on calculation time, or None
        when no slice produced features
    """
    df_features = None
    if self.aggregated_df_list is not None:
        try:  # Engineering from sensors` data
            for df_obj in self.aggregated_df_list:  # For each slice (group of sensors` names)
                sensor_name = df_obj['sensor_name']
                self.logger.debug("Feature engineering for sensor_slice %s started ...", sensor_name)
                # Extract features for given slice
                _df = self._engineer_features(df_obj['df'], features_for_extraction=self.features_definitions)
                if _df is not None:
                    # Make feature columns slice-specific, but restore the
                    # calc-time column's shared name so slices can be joined.
                    _df.columns = ["".join([x, "-", sensor_name]) for x in _df.columns]
                    c_list = list(_df.columns)
                    i_calc = c_list.index("".join([ColumnNamesRawFiveMin.CALC_TIME, "-", sensor_name]))
                    c_list[i_calc] = ColumnNamesRawFiveMin.CALC_TIME
                    _df.columns = c_list
                    # Using join on index instead of merge to avoid duplicate columns
                    df_features = df_features.join(_df.set_index(ColumnNamesRawFiveMin.CALC_TIME),
                                                   how='outer') \
                        if df_features is not None else _df.set_index(ColumnNamesRawFiveMin.CALC_TIME)
                    self.logger.debug("Features for sensor_slice %s engineered ...", sensor_name)
                else:
                    self.logger.warning("Features for %s /sensor_slice/ has not been engineered !!!", sensor_name)
                    # Still add the slice's feature columns (empty) so the
                    # resulting frame keeps a stable column set.
                    _columns = []
                    for x in self.features_definitions:
                        _columns.extend(x['c_name_feat'])
                    _columns = ["".join([x, "-", sensor_name]) for x in _columns]
                    _invalid_df = pd.DataFrame(columns=_columns)
                    df_features = pd.concat([df_features, _invalid_df], axis=1)
        except Exception as e:
            # Best-effort: return whatever was engineered before the failure.
            self.logger.exception(e)
    return df_features
# @profile
def _engineer_features(self, df, features_for_extraction):
"""
Method for calculating of the feature data - creation of the final DataFrame, which will be used for predictions.
:param df: pd.DataFrame - features
:param features_for_extraction: list of features (strings) that should be extracted
:return: pandas.DataFrame of features.
"""
# One indexing per feature. Input -> numpy vectors
features_df = None
feature_array = columns = None
self.logger.debug('Engineering of features started ... ')
try:
for feat in features_for_extraction:
self.logger.debug('Engineering feature |%s| ...', feat['c_name_feat'])
if df is None:
continue # Due to lack of data for feature creation
# If the features have not been processed correctly (and do not contain necessary columns), skip
if not set(list(df.columns)).intersection(set(feat['c_name'])):
continue
# Gather all data necessary for the calculation of current feature
data_array = df[feat['c_name']].values
# Calculate feature if the transformation is described, copy otherwise
if feat['f'] is None:
feature_vector = np.array(data_array).astype(np.float32)
elif feat['f'] == 'copy':
feature_vector = data_array
else:
feature_vector = list(feat['f'](data_array))
feature_array = array_append(feature_array, feature_vector, stack='h') # Build array of features
columns = merge_inner_lists([c['c_name_feat'] for c in features_for_extraction]) # Retrieve columns` names
except Exception as e:
self.logger.exception(e)
try:
if feature_array is not None:
features_df = pd.DataFrame(feature_array, columns=columns)
features_df[ColumnNames.CALC_TIME] = df[ColumnNames.CALC_TIME]
self.logger.debug('All features engineered ... ')
else:
self.logger.error('Data for slice has not been engineered due to invalid values !!')
features_df = None
except Exception as e:
self.logger.exception(e)
return features_df