Source code for traveltimes_prediction.data_processing.data_entities.ref_tt

from ...support_files.helpers import check_params
from ...configs.db_settings import connection_string_tt_real
from .data_entity import DataEntity
from ..feature_engineering import features_to_extract_bck_tt
import pandas as pd
import logging
import traceback

import pymysql


[docs]class REF_TT(DataEntity): """ Class providing access to DB and feature generation from referential data - ground truth. """ def __init__(self, *args, **kwargs): super().__init__(**kwargs) self.mysqlconn_real_tt = pymysql.connect(**connection_string_tt_real) # Connection to data source - bck, sensors self.mysql_datetime_format = '%Y-%m-%d %H:%M:%S' # Datetime format that MySQL understands self.logger = logging.getLogger('traveltimes_processing_ref_tt') self.data_type = None self.features_definitions = features_to_extract_bck_tt @check_params def get_data(self, section, list_between_times): """ Function for sequential execution of sql queries for longer timespans. :param section: string - which section`s data should be loaded :param list_between_times: list of dicts of traveltimes - [{'from':..., 'to':...}, {...}, ...] :return: pd.DataFrame """ df = None # Iterating through the entire list of time-spans (tuples) self.logger.debug("Downloading of chunks-days from DB started ...") i = 1 for between_time in list_between_times: try: _df = self._retrieve_partial_interval_data(section=section, between_time=between_time) if df is None and _df is not None and not _df.empty: df = _df elif (df is not None) and (_df is not None) and (not _df.empty): df = df.append(_df, ignore_index=True) except: traceback.print_exc() self.logger.error("Error while processing data for time span: {} !!".format(between_time)) self.logger.debug("Processed {:<5} of {:>5} ... ".format(i, len(list_between_times))) i += 1 self.logger.debug("Finished...") self.retrieved_data = df return df
[docs] def process_data(self): """ Method for processing of the data - aggregation. :return: tuple """ # TODO some imputing ? + modification of the query self.aggregated_df_list = [{'sensor_name': 'REF_TT', 'df': self.retrieved_data}] return self.aggregated_df_list, None
@check_params def _retrieve_partial_interval_data(self, section, between_time): """ Function for retrieving the traveltime data for given session and time-span from DB. Retrieves ground truth. :param section: string - which section`s data should be loaded :param between_time: dict - {'from': datetime, 'to': datetime} :return: pandas.DataFrame - referential """ ref_traveltimes_query = """ SELECT calculation_time, tt_real FROM tt4.tt_real_traveltimes where section_id=%s and (calculation_time between %s and %s) and tt_real is not null and tt_real > 0 order by calculation_time asc; """ df_referential = None try: section_name = self._get_real_tt_section_name(connection=self.mysqlconn_real_tt, section=section) df_referential = pd.read_sql(sql=ref_traveltimes_query, params=(section_name, between_time['from'].strftime(self.mysql_datetime_format), between_time['to'].strftime(self.mysql_datetime_format)), con=self.mysqlconn_real_tt) # df_referential[df_referential <= 0] = np.nan # df_referential = df_referential.interpolate().fillna(method='bfill').fillna(method='ffill') except: traceback.print_exc() return df_referential def _get_real_tt_section_name(self, section, connection): """ Private method for looking up the valid identifier of the section used in real tt data. :param section: string - section identifier, e.g. 'KOCE-LNCE' :param connection: object - database connection, instance of opened PyMySQL :return: the identifier if exists. """ cursor = connection.cursor() cursor.execute(""" SELECT section_id FROM tt_real_traveltimes WHERE section_id LIKE %s LIMIT 1 """, ('%' + section[:9] + '%',)) backwards_section = cursor.fetchone() cursor.close() return backwards_section[0] if backwards_section is not None else None def _aggregate(self, df): pass def _reorganize(self, obj): pass