Source code for traveltimes_prediction.data_processing.data_entities.ref_tt
from ...support_files.helpers import check_params
from ...configs.db_settings import connection_string_tt_real
from .data_entity import DataEntity
from ..feature_engineering import features_to_extract_bck_tt
import pandas as pd
import logging
import traceback
import pymysql
[docs]class REF_TT(DataEntity):
"""
Class providing access to DB and feature generation from referential data - ground truth.
"""
def __init__(self, *args, **kwargs):
super().__init__(**kwargs)
self.mysqlconn_real_tt = pymysql.connect(**connection_string_tt_real) # Connection to data source - bck, sensors
self.mysql_datetime_format = '%Y-%m-%d %H:%M:%S' # Datetime format that MySQL understands
self.logger = logging.getLogger('traveltimes_processing_ref_tt')
self.data_type = None
self.features_definitions = features_to_extract_bck_tt
@check_params
def get_data(self, section, list_between_times):
"""
Function for sequential execution of sql queries for longer timespans.
:param section: string - which section`s data should be loaded
:param list_between_times: list of dicts of traveltimes - [{'from':..., 'to':...}, {...}, ...]
:return: pd.DataFrame
"""
df = None
# Iterating through the entire list of time-spans (tuples)
self.logger.debug("Downloading of chunks-days from DB started ...")
i = 1
for between_time in list_between_times:
try:
_df = self._retrieve_partial_interval_data(section=section, between_time=between_time)
if df is None and _df is not None and not _df.empty:
df = _df
elif (df is not None) and (_df is not None) and (not _df.empty):
df = df.append(_df, ignore_index=True)
except:
traceback.print_exc()
self.logger.error("Error while processing data for time span: {} !!".format(between_time))
self.logger.debug("Processed {:<5} of {:>5} ... ".format(i, len(list_between_times)))
i += 1
self.logger.debug("Finished...")
self.retrieved_data = df
return df
[docs] def process_data(self):
"""
Method for processing of the data - aggregation.
:return: tuple
"""
# TODO some imputing ? + modification of the query
self.aggregated_df_list = [{'sensor_name': 'REF_TT', 'df': self.retrieved_data}]
return self.aggregated_df_list, None
@check_params
def _retrieve_partial_interval_data(self, section, between_time):
"""
Function for retrieving the traveltime data for given session and time-span from DB. Retrieves ground truth.
:param section: string - which section`s data should be loaded
:param between_time: dict - {'from': datetime, 'to': datetime}
:return: pandas.DataFrame - referential
"""
ref_traveltimes_query = """
SELECT calculation_time, tt_real
FROM tt4.tt_real_traveltimes
where section_id=%s
and (calculation_time between %s and %s)
and tt_real is not null and tt_real > 0
order by calculation_time asc;
"""
df_referential = None
try:
section_name = self._get_real_tt_section_name(connection=self.mysqlconn_real_tt, section=section)
df_referential = pd.read_sql(sql=ref_traveltimes_query,
params=(section_name,
between_time['from'].strftime(self.mysql_datetime_format),
between_time['to'].strftime(self.mysql_datetime_format)),
con=self.mysqlconn_real_tt)
# df_referential[df_referential <= 0] = np.nan
# df_referential = df_referential.interpolate().fillna(method='bfill').fillna(method='ffill')
except:
traceback.print_exc()
return df_referential
def _get_real_tt_section_name(self, section, connection):
"""
Private method for looking up the valid identifier of the section used in real tt data.
:param section: string - section identifier, e.g. 'KOCE-LNCE'
:param connection: object - database connection, instance of opened PyMySQL
:return: the identifier if exists.
"""
cursor = connection.cursor()
cursor.execute("""
SELECT section_id
FROM tt_real_traveltimes
WHERE section_id LIKE %s
LIMIT 1
""", ('%' + section[:9] + '%',))
backwards_section = cursor.fetchone()
cursor.close()
return backwards_section[0] if backwards_section is not None else None
def _aggregate(self, df):
pass
def _reorganize(self, obj):
pass