Source code for test_suite.plots.data_plotting

from datetime import datetime, timedelta

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from collections import OrderedDict

from sklearn.metrics import r2_score

from test_suite.benchmarks._criteria import mean_absolute_percentage_error, root_mean_squared_logarithmic_error, \
    mean_absolute_error, root_mean_squared_error
from traveltimes_prediction.support_files.helpers import impute
from traveltimes_prediction.support_files import ColumnNames, ColumnNamesRawFiveMin


# def plot_multiple_predictions_black_and_white(list_of_tuples, EN=True):
#     """
#     Function for plotting of the comparison of predictions produced by various models.
#
#     :param list list_of_tuples: list of pd.DataFrames(columns=['test_time', 'real', 'bck', 'est'])
#     :param boolean EN: if the labels should be in English.
#     """
#
#     # Plot the series
#     plt.figure()
#     plt.style.use('grayscale')
#
#     first_run = True
#     _max = 0
#     _ref = None
#     _bck = None
#     EN=False
#     ll = []
#     styles = ['--', ':', '-.']
#     for tuple_val in list_of_tuples:
#         calc_time = tuple_val['test_time'].apply(pd.to_datetime)
#
#         if first_run:  # plot real + bck
#             _ref = tuple_val['real'].values.astype(float)
#             _bck = tuple_val['bck'].values.astype(float)
#             # if EN:
#             #     plt.plot(calc_time, _ref, label='Real')
#             # else:
#             #     plt.plot(calc_time, _ref, label='Referenčná dojazdová doba')
#             plt.plot(calc_time, _ref, '-o',label='Reálna dojazdová doba')
#
#             e_mape = mean_absolute_percentage_error(_ref, _bck)
#             e_mae= mean_absolute_error(_ref, _bck)
#             e_rmsle = root_mean_squared_logarithmic_error(_ref, _bck)
#             e_camea_v2 = camea_custom_error_v2(_ref, _bck)
#             r2 = r2_score(_ref, _bck)
#
#             # if EN:
#             #     plt.plot(calc_time, _bck, label='Current type of prediction, MAPE: {:.2f}, RMSLE: {:.3f}, '
#             #                                     'CAMEA: {:.2f}'.format(e_mape, e_rmsle, e_camea_v2))
#             # else:
#             #     plt.plot(calc_time, _bck, label='Súčasný model predikcie, MAPE: {:.2f}, RMSLE: {:.3f}, '
#             #                                     'CAMEA: {:.2f}'.format(e_mape, e_rmsle, e_camea_v2))
#
#
#             plt.plot(calc_time, _bck, label='Okamžitá dojazdová doba')
#
#             first_run = False
#             ll.append({'model': 'BCK',
#                        'algorithm': None,
#                        'params': None,
#                        'MAPE': e_mape, 'MAE':e_mae, 'RMSLE': e_rmsle, 'CAMEA': e_camea_v2, 'R^2': r2,
#                        'fit_time': None,
#                        "predict_time": None})
#         # _max = max(tuple_val['est'])
#         # break
#         _ref = _ref.astype(float)
#         tuple_val['est'] = tuple_val['est'].astype(float)
#
#         e_mape = mean_absolute_percentage_error(_ref, tuple_val['est'])
#         e_mae = mean_absolute_error(_ref, tuple_val['est'])
#         e_rmsle = root_mean_squared_logarithmic_error(_ref, tuple_val['est'])
#         e_camea_v2 = camea_custom_error_v2(_ref, tuple_val['est'])
#         r2 = r2_score(_ref, tuple_val['est'])
#
#         # plt.plot(calc_time, tuple_val['est'], linestyle='--',label='{}, {}, MAPE: {:.2f}, RMSLE: {:.3f}, '
#         #                                             'CAMEA: {:.3f})'.format(tuple_val['algo'], tuple_val['params'],
#         #                                                                                e_mape, e_rmsle, e_camea_v2))
#
#         plt.plot(calc_time, tuple_val['est'], linestyle=styles.pop(0), label='{}'.format(tuple_val['algo']))
#
#         if max(tuple_val['est']) > _max:
#             _max = max(tuple_val['est'])
#         ll.append({'model': tuple_val['algo'],
#                    'algorithm': None,
#                    'params': tuple_val['params'],
#                    'MAPE': e_mape, 'MAE':e_mae, 'RMSLE': e_rmsle, 'CAMEA': e_camea_v2, 'R^2': r2,
#                    'fit_time': tuple_val['time_metrics']['fit'], "predict_time": tuple_val['time_metrics']['predict']})
#     dff = pd.DataFrame(ll)
#     print(dff)
#
#     # plt.title('Comparison of various algorithms for traveltime prediction')
#     if EN:
#         plt.xlabel('Time of day')
#         plt.ylabel('Traveltime [s]')
#     else:
#         plt.xlabel('Čas dňa [-]')
#         plt.ylabel('Dojazdová doba [s]')
#     plt.yticks(np.arange(60, _max + 1, 60))
#     p = plt.legend()
#     p.set_zorder(20)
#     p.draw_frame(True)
#
#     plt.show()


def _prediction_errors(reference, estimate):
    """
    Evaluate ``estimate`` against ``reference`` with the metrics shown in the comparison table.

    :param reference: referential travel times
    :param estimate: predicted travel times
    :return: list of (column name, value) pairs, in the order of the table columns
    """
    return [
        ('MAPE [%]', mean_absolute_percentage_error(reference, estimate)),
        ('RMSLE [-]', root_mean_squared_logarithmic_error(reference, estimate)),
        ('MAE [s]', mean_absolute_error(reference, estimate)),
        ('RMSE [-]', root_mean_squared_error(reference, estimate)),
        ('R^2 [-]', r2_score(reference, estimate)),
    ]


def plot_multiple_predictions(list_of_tuples, EN=True):
    """
    Function for plotting of the comparison of predictions produced by various models.

    The referential series ('real') and the currently used prediction ('bck') are taken from the
    first entry only and every entry's 'est' series is compared against that reference. Besides
    the plot, two LaTeX tables are printed: the error metrics and the fit/predict times.

    :param list list_of_tuples: entries indexable by the keys 'test_time', 'real', 'bck', 'est', 'algo'
                                and 'time_metrics' (the latter holding 'fit' and 'predict')
    :param boolean EN: if the plot labels should be in English (Slovak otherwise).
    """
    plt.subplots(1)

    _max = 0
    _ref = None
    metric_rows = []
    timing_rows = []

    for entry in list_of_tuples:
        calc_time = entry['test_time'].apply(pd.to_datetime)

        if _ref is None:  # first entry: plot real + bck
            _ref = entry['real'].values.astype(float)
            _bck = entry['bck'].values.astype(float)

            plt.plot(calc_time, _ref, label='Real' if EN else 'Referenčná dojazdová doba')
            plt.plot(calc_time, _bck,
                     label='Current type of prediction' if EN else 'Súčasný model predikcie')

            metric_rows.append(OrderedDict([('Model', 'BCK')] + _prediction_errors(_ref, _bck)))
            timing_rows.append(OrderedDict((('Model', 'BCK'),
                                            ('Čas tréningu [s]', None),
                                            ("Čas predikcie [s]", None))))

        # Local float copy - the caller's data is left untouched.
        est = entry['est'].astype(float)
        plt.plot(calc_time, est, linestyle='--', label='{}'.format(entry['algo']))

        est_max = max(est)
        if est_max > _max:
            _max = est_max

        metric_rows.append(OrderedDict([('Model', entry['algo'])] + _prediction_errors(_ref, est)))
        timing_rows.append(OrderedDict((('Model', entry['algo']),
                                        ('Čas tréningu [s]', entry['time_metrics']['fit']),
                                        ("Čas predikcie [s]", entry['time_metrics']['predict']))))

    dff_metrics = pd.DataFrame(metric_rows)
    dff_time = pd.DataFrame(timing_rows)
    print(dff_metrics.to_latex(float_format='%.2f', index=False))
    print(dff_time.to_latex(float_format='%.2f', index=False))

    if EN:
        plt.xlabel('Time of day')
        plt.ylabel('Traveltime [s]')
    else:
        plt.xlabel('Čas dňa [-]')
        plt.ylabel('Dojazdová doba [s]')
    # One tick per minute of travel time, up to the largest plotted estimate.
    plt.yticks(np.arange(60, _max + 1, 60))
    plt.legend()
    plt.show()
def plot_output_comparison(df, EN=True):
    """
    Function, similar as the function 'plot_multiple_predictions', however for a single output frame:
    plots the referential series together with the 'bck' and 'est' series and prints their errors.

    Gaps in 'real' and 'bck' are interpolated and any remaining NaN is replaced by 0; 'est' is passed
    through ``impute`` with -1 as the invalid value. The input frame itself is not modified.

    :param pd.DataFrame df: DataFrame with columns - 'test_time', 'real', 'bck', 'est'
    :param boolean EN: if the labels should be in English (False=Slovak)
    """
    # Work on a copy so that the imputed 'est' column does not leak into the caller's frame.
    data = df.copy()

    plt.figure()
    rows = []

    calc_time = data['test_time'].apply(pd.to_datetime)

    _ref = data['real'].interpolate().values.astype(float)
    _bck = data['bck'].interpolate().values.astype(float)
    # interpolate() cannot fill leading gaps - zero whatever is still missing.
    _ref[np.isnan(_ref)] = 0
    _bck[np.isnan(_bck)] = 0

    plt.plot(calc_time, _ref, label='Real' if EN else 'Referenčná dojazdová doba')

    # NOTE(review): camea_custom_error_v2 is not among the names imported at the top of this
    # module - confirm where it is supposed to come from, otherwise this raises NameError.
    e_mape = mean_absolute_percentage_error(_ref, _bck)
    e_rmsle = root_mean_squared_logarithmic_error(_ref, _bck)
    e_camea_v2 = camea_custom_error_v2(_ref, _bck)
    r2 = r2_score(_ref, _bck)

    if EN:
        bck_label = ('Current type of prediction, MAPE: {:.2f}, RMSLE: {:.3f}, '
                     'CAMEA: {:.2f}'.format(e_mape, e_rmsle, e_camea_v2))
    else:
        bck_label = ('Súčasný model predikcie, MAPE: {:.2f}, RMSLE: {:.3f}, '
                     'CAMEA: {:.2f}'.format(e_mape, e_rmsle, e_camea_v2))
    plt.plot(calc_time, _bck, label=bck_label)
    rows.append({'model': 'BCK', 'MAPE': e_mape, 'RMSLE': e_rmsle, 'CAMEA': e_camea_v2, 'R^2': r2})

    data['est'] = impute(array=data['est'].astype(float), columns=[0], invalid_val=-1)[0]
    est = data['est']

    e_mape = mean_absolute_percentage_error(_ref, est)
    e_rmsle = root_mean_squared_logarithmic_error(_ref, est)
    e_camea_v2 = camea_custom_error_v2(_ref, est)
    r2 = r2_score(_ref, est)

    plt.plot(calc_time, est, linestyle='--',
             label='MAPE: {:.2f}, RMSLE: {:.3f}, '
                   'CAMEA: {:.3f})'.format(e_mape, e_rmsle, e_camea_v2))
    rows.append({'model': 'EST', 'MAPE': e_mape, 'RMSLE': e_rmsle, 'CAMEA': e_camea_v2, 'R^2': r2})

    # Upper bound of the y ticks: the largest estimate, never below 0.
    _max = 0
    est_max = max(est)
    if est_max > _max:
        _max = est_max

    print(pd.DataFrame(rows))

    plt.title('Comparison of various algorithms for traveltime prediction')
    if EN:
        plt.xlabel('Time of day')
        plt.ylabel('Traveltime [s]')
    else:
        plt.xlabel('Čas dňa [-]')
        plt.ylabel('Dojazdová doba [s]')
    plt.yticks(np.arange(60, _max + 1, 60))
    plt.legend()
    plt.show()
def plot_multiple_days(file, images, use_subplots=False, EN=True):
    """
    Function for plotting referential traveltimes for all days of the week.

    The data are cut into one window per calendar day, starting at 06:00 and spanning 16 hours.
    Windows that are 7 days apart are drawn together, so each plot holds one day of the week.

    :param string file: path to the referential traveltimes pickled DataFrame
    :param string images: how the created images should be named (prefix of the output files)
    :param boolean use_subplots: if subplots should be used - one figure '<images>_overview.png';
                                 otherwise one figure '<images><n>.png' per day of the week
    :param boolean EN: if the labels should be in English or not (otherwise Slovak)
    """
    # NOTE: read_pickle can execute arbitrary code - only load files from a trusted source.
    ref_traveltimes_data = pd.read_pickle(file)

    first_stamp = ref_traveltimes_data.index.min()
    latest_day = ref_traveltimes_data.index.max()
    window_start = datetime(first_stamp.year, first_stamp.month, first_stamp.day, hour=6)

    if EN:
        days_mapping = {1: "Monday", 2: "Tuesday", 3: "Wednesday", 4: "Thursday", 5: "Friday", 6: "Saturday",
                        7: "Sunday"}
    else:
        days_mapping = {1: "Pondelok", 2: "Utorok", 3: "Streda", 4: "Štvrtok", 5: "Piatok", 6: "Sobota",
                        7: "Nedeľa"}

    # One 16-hour slice per calendar day; the first window is always taken.
    daily_windows = []
    while True:
        daily_windows.append(ref_traveltimes_data.loc[window_start:window_start + timedelta(hours=16)])
        window_start = window_start + timedelta(days=1)
        if window_start > latest_day:
            break

    if use_subplots:
        f, axarr = plt.subplots(4, 2, figsize=(10, 13))
        indices = [(m, n) for m in range(4) for n in range(2)]
        for d in range(7):
            ax = axarr[indices[d]]
            date = None
            for item in daily_windows[d::7]:
                if item.empty:
                    continue
                date = pd.to_datetime(item.index.min())
                label = "{}, weekday: {}".format(date.strftime("%d.%m."), days_mapping[date.isoweekday()])
                # Strip the date so that all weeks share the same time-of-day axis.
                ax.plot(pd.to_datetime(pd.to_datetime(list(item.index)).strftime('%H:%M:%S')),
                        item['tt_real'], label=label)

            if date is None:
                # Nothing was drawn for this day of the week - leave the axis untouched.
                continue

            ax.set_title(days_mapping[date.isoweekday()])
            if EN:
                ax.set_xlabel("Time of day [-]")
                ax.set_ylabel("Travel time [s]")
            else:
                ax.set_xlabel("Čas dňa [-]")
                ax.set_ylabel("Dojazdová doba [s]")
            plt.setp(ax.get_xticklabels(), rotation=45)

        plt.tight_layout()
        plt.subplots_adjust(top=0.95)
        plt.savefig(images + "_overview.png", bbox_inches='tight', dpi=200)
        plt.close()
    else:
        for day in range(7):
            for item in daily_windows[day::7]:
                if item.empty:
                    continue
                # NOTE(review): this branch reads the 'calculation_time_local' column while the
                # subplot branch uses the index - confirm the pickled frame provides both.
                date = pd.to_datetime(item['calculation_time_local'].min())
                label = "{}, weekday: {}".format(date.strftime("%d.%m."), days_mapping[date.isoweekday()])
                plt.plot(pd.to_datetime(pd.to_datetime(item['calculation_time_local']).dt.strftime('%H:%M:%S')),
                         item['tt_real'], label=label)
            plt.xlabel("Time of day")
            plt.ylabel("Traveltime [s]")
            plt.title("Overview of traveltimes for a certain weekday")
            plt.legend()
            # 'figsize' is a figure property, not a savefig option, so it is not passed here.
            plt.savefig(images + "{}.png".format(day), bbox_inches='tight', dpi=1200)
            plt.close()
def plot_correlation(X, Y):
    """
    Plot the clustered correlation matrix of the features together with the referential traveltimes.

    Rows containing any NaN are dropped first. The correlations are shown as absolute values in
    whole percent; undefined correlations are shown as 0. The correlations with the referential
    traveltime column are printed in descending order and the figure is saved to
    'correlation_matrix.png'.

    :param pd.DataFrame X: matrix of features
    :param pd.DataFrame Y: vector of referential traveltimes
    """
    joined = pd.concat([X, Y], axis=1).dropna(axis=0, how='any')
    percent = joined.corr().mul(100).abs().fillna(0).astype(int)

    ranking = percent[ColumnNamesRawFiveMin.TT_REAL].sort_values(ascending=False)
    print(ranking)

    cg = sns.clustermap(data=percent, annot=True, fmt='d', cmap='Reds', figsize=(23, 20))
    heatmap = cg.ax_heatmap
    plt.setp(heatmap.yaxis.get_majorticklabels(), rotation=-15)
    plt.setp(heatmap.xaxis.get_majorticklabels(), rotation=60)
    plt.title("Correlation matrix")
    plt.savefig("correlation_matrix.png", bbox_inches='tight', dpi=200)
def plot_clusters(estimated_labels, train_X, train_Y, save=False):
    """
    Method for plotting of the clustered data (clustered ndim features).

    Every sample is drawn at its position in the training set against its traveltime and is
    coloured by the cluster it was assigned to.

    :param numpy.ndarray estimated_labels: cluster label of every sample
    :param pandas.DataFrame train_X: features; only the number of rows is used
    :param pandas.DataFrame train_Y: traveltimes plotted on the y axis
    :param boolean save: if True the figure is saved to 'images/svr_2.pdf', otherwise it is shown
    """
    from itertools import cycle

    unique_labels = np.unique(estimated_labels)

    if save:
        plt.figure(dpi=1200, figsize=(25, 9))
    else:
        plt.figure()

    sample_no = np.arange(len(train_X))
    ax1 = plt.subplot(111)

    palette = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
    # Cycle the palette so that clusters beyond the eighth are still drawn.
    for label, colour in zip(unique_labels, cycle(palette)):
        members = estimated_labels == label
        ax1.scatter(sample_no[members], train_Y[members], c=colour, label=str(label))

    # One tick per minute of travel time.
    ax1.set_yticks(np.arange(60, np.max(train_Y.values) + 1, 60))
    ax1.set_xlabel("No. of sample [1 sample per minute]")
    ax1.set_ylabel("Traveltime [s]")
    ax1.set_title('Estimated vs. true Y')
    ax1.legend()

    if save:
        # The figure size is set on the figure above; savefig has no 'figsize' option.
        plt.savefig("images/svr_2.pdf", bbox_inches='tight', dpi=1200)
        plt.close()
    else:
        plt.show()
def plot_distribution(feature):
    """
    Show the distribution of a feature: kernel density estimate on top, vertical boxplot below.

    The title of the upper plot carries the result of the Shapiro-Wilk test of the feature.

    :param pandas.Series feature: values of the feature
    """
    import seaborn as sns
    from scipy import stats

    # Upper panel - density estimate, titled with the normality test result.
    plt.subplot(2, 1, 1)
    sns.kdeplot(feature)
    shapiro_result = stats.shapiro(feature)
    plt.title("p-value: " + str(shapiro_result))

    # Lower panel - boxplot.
    plt.subplot(2, 1, 2)
    sns.boxplot(feature, orient='v')

    plt.show()
def plot_boxplots(feature_in, unit="-", f_name=''):
    """
    Draw a horizontal boxplot of a feature and save it to 'distribution_<f_name>.png'.

    :param pd.Series feature_in: values of the feature
    :param string unit: unit shown in the x-axis label
    :param string f_name: suffix of the name of the saved image
    """
    x_label = 'Value of feature [{}]'.format(unit)
    target_file = 'distribution_{}.png'.format(f_name)

    plt.figure(figsize=(15, 7))
    sns.boxplot(feature_in, orient='h')
    plt.xlabel(x_label)
    plt.suptitle('Distribution of feature')
    plt.savefig(target_file, bbox_inches='tight')
def _print_spread_summary(values):
    """
    Print the IQR, the median, the 'extreme' threshold (IQR + median) and the portion of values above it.

    :param numpy.ndarray values: traveltimes to summarize
    """
    import scipy.stats as sts

    iqr = sts.iqr(values)
    median = np.median(values)
    threshold = iqr + median
    print('iqr', iqr, 'median', median, 'extreme', threshold,
          'portion of extremes', sum(values > threshold) / len(values))


def plot_boxplots_week(EN=True):
    """
    Method for comparison of the distribution of the traveltime during weekend and non-weekend days.

    The data are loaded through ``TestFramework`` from the KRCE-PICE pickles under '../../data/'.
    A short summary of both groups is printed, the two boxplots are saved to
    'distribution_traveltimes.png' and then shown.

    :param boolean EN: if the labels should be in English or not (Slovak otherwise).
    """
    from test_suite import TestFramework

    TF = TestFramework(X_path='../../data/feature_df_KRCE-PICE.pickle',
                       Y_path='../../data/ref_KRCE-PICE.pickle')
    # Only the training part of the split is used here.
    train_X, train_Y, _, _, _, _ = TF.get_subset(portion=1.0, cv_ratio=0.5)

    is_weekend = train_X[ColumnNames.FEAT_WEEKEND] == 1
    weekend_tt = train_Y[is_weekend].values
    _print_spread_summary(weekend_tt)

    week_tt = train_Y[train_X[ColumnNames.FEAT_WEEKEND] != 1].values
    _print_spread_summary(week_tt)

    plt.figure(figsize=(15, 7))
    # (values, English title, Slovak title) of the left and the right panel.
    panels = ((week_tt, 'Weekdays', 'Deň v týždni'),
              (weekend_tt, 'Weekend', 'Víkend'))
    for position, (values, title_en, title_sk) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        sns.boxplot(values, orient='v')
        if EN:
            plt.ylabel('Travel time [s]')
            plt.title(title_en)
        else:
            plt.ylabel('Dojazdová doba [s]')
            plt.title(title_sk)

    plt.savefig('distribution_traveltimes.png', bbox_inches='tight')
    plt.show()
def plot_time_clusters(X, Y, time):
    """
    Compare the traveltimes of individual days for a few selected time-of-day bins.

    The data are split into 30 consecutive days starting at 2016-10-17 00:00. For each of the time
    bins 50, 80, 100 and 110 one panel is drawn, holding one boxplot per day of the traveltimes
    that fall into that bin.

    :param pd.DataFrame X: features matrix
    :param pd.DataFrame Y: vector of referential traveltimes values
    :param pd.DataFrame time: timestamps of data
    """
    import datetime as dt

    time = pd.Series(pd.to_datetime(time))

    # Split traveltimes and features into distinct days.
    origin = dt.datetime(2016, 10, 17, 0, 0, 0)
    days = []
    for offset in range(30):
        in_day = time.between(origin + timedelta(days=offset), origin + timedelta(days=offset + 1))
        day_tt = Y[in_day.values]
        day_feat = X[in_day.values]
        days.append((day_tt, day_feat))

    clusters = [50, 80, 100, 110]
    f, axx = plt.subplots(2, 2)
    axx = axx.flatten()

    for day_no, (day_tt, day_feat) in enumerate(days):
        for panel_no, time_bin in enumerate(clusters):
            panel = axx[panel_no]
            # Narrow down to the bin and its two neighbours first, then pick the bin itself.
            nearby = day_feat[ColumnNames.FEAT_TIME_BIN].isin([time_bin - 1, time_bin, time_bin + 1])
            nearby_tt = pd.DataFrame(day_tt[nearby])
            in_bin = day_feat[ColumnNames.FEAT_TIME_BIN] == time_bin
            panel.boxplot(nearby_tt[in_bin].values, positions=[day_no])
            panel.set_xlim([-1, len(days) + 1])
            panel.set_xticks([])

    plt.suptitle('Comparison of traveltimes values for several days with respect to the time-of-day')
    plt.show()
# def compare_ref_bck_tt(EN=True): # """ # Function for plotting the comparison of the referential traveltimes and the mode that is currently used in CAMEA. # # :param boolean EN: if the labels should be in English (otherwise Slovak) # """ # bck = pd.read_pickle("../../data/bck_KRCE-PICE.pickle").iloc[19490:20370] # ref = pd.read_pickle("../../data/ref_KRCE-PICE.pickle").iloc[19490:20370] # # df = pd.merge(bck, ref,on=ColumnNamesRaw.CALC_TIME)#.iloc[71070:71900] # r = ref.tt_real # b = bck[[x for x in bck.columns if 'bck' in x][0]] # d = pd.to_datetime(ref.index) # # plt.figure(figsize=(15, 7)) # if EN: # plt.plot(d, r, label='Real travel times') # plt.plot(d, b, label='Instantaneous travel times') # plt.legend() # plt.xlabel('Time of day [-]') # plt.ylabel('Travel time [s]') # else: # plt.plot(d, r, label='Reálna dojazdová doba') # plt.plot(d, b, label='Okamžitá dojazdová doba') # plt.legend() # plt.xlabel('Čas dňa [-]') # plt.ylabel('Dojazodová doba [s]') # plt.savefig("travel_times_comparison", bbox_inches='tight', dpi=200)
def plot_all_slices_distribution(df, feature):
    """
    Function for plotting the distribution of the features for all slices.

    Every column of ``df`` whose name contains ``feature`` gets its own horizontal boxplot, laid
    out in two columns. The panel title is the part of the column name after the last '-'. The
    figure is saved to 'distribution_<feature>.png'.

    :param pd.DataFrame df: DataFrame holding the feature columns
    :param string feature: name of the feature that should be plotted.
    :raises ValueError: if no column name contains ``feature``
    """
    import math

    list_to_plot = [x for x in df.columns if feature in x]
    if not list_to_plot:
        raise ValueError("No column containing '{}' found.".format(feature))

    rows = math.ceil(len(list_to_plot) / 2)
    # squeeze=False keeps the axes array 2-D even for a single row, so (row, column) indexing works.
    f, axarr = plt.subplots(rows, 2, figsize=(10, 13), squeeze=False)

    for idx, feat in enumerate(list_to_plot):
        ax = axarr[divmod(idx, 2)]  # fill the grid row by row
        parts = feat.split('-')
        sns.boxplot(df[feat].values, orient='h', ax=ax)
        ax.set_title(parts[-1])
        ax.set_xlabel(feature + (' [$km.h^{-1}$]' if 'velocity' in parts[0] else ' [-]'))

    if len(list_to_plot) % 2:
        # Odd number of panels - hide the unused last axis.
        axarr[-1, -1].set_visible(False)

    plt.tight_layout(pad=2.5)
    plt.savefig('distribution_{}.png'.format(feature), bbox_inches='tight', dpi=200)
if __name__ == '__main__':
    # Ad-hoc entry point for running a single plot by hand. Only plot_boxplots_week() is active;
    # the commented-out calls below are earlier experiments kept for reference.

    # df = pd.read_pickle("../../data/feature_df_KRCE-PICE_.pickle")
    #
    # plot_all_slices_distribution(df, 'rdet_velocity')

    # plot_boxplots_week(False)
    # c_name = ColumnNamesRaw.VELOCITY
    #
    # d_in = pd.read_csv("../../data/sensors_in_november")
    # dd_in = d_in[c_name].values
    #
    # d_out = pd.read_csv("../../data/sensors_out_november")
    # dd_out = d_out[c_name].values
    #
    # df = pd.read_csv("C:\\Users\\mudroch\\Desktop\\traveltimes_repo\\trunk\\results\\KRCE-PICE\\output.csv")
    # plot_output_comparison(df)
    # c_name = ColumnNamesRaw.VELOCITY
    #
    # d_in = pd.read_csv("../../data/sensors_in_november")
    # dd_in = d_in[c_name].values
    #
    # d_out = pd.read_csv("../../data/sensors_out_november")
    # dd_out = d_out[c_name].values
    #
    # plot_boxplots(dd_in, dd_out, r'$m.s^{-1}$', 'velocity')
    #
    # plot_multiple_days("../../data/")
    #plot_time_clusters()
    #compare_ref_bck_tt()
    plot_boxplots_week()
    # plot_multiple_days('../../data/ref_KRCE-PICE.pickle', images='traveltime_days', use_subplots=True, EN=True)
    #compare_ref_bck_tt(False)
    # Y = pd.read_pickle("../../data/ref_KRCE-PICE.pickle")
    # X = pd.read_pickle("../../data/feature_df_KRCE-PICE.pickle")
    #
    # plot_correlation(X, Y)
    # plot_time_clusters(X, Y, pd.to_datetime(X.index.values))