from datetime import datetime, timedelta
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from collections import OrderedDict
from sklearn.metrics import r2_score
from test_suite.benchmarks._criteria import mean_absolute_percentage_error, root_mean_squared_logarithmic_error, \
mean_absolute_error, root_mean_squared_error
from traveltimes_prediction.support_files.helpers import impute
from traveltimes_prediction.support_files import ColumnNames, ColumnNamesRawFiveMin
# def plot_multiple_predictions_black_and_white(list_of_tuples, EN=True):
# """
# Function for plotting of the comparison of predictions produced by various models.
#
# :param list list_of_tuples: list of pd.DataFrames(columns=['test_time', 'real', 'bck', 'est'])
# :param boolean EN: if the labels should be in English.
# """
#
# # Plot the series
# plt.figure()
# plt.style.use('grayscale')
#
# first_run = True
# _max = 0
# _ref = None
# _bck = None
# EN=False
# ll = []
# styles = ['--', ':', '-.']
# for tuple_val in list_of_tuples:
# calc_time = tuple_val['test_time'].apply(pd.to_datetime)
#
# if first_run: # plot real + bck
# _ref = tuple_val['real'].values.astype(float)
# _bck = tuple_val['bck'].values.astype(float)
# # if EN:
# # plt.plot(calc_time, _ref, label='Real')
# # else:
# # plt.plot(calc_time, _ref, label='Referenčná dojazdová doba')
# plt.plot(calc_time, _ref, '-o',label='Reálna dojazdová doba')
#
# e_mape = mean_absolute_percentage_error(_ref, _bck)
# e_mae= mean_absolute_error(_ref, _bck)
# e_rmsle = root_mean_squared_logarithmic_error(_ref, _bck)
# e_camea_v2 = camea_custom_error_v2(_ref, _bck)
# r2 = r2_score(_ref, _bck)
#
# # if EN:
# # plt.plot(calc_time, _bck, label='Current type of prediction, MAPE: {:.2f}, RMSLE: {:.3f}, '
# # 'CAMEA: {:.2f}'.format(e_mape, e_rmsle, e_camea_v2))
# # else:
# # plt.plot(calc_time, _bck, label='Súčasný model predikcie, MAPE: {:.2f}, RMSLE: {:.3f}, '
# # 'CAMEA: {:.2f}'.format(e_mape, e_rmsle, e_camea_v2))
#
#
# plt.plot(calc_time, _bck, label='Okamžitá dojazdová doba')
#
# first_run = False
# ll.append({'model': 'BCK',
# 'algorithm': None,
# 'params': None,
# 'MAPE': e_mape, 'MAE':e_mae, 'RMSLE': e_rmsle, 'CAMEA': e_camea_v2, 'R^2': r2,
# 'fit_time': None,
# "predict_time": None})
# # _max = max(tuple_val['est'])
# # break
# _ref = _ref.astype(float)
# tuple_val['est'] = tuple_val['est'].astype(float)
#
# e_mape = mean_absolute_percentage_error(_ref, tuple_val['est'])
# e_mae = mean_absolute_error(_ref, tuple_val['est'])
# e_rmsle = root_mean_squared_logarithmic_error(_ref, tuple_val['est'])
# e_camea_v2 = camea_custom_error_v2(_ref, tuple_val['est'])
# r2 = r2_score(_ref, tuple_val['est'])
#
# # plt.plot(calc_time, tuple_val['est'], linestyle='--',label='{}, {}, MAPE: {:.2f}, RMSLE: {:.3f}, '
# # 'CAMEA: {:.3f})'.format(tuple_val['algo'], tuple_val['params'],
# # e_mape, e_rmsle, e_camea_v2))
#
# plt.plot(calc_time, tuple_val['est'], linestyle=styles.pop(0), label='{}'.format(tuple_val['algo']))
#
# if max(tuple_val['est']) > _max:
# _max = max(tuple_val['est'])
# ll.append({'model': tuple_val['algo'],
# 'algorithm': None,
# 'params': tuple_val['params'],
# 'MAPE': e_mape, 'MAE':e_mae, 'RMSLE': e_rmsle, 'CAMEA': e_camea_v2, 'R^2': r2,
# 'fit_time': tuple_val['time_metrics']['fit'], "predict_time": tuple_val['time_metrics']['predict']})
# dff = pd.DataFrame(ll)
# print(dff)
#
# # plt.title('Comparison of various algorithms for traveltime prediction')
# if EN:
# plt.xlabel('Time of day')
# plt.ylabel('Traveltime [s]')
# else:
# plt.xlabel('Čas dňa [-]')
# plt.ylabel('Dojazdová doba [s]')
# plt.yticks(np.arange(60, _max + 1, 60))
# p = plt.legend()
# p.set_zorder(20)
# p.draw_frame(True)
#
# plt.show()
def _prediction_metrics_row(model, reference, predicted):
    """
    Build one row of the accuracy table printed by 'plot_multiple_predictions'.

    :param string model: name shown in the 'Model' column
    :param reference: referential (real) traveltimes
    :param predicted: predicted traveltimes that are compared against the reference
    :return: OrderedDict, so that the columns of the printed table keep this order
    """
    return OrderedDict((('Model', model),
                        ('MAPE [%]', mean_absolute_percentage_error(reference, predicted)),
                        ('RMSLE [-]', root_mean_squared_logarithmic_error(reference, predicted)),
                        ('MAE [s]', mean_absolute_error(reference, predicted)),
                        ('RMSE [-]', root_mean_squared_error(reference, predicted)),
                        ('R^2 [-]', r2_score(reference, predicted))))


def plot_multiple_predictions(list_of_tuples, EN=True):
    """
    Function for plotting of the comparison of predictions produced by various models.

    The referential ('real') and the currently used ('bck') series are taken from the first item only;
    every item contributes its own 'est' series. Besides the plot, two LaTeX tables are printed:
    the accuracy metrics and the fit/predict times of every model. The headers of the timing table
    are Slovak regardless of 'EN'.

    The items themselves are not modified.

    :param list list_of_tuples: items indexable by the keys 'test_time', 'real', 'bck', 'est', 'algo'
        and 'time_metrics' (the latter indexable by 'fit' and 'predict')
    :param boolean EN: if the labels of the plot should be in English (Slovak otherwise).
    """
    # Open a fresh figure so the series are not drawn into whatever figure is current.
    plt.subplots(1)
    _max = 0
    _ref = None
    metrics_rows = []
    timing_rows = []

    for tuple_val in list_of_tuples:
        calc_time = tuple_val['test_time'].apply(pd.to_datetime)

        if _ref is None:
            # First item: plot and evaluate the reference and the currently used prediction once.
            _ref = tuple_val['real'].values.astype(float)
            _bck = tuple_val['bck'].values.astype(float)
            plt.plot(calc_time, _ref, label='Real' if EN else 'Referenčná dojazdová doba')
            plt.plot(calc_time, _bck, label='Current type of prediction' if EN else 'Súčasný model predikcie')
            metrics_rows.append(_prediction_metrics_row('BCK', _ref, _bck))
            timing_rows.append(OrderedDict((('Model', 'BCK'),
                                            ('Čas tréningu [s]', None),
                                            ("Čas predikcie [s]", None))))

        # Local float copy; the caller's data is left untouched.
        est = tuple_val['est'].astype(float)
        plt.plot(calc_time, est, linestyle='--', label='{}'.format(tuple_val['algo']))
        # The highest estimate determines how far the y ticks have to reach.
        _max = max(_max, est.max())

        metrics_rows.append(_prediction_metrics_row(tuple_val['algo'], _ref, est))
        timing_rows.append(OrderedDict((('Model', tuple_val['algo']),
                                        ('Čas tréningu [s]', tuple_val['time_metrics']['fit']),
                                        ("Čas predikcie [s]", tuple_val['time_metrics']['predict']))))

    dff_metrics = pd.DataFrame(metrics_rows)
    dff_time = pd.DataFrame(timing_rows)
    print(dff_metrics.to_latex(float_format='%.2f', index=False))
    print(dff_time.to_latex(float_format='%.2f', index=False))

    if EN:
        plt.xlabel('Time of day')
        plt.ylabel('Traveltime [s]')
    else:
        plt.xlabel('Čas dňa [-]')
        plt.ylabel('Dojazdová doba [s]')
    # One tick per minute of traveltime.
    plt.yticks(np.arange(60, _max + 1, 60))
    plt.legend()
    plt.show()
def plot_output_comparison(df, EN=True):
    """
    Function, similar as the function 'plot_multiple_predictions', however for a single output DataFrame.

    Plots the referential traveltimes, the currently used prediction and the estimate, and prints
    a table with the MAPE, RMSLE and R^2 of the latter two against the reference.

    :param pd.DataFrame df: DataFrame with columns - 'test_time', 'real', 'bck', 'est'; it is not modified
    :param boolean EN: if the labels should be in English (False=Slovak)
    """
    # Work on a copy so that imputing 'est' below does not alter the caller's DataFrame.
    df = df.copy()
    plt.figure()
    rows = []

    calc_time = df['test_time'].apply(pd.to_datetime)
    _ref = df['real'].interpolate().values.astype(float)
    _bck = df['bck'].interpolate().values.astype(float)
    # Any value that is still missing after the interpolation is replaced by zero.
    _ref[np.isnan(_ref)] = 0
    _bck[np.isnan(_bck)] = 0

    plt.plot(calc_time, _ref, label='Real' if EN else 'Referenčná dojazdová doba')

    e_mape = mean_absolute_percentage_error(_ref, _bck)
    e_rmsle = root_mean_squared_logarithmic_error(_ref, _bck)
    r2 = r2_score(_ref, _bck)
    if EN:
        bck_label = 'Current type of prediction, MAPE: {:.2f}, RMSLE: {:.3f}'
    else:
        bck_label = 'Súčasný model predikcie, MAPE: {:.2f}, RMSLE: {:.3f}'
    plt.plot(calc_time, _bck, label=bck_label.format(e_mape, e_rmsle))
    rows.append({'model': 'BCK', 'MAPE': e_mape, 'RMSLE': e_rmsle, 'R^2': r2})

    # NOTE(review): presumably 'impute' replaces the invalid (-1) estimates - confirm against the helper.
    df['est'] = impute(array=df['est'].astype(float), columns=[0], invalid_val=-1)[0]
    e_mape = mean_absolute_percentage_error(_ref, df['est'])
    e_rmsle = root_mean_squared_logarithmic_error(_ref, df['est'])
    r2 = r2_score(_ref, df['est'])
    plt.plot(calc_time, df['est'], linestyle='--',
             label='MAPE: {:.2f}, RMSLE: {:.3f}'.format(e_mape, e_rmsle))
    rows.append({'model': 'EST', 'MAPE': e_mape, 'RMSLE': e_rmsle, 'R^2': r2})

    print(pd.DataFrame(rows))

    plt.title('Comparison of various algorithms for traveltime prediction')
    if EN:
        plt.xlabel('Time of day')
        plt.ylabel('Traveltime [s]')
    else:
        plt.xlabel('Čas dňa [-]')
        plt.ylabel('Dojazdová doba [s]')
    # One tick per minute of traveltime, up to the highest estimate.
    _max = max(0, max(df['est']))
    plt.yticks(np.arange(60, _max + 1, 60))
    plt.legend()
    plt.show()
def plot_multiple_days(file, images, use_subplots=False, EN=True):
    """
    Function for plotting referential traveltimes for all days of the week.

    The data are cut into one 16-hour window per calendar day, starting at 06:00, and the windows
    that are seven days apart (i.e. the same weekday) are drawn together.

    :param string file: path to the referential traveltimes pickled DataFrame (only load trusted
        files, unpickling can execute arbitrary code)
    :param string images: prefix of the created image files - '<images>_overview.png' when subplots
        are used, '<images>0.png' ... '<images>6.png' otherwise
    :param boolean use_subplots: if subplots should be used (otherwise one image per weekday is created)
    :param boolean EN: if the labels should be in English or not (otherwise Slovak)
    """
    ref_traveltimes_data = pd.read_pickle(file)
    first_stamp = ref_traveltimes_data.index.min()
    latest_day = ref_traveltimes_data.index.max()
    window_start = datetime(first_stamp.year, first_stamp.month, first_stamp.day, hour=6)

    if EN:
        days_mapping = {1: "Monday", 2: "Tuesday", 3: "Wednesday", 4: "Thursday", 5: "Friday",
                        6: "Saturday", 7: "Sunday"}
    else:
        days_mapping = {1: "Pondelok", 2: "Utorok", 3: "Streda", 4: "Štvrtok", 5: "Piatok", 6: "Sobota",
                        7: "Nedeľa"}

    # One slice per calendar day; at least the first day is always taken.
    daily_slices = []
    while True:
        daily_slices.append(ref_traveltimes_data.loc[window_start:window_start + timedelta(hours=16)])
        window_start = window_start + timedelta(days=1)
        if window_start > latest_day:
            break

    if use_subplots:
        f, axarr = plt.subplots(4, 2, figsize=(10, 13))
        axes = axarr.ravel()  # row-major order: offset d is drawn into row d // 2, column d % 2

        # 'd' is the offset from the first day in the data, not the ISO weekday number.
        for d in range(7):
            ax = axes[d]
            for item in daily_slices[d::7]:
                if item.empty:
                    continue
                date = pd.to_datetime(item.index.min())
                weekday = days_mapping[date.isoweekday()]
                label = "{}, weekday: {}".format(date.strftime("%d.%m."), weekday)
                # Only the time of day is kept, so that all the days share one x axis.
                ax.plot(pd.to_datetime(pd.to_datetime(list(item.index)).strftime('%H:%M:%S')),
                        item['tt_real'], label=label)

                # Set here, so that the decoration relies only on a date that really exists.
                ax.set_title(weekday)
                if EN:
                    ax.set_xlabel("Time of day [-]")
                    ax.set_ylabel("Travel time [s]")
                else:
                    ax.set_xlabel("Čas dňa [-]")
                    ax.set_ylabel("Dojazdová doba [s]")
                plt.setp(ax.get_xticklabels(), rotation=45)

        plt.tight_layout()
        plt.subplots_adjust(top=0.95)
        plt.savefig(images + "_overview.png", bbox_inches='tight', dpi=200)
        plt.close()
    else:
        for day in range(7):
            for item in daily_slices[day::7]:
                if item.empty:
                    continue
                date = pd.to_datetime(item['calculation_time_local'].min())
                label = "{}, weekday: {}".format(date.strftime("%d.%m."), days_mapping[date.isoweekday()])
                plt.plot(pd.to_datetime(pd.to_datetime(item['calculation_time_local']).dt.strftime('%H:%M:%S')),
                         item['tt_real'], label=label)

            plt.xlabel("Time of day")
            plt.ylabel("Traveltime [s]")
            plt.title("Overview of traveltimes for a certain weekday")
            plt.legend()
            # 'figsize' is a property of the figure, savefig does not accept it.
            plt.savefig(images + "{}.png".format(day), bbox_inches='tight', dpi=1200)
            plt.close()
def plot_correlation(X, Y):
    """
    Function for plotting of the correlation matrix of the features & referential traveltimes.

    The absolute correlations are expressed in whole percents; the correlations with the referential
    traveltimes are printed in descending order and the clustered matrix is saved
    into 'correlation_matrix.png'.

    :param pd.DataFrame X: matrix of features
    :param pd.DataFrame Y: vector of referential traveltimes
    """
    # Only the rows that are complete in both the features and the target are correlated.
    joined = pd.concat([X, Y], axis=1).dropna(axis=0, how='any')

    # Undefined correlations are shown as 0.
    corr = joined.corr().mul(100).abs().fillna(0).astype(int)
    print(corr[ColumnNamesRawFiveMin.TT_REAL].sort_values(ascending=False))

    cg = sns.clustermap(data=corr, annot=True, fmt='d', cmap='Reds', figsize=(23, 20))
    heatmap = cg.ax_heatmap
    plt.setp(heatmap.yaxis.get_majorticklabels(), rotation=-15)
    plt.setp(heatmap.xaxis.get_majorticklabels(), rotation=60)
    plt.title("Correlation matrix")
    plt.savefig("correlation_matrix.png", bbox_inches='tight', dpi=200)
def plot_clusters(estimated_labels, train_X, train_Y, save=False):
    """
    Method for plotting of the clustered data (clustered ndim features).

    The traveltimes are drawn against the ordinal number of the sample, every cluster in its own colour.

    :param numpy.ndarray estimated_labels: label of the cluster for every sample
    :param pandas.DataFrame train_X: features; only their count is used
    :param pandas.DataFrame train_Y: traveltimes of the samples
    :param boolean save: if the plot should be saved into 'images/svr_2.pdf' instead of being shown
    """
    unique_labels = np.unique(estimated_labels)

    if save:
        plt.figure(dpi=1200, figsize=(25, 9))
    else:
        plt.figure()

    sample_no = np.arange(len(train_X))
    ax1 = plt.subplot(111)
    cs = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']

    for idx, label in enumerate(unique_labels):
        in_cluster = estimated_labels == label
        # The colours repeat when there are more clusters than colours, so that no cluster is left out.
        ax1.scatter(sample_no[in_cluster], train_Y[in_cluster], c=cs[idx % len(cs)],
                    label='{}'.format(label))

    # One tick per minute of traveltime.
    ax1.set_yticks(np.arange(60, np.max(train_Y.values) + 1, 60))
    ax1.set_xlabel("No. of sample [1 sample per minute]")
    ax1.set_ylabel("Traveltime [s]")
    ax1.set_title('Estimated vs. true Y')
    ax1.legend()

    if save:
        # The size is given by the figure created above, savefig does not accept 'figsize'.
        plt.savefig("images/svr_2.pdf", bbox_inches='tight', dpi=1200)
        plt.close()
    else:
        plt.show()
def plot_distribution(feature):
    """
    Method for plotting of the distribution of the feature.

    The kernel density estimate is drawn above the boxplot; the title shows the result
    of the Shapiro-Wilk test of normality.

    :param pandas.Series feature: values of the feature
    """
    from scipy import stats

    # shapiro() returns the test statistic together with the p-value.
    statistic, p_value = stats.shapiro(feature)

    plt.subplot(2, 1, 1)
    sns.kdeplot(feature)
    plt.title("Shapiro-Wilk W: {:.4f}, p-value: {:.4g}".format(statistic, p_value))

    plt.subplot(2, 1, 2)
    sns.boxplot(feature, orient='v')
    plt.show()
def plot_boxplots(feature_in, unit="-", f_name=''):
    """
    Method for plotting of the boxplot of feature.

    The horizontal boxplot is saved into 'distribution_<f_name>.png'.

    :param pd.Series feature_in: values of the feature
    :param string unit: unit of the feature, shown in the label of the x axis
    :param string f_name: name of the feature, used in the name of the saved image
    """
    x_label = 'Value of feature [{}]'.format(unit)
    image_name = 'distribution_{}.png'.format(f_name)

    plt.figure(figsize=(15, 7))
    sns.boxplot(feature_in, orient='h')
    plt.xlabel(x_label)
    plt.suptitle('Distribution of feature')
    plt.savefig(image_name, bbox_inches='tight')
def _print_traveltime_stats(values):
    """
    Print the IQR, the median, the threshold of extremes (IQR + median) and the portion of values above it.

    :param numpy.ndarray values: traveltimes
    """
    import scipy.stats as sts

    iqr = sts.iqr(values)
    median = np.median(values)
    extreme = iqr + median
    print('iqr', iqr, 'median', median,
          'extreme', extreme,
          'portion of extremes', sum(values > extreme) / len(values))


def plot_boxplots_week(EN=True, X_path='../../data/feature_df_KRCE-PICE.pickle',
                       Y_path='../../data/ref_KRCE-PICE.pickle'):
    """
    Method for comparison of the distribution of the traveltime during weekend and non-weekend days.

    The statistics of both groups are printed (weekend first) and the two boxplots are saved
    into 'distribution_traveltimes.png' and shown.

    :param boolean EN: if the labels should be in English or not (Slovak otherwise).
    :param string X_path: path to the pickled features, passed to the TestFramework
    :param string Y_path: path to the pickled referential traveltimes, passed to the TestFramework
    """
    from test_suite import TestFramework

    TF = TestFramework(X_path=X_path, Y_path=Y_path)
    train_X, train_Y, _test_X, _test_Y, _test_time, _train_time = TF.get_subset(portion=1.0, cv_ratio=0.5)

    weekend_flag = train_X[ColumnNames.FEAT_WEEKEND]
    weekend_tt = train_Y[weekend_flag == 1].values
    week_tt = train_Y[weekend_flag != 1].values

    _print_traveltime_stats(weekend_tt)
    _print_traveltime_stats(week_tt)

    if EN:
        y_label = 'Travel time [s]'
        titles = ('Weekdays', 'Weekend')
    else:
        y_label = 'Dojazdová doba [s]'
        titles = ('Deň v týždni', 'Víkend')

    plt.figure(figsize=(15, 7))
    # Weekdays go into the left panel, the weekend into the right one.
    for position, (values, title) in enumerate(zip((week_tt, weekend_tt), titles), start=1):
        plt.subplot(1, 2, position)
        sns.boxplot(values, orient='v')
        plt.ylabel(y_label)
        plt.title(title)

    plt.savefig('distribution_traveltimes.png', bbox_inches='tight')
    plt.show()
def plot_time_clusters(X, Y, time):
    """
    Function for plotting of the distribution of the traveltimes with respect to the time of day.

    For 30 consecutive days starting on 2016-10-17, the traveltimes belonging to four fixed time bins
    (50, 80, 100, 110) are drawn as boxplots - one panel per time bin, one box per day.

    :param pd.DataFrame X: features matrix
    :param pd.DataFrame Y: vector of referential traveltimes values
    :param pd.DataFrame time: timestamps of data
    """
    time = pd.Series(pd.to_datetime(time))
    first_midnight = datetime(2016, 10, 17, 0, 0, 0)

    # Split the data into the distinct days: (traveltimes, features) per day.
    days = []
    for offset in range(30):
        in_day = time.between(first_midnight + timedelta(days=offset),
                              first_midnight + timedelta(days=offset + 1)).values
        days.append((Y[in_day], X[in_day]))

    clusters = [50, 80, 100, 110]
    f, axx = plt.subplots(2, 2)
    axx = axx.flatten()

    for day_no, (day_y, day_x) in enumerate(days):
        for panel, time_bin in zip(axx, clusters):
            around_bin = day_x[ColumnNames.FEAT_TIME_BIN].isin([time_bin - 1, time_bin, time_bin + 1])
            y_tmp = pd.DataFrame(day_y[around_bin])
            # Out of the neighbouring bins, only the samples of the bin itself are drawn.
            panel.boxplot(y_tmp[day_x[ColumnNames.FEAT_TIME_BIN] == time_bin].values, positions=[day_no])
            panel.set_xlim([-1, len(days) + 1])
            panel.set_xticks([])

    plt.suptitle('Comparison of traveltimes values for several days with respect to the time-of-day')
    plt.show()
# def compare_ref_bck_tt(EN=True):
# """
# Function for plotting the comparison of the referential traveltimes and the mode that is currently used in CAMEA.
#
# :param boolean EN: if the labels should be in English (otherwise Slovak)
# """
# bck = pd.read_pickle("../../data/bck_KRCE-PICE.pickle").iloc[19490:20370]
# ref = pd.read_pickle("../../data/ref_KRCE-PICE.pickle").iloc[19490:20370]
# # df = pd.merge(bck, ref,on=ColumnNamesRaw.CALC_TIME)#.iloc[71070:71900]
# r = ref.tt_real
# b = bck[[x for x in bck.columns if 'bck' in x][0]]
# d = pd.to_datetime(ref.index)
#
# plt.figure(figsize=(15, 7))
# if EN:
# plt.plot(d, r, label='Real travel times')
# plt.plot(d, b, label='Instantaneous travel times')
# plt.legend()
# plt.xlabel('Time of day [-]')
# plt.ylabel('Travel time [s]')
# else:
# plt.plot(d, r, label='Reálna dojazdová doba')
# plt.plot(d, b, label='Okamžitá dojazdová doba')
# plt.legend()
# plt.xlabel('Čas dňa [-]')
# plt.ylabel('Dojazodová doba [s]')
# plt.savefig("travel_times_comparison", bbox_inches='tight', dpi=200)
def plot_all_slices_distribution(df, feature):
    """
    Function for plotting the distribution of the features for all slices.

    Every column of 'df' whose name contains 'feature' gets its own horizontal boxplot; the panels
    are laid out in two columns and saved into 'distribution_<feature>.png'.

    :param pd.DataFrame df: DataFrame with the features; the names of its columns are expected
        to be strings with the parts separated by '-'
    :param string feature: name of the feature that should be plotted.
    :raises ValueError: if no column of 'df' contains 'feature'
    """
    import math

    list_to_plot = [x for x in df.columns if feature in x]
    if not list_to_plot:
        raise ValueError("No column of the DataFrame contains '{}'.".format(feature))

    rows = math.ceil(len(list_to_plot) / 2)
    # squeeze=False keeps the array of axes two-dimensional even when there is only one row.
    f, axarr = plt.subplots(rows, 2, figsize=(10, 13), squeeze=False)

    # ravel() walks the grid row by row - left panel first, then the right one.
    for ax, feat in zip(axarr.ravel(), list_to_plot):
        parts = feat.split('-')
        sns.boxplot(df[feat].values, orient='h', ax=ax)
        ax.set_title(parts[-1])
        ax.set_xlabel(feature + (' [$km.h^{-1}$]' if 'velocity' in parts[0] else ' [-]'))

    plt.tight_layout(pad=2.5)
    plt.savefig('distribution_{}.png'.format(feature), bbox_inches='tight', dpi=200)
if __name__ == '__main__':
    # Ad-hoc entry point: only the weekday/weekend boxplots are produced; the commented-out calls
    # below are kept as examples of how the other plots were invoked.
    # df = pd.read_pickle("../../data/feature_df_KRCE-PICE_.pickle")
    #
    # plot_all_slices_distribution(df, 'rdet_velocity')
    # plot_boxplots_week(False)
    # c_name = ColumnNamesRaw.VELOCITY
    #
    # d_in = pd.read_csv("../../data/sensors_in_november")
    # dd_in = d_in[c_name].values
    #
    # d_out = pd.read_csv("../../data/sensors_out_november")
    # dd_out = d_out[c_name].values
    #
    # df = pd.read_csv("C:\\Users\\mudroch\\Desktop\\traveltimes_repo\\trunk\\results\\KRCE-PICE\\output.csv")
    # plot_output_comparison(df)
    # c_name = ColumnNamesRaw.VELOCITY
    #
    # d_in = pd.read_csv("../../data/sensors_in_november")
    # dd_in = d_in[c_name].values
    #
    # d_out = pd.read_csv("../../data/sensors_out_november")
    # dd_out = d_out[c_name].values
    #
    # plot_boxplots(dd_in, dd_out, r'$m.s^{-1}$', 'velocity')
    #
    # plot_multiple_days("../../data/")
    # plot_time_clusters()
    # compare_ref_bck_tt()
    plot_boxplots_week()
    # plot_multiple_days('../../data/ref_KRCE-PICE.pickle', images='traveltime_days', use_subplots=True, EN=True)
    # compare_ref_bck_tt(False)
    # Y = pd.read_pickle("../../data/ref_KRCE-PICE.pickle")
    # X = pd.read_pickle("../../data/feature_df_KRCE-PICE.pickle")
    # # plot_correlation(X, Y)
    # plot_time_clusters(X, Y, pd.to_datetime(X.index.values))