Source code for test_suite.test_framework

import logging
import pickle
from random import randint
import traceback
import time

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import RandomizedLasso, Ridge, LinearRegression
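# NOTE: RandomizedLasso was deprecated in scikit-learn 0.19 and removed in 0.21, so this
# module assumes an older scikit-learn release.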
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVR

from test_suite.benchmarks import criteria
from traveltimes_prediction.models.base_model import BaseModel
from traveltimes_prediction.support_files.helpers import impute
from traveltimes_prediction.data_processing import DataProcessor
from traveltimes_prediction.support_files.helpers import chunkify, merge_inner_lists
from traveltimes_prediction.support_files import ColumnNames

logger = logging.getLogger('traveltimes')

param_grid = [
    {'C': list(range(1, 1000, 350)), 'epsilon': np.linspace(1e-9, 1e-1, 4).tolist(), 'kernel': ['linear']},
    # {'C': list(range(1, 1000, 200)), 'epsilon': np.linspace(1e-9, 1e-1, 9).tolist(),
    #  'gamma': np.linspace(1e-6, 1e+6, 12).tolist(), 'kernel': ['rbf']},
]
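# The grid above is consumed by TestFramework.regressor_evaluation_sklearn() below; the
# 'C'/'epsilon'/'kernel' keys match the constructor parameters of sklearn's SVR.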


class TestFramework:
    """ Class for testing of the system, models and features. """

    def __init__(self, X_path=None, Y_path=None):
        """
        Constructor.

        :param string X_path: Path to the file with features (alternatively a pandas.DataFrame).
        :param string Y_path: Path to the file with ground truth (alternatively a pandas.DataFrame or Series).
        """
        if X_path is None or Y_path is None:
            raise ValueError("Please specify the files to be loaded!")
        self.X = None
        self.Y = None
        self.Y_bck = None
        # Load from file, or take the passed variables directly
        if isinstance(X_path, str) and isinstance(Y_path, str):
            with open(X_path, 'rb') as f:
                self.X = pickle.load(f)
            with open(Y_path, 'rb') as f:
                self.Y = pickle.load(f)
        elif isinstance(X_path, pd.DataFrame) and isinstance(Y_path, (pd.DataFrame, pd.Series)):
            self.X = X_path
            self.Y = Y_path
        _bck_tt_column = [x for x in self.X.columns if ColumnNames.FEAT_TT_BCK in x]
        self.Y_bck = self.X[_bck_tt_column[0]] if _bck_tt_column else None
        if self.Y_bck is not None:
            self.X, self.Y, self.Y_bck = DataProcessor.align_training_dataset(self.X, self.Y, self.Y_bck)
        else:
            self.X, self.Y = DataProcessor.align_training_dataset(self.X, self.Y, self.Y_bck)
        self.time_df = pd.DataFrame(list(self.X.index), columns=['Time'])
        # feature_list = [
        #     ColumnNames.FEAT_RDET_VELOCITY,
        #     ColumnNames.FEAT_RDET_COUNT,
        #     ColumnNames.FEAT_RDET_OCCUPANCY,
        #     ColumnNames.FEAT_TT_BCK,
        #     ColumnNames.FEAT_DAY, ColumnNames.FEAT_TIME,
        #     ColumnNames.FEAT_WEEKEND, ColumnNames.FEAT_MONDAY, ColumnNames.FEAT_TUESDAY,
        #     ColumnNames.FEAT_WEDNESDAY, ColumnNames.FEAT_THURSDAY, ColumnNames.FEAT_FRIDAY,
        #     ColumnNames.FEAT_TIME_BIN
        # ]
        # l = self.select_features(feature_list)
        # self.X = self.X[l]
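    # Usage sketch (hypothetical paths; the pickles must contain a feature DataFrame and a
    # ground-truth DataFrame/Series, e.g. as written by convert_func() below):
    #   tf = TestFramework(X_path='data/feature_df_<section>.pickle',
    #                      Y_path='data/ref_<section>.pickle')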
    def select_features(self, list_of_features):
        """
        Method for selection of the features.

        :param list list_of_features: feature-name fragments to match against the column names
        :return: list
        """
        cols = list(self.X.columns)
        selected_cols = []
        for c in cols:
            for f in list_of_features:
                if f in c:
                    selected_cols.append(c)
                    break  # do not append the same column twice if several fragments match
        return selected_cols
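    # Example: select_features([ColumnNames.FEAT_RDET_VELOCITY, ColumnNames.FEAT_TIME_BIN])
    # returns every column whose name contains one of the fragments; the result can be used
    # to restrict the dataset, e.g. self.X = self.X[self.select_features(feature_list)]
    # (cf. the commented-out block in __init__).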
    def get_subset(self, portion, cv_ratio, bck=False, valid_ratio=0.5):
        """
        Method for retrieving a subset of the data.

        :param float portion: Portion of the data to retrieve (the most recent data are always
            included, the oldest are dropped first).
        :param float cv_ratio: Cross-validation ratio - how much of the subset is used for training.
        :param bool bck: If True, also return the predictions made by the currently used method.
        :param float valid_ratio: How the non-training remainder is split between test and validation.
        :return: tuple
        """
        size_of_dataset = len(self.X.index)
        to_use = int(portion * size_of_dataset)
        train_idx = (size_of_dataset - to_use, size_of_dataset - to_use + int(to_use * cv_ratio))
        test_idx = (train_idx[1], train_idx[1] + int(to_use * (1 - cv_ratio) * valid_ratio))
        valid_idx = (test_idx[1], size_of_dataset)
        train_X = self.X.iloc[train_idx[0]:train_idx[1]]
        train_Y = self.Y.iloc[train_idx[0]:train_idx[1]]
        test_X = self.X.iloc[test_idx[0]:test_idx[1]]
        test_Y = self.Y.iloc[test_idx[0]:test_idx[1]]
        valid_X = self.X.iloc[valid_idx[0]:valid_idx[1]]
        valid_Y = self.Y.iloc[valid_idx[0]:valid_idx[1]]
        test_time = self.time_df.iloc[test_idx[0]:test_idx[1]]
        train_time = self.time_df.iloc[train_idx[0]:train_idx[1]]
        valid_time = self.time_df.iloc[valid_idx[0]:valid_idx[1]]  # currently not returned
        if bck:
            return (train_X, train_Y, test_X, test_Y,
                    self.Y_bck.iloc[test_idx[0]:test_idx[1]] if self.Y_bck is not None else None,
                    test_time, train_time, valid_X, valid_Y)
        return train_X, train_Y, test_X, test_Y, test_time, train_time
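    # Index arithmetic example: for 1000 rows, portion=1.0, cv_ratio=0.8, valid_ratio=0.5:
    #   train_idx = (0, 800), test_idx = (800, 900), valid_idx = (900, 1000),
    # i.e. 80 % training, 10 % test and 10 % validation, in chronological order.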
    def regressor_evaluation_sklearn(self, algorithm, param_grid, loss_function, X, Y):
        """
        Method using the grid search over hyperparameters as implemented in sklearn.

        :param class algorithm: prediction algorithm (model)
        :param list param_grid: list of dicts - params to try
        :param function loss_function: loss function to use
        :param numpy.ndarray X: array of features
        :param numpy.ndarray Y: array of ground truth
        :return: dict - the best classifier's hyperparameters
        """
        clf = GridSearchCV(estimator=algorithm, param_grid=param_grid, cv=5, scoring=loss_function,
                           n_jobs=2, verbose=50)
        clf.fit(X, Y)
        return clf.best_params_
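    # Usage sketch (assumes an sklearn-style scoring string; SVR and param_grid are defined
    # at module level):
    #   best_params = tf.regressor_evaluation_sklearn(SVR(), param_grid,
    #                                                 'neg_mean_absolute_error',
    #                                                 tf.X.values, tf.Y.values)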
    def _regressor_evaluation(self, algorithm, param_grid, loss_function, X_train, Y_train, X_test, Y_test,
                              Y_bck, X_valid, Y_valid):
        """
        Method for grid search - self-implemented.

        :param BaseModel algorithm:
        :param list param_grid: list of dicts - hyperparams to try
        :param function loss_function:
        :param pd.DataFrame X_train:
        :param pd.DataFrame Y_train:
        :param pd.DataFrame X_test:
        :param pd.Series Y_test:
        :param pd.Series Y_bck: predictions done by the currently used method, for comparison
        :param pd.DataFrame X_valid:
        :param pd.Series Y_valid:
        :return: list of dicts - combinations of hyperparameters and their scores
        """
        results = []
        for params in param_grid:
            try:
                clf = algorithm(**params)
                clf.fit(X=X_train, Y=Y_train)
                Y_est = clf.predict(X_test)
                loss = loss_function(Y_test, Y_est)
                mae = criteria.mean_absolute_error(Y_test, Y_est)
                mape = criteria.mean_absolute_percentage_error(Y_test, Y_est)
                rmsle = criteria.root_mean_squared_logarithmic_error(Y_test, Y_est)
                Y_est2 = clf.predict(X_valid)
                loss2 = loss_function(Y_valid, Y_est2)
                print(".")  # progress marker
                results.append({'params': params, 'loss': loss, 'valid_loss': loss2, 'mae': mae,
                                'mape': mape, 'rmsle': rmsle, 'model': algorithm.name})
            except Exception:
                traceback.print_exc()
                print("#")  # failed combination
        return results
    def regressor_evaluation_parallel(self, algorithm, param_grid, loss_function=criteria.camea_custom_error_v2,
                                      parallel=True):
        """ Proxy for _regressor_evaluation(); optionally runs in parallel and returns the results
        as a pandas.DataFrame. """
        X_train, Y_train, X_test, Y_test, Y_bck, _, _, X_valid, Y_valid = \
            self.get_subset(portion=1, cv_ratio=0.8, bck=True)
        if parallel:
            results = Parallel(n_jobs=-1, verbose=50)(
                delayed(self._regressor_evaluation)(algorithm, p_grid, loss_function, X_train, Y_train,
                                                    X_test, Y_test, Y_bck, X_valid, Y_valid)
                for p_grid in chunkify(param_grid, 20))
        else:
            results = []
            for p_grid in param_grid:
                rr = self._regressor_evaluation(algorithm, [p_grid], loss_function, X_train, Y_train,
                                                X_test, Y_test, Y_bck, X_valid, Y_valid)
                results.append(rr)
        df = pd.DataFrame(merge_inner_lists(results))
        return df
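    # Usage sketch (hypothetical model class; unlike the sklearn grid, param_grid here is a
    # flat list of complete kwargs dicts, since each dict is passed to algorithm(**params)):
    #   grid = [{'C': c, 'epsilon': e, 'kernel': 'linear'} for c in range(1, 1000, 350)
    #           for e in np.linspace(1e-9, 1e-1, 4)]
    #   df = tf.regressor_evaluation_parallel(SomeModel, grid)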
    def features_evaluation(self, X, Y):
        """
        Function for measuring the importance of the features. Uses RandomizedLasso for a stability
        measure, RidgeRegression, RandomForests and Recursive Feature Elimination with LinearRegression.

        :param pandas.DataFrame X: DataFrame with features
        :param pandas.DataFrame Y: DataFrame with results (travel times)
        :return: pandas.DataFrame
        """
        if X is None:
            X = self.X
        if Y is None:
            Y = self.Y
        names = list(X.columns)
        X = X.values
        Y = Y.values
        ranks = {}

        def rank_to_dict(ranks, names, order=1):
            minmax = MinMaxScaler()
            ranks = minmax.fit_transform(order * np.array([ranks]).T).T[0]
            ranks = map(lambda x: round(x, 2), ranks)
            return dict(zip(names, ranks))

        ridge = Ridge(alpha=7)
        ridge.fit(X, Y)
        ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)
        print("Ridge done")
        rlasso = RandomizedLasso(alpha=0.04)
        rlasso.fit(X, Y)
        ranks["RLasso-Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)
        print("Lasso done")
        # stop the search when 5 features are left (they will get equal scores)
        rfe = RFE(LinearRegression(), n_features_to_select=5)
        rfe.fit(X, Y)
        ranks["RFE"] = rank_to_dict(list(map(float, rfe.ranking_)), names, order=-1)
        print("RFE done")
        rf = RandomForestRegressor()
        rf.fit(X, Y)
        ranks["RF"] = rank_to_dict(rf.feature_importances_, names)
        print("RF done")
        r = {}
        for name in names:
            r[name] = round(np.mean([ranks[method][name] for method in ranks.keys()]), 2)
        methods = sorted(ranks.keys())
        ranks["Mean"] = r
        methods.append("Mean")
        # print("{:>15}{}".format("", "".join(["{:>15}".format(m) for m in methods])))
        # for name in names:
        #     print("{:<15}{}".format(name, "".join(map(lambda x: "{:>15}".format(str(x)),
        #                                               [ranks[method][name] for method in methods]))))
        df = pd.DataFrame.from_dict(ranks)
        df = df.sort_values(by='Mean', ascending=False)
        return df
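    # The returned DataFrame has one row per feature and one column per ranking method
    # ('Ridge', 'RLasso-Stability', 'RFE', 'RF') plus their 'Mean', sorted by 'Mean' in
    # descending order, so the top rows are the most informative features.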
    def cluster_dataset(self, algorithm, params, cv_ratio, portion):
        """
        Method for clustering of the dataset.

        :param class algorithm: algorithm to use
        :param dict params: hyperparameters for the algorithm
        :param float cv_ratio: should be 1.0, so that the whole selected portion is clustered
        :param float portion: portion of the data to take; the most recent data are always included.
        :return: tuple - np.arrays x3 - estimated labels, X, Y
        """
        train_X, train_Y, _, _, _, _ = self.get_subset(portion=portion, cv_ratio=cv_ratio, bck=False)
        _scaler = StandardScaler()
        _scaler.fit(X=train_X, y=train_Y)
        train_X = _scaler.transform(train_X)
        model = algorithm(**params)
        model.fit(train_X, train_Y)
        estimated_labels = model.labels_
        return estimated_labels, train_X, train_Y
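    # Usage sketch (hypothetical choice of algorithm; any estimator exposing labels_ after
    # fit() works, e.g. sklearn.cluster.KMeans):
    #   labels, X_scaled, Y = tf.cluster_dataset(KMeans, {'n_clusters': 4},
    #                                            cv_ratio=1.0, portion=1.0)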
    def train_predict_classifier(self, algorithm, params, cv_ratio, portion, valid_ratio=0.5):
        """
        Method for training and predicting using a classifier.

        :param class algorithm: classifier
        :param dict params: hyperparameters of the classifier
        :param float cv_ratio: validation setting
        :param float portion: portion of the data to take; the most recent data are always included.
        :param float valid_ratio: how the non-training remainder is split between test and validation
        :return: tuple - bck tt, prediction of tt using 'algorithm' with 'params', true value of tt,
            time, dict with fit/predict durations
        """
        train_X, train_Y, test_X, test_Y, Y_bck, test_time, _, _, _ = self.get_subset(
            portion=portion, cv_ratio=cv_ratio, bck=True, valid_ratio=valid_ratio)
        model = algorithm(**params)
        t1 = time.time()
        model.fit(train_X, train_Y)
        t2 = time.time()
        # Optional filtering of the test set to working days between 5:00 and 21:00:
        # tmp = test_X.time_of_day.values
        # a = list(map(lambda x: 5 < x < 21, tmp))
        # tmp2 = test_X.day_of_week.values
        # aa = list(map(lambda x: x < 6, tmp2))
        # a = aa and a
        # test_X = test_X.iloc[a]
        estimated_Y = model.predict(test_X)
        t3 = time.time()
        # return Y_bck.iloc[a], estimated_Y, test_Y.iloc[a], test_time.iloc[a], {'fit': t2 - t1, 'predict': t3 - t2}
        return Y_bck, estimated_Y, test_Y, test_time, {'fit': t2 - t1, 'predict': t3 - t2}
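    # Usage sketch (hypothetical params; the algorithm is any BaseModel-style class
    # instantiated as algorithm(**params)):
    #   bck, est, ref, t, timings = tf.train_predict_classifier(SomeModel, {},
    #                                                           cv_ratio=0.8, portion=1.0)
    #   print(timings)  # {'fit': ..., 'predict': ...}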
    @staticmethod
    def convert_func(XY_data_path, section):
        with open(XY_data_path, 'rb') as f:
            XY_data = pickle.load(f)
        X, Y = XY_data['data']
        # Find the backup travel-time column by substring match, consistently with __init__
        _bck_cols = [x for x in X.columns if ColumnNames.FEAT_TT_BCK in x]
        Y_bck = pd.DataFrame(X[_bck_cols[0]]) if _bck_cols else None
        Y = pd.DataFrame(Y)
        X.to_pickle('data/feature_df_{}.pickle'.format(section))
        if Y_bck is not None:
            Y_bck.to_pickle('data/bck_{}.pickle'.format(section))
        Y.to_pickle('data/ref_{}.pickle'.format(section))
        return X, Y, Y_bck
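    # Usage sketch (hypothetical pickle; it must hold a dict with a 'data' key containing
    # the (X, Y) tuple). The written files match the paths expected by the constructor:
    #   X, Y, Y_bck = TestFramework.convert_func('data/XY_raw.pickle', 'section01')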
    def est_better_than_bck(self, bck, est, ref):
        """
        Function for comparison of the quality of predictions - the currently used model (bck)
        versus the value estimated using ML.

        :param pd.Series bck:
        :param pd.Series est:
        :param pd.Series ref:
        :return: boolean - True if the error of the estimated travel times is lower.
        """
        if bck is None:
            return False
        _ref = ref.interpolate().values.astype(float)
        _bck = bck.interpolate().values.astype(float)
        _est = impute(array=est.astype(float), columns=[0], invalid_val=-1)[0].reshape(1, -1)[0]
        _ref[np.isnan(_ref)] = 0
        _bck[np.isnan(_bck)] = 0
        err_bck = criteria.camea_custom_error_v2(_ref, _bck)
        err_est = criteria.camea_custom_error_v2(_ref, _est)
        return err_bck > err_est
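    # Note: both series are scored against the interpolated reference via
    # criteria.camea_custom_error_v2; invalid estimates (-1) are imputed first, and any
    # remaining NaNs in the reference/backup are zeroed before scoring.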