# Source code for traveltimes_prediction.models.cluster_model

import collections
import collections.abc

import numpy as np
from sklearn import base

from .base_model import BaseModel
from ..support_files.helpers import convert_params, check_params, index

import logging
logger = logging.getLogger('traveltimes')


class ClusterModel(base.BaseEstimator, BaseModel):
    """
    Regression model.

    Performs clustering on the training data and trains an individual
    regressor for each cluster. At prediction time every sample is first
    assigned to a cluster by the clusterizer and then passed to the regressor
    trained for that cluster.
    """

    name = 'ClusterModel'

    def __init__(self, regressor=None, regressor_params=None, clusterizer=None, clusterizer_params=None):
        """
        Constructor.

        :param class regressor: class of regressor, e.g. SVR, RandomForestsRegressor
        :param dict regressor_params: kwargs passed to self.regressor`s constructor
        :param class clusterizer: class of clusterizer
        :param dict clusterizer_params: kwargs passed to self.clusterizer`s constructor
        """
        super(ClusterModel, self).__init__()
        self.regressor = regressor
        self.regressor_params = regressor_params
        # Filled by fit()/load(): maps str(int(cluster label)) -> fitted regressor.
        self.regressor_dict = None
        self.clusterizer_params = clusterizer_params
        self.clusterizer = clusterizer

    @convert_params
    def fit(self, X, Y):
        """
        Method for fitting the model.

        :param np.ndarray X: matrix of features - SxF
        :param np.ndarray Y: matrix of outputs - S
        :return: self
        """
        # The params default to None in the constructor; treat that as "no kwargs"
        # instead of failing on `**None`.
        clusterizer_params = self.clusterizer_params or {}
        regressor_params = self.regressor_params or {}

        # NOTE: the clusterizer *class* held in self.clusterizer is replaced by
        # the constructed instance here, so fit() cannot be called a second time
        # on the same object without re-assigning self.clusterizer first.
        self.clusterizer = self.clusterizer(**clusterizer_params)

        cluster_labels = self._clusterize(X=X)
        cluster_dict = self._attach_samples_to_cluster(X=X, Y=Y, cluster_labels=cluster_labels)

        # One freshly constructed regressor per cluster. The regressor's fit()
        # is expected to return the fitted estimator, which is what gets stored.
        self.regressor_dict = {
            str(int(cluster)): self.regressor(**regressor_params).fit(X=samples['X'], y=samples['Y'])
            for cluster, samples in cluster_dict.items()
        }
        return self

    @check_params
    @convert_params
    def predict(self, X):
        """
        Method for prediction.

        :param np.ndarray X: array of samples (matrix of feature) for which the value is predicted.
        :return: np.ndarray of predicted values; a single value when exactly one
                 prediction was produced
        """
        # `collections.Iterable` was removed in Python 3.10; the ABC lives in collections.abc.
        if not isinstance(X, collections.abc.Iterable):
            X = np.array([X])

        # A 1-D input is treated as a single sample.
        X = X.reshape(1, -1) if len(X.shape) == 1 else X
        cluster_labels = self.clusterizer.predict(X)

        known_labels = None  # sorted labels that have a regressor; built lazily, once
        result = []
        for idx, label in enumerate(cluster_labels):
            # Use the same normalised key for the membership test and the lookup
            # (labels such as 3.0 must resolve to '3', not '3.0').
            key = str(int(label))
            if key not in self.regressor_dict:
                # Sometimes the predicted cluster has no regressor. Fall back to the
                # label chosen by the `index` helper among the known ones
                # (presumably the closest one - TODO confirm against helpers.index).
                if known_labels is None:
                    known_labels = sorted(np.array(list(self.regressor_dict.keys())).astype(int))
                key = str(known_labels[index(known_labels, label)])
            result.extend(self.regressor_dict[key].predict(X[idx].reshape(1, -1)))

        # A single prediction is returned unwrapped; anything else (including an
        # empty result) is returned as an array.
        if len(result) == 1:
            return result[0]
        return np.array(result)

    def dump(self):
        """
        Method for dumping of the model.

        :return: dict - keys are names of attributes, values are their values
        """
        return {
            'model': {
                'clusterizer': self.clusterizer.dump(),
                'regressor_dict': {key: regressor.dump() for key, regressor in self.regressor_dict.items()},
            },
            'model_type': self.name,
        }

    @staticmethod
    def load(model):
        """
        Method for loading of the dumped model

        :param dict model: dict with the keys 'clusterizer' and 'regressor_dict',
                           i.e. the content stored under 'model' by dump()
        :return: obj - ClusterModel instance with restored clusterizer and regressors
        """
        inst = ClusterModel()
        # Imported inside the function, as in the original code.
        from .create_model import create_model
        inst.clusterizer = create_model(model['clusterizer'])
        inst.regressor_dict = {k: create_model(v) for k, v in model['regressor_dict'].items()}
        return inst

    def _clusterize(self, X):
        """
        Method - finds the clusters in data.

        :param np.ndarray X: matrix of features -> SxF
        :return: np.ndarray - array of labels for each input sample
        """
        self.clusterizer.fit(X=X)
        return self.clusterizer.labels_

    def _attach_samples_to_cluster(self, X, Y, cluster_labels):
        """
        Method that partition the input data according to their clusters.

        :param np.ndarray X: matrix of features -> SxF
        :param np.ndarray Y: array of output values -> S
        :param list cluster_labels: list of clusters` labels for each sample -> S
        :return: dict - keys are clusters` labels (int), values are dicts with the
                 keys 'X' and 'Y' holding the samples belonging to given cluster
        """
        # Comparing a plain list with a scalar yields a single bool, not a mask,
        # so make sure the labels are an array before building the masks.
        cluster_labels = np.asarray(cluster_labels)

        cluster_dict = {}
        for label in np.unique(cluster_labels):
            mask = cluster_labels == label
            cluster_dict[int(label)] = {'X': X[mask, :], 'Y': Y[mask]}
        return cluster_dict