Source code for traveltimes_prediction.models.cluster_model
import collections
import collections.abc
import logging

import numpy as np
from sklearn import base

from .base_model import BaseModel
from ..support_files.helpers import convert_params, check_params, index
# Module-level logger obtained under the fixed name 'traveltimes'.
logger = logging.getLogger('traveltimes')
class ClusterModel(base.BaseEstimator, BaseModel):
    """
    Regression model.

    Perform clustering on training data and train individual regressor for each cluster.
    At prediction time every sample is routed to the regressor that belongs to the
    cluster label the fitted clusterizer assigns to it.
    """

    name = 'ClusterModel'

    def __init__(self, regressor=None, regressor_params=None, clusterizer=None, clusterizer_params=None):
        """
        Constructor.

        :param class regressor: class of regressor, e.g. SVR, RandomForestsRegressor
        :param dict regressor_params: kwargs passed to self.regressor`s constructor
        :param class clusterizer: class of clusterizer
        :param dict clusterizer_params: kwargs passed to self.clusterizer`s constructor
        """
        super(ClusterModel, self).__init__()
        self.regressor = regressor
        self.regressor_params = regressor_params
        # Filled by fit()/load(): maps str(int(cluster label)) -> fitted regressor.
        self.regressor_dict = None
        self.clusterizer_params = clusterizer_params
        self.clusterizer = clusterizer

    @convert_params
    def fit(self, X, Y):
        """
        Method for fitting the model.

        :param np.ndarray X: matrix of features - SxF
        :param np.ndarray Y: matrix of outputs - S
        :return: self
        """
        # self.clusterizer starts out as a class (factory) and is replaced by
        # the created instance. Only instantiate while it is still callable so
        # that calling fit() a second time refits the existing instance instead
        # of failing on an attempt to call it.
        if callable(self.clusterizer):
            self.clusterizer = self.clusterizer(**(self.clusterizer_params or {}))

        cluster_labels = self._clusterize(X=X)
        cluster_dict = self._attach_samples_to_cluster(X=X, Y=Y, cluster_labels=cluster_labels)

        # The constructor defaults the params to None; treat that as "no kwargs".
        regressor_kwargs = self.regressor_params or {}
        # Relies on the regressor's fit() returning the fitted regressor itself.
        self.regressor_dict = {
            str(int(cluster)): self.regressor(**regressor_kwargs).fit(X=samples['X'], y=samples['Y'])
            for cluster, samples in cluster_dict.items()
        }
        return self

    @check_params
    @convert_params
    def predict(self, X):
        """
        Method for prediction.

        :param np.ndarray X: array of samples (matrix of feature) for which the value is predicted.
        :return: np.ndarray of predicted values when more than one value is produced,
                 otherwise the single predicted value itself
        """
        # collections.Iterable was removed in Python 3.10; the ABC lives in collections.abc.
        if not isinstance(X, collections.abc.Iterable):
            X = np.array([X])
        # A 1-D input is treated as one sample with many features.
        if len(X.shape) == 1:
            X = X.reshape(1, -1)

        cluster_labels = self.clusterizer.predict(X)
        result = []
        for idx, c in enumerate(cluster_labels):
            # Normalise the label once so that the membership test and the
            # lookup always use the same key (e.g. 1.0 -> '1', never '1.0').
            key = str(int(c))
            # Sometimes it happens that the 'c' is not in regressor_dict.
            if key not in self.regressor_dict:
                cs = sorted(np.array(list(self.regressor_dict.keys())).astype(int))
                # NOTE(review): `index` is a project helper; presumably it returns
                # the position of the known label closest to `c` - confirm.
                key = str(int(cs[index(cs, c)]))
            result.extend(self.regressor_dict[key].predict(X[idx].reshape(1, -1)))
        return np.array(result) if len(result) > 1 else result[0]

    def dump(self):
        """
        Method for dumping of the model.

        :return: dict - 'model' holds the dumped clusterizer and the dumped regressors
                 (keyed by cluster label), 'model_type' holds the name of this class of model
        """
        return {
            'model': {
                'clusterizer': self.clusterizer.dump(),
                'regressor_dict': {
                    label: regressor.dump() for label, regressor in self.regressor_dict.items()
                },
            },
            'model_type': self.name,
        }

    @staticmethod
    def load(model):
        """
        Method for loading of the dumped model

        :param dict model: dict with the keys 'clusterizer' and 'regressor_dict'
                           (the same keys dump() stores under its 'model' entry)
        :return: obj - ClusterModel with restored clusterizer and regressors
        """
        inst = ClusterModel()
        # Imported here rather than at module level - presumably to avoid a
        # circular import with create_model; confirm before moving it.
        from .create_model import create_model
        inst.clusterizer = create_model(model['clusterizer'])
        inst.regressor_dict = {k: create_model(v) for k, v in model['regressor_dict'].items()}
        return inst

    def _clusterize(self, X):
        """
        Method - finds the clusters in data.

        :param np.ndarray X: matrix of features -> SxF
        :return: np.ndarray - array of labels for each input sample
        """
        self.clusterizer.fit(X=X)
        return self.clusterizer.labels_

    def _attach_samples_to_cluster(self, X, Y, cluster_labels):
        """
        Method that partition the input data according to their clusters.

        :param np.ndarray X: matrix of features -> SxF
        :param np.ndarray Y: array of output values -> S
        :param list cluster_labels: list of clusters` labels for each sample -> S
        :return: dict - keys are clusters` labels (int), values are dicts with the
                 keys 'X' and 'Y' holding the samples belonging to given cluster
        """
        # A plain list compared with a scalar yields a single bool, not a mask;
        # coerce to an array so the element-wise comparison below is guaranteed.
        cluster_labels = np.asarray(cluster_labels)
        cluster_dict = {}
        for label in np.unique(cluster_labels):
            mask = cluster_labels == label
            cluster_dict[int(label)] = {'X': X[mask, :], 'Y': Y[mask]}
        return cluster_dict