Skip to content

Commit

Permalink
Merge pull request #3814 from PrimozGodec/clustering
Browse files Browse the repository at this point in the history
[ENH] Unified clustering API
  • Loading branch information
lanzagar authored Jun 21, 2019
2 parents a95a5a2 + 2d6b629 commit 6f3aef2
Show file tree
Hide file tree
Showing 14 changed files with 593 additions and 328 deletions.
1 change: 1 addition & 0 deletions Orange/clustering/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
from .dbscan import *
from .hierarchical import *
from .kmeans import *
from .louvain import *
100 changes: 100 additions & 0 deletions Orange/clustering/clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import numpy as np
import scipy.sparse

from Orange.data import Table, Instance
from Orange.data.table import DomainTransformationError
from Orange.misc.wrapper_meta import WrapperMeta
from Orange.preprocess import Continuize, SklImpute


class ClusteringModel:
    """
    Fitted clustering model wrapping a trained projector.

    Calling the model with data normalizes the input and returns the
    result of :meth:`predict`. Subclasses enable prediction by overriding
    :meth:`predict`; the base implementation always raises.

    Attributes
    ----------
    projector
        The fitted object passed to the constructor.
    domain
        Domain that ``Table`` input is transformed to before predicting.
        ``None`` until assigned (``Clustering.get_model`` sets it).
    original_domain
        Domain used to build a ``Table`` from list input and as an
        intermediate transformation step for foreign tables. ``None``
        until assigned (``Clustering.get_model`` sets it).
    labels
        Taken from ``projector.labels_`` at construction time.
    """

    def __init__(self, projector):
        self.projector = projector
        self.domain = None
        self.original_domain = None
        self.labels = projector.labels_

    def __call__(self, data):
        """
        Predict for `data`, accepting several input representations.

        Parameters
        ----------
        data : np.ndarray, scipy CSR/CSC matrix, Table, Instance, list, tuple
            A 1-d array, a single ``Instance`` or a flat list/tuple is
            treated as one row; the first element of the prediction is
            returned in that case.

        Returns
        -------
        The value returned by :meth:`predict`, or its first element for
        single-row input.

        Raises
        ------
        TypeError
            If `data` is of none of the supported types.
        DomainTransformationError
            If transforming a table to the original domain leaves only
            undefined values.
        """
        one_d = False
        if isinstance(data, np.ndarray):
            one_d = data.ndim == 1
            prediction = self.predict(np.atleast_2d(data))
        elif isinstance(data, (scipy.sparse.csr_matrix,
                               scipy.sparse.csc_matrix)):
            # Use the public class names: the scipy.sparse.csr / .csc
            # submodules are private, deprecated namespaces.
            prediction = self.predict(data)
        elif isinstance(data, (Table, Instance)):
            if isinstance(data, Instance):
                data = Table(data.domain, [data])
                one_d = True
            if data.domain != self.domain:
                # Go through the original domain first when the attributes
                # differ and the table holds at least one defined value.
                if self.original_domain.attributes != data.domain.attributes \
                        and data.X.size \
                        and not np.isnan(data.X).all():
                    data = data.transform(self.original_domain)
                    if np.isnan(data.X).all():
                        raise DomainTransformationError(
                            "domain transformation produced no defined values")
                data = data.transform(self.domain)
            prediction = self.predict(data.X)
        elif isinstance(data, (list, tuple)):
            if not isinstance(data[0], (list, tuple)):
                # A flat sequence is a single row.
                data = [data]
                one_d = True
            data = Table.from_list(self.original_domain, data)
            data = data.transform(self.domain)
            prediction = self.predict(data.X)
        else:
            raise TypeError("Unrecognized argument (instance of '{}')"
                            .format(type(data).__name__))

        return prediction[0] if one_d else prediction

    def predict(self, X):
        """
        Predict for the rows of `X`; must be overridden to be usable.

        Raises
        ------
        NotImplementedError
            Always, in this base implementation.
        """
        raise NotImplementedError(
            "This clustering algorithm does not support predicting.")


class Clustering(metaclass=WrapperMeta):
    """
    ${skldoc}
    Additional Orange parameters
    preprocessors : list, optional (default = [Continuize(), SklImpute()])
        An ordered list of preprocessors applied to data before
        training or testing.
    """
    # Estimator class instantiated in `fit`; subclasses set this.
    __wraps__ = None
    # Model class wrapped around the fitted estimator.
    __returns__ = ClusteringModel
    # Default preprocessors, used when none are given to the constructor.
    preprocessors = [Continuize(), SklImpute()]

    def __init__(self, preprocessors, parameters):
        """
        Store preprocessors and the estimator's keyword arguments.

        `parameters` is a name-to-value mapping (subclasses pass `vars()`);
        entries that are not estimator arguments are dropped.
        """
        excluded = ["self", "preprocessors", "__class__"]
        self.preprocessors = tuple(preprocessors or self.preprocessors)
        self.params = {
            name: value
            for name, value in parameters.items()
            if name not in excluded
        }

    def __call__(self, data):
        """Fit a model on `data` and return the model's `labels`."""
        model = self.get_model(data)
        return model.labels

    def get_model(self, data):
        """
        Preprocess `data`, fit a model and attach both domains to it.

        The model's `original_domain` is the domain of `data` as given and
        its `domain` is the domain of the preprocessed data.
        """
        domain_before = data.domain
        preprocessed = self.preprocess(data)
        fitted = self.fit_storage(preprocessed)
        fitted.domain = preprocessed.domain
        fitted.original_domain = domain_before
        return fitted

    def fit_storage(self, data):
        """Fit on the `X` attribute of `data` (only data tables are handled)."""
        return self.fit(data.X)

    def fit(self, X: np.ndarray, y: np.ndarray = None):
        """
        Instantiate the wrapped estimator with the stored parameters, fit
        it on `X` and return it wrapped in `__returns__`. `y` is not used.
        """
        estimator = self.__wraps__(**self.params)
        return self.__returns__(estimator.fit(X))

    def preprocess(self, data):
        """Apply the preprocessors to `data` in order; return the result."""
        for preprocessor in self.preprocessors:
            data = preprocessor(data)
        return data
60 changes: 15 additions & 45 deletions Orange/clustering/dbscan.py
Original file line number Diff line number Diff line change
@@ -1,52 +1,22 @@
import sklearn.cluster as skl_cluster
from numpy import ndarray, unique
import sklearn.cluster

from Orange.data import Table, DiscreteVariable, Domain, Instance
from Orange.projection import SklProjector, Projection
from Orange.clustering.clustering import Clustering
from Orange.data import Table


__all__ = ["DBSCAN"]

class DBSCAN(SklProjector):
__wraps__ = skl_cluster.DBSCAN

class DBSCAN(Clustering):

__wraps__ = sklearn.cluster.DBSCAN

def __init__(self, eps=0.5, min_samples=5, metric='euclidean',
algorithm='auto', leaf_size=30, p=None,
preprocessors=None):
super().__init__(preprocessors=preprocessors)
self.params = vars()

def fit(self, X, Y=None):
proj = skl_cluster.DBSCAN(**self.params)
self.X = X
if isinstance(X, Table):
proj = proj.fit(X.X,)
else:
proj = proj.fit(X, )
return DBSCANModel(proj)


class DBSCANModel(Projection):
def __init__(self, proj):
super().__init__(proj=proj)

def __call__(self, data):
if isinstance(data, ndarray):
return self.proj.fit_predict(data).reshape((len(data), 1))

if isinstance(data, Table):
if data.domain is not self.pre_domain:
data = data.transform(self.pre_domain)
y = self.proj.fit_predict(data.X)
vals, indices = unique(y, return_inverse=True)
c = DiscreteVariable(name='Core sample index',
values=[str(v) for v in vals])
domain = Domain([c])
return Table(domain, indices.reshape(len(y), 1))

elif isinstance(data, Instance):
if data.domain is not self.pre_domain:
data = Instance(self.pre_domain, data)
# Instances-by-Instance classification is not defined;
raise Exception("Core sample assignment is not supported "
"for single instances.")
algorithm='auto', leaf_size=30, p=None, preprocessors=None):
super().__init__(preprocessors, vars())


if __name__ == "__main__":
    # Demo: run DBSCAN on the iris data set with the default preprocessors
    # (passing None falls back to the class-level defaults).
    iris = Table("iris")
    dbscan = DBSCAN(preprocessors=None)
    clusters = dbscan(iris)
95 changes: 34 additions & 61 deletions Orange/clustering/kmeans.py
Original file line number Diff line number Diff line change
@@ -1,72 +1,45 @@
import numpy as np
import sklearn.cluster as skl_cluster
from sklearn.metrics import silhouette_samples, silhouette_score
import warnings

from Orange.data import Table, DiscreteVariable, Domain, Instance
from Orange.projection import SklProjector, Projection
from Orange.distance import Euclidean
import sklearn.cluster

from Orange.clustering.clustering import Clustering, ClusteringModel
from Orange.data import Table


__all__ = ["KMeans"]

SILHOUETTE_MAX_SAMPLES = 5000

class KMeans(SklProjector):
__wraps__ = skl_cluster.KMeans
class KMeansModel(ClusteringModel):

def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300,
tol=0.0001, random_state=None, preprocessors=None,
compute_silhouette_score=False):
super().__init__(preprocessors=preprocessors)
self.params = vars()
self._compute_silhouette = compute_silhouette_score
def __init__(self, projector):
super().__init__(projector)
self.centroids = projector.cluster_centers_
self.k = projector.get_params()["n_clusters"]

def fit(self, X, Y=None):
proj = skl_cluster.KMeans(**self.params)
proj = proj.fit(X, Y)
proj.silhouette = np.nan
try:
if self._compute_silhouette and 2 <= proj.n_clusters < X.shape[0]:
if X.shape[0] <= SILHOUETTE_MAX_SAMPLES:
proj.silhouette_samples = \
silhouette_samples(X, proj.labels_)
proj.silhouette = np.mean(proj.silhouette_samples)
else:
proj.silhouette_samples = None
proj.silhouette = \
silhouette_score(X, proj.labels_, sample_size=SILHOUETTE_MAX_SAMPLES)
except MemoryError: # Pairwise dist in silhouette fails for large data
pass
proj.inertia = proj.inertia_ / X.shape[0]
cluster_dist = Euclidean(proj.cluster_centers_)
proj.inter_cluster = np.mean(cluster_dist[np.triu_indices_from(cluster_dist, 1)])
return KMeansModel(proj, self.preprocessors)
def predict(self, X):
return self.projector.predict(X)


class KMeansModel(Projection):
def __init__(self, proj, preprocessors=None):
super().__init__(proj=proj)
self.k = self.proj.get_params()["n_clusters"]
self.centroids = self.proj.cluster_centers_
class KMeans(Clustering):

def __call__(self, data):
if isinstance(data, Table):
if data.domain is not self.pre_domain:
data = data.transform(self.pre_domain)
c = DiscreteVariable(name='Cluster id',
values=[str(i) for i in range(self.k)])
domain = Domain([c])
return Table(
domain,
self.proj.predict(data.X).astype(int).reshape((len(data), 1)))
elif isinstance(data, Instance):
if data.domain is not self.pre_domain:
data = Instance(self.pre_domain, data)
c = DiscreteVariable(name='Cluster id',
values=[str(i) for i in range(self.k)])
domain = Domain([c])
return Table(
domain,
np.atleast_2d(self.proj.predict(data._x.reshape(1, -1))).astype(int))
else:
return self.proj.predict(data).reshape((data.shape[0], 1))
__wraps__ = sklearn.cluster.KMeans
__returns__ = KMeansModel

def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300,
tol=0.0001, random_state=None, preprocessors=None,
compute_silhouette_score=None):
if compute_silhouette_score is not None:
warnings.warn(
"compute_silhouette_score is deprecated. Please use "
"sklearn.metrics.silhouette_score to compute silhouettes.",
DeprecationWarning)
super().__init__(
preprocessors, {k: v for k, v in vars().items()
if k != "compute_silhouette_score"})


if __name__ == "__main__":
    # Demo: cluster the iris data set into three groups, then fit a model
    # directly on the table.
    iris = Table("iris")
    kmeans = KMeans(preprocessors=None, n_clusters=3)
    clusters = kmeans(iris)
    model = kmeans.fit_storage(iris)
Loading

0 comments on commit 6f3aef2

Please sign in to comment.