Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Unified clustering API #3814

Merged
merged 5 commits into from
Jun 21, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Orange/clustering/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
from .dbscan import *
from .hierarchical import *
from .kmeans import *
from .louvain import *
PrimozGodec marked this conversation as resolved.
Show resolved Hide resolved
100 changes: 100 additions & 0 deletions Orange/clustering/clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import numpy as np
import scipy.sparse

from Orange.data import Table, Instance
from Orange.data.table import DomainTransformationError
from Orange.misc.wrapper_meta import WrapperMeta
from Orange.preprocess import Continuize, SklImpute


class ClusteringModel:
    """Base class for fitted clustering models.

    Stores the fitted ``projector`` and exposes its ``labels_`` attribute as
    ``labels``. ``domain`` and ``original_domain`` are initialised to ``None``
    here and must be assigned before Table, Instance or list input is passed
    to the model, because those code paths transform data into these domains.
    """

    def __init__(self, projector):
        self.projector = projector
        self.domain = None
        self.original_domain = None
        self.labels = projector.labels_

    def __call__(self, data):
        """Predict clusters for ``data``.

        Accepted inputs are a numpy array (1-d or 2-d), a scipy CSR/CSC
        matrix, a ``Table``, a single ``Instance``, or a list/tuple of rows
        (or one row given as a flat list/tuple).

        Returns the result of :meth:`predict`; when the input was a single
        row (1-d array, ``Instance`` or flat list/tuple), only the first
        element of the prediction is returned.

        Raises ``TypeError`` for any other input type and
        ``DomainTransformationError`` when transforming a Table into the
        original domain leaves no defined values.
        """
        one_d = False
        if isinstance(data, np.ndarray):
            one_d = data.ndim == 1
            prediction = self.predict(np.atleast_2d(data))
        elif isinstance(data, (scipy.sparse.csr_matrix,
                               scipy.sparse.csc_matrix)):
            # Use the public scipy.sparse names; the scipy.sparse.csr /
            # scipy.sparse.csc sub-namespaces are deprecated.
            prediction = self.predict(data)
        elif isinstance(data, (Table, Instance)):
            if isinstance(data, Instance):
                data = Table(data.domain, [data])
                one_d = True
            if data.domain != self.domain:
                # Go through the original domain first, but only when the
                # attributes differ and the data holds at least one defined
                # value to begin with.
                if self.original_domain.attributes != data.domain.attributes \
                        and data.X.size \
                        and not np.isnan(data.X).all():
                    data = data.transform(self.original_domain)
                    if np.isnan(data.X).all():
                        raise DomainTransformationError(
                            "domain transformation produced no defined values")
                data = data.transform(self.domain)
            prediction = self.predict(data.X)
        elif isinstance(data, (list, tuple)):
            if not isinstance(data[0], (list, tuple)):
                # A flat sequence is treated as one row.
                data = [data]
                one_d = True
            data = Table.from_list(self.original_domain, data)
            data = data.transform(self.domain)
            prediction = self.predict(data.X)
        else:
            raise TypeError("Unrecognized argument (instance of '{}')"
                            .format(type(data).__name__))

        return prediction[0] if one_d else prediction

    def predict(self, X):
        """Predict clusters for the matrix ``X``; subclasses override this."""
        raise NotImplementedError(
            "This clustering algorithm does not support predicting.")


class Clustering(metaclass=WrapperMeta):
    """
    ${skldoc}
    Additional Orange parameters

    preprocessors : list, optional (default = [Continuize(), SklImpute()])
        An ordered list of preprocessors applied to data before
        training or testing.
    """
    # Class to instantiate with ``self.params`` in ``fit``; set by subclasses.
    __wraps__ = None
    # Model class that wraps the fitted object returned by ``__wraps__``.
    __returns__ = ClusteringModel
    # Default preprocessors, used when none are passed to ``__init__``.
    preprocessors = [Continuize(), SklImpute()]

    def __init__(self, preprocessors, parameters):
        """Store the preprocessors and the wrapped algorithm's parameters.

        ``preprocessors`` falls back to the class-level default when falsy.
        ``parameters`` is a mapping of names to values (subclasses in this
        change pass ``vars()``); the entries "self", "preprocessors" and
        "__class__" are dropped and the rest is kept in ``self.params``.
        """
        self.preprocessors = tuple(preprocessors or self.preprocessors)
        self.params = {k: v for k, v in parameters.items()
                       if k not in ["self", "preprocessors", "__class__"]}

    def __call__(self, data):
        """Fit a model on ``data`` and return the model's ``labels``."""
        return self.get_model(data).labels

    def get_model(self, data):
        """Preprocess ``data``, fit a model and record both domains on it.

        ``model.domain`` is the domain after preprocessing and
        ``model.original_domain`` is the domain of the data as passed in.
        """
        orig_domain = data.domain
        data = self.preprocess(data)
        model = self.fit_storage(data)
        model.domain = data.domain
        model.original_domain = orig_domain
        return model

    def fit_storage(self, data):
        """Fit on the ``X`` matrix of ``data``."""
        # only data Table
        return self.fit(data.X)

    def fit(self, X: np.ndarray, y: np.ndarray = None):
        """Fit ``__wraps__(**self.params)`` on ``X`` and wrap the result.

        ``y`` is accepted but not used.
        """
        return self.__returns__(self.__wraps__(**self.params).fit(X))

    def preprocess(self, data):
        """Apply ``self.preprocessors`` to ``data`` in order."""
        for pp in self.preprocessors:
            data = pp(data)
        return data
60 changes: 15 additions & 45 deletions Orange/clustering/dbscan.py
Original file line number Diff line number Diff line change
@@ -1,52 +1,22 @@
import sklearn.cluster as skl_cluster
from numpy import ndarray, unique
import sklearn.cluster

from Orange.data import Table, DiscreteVariable, Domain, Instance
from Orange.projection import SklProjector, Projection
from Orange.clustering.clustering import Clustering
from Orange.data import Table


__all__ = ["DBSCAN"]

class DBSCAN(SklProjector):
__wraps__ = skl_cluster.DBSCAN

class DBSCAN(Clustering):

__wraps__ = sklearn.cluster.DBSCAN

def __init__(self, eps=0.5, min_samples=5, metric='euclidean',
algorithm='auto', leaf_size=30, p=None,
preprocessors=None):
super().__init__(preprocessors=preprocessors)
self.params = vars()

def fit(self, X, Y=None):
proj = skl_cluster.DBSCAN(**self.params)
self.X = X
if isinstance(X, Table):
proj = proj.fit(X.X,)
else:
proj = proj.fit(X, )
return DBSCANModel(proj)


class DBSCANModel(Projection):
def __init__(self, proj):
super().__init__(proj=proj)

def __call__(self, data):
if isinstance(data, ndarray):
return self.proj.fit_predict(data).reshape((len(data), 1))

if isinstance(data, Table):
if data.domain is not self.pre_domain:
data = data.transform(self.pre_domain)
y = self.proj.fit_predict(data.X)
vals, indices = unique(y, return_inverse=True)
c = DiscreteVariable(name='Core sample index',
values=[str(v) for v in vals])
domain = Domain([c])
return Table(domain, indices.reshape(len(y), 1))

elif isinstance(data, Instance):
if data.domain is not self.pre_domain:
data = Instance(self.pre_domain, data)
# Instances-by-Instance classification is not defined;
raise Exception("Core sample assignment is not supported "
"for single instances.")
algorithm='auto', leaf_size=30, p=None, preprocessors=None):
super().__init__(preprocessors, vars())


if __name__ == "__main__":
    # Manual smoke run: cluster the iris data set with default parameters.
    iris = Table("iris")
    dbscan = DBSCAN(preprocessors=None)
    clusters = dbscan(iris)
95 changes: 34 additions & 61 deletions Orange/clustering/kmeans.py
Original file line number Diff line number Diff line change
@@ -1,72 +1,45 @@
import numpy as np
import sklearn.cluster as skl_cluster
from sklearn.metrics import silhouette_samples, silhouette_score
import warnings

from Orange.data import Table, DiscreteVariable, Domain, Instance
from Orange.projection import SklProjector, Projection
from Orange.distance import Euclidean
import sklearn.cluster

from Orange.clustering.clustering import Clustering, ClusteringModel
from Orange.data import Table


__all__ = ["KMeans"]

SILHOUETTE_MAX_SAMPLES = 5000

class KMeans(SklProjector):
__wraps__ = skl_cluster.KMeans
class KMeansModel(ClusteringModel):

def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300,
tol=0.0001, random_state=None, preprocessors=None,
compute_silhouette_score=False):
super().__init__(preprocessors=preprocessors)
self.params = vars()
self._compute_silhouette = compute_silhouette_score
def __init__(self, projector):
super().__init__(projector)
self.centroids = projector.cluster_centers_
self.k = projector.get_params()["n_clusters"]

def fit(self, X, Y=None):
proj = skl_cluster.KMeans(**self.params)
proj = proj.fit(X, Y)
proj.silhouette = np.nan
try:
if self._compute_silhouette and 2 <= proj.n_clusters < X.shape[0]:
if X.shape[0] <= SILHOUETTE_MAX_SAMPLES:
proj.silhouette_samples = \
silhouette_samples(X, proj.labels_)
proj.silhouette = np.mean(proj.silhouette_samples)
else:
proj.silhouette_samples = None
proj.silhouette = \
silhouette_score(X, proj.labels_, sample_size=SILHOUETTE_MAX_SAMPLES)
except MemoryError: # Pairwise dist in silhouette fails for large data
pass
proj.inertia = proj.inertia_ / X.shape[0]
cluster_dist = Euclidean(proj.cluster_centers_)
proj.inter_cluster = np.mean(cluster_dist[np.triu_indices_from(cluster_dist, 1)])
return KMeansModel(proj, self.preprocessors)
def predict(self, X):
return self.projector.predict(X)


class KMeansModel(Projection):
def __init__(self, proj, preprocessors=None):
super().__init__(proj=proj)
self.k = self.proj.get_params()["n_clusters"]
self.centroids = self.proj.cluster_centers_
class KMeans(Clustering):

def __call__(self, data):
if isinstance(data, Table):
if data.domain is not self.pre_domain:
data = data.transform(self.pre_domain)
c = DiscreteVariable(name='Cluster id',
values=[str(i) for i in range(self.k)])
domain = Domain([c])
return Table(
domain,
self.proj.predict(data.X).astype(int).reshape((len(data), 1)))
elif isinstance(data, Instance):
if data.domain is not self.pre_domain:
data = Instance(self.pre_domain, data)
c = DiscreteVariable(name='Cluster id',
values=[str(i) for i in range(self.k)])
domain = Domain([c])
return Table(
domain,
np.atleast_2d(self.proj.predict(data._x.reshape(1, -1))).astype(int))
else:
return self.proj.predict(data).reshape((data.shape[0], 1))
__wraps__ = sklearn.cluster.KMeans
__returns__ = KMeansModel

def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300,
             tol=0.0001, random_state=None, preprocessors=None,
             compute_silhouette_score=None):
    """Collect the k-means parameters and hand them to the base class.

    ``compute_silhouette_score`` is deprecated: any value other than
    ``None`` triggers a ``DeprecationWarning``, and the argument is never
    forwarded with the other parameters.
    """
    if compute_silhouette_score is not None:
        # stacklevel=2 attributes the warning to the code that constructed
        # the object rather than to this line.
        warnings.warn(
            "compute_silhouette_score is deprecated. Please use "
            "sklearn.metrics.silhouette_score to compute silhouettes.",
            DeprecationWarning, stacklevel=2)
    # vars() captures this function's locals, so no extra local variable
    # may be introduced before this call.
    super().__init__(
        preprocessors, {k: v for k, v in vars().items()
                        if k != "compute_silhouette_score"})


if __name__ == "__main__":
    # Manual smoke run: three-cluster k-means on the iris data set.
    iris = Table("iris")
    kmeans = KMeans(preprocessors=None, n_clusters=3)
    clusters = kmeans(iris)
    model = kmeans.fit_storage(iris)
Loading