From 3d2fa0eaaa321b7808d6e562c2104fbf403eb4ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?=
Date: Wed, 22 May 2019 11:04:25 +0200
Subject: [PATCH 1/5] Clustering simplified
---
Orange/clustering/__init__.py | 1 +
Orange/clustering/clustering.py | 100 ++++++++++++++++++++++
Orange/clustering/dbscan.py | 60 ++++---------
Orange/clustering/kmeans.py | 85 ++++++-------------
Orange/clustering/louvain.py | 146 +++++++++++++++-----------------
5 files changed, 206 insertions(+), 186 deletions(-)
create mode 100644 Orange/clustering/clustering.py
diff --git a/Orange/clustering/__init__.py b/Orange/clustering/__init__.py
index 03b53fa7f05..818a33c076d 100644
--- a/Orange/clustering/__init__.py
+++ b/Orange/clustering/__init__.py
@@ -4,3 +4,4 @@
from .dbscan import *
from .hierarchical import *
from .kmeans import *
+from .louvain import *
diff --git a/Orange/clustering/clustering.py b/Orange/clustering/clustering.py
new file mode 100644
index 00000000000..0dfaca0608f
--- /dev/null
+++ b/Orange/clustering/clustering.py
@@ -0,0 +1,100 @@
+import numpy as np
+import scipy.sparse
+
+from Orange.data import Table, Instance
+from Orange.data.table import DomainTransformationError
+from Orange.misc.wrapper_meta import WrapperMeta
+from Orange.preprocess import Continuize, SklImpute
+
+
+class ClusteringModel:
+
+ def __init__(self, projector):
+ self.projector = projector
+ self.domain = None
+ self.original_domain = None
+ self.labels = projector.labels_
+
+ def __call__(self, data):
+ def fix_dim(x):
+ return x[0] if one_d else x
+
+ one_d = False
+ if isinstance(data, np.ndarray):
+ one_d = data.ndim == 1
+ prediction = self.predict(np.atleast_2d(data))
+ elif isinstance(data, scipy.sparse.csr.csr_matrix) or \
+ isinstance(data, scipy.sparse.csc.csc_matrix):
+ prediction = self.predict(data)
+ elif isinstance(data, (Table, Instance)):
+ if isinstance(data, Instance):
+ data = Table(data.domain, [data])
+ one_d = True
+ if data.domain != self.domain:
+ if self.original_domain.attributes != data.domain.attributes \
+ and data.X.size \
+ and not np.isnan(data.X).all():
+ data = data.transform(self.original_domain)
+ if np.isnan(data.X).all():
+ raise DomainTransformationError(
+ "domain transformation produced no defined values")
+ data = data.transform(self.domain)
+ prediction = self.predict(data.X)
+ elif isinstance(data, (list, tuple)):
+ if not isinstance(data[0], (list, tuple)):
+ data = [data]
+ one_d = True
+ data = Table.from_list(self.original_domain, data)
+ data = data.transform(self.domain)
+ prediction = self.predict(data.X)
+ else:
+ raise TypeError("Unrecognized argument (instance of '{}')"
+ .format(type(data).__name__))
+
+ return fix_dim(prediction)
+
+ def predict(self, X):
+ raise NotImplementedError(
+ "This clustering algorithm does not support predicting.")
+
+
+class Clustering(metaclass=WrapperMeta):
+ """
+ ${skldoc}
+ Additional Orange parameters
+
+ preprocessors : list, optional (default = [Continuize(), SklImpute()])
+ An ordered list of preprocessors applied to data before
+ training or testing.
+ """
+ __wraps__ = None
+ __returns__ = ClusteringModel
+ preprocessors = [Continuize(), SklImpute()]
+
+ def __init__(self, preprocessors, parameters):
+ self.preprocessors = tuple(preprocessors or self.preprocessors)
+ self.params = {k: v for k, v in parameters.items()
+ if k not in ["self", "preprocessors", "__class__"]}
+
+ def __call__(self, data):
+ return self.get_model(data).labels
+
+ def get_model(self, data):
+ orig_domain = data.domain
+ data = self.preprocess(data)
+ model = self.fit_storage(data)
+ model.domain = data.domain
+ model.original_domain = orig_domain
+ return model
+
+ def fit_storage(self, data):
+ # only Table input is supported here; fit on its X matrix
+ return self.fit(data.X)
+
+ def fit(self, X: np.ndarray, y: np.ndarray = None):
+ return self.__returns__(self.__wraps__(**self.params).fit(X))
+
+ def preprocess(self, data):
+ for pp in self.preprocessors:
+ data = pp(data)
+ return data
diff --git a/Orange/clustering/dbscan.py b/Orange/clustering/dbscan.py
index 7481f1c34ec..80e7cdd9948 100644
--- a/Orange/clustering/dbscan.py
+++ b/Orange/clustering/dbscan.py
@@ -1,52 +1,22 @@
-import sklearn.cluster as skl_cluster
-from numpy import ndarray, unique
+import sklearn.cluster
-from Orange.data import Table, DiscreteVariable, Domain, Instance
-from Orange.projection import SklProjector, Projection
+from Orange.clustering.clustering import Clustering
+from Orange.data import Table
__all__ = ["DBSCAN"]
-class DBSCAN(SklProjector):
- __wraps__ = skl_cluster.DBSCAN
+
+class DBSCAN(Clustering):
+
+ __wraps__ = sklearn.cluster.DBSCAN
def __init__(self, eps=0.5, min_samples=5, metric='euclidean',
- algorithm='auto', leaf_size=30, p=None,
- preprocessors=None):
- super().__init__(preprocessors=preprocessors)
- self.params = vars()
-
- def fit(self, X, Y=None):
- proj = skl_cluster.DBSCAN(**self.params)
- self.X = X
- if isinstance(X, Table):
- proj = proj.fit(X.X,)
- else:
- proj = proj.fit(X, )
- return DBSCANModel(proj)
-
-
-class DBSCANModel(Projection):
- def __init__(self, proj):
- super().__init__(proj=proj)
-
- def __call__(self, data):
- if isinstance(data, ndarray):
- return self.proj.fit_predict(data).reshape((len(data), 1))
-
- if isinstance(data, Table):
- if data.domain is not self.pre_domain:
- data = data.transform(self.pre_domain)
- y = self.proj.fit_predict(data.X)
- vals, indices = unique(y, return_inverse=True)
- c = DiscreteVariable(name='Core sample index',
- values=[str(v) for v in vals])
- domain = Domain([c])
- return Table(domain, indices.reshape(len(y), 1))
-
- elif isinstance(data, Instance):
- if data.domain is not self.pre_domain:
- data = Instance(self.pre_domain, data)
- # Instances-by-Instance classification is not defined;
- raise Exception("Core sample assignment is not supported "
- "for single instances.")
+ algorithm='auto', leaf_size=30, p=None, preprocessors=None):
+ super().__init__(preprocessors, vars())
+
+
+if __name__ == "__main__":
+ d = Table("iris")
+ km = DBSCAN(preprocessors=None)
+ clusters = km(d)
diff --git a/Orange/clustering/kmeans.py b/Orange/clustering/kmeans.py
index 364a4261405..4230858c8a3 100644
--- a/Orange/clustering/kmeans.py
+++ b/Orange/clustering/kmeans.py
@@ -1,72 +1,35 @@
-import numpy as np
-import sklearn.cluster as skl_cluster
-from sklearn.metrics import silhouette_samples, silhouette_score
+import sklearn.cluster
-from Orange.data import Table, DiscreteVariable, Domain, Instance
-from Orange.projection import SklProjector, Projection
-from Orange.distance import Euclidean
+from Orange.clustering.clustering import Clustering, ClusteringModel
+from Orange.data import Table
__all__ = ["KMeans"]
-SILHOUETTE_MAX_SAMPLES = 5000
-class KMeans(SklProjector):
- __wraps__ = skl_cluster.KMeans
+class KMeansModel(ClusteringModel):
+
+ def __init__(self, projector):
+ super().__init__(projector)
+ self.centroids = projector.cluster_centers_
+ self.k = projector.get_params()["n_clusters"]
+
+ def predict(self, X):
+ return self.projector.predict(X)
- def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300,
- tol=0.0001, random_state=None, preprocessors=None,
- compute_silhouette_score=False):
- super().__init__(preprocessors=preprocessors)
- self.params = vars()
- self._compute_silhouette = compute_silhouette_score
- def fit(self, X, Y=None):
- proj = skl_cluster.KMeans(**self.params)
- proj = proj.fit(X, Y)
- proj.silhouette = np.nan
- try:
- if self._compute_silhouette and 2 <= proj.n_clusters < X.shape[0]:
- if X.shape[0] <= SILHOUETTE_MAX_SAMPLES:
- proj.silhouette_samples = \
- silhouette_samples(X, proj.labels_)
- proj.silhouette = np.mean(proj.silhouette_samples)
- else:
- proj.silhouette_samples = None
- proj.silhouette = \
- silhouette_score(X, proj.labels_, sample_size=SILHOUETTE_MAX_SAMPLES)
- except MemoryError: # Pairwise dist in silhouette fails for large data
- pass
- proj.inertia = proj.inertia_ / X.shape[0]
- cluster_dist = Euclidean(proj.cluster_centers_)
- proj.inter_cluster = np.mean(cluster_dist[np.triu_indices_from(cluster_dist, 1)])
- return KMeansModel(proj, self.preprocessors)
+class KMeans(Clustering):
+ __wraps__ = sklearn.cluster.KMeans
+ __returns__ = KMeansModel
+
+ def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300,
+ tol=0.0001, random_state=None, preprocessors=None):
+ super().__init__(preprocessors, vars())
-class KMeansModel(Projection):
- def __init__(self, proj, preprocessors=None):
- super().__init__(proj=proj)
- self.k = self.proj.get_params()["n_clusters"]
- self.centroids = self.proj.cluster_centers_
- def __call__(self, data):
- if isinstance(data, Table):
- if data.domain is not self.pre_domain:
- data = data.transform(self.pre_domain)
- c = DiscreteVariable(name='Cluster id',
- values=[str(i) for i in range(self.k)])
- domain = Domain([c])
- return Table(
- domain,
- self.proj.predict(data.X).astype(int).reshape((len(data), 1)))
- elif isinstance(data, Instance):
- if data.domain is not self.pre_domain:
- data = Instance(self.pre_domain, data)
- c = DiscreteVariable(name='Cluster id',
- values=[str(i) for i in range(self.k)])
- domain = Domain([c])
- return Table(
- domain,
- np.atleast_2d(self.proj.predict(data._x.reshape(1, -1))).astype(int))
- else:
- return self.proj.predict(data).reshape((data.shape[0], 1))
+if __name__ == "__main__":
+ d = Table("iris")
+ km = KMeans(preprocessors=None, n_clusters=3)
+ clusters = km(d)
+ model = km.fit_storage(d)
diff --git a/Orange/clustering/louvain.py b/Orange/clustering/louvain.py
index a4072341b25..ed2ff7ddc3e 100644
--- a/Orange/clustering/louvain.py
+++ b/Orange/clustering/louvain.py
@@ -3,7 +3,6 @@
Original C++ implementation available at
https://sites.google.com/site/findcommunities/
-
"""
import numpy as np
@@ -11,25 +10,29 @@
# NOTE: The ``community`` package might be renamed in the near future, see
# GH issue https://github.com/taynaud/python-louvain/issues/23
from community import best_partition
+from sklearn.base import BaseEstimator
from sklearn.neighbors import NearestNeighbors
-import Orange
+from Orange.clustering.clustering import Clustering
from Orange.data import Table
+__all__ = ["Louvain", "matrix_to_knn_graph"]
+
+
def jaccard(x, y):
# type: (set, set) -> float
"""Compute the Jaccard similarity between two sets."""
return len(x & y) / len(x | y)
-def table_to_knn_graph(data, k_neighbors, metric, progress_callback=None):
- """Convert tabular data to a graph using a nearest neighbors approach with
+def matrix_to_knn_graph(data, k_neighbors, metric, progress_callback=None):
+ """Convert data matrix to a graph using a nearest neighbors approach with
the Jaccard similarity as the edge weights.
Parameters
----------
- data : Table
+ data : np.ndarray
k_neighbors : int
metric : str
A distance metric supported by sklearn.
@@ -59,99 +62,82 @@ def table_to_knn_graph(data, k_neighbors, metric, progress_callback=None):
graph.add_edge(
node,
neighbor,
- weight=jaccard(nearest_neighbors[node], nearest_neighbors[neighbor]),
+ weight=jaccard(
+ nearest_neighbors[node], nearest_neighbors[neighbor]),
)
return graph
-class Louvain:
- preprocessors = [Orange.preprocess.Continuize(), Orange.preprocess.SklImpute()]
-
- def __init__(
- self,
- k_neighbors=30,
- metric="l2",
- resolution=1.0,
- random_state=None,
- preprocessors=None,
- ):
- """Louvain clustering for community detection in graphs.
-
- Louvain clustering is a community detection algorithm for detecting
- clusters of "communities" in graphs. As such, tabular data must first
- be converted into graph form. This is typically done by computing the
- KNN graph on the input data.
-
- Parameters
- ----------
- k_neighbors : Optional[int]
- The number of nearest neighbors to use for the KNN graph if
- tabular data is passed.
-
- metric : Optional[str]
- The metric to use to compute the nearest neighbors.
-
- resolution : Optional[float]
- The resolution is a parameter of the Louvain method that affects
- the size of the recovered clusters.
-
- random_state: Union[int, RandomState]
- The random state parameter follows the convention used in scikit-learn.
- If the value is an int, random_state is the seed used by the random
- number generator. If the value is a RandomState instance, then it will
- be used as the random number generator. If the value is None, the random
- number generator is the RandomState instance used by `np.random`.
-
- """
- if preprocessors is None:
- preprocessors = type(self).preprocessors
- self.preprocessors = tuple(preprocessors)
+class LouvainMethod(BaseEstimator):
+ def __init__(self, k_neighbors=30, metric="l2", resolution=1.0,
+ random_state=None):
self.k_neighbors = k_neighbors
self.metric = metric
self.resolution = resolution
self.random_state = random_state
+ self.labels_ = None
- self.labels = None
+ def fit(self, X: np.ndarray, y: np.ndarray = None):
+ # Convert the data matrix to a KNN graph before community detection
+ graph = matrix_to_knn_graph(
+ X, metric=self.metric, k_neighbors=self.k_neighbors)
+ return self.fit_graph(graph)
- def __call__(self, data):
- data = self.preprocess(data)
- return self.fit_predict(data.X, data.Y)
+ def fit_graph(self, graph):
+ partition = best_partition(
+ graph, resolution=self.resolution, random_state=self.random_state)
+ self.labels_ = np.fromiter(
+ list(zip(*sorted(partition.items())))[1], dtype=int)
+ return self
- def preprocess(self, data):
- for pp in self.preprocessors:
- data = pp(data)
- return data
- def fit(self, X, y=None):
- # If we are given a table, we have to convert it to a graph first
- if isinstance(X, Table):
- graph = table_to_knn_graph(
- X.X, metric=self.metric, k_neighbors=self.k_neighbors
- )
- # Same goes for a matrix
- elif isinstance(X, np.ndarray):
- graph = table_to_knn_graph(
- X, metric=self.metric, k_neighbors=self.k_neighbors
- )
- elif isinstance(X, nx.Graph):
- graph = X
+class Louvain(Clustering):
+ """Louvain clustering for community detection in graphs.
- partition = best_partition(
- graph, resolution=self.resolution, random_state=self.random_state
- )
- partition = np.fromiter(list(zip(*sorted(partition.items())))[1], dtype=int)
+ Louvain clustering is a community detection algorithm for detecting
+ clusters of "communities" in graphs. As such, tabular data must first
+ be converted into graph form. This is typically done by computing the
+ KNN graph on the input data.
+
+ Attributes
+ ----------
+ k_neighbors : Optional[int]
+ The number of nearest neighbors to use for the KNN graph if
+ tabular data is passed.
+
+ metric : Optional[str]
+ The metric to use to compute the nearest neighbors.
+
+ resolution : Optional[float]
+ The resolution is a parameter of the Louvain method that affects
+ the size of the recovered clusters.
+
+ random_state: Union[int, RandomState]
+ The random state parameter follows the convention used in scikit-learn.
+ If the value is an int, random_state is the seed used by the random
+ number generator. If the value is a RandomState instance, then it will
+ be used as the random number generator. If the value is None, the random
+ number generator is the RandomState instance used by `np.random`.
+ """
+
+ __wraps__ = LouvainMethod
- self.labels = partition
+ def __init__(self, k_neighbors=30, metric="l2", resolution=1.0,
+ random_state=None, preprocessors=None):
+ super().__init__(preprocessors, vars())
- def fit_predict(self, X, y=None):
- self.fit(X, y)
- return self.labels
+ def get_model(self, data):
+ if isinstance(data, nx.Graph):
+ return self.__returns__(
+ self.__wraps__(**self.params).fit_graph(data))
+ else:
+ return super().get_model(data)
if __name__ == "__main__":
# clustering run on iris data - orange table
- data = Table("iris")
- louvain = Louvain(2)
- louvain.fit(data)
+ d = Table("iris")
+ louvain = Louvain(5)
+ clusters = louvain(d)
From fa2af37f6929a4b78156e6054bd3295467ab8623 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?=
Date: Thu, 30 May 2019 12:32:42 +0200
Subject: [PATCH 2/5] Clustering: modified dependent widgets
---
Orange/clustering/kmeans.py | 10 ++++-
Orange/evaluation/clustering.py | 5 +--
Orange/widgets/unsupervised/owkmeans.py | 41 +++++++++++-------
.../unsupervised/owlouvainclustering.py | 13 +++---
.../unsupervised/tests/test_owkmeans.py | 42 +++++++++----------
Orange/widgets/visualize/owheatmap.py | 6 +--
6 files changed, 66 insertions(+), 51 deletions(-)
diff --git a/Orange/clustering/kmeans.py b/Orange/clustering/kmeans.py
index 4230858c8a3..26957535050 100644
--- a/Orange/clustering/kmeans.py
+++ b/Orange/clustering/kmeans.py
@@ -1,3 +1,5 @@
+import warnings
+
import sklearn.cluster
from Orange.clustering.clustering import Clustering, ClusteringModel
@@ -24,7 +26,13 @@ class KMeans(Clustering):
__returns__ = KMeansModel
def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300,
- tol=0.0001, random_state=None, preprocessors=None):
+ tol=0.0001, random_state=None, preprocessors=None,
+ compute_silhouette_score=None):
+ if compute_silhouette_score is not None:
+ warnings.warn(
+ "compute_silhouette_score is deprecated. Please use "
+ "sklearn.metrics.silhouette_score to compute silhouettes.",
+ DeprecationWarning)
super().__init__(preprocessors, vars())
diff --git a/Orange/evaluation/clustering.py b/Orange/evaluation/clustering.py
index 53bb7858cde..77682dd310e 100644
--- a/Orange/evaluation/clustering.py
+++ b/Orange/evaluation/clustering.py
@@ -94,11 +94,10 @@ def __call__(self, data, learners, preprocessor=None, *, callback=None):
for k in range(self.k):
for i, learner in enumerate(learners):
- model = learner(data)
+ model = learner.get_model(data)
if self.store_models:
res.models[k, i] = model
- labels = model(data)
- res.predicted[i, k, :] = labels.X.flatten()
+ res.predicted[i, k, :] = model.labels
return res
diff --git a/Orange/widgets/unsupervised/owkmeans.py b/Orange/widgets/unsupervised/owkmeans.py
index 185b184eb48..0a44de35280 100644
--- a/Orange/widgets/unsupervised/owkmeans.py
+++ b/Orange/widgets/unsupervised/owkmeans.py
@@ -6,9 +6,10 @@
pyqtSlot as Slot
from AnyQt.QtGui import QIntValidator
from AnyQt.QtWidgets import QGridLayout, QTableView
+from sklearn.metrics import silhouette_samples, silhouette_score
from Orange.clustering import KMeans
-from Orange.clustering.kmeans import KMeansModel, SILHOUETTE_MAX_SAMPLES
+from Orange.clustering.kmeans import KMeansModel
from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable
from Orange.data.util import get_unique_names, array_equal
from Orange.preprocess.impute import ReplaceUnknowns
@@ -23,6 +24,7 @@
RANDOM_STATE = 0
+SILHOUETTE_MAX_SAMPLES = 5000
class ClusterTableModel(QAbstractTableModel):
@@ -268,15 +270,15 @@ def has_attributes(self):
return len(self.data.domain.attributes)
@staticmethod
- def _compute_clustering(data, k, init, n_init, max_iter, silhouette, random_state):
+ def _compute_clustering(data, k, init, n_init, max_iter, random_state):
# type: (Table, int, str, int, int, bool) -> KMeansModel
if k > len(data):
raise NotEnoughData()
return KMeans(
n_clusters=k, init=init, n_init=n_init, max_iter=max_iter,
- compute_silhouette_score=silhouette, random_state=random_state,
- )(data)
+ random_state=random_state
+ ).get_model(data)
@Slot(int, int)
def __progress_changed(self, n, d):
@@ -336,7 +338,6 @@ def __launch_tasks(self, ks):
init=self.INIT_METHODS[self.smart_init][1],
n_init=self.n_init,
max_iter=self.max_iterations,
- silhouette=True,
random_state=RANDOM_STATE,
) for k in ks]
watcher = FutureSetWatcher(futures)
@@ -432,10 +433,9 @@ def invalidate(self):
self.commit()
def update_results(self):
- scores = [
- mk if isinstance(mk, str) else mk.silhouette for mk in (
- self.clusterings[k] for k in range(self.k_from, self.k_to + 1))
- ]
+ scores = [mk if isinstance(mk, str) else silhouette_score(
+ self.data.X, mk.labels) for mk in (
+ self.clusterings[k] for k in range(self.k_from, self.k_to + 1))]
best_row = max(
range(len(scores)), default=0,
key=lambda x: 0 if isinstance(scores[x], str) else scores[x]
@@ -454,6 +454,16 @@ def selected_row(self):
def select_row(self):
self.send_data()
+ def preproces(self, data):
+ for preprocessor in KMeans.preprocessors: # use the same preprocessors as KMeans
+ data = preprocessor(data)
+ return data
+
+ def samples_scores(self, clust_ids):
+ d = self.preproces(self.data)
+ return np.arctan(
+ silhouette_samples(d.X, clust_ids)) / np.pi + 0.5
+
def send_data(self):
if self.optimize_k:
row = self.selected_row()
@@ -472,16 +482,15 @@ def send_data(self):
get_unique_names(domain, "Cluster"),
values=["C%d" % (x + 1) for x in range(km.k)]
)
- clust_ids = km(self.data)
- clust_col = clust_ids.X.ravel()
+ clust_ids = km.labels
silhouette_var = ContinuousVariable(
get_unique_names(domain, "Silhouette"))
- if km.silhouette_samples is not None:
+ if len(self.data) <= SILHOUETTE_MAX_SAMPLES:
self.Warning.no_silhouettes.clear()
- scores = np.arctan(km.silhouette_samples) / np.pi + 0.5
+ scores = self.samples_scores(clust_ids)
clust_scores = []
for i in range(km.k):
- in_clust = clust_col == i
+ in_clust = clust_ids == i
if in_clust.any():
clust_scores.append(np.mean(scores[in_clust]))
else:
@@ -494,7 +503,7 @@ def send_data(self):
new_domain = add_columns(domain, metas=[cluster_var, silhouette_var])
new_table = self.data.transform(new_domain)
- new_table.get_column_view(cluster_var)[0][:] = clust_col
+ new_table.get_column_view(cluster_var)[0][:] = clust_ids
new_table.get_column_view(silhouette_var)[0][:] = scores
centroid_attributes = [
@@ -502,7 +511,7 @@ def send_data(self):
if isinstance(attr.compute_value, ReplaceUnknowns)
and attr.compute_value.variable in domain.attributes
else attr
- for attr in km.pre_domain.attributes]
+ for attr in km.domain.attributes]
centroid_domain = add_columns(
Domain(centroid_attributes, [], domain.metas),
metas=[cluster_var, silhouette_var])
diff --git a/Orange/widgets/unsupervised/owlouvainclustering.py b/Orange/widgets/unsupervised/owlouvainclustering.py
index a4d27ddc4b1..56432544b5a 100644
--- a/Orange/widgets/unsupervised/owlouvainclustering.py
+++ b/Orange/widgets/unsupervised/owlouvainclustering.py
@@ -14,7 +14,7 @@
)
from AnyQt.QtWidgets import QSlider, QCheckBox, QWidget, QLabel
-from Orange.clustering.louvain import table_to_knn_graph, Louvain
+from Orange.clustering.louvain import matrix_to_knn_graph, Louvain
from Orange.data import Table, DiscreteVariable
from Orange.data.util import get_unique_names, array_equal
from Orange import preprocess
@@ -623,10 +623,9 @@ def pcallback(val):
raise InteruptRequested()
try:
- res.graph = graph = table_to_knn_graph(
- data, k_neighbors=k_neighbors, metric=metric,
- progress_callback=pcallback
- )
+ res.graph = graph = matrix_to_knn_graph(
+ data.X, k_neighbors=k_neighbors, metric=metric,
+ progress_callback=pcallback)
except InteruptRequested:
return res
@@ -638,7 +637,7 @@ def pcallback(val):
if state.is_interuption_requested():
return res
- res.partition = louvain.fit_predict(graph)
+ res.partition = louvain(graph)
state.set_partial_results(("partition", res.partition))
return res
@@ -654,7 +653,7 @@ def run_on_graph(graph, resolution, state):
state.set_status("Detecting communities...")
if state.is_interuption_requested():
return res
- partition = louvain.fit_predict(graph)
+ partition = louvain(graph)
res.partition = partition
state.set_partial_results(("partition", res.partition))
return res
diff --git a/Orange/widgets/unsupervised/tests/test_owkmeans.py b/Orange/widgets/unsupervised/tests/test_owkmeans.py
index 988daf5c9a8..f81fed3fc04 100644
--- a/Orange/widgets/unsupervised/tests/test_owkmeans.py
+++ b/Orange/widgets/unsupervised/tests/test_owkmeans.py
@@ -5,6 +5,7 @@
import numpy as np
from AnyQt.QtCore import Qt
from AnyQt.QtWidgets import QRadioButton
+from sklearn.metrics import silhouette_score
import Orange.clustering
from Orange.data import Table, Domain
@@ -197,26 +198,22 @@ def test_data_on_output(self):
# removing data should have cleared the output
self.assertEqual(self.widget.data, None)
- @patch("Orange.clustering.kmeans.KMeansModel.__call__")
- def test_centroids_on_output(self, km_call):
- ret = km_call.return_value = Mock()
- ret.X = np.array([0] * 50 + [1] * 100)
- ret.silhouette_samples = np.arange(150) / 150
-
+ def test_centroids_on_output(self):
widget = self.widget
widget.optimize_k = False
widget.k = 4
self.send_signal(widget.Inputs.data, self.iris)
self.commit_and_wait()
+ widget.clusterings[widget.k].labels = np.array([0] * 50 + [1] * 100).flatten()
- widget.clusterings[4].silhouette_samples = np.arange(150) / 150
+ widget.samples_scores = lambda x: np.arctan(
+ np.arange(150) / 150) / np.pi + 0.5
widget.send_data()
out = self.get_output(widget.Outputs.centroids)
- np.testing.assert_almost_equal(
- out.metas,
- [[0, np.mean(np.arctan(np.arange(50) / 150)) / np.pi + 0.5],
- [1, np.mean(np.arctan(np.arange(50, 150) / 150)) / np.pi + 0.5],
- [2, 0], [3, 0]])
+ np.testing.assert_array_almost_equal(
+ np.array([[0, np.mean(np.arctan(np.arange(50) / 150)) / np.pi + 0.5],
+ [1, np.mean(np.arctan(np.arange(50, 150) / 150)) / np.pi + 0.5],
+ [2, 0], [3, 0]]), out.metas.astype(float))
self.assertEqual(out.name, "iris centroids")
def test_centroids_domain_on_output(self):
@@ -262,12 +259,14 @@ def test_optimization_fails(self):
self.KMeansFail.fail_on = {3, 5, 7}
model = widget.table_view.model()
- with patch.object(model, "set_scores", wraps=model.set_scores) as set_scores:
+ with patch.object(
+ model, "set_scores", wraps=model.set_scores) as set_scores:
self.send_signal(self.widget.Inputs.data, self.iris, wait=5000)
scores, start_k = set_scores.call_args[0]
self.assertEqual(
scores,
- [km if isinstance(km, str) else km.silhouette
+ [km if isinstance(km, str) else silhouette_score(
+ self.iris.X, km(self.iris))
for km in (widget.clusterings[k] for k in range(3, 9))]
)
self.assertEqual(start_k, 3)
@@ -312,15 +311,14 @@ def test_run_fails(self):
self.assertIsNotNone(self.get_output(self.widget.Outputs.annotated_data))
def test_select_best_row(self):
- class Cluster:
- def __init__(self, n):
- self.silhouette = n
-
widget = self.widget
widget.k_from, widget.k_to = 2, 6
- widget.clusterings = {k: Cluster(5 - (k - 4) ** 2) for k in range(2, 7)}
+ widget.optimize_k = True
+ self.send_signal(self.widget.Inputs.data, Table("housing"), wait=5000)
+ self.commit_and_wait()
widget.update_results()
- self.assertEqual(widget.selected_row(), 2)
+ # for housing dataset best selection is 3 clusters, so row no. 1
+ self.assertEqual(widget.selected_row(), 1)
widget.clusterings = {k: "error" for k in range(2, 7)}
widget.update_results()
@@ -394,7 +392,9 @@ def test_silhouette_column(self):
# Avoid randomness in the test
random = np.random.RandomState(0) # pylint: disable=no-member
table = Table(random.rand(110, 2))
- with patch("Orange.clustering.kmeans.SILHOUETTE_MAX_SAMPLES", 100):
+ with patch(
+ "Orange.widgets.unsupervised.owkmeans.SILHOUETTE_MAX_SAMPLES",
+ 100):
self.send_signal(self.widget.Inputs.data, table)
outtable = self.get_output(widget.Outputs.annotated_data)
outtable = outtable.get_column_view("Silhouette")[0]
diff --git a/Orange/widgets/visualize/owheatmap.py b/Orange/widgets/visualize/owheatmap.py
index 6c36e25b803..40f1a5dbef3 100644
--- a/Orange/widgets/visualize/owheatmap.py
+++ b/Orange/widgets/visualize/owheatmap.py
@@ -114,7 +114,7 @@ def barycenter(a, axis=0):
def kmeans_compress(X, k=50):
km = kmeans.KMeans(n_clusters=k, n_init=5, random_state=42)
- return km(X)
+ return km.get_model(X)
def candidate_split_labels(data):
@@ -866,8 +866,8 @@ def construct_heatmaps(self, data, split_label=None):
self.input_data.domain.metas))
nclust = min(self.merge_kmeans_k, len(effective_data) - 1)
self.kmeans_model = kmeans_compress(effective_data, k=nclust)
- effective_data.domain = self.kmeans_model.pre_domain
- merge_indices = [np.flatnonzero(self.kmeans_model.labels_ == ind)
+ effective_data.domain = self.kmeans_model.domain
+ merge_indices = [np.flatnonzero(self.kmeans_model.labels == ind)
for ind in range(nclust)]
not_empty_indices = [i for i, x in enumerate(merge_indices)
if len(x) > 0]
From 857a29ab27e545873fe6bc94c8306df6948a6a24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?=
Date: Thu, 30 May 2019 12:33:07 +0200
Subject: [PATCH 3/5] Clustering: Fixed tests
---
Orange/tests/test_clustering_dbscan.py | 41 ++++++---------------
Orange/tests/test_clustering_kmeans.py | 49 +++++++-------------------
Orange/tests/test_louvain.py | 20 ++++-------
3 files changed, 30 insertions(+), 80 deletions(-)
diff --git a/Orange/tests/test_clustering_dbscan.py b/Orange/tests/test_clustering_dbscan.py
index 5065fe2482c..14e107d4ba5 100644
--- a/Orange/tests/test_clustering_dbscan.py
+++ b/Orange/tests/test_clustering_dbscan.py
@@ -3,45 +3,26 @@
import unittest
-import Orange
+import numpy as np
+
+from Orange.data import Table
from Orange.clustering.dbscan import DBSCAN
class TestDBSCAN(unittest.TestCase):
-
- @classmethod
- def setUpClass(cls):
- cls.iris = Orange.data.Table('iris')
+ def setUp(self):
+ self.iris = Table('iris')
+ self.dbscan = DBSCAN()
def test_dbscan_parameters(self):
dbscan = DBSCAN(eps=0.1, min_samples=7, metric='euclidean',
algorithm='auto', leaf_size=12, p=None)
- c = dbscan(self.iris)
+ dbscan(self.iris)
def test_predict_table(self):
- dbscan = DBSCAN()
- c = dbscan(self.iris)
- table = self.iris[:20]
- p = c(table)
+ pred = self.dbscan(self.iris)
+ self.assertEqual(np.ndarray, type(pred))
def test_predict_numpy(self):
- dbscan = DBSCAN()
- c = dbscan(self.iris)
- X = self.iris.X[::20]
- p = c(X)
-
- def test_values(self):
- dbscan = DBSCAN(eps=1) # it clusters data in two classes
- c = dbscan(self.iris)
- table = self.iris
- p = c(table)
-
- self.assertEqual(2, len(p.domain[0].values))
- self.assertSetEqual({"0", "1"}, set(p.domain[0].values))
-
- table.X[0] = [100, 100, 100, 100] # we add a big outlier
-
- p = c(table)
-
- self.assertEqual(3, len(p.domain[0].values))
- self.assertSetEqual({"-1", "0", "1"}, set(p.domain[0].values))
+ model = self.dbscan.fit(self.iris.X)
+ self.assertEqual(np.ndarray, type(model.labels))
diff --git a/Orange/tests/test_clustering_kmeans.py b/Orange/tests/test_clustering_kmeans.py
index ae2dc82eb1e..e406b8df204 100644
--- a/Orange/tests/test_clustering_kmeans.py
+++ b/Orange/tests/test_clustering_kmeans.py
@@ -11,55 +11,32 @@
class TestKMeans(unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- cls.iris = Orange.data.Table('iris')
+ def setUp(self):
+ self.kmeans = KMeans(n_clusters=2)
+ self.iris = Orange.data.Table('iris')
def test_kmeans(self):
- kmeans = KMeans(n_clusters=2)
- c = kmeans(self.iris)
- X = self.iris.X[:20]
- p = c(X)
+ c = self.kmeans(self.iris)
# First 20 iris belong to one cluster
- assert len(set(p.ravel())) == 1
+ self.assertEqual(1, len(set(c[:20].ravel())))
def test_kmeans_parameters(self):
- kmeans = KMeans(n_clusters=10,
- max_iter=10,
- random_state=42,
- tol=0.001,
- init='random',
- compute_silhouette_score=True)
- c = kmeans(self.iris)
-
- def test_predict_single_instance(self):
- kmeans = KMeans()
- c = kmeans(self.iris)
- inst = self.iris[0]
- p = c(inst)
+ kmeans = KMeans(n_clusters=10, max_iter=10, random_state=42, tol=0.001,
+ init='random')
+ kmeans(self.iris)
def test_predict_table(self):
kmeans = KMeans()
c = kmeans(self.iris)
- table = self.iris[:20]
- p = c(table)
+ self.assertEqual(np.ndarray, type(c))
def test_predict_numpy(self):
kmeans = KMeans()
- c = kmeans(self.iris)
- X = self.iris.X[::20]
- p = c(X)
+ c = kmeans.fit(self.iris.X)
+ self.assertEqual(np.ndarray, type(c.labels))
def test_predict_sparse(self):
kmeans = KMeans()
+ self.iris.X = csc_matrix(self.iris.X[::20])
c = kmeans(self.iris)
- X = csc_matrix(self.iris.X[::20])
- p = c(X)
-
- def test_silhouette_sparse(self):
- """Test if silhouette gets calculated for sparse data"""
- kmeans = KMeans(compute_silhouette_score=True)
- sparse_iris = self.iris.copy()
- sparse_iris.X = csc_matrix(sparse_iris.X)
- c = kmeans(sparse_iris)
- self.assertFalse(np.isnan(c.silhouette))
+ self.assertEqual(np.ndarray, type(c))
diff --git a/Orange/tests/test_louvain.py b/Orange/tests/test_louvain.py
index 2a012889482..e1c192f1eac 100644
--- a/Orange/tests/test_louvain.py
+++ b/Orange/tests/test_louvain.py
@@ -8,19 +8,11 @@
from Orange.clustering.louvain import Louvain
-class TestSVMLearner(unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- cls.data = Table('iris')
- cls.louvain = Louvain()
+class TestLouvain(unittest.TestCase):
+ def setUp(self):
+ self.data = Table('iris')
+ self.louvain = Louvain()
def test_orange_table(self):
- self.assertIsNone(self.louvain.fit(self.data))
- clusters = self.louvain.fit_predict(self.data)
- self.assertIn(type(clusters), [list, np.ndarray])
-
- def test_np_array(self):
- data_np = self.data.X
- self.assertIsNone(self.louvain.fit(data_np))
- clusters = self.louvain.fit_predict(data_np)
- self.assertIn(type(clusters), [list, np.ndarray])
+ labels = self.louvain(self.data)
+ self.assertEqual(np.ndarray, type(labels))
From 27634c5f3ce815c31f400fabd4fca86251502418 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?=
Date: Tue, 11 Jun 2019 16:28:46 +0200
Subject: [PATCH 4/5] Clustering: Deprecate silhouette in kmeans
---
Orange/clustering/kmeans.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/Orange/clustering/kmeans.py b/Orange/clustering/kmeans.py
index 26957535050..97ba7d0e8a1 100644
--- a/Orange/clustering/kmeans.py
+++ b/Orange/clustering/kmeans.py
@@ -33,7 +33,9 @@ def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300,
"compute_silhouette_score is deprecated. Please use "
"sklearn.metrics.silhouette_score to compute silhouettes.",
DeprecationWarning)
- super().__init__(preprocessors, vars())
+ super().__init__(
+ preprocessors, {k: v for k, v in vars().items()
+ if k != "compute_silhouette_score"})
if __name__ == "__main__":
From 2d6b629e8413becb3490cfe8912782b0603f1fe1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?=
Date: Mon, 17 Jun 2019 13:46:00 +0200
Subject: [PATCH 5/5] Clustering: Additional tests for clustering methods
---
Orange/tests/test_clustering_dbscan.py | 70 ++++++++++++-
Orange/tests/test_clustering_kmeans.py | 128 ++++++++++++++++++++++--
Orange/tests/test_clustering_louvain.py | 120 ++++++++++++++++++++++
Orange/tests/test_louvain.py | 18 ----
4 files changed, 307 insertions(+), 29 deletions(-)
create mode 100644 Orange/tests/test_clustering_louvain.py
delete mode 100644 Orange/tests/test_louvain.py
diff --git a/Orange/tests/test_clustering_dbscan.py b/Orange/tests/test_clustering_dbscan.py
index 14e107d4ba5..3286f5a714d 100644
--- a/Orange/tests/test_clustering_dbscan.py
+++ b/Orange/tests/test_clustering_dbscan.py
@@ -4,7 +4,9 @@
import unittest
import numpy as np
+from scipy.sparse import csc_matrix, csr_matrix
+from Orange.clustering.clustering import ClusteringModel
from Orange.data import Table
from Orange.clustering.dbscan import DBSCAN
@@ -14,15 +16,81 @@ def setUp(self):
self.iris = Table('iris')
self.dbscan = DBSCAN()
+ def test_dbscan(self):
+ c = self.dbscan(self.iris)
+ # First 20 iris belong to one cluster
+ self.assertEqual(np.ndarray, type(c))
+ self.assertEqual(len(self.iris), len(c))
+ self.assertEqual(1, len(set(c[:20].ravel())))
+
def test_dbscan_parameters(self):
dbscan = DBSCAN(eps=0.1, min_samples=7, metric='euclidean',
algorithm='auto', leaf_size=12, p=None)
- dbscan(self.iris)
+ c = dbscan(self.iris)
+ self.assertEqual(np.ndarray, type(c))
+ self.assertEqual(len(self.iris), len(c))
def test_predict_table(self):
pred = self.dbscan(self.iris)
self.assertEqual(np.ndarray, type(pred))
+ self.assertEqual(len(self.iris), len(pred))
def test_predict_numpy(self):
model = self.dbscan.fit(self.iris.X)
+ self.assertEqual(ClusteringModel, type(model))
self.assertEqual(np.ndarray, type(model.labels))
+ self.assertEqual(len(self.iris), len(model.labels))
+
+ def test_predict_sparse_csc(self):
+ self.iris.X = csc_matrix(self.iris.X[::20])
+ c = self.dbscan(self.iris)
+ self.assertEqual(np.ndarray, type(c))
+ self.assertEqual(len(self.iris), len(c))
+
+ def test_predict_spares_csr(self):
+ self.iris.X = csr_matrix(self.iris.X[::20])
+ c = self.dbscan(self.iris)
+ self.assertEqual(np.ndarray, type(c))
+ self.assertEqual(len(self.iris), len(c))
+
+ def test_model(self):
+ c = self.dbscan.get_model(self.iris)
+ self.assertEqual(ClusteringModel, type(c))
+ self.assertEqual(len(self.iris), len(c.labels))
+
+ self.assertRaises(NotImplementedError, c, self.iris)
+
+ def test_model_np(self):
+ """
+ Test with numpy array as an input in model.
+ """
+ c = self.dbscan.get_model(self.iris)
+ self.assertRaises(NotImplementedError, c, self.iris.X)
+
+ def test_model_sparse(self):
+ """
+ Test with sparse array as an input in model.
+ """
+ c = self.dbscan.get_model(self.iris)
+ self.assertRaises(NotImplementedError, c, csr_matrix(self.iris.X))
+
+ def test_model_instance(self):
+ """
+ Test with instance as an input in model.
+ """
+ c = self.dbscan.get_model(self.iris)
+ self.assertRaises(NotImplementedError, c, self.iris[0])
+
+ def test_model_list(self):
+ """
+ Test with list as an input in model.
+ """
+ c = self.dbscan.get_model(self.iris)
+ self.assertRaises(NotImplementedError, c, self.iris.X.tolist())
+
+ def test_model_bad_datatype(self):
+ """
+ Check model with data-type that is not supported.
+ """
+ c = self.dbscan.get_model(self.iris)
+ self.assertRaises(TypeError, c, 10)
diff --git a/Orange/tests/test_clustering_kmeans.py b/Orange/tests/test_clustering_kmeans.py
index e406b8df204..7ff40d94992 100644
--- a/Orange/tests/test_clustering_kmeans.py
+++ b/Orange/tests/test_clustering_kmeans.py
@@ -2,12 +2,15 @@
# pylint: disable=missing-docstring
import unittest
+import warnings
import numpy as np
-from scipy.sparse import csc_matrix
+from scipy.sparse import csc_matrix, csr_matrix
import Orange
-from Orange.clustering.kmeans import KMeans
+from Orange.clustering.kmeans import KMeans, KMeansModel
+from Orange.data import Table, Domain, ContinuousVariable
+from Orange.data.table import DomainTransformationError
class TestKMeans(unittest.TestCase):
@@ -18,25 +21,130 @@ def setUp(self):
def test_kmeans(self):
c = self.kmeans(self.iris)
# First 20 iris belong to one cluster
+ self.assertEqual(np.ndarray, type(c))
+ self.assertEqual(len(self.iris), len(c))
self.assertEqual(1, len(set(c[:20].ravel())))
def test_kmeans_parameters(self):
kmeans = KMeans(n_clusters=10, max_iter=10, random_state=42, tol=0.001,
init='random')
- kmeans(self.iris)
+ c = kmeans(self.iris)
+ self.assertEqual(np.ndarray, type(c))
+ self.assertEqual(len(self.iris), len(c))
def test_predict_table(self):
- kmeans = KMeans()
- c = kmeans(self.iris)
+ c = self.kmeans(self.iris)
self.assertEqual(np.ndarray, type(c))
+ self.assertEqual(len(self.iris), len(c))
def test_predict_numpy(self):
- kmeans = KMeans()
- c = kmeans.fit(self.iris.X)
+ c = self.kmeans.fit(self.iris.X)
+ self.assertEqual(KMeansModel, type(c))
self.assertEqual(np.ndarray, type(c.labels))
+ self.assertEqual(len(self.iris), len(c.labels))
- def test_predict_sparse(self):
- kmeans = KMeans()
+ def test_predict_sparse_csc(self):
self.iris.X = csc_matrix(self.iris.X[::20])
- c = kmeans(self.iris)
+ c = self.kmeans(self.iris)
self.assertEqual(np.ndarray, type(c))
+ self.assertEqual(len(self.iris), len(c))
+
+ def test_predict_spares_csr(self):
+ self.iris.X = csr_matrix(self.iris.X[::20])
+ c = self.kmeans(self.iris)
+ self.assertEqual(np.ndarray, type(c))
+ self.assertEqual(len(self.iris), len(c))
+
+ def test_model(self):
+ c = self.kmeans.get_model(self.iris)
+ self.assertEqual(KMeansModel, type(c))
+ self.assertEqual(len(self.iris), len(c.labels))
+
+ c1 = c(self.iris)
+ # prediction of the model must be same since data are same
+ np.testing.assert_array_almost_equal(c.labels, c1)
+
+ def test_model_np(self):
+ """
+ Test with numpy array as an input in model.
+ """
+ c = self.kmeans.get_model(self.iris)
+ c1 = c(self.iris.X)
+ # prediction of the model must be same since data are same
+ np.testing.assert_array_almost_equal(c.labels, c1)
+
+ def test_model_sparse_csc(self):
+ """
+ Test with sparse array as an input in model.
+ """
+ c = self.kmeans.get_model(self.iris)
+ c1 = c(csc_matrix(self.iris.X))
+ # prediction of the model must be same since data are same
+ np.testing.assert_array_almost_equal(c.labels, c1)
+
+ def test_model_sparse_csr(self):
+ """
+ Test with sparse array as an input in model.
+ """
+ c = self.kmeans.get_model(self.iris)
+ c1 = c(csr_matrix(self.iris.X))
+ # prediction of the model must be same since data are same
+ np.testing.assert_array_almost_equal(c.labels, c1)
+
+ def test_model_instance(self):
+ """
+ Test with instance as an input in model.
+ """
+ c = self.kmeans.get_model(self.iris)
+ c1 = c(self.iris[0])
+ # prediction of the model must be same since data are same
+ self.assertEqual(c1, c.labels[0])
+
+ def test_model_list(self):
+ """
+ Test with list as an input in model.
+ """
+ c = self.kmeans.get_model(self.iris)
+ c1 = c(self.iris.X.tolist())
+ # prediction of the model must be same since data are same
+ np.testing.assert_array_almost_equal(c.labels, c1)
+
+ # example with a list of only one data item
+ c1 = c(self.iris.X.tolist()[0])
+ # prediction of the model must be same since data are same
+ np.testing.assert_array_almost_equal(c.labels[0], c1)
+
+ def test_model_bad_datatype(self):
+ """
+ Check model with data-type that is not supported.
+ """
+ c = self.kmeans.get_model(self.iris)
+ self.assertRaises(TypeError, c, 10)
+
+ def test_model_data_table_domain(self):
+ """
+ Check model on tables with an extended and with an unrelated domain.
+ """
+ # ok domain
+ data = Table(Domain(
+ list(self.iris.domain.attributes) + [ContinuousVariable("a")]),
+ np.concatenate((self.iris.X, np.ones((len(self.iris), 1))), axis=1))
+ c = self.kmeans.get_model(self.iris)
+ res = c(data)
+ np.testing.assert_array_almost_equal(c.labels, res)
+
+ # totally different domain - should fail
+ self.assertRaises(DomainTransformationError, c, Table("housing"))
+
+ def test_deprecated_silhouette(self):
+ with warnings.catch_warnings(record=True) as w:
+ KMeans(compute_silhouette_score=True)
+
+ assert len(w) == 1
+ assert issubclass(w[-1].category, DeprecationWarning)
+
+ with warnings.catch_warnings(record=True) as w:
+ KMeans(compute_silhouette_score=False)
+
+ assert len(w) == 1
+ assert issubclass(w[-1].category, DeprecationWarning)
diff --git a/Orange/tests/test_clustering_louvain.py b/Orange/tests/test_clustering_louvain.py
new file mode 100644
index 00000000000..a65ba4a8edf
--- /dev/null
+++ b/Orange/tests/test_clustering_louvain.py
@@ -0,0 +1,120 @@
+# Test methods with long descriptive names can omit docstrings
+# pylint: disable=missing-docstring
+
+import unittest
+
+import numpy as np
+import networkx
+from scipy.sparse import csc_matrix, csr_matrix
+
+from Orange.clustering.clustering import ClusteringModel
+from Orange.clustering.louvain import matrix_to_knn_graph
+from Orange.data import Table
+from Orange.clustering.louvain import Louvain
+
+
+class TestLouvain(unittest.TestCase):
+ def setUp(self):
+ self.iris = Table('iris')
+ self.louvain = Louvain()
+
+ def test_louvain(self):
+ c = self.louvain(self.iris)
+ # First 20 iris belong to one cluster
+ self.assertEqual(np.ndarray, type(c))
+ self.assertEqual(len(self.iris), len(c))
+ self.assertEqual(1, len(set(c[:20].ravel())))
+
+ def test_louvain_parameters(self):
+ louvain = Louvain(
+ k_neighbors=3, resolution=1.2, random_state=42, metric="l2")
+ c = louvain(self.iris)
+ self.assertEqual(np.ndarray, type(c))
+ self.assertEqual(len(self.iris), len(c))
+
+ def test_predict_table(self):
+ c = self.louvain(self.iris)
+ self.assertEqual(np.ndarray, type(c))
+ self.assertEqual(len(self.iris), len(c))
+
+ def test_predict_numpy(self):
+ c = self.louvain.fit(self.iris.X)
+ self.assertEqual(ClusteringModel, type(c))
+ self.assertEqual(np.ndarray, type(c.labels))
+ self.assertEqual(len(self.iris), len(c.labels))
+
+ def test_predict_sparse_csc(self):
+ self.iris.X = csc_matrix(self.iris.X[::5])
+ c = self.louvain(self.iris)
+ self.assertEqual(np.ndarray, type(c))
+ self.assertEqual(len(self.iris), len(c))
+
+ def test_predict_spares_csr(self):
+ self.iris.X = csr_matrix(self.iris.X[::5])
+ c = self.louvain(self.iris)
+ self.assertEqual(np.ndarray, type(c))
+ self.assertEqual(len(self.iris), len(c))
+
+ def test_model(self):
+ c = self.louvain.get_model(self.iris)
+ self.assertEqual(ClusteringModel, type(c))
+ self.assertEqual(len(self.iris), len(c.labels))
+
+ self.assertRaises(NotImplementedError, c, self.iris)
+
+ def test_model_np(self):
+ """
+ Test with numpy array as an input in model.
+ """
+ c = self.louvain.get_model(self.iris)
+ self.assertRaises(NotImplementedError, c, self.iris.X)
+
+ def test_model_sparse(self):
+ """
+ Test with sparse array as an input in model.
+ """
+ c = self.louvain.get_model(self.iris)
+ self.assertRaises(NotImplementedError, c, csr_matrix(self.iris.X))
+
+ def test_model_instance(self):
+ """
+ Test with instance as an input in model.
+ """
+ c = self.louvain.get_model(self.iris)
+ self.assertRaises(NotImplementedError, c, self.iris[0])
+
+ def test_model_list(self):
+ """
+ Test with list as an input in model.
+ """
+ c = self.louvain.get_model(self.iris)
+ self.assertRaises(NotImplementedError, c, self.iris.X.tolist())
+
+ def test_graph(self):
+ """
+ Louvain accepts graphs too.
+ :return:
+ """
+ graph = matrix_to_knn_graph(self.iris.X, 30, "l2")
+ self.assertIsNotNone(graph)
+ self.assertEqual(networkx.Graph, type(graph), 1)
+
+ # basic clustering - get clusters
+ c = self.louvain(graph)
+ # First 20 iris belong to one cluster
+ self.assertEqual(np.ndarray, type(c))
+ self.assertEqual(len(self.iris), len(c))
+ self.assertEqual(1, len(set(c[:20].ravel())))
+
+ # clustering - get model
+ c = self.louvain.get_model(graph)
+ # the model must hold one label for each data instance
+ self.assertEqual(ClusteringModel, type(c))
+ self.assertEqual(len(self.iris), len(c.labels))
+
+ def test_model_bad_datatype(self):
+ """
+ Check model with data-type that is not supported.
+ """
+ c = self.louvain.get_model(self.iris)
+ self.assertRaises(TypeError, c, 10)
diff --git a/Orange/tests/test_louvain.py b/Orange/tests/test_louvain.py
deleted file mode 100644
index e1c192f1eac..00000000000
--- a/Orange/tests/test_louvain.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Test methods with long descriptive names can omit docstrings
-# pylint: disable=missing-docstring
-
-import unittest
-import numpy as np
-
-from Orange.data import Table
-from Orange.clustering.louvain import Louvain
-
-
-class TestLouvain(unittest.TestCase):
- def setUp(self):
- self.data = Table('iris')
- self.louvain = Louvain()
-
- def test_orange_table(self):
- labels = self.louvain(self.data)
- self.assertEqual(np.ndarray, type(labels))