From a27031b987ebf9dd875b95cede78ce3d5e4d09a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavlin=20Poli=C4=8Dar?= Date: Thu, 20 Dec 2018 16:39:26 +0100 Subject: [PATCH] OWLouvain: Ensure deterministic clustering --- Orange/clustering/louvain.py | 47 +++++++++++++++---- .../unsupervised/owlouvainclustering.py | 6 +-- .../unsupervised/tests/test_owlouvain.py | 17 +++++++ 3 files changed, 57 insertions(+), 13 deletions(-) diff --git a/Orange/clustering/louvain.py b/Orange/clustering/louvain.py index b565b1d1fbe..dfc4e0a2a47 100644 --- a/Orange/clustering/louvain.py +++ b/Orange/clustering/louvain.py @@ -8,6 +8,8 @@ import numpy as np import networkx as nx +# NOTE: The ``community`` package might be renamed in the near future, see +# GH issue https://github.com/taynaud/python-louvain/issues/23 from community import best_partition from sklearn.neighbors import NearestNeighbors @@ -53,18 +55,27 @@ def table_to_knn_graph(data, k_neighbors, metric, progress_callback=None): if progress_callback: progress_callback(idx / num_nodes) - for neighbour in nearest_neighbors[node]: - graph.add_edge(node, neighbour, weight=jaccard( - nearest_neighbors[node], nearest_neighbors[neighbour])) + for neighbor in nearest_neighbors[node]: + graph.add_edge( + node, + neighbor, + weight=jaccard(nearest_neighbors[node], nearest_neighbors[neighbor]), + ) return graph class Louvain: - preprocessors = [Orange.preprocess.Continuize(), - Orange.preprocess.SklImpute()] - - def __init__(self, k_neighbors=30, metric='l2', resolution=1., preprocessors=None): + preprocessors = [Orange.preprocess.Continuize(), Orange.preprocess.SklImpute()] + + def __init__( + self, + k_neighbors=30, + metric="l2", + resolution=1.0, + random_state=None, + preprocessors=None, + ): """Louvain clustering for community detection in graphs. 
Louvain clustering is a community detection algorithm for detecting @@ -77,12 +88,21 @@ def __init__(self, k_neighbors=30, metric='l2', resolution=1., preprocessors=Non k_neighbors : Optional[int] The number of nearest neighbors to use for the KNN graph if tabular data is passed. + metric : Optional[str] The metric to use to compute the nearest neighbors. + resolution : Optional[float] The resolution is a parameter of the Louvain method that affects the size of the recovered clusters. + random_state : Optional[Union[int, RandomState]] + The random state parameter follows the convention used in scikit-learn. + If the value is an int, random_state is the seed used by the random + number generator. If the value is a RandomState instance, then it will + be used as the random number generator. If the value is None, the random + number generator is the RandomState instance used by `np.random`. + """ if preprocessors is None: preprocessors = type(self).preprocessors @@ -91,6 +111,7 @@ def __init__(self, k_neighbors=30, metric='l2', resolution=1., preprocessors=Non self.k_neighbors = k_neighbors self.metric = metric self.resolution = resolution + self.random_state = random_state self.labels = None @@ -106,14 +127,20 @@ def preprocess(self, data): def fit(self, X, y=None): # If we are given a table, we have to convert it to a graph first if isinstance(X, Table): - graph = table_to_knn_graph(X.X, metric=self.metric, k_neighbors=self.k_neighbors) + graph = table_to_knn_graph( + X.X, metric=self.metric, k_neighbors=self.k_neighbors + ) # Same goes for a matrix elif isinstance(X, np.ndarray): - graph = table_to_knn_graph(X, metric=self.metric, k_neighbors=self.k_neighbors) + graph = table_to_knn_graph( + X, metric=self.metric, k_neighbors=self.k_neighbors + ) elif isinstance(X, nx.Graph): graph = X - partition = best_partition(graph, resolution=self.resolution) + partition = best_partition( + graph, resolution=self.resolution, random_state=self.random_state + ) partition = 
np.fromiter(list(zip(*sorted(partition.items())))[1], dtype=int) self.labels = partition diff --git a/Orange/widgets/unsupervised/owlouvainclustering.py b/Orange/widgets/unsupervised/owlouvainclustering.py index 1586f0371e0..af1221326d5 100644 --- a/Orange/widgets/unsupervised/owlouvainclustering.py +++ b/Orange/widgets/unsupervised/owlouvainclustering.py @@ -211,7 +211,7 @@ def commit(self): # Preprocess the dataset if self.preprocessed_data is None: - louvain = Louvain() + louvain = Louvain(random_state=0) self.preprocessed_data = louvain.preprocess(self.data) state = TaskState(self) @@ -590,7 +590,7 @@ def pcallback(val): if state.is_interuption_requested(): return res - louvain = Louvain(resolution=resolution) + louvain = Louvain(resolution=resolution, random_state=0) res.partition = louvain.fit_predict(graph) state.set_partial_results(("partition", res.partition)) return res @@ -603,7 +603,7 @@ def run_on_graph(graph, resolution, state): """ state = state # type: TaskState res = Results(resolution=resolution) - louvain = Louvain(resolution=resolution) + louvain = Louvain(resolution=resolution, random_state=0) state.set_status("Detecting communities...") if state.is_interuption_requested(): return res diff --git a/Orange/widgets/unsupervised/tests/test_owlouvain.py b/Orange/widgets/unsupervised/tests/test_owlouvain.py index b2fb929788c..b686ca37e1e 100644 --- a/Orange/widgets/unsupervised/tests/test_owlouvain.py +++ b/Orange/widgets/unsupervised/tests/test_owlouvain.py @@ -161,3 +161,20 @@ def test_invalidate(self): self.widget.commit() self.get_output(self.widget.Outputs.annotated_data) self.assertFalse(self.widget.Information.modified.is_shown()) + + def test_deterministic_clustering(self): + # Compute clustering on iris + self.send_signal(self.widget.Inputs.data, self.iris) + self.commit_and_wait() + result1 = self.get_output(self.widget.Outputs.annotated_data) + + # Reset widget state + self.send_signal(self.widget.Inputs.data, None) + + # Compute 
clustering on iris again + self.send_signal(self.widget.Inputs.data, self.iris) + self.commit_and_wait() + result2 = self.get_output(self.widget.Outputs.annotated_data) + + # Ensure that clustering was the same in both instances + np.testing.assert_equal(result1.metas, result2.metas)