Skip to content

Commit

Permalink
OWLouvain: Ensure deterministic clustering
Browse files Browse the repository at this point in the history
  • Loading branch information
pavlin-policar committed Dec 21, 2018
1 parent e5de908 commit a27031b
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 13 deletions.
47 changes: 37 additions & 10 deletions Orange/clustering/louvain.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

import numpy as np
import networkx as nx
# NOTE: The ``community`` package might be renamed in the near future, see
# GH issue https://github.com/taynaud/python-louvain/issues/23
from community import best_partition
from sklearn.neighbors import NearestNeighbors

Expand Down Expand Up @@ -53,18 +55,27 @@ def table_to_knn_graph(data, k_neighbors, metric, progress_callback=None):
if progress_callback:
progress_callback(idx / num_nodes)

for neighbour in nearest_neighbors[node]:
graph.add_edge(node, neighbour, weight=jaccard(
nearest_neighbors[node], nearest_neighbors[neighbour]))
for neighbor in nearest_neighbors[node]:
graph.add_edge(
node,
neighbor,
weight=jaccard(nearest_neighbors[node], nearest_neighbors[neighbor]),
)

return graph


class Louvain:
preprocessors = [Orange.preprocess.Continuize(),
Orange.preprocess.SklImpute()]

def __init__(self, k_neighbors=30, metric='l2', resolution=1., preprocessors=None):
preprocessors = [Orange.preprocess.Continuize(), Orange.preprocess.SklImpute()]

def __init__(
self,
k_neighbors=30,
metric="l2",
resolution=1.0,
random_state=None,
preprocessors=None,
):
"""Louvain clustering for community detection in graphs.
Louvain clustering is a community detection algorithm for detecting
Expand All @@ -77,12 +88,21 @@ def __init__(self, k_neighbors=30, metric='l2', resolution=1., preprocessors=Non
k_neighbors : Optional[int]
The number of nearest neighbors to use for the KNN graph if
tabular data is passed.
metric : Optional[str]
The metric to use to compute the nearest neighbors.
resolution : Optional[float]
The resolution is a parameter of the Louvain method that affects
the size of the recovered clusters.
random_state : Optional[Union[int, RandomState]]
The random state parameter follows the convention used in scikit-learn.
If the value is an int, random_state is the seed used by the random
number generator. If the value is a RandomState instance, then it will
be used as the random number generator. If the value is None, the random
number generator is the RandomState instance used by `np.random`.
"""
if preprocessors is None:
preprocessors = type(self).preprocessors
Expand All @@ -91,6 +111,7 @@ def __init__(self, k_neighbors=30, metric='l2', resolution=1., preprocessors=Non
self.k_neighbors = k_neighbors
self.metric = metric
self.resolution = resolution
self.random_state = random_state

self.labels = None

Expand All @@ -106,14 +127,20 @@ def preprocess(self, data):
def fit(self, X, y=None):
# If we are given a table, we have to convert it to a graph first
if isinstance(X, Table):
graph = table_to_knn_graph(X.X, metric=self.metric, k_neighbors=self.k_neighbors)
graph = table_to_knn_graph(
X.X, metric=self.metric, k_neighbors=self.k_neighbors
)
# Same goes for a matrix
elif isinstance(X, np.ndarray):
graph = table_to_knn_graph(X, metric=self.metric, k_neighbors=self.k_neighbors)
graph = table_to_knn_graph(
X, metric=self.metric, k_neighbors=self.k_neighbors
)
elif isinstance(X, nx.Graph):
graph = X

partition = best_partition(graph, resolution=self.resolution)
partition = best_partition(
graph, resolution=self.resolution, random_state=self.random_state
)
partition = np.fromiter(list(zip(*sorted(partition.items())))[1], dtype=int)

self.labels = partition
Expand Down
6 changes: 3 additions & 3 deletions Orange/widgets/unsupervised/owlouvainclustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ def commit(self):

# Preprocess the dataset
if self.preprocessed_data is None:
louvain = Louvain()
louvain = Louvain(random_state=0)
self.preprocessed_data = louvain.preprocess(self.data)

state = TaskState(self)
Expand Down Expand Up @@ -590,7 +590,7 @@ def pcallback(val):
if state.is_interuption_requested():
return res

louvain = Louvain(resolution=resolution)
louvain = Louvain(resolution=resolution, random_state=0)
res.partition = louvain.fit_predict(graph)
state.set_partial_results(("partition", res.partition))
return res
Expand All @@ -603,7 +603,7 @@ def run_on_graph(graph, resolution, state):
"""
state = state # type: TaskState
res = Results(resolution=resolution)
louvain = Louvain(resolution=resolution)
louvain = Louvain(resolution=resolution, random_state=0)
state.set_status("Detecting communities...")
if state.is_interuption_requested():
return res
Expand Down
17 changes: 17 additions & 0 deletions Orange/widgets/unsupervised/tests/test_owlouvain.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,3 +161,20 @@ def test_invalidate(self):
self.widget.commit()
self.get_output(self.widget.Outputs.annotated_data)
self.assertFalse(self.widget.Information.modified.is_shown())

def test_deterministic_clustering(self):
    """Clustering the same data twice must give identical results."""

    def cluster_iris():
        # Feed iris to the widget, run the clustering to completion and
        # return the annotated output table
        self.send_signal(self.widget.Inputs.data, self.iris)
        self.commit_and_wait()
        return self.get_output(self.widget.Outputs.annotated_data)

    first_run = cluster_iris()

    # Clear the input so that the second run starts from a reset widget
    self.send_signal(self.widget.Inputs.data, None)

    second_run = cluster_iris()

    # Both runs must have produced the same clustering
    np.testing.assert_equal(first_run.metas, second_run.metas)

0 comments on commit a27031b

Please sign in to comment.