Skip to content

Commit

Permalink
OwLouvain: Enable normalization for sparse data
Browse files Browse the repository at this point in the history
  • Loading branch information
pavlin-policar committed Feb 4, 2019
1 parent 495dfa8 commit 0c8c15b
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 12 deletions.
54 changes: 42 additions & 12 deletions Orange/widgets/unsupervised/owlouvainclustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from Orange.clustering.louvain import table_to_knn_graph, Louvain
from Orange.data import Table, DiscreteVariable
from Orange.data.util import get_unique_names
from Orange import preprocess
from Orange.projection import PCA
from Orange.widgets import widget, gui, report
from Orange.widgets.settings import DomainContextHandler, ContextSetting, \
Expand Down Expand Up @@ -66,6 +67,7 @@ class Outputs:

apply_pca = ContextSetting(True)
pca_components = ContextSetting(_DEFAULT_PCA_COMPONENTS)
normalize = ContextSetting(True)
metric_idx = ContextSetting(0)
k_neighbors = ContextSetting(_DEFAULT_K_NEIGHBORS)
resolution = ContextSetting(1.)
Expand Down Expand Up @@ -101,13 +103,17 @@ def __init__(self):
info_box = gui.vBox(self.controlArea, "Info")
self.info_label = gui.widgetLabel(info_box, "No data on input.") # type: QLabel

pca_box = gui.vBox(self.controlArea, "PCA Preprocessing")
preprocessing_box = gui.vBox(self.controlArea, "Preprocessing")
self.normalize_cbx = gui.checkBox(
preprocessing_box, self, "normalize", label="Normalize data",
callback=self._invalidate_preprocessed_data,
) # type: QCheckBox
self.apply_pca_cbx = gui.checkBox(
pca_box, self, "apply_pca", label="Apply PCA preprocessing",
preprocessing_box, self, "apply_pca", label="Apply PCA preprocessing",
callback=self._invalidate_graph,
) # type: QCheckBox
self.pca_components_slider = gui.hSlider(
pca_box, self, "pca_components", label="Components: ", minValue=2,
preprocessing_box, self, "pca_components", label="PCA Components: ", minValue=2,
maxValue=_MAX_PCA_COMPONENTS,
callback=self._invalidate_pca_projection, tracking=False
) # type: QSlider
Expand Down Expand Up @@ -139,6 +145,14 @@ def __init__(self):
callback=lambda: self._on_auto_commit_changed(),
) # type: QWidget

def _invalidate_preprocessed_data(self):
    """Drop the cached preprocessed data and everything derived from it.

    Used as the callback of the "Normalize data" checkbox, so toggling
    normalization forces the downstream results to be recomputed.
    """
    self.preprocessed_data = None
    self._invalidate_pca_projection()
    if self.apply_pca:
        return
    # Without PCA the projection invalidation above does not propagate any
    # further, so the graph has to be invalidated explicitly here.
    self._invalidate_graph()

def _invalidate_pca_projection(self):
self.pca_projection = None
if not self.apply_pca:
Expand Down Expand Up @@ -215,8 +229,11 @@ def commit(self):

# Preprocess the dataset
if self.preprocessed_data is None:
louvain = Louvain(random_state=0)
self.preprocessed_data = louvain.preprocess(self.data)
if self.normalize:
normalizer = preprocess.Normalize(center=False)
self.preprocessed_data = normalizer(self.data)
else:
self.preprocessed_data = self.data

state = TaskState(self)

Expand All @@ -243,8 +260,8 @@ def commit(self):
if graph is None:
task = partial(
run_on_data, data, pca_components=pca_components,
k_neighbors=k_neighbors, metric=metric,
resolution=self.resolution, state=state
normalize=self.normalize, k_neighbors=k_neighbors,
metric=metric, resolution=self.resolution, state=state,
)
else:
task = partial(
Expand Down Expand Up @@ -381,6 +398,7 @@ def _send_data(self):

@Inputs.data
def set_data(self, data):
# pylint: disable=too-many-branches
self.closeContext()
self.Error.clear()

Expand Down Expand Up @@ -439,6 +457,7 @@ def send_report(self):
pca += report.plural(", {number} component{s}", self.pca_components)

self.report_items((
("Normalize data", report.bool_str(self.normalize)),
("PCA preprocessing", pca),
("Metric", METRICS[self.metric_idx][0]),
("k neighbors", self.k_neighbors),
Expand Down Expand Up @@ -520,15 +539,16 @@ class InteruptRequested(BaseException):
class Results(namespace):
    """Container for the parameters and outputs of one clustering run.

    `run_on_data` fills in the parameters it was called with
    (`normalize`, `pca_components`, `k_neighbors`, `metric`,
    `resolution`) and then stores each intermediate result as it becomes
    available, so a run that is interrupted early returns an instance
    with the remaining fields still set to `None`.
    """
    # Output of the PCA step; left as None when PCA is not applied
    pca_projection = None  # type: Optional[Table]
    # Number of principal components requested (None disables PCA)
    pca_components = None  # type: Optional[int]
    # Value of the normalization setting the run was started with
    normalize = None  # type: Optional[bool]
    # Neighbourhood size requested for the graph
    k_neighbors = None  # type: Optional[int]
    # Name of the distance metric requested for the graph
    metric = None  # type: Optional[str]
    # Graph that is handed to `Louvain.fit_predict`
    # NOTE(review): presumably the k-NN graph built from the data - the
    # assignment is not visible in this excerpt, confirm against the file.
    graph = None  # type: Optional[nx.Graph]
    # Resolution parameter passed to `Louvain`
    resolution = None  # type: Optional[float]
    # Result of `Louvain.fit_predict` on `graph`
    partition = None  # type: Optional[np.ndarray]


def run_on_data(data, pca_components, k_neighbors, metric, resolution, state):
# type: (Table, Optional[int], int, str, float, TaskState) -> Results
def run_on_data(data, normalize, pca_components, k_neighbors, metric, resolution, state):
# type: (Table, bool, Optional[int], int, str, float, TaskState) -> Results
"""
Run the louvain clustering on `data`.
Expand All @@ -539,6 +559,8 @@ def run_on_data(data, pca_components, k_neighbors, metric, resolution, state):
----------
data : Table
Data table
normalize : bool
If `True`, the data is first normalized before computing PCA.
pca_components : Optional[int]
If not `None` then the data is first projected onto first
`pca_components` principal components.
Expand All @@ -556,16 +578,18 @@ def run_on_data(data, pca_components, k_neighbors, metric, resolution, state):
"""
state = state # type: TaskState
res = Results(
pca_components=pca_components, k_neighbors=k_neighbors, metric=metric,
resolution=resolution,
normalize=normalize, pca_components=pca_components,
k_neighbors=k_neighbors, metric=metric, resolution=resolution,
)
step = 0
if state.is_interuption_requested():
return res

if pca_components is not None:
steps = 3
state.set_status("Computing PCA...")
pca = PCA(n_components=pca_components, random_state=0)

data = res.pca_projection = pca(data)(data)
assert isinstance(data, Table)
state.set_partial_results(("pca_projection", res.pca_projection))
Expand All @@ -579,6 +603,13 @@ def run_on_data(data, pca_components, k_neighbors, metric, resolution, state):
state.set_progress_value(100. * step / steps)
state.set_status("Building graph...")

# Apply Louvain preprocessing before converting the table into a graph
louvain = Louvain(resolution=resolution, random_state=0)
data = louvain.preprocess(data)

if state.is_interuption_requested():
return res

def pcallback(val):
state.set_progress_value((100. * step + 100 * val) / steps)
if state.is_interuption_requested():
Expand All @@ -600,7 +631,6 @@ def pcallback(val):
if state.is_interuption_requested():
return res

louvain = Louvain(resolution=resolution, random_state=0)
res.partition = louvain.fit_predict(graph)
state.set_partial_results(("partition", res.partition))
return res
Expand Down
56 changes: 56 additions & 0 deletions Orange/widgets/unsupervised/tests/test_owlouvain.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
import numpy as np

from Orange.data import Table, Domain, ContinuousVariable
from Orange.preprocess import Normalize
from Orange.widgets.tests.base import WidgetTest
from Orange.widgets.tests.utils import table_dense_sparse
from Orange.widgets.unsupervised.owlouvainclustering import OWLouvainClustering
from sklearn.utils import check_random_state

# Deterministic tests
np.random.seed(42)
Expand Down Expand Up @@ -178,3 +181,56 @@ def test_deterministic_clustering(self):

# Ensure that clustering was the same in both instances
np.testing.assert_equal(result1.metas, result2.metas)

@table_dense_sparse
def test_normalize_data(self, prepare_table):
    """Verify that `Normalize` is only used while the checkbox is ticked."""
    table = prepare_table(self.iris)
    checkbox = self.widget.controls.normalize

    def send_data_with_spy():
        # `wraps=Normalize` keeps the real normalizer working while every
        # construction is recorded on the spy that is handed back.
        with patch("Orange.preprocess.Normalize", wraps=Normalize) as spy:
            self.send_signal(self.widget.Inputs.data, table)
            self.wait_until_stop_blocking()
            self.assertTrue(checkbox.isEnabled())
        return spy

    # Checkbox ticked: exactly one normalizer is created for the new data
    checkbox.setChecked(True)
    self.assertTrue(checkbox.isChecked())
    send_data_with_spy().assert_called_once()

    # Checkbox cleared: normalization must be skipped entirely
    checkbox.setChecked(False)
    self.assertFalse(checkbox.isChecked())
    send_data_with_spy().assert_not_called()

def test_dense_and_sparse_return_same_result(self):
    """Dense and sparse versions of the same table must yield the same
    clustering, both with and without normalization."""
    rng = check_random_state(42)

    # Zero out a random subset of cells so that the sparse representation
    # actually differs in storage from the dense one.
    # NOTE(review): this writes into `self.iris` in place - confirm that
    # the fixture is rebuilt for every test.
    dense_table = self.iris
    zero_mask = rng.beta(1, 2, size=self.iris.X.shape) > 0.5
    dense_table.X[zero_mask] = 0
    sparse_table = dense_table.to_sparse()

    def cluster(table):
        self.send_signal(self.widget.Inputs.data, table)
        self.wait_until_stop_blocking()
        annotated = self.get_output(self.widget.Outputs.annotated_data)
        # Clear the input again before the next table is sent
        self.send_signal(self.widget.Inputs.data, None)
        return annotated

    # First with normalization disabled, then with it enabled
    for use_normalization in (False, True):
        self.widget.controls.normalize.setChecked(use_normalization)
        from_dense = cluster(dense_table)
        from_sparse = cluster(sparse_table)
        np.testing.assert_equal(from_dense.metas, from_sparse.metas)

0 comments on commit 0c8c15b

Please sign in to comment.