From 0c8c15bb33db7699cf3f87fd6516de87be90493a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavlin=20Poli=C4=8Dar?= Date: Mon, 4 Feb 2019 17:21:10 +0100 Subject: [PATCH] OwLouvain: Enable normalization for sparse data --- .../unsupervised/owlouvainclustering.py | 54 ++++++++++++++---- .../unsupervised/tests/test_owlouvain.py | 56 +++++++++++++++++++ 2 files changed, 98 insertions(+), 12 deletions(-) diff --git a/Orange/widgets/unsupervised/owlouvainclustering.py b/Orange/widgets/unsupervised/owlouvainclustering.py index 9cbff27dc07..3ddc4c68fa4 100644 --- a/Orange/widgets/unsupervised/owlouvainclustering.py +++ b/Orange/widgets/unsupervised/owlouvainclustering.py @@ -17,6 +17,7 @@ from Orange.clustering.louvain import table_to_knn_graph, Louvain from Orange.data import Table, DiscreteVariable from Orange.data.util import get_unique_names +from Orange import preprocess from Orange.projection import PCA from Orange.widgets import widget, gui, report from Orange.widgets.settings import DomainContextHandler, ContextSetting, \ @@ -66,6 +67,7 @@ class Outputs: apply_pca = ContextSetting(True) pca_components = ContextSetting(_DEFAULT_PCA_COMPONENTS) + normalize = ContextSetting(True) metric_idx = ContextSetting(0) k_neighbors = ContextSetting(_DEFAULT_K_NEIGHBORS) resolution = ContextSetting(1.) 
@@ -101,13 +103,17 @@ def __init__(self): info_box = gui.vBox(self.controlArea, "Info") self.info_label = gui.widgetLabel(info_box, "No data on input.") # type: QLabel - pca_box = gui.vBox(self.controlArea, "PCA Preprocessing") + preprocessing_box = gui.vBox(self.controlArea, "Preprocessing") + self.normalize_cbx = gui.checkBox( + preprocessing_box, self, "normalize", label="Normalize data", + callback=self._invalidate_preprocessed_data, + ) # type: QCheckBox self.apply_pca_cbx = gui.checkBox( - pca_box, self, "apply_pca", label="Apply PCA preprocessing", + preprocessing_box, self, "apply_pca", label="Apply PCA preprocessing", callback=self._invalidate_graph, ) # type: QCheckBox self.pca_components_slider = gui.hSlider( - pca_box, self, "pca_components", label="Components: ", minValue=2, + preprocessing_box, self, "pca_components", label="PCA Components: ", minValue=2, maxValue=_MAX_PCA_COMPONENTS, callback=self._invalidate_pca_projection, tracking=False ) # type: QSlider @@ -139,6 +145,14 @@ def __init__(self): callback=lambda: self._on_auto_commit_changed(), ) # type: QWidget + def _invalidate_preprocessed_data(self): + self.preprocessed_data = None + self._invalidate_pca_projection() + # If we don't apply PCA, this still invalidates the graph, otherwise + # this change won't be propagated further + if not self.apply_pca: + self._invalidate_graph() + def _invalidate_pca_projection(self): self.pca_projection = None if not self.apply_pca: @@ -215,8 +229,11 @@ def commit(self): # Preprocess the dataset if self.preprocessed_data is None: - louvain = Louvain(random_state=0) - self.preprocessed_data = louvain.preprocess(self.data) + if self.normalize: + normalizer = preprocess.Normalize(center=False) + self.preprocessed_data = normalizer(self.data) + else: + self.preprocessed_data = self.data state = TaskState(self) @@ -243,8 +260,8 @@ def commit(self): if graph is None: task = partial( run_on_data, data, pca_components=pca_components, - k_neighbors=k_neighbors, 
metric=metric, - resolution=self.resolution, state=state + normalize=self.normalize, k_neighbors=k_neighbors, + metric=metric, resolution=self.resolution, state=state, ) else: task = partial( @@ -381,6 +398,7 @@ def _send_data(self): @Inputs.data def set_data(self, data): + # pylint: disable=too-many-branches self.closeContext() self.Error.clear() @@ -439,6 +457,7 @@ def send_report(self): pca += report.plural(", {number} component{s}", self.pca_components) self.report_items(( + ("Normalize data", report.bool_str(self.normalize)), ("PCA preprocessing", pca), ("Metric", METRICS[self.metric_idx][0]), ("k neighbors", self.k_neighbors), @@ -520,6 +539,7 @@ class InteruptRequested(BaseException): class Results(namespace): pca_projection = None # type: Optional[Table] pca_components = None # type: Optional[int] + normalize = None # type: Optional[bool] k_neighbors = None # type: Optional[int] metric = None # type: Optional[str] graph = None # type: Optional[nx.Graph] @@ -527,8 +547,8 @@ class Results(namespace): partition = None # type: Optional[np.ndarray] -def run_on_data(data, pca_components, k_neighbors, metric, resolution, state): - # type: (Table, Optional[int], int, str, float, TaskState) -> Results +def run_on_data(data, normalize, pca_components, k_neighbors, metric, resolution, state): + # type: (Table, bool, Optional[int], int, str, float, TaskState) -> Results """ Run the louvain clustering on `data`. @@ -539,6 +559,8 @@ def run_on_data(data, pca_components, k_neighbors, metric, resolution, state): ---------- data : Table Data table + normalize : bool + If `True`, the data is first normalized before computing PCA. pca_components : Optional[int] If not `None` then the data is first projected onto first `pca_components` principal components. 
@@ -556,16 +578,18 @@ def run_on_data(data, pca_components, k_neighbors, metric, resolution, state): """ state = state # type: TaskState res = Results( - pca_components=pca_components, k_neighbors=k_neighbors, metric=metric, - resolution=resolution, + normalize=normalize, pca_components=pca_components, + k_neighbors=k_neighbors, metric=metric, resolution=resolution, ) step = 0 if state.is_interuption_requested(): return res + if pca_components is not None: steps = 3 state.set_status("Computing PCA...") pca = PCA(n_components=pca_components, random_state=0) + data = res.pca_projection = pca(data)(data) assert isinstance(data, Table) state.set_partial_results(("pca_projection", res.pca_projection)) @@ -579,6 +603,13 @@ def run_on_data(data, pca_components, k_neighbors, metric, resolution, state): state.set_progress_value(100. * step / steps) state.set_status("Building graph...") + # Apply Louvain preprocessing before converting the table into a graph + louvain = Louvain(resolution=resolution, random_state=0) + data = louvain.preprocess(data) + + if state.is_interuption_requested(): + return res + def pcallback(val): state.set_progress_value((100. 
* step + 100 * val) / steps) if state.is_interuption_requested(): @@ -600,7 +631,6 @@ def pcallback(val): if state.is_interuption_requested(): return res - louvain = Louvain(resolution=resolution, random_state=0) res.partition = louvain.fit_predict(graph) state.set_partial_results(("partition", res.partition)) return res diff --git a/Orange/widgets/unsupervised/tests/test_owlouvain.py b/Orange/widgets/unsupervised/tests/test_owlouvain.py index b686ca37e1e..ee3b7bc1c67 100644 --- a/Orange/widgets/unsupervised/tests/test_owlouvain.py +++ b/Orange/widgets/unsupervised/tests/test_owlouvain.py @@ -3,8 +3,11 @@ import numpy as np from Orange.data import Table, Domain, ContinuousVariable +from Orange.preprocess import Normalize from Orange.widgets.tests.base import WidgetTest +from Orange.widgets.tests.utils import table_dense_sparse from Orange.widgets.unsupervised.owlouvainclustering import OWLouvainClustering +from sklearn.utils import check_random_state # Deterministic tests np.random.seed(42) @@ -178,3 +181,56 @@ def test_deterministic_clustering(self): # Ensure that clustering was the same in both instances np.testing.assert_equal(result1.metas, result2.metas) + + @table_dense_sparse + def test_normalize_data(self, prepare_table): + """Check that normalization is called at the proper times.""" + data = prepare_table(self.iris) + + # Enable checkbox + self.widget.controls.normalize.setChecked(True) + self.assertTrue(self.widget.controls.normalize.isChecked()) + with patch("Orange.preprocess.Normalize", wraps=Normalize) as normalize: + self.send_signal(self.widget.Inputs.data, data) + self.wait_until_stop_blocking() + self.assertTrue(self.widget.controls.normalize.isEnabled()) + normalize.assert_called_once() + + # Disable checkbox + self.widget.controls.normalize.setChecked(False) + self.assertFalse(self.widget.controls.normalize.isChecked()) + with patch("Orange.preprocess.Normalize", wraps=Normalize) as normalize: + self.send_signal(self.widget.Inputs.data, data) + 
self.wait_until_stop_blocking() + self.assertTrue(self.widget.controls.normalize.isEnabled()) + normalize.assert_not_called() + + def test_dense_and_sparse_return_same_result(self): + """Check that Louvain clustering returns identical results for both + dense and sparse data.""" + random_state = check_random_state(42) + + # Randomly set some values to zero + dense_data = self.iris + mask = random_state.beta(1, 2, size=self.iris.X.shape) > 0.5 + dense_data.X[mask] = 0 + sparse_data = dense_data.to_sparse() + + def _compute_clustering(data): + self.send_signal(self.widget.Inputs.data, data) + self.wait_until_stop_blocking() + result = self.get_output(self.widget.Outputs.annotated_data) + self.send_signal(self.widget.Inputs.data, None) + return result + + # Disable normalization + self.widget.controls.normalize.setChecked(False) + dense_result = _compute_clustering(dense_data) + sparse_result = _compute_clustering(sparse_data) + np.testing.assert_equal(dense_result.metas, sparse_result.metas) + + # Enable normalization + self.widget.controls.normalize.setChecked(True) + dense_result = _compute_clustering(dense_data) + sparse_result = _compute_clustering(sparse_data) + np.testing.assert_equal(dense_result.metas, sparse_result.metas)