Skip to content

Commit

Permalink
OwLouvain: Add data normalization option to PCA preprocessing
Browse files Browse the repository at this point in the history
  • Loading branch information
pavlin-policar committed Feb 2, 2019
1 parent 40762e2 commit a7a3b19
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 7 deletions.
42 changes: 35 additions & 7 deletions Orange/widgets/unsupervised/owlouvainclustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from Orange.clustering.louvain import table_to_knn_graph, Louvain
from Orange.data import Table, DiscreteVariable
from Orange.data.util import get_unique_names
from Orange import preprocess
from Orange.projection import PCA
from Orange.widgets import widget, gui, report
from Orange.widgets.settings import DomainContextHandler, ContextSetting, \
Expand Down Expand Up @@ -66,6 +67,7 @@ class Outputs:

apply_pca = ContextSetting(True)
pca_components = ContextSetting(_DEFAULT_PCA_COMPONENTS)
pca_normalize = ContextSetting(True)
metric_idx = ContextSetting(0)
k_neighbors = ContextSetting(_DEFAULT_K_NEIGHBORS)
resolution = ContextSetting(1.)
Expand Down Expand Up @@ -106,11 +108,18 @@ def __init__(self):
pca_box, self, "apply_pca", label="Apply PCA preprocessing",
callback=self._invalidate_graph,
) # type: QCheckBox
pca_params_box = gui.vBox(pca_box)
# Make the difference between "Apply PCA" and PCA specific parameters clear
pca_params_box.layout().setContentsMargins(10, 0, 0, 0)
self.pca_components_slider = gui.hSlider(
pca_box, self, "pca_components", label="Components: ", minValue=2,
pca_params_box, self, "pca_components", label="Components: ", minValue=2,
maxValue=_MAX_PCA_COMPONENTS,
callback=self._invalidate_pca_projection, tracking=False
) # type: QSlider
self.pca_normalize_cbx = gui.checkBox(
pca_params_box, self, "pca_normalize", label="Normalize data",
callback=self._invalidate_pca_projection,
) # type: QCheckBox

graph_box = gui.vBox(self.controlArea, "Graph parameters")
self.metric_combo = gui.comboBox(
Expand Down Expand Up @@ -243,8 +252,8 @@ def commit(self):
if graph is None:
task = partial(
run_on_data, data, pca_components=pca_components,
k_neighbors=k_neighbors, metric=metric,
resolution=self.resolution, state=state
pca_normalize=self.pca_normalize, k_neighbors=k_neighbors,
metric=metric, resolution=self.resolution, state=state,
)
else:
task = partial(
Expand Down Expand Up @@ -414,6 +423,17 @@ def set_data(self, data):

self.info_label.setText("Clustering not yet run.")

self.pca_normalize_cbx.setDisabled(data.is_sparse())
# PCA doesn't support normalization on sparse data, as this would
# require centering and normalizing the matrix
if data.is_sparse():
self.pca_normalize_cbx.setToolTip(
"Data normalization is not supported on sparse matrices."
)
self.pca_normalize = False
else:
self.pca_normalize_cbx.setToolTip("")

self.commit()

def clear(self):
Expand Down Expand Up @@ -520,15 +540,18 @@ class InteruptRequested(BaseException):
class Results(namespace):
    """
    Parameters and (partial) results of one Louvain clustering run.

    `run_on_data` creates an instance from the parameters it was called with
    and fills in the remaining fields as the computation progresses. Fields
    that have not been computed are left as `None`.
    """
    # PCA-projected data; set from the output of the fitted PCA model
    pca_projection = None  # type: Optional[Table]
    # Number of principal components requested for the projection
    pca_components = None  # type: Optional[int]
    # Whether the data was normalized before computing PCA
    pca_normalize = None  # type: Optional[bool]
    # Number of neighbors; documented as passed to `table_to_knn_graph`
    k_neighbors = None  # type: Optional[int]
    # Name of the distance metric
    metric = None  # type: Optional[str]
    # NOTE(review): presumably the k-NN graph built from the data -- confirm
    graph = None  # type: Optional[nx.Graph]
    # Resolution parameter of the clustering run
    resolution = None  # type: Optional[float]
    # NOTE(review): presumably the per-instance cluster assignment -- confirm
    partition = None  # type: Optional[np.ndarray]


def run_on_data(data, pca_components, k_neighbors, metric, resolution, state):
# type: (Table, Optional[int], int, str, float, TaskState) -> Results
def run_on_data(
data, pca_components, pca_normalize, k_neighbors, metric, resolution, state
):
# type: (Table, Optional[int], bool, int, str, float, TaskState) -> Results
"""
Run the louvain clustering on `data`.
Expand All @@ -542,6 +565,8 @@ def run_on_data(data, pca_components, k_neighbors, metric, resolution, state):
pca_components : Optional[int]
If not `None` then the data is first projected onto first
`pca_components` principal components.
pca_normalize : bool
If `True`, the data is normalized before computing PCA.
k_neighbors : int
Passed to `table_to_knn_graph`
metric : str
Expand All @@ -556,8 +581,8 @@ def run_on_data(data, pca_components, k_neighbors, metric, resolution, state):
"""
state = state # type: TaskState
res = Results(
pca_components=pca_components, k_neighbors=k_neighbors, metric=metric,
resolution=resolution,
pca_components=pca_components, pca_normalize=pca_normalize,
k_neighbors=k_neighbors, metric=metric, resolution=resolution,
)
step = 0
if state.is_interuption_requested():
Expand All @@ -566,6 +591,9 @@ def run_on_data(data, pca_components, k_neighbors, metric, resolution, state):
steps = 3
state.set_status("Computing PCA...")
pca = PCA(n_components=pca_components, random_state=0)
if pca_normalize:
pca.preprocessors += (preprocess.Normalize(),)

data = res.pca_projection = pca(data)(data)
assert isinstance(data, Table)
state.set_partial_results(("pca_projection", res.pca_projection))
Expand Down
30 changes: 30 additions & 0 deletions Orange/widgets/unsupervised/tests/test_owlouvain.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np

from Orange.data import Table, Domain, ContinuousVariable
from Orange.preprocess import Normalize
from Orange.widgets.tests.base import WidgetTest
from Orange.widgets.unsupervised.owlouvainclustering import OWLouvainClustering

Expand Down Expand Up @@ -178,3 +179,32 @@ def test_deterministic_clustering(self):

# Ensure that clustering was the same in both instances
np.testing.assert_equal(result1.metas, result2.metas)

def test_normalize_data(self):
    """The "Normalize data" checkbox decides whether `Normalize` is used.

    Three situations are exercised in turn: the default (checked) state on
    dense data, the unchecked state on dense data, and sparse input, where
    the checkbox must be disabled and normalization skipped even though
    the box was checked beforehand.
    """
    normalize_cbx = self.widget.controls.pca_normalize
    data_input = self.widget.Inputs.data
    patch_target = "Orange.preprocess.Normalize"

    # Out of the box the option is expected to be switched on
    self.assertTrue(normalize_cbx.isChecked())
    with patch(patch_target, wraps=Normalize) as spy:
        self.send_signal(data_input, self.iris)
        self.wait_until_stop_blocking()
        self.assertTrue(normalize_cbx.isEnabled())
        spy.assert_called_once()

    # With the box unticked, the preprocessor must not be instantiated
    normalize_cbx.setChecked(False)
    self.assertFalse(normalize_cbx.isChecked())
    with patch(patch_target, wraps=Normalize) as spy:
        self.send_signal(data_input, self.iris)
        self.wait_until_stop_blocking()
        self.assertTrue(normalize_cbx.isEnabled())
        spy.assert_not_called()

    # Sparse input: tick the box again, then verify it is disabled and
    # that normalization is skipped regardless
    normalize_cbx.setChecked(True)
    self.assertTrue(normalize_cbx.isChecked())

    sparse_data = self.iris.to_sparse()
    with patch(patch_target, wraps=Normalize) as spy:
        self.send_signal(data_input, sparse_data)
        self.wait_until_stop_blocking()
        self.assertFalse(normalize_cbx.isEnabled())
        spy.assert_not_called()

0 comments on commit a7a3b19

Please sign in to comment.