Skip to content

Commit

Permalink
Merge pull request #3573 from pavlin-policar/louvain-pca-normalize
Browse files Browse the repository at this point in the history
[ENH] OwLouvain: Add normalize data checkbox to PCA preprocessing
  • Loading branch information
lanzagar authored Feb 15, 2019
2 parents a5b8de6 + fe28eaf commit 59228cf
Show file tree
Hide file tree
Showing 8 changed files with 209 additions and 44 deletions.
19 changes: 16 additions & 3 deletions Orange/preprocess/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,24 @@ class Normalizer(Reprable):
def __init__(self,
             zero_based=True,
             norm_type=Normalize.NormalizeBySD,
             transform_class=False,
             center=True):
    """Store the normalization settings.

    Parameters
    ----------
    zero_based : bool
        Stored as-is; NOTE(review): presumably consumed by the span-based
        normalization, which is not fully visible here -- confirm.
    norm_type
        Which normalization to apply; defaults to
        ``Normalize.NormalizeBySD``.
    transform_class : bool
        When true, ``__call__`` normalizes the class variables as well as
        the attributes.
    center : bool
        Read by ``normalize_by_sd`` to decide whether the mean is
        subtracted before scaling.
    """
    # The signature is terminated exactly once, after ``center``; the
    # earlier ``transform_class=False):`` terminator was a leftover of the
    # pre-change signature and made the definition invalid.
    self.zero_based = zero_based
    self.norm_type = norm_type
    self.transform_class = transform_class
    self.center = center

def __call__(self, data):
    """Return `data` transformed into a domain of normalized variables.

    Every attribute is replaced by the variable produced by
    ``self.normalize``; class variables are replaced too when
    ``self.transform_class`` is set. Metas are carried over unchanged.
    """
    dists = distribution.get_distributions(data)
    source_domain = data.domain

    new_attrs = [
        self.normalize(dists[idx], var)
        for idx, var in enumerate(source_domain.attributes)
    ]

    if self.transform_class:
        # The class-variable distributions are indexed right after the
        # attribute distributions, hence the offset.
        offset = len(source_domain.attributes)
        new_class_vars = [
            self.normalize(dists[idx], var)
            for idx, var in enumerate(source_domain.class_vars, start=offset)
        ]
    else:
        new_class_vars = source_domain.class_vars

    normalized_domain = Domain(new_attrs, new_class_vars, source_domain.metas)
    return data.transform(normalized_domain)

def normalize_by_sd(self, dist, var):
    """Build a variable that scales `var` by its standard deviation.

    Parameters
    ----------
    dist
        Distribution of `var`; its ``mean()`` and ``standard_deviation()``
        are used when ``dist.size`` is non-zero.
    var
        The variable being normalized; its name and ``sparse`` flag are
        copied to the result.

    Returns
    -------
    ContinuousVariable
        A variable named like `var` whose ``compute_value`` is a ``Norm``
        built from the chosen offset and ``1 / sd``.
    """
    # An empty distribution falls back to the identity parameters.
    avg, sd = (dist.mean(), dist.standard_deviation()) if dist.size else (0, 1)
    if sd == 0:
        # Constant column: avoid dividing by zero.
        sd = 1

    # Honour ``self.center``: previously an unconditional early return
    # always used ``avg`` and left this choice unreachable.
    # NOTE(review): presumably Norm subtracts the offset and multiplies by
    # the factor -- confirm against the Norm transformation.
    offset = avg if self.center else 0

    return ContinuousVariable(
        var.name,
        compute_value=Norm(var, offset, 1 / sd),
        sparse=var.sparse,
    )

def normalize_by_span(self, dist, var):
dma, dmi = dist.max(), dist.min()
Expand Down
15 changes: 13 additions & 2 deletions Orange/preprocess/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,8 @@ class Normalize(Preprocess):
Parameters
----------
zero_based : bool (default=True)
Only used when `norm_type=NormalizeBySpan`.
Determines the value used as the “low” value of the variable.
It determines the interval for normalized continuous variables
(either [-1, 1] or [0, 1]).
Expand All @@ -286,6 +288,11 @@ class Normalize(Preprocess):
transform_class : bool (default=False)
If True the class is normalized as well.
center : bool (default=True)
Only used when `norm_type=NormalizeBySD`.
Whether or not to center the data so it has mean zero.
Examples
--------
>>> from Orange.data import Table
Expand All @@ -301,10 +308,12 @@ class Normalize(Preprocess):
def __init__(self,
             zero_based=True,
             norm_type=NormalizeBySD,
             transform_class=False,
             center=True):
    """Store the normalization settings handed to the normalizer.

    The parameters are the ones described in the class docstring:
    `zero_based` applies to span normalization, `center` to
    standard-deviation normalization, and `transform_class` controls
    whether the class is normalized too.
    """
    # The signature is terminated exactly once, after ``center``; the
    # earlier ``transform_class=False):`` terminator was a leftover of the
    # pre-change signature and made the definition invalid.
    self.zero_based = zero_based
    self.norm_type = norm_type
    self.transform_class = transform_class
    self.center = center

def __call__(self, data):
"""
Expand Down Expand Up @@ -334,7 +343,9 @@ def __call__(self, data):
normalizer = normalize.Normalizer(
zero_based=self.zero_based,
norm_type=self.norm_type,
transform_class=self.transform_class)
transform_class=self.transform_class,
center=self.center,
)
return normalizer(data)


Expand Down
17 changes: 17 additions & 0 deletions Orange/statistics/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,23 @@
from sklearn.utils.sparsefuncs import mean_variance_axis


def sparse_array_equal(x1, x2):
    """Return True when two sparse matrices have equal shape and contents.

    Raises
    ------
    TypeError
        If either argument is not a scipy sparse matrix; `x1` is checked
        first.
    """
    type_checks = (
        (x1, "`x1` must be sparse."),
        (x2, "`x2` must be sparse."),
    )
    for candidate, message in type_checks:
        if not sp.issparse(candidate):
            raise TypeError(message)

    # Compare shapes first: the element-wise comparison below is only
    # attempted for matrices of identical shape.
    if x1.shape != x2.shape:
        return False

    differing = x1 != x2
    return differing.nnz == 0


def array_equal(x1, x2):
    """Equivalent of np.array_equal that properly handles sparse matrices.

    Three cases are distinguished:

    * both sparse -- delegated to ``sparse_array_equal``;
    * both dense -- delegated to ``np.array_equal``;
    * one sparse, one dense -- the shapes are compared first and, if they
      match, the sparse operand is densified and compared element-wise.

    Returns
    -------
    bool
        True when both operands have the same shape and equal elements.
    """
    x1_sparse, x2_sparse = sp.issparse(x1), sp.issparse(x2)

    if x1_sparse and x2_sparse:
        return sparse_array_equal(x1, x2)
    if not (x1_sparse or x2_sparse):
        return np.array_equal(x1, x2)

    # Mixed case. np.array_equal cannot interpret a sparse matrix, so
    # handing it one would report equal contents as unequal.
    if np.shape(x1) != np.shape(x2):
        return False

    # Shapes agree, so densifying costs no more memory than the dense
    # operand already occupies.
    dense1 = x1.toarray() if x1_sparse else np.asarray(x1)
    dense2 = x2.toarray() if x2_sparse else np.asarray(x2)
    return np.array_equal(dense1, dense2)


def _count_nans_per_row_sparse(X, weights, dtype=None):
""" Count the number of nans (undefined) values per row. """
if weights is not None:
Expand Down
21 changes: 20 additions & 1 deletion Orange/tests/test_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from Orange.statistics.util import bincount, countnans, contingency, digitize, \
mean, nanmax, nanmean, nanmedian, nanmin, nansum, nanunique, stats, std, \
unique, var, nanstd, nanvar, nanmode
unique, var, nanstd, nanvar, nanmode, array_equal
from sklearn.utils import check_random_state


Expand Down Expand Up @@ -590,6 +590,25 @@ def test_nanunique_ignores_nans_in_counts(self, array):
np.testing.assert_equal(nanunique(x, return_counts=True)[1], expected)


class TestArrayEqual(unittest.TestCase):
    """Checks for ``array_equal``.

    Each test receives an `array` factory from the ``dense_sparse``
    decorator. NOTE(review): presumably the decorator runs the test once
    with a dense and once with a sparse constructor -- confirm.
    """

    @dense_sparse
    def test_same_matrices(self, array):
        # A matrix must compare equal to itself.
        matrix = array([0, 1, 0, 0, 2])
        self.assertTrue(array_equal(matrix, matrix))

    @dense_sparse
    def test_with_different_shapes(self, array):
        # Identity matrices of different sizes are never equal.
        smaller, larger = array(np.eye(4)), array(np.eye(5))
        self.assertFalse(array_equal(smaller, larger))

    @dense_sparse
    def test_with_different_values(self, array):
        # Same shape, one differing element.
        left = array([0, 1, 0, 0, 2])
        right = array([0, 3, 0, 0, 2])
        self.assertFalse(array_equal(left, right))


class TestNanModeAppVeyor(unittest.TestCase):
def test_appveyour_still_not_onscipy_1_2_0(self):
import scipy
Expand Down
18 changes: 3 additions & 15 deletions Orange/widgets/data/tests/test_owfeaturestatistics.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import datetime
import warnings
from collections import namedtuple
from functools import wraps, partial
from functools import partial
from itertools import chain
from typing import Callable, List
from typing import List

import numpy as np
from AnyQt.QtCore import QItemSelection, QItemSelectionRange, \
Expand All @@ -12,7 +12,7 @@
from Orange.data import Table, Domain, StringVariable, ContinuousVariable, \
DiscreteVariable, TimeVariable
from Orange.widgets.tests.base import WidgetTest, datasets
from Orange.widgets.tests.utils import simulate
from Orange.widgets.tests.utils import simulate, table_dense_sparse
from Orange.widgets.data.owfeaturestatistics import \
OWFeatureStatistics

Expand Down Expand Up @@ -175,18 +175,6 @@ def make_table(attributes, target=None, metas=None):
)


def table_dense_sparse(test_case):
    # type: (Callable) -> Callable
    """Decorate a test so it runs against dense, then sparse, tables.

    The wrapped test is called twice with ``self`` and a one-argument
    converter: first one that calls ``to_dense()`` on the table it is
    given, then one that calls ``to_sparse()``.
    """

    def _densify(table):
        return table.to_dense()

    def _sparsify(table):
        return table.to_sparse()

    @wraps(test_case)
    def _run_both_variants(self):
        for prepare_table in (_densify, _sparsify):
            test_case(self, prepare_table)

    return _run_both_variants


class TestVariousDataSets(WidgetTest):
def setUp(self):
self.widget = self.create_widget(
Expand Down
23 changes: 23 additions & 0 deletions Orange/widgets/tests/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import sys
from functools import wraps

import warnings
import contextlib

Expand Down Expand Up @@ -317,3 +319,24 @@ def mouseMove(widget, pos=QPoint(), delay=-1): # pragma: no-cover
QTest.qWait(delay)

QApplication.sendEvent(widget, me)


def table_dense_sparse(test_case):
    # type: (Callable) -> Callable
    """Run one test case on both the dense and the sparse form of a table.

    The decorated test receives, besides ``self``, a `prepare_table`
    callable. On the first run it converts a table with ``to_dense()``,
    on the second run with ``to_sparse()``.

    Examples
    --------
    >>> @table_dense_sparse
    ... def test_something(self, prepare_table):
    ...     data: Table  # The table you want to test on
    ...     data = prepare_table(data)  # Converts the table to dense/sparse
    """

    def _as_dense(table):
        return table.to_dense()

    def _as_sparse(table):
        return table.to_sparse()

    @wraps(test_case)
    def _run_dense_then_sparse(self):
        # Dense first, sparse second; a failure in the dense run stops the
        # test before the sparse run starts.
        for prepare_table in (_as_dense, _as_sparse):
            test_case(self, prepare_table)

    return _run_dense_then_sparse
Loading

0 comments on commit 59228cf

Please sign in to comment.