Skip to content

Commit

Permalink
Merge pull request #5427 from markotoplak/fix-outlier-instance-id
Browse files Browse the repository at this point in the history
[FIX] Outlier detection: keep instance ids, make thread safe
  • Loading branch information
markotoplak authored May 12, 2021
2 parents 4376a88 + 587bf9d commit 3773093
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 40 deletions.
74 changes: 37 additions & 37 deletions Orange/classification/outlier_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,43 +9,51 @@
from sklearn.svm import OneClassSVM

from Orange.base import SklLearner, SklModel
from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable, \
Variable
from Orange.data.util import get_unique_names
from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable
from Orange.data.util import get_unique_names, SharedComputeValue
from Orange.preprocess import AdaptiveNormalize
from Orange.util import wrap_callback, dummy_callback
from Orange.util import dummy_callback

__all__ = ["LocalOutlierFactorLearner", "IsolationForestLearner",
"EllipticEnvelopeLearner", "OneClassSVMLearner"]


class _CachedTransform:
    """Callable that converts data into the wrapped model's domain.

    Intended to be used with ``SharedComputeValue``: an instance is passed
    as the shared computation, so that several computed variables backed by
    the same model can reuse a single conversion of the input data.
    """

    def __init__(self, model):
        # The model whose ``data_to_model_domain`` performs the conversion.
        self.model = model

    def __call__(self, data):
        """Return ``data`` as converted by ``model.data_to_model_domain``."""
        return self.model.data_to_model_domain(data)


class _OutlierModel(SklModel):
    """Fitted outlier detector that appends an outlier flag as a meta column.

    The model keeps no per-call state: the flag is produced through the
    ``compute_value`` of ``outlier_var`` when the input table is transformed
    into the extended domain.
    """

    def __init__(self, skl_model):
        super().__init__(skl_model)
        # Meta variable holding the prediction; assigned by the learner
        # after fitting.
        self.outlier_var = None
        # Shared conversion of input data into the model's domain, handed
        # to the transformers that back the computed meta variables.
        self.cached_transform = _CachedTransform(self)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return predictions for ``X`` as a single-column 2D array.

        Labels equal to -1 in the wrapped model's output are replaced by 0
        (presumably scikit-learn's outlier label; the tests expect 0 for
        outliers and 1 for inliers).
        """
        pred = self.skl_model.predict(X)
        pred[pred == -1] = 0
        return pred[:, None]

    def new_domain(self, data: Table) -> Domain:
        """Return the domain of ``data`` with ``outlier_var`` appended to metas."""
        assert self.outlier_var is not None
        return Domain(data.domain.attributes, data.domain.class_vars,
                      data.domain.metas + (self.outlier_var,))

    def __call__(self, data: Table, progress_callback: Callable = None) \
            -> Table:
        """Return ``data`` transformed into the domain from ``new_domain``.

        ``progress_callback``, when given, is called with ``0`` and the
        message "Predicting..." before the transformation and with ``1``
        after it.
        """
        assert isinstance(data, Table)

        domain = self.new_domain(data)
        if progress_callback is None:
            progress_callback = dummy_callback
        progress_callback(0, "Predicting...")
        # Transforming (rather than rebuilding the table from raw arrays)
        # lets the appended variables be filled in via their compute_value;
        # the accompanying test checks that instance ids are preserved.
        new_table = data.transform(domain)
        progress_callback(1)
        return new_table


class _OutlierLearner(SklLearner):
Expand All @@ -64,27 +72,17 @@ def _fit_model(self, data: Table) -> _OutlierModel:
compute_value=transformer
)

transformer.variable = variable
model.outlier_var = variable
return model


class _Transformer(SharedComputeValue):
    """Compute value that yields the model's outlier prediction.

    The conversion of input data into the model's domain is delegated to
    ``model.cached_transform`` as the shared computation; ``compute`` only
    turns the already converted data into a flat column of predictions.
    """

    def __init__(self, model: _OutlierModel):
        super().__init__(model.cached_transform)
        self._model = model

    def compute(self, data: Table, shared_data: Table) -> np.ndarray:
        """Return the first column of ``model.predict`` on ``shared_data.X``.

        ``data`` (the original input) is not used here; only the shared,
        converted table is needed.
        """
        return self._model.predict(shared_data.X)[:, 0]


class OneClassSVMLearner(_OutlierLearner):
Expand Down Expand Up @@ -142,13 +140,16 @@ def mahalanobis(self, observations: np.ndarray) -> np.ndarray:
"""
return self.skl_model.mahalanobis(observations)[:, None]

def new_domain(self, data: Table) -> Domain:
    """Return the parent's extended domain with ``mahal_var`` appended to metas.

    Replaces the former ``__call__`` override, which read per-call state
    (``self._cached_data``) from the model; the inherited ``__call__`` now
    picks up the extra variable through this domain.
    """
    assert self.mahal_var is not None
    domain = super().new_domain(data)
    return Domain(domain.attributes, domain.class_vars,
                  domain.metas + (self.mahal_var,))


class _TransformerMahalanobis(_Transformer):
    """Compute value that yields the model's Mahalanobis distances."""

    def compute(self, data: Table, shared_data: Table) -> np.ndarray:
        """Return the first column of ``model.mahalanobis`` on ``shared_data.X``.

        ``data`` (the original input) is not used; only the shared,
        converted table is needed.
        """
        return self._model.mahalanobis(shared_data.X)[:, 0]


class EllipticEnvelopeLearner(_OutlierLearner):
Expand All @@ -166,13 +167,12 @@ def _fit_model(self, data: Table) -> EllipticEnvelopeClassifier:
domain = data.domain
model = super()._fit_model(data.transform(Domain(domain.attributes)))

transformer = _Transformer(model)
transformer = _TransformerMahalanobis(model)
names = [v.name for v in domain.variables + domain.metas]
variable = ContinuousVariable(
get_unique_names(names, "Mahalanobis"),
compute_value=transformer
)

transformer.variable = variable
model.mahal_var = variable
return model
12 changes: 9 additions & 3 deletions Orange/classification/tests/test_outlier_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pickle
import tempfile
import unittest
from unittest.mock import Mock
from unittest.mock import Mock, patch

import numpy as np

Expand Down Expand Up @@ -36,6 +36,7 @@ def assert_table_equal(self, table1, table2):
np.testing.assert_array_equal(table1.metas, table2.metas)

def assert_table_appended_outlier(self, table1, table2, offset=1):
np.testing.assert_array_equal(table1.ids, table2.ids)
np.testing.assert_array_equal(table1.X, table2.X)
np.testing.assert_array_equal(table1.Y, table2.Y)
np.testing.assert_array_equal(table1.metas, table2.metas[:, :-offset])
Expand All @@ -47,7 +48,6 @@ def assert_table_appended_outlier(self, table1, table2, offset=1):
self.assertEqual(table2.domain.metas[-offset].name, "Outlier")
self.assertIsNotNone(table2.domain.metas[-offset].compute_value)


class TestOneClassSVMLearner(_TestDetector):
def test_OneClassSVM(self):
np.random.seed(42)
Expand Down Expand Up @@ -128,12 +128,19 @@ def test_EllipticEnvelope(self):
def test_mahalanobis(self):
    # The rows with the largest Mahalanobis distance must be exactly the
    # ones flagged as outliers (0); all remaining rows must be inliers (1).
    n = len(self.X_all)
    pred = self.model(self.X_all)

    y_pred = pred[:, self.model.outlier_var].metas
    y_mahal = pred[:, self.model.mahal_var].metas
    # Order both columns together by distance, largest first.
    y_mahal, y_pred = zip(*sorted(zip(y_mahal, y_pred), reverse=True))
    # self.cont * n is the number of rows expected to be flagged.
    self.assertTrue(all(i == 0 for i in y_pred[:int(self.cont * n)]))
    self.assertTrue(all(i == 1 for i in y_pred[int(self.cont * n):]))

def test_single_data_to_model_domain(self):
    # Applying the model once must convert the data into the model's
    # domain exactly once, even though more than one computed meta
    # variable (outlier flag and Mahalanobis distance) depends on it.
    with patch.object(self.model, "data_to_model_domain",
                      wraps=self.model.data_to_model_domain) as call:
        self.model(self.X_all)
        self.assertEqual(call.call_count, 1)

def test_EllipticEnvelope_ignores_y(self):
domain = Domain((ContinuousVariable("x1"), ContinuousVariable("x2")),
(ContinuousVariable("y1"), ContinuousVariable("y2")))
Expand Down Expand Up @@ -231,7 +238,6 @@ def test_transformer(self):
detect = self.detector(self.iris)
pred = detect(self.iris)
var = pred.domain.metas[0]
self.assertIs(var, var.compute_value.variable)
np.testing.assert_array_equal(pred[:, "Outlier"].metas.ravel(),
var.compute_value(self.iris))

Expand Down

0 comments on commit 3773093

Please sign in to comment.