Skip to content

Commit

Permalink
Outlier detection: keep instance ids, make thread safe
Browse files Browse the repository at this point in the history
Outlier detection did not keep instance ids, so subsets did not work.
Also, it was not thread-safe: multiple calls to _OutlierModel.__call__
could result in undefined behaviour, because some caching was done at
the object level.
  • Loading branch information
markotoplak committed May 6, 2021
1 parent d4e259f commit 6c50278
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 39 deletions.
73 changes: 37 additions & 36 deletions Orange/classification/outlier_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,43 +9,52 @@
from sklearn.svm import OneClassSVM

from Orange.base import SklLearner, SklModel
from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable, \
Variable
from Orange.data.util import get_unique_names
from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable
from Orange.data.util import get_unique_names, SharedComputeValue
from Orange.preprocess import AdaptiveNormalize
from Orange.util import wrap_callback, dummy_callback
from Orange.util import dummy_callback

__all__ = ["LocalOutlierFactorLearner", "IsolationForestLearner",
"EllipticEnvelopeLearner", "OneClassSVMLearner"]


# Callable wrapper around a model's data_to_model_domain method.
# _OutlierModel.__init__ stores one instance as `cached_transform`, and
# _Transformer.__init__ hands that instance to SharedComputeValue.
class _CachedTransform:
# to be used with SharedComputeValue
def __init__(self, model):
# model: object that provides data_to_model_domain(data)
self.model = model

def __call__(self, data):
# Delegate to the model and return its result unchanged.
return self.model.data_to_model_domain(data)


class _OutlierModel(SklModel):
def __init__(self, skl_model):
super().__init__(skl_model)
self._cached_data = None
self.outlier_var = None
self.cached_transform = _CachedTransform(self)

def predict(self, X: np.ndarray) -> np.ndarray:
pred = self.skl_model.predict(X)
pred[pred == -1] = 0
return pred[:, None]

def new_domain(self, data: Table) -> Domain:
assert self.outlier_var is not None
return Domain(data.domain.attributes, data.domain.class_vars,
data.domain.metas + (self.outlier_var,))

def __call__(self, data: Table, progress_callback: Callable = None) \
-> Table:
assert isinstance(data, Table)
assert self.outlier_var is not None

domain = Domain(data.domain.attributes, data.domain.class_vars,
data.domain.metas + (self.outlier_var,))
domain = self.new_domain(data)
if progress_callback is None:
progress_callback = dummy_callback
progress_callback(0, "Preprocessing...")
self._cached_data = self.data_to_model_domain(
data, wrap_callback(progress_callback, end=0.1))
progress_callback(0.1, "Predicting...")
metas = np.hstack((data.metas, self.predict(self._cached_data.X)))
progress_callback(0, "Predicting...")
new_table = data.transform(domain)
progress_callback(1)
return Table.from_numpy(domain, data.X, data.Y, metas)
return new_table


class _OutlierLearner(SklLearner):
Expand All @@ -64,27 +73,17 @@ def _fit_model(self, data: Table) -> _OutlierModel:
compute_value=transformer
)

transformer.variable = variable
model.outlier_var = variable
return model


class _Transformer:
class _Transformer(SharedComputeValue):
def __init__(self, model: _OutlierModel):
super().__init__(model.cached_transform)
self._model = model
self._variable = None

@property
def variable(self) -> Variable:
return self._variable

@variable.setter
def variable(self, var: Variable):
self._variable = var

def __call__(self, data: Table) -> np.ndarray:
assert isinstance(self._variable, Variable)
return self._model(data).get_column_view(self._variable)[0]
def compute(self, data: Table, shared_data: Table) -> np.ndarray:
return self._model.predict(shared_data.X)[:, 0]


class OneClassSVMLearner(_OutlierLearner):
Expand Down Expand Up @@ -142,13 +141,16 @@ def mahalanobis(self, observations: np.ndarray) -> np.ndarray:
"""
return self.skl_model.mahalanobis(observations)[:, None]

def __call__(self, data: Table, progress_callback: Callable = None) \
-> Table:
pred = super().__call__(data, progress_callback)
domain = Domain(pred.domain.attributes, pred.domain.class_vars,
pred.domain.metas + (self.mahal_var,))
metas = np.hstack((pred.metas, self.mahalanobis(self._cached_data.X)))
return Table.from_numpy(domain, pred.X, pred.Y, metas)
def new_domain(self, data: Table) -> Domain:
# Build the output domain: take the domain produced by the parent's
# new_domain(data) and append self.mahal_var to its metas.
# mahal_var must already be set (it is assigned in _fit_model).
assert self.mahal_var is not None
domain = super().new_domain(data)
return Domain(domain.attributes, domain.class_vars,
domain.metas + (self.mahal_var,))


# Compute value used for the "Mahalanobis" meta variable created in
# EllipticEnvelopeLearner._fit_model.
class _TransformerMahalanobis(_Transformer):
# Returns the first column of the model's mahalanobis() output for the X
# of the second argument. The second parameter is named `cached_data`
# here, while _Transformer.compute calls it `shared_data`.
def compute(self, data: Table, cached_data: Table) -> np.ndarray:
return self._model.mahalanobis(cached_data.X)[:, 0]


class EllipticEnvelopeLearner(_OutlierLearner):
Expand All @@ -166,13 +168,12 @@ def _fit_model(self, data: Table) -> EllipticEnvelopeClassifier:
domain = data.domain
model = super()._fit_model(data.transform(Domain(domain.attributes)))

transformer = _Transformer(model)
transformer = _TransformerMahalanobis(model)
names = [v.name for v in domain.variables + domain.metas]
variable = ContinuousVariable(
get_unique_names(names, "Mahalanobis"),
compute_value=transformer
)

transformer.variable = variable
model.mahal_var = variable
return model
12 changes: 9 additions & 3 deletions Orange/classification/tests/test_outlier_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pickle
import tempfile
import unittest
from unittest.mock import Mock
from unittest.mock import Mock, patch

import numpy as np

Expand Down Expand Up @@ -36,6 +36,7 @@ def assert_table_equal(self, table1, table2):
np.testing.assert_array_equal(table1.metas, table2.metas)

def assert_table_appended_outlier(self, table1, table2, offset=1):
np.testing.assert_array_equal(table1.ids, table2.ids)
np.testing.assert_array_equal(table1.X, table2.X)
np.testing.assert_array_equal(table1.Y, table2.Y)
np.testing.assert_array_equal(table1.metas, table2.metas[:, :-offset])
Expand All @@ -47,7 +48,6 @@ def assert_table_appended_outlier(self, table1, table2, offset=1):
self.assertEqual(table2.domain.metas[-offset].name, "Outlier")
self.assertIsNotNone(table2.domain.metas[-offset].compute_value)


class TestOneClassSVMLearner(_TestDetector):
def test_OneClassSVM(self):
np.random.seed(42)
Expand Down Expand Up @@ -128,12 +128,19 @@ def test_EllipticEnvelope(self):
def test_mahalanobis(self):
n = len(self.X_all)
pred = self.model(self.X_all)

y_pred = pred[:, self.model.outlier_var].metas
y_mahal = pred[:, self.model.mahal_var].metas
y_mahal, y_pred = zip(*sorted(zip(y_mahal, y_pred), reverse=True))
self.assertTrue(all(i == 0 for i in y_pred[:int(self.cont * n)]))
self.assertTrue(all(i == 1 for i in y_pred[int(self.cont * n):]))

def test_single_data_to_model_domain(self):
# Calling the model once must invoke data_to_model_domain exactly once.
# patch.object(..., wraps=...) keeps the real method running while
# recording how many times it was called.
with patch.object(self.model, "data_to_model_domain",
wraps=self.model.data_to_model_domain) as call:
self.model(self.X_all)
self.assertEqual(call.call_count, 1)

def test_EllipticEnvelope_ignores_y(self):
domain = Domain((ContinuousVariable("x1"), ContinuousVariable("x2")),
(ContinuousVariable("y1"), ContinuousVariable("y2")))
Expand Down Expand Up @@ -231,7 +238,6 @@ def test_transformer(self):
detect = self.detector(self.iris)
pred = detect(self.iris)
var = pred.domain.metas[0]
self.assertIs(var, var.compute_value.variable)
np.testing.assert_array_equal(pred[:, "Outlier"].metas.ravel(),
var.compute_value(self.iris))

Expand Down

0 comments on commit 6c50278

Please sign in to comment.