diff --git a/Orange/projection/__init__.py b/Orange/projection/__init__.py
index dd577130b90..8207a80bfc0 100644
--- a/Orange/projection/__init__.py
+++ b/Orange/projection/__init__.py
@@ -4,3 +4,4 @@
 from .manifold import *
 from .freeviz import *
 from .radviz import radviz
+from .lda import LDA
diff --git a/Orange/projection/lda.py b/Orange/projection/lda.py
new file mode 100644
index 00000000000..7de255a62c0
--- /dev/null
+++ b/Orange/projection/lda.py
@@ -0,0 +1,75 @@
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+
+import Orange.data
+from Orange.classification.logistic_regression import _FeatureScorerMixin
+from Orange.data.util import SharedComputeValue
+from Orange.projection import SklProjector, Projection
+
+__all__ = ["LDA"]
+
+
+class LDA(SklProjector, _FeatureScorerMixin):
+    name = "LDA"
+    supports_sparse = False
+
+    def __init__(self, n_components=2, solver='eigen', preprocessors=None):
+        super().__init__(preprocessors=preprocessors)
+        self.n_components = n_components
+        self.solver = solver
+
+    def fit(self, X, Y=None):
+        # Cap the requested number of components by the data dimensions,
+        # using a local so repeated fits do not mutate the learner's settings.
+        n_components = self.n_components
+        if n_components is not None:
+            n_components = min(min(X.shape), n_components)
+        # Honour the configured solver and component count
+        # (both were hard-coded before).
+        proj = LinearDiscriminantAnalysis(
+            solver=self.solver, n_components=n_components)
+        proj = proj.fit(X, Y)
+        return LDAModel(proj, self.domain)
+
+
+class _LDATransformDomain:
+    """Computation common for all LDA variables."""
+    def __init__(self, lda):
+        self.lda = lda
+
+    def __call__(self, data):
+        if data.domain != self.lda.pre_domain:
+            data = data.transform(self.lda.pre_domain)
+        return self.lda.transform(data.X)
+
+
+class LDAModel(Projection):
+    name = "LDAModel"
+
+    def __init__(self, proj, domain):
+        # The shared transform captures self; it is only evaluated lazily,
+        # after __init__ has finished, so the forward reference is safe.
+        lda_transform = _LDATransformDomain(self)
+        self.components_ = proj.scalings_.T
+
+        def lda_variable(i):
+            return Orange.data.ContinuousVariable(
+                'LD%d' % (i + 1),
+                compute_value=LDAProjector(self, i, lda_transform))
+
+        super().__init__(proj=proj)
+        self.orig_domain = domain
+        self.n_components = self.components_.shape[0]
+        # Use the fitted component count; proj.n_components may be None.
+        self.domain = Orange.data.Domain(
+            [lda_variable(i) for i in range(self.n_components)],
+            domain.class_vars, domain.metas)
+
+
+class LDAProjector(SharedComputeValue):
+    """Transform into a given LDA component."""
+    def __init__(self, projection, feature, lda_transform):
+        super().__init__(lda_transform)
+        self.feature = feature
+
+    def compute(self, data, lda_space):
+        return lda_space[:, self.feature]
diff --git a/Orange/tests/test_lda.py b/Orange/tests/test_lda.py
new file mode 100644
index 00000000000..fc75e7a0394
--- /dev/null
+++ b/Orange/tests/test_lda.py
@@ -0,0 +1,49 @@
+# Test methods with long descriptive names can omit docstrings
+# pylint: disable=missing-docstring
+
+import unittest
+
+import numpy as np
+
+from Orange.preprocess import Continuize, Randomize
+from Orange.projection import LDA
+from Orange.data import Table
+
+
+class TestLDA(unittest.TestCase):
+    def test_lda(self):
+        iris = Table('iris')
+        n_components = 2
+        lda = LDA(n_components=n_components)
+        model = lda(iris)
+        transformed = model(iris)
+        self.assertEqual(transformed.X.shape, (len(iris), n_components))
+        self.assertEqual(transformed.Y.shape, (len(iris),))
+
+    def test_transform_changed_domain(self):
+        """
+        1. Open data, apply some preprocessor, split the data into two parts,
+        use LDA on the first part, and then transform the second part.
+
+        2. Open data, split into two parts, apply the same preprocessor and
+        LDA only on the first part, and then transform the second part.
+
+        The transformed second part in (1) and (2) has to be the same.
+        """
+        data = Table("iris")
+        data = Randomize()(data)
+        preprocessor = Continuize()
+        lda = LDA()
+
+        # normalize all
+        ndata = preprocessor(data)
+
+        model = lda(ndata[:75])
+        result_1 = model(ndata[75:])
+
+        # normalize only the "training" part
+        ndata = preprocessor(data[:75])
+        model = lda(ndata)
+        result_2 = model(data[75:])
+
+        np.testing.assert_almost_equal(result_1.X, result_2.X)
diff --git a/doc/data-mining-library/source/reference/projection.rst b/doc/data-mining-library/source/reference/projection.rst
index db3641c50be..36c08a99532 100644
--- a/doc/data-mining-library/source/reference/projection.rst
+++ b/doc/data-mining-library/source/reference/projection.rst
@@ -83,3 +83,51 @@
 
 
 .. autoclass:: Orange.projection.freeviz.FreeViz
+
+
+
+
+LDA
+---
+
+Linear discriminant analysis is another way of finding a linear transformation of
+data that reduces the number of dimensions required to represent it. It is often
+used for dimensionality reduction prior to classification, but can also be used as a
+classification technique itself ([1]_).
+
+
+Example
+=======
+
+    >>> from Orange.projection import LDA
+    >>> from Orange.data import Table
+    >>> iris = Table('iris')
+    >>> lda = LDA()
+    >>> model = lda(iris)
+    >>> model.components_    # LDA components
+    array([[ 0.20490976,  0.38714331, -0.54648218, -0.71378517],
+           [ 0.00898234,  0.58899857, -0.25428655,  0.76703217],
+           [-0.71507172,  0.43568045,  0.45568731, -0.30200008],
+           [ 0.06449913, -0.35780501, -0.42514529,  0.828895  ]])
+    >>> transformed_data = model(iris)    # transformed data
+    >>> transformed_data
+    [[1.492, 1.905 | Iris-setosa],
+     [1.258, 1.608 | Iris-setosa],
+     [1.349, 1.750 | Iris-setosa],
+     [1.180, 1.639 | Iris-setosa],
+     [1.510, 1.963 | Iris-setosa],
+     ...
+    ]
+
+
+
+.. autoclass:: Orange.projection.lda.LDA
+
+
+
+References
+----------
+
+.. [1] Witten, I.H., Frank, E., Hall, M.A. and Pal, C.J., 2016.
+       Data Mining: Practical machine learning tools and techniques. Morgan Kaufmann.
+