diff --git a/Orange/clustering/__init__.py b/Orange/clustering/__init__.py index 03b53fa7f05..818a33c076d 100644 --- a/Orange/clustering/__init__.py +++ b/Orange/clustering/__init__.py @@ -4,3 +4,4 @@ from .dbscan import * from .hierarchical import * from .kmeans import * +from .louvain import * diff --git a/Orange/clustering/clustering.py b/Orange/clustering/clustering.py new file mode 100644 index 00000000000..0dfaca0608f --- /dev/null +++ b/Orange/clustering/clustering.py @@ -0,0 +1,100 @@ +import numpy as np +import scipy.sparse + +from Orange.data import Table, Instance +from Orange.data.table import DomainTransformationError +from Orange.misc.wrapper_meta import WrapperMeta +from Orange.preprocess import Continuize, SklImpute + + +class ClusteringModel: + + def __init__(self, projector): + self.projector = projector + self.domain = None + self.original_domain = None + self.labels = projector.labels_ + + def __call__(self, data): + def fix_dim(x): + return x[0] if one_d else x + + one_d = False + if isinstance(data, np.ndarray): + one_d = data.ndim == 1 + prediction = self.predict(np.atleast_2d(data)) + elif isinstance(data, scipy.sparse.csr.csr_matrix) or \ + isinstance(data, scipy.sparse.csc.csc_matrix): + prediction = self.predict(data) + elif isinstance(data, (Table, Instance)): + if isinstance(data, Instance): + data = Table(data.domain, [data]) + one_d = True + if data.domain != self.domain: + if self.original_domain.attributes != data.domain.attributes \ + and data.X.size \ + and not np.isnan(data.X).all(): + data = data.transform(self.original_domain) + if np.isnan(data.X).all(): + raise DomainTransformationError( + "domain transformation produced no defined values") + data = data.transform(self.domain) + prediction = self.predict(data.X) + elif isinstance(data, (list, tuple)): + if not isinstance(data[0], (list, tuple)): + data = [data] + one_d = True + data = Table.from_list(self.original_domain, data) + data = data.transform(self.domain) + 
prediction = self.predict(data.X) + else: + raise TypeError("Unrecognized argument (instance of '{}')" + .format(type(data).__name__)) + + return fix_dim(prediction) + + def predict(self, X): + raise NotImplementedError( + "This clustering algorithm does not support predicting.") + + +class Clustering(metaclass=WrapperMeta): + """ + ${skldoc} + Additional Orange parameters + + preprocessors : list, optional (default = [Continuize(), SklImpute()]) + An ordered list of preprocessors applied to data before + training or testing. + """ + __wraps__ = None + __returns__ = ClusteringModel + preprocessors = [Continuize(), SklImpute()] + + def __init__(self, preprocessors, parameters): + self.preprocessors = tuple(preprocessors or self.preprocessors) + self.params = {k: v for k, v in parameters.items() + if k not in ["self", "preprocessors", "__class__"]} + + def __call__(self, data): + return self.get_model(data).labels + + def get_model(self, data): + orig_domain = data.domain + data = self.preprocess(data) + model = self.fit_storage(data) + model.domain = data.domain + model.original_domain = orig_domain + return model + + def fit_storage(self, data): + # only data Table + return self.fit(data.X) + + def fit(self, X: np.ndarray, y: np.ndarray = None): + return self.__returns__(self.__wraps__(**self.params).fit(X)) + + def preprocess(self, data): + for pp in self.preprocessors: + data = pp(data) + return data diff --git a/Orange/clustering/dbscan.py b/Orange/clustering/dbscan.py index 7481f1c34ec..80e7cdd9948 100644 --- a/Orange/clustering/dbscan.py +++ b/Orange/clustering/dbscan.py @@ -1,52 +1,22 @@ -import sklearn.cluster as skl_cluster -from numpy import ndarray, unique +import sklearn.cluster -from Orange.data import Table, DiscreteVariable, Domain, Instance -from Orange.projection import SklProjector, Projection +from Orange.clustering.clustering import Clustering +from Orange.data import Table __all__ = ["DBSCAN"] -class DBSCAN(SklProjector): - __wraps__ = 
skl_cluster.DBSCAN + +class DBSCAN(Clustering): + + __wraps__ = sklearn.cluster.DBSCAN def __init__(self, eps=0.5, min_samples=5, metric='euclidean', - algorithm='auto', leaf_size=30, p=None, - preprocessors=None): - super().__init__(preprocessors=preprocessors) - self.params = vars() - - def fit(self, X, Y=None): - proj = skl_cluster.DBSCAN(**self.params) - self.X = X - if isinstance(X, Table): - proj = proj.fit(X.X,) - else: - proj = proj.fit(X, ) - return DBSCANModel(proj) - - -class DBSCANModel(Projection): - def __init__(self, proj): - super().__init__(proj=proj) - - def __call__(self, data): - if isinstance(data, ndarray): - return self.proj.fit_predict(data).reshape((len(data), 1)) - - if isinstance(data, Table): - if data.domain is not self.pre_domain: - data = data.transform(self.pre_domain) - y = self.proj.fit_predict(data.X) - vals, indices = unique(y, return_inverse=True) - c = DiscreteVariable(name='Core sample index', - values=[str(v) for v in vals]) - domain = Domain([c]) - return Table(domain, indices.reshape(len(y), 1)) - - elif isinstance(data, Instance): - if data.domain is not self.pre_domain: - data = Instance(self.pre_domain, data) - # Instances-by-Instance classification is not defined; - raise Exception("Core sample assignment is not supported " - "for single instances.") + algorithm='auto', leaf_size=30, p=None, preprocessors=None): + super().__init__(preprocessors, vars()) + + +if __name__ == "__main__": + d = Table("iris") + km = DBSCAN(preprocessors=None) + clusters = km(d) diff --git a/Orange/clustering/kmeans.py b/Orange/clustering/kmeans.py index 364a4261405..97ba7d0e8a1 100644 --- a/Orange/clustering/kmeans.py +++ b/Orange/clustering/kmeans.py @@ -1,72 +1,45 @@ -import numpy as np -import sklearn.cluster as skl_cluster -from sklearn.metrics import silhouette_samples, silhouette_score +import warnings -from Orange.data import Table, DiscreteVariable, Domain, Instance -from Orange.projection import SklProjector, Projection -from 
Orange.distance import Euclidean +import sklearn.cluster + +from Orange.clustering.clustering import Clustering, ClusteringModel +from Orange.data import Table __all__ = ["KMeans"] -SILHOUETTE_MAX_SAMPLES = 5000 -class KMeans(SklProjector): - __wraps__ = skl_cluster.KMeans +class KMeansModel(ClusteringModel): - def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300, - tol=0.0001, random_state=None, preprocessors=None, - compute_silhouette_score=False): - super().__init__(preprocessors=preprocessors) - self.params = vars() - self._compute_silhouette = compute_silhouette_score + def __init__(self, projector): + super().__init__(projector) + self.centroids = projector.cluster_centers_ + self.k = projector.get_params()["n_clusters"] - def fit(self, X, Y=None): - proj = skl_cluster.KMeans(**self.params) - proj = proj.fit(X, Y) - proj.silhouette = np.nan - try: - if self._compute_silhouette and 2 <= proj.n_clusters < X.shape[0]: - if X.shape[0] <= SILHOUETTE_MAX_SAMPLES: - proj.silhouette_samples = \ - silhouette_samples(X, proj.labels_) - proj.silhouette = np.mean(proj.silhouette_samples) - else: - proj.silhouette_samples = None - proj.silhouette = \ - silhouette_score(X, proj.labels_, sample_size=SILHOUETTE_MAX_SAMPLES) - except MemoryError: # Pairwise dist in silhouette fails for large data - pass - proj.inertia = proj.inertia_ / X.shape[0] - cluster_dist = Euclidean(proj.cluster_centers_) - proj.inter_cluster = np.mean(cluster_dist[np.triu_indices_from(cluster_dist, 1)]) - return KMeansModel(proj, self.preprocessors) + def predict(self, X): + return self.projector.predict(X) -class KMeansModel(Projection): - def __init__(self, proj, preprocessors=None): - super().__init__(proj=proj) - self.k = self.proj.get_params()["n_clusters"] - self.centroids = self.proj.cluster_centers_ +class KMeans(Clustering): - def __call__(self, data): - if isinstance(data, Table): - if data.domain is not self.pre_domain: - data = data.transform(self.pre_domain) - c = 
DiscreteVariable(name='Cluster id', - values=[str(i) for i in range(self.k)]) - domain = Domain([c]) - return Table( - domain, - self.proj.predict(data.X).astype(int).reshape((len(data), 1))) - elif isinstance(data, Instance): - if data.domain is not self.pre_domain: - data = Instance(self.pre_domain, data) - c = DiscreteVariable(name='Cluster id', - values=[str(i) for i in range(self.k)]) - domain = Domain([c]) - return Table( - domain, - np.atleast_2d(self.proj.predict(data._x.reshape(1, -1))).astype(int)) - else: - return self.proj.predict(data).reshape((data.shape[0], 1)) + __wraps__ = sklearn.cluster.KMeans + __returns__ = KMeansModel + + def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300, + tol=0.0001, random_state=None, preprocessors=None, + compute_silhouette_score=None): + if compute_silhouette_score is not None: + warnings.warn( + "compute_silhouette_score is deprecated. Please use " + "sklearn.metrics.silhouette_score to compute silhouettes.", + DeprecationWarning) + super().__init__( + preprocessors, {k: v for k, v in vars().items() + if k != "compute_silhouette_score"}) + + +if __name__ == "__main__": + d = Table("iris") + km = KMeans(preprocessors=None, n_clusters=3) + clusters = km(d) + model = km.fit_storage(d) diff --git a/Orange/clustering/louvain.py b/Orange/clustering/louvain.py index a4072341b25..ed2ff7ddc3e 100644 --- a/Orange/clustering/louvain.py +++ b/Orange/clustering/louvain.py @@ -3,7 +3,6 @@ Original C++ implementation available at https://sites.google.com/site/findcommunities/ - """ import numpy as np @@ -11,25 +10,29 @@ # NOTE: The ``community`` package might be renamed in the near future, see # GH issue https://github.com/taynaud/python-louvain/issues/23 from community import best_partition +from sklearn.base import BaseEstimator from sklearn.neighbors import NearestNeighbors -import Orange +from Orange.clustering.clustering import Clustering from Orange.data import Table +__all__ = ["Louvain", 
"matrix_to_knn_graph"] + + def jaccard(x, y): # type: (set, set) -> float """Compute the Jaccard similarity between two sets.""" return len(x & y) / len(x | y) -def table_to_knn_graph(data, k_neighbors, metric, progress_callback=None): - """Convert tabular data to a graph using a nearest neighbors approach with +def matrix_to_knn_graph(data, k_neighbors, metric, progress_callback=None): + """Convert data matrix to a graph using a nearest neighbors approach with the Jaccard similarity as the edge weights. Parameters ---------- - data : Table + data : np.ndarray k_neighbors : int metric : str A distance metric supported by sklearn. @@ -59,99 +62,82 @@ def table_to_knn_graph(data, k_neighbors, metric, progress_callback=None): graph.add_edge( node, neighbor, - weight=jaccard(nearest_neighbors[node], nearest_neighbors[neighbor]), + weight=jaccard( + nearest_neighbors[node], nearest_neighbors[neighbor]), ) return graph -class Louvain: - preprocessors = [Orange.preprocess.Continuize(), Orange.preprocess.SklImpute()] - - def __init__( - self, - k_neighbors=30, - metric="l2", - resolution=1.0, - random_state=None, - preprocessors=None, - ): - """Louvain clustering for community detection in graphs. - - Louvain clustering is a community detection algorithm for detecting - clusters of "communities" in graphs. As such, tabular data must first - be converted into graph form. This is typically done by computing the - KNN graph on the input data. - - Parameters - ---------- - k_neighbors : Optional[int] - The number of nearest neighbors to use for the KNN graph if - tabular data is passed. - - metric : Optional[str] - The metric to use to compute the nearest neighbors. - - resolution : Optional[float] - The resolution is a parameter of the Louvain method that affects - the size of the recovered clusters. - - random_state: Union[int, RandomState] - The random state parameter follows the convention used in scikit-learn. 
- If the value is an int, random_state is the seed used by the random - number generator. If the value is a RandomState instance, then it will - be used as the random number generator. If the value is None, the random - number generator is the RandomState instance used by `np.random`. - - """ - if preprocessors is None: - preprocessors = type(self).preprocessors - self.preprocessors = tuple(preprocessors) +class LouvainMethod(BaseEstimator): + def __init__(self, k_neighbors=30, metric="l2", resolution=1.0, + random_state=None): self.k_neighbors = k_neighbors self.metric = metric self.resolution = resolution self.random_state = random_state + self.labels_ = None - self.labels = None + def fit(self, X: np.ndarray, y: np.ndarray = None): + # If we are given a table, we have to convert it to a graph first + graph = matrix_to_knn_graph( + X, metric=self.metric, k_neighbors=self.k_neighbors) + return self.fit_graph(graph) - def __call__(self, data): - data = self.preprocess(data) - return self.fit_predict(data.X, data.Y) + def fit_graph(self, graph): + partition = best_partition( + graph, resolution=self.resolution, random_state=self.random_state) + self.labels_ = np.fromiter( + list(zip(*sorted(partition.items())))[1], dtype=int) + return self - def preprocess(self, data): - for pp in self.preprocessors: - data = pp(data) - return data - def fit(self, X, y=None): - # If we are given a table, we have to convert it to a graph first - if isinstance(X, Table): - graph = table_to_knn_graph( - X.X, metric=self.metric, k_neighbors=self.k_neighbors - ) - # Same goes for a matrix - elif isinstance(X, np.ndarray): - graph = table_to_knn_graph( - X, metric=self.metric, k_neighbors=self.k_neighbors - ) - elif isinstance(X, nx.Graph): - graph = X +class Louvain(Clustering): + """Louvain clustering for community detection in graphs. 
- partition = best_partition( - graph, resolution=self.resolution, random_state=self.random_state - ) - partition = np.fromiter(list(zip(*sorted(partition.items())))[1], dtype=int) + Louvain clustering is a community detection algorithm for detecting + clusters of "communities" in graphs. As such, tabular data must first + be converted into graph form. This is typically done by computing the + KNN graph on the input data. + + Attributes + ---------- + k_neighbors : Optional[int] + The number of nearest neighbors to use for the KNN graph if + tabular data is passed. + + metric : Optional[str] + The metric to use to compute the nearest neighbors. + + resolution : Optional[float] + The resolution is a parameter of the Louvain method that affects + the size of the recovered clusters. + + random_state: Union[int, RandomState] + The random state parameter follows the convention used in scikit-learn. + If the value is an int, random_state is the seed used by the random + number generator. If the value is a RandomState instance, then it will + be used as the random number generator. If the value is None, the random + number generator is the RandomState instance used by `np.random`. 
+ """ + + __wraps__ = LouvainMethod - self.labels = partition + def __init__(self, k_neighbors=30, metric="l2", resolution=1.0, + random_state=None, preprocessors=None): + super().__init__(preprocessors, vars()) - def fit_predict(self, X, y=None): - self.fit(X, y) - return self.labels + def get_model(self, data): + if isinstance(data, nx.Graph): + return self.__returns__( + self.__wraps__(**self.params).fit_graph(data)) + else: + return super().get_model(data) if __name__ == "__main__": # clustering run on iris data - orange table - data = Table("iris") - louvain = Louvain(2) - louvain.fit(data) + d = Table("iris") + louvain = Louvain(5) + clusters = louvain(d) diff --git a/Orange/evaluation/clustering.py b/Orange/evaluation/clustering.py index 53bb7858cde..77682dd310e 100644 --- a/Orange/evaluation/clustering.py +++ b/Orange/evaluation/clustering.py @@ -94,11 +94,10 @@ def __call__(self, data, learners, preprocessor=None, *, callback=None): for k in range(self.k): for i, learner in enumerate(learners): - model = learner(data) + model = learner.get_model(data) if self.store_models: res.models[k, i] = model - labels = model(data) - res.predicted[i, k, :] = labels.X.flatten() + res.predicted[i, k, :] = model.labels return res diff --git a/Orange/tests/test_clustering_dbscan.py b/Orange/tests/test_clustering_dbscan.py index 5065fe2482c..3286f5a714d 100644 --- a/Orange/tests/test_clustering_dbscan.py +++ b/Orange/tests/test_clustering_dbscan.py @@ -3,45 +3,94 @@ import unittest -import Orange +import numpy as np +from scipy.sparse import csc_matrix, csr_matrix + +from Orange.clustering.clustering import ClusteringModel +from Orange.data import Table from Orange.clustering.dbscan import DBSCAN class TestDBSCAN(unittest.TestCase): + def setUp(self): + self.iris = Table('iris') + self.dbscan = DBSCAN() - @classmethod - def setUpClass(cls): - cls.iris = Orange.data.Table('iris') + def test_dbscan(self): + c = self.dbscan(self.iris) + # First 20 iris belong to one cluster + 
self.assertEqual(np.ndarray, type(c)) + self.assertEqual(len(self.iris), len(c)) + self.assertEqual(1, len(set(c[:20].ravel()))) def test_dbscan_parameters(self): dbscan = DBSCAN(eps=0.1, min_samples=7, metric='euclidean', algorithm='auto', leaf_size=12, p=None) c = dbscan(self.iris) + self.assertEqual(np.ndarray, type(c)) + self.assertEqual(len(self.iris), len(c)) def test_predict_table(self): - dbscan = DBSCAN() - c = dbscan(self.iris) - table = self.iris[:20] - p = c(table) + pred = self.dbscan(self.iris) + self.assertEqual(np.ndarray, type(pred)) + self.assertEqual(len(self.iris), len(pred)) def test_predict_numpy(self): - dbscan = DBSCAN() - c = dbscan(self.iris) - X = self.iris.X[::20] - p = c(X) + model = self.dbscan.fit(self.iris.X) + self.assertEqual(ClusteringModel, type(model)) + self.assertEqual(np.ndarray, type(model.labels)) + self.assertEqual(len(self.iris), len(model.labels)) - def test_values(self): - dbscan = DBSCAN(eps=1) # it clusters data in two classes - c = dbscan(self.iris) - table = self.iris - p = c(table) + def test_predict_sparse_csc(self): + self.iris.X = csc_matrix(self.iris.X[::20]) + c = self.dbscan(self.iris) + self.assertEqual(np.ndarray, type(c)) + self.assertEqual(len(self.iris), len(c)) + + def test_predict_spares_csr(self): + self.iris.X = csr_matrix(self.iris.X[::20]) + c = self.dbscan(self.iris) + self.assertEqual(np.ndarray, type(c)) + self.assertEqual(len(self.iris), len(c)) + + def test_model(self): + c = self.dbscan.get_model(self.iris) + self.assertEqual(ClusteringModel, type(c)) + self.assertEqual(len(self.iris), len(c.labels)) + + self.assertRaises(NotImplementedError, c, self.iris) + + def test_model_np(self): + """ + Test with numpy array as an input in model. 
+ """ + c = self.dbscan.get_model(self.iris) + self.assertRaises(NotImplementedError, c, self.iris.X) - self.assertEqual(2, len(p.domain[0].values)) - self.assertSetEqual({"0", "1"}, set(p.domain[0].values)) + def test_model_sparse(self): + """ + Test with sparse array as an input in model. + """ + c = self.dbscan.get_model(self.iris) + self.assertRaises(NotImplementedError, c, csr_matrix(self.iris.X)) - table.X[0] = [100, 100, 100, 100] # we add a big outlier + def test_model_instance(self): + """ + Test with instance as an input in model. + """ + c = self.dbscan.get_model(self.iris) + self.assertRaises(NotImplementedError, c, self.iris[0]) - p = c(table) + def test_model_list(self): + """ + Test with list as an input in model. + """ + c = self.dbscan.get_model(self.iris) + self.assertRaises(NotImplementedError, c, self.iris.X.tolist()) - self.assertEqual(3, len(p.domain[0].values)) - self.assertSetEqual({"-1", "0", "1"}, set(p.domain[0].values)) + def test_model_bad_datatype(self): + """ + Check model with data-type that is not supported. 
+ """ + c = self.dbscan.get_model(self.iris) + self.assertRaises(TypeError, c, 10) diff --git a/Orange/tests/test_clustering_kmeans.py b/Orange/tests/test_clustering_kmeans.py index ae2dc82eb1e..7ff40d94992 100644 --- a/Orange/tests/test_clustering_kmeans.py +++ b/Orange/tests/test_clustering_kmeans.py @@ -2,64 +2,149 @@ # pylint: disable=missing-docstring import unittest +import warnings import numpy as np -from scipy.sparse import csc_matrix +from scipy.sparse import csc_matrix, csr_matrix import Orange -from Orange.clustering.kmeans import KMeans +from Orange.clustering.kmeans import KMeans, KMeansModel +from Orange.data import Table, Domain, ContinuousVariable +from Orange.data.table import DomainTransformationError class TestKMeans(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.iris = Orange.data.Table('iris') + def setUp(self): + self.kmeans = KMeans(n_clusters=2) + self.iris = Orange.data.Table('iris') def test_kmeans(self): - kmeans = KMeans(n_clusters=2) - c = kmeans(self.iris) - X = self.iris.X[:20] - p = c(X) + c = self.kmeans(self.iris) # First 20 iris belong to one cluster - assert len(set(p.ravel())) == 1 + self.assertEqual(np.ndarray, type(c)) + self.assertEqual(len(self.iris), len(c)) + self.assertEqual(1, len(set(c[:20].ravel()))) def test_kmeans_parameters(self): - kmeans = KMeans(n_clusters=10, - max_iter=10, - random_state=42, - tol=0.001, - init='random', - compute_silhouette_score=True) - c = kmeans(self.iris) - - def test_predict_single_instance(self): - kmeans = KMeans() + kmeans = KMeans(n_clusters=10, max_iter=10, random_state=42, tol=0.001, + init='random') c = kmeans(self.iris) - inst = self.iris[0] - p = c(inst) + self.assertEqual(np.ndarray, type(c)) + self.assertEqual(len(self.iris), len(c)) def test_predict_table(self): - kmeans = KMeans() - c = kmeans(self.iris) - table = self.iris[:20] - p = c(table) + c = self.kmeans(self.iris) + self.assertEqual(np.ndarray, type(c)) + self.assertEqual(len(self.iris), len(c)) def 
test_predict_numpy(self): - kmeans = KMeans() - c = kmeans(self.iris) - X = self.iris.X[::20] - p = c(X) + c = self.kmeans.fit(self.iris.X) + self.assertEqual(KMeansModel, type(c)) + self.assertEqual(np.ndarray, type(c.labels)) + self.assertEqual(len(self.iris), len(c.labels)) - def test_predict_sparse(self): - kmeans = KMeans() - c = kmeans(self.iris) - X = csc_matrix(self.iris.X[::20]) - p = c(X) - - def test_silhouette_sparse(self): - """Test if silhouette gets calculated for sparse data""" - kmeans = KMeans(compute_silhouette_score=True) - sparse_iris = self.iris.copy() - sparse_iris.X = csc_matrix(sparse_iris.X) - c = kmeans(sparse_iris) - self.assertFalse(np.isnan(c.silhouette)) + def test_predict_sparse_csc(self): + self.iris.X = csc_matrix(self.iris.X[::20]) + c = self.kmeans(self.iris) + self.assertEqual(np.ndarray, type(c)) + self.assertEqual(len(self.iris), len(c)) + + def test_predict_spares_csr(self): + self.iris.X = csr_matrix(self.iris.X[::20]) + c = self.kmeans(self.iris) + self.assertEqual(np.ndarray, type(c)) + self.assertEqual(len(self.iris), len(c)) + + def test_model(self): + c = self.kmeans.get_model(self.iris) + self.assertEqual(KMeansModel, type(c)) + self.assertEqual(len(self.iris), len(c.labels)) + + c1 = c(self.iris) + # prediction of the model must be same since data are same + np.testing.assert_array_almost_equal(c.labels, c1) + + def test_model_np(self): + """ + Test with numpy array as an input in model. + """ + c = self.kmeans.get_model(self.iris) + c1 = c(self.iris.X) + # prediction of the model must be same since data are same + np.testing.assert_array_almost_equal(c.labels, c1) + + def test_model_sparse_csc(self): + """ + Test with sparse array as an input in model. 
+ """ + c = self.kmeans.get_model(self.iris) + c1 = c(csc_matrix(self.iris.X)) + # prediction of the model must be same since data are same + np.testing.assert_array_almost_equal(c.labels, c1) + + def test_model_sparse_csr(self): + """ + Test with sparse array as an input in model. + """ + c = self.kmeans.get_model(self.iris) + c1 = c(csr_matrix(self.iris.X)) + # prediction of the model must be same since data are same + np.testing.assert_array_almost_equal(c.labels, c1) + + def test_model_instance(self): + """ + Test with instance as an input in model. + """ + c = self.kmeans.get_model(self.iris) + c1 = c(self.iris[0]) + # prediction of the model must be same since data are same + self.assertEqual(c1, c.labels[0]) + + def test_model_list(self): + """ + Test with list as an input in model. + """ + c = self.kmeans.get_model(self.iris) + c1 = c(self.iris.X.tolist()) + # prediction of the model must be same since data are same + np.testing.assert_array_almost_equal(c.labels, c1) + + # example with a list of only one data item + c1 = c(self.iris.X.tolist()[0]) + # prediction of the model must be same since data are same + np.testing.assert_array_almost_equal(c.labels[0], c1) + + def test_model_bad_datatype(self): + """ + Check model with data-type that is not supported. + """ + c = self.kmeans.get_model(self.iris) + self.assertRaises(TypeError, c, 10) + + def test_model_data_table_domain(self): + """ + Check model with data-type that is not supported. 
+ """ + # ok domain + data = Table(Domain( + list(self.iris.domain.attributes) + [ContinuousVariable("a")]), + np.concatenate((self.iris.X, np.ones((len(self.iris), 1))), axis=1)) + c = self.kmeans.get_model(self.iris) + res = c(data) + np.testing.assert_array_almost_equal(c.labels, res) + + # totally different domain - should fail + self.assertRaises(DomainTransformationError, c, Table("housing")) + + def test_deprecated_silhouette(self): + with warnings.catch_warnings(record=True) as w: + KMeans(compute_silhouette_score=True) + + assert len(w) == 1 + assert issubclass(w[-1].category, DeprecationWarning) + + with warnings.catch_warnings(record=True) as w: + KMeans(compute_silhouette_score=False) + + assert len(w) == 1 + assert issubclass(w[-1].category, DeprecationWarning) diff --git a/Orange/tests/test_clustering_louvain.py b/Orange/tests/test_clustering_louvain.py new file mode 100644 index 00000000000..a65ba4a8edf --- /dev/null +++ b/Orange/tests/test_clustering_louvain.py @@ -0,0 +1,120 @@ +# Test methods with long descriptive names can omit docstrings +# pylint: disable=missing-docstring + +import unittest + +import numpy as np +import networkx +from scipy.sparse import csc_matrix, csr_matrix + +from Orange.clustering.clustering import ClusteringModel +from Orange.clustering.louvain import matrix_to_knn_graph +from Orange.data import Table +from Orange.clustering.louvain import Louvain + + +class TestLouvain(unittest.TestCase): + def setUp(self): + self.iris = Table('iris') + self.louvain = Louvain() + + def test_louvain(self): + c = self.louvain(self.iris) + # First 20 iris belong to one cluster + self.assertEqual(np.ndarray, type(c)) + self.assertEqual(len(self.iris), len(c)) + self.assertEqual(1, len(set(c[:20].ravel()))) + + def test_louvain_parameters(self): + louvain = Louvain( + k_neighbors=3, resolution=1.2, random_state=42, metric="l2") + c = louvain(self.iris) + self.assertEqual(np.ndarray, type(c)) + self.assertEqual(len(self.iris), len(c)) + + def 
test_predict_table(self): + c = self.louvain(self.iris) + self.assertEqual(np.ndarray, type(c)) + self.assertEqual(len(self.iris), len(c)) + + def test_predict_numpy(self): + c = self.louvain.fit(self.iris.X) + self.assertEqual(ClusteringModel, type(c)) + self.assertEqual(np.ndarray, type(c.labels)) + self.assertEqual(len(self.iris), len(c.labels)) + + def test_predict_sparse_csc(self): + self.iris.X = csc_matrix(self.iris.X[::5]) + c = self.louvain(self.iris) + self.assertEqual(np.ndarray, type(c)) + self.assertEqual(len(self.iris), len(c)) + + def test_predict_spares_csr(self): + self.iris.X = csr_matrix(self.iris.X[::5]) + c = self.louvain(self.iris) + self.assertEqual(np.ndarray, type(c)) + self.assertEqual(len(self.iris), len(c)) + + def test_model(self): + c = self.louvain.get_model(self.iris) + self.assertEqual(ClusteringModel, type(c)) + self.assertEqual(len(self.iris), len(c.labels)) + + self.assertRaises(NotImplementedError, c, self.iris) + + def test_model_np(self): + """ + Test with numpy array as an input in model. + """ + c = self.louvain.get_model(self.iris) + self.assertRaises(NotImplementedError, c, self.iris.X) + + def test_model_sparse(self): + """ + Test with sparse array as an input in model. + """ + c = self.louvain.get_model(self.iris) + self.assertRaises(NotImplementedError, c, csr_matrix(self.iris.X)) + + def test_model_instance(self): + """ + Test with instance as an input in model. + """ + c = self.louvain.get_model(self.iris) + self.assertRaises(NotImplementedError, c, self.iris[0]) + + def test_model_list(self): + """ + Test with list as an input in model. + """ + c = self.louvain.get_model(self.iris) + self.assertRaises(NotImplementedError, c, self.iris.X.tolist()) + + def test_graph(self): + """ + Louvain accepts graphs too. 
+ :return: + """ + graph = matrix_to_knn_graph(self.iris.X, 30, "l2") + self.assertIsNotNone(graph) + self.assertEqual(networkx.Graph, type(graph), 1) + + # basic clustering - get clusters + c = self.louvain(graph) + # First 20 iris belong to one cluster + self.assertEqual(np.ndarray, type(c)) + self.assertEqual(len(self.iris), len(c)) + self.assertEqual(1, len(set(c[:20].ravel()))) + + # clustering - get model + c = self.louvain.get_model(graph) + # First 20 iris belong to one cluster + self.assertEqual(ClusteringModel, type(c)) + self.assertEqual(len(self.iris), len(c.labels)) + + def test_model_bad_datatype(self): + """ + Check model with data-type that is not supported. + """ + c = self.louvain.get_model(self.iris) + self.assertRaises(TypeError, c, 10) diff --git a/Orange/tests/test_louvain.py b/Orange/tests/test_louvain.py deleted file mode 100644 index 2a012889482..00000000000 --- a/Orange/tests/test_louvain.py +++ /dev/null @@ -1,26 +0,0 @@ -# Test methods with long descriptive names can omit docstrings -# pylint: disable=missing-docstring - -import unittest -import numpy as np - -from Orange.data import Table -from Orange.clustering.louvain import Louvain - - -class TestSVMLearner(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.data = Table('iris') - cls.louvain = Louvain() - - def test_orange_table(self): - self.assertIsNone(self.louvain.fit(self.data)) - clusters = self.louvain.fit_predict(self.data) - self.assertIn(type(clusters), [list, np.ndarray]) - - def test_np_array(self): - data_np = self.data.X - self.assertIsNone(self.louvain.fit(data_np)) - clusters = self.louvain.fit_predict(data_np) - self.assertIn(type(clusters), [list, np.ndarray]) diff --git a/Orange/widgets/unsupervised/owkmeans.py b/Orange/widgets/unsupervised/owkmeans.py index 185b184eb48..0a44de35280 100644 --- a/Orange/widgets/unsupervised/owkmeans.py +++ b/Orange/widgets/unsupervised/owkmeans.py @@ -6,9 +6,10 @@ pyqtSlot as Slot from AnyQt.QtGui import QIntValidator 
from AnyQt.QtWidgets import QGridLayout, QTableView +from sklearn.metrics import silhouette_samples, silhouette_score from Orange.clustering import KMeans -from Orange.clustering.kmeans import KMeansModel, SILHOUETTE_MAX_SAMPLES +from Orange.clustering.kmeans import KMeansModel from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable from Orange.data.util import get_unique_names, array_equal from Orange.preprocess.impute import ReplaceUnknowns @@ -23,6 +24,7 @@ RANDOM_STATE = 0 +SILHOUETTE_MAX_SAMPLES = 5000 class ClusterTableModel(QAbstractTableModel): @@ -268,15 +270,15 @@ def has_attributes(self): return len(self.data.domain.attributes) @staticmethod - def _compute_clustering(data, k, init, n_init, max_iter, silhouette, random_state): + def _compute_clustering(data, k, init, n_init, max_iter, random_state): # type: (Table, int, str, int, int, bool) -> KMeansModel if k > len(data): raise NotEnoughData() return KMeans( n_clusters=k, init=init, n_init=n_init, max_iter=max_iter, - compute_silhouette_score=silhouette, random_state=random_state, - )(data) + random_state=random_state + ).get_model(data) @Slot(int, int) def __progress_changed(self, n, d): @@ -336,7 +338,6 @@ def __launch_tasks(self, ks): init=self.INIT_METHODS[self.smart_init][1], n_init=self.n_init, max_iter=self.max_iterations, - silhouette=True, random_state=RANDOM_STATE, ) for k in ks] watcher = FutureSetWatcher(futures) @@ -432,10 +433,9 @@ def invalidate(self): self.commit() def update_results(self): - scores = [ - mk if isinstance(mk, str) else mk.silhouette for mk in ( - self.clusterings[k] for k in range(self.k_from, self.k_to + 1)) - ] + scores = [mk if isinstance(mk, str) else silhouette_score( + self.data.X, mk.labels) for mk in ( + self.clusterings[k] for k in range(self.k_from, self.k_to + 1))] best_row = max( range(len(scores)), default=0, key=lambda x: 0 if isinstance(scores[x], str) else scores[x] @@ -454,6 +454,16 @@ def selected_row(self): def select_row(self): 
self.send_data() + def preproces(self, data): + for preprocessor in KMeans.preprocessors: # use the same preprocessors as KMeans + data = preprocessor(data) + return data + + def samples_scores(self, clust_ids): + d = self.preproces(self.data) + return np.arctan( + silhouette_samples(d.X, clust_ids)) / np.pi + 0.5 + def send_data(self): if self.optimize_k: row = self.selected_row() @@ -472,16 +482,15 @@ def send_data(self): get_unique_names(domain, "Cluster"), values=["C%d" % (x + 1) for x in range(km.k)] ) - clust_ids = km(self.data) - clust_col = clust_ids.X.ravel() + clust_ids = km.labels silhouette_var = ContinuousVariable( get_unique_names(domain, "Silhouette")) - if km.silhouette_samples is not None: + if len(self.data) <= SILHOUETTE_MAX_SAMPLES: self.Warning.no_silhouettes.clear() - scores = np.arctan(km.silhouette_samples) / np.pi + 0.5 + scores = self.samples_scores(clust_ids) clust_scores = [] for i in range(km.k): - in_clust = clust_col == i + in_clust = clust_ids == i if in_clust.any(): clust_scores.append(np.mean(scores[in_clust])) else: @@ -494,7 +503,7 @@ def send_data(self): new_domain = add_columns(domain, metas=[cluster_var, silhouette_var]) new_table = self.data.transform(new_domain) - new_table.get_column_view(cluster_var)[0][:] = clust_col + new_table.get_column_view(cluster_var)[0][:] = clust_ids new_table.get_column_view(silhouette_var)[0][:] = scores centroid_attributes = [ @@ -502,7 +511,7 @@ def send_data(self): if isinstance(attr.compute_value, ReplaceUnknowns) and attr.compute_value.variable in domain.attributes else attr - for attr in km.pre_domain.attributes] + for attr in km.domain.attributes] centroid_domain = add_columns( Domain(centroid_attributes, [], domain.metas), metas=[cluster_var, silhouette_var]) diff --git a/Orange/widgets/unsupervised/owlouvainclustering.py b/Orange/widgets/unsupervised/owlouvainclustering.py index a4d27ddc4b1..56432544b5a 100644 --- a/Orange/widgets/unsupervised/owlouvainclustering.py +++
b/Orange/widgets/unsupervised/owlouvainclustering.py @@ -14,7 +14,7 @@ ) from AnyQt.QtWidgets import QSlider, QCheckBox, QWidget, QLabel -from Orange.clustering.louvain import table_to_knn_graph, Louvain +from Orange.clustering.louvain import matrix_to_knn_graph, Louvain from Orange.data import Table, DiscreteVariable from Orange.data.util import get_unique_names, array_equal from Orange import preprocess @@ -623,10 +623,9 @@ def pcallback(val): raise InteruptRequested() try: - res.graph = graph = table_to_knn_graph( - data, k_neighbors=k_neighbors, metric=metric, - progress_callback=pcallback - ) + res.graph = graph = matrix_to_knn_graph( + data.X, k_neighbors=k_neighbors, metric=metric, + progress_callback=pcallback) except InteruptRequested: return res @@ -638,7 +637,7 @@ def pcallback(val): if state.is_interuption_requested(): return res - res.partition = louvain.fit_predict(graph) + res.partition = louvain(graph) state.set_partial_results(("partition", res.partition)) return res @@ -654,7 +653,7 @@ def run_on_graph(graph, resolution, state): state.set_status("Detecting communities...") if state.is_interuption_requested(): return res - partition = louvain.fit_predict(graph) + partition = louvain(graph) res.partition = partition state.set_partial_results(("partition", res.partition)) return res diff --git a/Orange/widgets/unsupervised/tests/test_owkmeans.py b/Orange/widgets/unsupervised/tests/test_owkmeans.py index 988daf5c9a8..f81fed3fc04 100644 --- a/Orange/widgets/unsupervised/tests/test_owkmeans.py +++ b/Orange/widgets/unsupervised/tests/test_owkmeans.py @@ -5,6 +5,7 @@ import numpy as np from AnyQt.QtCore import Qt from AnyQt.QtWidgets import QRadioButton +from sklearn.metrics import silhouette_score import Orange.clustering from Orange.data import Table, Domain @@ -197,26 +198,22 @@ def test_data_on_output(self): # removing data should have cleared the output self.assertEqual(self.widget.data, None) - @patch("Orange.clustering.kmeans.KMeansModel.__call__") 
- def test_centroids_on_output(self, km_call): - ret = km_call.return_value = Mock() - ret.X = np.array([0] * 50 + [1] * 100) - ret.silhouette_samples = np.arange(150) / 150 - + def test_centroids_on_output(self): widget = self.widget widget.optimize_k = False widget.k = 4 self.send_signal(widget.Inputs.data, self.iris) self.commit_and_wait() + widget.clusterings[widget.k].labels = np.array([0] * 50 + [1] * 100).flatten() - widget.clusterings[4].silhouette_samples = np.arange(150) / 150 + widget.samples_scores = lambda x: np.arctan( + np.arange(150) / 150) / np.pi + 0.5 widget.send_data() out = self.get_output(widget.Outputs.centroids) - np.testing.assert_almost_equal( - out.metas, - [[0, np.mean(np.arctan(np.arange(50) / 150)) / np.pi + 0.5], - [1, np.mean(np.arctan(np.arange(50, 150) / 150)) / np.pi + 0.5], - [2, 0], [3, 0]]) + np.testing.assert_array_almost_equal( + np.array([[0, np.mean(np.arctan(np.arange(50) / 150)) / np.pi + 0.5], + [1, np.mean(np.arctan(np.arange(50, 150) / 150)) / np.pi + 0.5], + [2, 0], [3, 0]]), out.metas.astype(float)) self.assertEqual(out.name, "iris centroids") def test_centroids_domain_on_output(self): @@ -262,12 +259,14 @@ def test_optimization_fails(self): self.KMeansFail.fail_on = {3, 5, 7} model = widget.table_view.model() - with patch.object(model, "set_scores", wraps=model.set_scores) as set_scores: + with patch.object( + model, "set_scores", wraps=model.set_scores) as set_scores: self.send_signal(self.widget.Inputs.data, self.iris, wait=5000) scores, start_k = set_scores.call_args[0] self.assertEqual( scores, - [km if isinstance(km, str) else km.silhouette + [km if isinstance(km, str) else silhouette_score( + self.iris.X, km(self.iris)) for km in (widget.clusterings[k] for k in range(3, 9))] ) self.assertEqual(start_k, 3) @@ -312,15 +311,14 @@ def test_run_fails(self): self.assertIsNotNone(self.get_output(self.widget.Outputs.annotated_data)) def test_select_best_row(self): - class Cluster: - def __init__(self, n): - 
self.silhouette = n - widget = self.widget widget.k_from, widget.k_to = 2, 6 - widget.clusterings = {k: Cluster(5 - (k - 4) ** 2) for k in range(2, 7)} + widget.optimize_k = True + self.send_signal(self.widget.Inputs.data, Table("housing"), wait=5000) + self.commit_and_wait() widget.update_results() - self.assertEqual(widget.selected_row(), 2) + # for housing dataset best selection is 3 clusters, so row no. 1 + self.assertEqual(widget.selected_row(), 1) widget.clusterings = {k: "error" for k in range(2, 7)} widget.update_results() @@ -394,7 +392,9 @@ def test_silhouette_column(self): # Avoid randomness in the test random = np.random.RandomState(0) # pylint: disable=no-member table = Table(random.rand(110, 2)) - with patch("Orange.clustering.kmeans.SILHOUETTE_MAX_SAMPLES", 100): + with patch( + "Orange.widgets.unsupervised.owkmeans.SILHOUETTE_MAX_SAMPLES", + 100): self.send_signal(self.widget.Inputs.data, table) outtable = self.get_output(widget.Outputs.annotated_data) outtable = outtable.get_column_view("Silhouette")[0] diff --git a/Orange/widgets/visualize/owheatmap.py b/Orange/widgets/visualize/owheatmap.py index 6c36e25b803..40f1a5dbef3 100644 --- a/Orange/widgets/visualize/owheatmap.py +++ b/Orange/widgets/visualize/owheatmap.py @@ -114,7 +114,7 @@ def barycenter(a, axis=0): def kmeans_compress(X, k=50): km = kmeans.KMeans(n_clusters=k, n_init=5, random_state=42) - return km(X) + return km.get_model(X) def candidate_split_labels(data): @@ -866,8 +866,8 @@ def construct_heatmaps(self, data, split_label=None): self.input_data.domain.metas)) nclust = min(self.merge_kmeans_k, len(effective_data) - 1) self.kmeans_model = kmeans_compress(effective_data, k=nclust) - effective_data.domain = self.kmeans_model.pre_domain - merge_indices = [np.flatnonzero(self.kmeans_model.labels_ == ind) + effective_data.domain = self.kmeans_model.domain + merge_indices = [np.flatnonzero(self.kmeans_model.labels == ind) for ind in range(nclust)] not_empty_indices = [i for i, x in 
enumerate(merge_indices) if len(x) > 0]