Skip to content

Commit

Permalink
Merge pull request #3917 from PrimozGodec/add-dbscan
Browse files Browse the repository at this point in the history
[ENH] DBSCAN widget
  • Loading branch information
ajdapretnar authored Aug 2, 2019
2 parents ea80339 + 8d1e4c3 commit eb2c9a9
Show file tree
Hide file tree
Showing 10 changed files with 920 additions and 82 deletions.
53 changes: 53 additions & 0 deletions Orange/widgets/unsupervised/icons/DBSCAN.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
263 changes: 263 additions & 0 deletions Orange/widgets/unsupervised/owdbscan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
import sys

import numpy as np
from scipy import spatial
from AnyQt.QtWidgets import QApplication
from AnyQt.QtCore import Qt
from AnyQt.QtGui import QColor
from sklearn.metrics import pairwise_distances

from Orange.preprocess import Normalize, Continuize, SklImpute
from Orange.widgets import widget, gui
from Orange.widgets.utils.slidergraph import SliderGraph
from Orange.widgets.settings import Setting
from Orange.data import Table, Domain, DiscreteVariable
from Orange.clustering import DBSCAN
from Orange.widgets.utils.annotated_data import ANNOTATED_DATA_SIGNAL_NAME
from Orange.widgets.utils.signals import Input, Output
from Orange.widgets.widget import Msg


DEFAULT_CUT_POINT = 0.1
PREPROCESSORS = [Continuize(), Normalize(), SklImpute()]
EPS_BOTTOM_LIMIT = 0.01


def get_kth_distances(data, metric, k=5):
"""
The function computes the epsilon parameter for DBSCAN through method
proposed in the paper.
Parameters
----------
data : Orange.data.Table
Visualisation coordinates - embeddings
metric : callable or str
The metric to compute the distance.
k : int
Number kth observed neighbour
Returns
-------
np.ndarray
Epsilon parameter for DBSCAN
"""
x = data.X
if x.shape[0] > 1000: # subsample
x = x[np.random.randint(x.shape[0], size=1000), :]

dist = pairwise_distances(x, metric=metric)
k = min(k+1, len(data) - 1) # k+1 since first one is item itself
kth_point = np.argpartition(dist, k, axis=1)[:, k]
kth_dist = np.sort(dist[np.arange(0, len(kth_point)), kth_point])[::-1]

return kth_dist


class OWDBSCAN(widget.OWWidget):
name = "DBSCAN"
description = "Density-based spatial clustering."
icon = "icons/DBSCAN.svg"
priority = 2150

class Inputs:
data = Input("Data", Table)

class Outputs:
annotated_data = Output(ANNOTATED_DATA_SIGNAL_NAME, Table)

class Error(widget.OWWidget.Error):
not_enough_instances = Msg("Not enough unique data instances. "
"At least two are required.")

METRICS = [
("Euclidean", "euclidean"),
("Manhattan", "cityblock"),
("Cosine", "cosine")
]

min_samples = Setting(4)
eps = Setting(0.5)
metric_idx = Setting(0)
auto_commit = Setting(True)
k_distances = None
cut_point = None

def __init__(self):
super().__init__()

self.data = None
self.data_normalized = None
self.db = None
self.model = None

box = gui.widgetBox(self.controlArea, "Parameters")
gui.spin(box, self, "min_samples", 1, 100, 1,
callback=self._min_samples_changed,
label="Core point neighbors")
gui.doubleSpin(box, self, "eps", EPS_BOTTOM_LIMIT, 1000, 0.01,
callback=self._eps_changed,
label="Neighborhood distance")

box = gui.widgetBox(self.controlArea, self.tr("Distance Metric"))
gui.comboBox(box, self, "metric_idx",
items=list(zip(*self.METRICS))[0],
callback=self._metirc_changed)

gui.auto_commit(self.controlArea, self, "auto_commit", "Apply",
orientation=Qt.Horizontal)
gui.rubber(self.controlArea)

self.controlArea.layout().addStretch()

self.plot = SliderGraph(
x_axis_label="Data items sorted by score",
y_axis_label="Distance to the k-th nearest neighbour",
background="w", callback=self._on_cut_changed
)

self.mainArea.layout().addWidget(self.plot)

def check_data_size(self, data):
if data is None:
return False
if len(data) < 2:
self.Error.not_enough_instances()
return False
return True

def commit(self):
self.cluster()

def cluster(self):
if not self.check_data_size(self.data):
return
self.model = DBSCAN(
eps=self.eps,
min_samples=self.min_samples,
metric=self.METRICS[self.metric_idx][1]
).get_model(self.data_normalized)
self.send_data()

def _compute_and_plot(self, cut_point=None):
self._compute_kdistances()
if cut_point is None:
self._compute_cut_point()
self._plot_graph()

def _plot_graph(self):
nonzero = np.sum(self.k_distances > EPS_BOTTOM_LIMIT)
self.plot.update(np.arange(len(self.k_distances)),
[self.k_distances],
colors=[QColor('red')],
cutpoint_x=self.cut_point,
selection_limit=(0, nonzero - 1))

def _compute_kdistances(self):
self.k_distances = get_kth_distances(
self.data_normalized, metric=self.METRICS[self.metric_idx][1],
k=self.min_samples
)

def _compute_cut_point(self):
self.cut_point = int(DEFAULT_CUT_POINT * len(self.k_distances))
self.eps = self.k_distances[self.cut_point]

if self.eps < EPS_BOTTOM_LIMIT:
self.eps = np.min(
self.k_distances[self.k_distances >= EPS_BOTTOM_LIMIT])
self.cut_point = self._find_nearest_dist(self.eps)

@Inputs.data
def set_data(self, data):
self.Error.clear()
if not self.check_data_size(data):
data = None
self.data = self.data_normalized = data
if self.data is None:
self.Outputs.annotated_data.send(None)
self.plot.clear_plot()
return

if self.data is None:
return

# preprocess data
for pp in PREPROCESSORS:
self.data_normalized = pp(self.data_normalized)

self._compute_and_plot()
self.unconditional_commit()

def send_data(self):
model = self.model

clusters = [c if c >= 0 else np.nan for c in model.labels]
k = len(set(clusters) - {np.nan})
clusters = np.array(clusters).reshape(len(self.data), 1)
core_samples = set(model.projector.core_sample_indices_)
in_core = np.array([1 if (i in core_samples) else 0
for i in range(len(self.data))])
in_core = in_core.reshape(len(self.data), 1)

clust_var = DiscreteVariable(
"Cluster", values=["C%d" % (x + 1) for x in range(k)])
in_core_var = DiscreteVariable("DBSCAN Core", values=["0", "1"])

domain = self.data.domain
attributes, classes = domain.attributes, domain.class_vars
meta_attrs = domain.metas
x, y, metas = self.data.X, self.data.Y, self.data.metas

meta_attrs += (clust_var, )
metas = np.hstack((metas, clusters))
meta_attrs += (in_core_var, )
metas = np.hstack((metas, in_core))

domain = Domain(attributes, classes, meta_attrs)
new_table = Table(domain, x, y, metas, self.data.W)

self.Outputs.annotated_data.send(new_table)

def _invalidate(self):
self.commit()

def _find_nearest_dist(self, value):
array = np.asarray(self.k_distances)
idx = (np.abs(array - value)).argmin()
return idx

def _eps_changed(self):
# find the closest value to eps
if self.data is None:
return
self.cut_point = self._find_nearest_dist(self.eps)
self.plot.set_cut_point(self.cut_point)
self._invalidate()

def _metirc_changed(self):
if self.data is not None:
self._compute_and_plot()
self._invalidate()

def _on_cut_changed(self, value):
# cut changed by means of a cut line over the scree plot.
self.cut_point = value
self.eps = self.k_distances[value]

self.commit()

def _min_samples_changed(self):
if self.data is None:
return
self._compute_and_plot(cut_point=self.cut_point)
self._invalidate()


if __name__ == "__main__":
a = QApplication(sys.argv)
ow = OWDBSCAN()
d = Table("iris.tab")
ow.set_data(d)
ow.show()
a.exec()
ow.saveSettings()
Loading

0 comments on commit eb2c9a9

Please sign in to comment.