picnicml · Wei-1 · Sep 12, 2019 · Sep 12, 2019 · Sep 15, 2019 · Sep 18, 2019
diff --git a/src/main/scala/io/picnicml/doddlemodel/cluster/DBSCAN.scala b/src/main/scala/io/picnicml/doddlemodel/cluster/DBSCAN.scala
@@ -0,0 +1,77 @@
+package io.picnicml.doddlemodel.cluster
+
+import breeze.linalg.functions.euclideanDistance
+import cats.syntax.option._
+import io.picnicml.doddlemodel.data.Features
+import io.picnicml.doddlemodel.typeclasses.Clusterer
+
+/** An immutable DBSCAN model (label is empty until the model is fitted).
+  *
+  * @param eps: the maximum distance at which two points are still considered neighbors
+  * @param minSamples: the minimum number of points within eps, the point itself included, needed to start a group
+  *
+  * Examples:
+  * val model = DBSCAN()
+  * val model = DBSCAN(eps = 1.5)
+  * val model = DBSCAN(minSamples = 3)
+  * val model = DBSCAN(eps = 2.0, minSamples = 3)
+  */
+case class DBSCAN private(eps: Double, minSamples: Int, private val label: Option[Array[Int]])
+
+object DBSCAN {
+  /** Creates an unfitted model; throws IllegalArgumentException unless eps > 0 and minSamples > 0. */
+  def apply(eps: Double = 1.0, minSamples: Int = 1): DBSCAN = {
+    require(eps > 0.0, "Maximum distance needs to be larger than 0")
+    require(minSamples > 0, "Minimum number of samples needs to be larger than 0")
+    DBSCAN(eps, minSamples, none)
+  }
+
+  implicit lazy val ev: Clusterer[DBSCAN] = new Clusterer[DBSCAN] {
+
+    override def isFitted(model: DBSCAN): Boolean = model.label.isDefined
+
+    override protected def labelSafe(model: DBSCAN): Array[Int] = model.label.get // defined once fitted
+
+    override protected def copy(model: DBSCAN): DBSCAN = model.copy()
+
+    override protected def copy(model: DBSCAN, label: Array[Int]): DBSCAN = model.copy(label = label.some)
+
+    /** Assigns each row of x a group id (0, 1, ...) or -1 (outlier); Int.MaxValue marks unvisited rows. */
+    override protected def fitSafe(model: DBSCAN, x: Features): DBSCAN = {
+      val label = Array.fill[Int](x.rows)(Int.MaxValue)
+      var groupId = -1
+      for (pointId <- 0 until x.rows if label(pointId) == Int.MaxValue) {
+        var groupQueue = findNeighbors(pointId, x, model.eps)
+        if (groupQueue.size + 1 < model.minSamples) {
+          label(pointId) = -1
+        } else {
+          groupId += 1
+          label(pointId) = groupId
+          while (groupQueue.nonEmpty) {
+            val tmpGroupQueue = groupQueue
+            groupQueue = Set[Int]()
+            tmpGroupQueue.foreach { i =>
+              if (label(i) == -1) label(i) = groupId // former outlier joins the group as a border point
+              else if (label(i) == Int.MaxValue) {
+                label(i) = groupId
+                val neighbors = findNeighbors(i, x, model.eps)
+                // only core points (at least minSamples points within eps) extend the group further
+                if (neighbors.size + 1 >= model.minSamples) groupQueue ++= neighbors
+              }
+            }
+          }
+        }
+      }
+      copy(model, label)
+    }
+
+    override protected def fitPredictSafe(model: DBSCAN, x: Features): Array[Int] =
+      labelSafe(fitSafe(model, x))
+
+    /** Indices of all rows other than pointId whose Euclidean distance to row pointId is at most eps. */
+    private def findNeighbors(pointId: Int, x: Features, eps: Double): Set[Int] = {
+      val point = x(pointId, ::).t
+      (0 until x.rows).filter(i => i != pointId && euclideanDistance(x(i, ::).t, point) <= eps).toSet
+    }
+  }
+}
diff --git a/src/main/scala/io/picnicml/doddlemodel/typeclasses/Clusterer.scala b/src/main/scala/io/picnicml/doddlemodel/typeclasses/Clusterer.scala
@@ -0,0 +1,35 @@
+package io.picnicml.doddlemodel.typeclasses
+
+import io.picnicml.doddlemodel.data.Features
+
+trait Clusterer[A] extends Estimator[A] {
+
+  /** Fits a copy of the given model to x; the model must not be fitted yet. */
+  def fit(model: A, x: Features): A = fitSafe(copy(checkedNotFitted(model)), x)
+
+  /** Fits a copy of the given model to x and returns its labels; the model must not be fitted yet. */
+  def fitPredict(model: A, x: Features): Array[Int] = fitPredictSafe(copy(checkedNotFitted(model)), x)
+
+  /** Returns the labels of the given model; the model must already be fitted. */
+  def label(model: A): Array[Int] = {
+    require(isFitted(model), "Request label on a model that is not fitted yet")
+    labelSafe(model)
+  }
+
+  /** Hands the model back unchanged after verifying that it is not fitted. */
+  private def checkedNotFitted(model: A): A = {
+    require(!isFitted(model), "Called fit on a model that is already fitted")
+    model
+  }
+
+  /** Functions that create an identical clusterer; the second one is additionally given a label array. */
+  protected def copy(model: A): A
+  protected def copy(model: A, label: Array[Int]): A
+
+  /** A function that is only called (by label) on a fitted model. */
+  protected def labelSafe(model: A): Array[Int]
+
+  /** Functions that are only called on a copy of a model that was checked not to be fitted. */
+  protected def fitSafe(model: A, x: Features): A
+  protected def fitPredictSafe(model: A, x: Features): Array[Int]
+}
diff --git a/src/test/scala/io/picnicml/doddlemodel/cluster/DBSCANTest.scala b/src/test/scala/io/picnicml/doddlemodel/cluster/DBSCANTest.scala
@@ -0,0 +1,50 @@
+package io.picnicml.doddlemodel.cluster
+
+import breeze.linalg.DenseMatrix
+import io.picnicml.doddlemodel.TestingUtils
+import io.picnicml.doddlemodel.cluster.DBSCAN.ev
+import org.scalactic.{Equality, TolerantNumerics}
+import org.scalatest.{FlatSpec, Matchers}
+
+/** Checks the labels DBSCAN assigns to a small two-dimensional data set and its parameter validation. */
+class DBSCANTest extends FlatSpec with Matchers with TestingUtils {
+
+  implicit val doubleTolerance: Equality[Double] = TolerantNumerics.tolerantDoubleEquality(1e-4)
+
+  // two groups of three points each: one around (1, 1), the other around (8, 1)
+  val x = DenseMatrix(
+    (1.0, 1.0), (0.0, 2.0), (2.0, 0.0),
+    (8.0, 1.0), (7.0, 2.0), (9.0, 0.0))
+
+  "DBSCAN" should "calculate the label of the data points" in {
+    val clusterer = DBSCAN(eps = 3.0, minSamples = 1)
+    val expected = Array(0, 0, 0, 1, 1, 1)
+    ev.fitPredict(clusterer, x) shouldEqual expected
+    ev.label(ev.fit(clusterer, x)) shouldEqual expected
+  }
+
+  it should "cluster one data point to one group when eps is too small" in {
+    ev.fitPredict(DBSCAN(), x) shouldEqual Array(0, 1, 2, 3, 4, 5)
+  }
+
+  it should "cluster all data points to one group when eps is too large" in {
+    ev.fitPredict(DBSCAN(eps = 10.0), x) shouldEqual Array(0, 0, 0, 0, 0, 0)
+  }
+
+  it should "cluster all points to outliers when min samples is too large" in {
+    ev.fitPredict(DBSCAN(minSamples = 7), x) shouldEqual Array(-1, -1, -1, -1, -1, -1)
+  }
+
+  it should "cluster all data points to one group when eps is equal to the distance among points" in {
+    val twoPoints = DenseMatrix((0.0, 0.0), (3.0, 0.0))
+    ev.fitPredict(DBSCAN(eps = 3.0), twoPoints) shouldEqual Array(0, 0)
+  }
+
+  it should "prevent the usage of negative eps" in {
+    an [IllegalArgumentException] should be thrownBy DBSCAN(eps = -0.5)
+  }
+
+  it should "prevent the usage of negative min samples" in {
+    an [IllegalArgumentException] should be thrownBy DBSCAN(minSamples = -1)
+  }
+}