picnicml · Wei-1 · Sep 12, 2019 · Sep 12, 2019 · Sep 15, 2019 · Sep 18, 2019
diff --git a/src/main/scala/io/picnicml/doddlemodel/cluster/DBSCAN.scala b/src/main/scala/io/picnicml/doddlemodel/cluster/DBSCAN.scala
@@ -0,0 +1,89 @@
+package io.picnicml.doddlemodel.cluster
+
+import breeze.linalg.functions.euclideanDistance
+import cats.syntax.option._
+import io.picnicml.doddlemodel.data.Features
+import io.picnicml.doddlemodel.typeclasses.Clusterer
+
+/** An immutable DBSCAN model; fitting yields a new instance that carries the labels.
+  *
+  * Examples:
+  * val model = DBSCAN()
+  * val model = DBSCAN(eps = 1.5)
+  * val model = DBSCAN(minSamples = 3)
+  * val model = DBSCAN(eps = 2.0, minSamples = 3)
+  *
+  * @param eps the largest Euclidean distance (inclusive) at which two points are neighbors
+  * @param minSamples the minimum number of points, itself included, around a core point
+  */
+case class DBSCAN private(eps: Double, minSamples: Int, private val labels: Option[Array[Int]])
+
+object DBSCAN {
+
+  /** Label of points that end up in no cluster. */
+  val NOISE: Int = -1
+  /** Placeholder label of points that fitting has not reached yet. */
+  val UNASSIGNED: Int = Int.MaxValue
+
+  /** Builds an unfitted model; `eps` and `minSamples` must both be positive. */
+  def apply(eps: Double = 1.0, minSamples: Int = 1): DBSCAN = {
+    require(eps > 0.0, "Maximum distance needs to be larger than 0")
+    require(minSamples > 0, "Minimum number of samples needs to be larger than 0")
+    DBSCAN(eps, minSamples, none)
+  }
+
+  implicit lazy val ev: Clusterer[DBSCAN] = new Clusterer[DBSCAN] {
+    override def isFitted(model: DBSCAN): Boolean = model.labels.nonEmpty
+
+    override protected def labelSafe(model: DBSCAN): Array[Int] = model.labels.get
+
+    override protected def copy(model: DBSCAN): DBSCAN = model.copy()
+
+    /** Labels each row of `x` with a cluster id (0, 1, ... by discovery order) or `NOISE`. */
+    override protected def fitSafe(model: DBSCAN, x: Features): DBSCAN = {
+      val n = x.rows
+      // distance of every unordered pair of rows, keyed by (smaller index, larger index)
+      val pairDistances = (for {
+        lo <- 0 until n - 1
+        hi <- lo + 1 until n
+      } yield (lo, hi) -> euclideanDistance(x(lo, ::).t, x(hi, ::).t)).toMap
+      val assigned = Array.fill[Int](n)(UNASSIGNED)
+      def neighborsOf(i: Int): Set[Int] = findNeighbors(i, pairDistances, n, model.eps)
+      // a point is a core point when it and its neighbors number at least minSamples
+      def isCore(neighbors: Set[Int]): Boolean = neighbors.size + 1 >= model.minSamples
+      // grows one cluster wave by wave until a wave contributes no further candidates
+      @scala.annotation.tailrec
+      def expand(frontier: Set[Int], clusterId: Int): Unit =
+        if (frontier.nonEmpty) {
+          val nextWave = frontier.foldLeft(Set.empty[Int]) { (wave, i) =>
+            val unseen = assigned(i) == UNASSIGNED
+            // outliers are absorbed as border points; only unseen points may widen the wave
+            if (unseen || assigned(i) == NOISE) assigned(i) = clusterId
+            val around = if (unseen) neighborsOf(i) else Set.empty[Int]
+            if (unseen && isCore(around)) wave ++ around else wave
+          }
+          expand(nextWave, clusterId)
+        }
+
+      var nextClusterId = 0
+      (0 until n).foreach { p =>
+        if (assigned(p) == UNASSIGNED) {
+          val seeds = neighborsOf(p)
+          if (isCore(seeds)) {
+            assigned(p) = nextClusterId
+            expand(seeds, nextClusterId)
+            nextClusterId += 1
+          } else assigned(p) = NOISE
+        }
+      }
+      model.copy(labels = assigned.some)
+    }
+
+    /** Indices of all points other than `pointId` that lie within `eps` of it. */
+    private def findNeighbors(pointId: Int, distanceMap: Map[(Int, Int), Double], xSize: Int,
+                              eps: Double): Set[Int] =
+      (0 until xSize).iterator.filter { other =>
+        other != pointId && distanceMap((other min pointId, other max pointId)) <= eps
+      }.toSet
+  }
+}
diff --git a/src/main/scala/io/picnicml/doddlemodel/typeclasses/Clusterer.scala b/src/main/scala/io/picnicml/doddlemodel/typeclasses/Clusterer.scala
@@ -0,0 +1,33 @@
+package io.picnicml.doddlemodel.typeclasses
+
+import io.picnicml.doddlemodel.data.Features
+
+/** Typeclass for clustering estimators: they are fitted on features only and expose labels. */
+trait Clusterer[A] extends Estimator[A] {
+
+  /** Fits a copy of `model` to `x`; throws IllegalArgumentException if `model` is fitted. */
+  def fit(model: A, x: Features): A = {
+    require(!isFitted(model), "Called fit on a model that is already fitted")
+    fitSafe(copy(model), x)
+  }
+
+  /** Fits a copy of `model` to `x` (same precondition as `fit`) and returns its labels. */
+  def fitPredict(model: A, x: Features): Array[Int] = labelSafe(fit(model, x))
+
+  /** Returns the labels of `model`; throws IllegalArgumentException if it is not fitted. */
+  def labels(model: A): Array[Int] = {
+    require(isFitted(model), "Request labels on a model that is not fitted yet")
+    labelSafe(model)
+  }
+
+  /** A function that creates an identical clusterer. */
+  protected def copy(model: A): A
+
+  /** A function that is guaranteed to be called on a fitted model. */
+  protected def labelSafe(model: A): Array[Int]
+
+  /**
+    * A function that fits the model to the features; the object is guaranteed not to be fitted.
+    */
+  protected def fitSafe(model: A, x: Features): A
+}
diff --git a/src/test/scala/io/picnicml/doddlemodel/cluster/DBSCANTest.scala b/src/test/scala/io/picnicml/doddlemodel/cluster/DBSCANTest.scala
@@ -0,0 +1,71 @@
+package io.picnicml.doddlemodel.cluster
+
+import breeze.linalg.DenseMatrix
+import io.picnicml.doddlemodel.TestingUtils
+import io.picnicml.doddlemodel.cluster.DBSCAN.ev
+import org.scalactic.{Equality, TolerantNumerics}
+import org.scalatest.{FlatSpec, Matchers}
+
+/** Checks the DBSCAN clusterer on small data sets whose labels can be verified by hand. */
+class DBSCANTest extends FlatSpec with Matchers with TestingUtils {
+
+  implicit val doubleTolerance: Equality[Double] = TolerantNumerics.tolerantDoubleEquality(1e-4)
+
+  // two groups of three points each, one around (1, 1) and one around (8, 1)
+  private val twoBlobs = DenseMatrix(
+    (1.0, 1.0),
+    (0.0, 2.0),
+    (2.0, 0.0),
+    (8.0, 1.0),
+    (7.0, 2.0),
+    (9.0, 0.0)
+  )
+
+  "DBSCAN" should "calculate the label of the data points" in {
+    val dbscan = DBSCAN(eps = 3.0, minSamples = 1)
+    val expected = Array(0, 0, 0, 1, 1, 1)
+    ev.fitPredict(dbscan, twoBlobs) should equal (expected)
+    ev.labels(ev.fit(dbscan, twoBlobs)) should equal (expected)
+  }
+
+  it should "cluster one data point to one group when eps is too small" in {
+    // every pair is at least sqrt(2) apart, which exceeds the default eps of 1.0
+    ev.fitPredict(DBSCAN(), twoBlobs) should equal (Array(0, 1, 2, 3, 4, 5))
+  }
+
+  it should "cluster all data points to one group when eps is too large" in {
+    val dbscan = DBSCAN(eps = 10.0)
+    ev.fitPredict(dbscan, twoBlobs) should equal (Array(0, 0, 0, 0, 0, 0))
+  }
+
+  it should "cluster all points to outliers when min samples is too large" in {
+    // only six points exist, so no neighborhood can ever reach seven samples
+    ev.fitPredict(DBSCAN(minSamples = 7), twoBlobs) should equal (Array(-1, -1, -1, -1, -1, -1))
+  }
+
+  it should "cluster all data points to one group when eps is equal to the distance among points" in {
+    val pair = DenseMatrix((0.0, 0.0), (3.0, 0.0))
+    val dbscan = DBSCAN(eps = 3.0)
+    ev.fitPredict(dbscan, pair) should equal (Array(0, 0))
+  }
+
+  it should "cluster all data points to one group in an 1D array of points that match min sample size" in {
+    // collinear points spaced 3.0 apart: both end points only join as border points
+    val chain = DenseMatrix(
+      (0.0, 12.0),
+      (0.0, 9.0),
+      (0.0, 6.0),
+      (0.0, 3.0),
+      (0.0, 0.0)
+    )
+    ev.fitPredict(DBSCAN(eps = 3.0, minSamples = 3), chain) should equal (Array(0, 0, 0, 0, 0))
+  }
+
+  it should "prevent the usage of negative eps" in {
+    an [IllegalArgumentException] should be thrownBy { DBSCAN(eps = -0.5) }
+  }
+
+  it should "prevent the usage of negative min samples" in {
+    an [IllegalArgumentException] should be thrownBy { DBSCAN(minSamples = -1) }
+  }
+}