Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clustering - DBSCAN #86

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
77 changes: 77 additions & 0 deletions src/main/scala/io/picnicml/doddlemodel/cluster/DBSCAN.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package io.picnicml.doddlemodel.cluster

import breeze.linalg.functions.euclideanDistance
import cats.syntax.option._
import io.picnicml.doddlemodel.data.Features
import io.picnicml.doddlemodel.typeclasses.Clusterer

/** An immutable DBSCAN model.
 *
 * @param eps the maximum distance at which two points are still treated as neighbors
 *            (the comparison used when fitting is inclusive, i.e. `distance <= eps`)
 * @param minSamples the minimum number of points, the point itself included, that have to lie
 *                   within `eps` of a point for that point to start a new group
 * @param label the labels computed by fitting; empty for a model that is not fitted yet
 *
 * Examples:
 * val model = DBSCAN()
 * val model = DBSCAN(eps = 1.5)
 * val model = DBSCAN(minSamples = 3)
 * val model = DBSCAN(eps = 2.0, minSamples = 3)
 */
case class DBSCAN private(eps: Double, minSamples: Int, private val label: Option[Array[Int]])
inejc marked this conversation as resolved.
Show resolved Hide resolved

object DBSCAN {

  /** Marker for a point that has not been assigned to a group or to noise yet. */
  private val Unvisited: Int = Int.MaxValue

  /** Label given to points that do not belong to any group. */
  private val Noise: Int = -1

  /** Creates an unfitted DBSCAN model.
    *
    * @param eps the maximum distance at which two points are still treated as neighbors, must be > 0
    * @param minSamples the minimum number of points (the point itself included) within `eps`
    *                   that makes a point a core point, must be > 0
    * @throws IllegalArgumentException if `eps` or `minSamples` is not positive
    */
  def apply(eps: Double = 1.0, minSamples: Int = 1): DBSCAN = {
    require(eps > 0.0, "Maximum distance needs to be larger than 0")
    require(minSamples > 0, "Minimum number of samples needs to be larger than 0")
    DBSCAN(eps, minSamples, none)
  }

  implicit lazy val ev: Clusterer[DBSCAN] = new Clusterer[DBSCAN] {

    override def isFitted(model: DBSCAN): Boolean = model.label.isDefined

    // Only reached through Clusterer.label, which checks isFitted first, so the option is defined.
    override protected def labelSafe(model: DBSCAN): Array[Int] = model.label.get

    override protected def copy(model: DBSCAN): DBSCAN =
      model.copy()

    override protected def copy(model: DBSCAN, label: Array[Int]): DBSCAN =
      model.copy(label = label.some)

    /** Assigns every row of `x` either a group id (0, 1, ...) or -1 for noise.
      *
      * Rows are visited in order; each still-unvisited core point starts a new group that is then
      * grown through every core point reachable from it.
      */
    override protected def fitSafe(model: DBSCAN, x: Features): DBSCAN = {
      val label = Array.fill[Int](x.rows)(Unvisited)
      var groupId = -1
      // The guard is evaluated lazily, so rows labelled while growing an earlier group are skipped.
      for (pointId <- 0 until x.rows if label(pointId) == Unvisited) {
        val seeds = findNeighbors(pointId, x, model.eps)
        if (isCore(seeds, model.minSamples)) {
          groupId += 1
          label(pointId) = groupId
          expandGroup(seeds, groupId, label, x, model)
        } else {
          // Provisional: the point may still be claimed as a border point of a later group.
          label(pointId) = Noise
        }
      }
      copy(model, label)
    }

    override protected def fitPredictSafe(model: DBSCAN, x: Features): Array[Int] =
      labelSafe(fitSafe(model, x))

    /** Grows the group `groupId` breadth-first, starting from `seeds`, updating `label` in place. */
    private def expandGroup(seeds: Set[Int],
                            groupId: Int,
                            label: Array[Int],
                            x: Features,
                            model: DBSCAN): Unit = {
      var frontier = seeds
      while (frontier.nonEmpty) {
        val current = frontier
        frontier = Set.empty[Int]
        current.foreach { i =>
          if (label(i) == Noise) {
            // Previously rejected as a seed, i.e. not a core point: it joins as a border point.
            label(i) = groupId
          } else if (label(i) == Unvisited) {
            label(i) = groupId
            val neighbors = findNeighbors(i, x, model.eps)
            // Only core points propagate the group; border points must not pull in their neighbors.
            if (isCore(neighbors, model.minSamples)) frontier ++= neighbors
          }
        }
      }
    }

    /** A point is a core point when it and its neighbors amount to at least `minSamples` points. */
    private def isCore(neighbors: Set[Int], minSamples: Int): Boolean =
      neighbors.size + 1 >= minSamples // + 1 because findNeighbors excludes the point itself

    /** Indices of all rows other than `pointId` that lie within `eps` of row `pointId`. */
    private def findNeighbors(pointId: Int, x: Features, eps: Double): Set[Int] = {
      val point = x(pointId, ::).t
      (0 until x.rows).filter { i =>
        i != pointId && euclideanDistance(x(i, ::).t, point) <= eps
      }.toSet
    }
  }
}
35 changes: 35 additions & 0 deletions src/main/scala/io/picnicml/doddlemodel/typeclasses/Clusterer.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package io.picnicml.doddlemodel.typeclasses

import io.picnicml.doddlemodel.data.Features

/** Type class for estimators that are fitted on features only and expose integer labels. */
trait Clusterer[A] extends Estimator[A] {

  /** Fits a copy of `model` on `x` and returns the fitted copy.
    *
    * Fails with an IllegalArgumentException when `model` is already fitted.
    */
  def fit(model: A, x: Features): A = {
    require(!isFitted(model), "Called fit on a model that is already fitted")
    val unfitted = copy(model)
    fitSafe(unfitted, x)
  }

  /** Fits a copy of `model` on `x` and returns the resulting labels.
    *
    * Fails with an IllegalArgumentException when `model` is already fitted.
    */
  def fitPredict(model: A, x: Features): Array[Int] = {
    require(!isFitted(model), "Called fit on a model that is already fitted")
    val unfitted = copy(model)
    fitPredictSafe(unfitted, x)
  }

  /** Returns the labels stored in a fitted `model`.
    *
    * Fails with an IllegalArgumentException when `model` is not fitted yet.
    */
  def label(model: A): Array[Int] = {
    require(isFitted(model), "Request label on a model that is not fitted yet")
    labelSafe(model)
  }

  /** Creates an identical clusterer. */
  protected def copy(model: A): A

  /** Creates a copy of the clusterer that carries the given labels. */
  protected def copy(model: A, label: Array[Int]): A

  /** Extracts the labels; only ever called on a fitted model. */
  protected def labelSafe(model: A): Array[Int]

  /** Fits the model; only ever called with a model that is not fitted yet. */
  protected def fitSafe(model: A, x: Features): A

  /** Fits the model and returns its labels; only ever called with a model that is not fitted yet. */
  protected def fitPredictSafe(model: A, x: Features): Array[Int]
}
50 changes: 50 additions & 0 deletions src/test/scala/io/picnicml/doddlemodel/cluster/DBSCANTest.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package io.picnicml.doddlemodel.cluster

import breeze.linalg.DenseMatrix
import io.picnicml.doddlemodel.TestingUtils
import io.picnicml.doddlemodel.cluster.DBSCAN.ev
import org.scalactic.{Equality, TolerantNumerics}
import org.scalatest.{FlatSpec, Matchers}

/** Behavioural tests for the DBSCAN clusterer on two well separated groups of three points. */
class DBSCANTest extends FlatSpec with Matchers with TestingUtils {

  implicit val doubleTolerance: Equality[Double] = TolerantNumerics.tolerantDoubleEquality(1e-4)

  // Rows 0-2 lie around (1, 1), rows 3-5 around (8, 1).
  val x = DenseMatrix(
    (1.0, 1.0),
    (0.0, 2.0),
    (2.0, 0.0),
    (8.0, 1.0),
    (7.0, 2.0),
    (9.0, 0.0)
  )

  "DBSCAN" should "calculate the label of the data points" in {
    val clusterer = DBSCAN(eps = 3.0, minSamples = 1)
    val expected = Array(0, 0, 0, 1, 1, 1)
    ev.fitPredict(clusterer, x) shouldEqual expected
    ev.label(ev.fit(clusterer, x)) shouldEqual expected
  }

  it should "cluster one data point to one group when eps is too small" in {
    ev.fitPredict(DBSCAN(), x) shouldEqual Array(0, 1, 2, 3, 4, 5)
  }

  it should "cluster all data points to one group when eps is too large" in {
    ev.fitPredict(DBSCAN(eps = 10.0), x) shouldEqual Array(0, 0, 0, 0, 0, 0)
  }

  it should "cluster all points to outliers when min samples is too large" in {
    ev.fitPredict(DBSCAN(minSamples = 7), x) shouldEqual Array(-1, -1, -1, -1, -1, -1)
  }

  it should "cluster all data points to one group when eps is equal to the distance among points" in {
    // The two points are exactly eps apart, so the neighborhood test has to be inclusive.
    val twoPoints = DenseMatrix((0.0, 0.0), (3.0, 0.0))
    ev.fitPredict(DBSCAN(eps = 3.0), twoPoints) shouldEqual Array(0, 0)
  }

  it should "prevent the usage of negative eps" in {
    an[IllegalArgumentException] should be thrownBy DBSCAN(eps = -0.5)
  }

  it should "prevent the usage of negative min samples" in {
    an[IllegalArgumentException] should be thrownBy DBSCAN(minSamples = -1)
  }
}