Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clustering - DBSCAN #86

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
89 changes: 89 additions & 0 deletions src/main/scala/io/picnicml/doddlemodel/cluster/DBSCAN.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package io.picnicml.doddlemodel.cluster

import breeze.linalg.functions.euclideanDistance
import cats.syntax.option._
import io.picnicml.doddlemodel.data.Features
import io.picnicml.doddlemodel.typeclasses.Clusterer

/** An immutable DBSCAN model.
 *
 * Instances are created through the companion `apply`, which validates the hyperparameters and
 * leaves `labels` empty; fitting produces a new instance that carries the computed labels.
 *
 * Examples:
 * {{{
 * val model = DBSCAN()
 * val model = DBSCAN(eps = 1.5)
 * val model = DBSCAN(minSamples = 3)
 * val model = DBSCAN(eps = 2.0, minSamples = 3)
 * }}}
 *
 * @param eps the maximum Euclidean distance at which two points are treated as neighbors
 * @param minSamples the minimum number of points in a neighborhood, the point itself included,
 *                   for that point to start or extend a group
 * @param labels the group label of every fitted row (-1 marks noise); empty until the model is fitted
 */
case class DBSCAN private(eps: Double, minSamples: Int, private val labels: Option[Array[Int]])

/** Companion of the DBSCAN model: validated construction, label sentinels and the clusterer instance. */
object DBSCAN {

  /** Label given to points that do not belong to any group. */
  val NOISE: Int = -1

  /** Marker for points that were not visited yet; it is never present in the labels of a fitted model. */
  val UNASSIGNED: Int = Int.MaxValue

  /** Creates an unfitted model.
   *
   * @param eps the maximum distance at which two points are treated as neighbors
   * @param minSamples the minimum neighborhood size (the point itself included) of a core point
   * @throws IllegalArgumentException if `eps` or `minSamples` is not larger than 0
   */
  def apply(eps: Double = 1.0, minSamples: Int = 1): DBSCAN = {
    require(eps > 0.0, "Maximum distance needs to be larger than 0")
    require(minSamples > 0, "Minimum number of samples needs to be larger than 0")
    DBSCAN(eps, minSamples, none)
  }

  /** The [[Clusterer]] type class instance for DBSCAN. */
  implicit lazy val ev: Clusterer[DBSCAN] = new Clusterer[DBSCAN] {

    override def isFitted(model: DBSCAN): Boolean = model.labels.isDefined

    /** Returns a copy of the fitted labels so that callers cannot mutate the state of the model. */
    override protected def labelSafe(model: DBSCAN): Array[Int] =
      model.labels
        .map(_.clone())
        .getOrElse(throw new IllegalStateException("Labels requested from a model that is not fitted"))

    override protected def copy(model: DBSCAN): DBSCAN =
      model.copy()

    /** Labels every row of `x` with a group id (0, 1, ...) or with [[NOISE]].
     *
     * Every neighborhood is queried exactly once (right before its point is labeled), hence
     * distances are computed on demand instead of being cached in a quadratic lookup table.
     */
    override protected def fitSafe(model: DBSCAN, x: Features): DBSCAN = {
      val xSize = x.rows
      val labels = Array.fill[Int](xSize)(UNASSIGNED)
      var groupId = 0
      // the guard is evaluated lazily, so points labeled during an expansion are skipped
      for (pointId <- 0 until xSize if labels(pointId) == UNASSIGNED) {
        val seedNeighbors = findNeighbors(pointId, x, model.eps)
        if (isCore(model, seedNeighbors)) {
          labels(pointId) = groupId
          expandGroup(model, x, labels, seedNeighbors, groupId)
          groupId += 1
        } else {
          // provisional label: the point may still become a border point of a later group
          labels(pointId) = NOISE
        }
      }
      model.copy(labels = labels.some)
    }

    /** A point is a core point if its neighborhood, the point itself included, is large enough. */
    private def isCore(model: DBSCAN, neighbors: Set[Int]): Boolean =
      neighbors.size + 1 >= model.minSamples

    /** Spreads `groupId` breadth-first from `seeds` to every point that is density-reachable. */
    private def expandGroup(
      model: DBSCAN,
      x: Features,
      labels: Array[Int],
      seeds: Set[Int],
      groupId: Int
    ): Unit = {
      var frontier = seeds
      while (frontier.nonEmpty) {
        frontier = frontier.foldLeft(Set.empty[Int]) { (nextFrontier, i) =>
          if (labels(i) == NOISE) {
            // a former noise point within reach of a core point is a border point: claim it, do not expand
            labels(i) = groupId
            nextFrontier
          } else if (labels(i) == UNASSIGNED) {
            labels(i) = groupId
            val neighbors = findNeighbors(i, x, model.eps)
            // only core points propagate the group further
            if (isCore(model, neighbors)) nextFrontier ++ neighbors else nextFrontier
          } else {
            // already a member of some group
            nextFrontier
          }
        }
      }
    }

    /** Indices of all rows of `x`, other than `pointId`, that lie within `eps` of row `pointId`. */
    private def findNeighbors(pointId: Int, x: Features, eps: Double): Set[Int] = {
      val point = x(pointId, ::).t
      (0 until x.rows).filter { i =>
        i != pointId && euclideanDistance(point, x(i, ::).t) <= eps
      }.toSet
    }
  }
}
33 changes: 33 additions & 0 deletions src/main/scala/io/picnicml/doddlemodel/typeclasses/Clusterer.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package io.picnicml.doddlemodel.typeclasses

import io.picnicml.doddlemodel.data.Features

/** A type class for estimators that assign a cluster label to every row of a dataset.
 *
 * @tparam A the type of the clustering model
 */
trait Clusterer[A] extends Estimator[A] {

  /** Fits a copy of an unfitted model to the data.
   *
   * @param model the unfitted model
   * @param x the features to cluster
   * @return the fitted model
   * @throws IllegalArgumentException if the model is already fitted
   */
  def fit(model: A, x: Features): A = {
    require(!isFitted(model), "Called fit on a model that is already fitted")
    fitSafe(copy(model), x)
  }

  /** Fits a copy of an unfitted model to the data and returns the resulting labels.
   *
   * @param model the unfitted model
   * @param x the features to cluster
   * @return the label assigned to every row of `x`
   * @throws IllegalArgumentException if the model is already fitted
   */
  def fitPredict(model: A, x: Features): Array[Int] =
    // delegates to fit so that the fitted-model guard and the copy semantics live in one place
    labelSafe(fit(model, x))

  /** Returns the labels computed when the model was fitted.
   *
   * @param model the fitted model
   * @throws IllegalArgumentException if the model is not fitted yet
   */
  def labels(model: A): Array[Int] = {
    require(isFitted(model), "Request labels on a model that is not fitted yet")
    labelSafe(model)
  }

  /** A function that creates an identical clusterer. */
  protected def copy(model: A): A

  /** A function that is guaranteed to be called on a fitted model. */
  protected def labelSafe(model: A): Array[Int]

  /** A function that is guaranteed to be called on a copy of a model that is not fitted yet. */
  protected def fitSafe(model: A, x: Features): A
}
71 changes: 71 additions & 0 deletions src/test/scala/io/picnicml/doddlemodel/cluster/DBSCANTest.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package io.picnicml.doddlemodel.cluster

import breeze.linalg.DenseMatrix
import io.picnicml.doddlemodel.TestingUtils
import io.picnicml.doddlemodel.cluster.DBSCAN.ev
import org.scalactic.{Equality, TolerantNumerics}
import org.scalatest.{FlatSpec, Matchers}

/** Checks the DBSCAN clusterer on small datasets whose expected labels can be verified by hand. */
class DBSCANTest extends FlatSpec with Matchers with TestingUtils {

  implicit val doubleTolerance: Equality[Double] = TolerantNumerics.tolerantDoubleEquality(1e-4)

  // two triplets of points that lie far apart: rows 0-2 and rows 3-5
  private val x = DenseMatrix(
    List(1.0, 1.0),
    List(0.0, 2.0),
    List(2.0, 0.0),
    List(8.0, 1.0),
    List(7.0, 2.0),
    List(9.0, 0.0)
  )

  "DBSCAN" should "calculate the label of the data points" in {
    val expectedLabels = Array(0, 0, 0, 1, 1, 1)
    val unfitted = DBSCAN(eps = 3.0, minSamples = 1)
    // both entry points have to agree on the result
    ev.fitPredict(unfitted, x) shouldEqual expectedLabels
    ev.labels(ev.fit(unfitted, x)) shouldEqual expectedLabels
  }

  it should "cluster one data point to one group when eps is too small" in {
    // with the default hyperparameters no two rows of x are neighbors
    val everyPointAlone = Array(0, 1, 2, 3, 4, 5)
    ev.fitPredict(DBSCAN(), x) shouldEqual everyPointAlone
  }

  it should "cluster all data points to one group when eps is too large" in {
    val singleGroup = Array(0, 0, 0, 0, 0, 0)
    ev.fitPredict(DBSCAN(eps = 10.0), x) shouldEqual singleGroup
  }

  it should "cluster all points to outliers when min samples is too large" in {
    // the dataset only has six rows, so no neighborhood can reach seven points
    val allNoise = Array(-1, -1, -1, -1, -1, -1)
    ev.fitPredict(DBSCAN(minSamples = 7), x) shouldEqual allNoise
  }

  it should "cluster all data points to one group when eps is equal to the distance among points" in {
    // the two rows are exactly eps apart
    val pair = DenseMatrix(
      List(0.0, 0.0),
      List(3.0, 0.0)
    )
    ev.fitPredict(DBSCAN(eps = 3.0), pair) shouldEqual Array(0, 0)
  }

  it should "cluster all data points to one group in an 1D array of points that match min sample size" in {
    // evenly spaced points on a line, consecutive rows are exactly eps apart
    val line = DenseMatrix(
      List(0.0, 12.0),
      List(0.0, 9.0),
      List(0.0, 6.0),
      List(0.0, 3.0),
      List(0.0, 0.0)
    )
    val chained = DBSCAN(eps = 3.0, minSamples = 3)
    ev.fitPredict(chained, line) shouldEqual Array(0, 0, 0, 0, 0)
  }

  it should "prevent the usage of negative eps" in {
    an [IllegalArgumentException] should be thrownBy {
      DBSCAN(eps = -0.5)
    }
  }

  it should "prevent the usage of negative min samples" in {
    an [IllegalArgumentException] should be thrownBy {
      DBSCAN(minSamples = -1)
    }
  }
}