Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for generic distance calculation delegate #9

Open
wants to merge 16 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion UMAP/DistanceCalculation.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
namespace UMAP
{
public delegate float DistanceCalculation(float[] x, float[] y);
public delegate float DistanceCalculation<T>(T x, T y) where T : IUmapDataPoint;
}
13 changes: 13 additions & 0 deletions UMAP/IUmapDataPoint.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
namespace UMAP
{
/// <summary>
/// Represents a single data point to be processed by <see cref="Umap{T}"/>.
/// </summary>
public interface IUmapDataPoint
{
/// <summary>
/// The data being operated on.
/// </summary>
float[] Data { get; }
}
}
7 changes: 4 additions & 3 deletions UMAP/NNDescent.cs
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
using System;
using System.Collections.Generic;
using static UMAP.Heaps;

namespace UMAP
{
internal static class NNDescent
internal static class NNDescent<T> where T : IUmapDataPoint
{
public delegate (int[][] indices, float[][] weights) NNDescentFn(
float[][] data,
T[] data,
int[][] leafArray,
int nNeighbors,
int nIters = 10,
Expand All @@ -20,7 +21,7 @@ public delegate (int[][] indices, float[][] weights) NNDescentFn(
/// <summary>
/// Create a version of nearest neighbor descent.
/// </summary>
public static NNDescentFn MakeNNDescent(DistanceCalculation distanceFn, IProvideRandomValues random)
public static NNDescentFn MakeNNDescent(DistanceCalculation<T> distanceFn, IProvideRandomValues random)
{
return (data, leafArray, nNeighbors, nIters, maxCandidates, delta, rho, rpTreeInit, startingIteration) =>
{
Expand Down
31 changes: 31 additions & 0 deletions UMAP/RawVectorArrayUmapDataPoint.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
using System;
using System.Collections.Generic;
using System.Text;

namespace UMAP
{
/// <inheritdoc/>
public class RawVectorArrayUmapDataPoint : IUmapDataPoint
{

/// <inheritdoc/>
public RawVectorArrayUmapDataPoint(float[] data)
{
Data = data;
}


/// <inheritdoc/>
public float[] Data { get; }

/// <summary>
/// Define an implicit conversion operator from <see cref="float[]"/>.
/// </summary>
public static implicit operator RawVectorArrayUmapDataPoint(float[] data) => new RawVectorArrayUmapDataPoint(data);

/// <summary>
/// Implicit conversation back to <see cref="float[]"/>.
/// </summary>
public static implicit operator float[](RawVectorArrayUmapDataPoint x) => x.Data;
}
}
3 changes: 2 additions & 1 deletion UMAP/SIMD.cs
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
using System;
using System.Linq;
using System.Numerics;
using System.Runtime.CompilerServices;

namespace UMAP
{
internal static class SIMD
internal static class SIMD<T>
{
private static readonly int _vs1 = Vector<float>.Count;
private static readonly int _vs2 = 2 * Vector<float>.Count;
Expand Down
6 changes: 3 additions & 3 deletions UMAP/SIMDInt.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

namespace UMAP
{
internal static class SIMDint
internal static class SIMDint<T>
{
private static readonly int _vs1 = Vector<int>.Count;
private static readonly int _vs2 = 2 * Vector<int>.Count;
Expand Down Expand Up @@ -70,8 +70,8 @@ public static void Uniform(ref float[] data, float a, IProvideRandomValues rando
float a2 = 2 * a;
float an = -a;
random.NextFloats(data);
SIMD.Multiply(ref data, a2);
SIMD.Add(ref data, an);
SIMD<T>.Multiply(ref data, a2);
SIMD<T>.Add(ref data, an);
}
}
}
17 changes: 9 additions & 8 deletions UMAP/Tree.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,18 @@

namespace UMAP
{
internal static class Tree
internal static class Tree<T> where T : IUmapDataPoint
{
/// <summary>
/// Construct a random projection tree based on ``data`` with leaves of size at most ``leafSize``
/// </summary>
public static RandomProjectionTreeNode MakeTree(float[][] data, int leafSize, int n, IProvideRandomValues random)
public static RandomProjectionTreeNode MakeTree(T[] data, int leafSize, int n, IProvideRandomValues random)
{
var indices = Enumerable.Range(0, data.Length).ToArray();
return MakeEuclideanTree(data, indices, leafSize, n, random);
}

private static RandomProjectionTreeNode MakeEuclideanTree(float[][] data, int[] indices, int leafSize, int q, IProvideRandomValues random)
private static RandomProjectionTreeNode MakeEuclideanTree(T[] data, int[] indices, int leafSize, int q, IProvideRandomValues random)
{
if (indices.Length > leafSize)
{
Expand Down Expand Up @@ -50,9 +50,10 @@ public static FlatTree FlattenTree(RandomProjectionTreeNode tree, int leafSize)
/// the basis for a random projection tree, which simply uses this splitting recursively. This particular split uses euclidean distance to determine the hyperplane and which side each data
/// sample falls on.
/// </summary>
private static (int[] indicesLeft, int[] indicesRight, float[] hyperplaneVector, float hyperplaneOffset) EuclideanRandomProjectionSplit(float[][] data, int[] indices, IProvideRandomValues random)
private static (int[] indicesLeft, int[] indicesRight, float[] hyperplaneVector, float hyperplaneOffset) EuclideanRandomProjectionSplit(T[] data, int[] indices, IProvideRandomValues random)
{
var dim = data[0].Length;
var vectorData = data.Select(x => x.Data).ToArray();
var dim = vectorData[0].Length;

// Select two random points, set the hyperplane between them
var leftIndex = random.Next(0, indices.Length);
Expand All @@ -67,8 +68,8 @@ private static (int[] indicesLeft, int[] indicesRight, float[] hyperplaneVector,
var hyperplaneVector = new float[dim];
for (var i = 0; i < hyperplaneVector.Length; i++)
{
hyperplaneVector[i] = data[left][i] - data[right][i];
hyperplaneOffset -= (hyperplaneVector[i] * (data[left][i] + data[right][i])) / 2;
hyperplaneVector[i] = vectorData[left][i] - vectorData[right][i];
hyperplaneOffset -= (hyperplaneVector[i] * (vectorData[left][i] + vectorData[right][i])) / 2;
}

// For each point compute the margin (project into normal vector)
Expand All @@ -81,7 +82,7 @@ private static (int[] indicesLeft, int[] indicesRight, float[] hyperplaneVector,
var margin = hyperplaneOffset;
for (var d = 0; d < dim; d++)
{
margin += hyperplaneVector[d] * data[indices[i]][d];
margin += hyperplaneVector[d] * vectorData[indices[i]][d];
}

if (margin == 0)
Expand Down
108 changes: 70 additions & 38 deletions UMAP/Umap.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,10 @@
using System.Linq;
using System.Runtime.CompilerServices;
using System.Threading.Tasks;
using static UMAP.NNDescent;
using static UMAP.Tree;

namespace UMAP
{
public sealed class Umap
public class Umap<T> where T : IUmapDataPoint
{
private const float SMOOTH_K_TOLERANCE = 1e-5f;
private const float MIN_K_DIST_SCALE = 1e-3f;
Expand All @@ -21,7 +19,7 @@ public sealed class Umap
private readonly float _setOpMixRatio = 1;
private readonly float _spread = 1;

private readonly DistanceCalculation _distanceFn;
private readonly DistanceCalculation<T> _distanceFn;
private readonly IProvideRandomValues _random;
private readonly int _nNeighbors;
private readonly int? _customNumberOfEpochs;
Expand All @@ -33,9 +31,9 @@ public sealed class Umap

// Internal graph connectivity representation
private SparseMatrix _graph = null;
private float[][] _x = null;
private T[] _x = null;
private bool _isInitialized = false;
private FlatTree[] _rpForest = new FlatTree[0];
private Tree<T>.FlatTree[] _rpForest = new Tree<T>.FlatTree[0];

// Projected embedding
private float[] _embedding;
Expand All @@ -47,7 +45,7 @@ public sealed class Umap
public delegate void ProgressReporter(float progress);

public Umap(
DistanceCalculation distance = null,
DistanceCalculation<T> distance = null,
IProvideRandomValues random = null,
int dimensions = 2,
int numberOfNeighbors = 15,
Expand All @@ -71,7 +69,7 @@ public Umap(
/// Initializes fit by computing KNN and a fuzzy simplicial set, as well as initializing the projected embeddings. Sets the optimization state ahead of optimization steps.
/// Returns the number of epochs to be used for the SGD optimization.
/// </summary>
public int InitializeFit(float[][] x)
public int InitializeFit(T[] x)
{
// We don't need to reinitialize if we've already initialized for this data
if ((_x == x) && _isInitialized)
Expand Down Expand Up @@ -151,9 +149,9 @@ private int GetNEpochs()
/// <summary>
/// Compute the ``nNeighbors`` nearest points for each data point in ``X`` - this may be exact, but more likely is approximated via nearest neighbor descent.
/// </summary>
internal (int[][] knnIndices, float[][] knnDistances) NearestNeighbors(float[][] x, ProgressReporter progressReporter)
internal (int[][] knnIndices, float[][] knnDistances) NearestNeighbors(T[] x, ProgressReporter progressReporter)
{
var metricNNDescent = MakeNNDescent(_distanceFn, _random);
var metricNNDescent = NNDescent<T>.MakeNNDescent(_distanceFn, _random);
progressReporter(0.05f);
var nTrees = 5 + Round(Math.Sqrt(x.Length) / 20);
var nIters = Math.Max(5, (int)Math.Floor(Math.Round(Math.Log(x.Length, 2))));
Expand All @@ -164,12 +162,13 @@ private int GetNEpochs()
.Select(i =>
{
forestProgressReporter((float)i / nTrees);
return FlattenTree(MakeTree(x, leafSize, i, _random), leafSize);
return Tree<T>.FlattenTree(Tree<T>.MakeTree(x, leafSize, i, _random), leafSize);
})
.ToArray();
var leafArray = MakeLeafArray(_rpForest);
var leafArray = Tree<T>.MakeLeafArray(_rpForest);
progressReporter(0.45f);
var nnDescendProgressReporter = ScaleProgressReporter(progressReporter, 0.5f, 1);

return metricNNDescent(x, leafArray, _nNeighbors, nIters, startingIteration: (i, max) => nnDescendProgressReporter((float)i / max));

// Handle python3 rounding down from 0.5 discrpancy
Expand All @@ -181,7 +180,7 @@ private int GetNEpochs()
/// to the data. This is done by locally approximating geodesic distance at each point, creating a fuzzy simplicial set for each such point, and then combining all the local fuzzy
/// simplicial sets into a global one via a fuzzy union.
/// </summary>
private SparseMatrix FuzzySimplicialSet(float[][] x, int nNeighbors, float setOpMixRatio, ProgressReporter progressReporter)
private SparseMatrix FuzzySimplicialSet(T[] x, int nNeighbors, float setOpMixRatio, ProgressReporter progressReporter)
{
var knnIndices = _knnIndices ?? new int[0][];
var knnDistances = _knnDistances ?? new float[0][];
Expand Down Expand Up @@ -363,7 +362,7 @@ private static (int[] rows, int[] cols, float[] vals) ComputeMembershipStrengths
// We're not computing the spectral initialization in this implementation until we determine a better eigenvalue/eigenvector computation approach

_embedding = new float[graph.Dims.rows * _optimizationState.Dim];
SIMDint.Uniform(ref _embedding, 10, _random);
SIMDint<T>.Uniform(ref _embedding, 10, _random);

// Get graph data in ordered way...
var weights = new List<float>();
Expand All @@ -382,15 +381,15 @@ private static (int[] rows, int[] cols, float[] vals) ComputeMembershipStrengths
return (head.ToArray(), tail.ToArray(), MakeEpochsPerSample(weights.ToArray(), nEpochs));
}

private void ShuffleTogether<T, T2, T3>(List<T> list, List<T2> other, List<T3> weights)
private void ShuffleTogether<T1, T2, T3>(List<T1> list, List<T2> other, List<T3> weights)
{
int n = list.Count;
if (other.Count != n) { throw new Exception(); }
while (n > 1)
{
n--;
int k = _random.Next(0, n + 1);
T value = list[k];
T1 value = list[k];
list[k] = list[n];
list[n] = value;

Expand Down Expand Up @@ -630,43 +629,76 @@ private static ProgressReporter ScaleProgressReporter(ProgressReporter progressR

public static class DistanceFunctions
{
public static float Cosine(float[] lhs, float[] rhs)
public static float Cosine(T lhs, T rhs)
{
return 1 - (SIMD.DotProduct(ref lhs, ref rhs) / (SIMD.Magnitude(ref lhs) * SIMD.Magnitude(ref rhs)));
var lhsVal = lhs.Data;
var rhsVal = rhs.Data;
return 1 - (SIMD<T>.DotProduct(ref lhsVal, ref rhsVal) / (SIMD<T>.Magnitude(ref lhsVal) * SIMD<T>.Magnitude(ref rhsVal)));
}

public static float CosineForNormalizedVectors(float[] lhs, float[] rhs)
public static float CosineForNormalizedVectors(T lhs, T rhs)
{
return 1 - SIMD.DotProduct(ref lhs, ref rhs);
var lhsVal = lhs.Data;
var rhsVal = rhs.Data;
return 1 - SIMD<T>.DotProduct(ref lhsVal, ref rhsVal);
}

public static float Euclidean(float[] lhs, float[] rhs)
public static float Euclidean(T lhs, T rhs)
{
return (float)Math.Sqrt(SIMD.Euclidean(ref lhs, ref rhs)); // TODO: Replace with netcore3 MathF class when the framework is available
var lhsVal = lhs.Data;
var rhsVal = rhs.Data;
return (float)Math.Sqrt(SIMD<T>.Euclidean(ref lhsVal, ref rhsVal)); // TODO: Replace with netcore3 MathF class when the framework is available
}
}

private sealed class OptimizationState
{
public int CurrentEpoch = 0;
public int[] Head = new int[0];
public int[] Tail = new int[0];
public float[] EpochsPerSample = new float[0];
public float[] EpochOfNextSample = new float[0];
public float[] EpochOfNextNegativeSample= new float[0];
public float[] EpochsPerNegativeSample = new float[0];
public bool MoveOther = true;
public float InitialAlpha = 1;
public float Alpha = 1;
public float Gamma = 1;
public float A = 1.5769434603113077f;
public float B = 0.8950608779109733f;
public int Dim = 2;
public int NEpochs = 500;
public int NVertices = 0;
public int CurrentEpoch = 0;
public int[] Head = new int[0];
public int[] Tail = new int[0];
public float[] EpochsPerSample = new float[0];
public float[] EpochOfNextSample = new float[0];
public float[] EpochOfNextNegativeSample = new float[0];
public float[] EpochsPerNegativeSample = new float[0];
public bool MoveOther = true;
public float InitialAlpha = 1;
public float Alpha = 1;
public float Gamma = 1;
public float A = 1.5769434603113077f;
public float B = 0.8950608779109733f;
public int Dim = 2;
public int NEpochs = 500;
public int NVertices = 0;

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public float GetDistanceFactor(float distSquared) => 1f / ((0.001f + distSquared) * (float)(A * Math.Pow(distSquared, B) + 1));
}
}

/// <inheritdoc cref="Umap{T}"/>
public class Umap : Umap<RawVectorArrayUmapDataPoint>
{
/// <inheritdoc cref="Umap{T}"/>
public Umap(
DistanceCalculation<RawVectorArrayUmapDataPoint> distance = null,
IProvideRandomValues random = null,
int dimensions = 2,
int numberOfNeighbors = 15,
int? customNumberOfEpochs = null,
ProgressReporter progressReporter = null)
: base(distance, random, dimensions, numberOfNeighbors, customNumberOfEpochs, progressReporter)
{

}

/// <inheritdoc cref="Umap{T}.NearestNeighbors(T[], Umap{T}.ProgressReporter)"/>
public (int[][] knnIndices, float[][] knnDistances) NearestNeighbors(float[][] x, ProgressReporter progressReporter)
{
return base.NearestNeighbors(x.Select(c => new RawVectorArrayUmapDataPoint(c)).ToArray(), progressReporter);
}

/// <inheritdoc cref="Umap{T}.InitializeFit(T[])"/>
public int InitializeFit(float[][] a) => base.InitializeFit(a.Select(x => new RawVectorArrayUmapDataPoint(x)).ToArray());

}
}