Skip to content

Commit

Permalink
Merge pull request #1 from Arlodotexe/master
Browse files Browse the repository at this point in the history
Implemented IUmapDataPoint, cleanup IUmapDistanceParameter.
  • Loading branch information
amaid authored Sep 27, 2023
2 parents b6a1ecc + 99767f1 commit a3eb8e3
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 70 deletions.
2 changes: 1 addition & 1 deletion UMAP/DistanceCalculation.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
namespace UMAP
{
public delegate float DistanceCalculation<T>(IUmapDistanceParameter<T>[] x, IUmapDistanceParameter<T>[] y);
public delegate float DistanceCalculation<T>(T x, T y) where T : IUmapDataPoint;
}
13 changes: 13 additions & 0 deletions UMAP/IUmapDataPoint.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
namespace UMAP
{
/// <summary>
/// Represents a single data point to be processed by <see cref="Umap{T}"/>.
/// </summary>
/// <remarks>
/// The type argument of <see cref="Umap{T}"/> is constrained to this interface, so any type
/// passed to it must implement <see cref="IUmapDataPoint"/>.
/// </remarks>
public interface IUmapDataPoint
{
/// <summary>
/// The data being operated on.
/// </summary>
/// <remarks>
/// The built-in distance functions and the random projection tree read this array as the
/// point's numeric vector. The tree takes its dimension from the first point's array length,
/// so all points are presumably expected to expose arrays of equal length (TODO confirm).
/// </remarks>
float[] Data { get; }
}
}
11 changes: 0 additions & 11 deletions UMAP/IUmapDistance.cs

This file was deleted.

4 changes: 2 additions & 2 deletions UMAP/NNDescent.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@

namespace UMAP
{
internal static class NNDescent<T>
internal static class NNDescent<T> where T : IUmapDataPoint
{
public delegate (int[][] indices, float[][] weights) NNDescentFn(
IUmapDistanceParameter<T>[][] data,
T[] data,
int[][] leafArray,
int nNeighbors,
int nIters = 10,
Expand Down
23 changes: 10 additions & 13 deletions UMAP/SIMD.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ internal static class SIMD<T>
private static readonly int _vs4 = 4 * Vector<float>.Count;

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static float Magnitude(ref IUmapDistanceParameter<T>[] vec) => (float)Math.Sqrt(DotProduct(ref vec, ref vec));
public static float Magnitude(ref float[] vec) => (float)Math.Sqrt(DotProduct(ref vec, ref vec));

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static float Euclidean(ref float[] lhs, ref float[] rhs)
Expand Down Expand Up @@ -179,20 +179,17 @@ public static void Multiply(ref float[] lhs, float f)
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static float DotProduct(ref IUmapDistanceParameter<T>[] lhs, ref IUmapDistanceParameter<T>[] rhs)
public static float DotProduct(ref float[] lhs, ref float[] rhs)
{

var lhsArray = lhs.Select(x => x.EmbeddingVectorValue).ToArray();
var rhsArray = rhs.Select(x=>x.EmbeddingVectorValue).ToArray();
var result = 0f;
var count = lhs.Length;
var offset = 0;
while (count >= _vs4)
{
result += Vector.Dot(new Vector<float>(lhsArray, offset), new Vector<float>(rhsArray, offset));
result += Vector.Dot(new Vector<float>(lhsArray, offset + _vs1), new Vector<float>(rhsArray, offset + _vs1));
result += Vector.Dot(new Vector<float>(lhsArray, offset + _vs2), new Vector<float>(rhsArray, offset + _vs2));
result += Vector.Dot(new Vector<float>(lhsArray, offset + _vs3), new Vector<float>(rhsArray, offset + _vs3));
result += Vector.Dot(new Vector<float>(lhs, offset), new Vector<float>(rhs, offset));
result += Vector.Dot(new Vector<float>(lhs, offset + _vs1), new Vector<float>(rhs, offset + _vs1));
result += Vector.Dot(new Vector<float>(lhs, offset + _vs2), new Vector<float>(rhs, offset + _vs2));
result += Vector.Dot(new Vector<float>(lhs, offset + _vs3), new Vector<float>(rhs, offset + _vs3));
if (count == _vs4)
{
return result;
Expand All @@ -203,8 +200,8 @@ public static float DotProduct(ref IUmapDistanceParameter<T>[] lhs, ref IUmapDis
}
if (count >= _vs2)
{
result += Vector.Dot(new Vector<float>(lhsArray, offset), new Vector<float>(rhsArray, offset));
result += Vector.Dot(new Vector<float>(lhsArray, offset + _vs1), new Vector<float>(rhsArray, offset + _vs1));
result += Vector.Dot(new Vector<float>(lhs, offset), new Vector<float>(rhs, offset));
result += Vector.Dot(new Vector<float>(lhs, offset + _vs1), new Vector<float>(rhs, offset + _vs1));
if (count == _vs2)
{
return result;
Expand All @@ -215,7 +212,7 @@ public static float DotProduct(ref IUmapDistanceParameter<T>[] lhs, ref IUmapDis
}
if (count >= _vs1)
{
result += Vector.Dot(new Vector<float>(lhsArray, offset), new Vector<float>(rhsArray, offset));
result += Vector.Dot(new Vector<float>(lhs, offset), new Vector<float>(rhs, offset));
if (count == _vs1)
{
return result;
Expand All @@ -228,7 +225,7 @@ public static float DotProduct(ref IUmapDistanceParameter<T>[] lhs, ref IUmapDis
{
while (count > 0)
{
result += lhsArray[offset] * rhsArray[offset];
result += lhs[offset] * rhs[offset];
offset++; count--;
}
}
Expand Down
19 changes: 9 additions & 10 deletions UMAP/Tree.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,18 @@

namespace UMAP
{
internal static class Tree<T>
internal static class Tree<T> where T : IUmapDataPoint
{
/// <summary>
/// Construct a random projection tree based on ``data`` with leaves of size at most ``leafSize``
/// </summary>
public static RandomProjectionTreeNode MakeTree(IUmapDistanceParameter<T>[][] data, int leafSize, int n, IProvideRandomValues random)
public static RandomProjectionTreeNode MakeTree(T[] data, int leafSize, int n, IProvideRandomValues random)
{
var indices = Enumerable.Range(0, data.Length).ToArray();
return MakeEuclideanTree(data, indices, leafSize, n, random);
}

private static RandomProjectionTreeNode MakeEuclideanTree(IUmapDistanceParameter<T>[][] data, int[] indices, int leafSize, int q, IProvideRandomValues random)
private static RandomProjectionTreeNode MakeEuclideanTree(T[] data, int[] indices, int leafSize, int q, IProvideRandomValues random)
{
if (indices.Length > leafSize)
{
Expand Down Expand Up @@ -50,9 +50,10 @@ public static FlatTree FlattenTree(RandomProjectionTreeNode tree, int leafSize)
/// the basis for a random projection tree, which simply uses this splitting recursively. This particular split uses euclidean distance to determine the hyperplane and which side each data
/// sample falls on.
/// </summary>
private static (int[] indicesLeft, int[] indicesRight, float[] hyperplaneVector, float hyperplaneOffset) EuclideanRandomProjectionSplit(IUmapDistanceParameter<T>[][] data, int[] indices, IProvideRandomValues random)
private static (int[] indicesLeft, int[] indicesRight, float[] hyperplaneVector, float hyperplaneOffset) EuclideanRandomProjectionSplit(T[] data, int[] indices, IProvideRandomValues random)
{
var dim = data[0].Length;
var vectorData = data.Select(x => x.Data).ToArray();
var dim = vectorData[0].Length;

// Select two random points, set the hyperplane between them
var leftIndex = random.Next(0, indices.Length);
Expand All @@ -67,10 +68,8 @@ private static (int[] indicesLeft, int[] indicesRight, float[] hyperplaneVector,
var hyperplaneVector = new float[dim];
for (var i = 0; i < hyperplaneVector.Length; i++)
{
var leftVectorValue = data[left][i].EmbeddingVectorValue;
var rightVectorValue = data[right][i].EmbeddingVectorValue;
hyperplaneVector[i] = leftVectorValue - rightVectorValue;
hyperplaneOffset -= (hyperplaneVector[i] * (leftVectorValue + rightVectorValue)) / 2;
hyperplaneVector[i] = vectorData[left][i] - vectorData[right][i];
hyperplaneOffset -= (hyperplaneVector[i] * (vectorData[left][i] + vectorData[right][i])) / 2;
}

// For each point compute the margin (project into normal vector)
Expand All @@ -83,7 +82,7 @@ private static (int[] indicesLeft, int[] indicesRight, float[] hyperplaneVector,
var margin = hyperplaneOffset;
for (var d = 0; d < dim; d++)
{
margin += hyperplaneVector[d] * data[indices[i]][d].EmbeddingVectorValue;
margin += hyperplaneVector[d] * vectorData[indices[i]][d];
}

if (margin == 0)
Expand Down
70 changes: 37 additions & 33 deletions UMAP/Umap.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

namespace UMAP
{
public sealed class Umap<T>
public sealed class Umap<T> where T : IUmapDataPoint
{
private const float SMOOTH_K_TOLERANCE = 1e-5f;
private const float MIN_K_DIST_SCALE = 1e-3f;
Expand All @@ -31,7 +31,7 @@ public sealed class Umap<T>

// Internal graph connectivity representation
private SparseMatrix _graph = null;
private IUmapDistanceParameter<T>[][] _x = null;
private T[] _x = null;
private bool _isInitialized = false;
private Tree<T>.FlatTree[] _rpForest = new Tree<T>.FlatTree[0];

Expand All @@ -57,7 +57,7 @@ public Umap(
throw new ArgumentOutOfRangeException(nameof(customNumberOfEpochs), "if non-null then must be a positive value");
}

_distanceFn = distance ?? DistanceFunctions<T>.Cosine;
_distanceFn = distance ?? DistanceFunctions.Cosine;
_random = random ?? DefaultRandomGenerator.Instance;
_nNeighbors = numberOfNeighbors;
_optimizationState = new OptimizationState { Dim = dimensions };
Expand All @@ -69,7 +69,7 @@ public Umap(
/// Initializes fit by computing KNN and a fuzzy simplicial set, as well as initializing the projected embeddings. Sets the optimization state ahead of optimization steps.
/// Returns the number of epochs to be used for the SGD optimization.
/// </summary>
public int InitializeFit(IUmapDistanceParameter<T>[][] x)
public int InitializeFit(T[] x)
{
// We don't need to reinitialize if we've already initialized for this data
if ((_x == x) && _isInitialized)
Expand Down Expand Up @@ -149,7 +149,7 @@ private int GetNEpochs()
/// <summary>
/// Compute the ``nNeighbors`` nearest points for each data point in ``X`` - this may be exact, but more likely is approximated via nearest neighbor descent.
/// </summary>
internal (int[][] knnIndices, float[][] knnDistances) NearestNeighbors(IUmapDistanceParameter<T>[][] x, ProgressReporter progressReporter)
internal (int[][] knnIndices, float[][] knnDistances) NearestNeighbors(T[] x, ProgressReporter progressReporter)
{
var metricNNDescent = NNDescent<T>.MakeNNDescent(_distanceFn, _random);
progressReporter(0.05f);
Expand All @@ -169,8 +169,6 @@ private int GetNEpochs()
progressReporter(0.45f);
var nnDescendProgressReporter = ScaleProgressReporter(progressReporter, 0.5f, 1);

var organizedDataList = new List<(float left, float right)>();

return metricNNDescent(x, leafArray, _nNeighbors, nIters, startingIteration: (i, max) => nnDescendProgressReporter((float)i / max));

// Handle python3 rounding down from 0.5 discrepancy
Expand All @@ -182,7 +180,7 @@ private int GetNEpochs()
/// to the data. This is done by locally approximating geodesic distance at each point, creating a fuzzy simplicial set for each such point, and then combining all the local fuzzy
/// simplicial sets into a global one via a fuzzy union.
/// </summary>
private SparseMatrix FuzzySimplicialSet(IUmapDistanceParameter<T>[][] x, int nNeighbors, float setOpMixRatio, ProgressReporter progressReporter)
private SparseMatrix FuzzySimplicialSet(T[] x, int nNeighbors, float setOpMixRatio, ProgressReporter progressReporter)
{
var knnIndices = _knnIndices ?? new int[0][];
var knnDistances = _knnDistances ?? new float[0][];
Expand Down Expand Up @@ -383,15 +381,15 @@ private static (int[] rows, int[] cols, float[] vals) ComputeMembershipStrengths
return (head.ToArray(), tail.ToArray(), MakeEpochsPerSample(weights.ToArray(), nEpochs));
}

private void ShuffleTogether<T, T2, T3>(List<T> list, List<T2> other, List<T3> weights)
private void ShuffleTogether<T1, T2, T3>(List<T1> list, List<T2> other, List<T3> weights)
{
int n = list.Count;
if (other.Count != n) { throw new Exception(); }
while (n > 1)
{
n--;
int k = _random.Next(0, n + 1);
T value = list[k];
T1 value = list[k];
list[k] = list[n];
list[n] = value;

Expand Down Expand Up @@ -629,42 +627,48 @@ private static ProgressReporter ScaleProgressReporter(ProgressReporter progressR
return progress => progressReporter((range * progress) + start);
}

public static class DistanceFunctions<T>
public static class DistanceFunctions
{
public static float Cosine(IUmapDistanceParameter<T>[] lhs, IUmapDistanceParameter<T>[] rhs)
public static float Cosine(T lhs, T rhs)
{
return 1 - (SIMD<T>.DotProduct(ref lhs, ref rhs) / (SIMD<T>.Magnitude(ref lhs) * SIMD<T>.Magnitude(ref rhs)));
var lhsVal = lhs.Data;
var rhsVal = rhs.Data;
return 1 - (SIMD<T>.DotProduct(ref lhsVal, ref rhsVal) / (SIMD<T>.Magnitude(ref lhsVal) * SIMD<T>.Magnitude(ref rhsVal)));
}

public static float CosineForNormalizedVectors(IUmapDistanceParameter<T>[] lhs, IUmapDistanceParameter<T>[] rhs)
public static float CosineForNormalizedVectors(T lhs, T rhs)
{
return 1 - SIMD<T>.DotProduct(ref lhs, ref rhs);
var lhsVal = lhs.Data;
var rhsVal = rhs.Data;
return 1 - SIMD<T>.DotProduct(ref lhsVal, ref rhsVal);
}

public static float Euclidean(float[] lhs, float[] rhs)
public static float Euclidean(T lhs, T rhs)
{
return (float)Math.Sqrt(SIMD<T>.Euclidean(ref lhs, ref rhs)); // TODO: Replace with netcore3 MathF class when the framework is available
var lhsVal = lhs.Data;
var rhsVal = rhs.Data;
return (float)Math.Sqrt(SIMD<T>.Euclidean(ref lhsVal, ref rhsVal)); // TODO: Replace with netcore3 MathF class when the framework is available
}
}

private sealed class OptimizationState
{
public int CurrentEpoch = 0;
public int[] Head = new int[0];
public int[] Tail = new int[0];
public float[] EpochsPerSample = new float[0];
public float[] EpochOfNextSample = new float[0];
public float[] EpochOfNextNegativeSample= new float[0];
public float[] EpochsPerNegativeSample = new float[0];
public bool MoveOther = true;
public float InitialAlpha = 1;
public float Alpha = 1;
public float Gamma = 1;
public float A = 1.5769434603113077f;
public float B = 0.8950608779109733f;
public int Dim = 2;
public int NEpochs = 500;
public int NVertices = 0;
public int CurrentEpoch = 0;
public int[] Head = new int[0];
public int[] Tail = new int[0];
public float[] EpochsPerSample = new float[0];
public float[] EpochOfNextSample = new float[0];
public float[] EpochOfNextNegativeSample = new float[0];
public float[] EpochsPerNegativeSample = new float[0];
public bool MoveOther = true;
public float InitialAlpha = 1;
public float Alpha = 1;
public float Gamma = 1;
public float A = 1.5769434603113077f;
public float B = 0.8950608779109733f;
public int Dim = 2;
public int NEpochs = 500;
public int NVertices = 0;

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public float GetDistanceFactor(float distSquared) => 1f / ((0.001f + distSquared) * (float)(A * Math.Pow(distSquared, B) + 1));
Expand Down

0 comments on commit a3eb8e3

Please sign in to comment.