Skip to content

Commit

Permalink
add indexes
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrii Yefymenko committed Jan 16, 2024
1 parent bc09a56 commit fbb1305
Show file tree
Hide file tree
Showing 11 changed files with 806 additions and 248 deletions.
440 changes: 406 additions & 34 deletions src/Alpaca/Clustering/AffinityPropagation.cs

Large diffs are not rendered by default.

99 changes: 99 additions & 0 deletions src/Alpaca/Clustering/HierarchicalClustering.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace UnicornAnalytics.Clustering
{
/// <summary>
/// Agglomerative (bottom-up) hierarchical clustering over row vectors.
/// Every point starts in its own cluster; the two clusters with the smallest
/// complete-linkage distance (the largest Euclidean distance between any pair
/// of their members) are merged repeatedly until the requested number of
/// clusters remains.
/// </summary>
public class HierarchicalClustering
{
    /// <summary>
    /// Mean vector of each cluster produced by the last call to <see cref="Cluster"/>,
    /// in the same order as <see cref="GetClusters"/>. Empty until clustering has run.
    /// </summary>
    public double[][] Centroids { get; private set; } = new double[0][];

    // Each inner list holds the row indices (into the data passed to Cluster)
    // that belong to one cluster. Starts empty so GetClusters is safe to call
    // before Cluster.
    private List<List<int>> clusters = new List<List<int>>();

    /// <summary>
    /// Clusters the rows of <paramref name="data"/> and stores the result in
    /// <see cref="Centroids"/> and <see cref="GetClusters"/>.
    /// </summary>
    /// <param name="data">One row per point; all rows must have the same length.</param>
    /// <param name="targetClusterCount">
    /// Merging stops once this many clusters remain. The default of 1 merges
    /// everything into a single cluster. If it is greater than or equal to the
    /// number of points, every point stays in its own cluster.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="targetClusterCount"/> is less than 1.</exception>
    /// <exception cref="ArgumentException">A row is null or rows differ in length.</exception>
    public void Cluster(double[][] data, int targetClusterCount = 1)
    {
        if (data == null)
        {
            throw new ArgumentNullException(nameof(data));
        }

        if (targetClusterCount < 1)
        {
            throw new ArgumentOutOfRangeException(
                nameof(targetClusterCount), "Target cluster count must be at least 1.");
        }

        ValidateRows(data);

        // Build the result in a local list so the instance state is only
        // replaced once clustering has completed successfully.
        var working = new List<List<int>>(data.Length);
        for (int i = 0; i < data.Length; i++)
        {
            working.Add(new List<int> { i });
        }

        // targetClusterCount >= 1, so at least two clusters exist inside the loop.
        while (working.Count > targetClusterCount)
        {
            // Default to the first valid pair. Distances that are NaN or
            // infinite never compare below the current best, and falling back
            // to (0, 0) would append cluster 0 to itself and then delete it,
            // losing its points.
            int bestFirst = 0;
            int bestSecond = 1;
            double bestDistance = double.PositiveInfinity;

            for (int i = 0; i < working.Count; i++)
            {
                for (int j = i + 1; j < working.Count; j++)
                {
                    double distance = FindDistance(data, working, i, j);
                    if (distance < bestDistance)
                    {
                        bestDistance = distance;
                        bestFirst = i;
                        bestSecond = j;
                    }
                }
            }

            // bestSecond > bestFirst, so removing it does not shift bestFirst.
            working[bestFirst].AddRange(working[bestSecond]);
            working.RemoveAt(bestSecond);
        }

        clusters = working;
        Centroids = working.Select(cluster => CalculateCentroid(data, cluster)).ToArray();
    }

    /// <summary>
    /// Returns the row indices of each cluster from the last call to
    /// <see cref="Cluster"/>, as freshly allocated arrays. Returns an empty
    /// array if clustering has not run yet.
    /// </summary>
    public int[][] GetClusters()
    {
        return clusters.Select(cluster => cluster.ToArray()).ToArray();
    }

    // Rejects null rows and rows whose length differs from the first row, so
    // the distance and centroid helpers can index every row safely.
    private static void ValidateRows(double[][] data)
    {
        for (int i = 0; i < data.Length; i++)
        {
            if (data[i] == null || data[i].Length != data[0].Length)
            {
                throw new ArgumentException(
                    "All data rows must be non-null and have the same length.", nameof(data));
            }
        }
    }

    // Complete-linkage distance: the largest Euclidean distance between any
    // member of cluster i and any member of cluster j.
    private static double FindDistance(double[][] data, List<List<int>> clusters, int i, int j)
    {
        double maxDistance = double.NegativeInfinity;

        foreach (int point1 in clusters[i])
        {
            foreach (int point2 in clusters[j])
            {
                double distance = EuclideanDistance(data[point1], data[point2]);
                maxDistance = Math.Max(maxDistance, distance);
            }
        }

        return maxDistance;
    }

    // Component-wise mean of the rows referenced by the cluster's indices.
    private static double[] CalculateCentroid(double[][] data, List<int> cluster)
    {
        int dimensions = data[0].Length;
        double[] centroid = new double[dimensions];

        foreach (int index in cluster)
        {
            for (int j = 0; j < dimensions; j++)
            {
                centroid[j] += data[index][j];
            }
        }

        for (int i = 0; i < dimensions; i++)
        {
            centroid[i] /= cluster.Count;
        }

        return centroid;
    }

    // Straight-line distance between two vectors of equal length.
    private static double EuclideanDistance(double[] vector1, double[] vector2)
    {
        double sum = 0.0;
        for (int i = 0; i < vector1.Length; i++)
        {
            double diff = vector1[i] - vector2[i];
            sum += diff * diff;
        }

        return Math.Sqrt(sum);
    }
}
}
118 changes: 67 additions & 51 deletions src/Alpaca/Clustering/MeanShift.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,87 +6,103 @@ namespace UnicornAnalytics.Clustering
{
public class MeanShift
{
private double _bandwidth;
private readonly double _bandwidth;

public double[][] Centroids { get; private set; }

public int[] ClusterAssignments { get; private set; }
public double[][] Centers { get; private set; }
public int[] Labels { get; private set; }

public MeanShift(double bandwidth)
{
this._bandwidth = bandwidth;
_bandwidth = bandwidth;
}

public void Fit(double[][] data)
{
List<double[]> newCentroids = new List<double[]>();
bool[] visited = new bool[data.Length];
ClusterAssignments = new int[data.Length];
List<double[]> centers = new List<double[]>();
foreach (double[] point in data)
{
centers.Add(ShiftPoint(point, data, _bandwidth));
}

for (int i = 0; i < data.Length; i++)
for (int i = 0; i < centers.Count; i++)
{
if (!visited[i])
for (int j = i + 1; j < centers.Count; j++)
{
double[] centroid = data[i];
while (true)
if (EuclideanDistance(centers[i], centers[j]) < _bandwidth)
{
List<double[]> inBandwidth = new List<double[]>();

for (int j = 0; j < data.Length; j++)
{
if (EuclideanDistance(data[j], centroid) <= _bandwidth)
{
inBandwidth.Add(data[j]);
visited[j] = true;
}
}

double[] newCentroid = inBandwidth.Aggregate((x, y) =>
x.Zip(y, (a, b) => a + b).ToArray());

for (int k = 0; k < newCentroid.Length; k++)
{
newCentroid[k] /= inBandwidth.Count;
}

if (EuclideanDistance(newCentroid, centroid) < 1e-5)
{
break;
}

centroid = newCentroid;
centers[j] = centers[i];
}
}
}

newCentroids.Add(centroid);
Centers = centers.Distinct(new CenterComparer()).ToArray();

int clusterId = newCentroids.Count - 1;
for (int l = 0; l < data.Length; l++)
Labels = new int[data.Length];
for (int i = 0; i < data.Length; i++)
{
double minDist = double.PositiveInfinity;
for (int j = 0; j < Centers.Length; j++)
{
double dist = EuclideanDistance(data[i], Centers[j]);
if (dist < minDist)
{
if (EuclideanDistance(data[l], centroid) <= _bandwidth)
{
ClusterAssignments[l] = clusterId;
}
minDist = dist;
Labels[i] = j;
}
}
}
}

private double[] ShiftPoint(double[] point, IEnumerable<double[]> points, double bandwidth)
{
double[] shiftedPoint = new double[point.Length];
double scale = 0;
foreach (double[] p in points)
{
double distance = EuclideanDistance(point, p);
double weight = GaussianKernel(distance, bandwidth);
for (int i = 0; i < shiftedPoint.Length; i++)
{
shiftedPoint[i] += weight * p[i];
}
scale += weight;
}

for (int i = 0; i < shiftedPoint.Length; i++)
{
shiftedPoint[i] /= scale;
}

Centroids = newCentroids.ToArray();
return shiftedPoint;
}

private double EuclideanDistance(double[] a, double[] b)
{
return Math.Sqrt(a.Zip(b, (x, y) => (x - y) * (x - y)).Sum());
double sum = 0;
for (int i = 0; i < a.Length; i++)
{
double diff = a[i] - b[i];
sum += diff * diff;
}
return Math.Sqrt(sum);
}

public double[][] GetCentroids()
private double GaussianKernel(double distance, double bandwidth)
{
return Centroids;
return (1 / (bandwidth * Math.Sqrt(2 * Math.PI))) * Math.Exp(-0.5 * Math.Pow(distance / bandwidth, 2));
}

public int[] GetClusterAssignments()
private class CenterComparer : IEqualityComparer<double[]>
{
return ClusterAssignments;
public bool Equals(double[] x, double[] y)
{
return x.SequenceEqual(y);
}

public int GetHashCode(double[] obj)
{
return obj.Sum().GetHashCode();
}
}
}

}
25 changes: 13 additions & 12 deletions src/Alpaca/Indexes/External/AdjustedRandIndex.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System;
using System.Linq;

namespace UnicornAnalytics.Indexes.External
{
Expand All @@ -11,22 +12,22 @@ public double Calculate(int[] clusterIndices1, int[] clusterIndices2)
throw new ArgumentException("Size of both cluster arrays should be equal");
}

int N = clusterIndices1.Length;
double[,] contingencyMatrix = new double[N, N];
int maxIndex = Math.Max(clusterIndices1.Max(), clusterIndices2.Max()) + 1;
double[,] contingencyMatrix = new double[maxIndex, maxIndex];

// Create the contingency table
for (int i = 0; i < N; i++)
for (int i = 0; i < clusterIndices1.Length; i++)
{
contingencyMatrix[clusterIndices1[i], clusterIndices2[i]]++;
}

int[] rowSums = new int[N];
int[] colSums = new int[N];
int[] rowSums = new int[maxIndex];
int[] colSums = new int[maxIndex];

// Calculate the row and column sums
for (int i = 0; i < N; i++)
for (int i = 0; i < maxIndex; i++)
{
for (int j = 0; j < N; j++)
for (int j = 0; j < maxIndex; j++)
{
rowSums[i] += (int)contingencyMatrix[i, j];
colSums[j] += (int)contingencyMatrix[i, j];
Expand All @@ -37,23 +38,23 @@ public double Calculate(int[] clusterIndices1, int[] clusterIndices2)
double sumCombCol = 0.0;
double sumComb = 0.0;

for (int i = 0; i < N; i++)
for (int i = 0; i < maxIndex; i++)
{
sumComb += Combination2((int)contingencyMatrix[i, i]);
sumCombRow += Combination2(rowSums[i]);
sumCombCol += Combination2(colSums[i]);
}

double index = sumComb - sumCombRow * sumCombCol / Combination2(N);
double maxIndex = 0.5 * (sumCombRow + sumCombCol) - sumCombRow * sumCombCol / Combination2(N);
double ARI = index / maxIndex;
double index = sumComb - sumCombRow * sumCombCol / Combination2(clusterIndices1.Length);
double maxIndexScore = 0.5 * (sumCombRow + sumCombCol) - (sumCombRow * sumCombCol) / Combination2(clusterIndices1.Length);
double ARI = index / maxIndexScore;

return ARI;
}

private double Combination2(int n)
{
// combinations of n items, taken 2 at a time, i.e. "n choose 2"
// combinations of 2
if (n < 2)
return 0;

Expand Down
71 changes: 0 additions & 71 deletions src/Alpaca/Indexes/External/HubertIndex.cs

This file was deleted.

Loading

0 comments on commit fbb1305

Please sign in to comment.