Skip to content

Commit

Permalink
Merge pull request #12 from EfimenkoAndrew/feature/fix-calinski-harabasz-index
Browse files Browse the repository at this point in the history

Feature/fix calinski harabasz index
  • Loading branch information
EfimenkoAndrew authored Jan 17, 2024
2 parents db88282 + bc9e904 commit b00698c
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 136 deletions.
83 changes: 47 additions & 36 deletions src/Alpaca/Indexes/Internal/CalinskiHarabaszIndex.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,61 +2,72 @@
using System.Linq;

namespace UnicornAnalytics.Indexes.Internal;

public class CalinskiHarabaszIndex
{
public double Calculate(double[][] data, int[] clusterTags, double[][] centroids)
public double Calculate(double[][] data, int[] clusterMarkers)
{
int numberOfClusters = centroids.Length;
int numberOfDataPoints = data.Length;
double betweenClusterVariance = ComputeBetweenClusterVariance(data, centroids, clusterTags);
double withinClusterVariance = ComputeWithinClusterVariance(data, centroids, clusterTags);
int n = data.Length; // total number of data points
int k = clusterMarkers.Distinct().Count(); // number of clusters

if (numberOfClusters == 1 || withinClusterVariance == 0)
{
return 0;
}
return (betweenClusterVariance / withinClusterVariance) * (numberOfDataPoints - numberOfClusters) / (numberOfClusters - 1);
}
double[] overallCentroid = CalculateCentroid(data); // overall data centroid

private double ComputeWithinClusterVariance(double[][] data, double[][] centroids, int[] clusterTags)
{
double variance = 0.0;
for (int i = 0; i < data.Length; i++)
double SSB = 0, SSW = 0;

for (int i = 0; i < k; i++)
{
double[] dataPoint = data[i];
double[] centroid = centroids[clusterTags[i]];
variance += EuclideanDistanceSquared(dataPoint, centroid);
double[][] clusterData = data.Where((v, j) => clusterMarkers[j] == i).ToArray();
double[] clusterCentroid = CalculateCentroid(clusterData); // cluster centroid

SSB += clusterData.Length * CalculateSquareDistance(clusterCentroid, overallCentroid);

foreach (double[] point in clusterData)
{
SSW += CalculateSquareDistance(point, clusterCentroid);
}
}
return variance;

double betweenGroupDispersion = SSB / (k - 1);
double withinGroupDispersion = SSW / (n - k);
return betweenGroupDispersion / withinGroupDispersion;
}

private double ComputeBetweenClusterVariance(double[][] data, double[][] centroids, int[] clusterTags)
public double[] CalculateCentroid(double[][] data)
{
double[] overallCentroid = CalculateOverallCentroid(centroids);
double variance = 0.0;
int[] clusterSizes = new int[centroids.Length];
int dimensions = data[0].Length;

for (int i = 0; i < clusterTags.Length; i++)
clusterSizes[clusterTags[i]]++;
double[] centroid = new double[dimensions];

for (int i = 0; i < centroids.Length; i++)
foreach (double[] point in data)
{
variance += clusterSizes[i] * EuclideanDistanceSquared(centroids[i], overallCentroid);
for (int i = 0; i < dimensions; i++)
{
centroid[i] += point[i];
}
}

return variance;
}
for (int i = 0; i < dimensions; i++)
{
centroid[i] /= data.Length;
}

private double EuclideanDistanceSquared(double[] pointA, double[] pointB)
{
return pointA.Zip(pointB, (a, b) => (a - b) * (a - b)).Sum();
return centroid;
}

private double[] CalculateOverallCentroid(double[][] centroids)
public double CalculateSquareDistance(double[] a, double[] b)
{
return Enumerable.Range(0, centroids[0].Length)
.Select(i => centroids.Average(point => point[i]))
.ToArray();
if (a.Length != b.Length)
throw new ArgumentException("Points must have the same dimensionality");

double sumSquares = 0;

for (int i = 0; i < a.Length; i++)
{
double difference = a[i] - b[i];
sumSquares += difference * difference;
}

return sumSquares;
}
}

53 changes: 29 additions & 24 deletions src/Alpaca/Indexes/Internal/DaviesBouldinIndex.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,44 +5,49 @@ namespace UnicornAnalytics.Indexes.Internal;

public class DaviesBouldinIndex
{
public double Calculate(double[][] clustersCentroids, double[][] allData, int[] allDataClusterIndices)
private double GetEuclideanDistance(double[] vectorA, double[] vectorB)
{
int k = clustersCentroids.Length;
double[] S = new double[k];
return Math.Sqrt(vectorA.Zip(vectorB, (a, b) => (a - b) * (a - b)).Sum());
}

public double Calculate(double[][] data, int[] clusterMarkers)
{
int numClusters = clusterMarkers.Max() + 1;
double[][] centroids = new double[numClusters][];
double[] avgIntraClusterDistances = new double[numClusters];

for (int i = 0; i < k; i++)
for (int i = 0; i < numClusters; i++)
{
S[i] = allData.Where((t, j) => allDataClusterIndices[j] == i).Average(t => EuclideanDistance(t, clustersCentroids[i]));
var clusterPoints = data.Where((t, j) => clusterMarkers[j] == i).ToArray();
centroids[i] = clusterPoints.Aggregate(new double[clusterPoints[0].Length], (a, b) => a.Zip(b, (x, y) => x + y).ToArray());
for (int j = 0; j < centroids[i].Length; j++)
{
centroids[i][j] /= clusterPoints.Length;
}
avgIntraClusterDistances[i] = clusterPoints.Average(p => GetEuclideanDistance(p, centroids[i]));
}

double DBIndex = 0.0;
for (int i = 0; i < k; i++)
double dbIndex = 0;

for (int i = 0; i < numClusters; i++)
{
double maxRatio = 0.0;
for (int j = 0; j < k; j++)
double maxRatio = double.MinValue;

for (int j = 0; j < numClusters; j++)
{
if (j != i)
if (i != j)
{
double distance = EuclideanDistance(clustersCentroids[i], clustersCentroids[j]);
double ratio = (S[i] + S[j]) / distance;
double ratio = (avgIntraClusterDistances[i] + avgIntraClusterDistances[j]) / GetEuclideanDistance(centroids[i], centroids[j]);
if (ratio > maxRatio)
{
maxRatio = ratio;
}
}
}
DBIndex += maxRatio;
}
DBIndex /= k;

return DBIndex;
}
dbIndex += maxRatio;
}

private double EuclideanDistance(double[] x, double[] y)
{
double distance = 0;
for (int i = 0; i < x.Length; i++)
distance += Math.Pow(x[i] - y[i], 2);
return Math.Sqrt(distance);
return dbIndex / numClusters;
}
}
}
66 changes: 0 additions & 66 deletions src/Alpaca/Indexes/Internal/HubertIndex.cs

This file was deleted.

15 changes: 5 additions & 10 deletions tests/UpdatedEvaluations/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
using UnicornAnalytics.Indexes.Internal;
using UpdatedEvaluations;

var name = "4";
var name = "3";
var path = $@"C:\Work\personal\Diploma\datasets\data+y\{name}.txt";
using var streamReader = new StreamReader(path);
using var csv = new CsvReader(streamReader, new CsvConfiguration(CultureInfo.InvariantCulture) { HasHeaderRecord = false});
Expand All @@ -17,9 +17,9 @@
var sim = SimilarityMatrix.SparseSimilarityMatrix(points);

KMeans kMeans = new KMeans();
kMeans.Fit(parsed, 3);
kMeans.Fit(parsed, 2);

FuzzyCMeans fuzzyCMeans = new FuzzyCMeans(3, 10.0);
FuzzyCMeans fuzzyCMeans = new FuzzyCMeans(2, 10.0);
fuzzyCMeans.Fit(parsed, 100);

var meanShift = new MeanShift(75);
Expand All @@ -38,7 +38,6 @@
DaviesBouldinIndex db = new DaviesBouldinIndex();
CIndexCalculatorIndex cIndex = new CIndexCalculatorIndex();
SilhouetteIndex sIndex = new SilhouetteIndex();
HubertIndex hubertIndex = new HubertIndex();

RandIndex randIndex = new RandIndex();
Dictionary<string, double> indexValidations = new();
Expand All @@ -62,24 +61,20 @@
writer_r.Close();



void ValidateIndexes(double[][] allData, int[] clusters, double[][] centroids)
{
indexValidations.Clear();
var chValuation = ch.Calculate(allData, clusters, centroids);
var chValuation = ch.Calculate(allData, clusters);
indexValidations.Add("CalinskiHarabaszIndex", chValuation);

var dbValuation = db.Calculate(centroids, allData, clusters);
var dbValuation = db.Calculate(allData, clusters);
indexValidations.Add("DaviesBouldinIndex", dbValuation);

var cValuation = cIndex.Calculate(allData, clusters);
indexValidations.Add("CIndexCalculatorIndex", cValuation);

var sValuation = sIndex.Calculate(allData, clusters);
indexValidations.Add("SilhouetteIndex", sValuation);

var hValuation = hubertIndex.Calculate(allData, clusters);
indexValidations.Add("HubertIndex", hValuation);
}

void WriteResultsToFile(string path, double[][] data, int[] clusters)
Expand Down

0 comments on commit b00698c

Please sign in to comment.