vcacheoptimizer: Improve meshopt_optimizeVertexCache performance by ~15%

The core of this algorithm has a few branches in its inner loops that are all difficult to predict. The triangle scoring was already compiled to cmov by clang/gcc, but the cache copy wasn't, and the resulting branchy code consumed a surprisingly large fraction of the total compute; making that code branchless yielded a ~7% speedup by itself. The other ~7% speedup comes from a live_triangles check during the scoring update: there's no need to update anything if we aren't going to use this vertex again. This is likely to be less effective on other meshes, but it's rather effective on the Buddha mesh. Note: live_triangles is equivalent to adjacency.counts at the point of the cache score update, but checking adjacency.counts generates better code on clang.
zeux · Jun 23, 2023 · 61d2f12 · 61d2f12
1 parent a35f63d
commit 61d2f12
Showing 1 changed file with 10 additions and 11 deletions.
diff --git a/src/vcacheoptimizer.cpp b/src/vcacheoptimizer.cpp
@@ -221,9 +221,9 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
 		triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c];
 	}
 
-	unsigned int cache_holder[2 * (kCacheSizeMax + 3)];
+	unsigned int cache_holder[2 * (kCacheSizeMax + 4)];
 	unsigned int* cache = cache_holder;
-	unsigned int* cache_new = cache_holder + kCacheSizeMax + 3;
+	unsigned int* cache_new = cache_holder + kCacheSizeMax + 4;
 	size_t cache_count = 0;
 
 	unsigned int current_triangle = 0;
@@ -260,10 +260,8 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
 		{
 			unsigned int index = cache[i];
 
-			if (index != a && index != b && index != c)
-			{
-				cache_new[cache_write++] = index;
-			}
+			cache_new[cache_write] = index;
+			cache_write += (index != a && index != b && index != c);
 		}
 
 		unsigned int* cache_temp = cache;
@@ -305,6 +303,10 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
 		{
 			unsigned int index = cache[i];
 
+			// no need to update scores if we are never going to use this vertex
+			if (adjacency.counts[index] == 0)
+				continue;
+
 			int cache_position = i >= cache_size ? -1 : int(i);
 
 			// update vertex score
@@ -325,11 +327,8 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
 				float tri_score = triangle_scores[tri] + score_diff;
 				assert(tri_score > 0);
 
-				if (best_score < tri_score)
-				{
-					best_triangle = tri;
-					best_score = tri_score;
-				}
+				best_triangle = best_score < tri_score ? tri : best_triangle;
+				best_score = best_score < tri_score ? tri_score : best_score;
 
 				triangle_scores[tri] = tri_score;
 			}