From 6ff2b3fac92102d144d3820c70487ecc9cce8da5 Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine <arseny.kapoulkine@gmail.com>
Date: Thu, 24 Aug 2023 11:42:22 -0700
Subject: [PATCH 1/7] quantization: Introduce a new file, quantization.cpp

meshopt_quantizeFloat and meshopt_quantizeHalf are moved to the new
file. These functions are larger (esp. float->half) and not as commonly
used as unorm/snorm. They also don't benefit as much from inlining,
since they are rarely called with constant arguments in perf-sensitive
contexts. Finally, they are not used anywhere inside meshoptimizer
itself, which means we still get to keep the independence between
different translation units (we'll need to be careful with this in the
future...).

A new file makes for a convenient place to add more quantization
utilities in the future as well.

These are used in gltfpack, which doesn't see a noticeable perf
degradation on geometry-heavy meshes that use quantizeFloat.
---
 CMakeLists.txt       |  1 +
 src/meshoptimizer.h  | 48 ++------------------------------------------
 src/quantization.cpp | 46 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+), 46 deletions(-)
 create mode 100644 src/quantization.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 01909c140..a21445796 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,6 +28,7 @@ set(SOURCES
     src/indexgenerator.cpp
     src/overdrawanalyzer.cpp
     src/overdrawoptimizer.cpp
+    src/quantization.cpp
     src/simplifier.cpp
     src/spatialorder.cpp
     src/stripifier.cpp
diff --git a/src/meshoptimizer.h b/src/meshoptimizer.h
index aeeab9de6..b8a8164a4 100644
--- a/src/meshoptimizer.h
+++ b/src/meshoptimizer.h
@@ -582,14 +582,14 @@ inline int meshopt_quantizeSnorm(float v, int N);
  * Representable magnitude range: [6e-5; 65504]
  * Maximum relative reconstruction error: 5e-4
  */
-inline unsigned short meshopt_quantizeHalf(float v);
+MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v);
 
 /**
  * Quantize a float into a floating point value with a limited number of significant mantissa bits
  * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
  * Assumes N is in a valid mantissa precision range, which is 1..23
  */
-inline float meshopt_quantizeFloat(float v, int N);
+MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
 #endif
 
 /**
@@ -684,50 +684,6 @@ inline int meshopt_quantizeSnorm(float v, int N)
 
 	return int(v * scale + round);
 }
-
-inline unsigned short meshopt_quantizeHalf(float v)
-{
-	union { float f; unsigned int ui; } u = {v};
-	unsigned int ui = u.ui;
-
-	int s = (ui >> 16) & 0x8000;
-	int em = ui & 0x7fffffff;
-
-	/* bias exponent and round to nearest; 112 is relative exponent bias (127-15) */
-	int h = (em - (112 << 23) + (1 << 12)) >> 13;
-
-	/* underflow: flush to zero; 113 encodes exponent -14 */
-	h = (em < (113 << 23)) ? 0 : h;
-
-	/* overflow: infinity; 143 encodes exponent 16 */
-	h = (em >= (143 << 23)) ? 0x7c00 : h;
-
-	/* NaN; note that we convert all types of NaN to qNaN */
-	h = (em > (255 << 23)) ? 0x7e00 : h;
-
-	return (unsigned short)(s | h);
-}
-
-inline float meshopt_quantizeFloat(float v, int N)
-{
-	union { float f; unsigned int ui; } u = {v};
-	unsigned int ui = u.ui;
-
-	const int mask = (1 << (23 - N)) - 1;
-	const int round = (1 << (23 - N)) >> 1;
-
-	int e = ui & 0x7f800000;
-	unsigned int rui = (ui + round) & ~mask;
-
-	/* round all numbers except inf/nan; this is important to make sure nan doesn't overflow into -0 */
-	ui = e == 0x7f800000 ? ui : rui;
-
-	/* flush denormals to zero */
-	ui = e == 0 ? 0 : ui;
-
-	u.ui = ui;
-	return u.f;
-}
 #endif
 
 /* Internal implementation helpers */
diff --git a/src/quantization.cpp b/src/quantization.cpp
new file mode 100644
index 000000000..59592dc80
--- /dev/null
+++ b/src/quantization.cpp
@@ -0,0 +1,46 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+unsigned short meshopt_quantizeHalf(float v)
+{
+	union { float f; unsigned int ui; } u = {v};
+	unsigned int ui = u.ui;
+
+	int s = (ui >> 16) & 0x8000;
+	int em = ui & 0x7fffffff;
+
+	/* bias exponent and round to nearest; 112 is relative exponent bias (127-15) */
+	int h = (em - (112 << 23) + (1 << 12)) >> 13;
+
+	/* underflow: flush to zero; 113 encodes exponent -14 */
+	h = (em < (113 << 23)) ? 0 : h;
+
+	/* overflow: infinity; 143 encodes exponent 16 */
+	h = (em >= (143 << 23)) ? 0x7c00 : h;
+
+	/* NaN; note that we convert all types of NaN to qNaN */
+	h = (em > (255 << 23)) ? 0x7e00 : h;
+
+	return (unsigned short)(s | h);
+}
+
+float meshopt_quantizeFloat(float v, int N)
+{
+	union { float f; unsigned int ui; } u = {v};
+	unsigned int ui = u.ui;
+
+	const int mask = (1 << (23 - N)) - 1;
+	const int round = (1 << (23 - N)) >> 1;
+
+	int e = ui & 0x7f800000;
+	unsigned int rui = (ui + round) & ~mask;
+
+	/* round all numbers except inf/nan; this is important to make sure nan doesn't overflow into -0 */
+	ui = e == 0x7f800000 ? ui : rui;
+
+	/* flush denormals to zero */
+	ui = e == 0 ? 0 : ui;
+
+	u.ui = ui;
+	return u.f;
+}

From 3d1a098e6bdcf1c79386d2d92ccea1e4fa9a236f Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine <arseny.kapoulkine@gmail.com>
Date: Thu, 24 Aug 2023 11:58:29 -0700
Subject: [PATCH 2/7] demo: Add quantization tests

We now test both quantizeFloat and quantizeHalf fairly thoroughly via
unit tests.
---
 demo/tests.cpp | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/demo/tests.cpp b/demo/tests.cpp
index 3f933609d..ca31112d8 100644
--- a/demo/tests.cpp
+++ b/demo/tests.cpp
@@ -1222,6 +1222,52 @@ static void tessellation()
 	assert(memcmp(tessib, expected, sizeof(expected)) == 0);
 }
 
+static void quantizeFloat()
+{
+	assert(meshopt_quantizeFloat(1.2345f, 23) == 1.2345f);
+
+	assert(meshopt_quantizeFloat(1.2345f, 16) == 1.2344971f);
+	assert(meshopt_quantizeFloat(1.2345f, 8) == 1.2343750f);
+	assert(meshopt_quantizeFloat(1.2345f, 4) == 1.25f);
+	assert(meshopt_quantizeFloat(1.2345f, 1) == 1.0);
+
+	assert(meshopt_quantizeFloat(1.0f, 0) == 1.0f);
+}
+
+static void quantizeHalf()
+{
+	// normal
+	assert(meshopt_quantizeHalf(1.2345f) == 0x3cf0);
+
+	// overflow
+	assert(meshopt_quantizeHalf(65535.f) == 0x7c00);
+	assert(meshopt_quantizeHalf(-65535.f) == 0xfc00);
+
+	// large
+	assert(meshopt_quantizeHalf(65000.f) == 0x7bef);
+	assert(meshopt_quantizeHalf(-65000.f) == 0xfbef);
+
+	// small
+	assert(meshopt_quantizeHalf(0.125f) == 0x3000);
+	assert(meshopt_quantizeHalf(-0.125f) == 0xb000);
+
+	// very small
+	assert(meshopt_quantizeHalf(1e-4f) == 0x068e);
+	assert(meshopt_quantizeHalf(-1e-4f) == 0x868e);
+
+	// underflow
+	assert(meshopt_quantizeHalf(1e-5f) == 0x0000);
+	assert(meshopt_quantizeHalf(-1e-5f) == 0x8000);
+
+	// exponent underflow
+	assert(meshopt_quantizeHalf(1e-20f) == 0x0000);
+	assert(meshopt_quantizeHalf(-1e-20f) == 0x8000);
+
+	// exponent overflow
+	assert(meshopt_quantizeHalf(1e20f) == 0x7c00);
+	assert(meshopt_quantizeHalf(-1e20f) == 0xfc00);
+}
+
 void runTests()
 {
 	decodeIndexV0();
@@ -1284,4 +1330,7 @@ void runTests()
 
 	adjacency();
 	tessellation();
+
+	quantizeFloat();
+	quantizeHalf();
 }

From 275fb30063488eda319b3ee94d19784fb242ddd7 Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine <arseny.kapoulkine@gmail.com>
Date: Thu, 24 Aug 2023 12:00:29 -0700
Subject: [PATCH 3/7] quantization: Add assertions for quantizeFloat N argument

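The assertion accepts N in 0..23; N = 0 clears all explicit mantissa
bits and is exercised by the unit tests, even though meshoptimizer.h
documents the valid range as 1..23.

Also convert C comments to C++ - this is no longer part of
meshoptimizer.h so it doesn't need to be C89 compatible.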
---
 src/quantization.cpp | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/quantization.cpp b/src/quantization.cpp
index 59592dc80..7399781ed 100644
--- a/src/quantization.cpp
+++ b/src/quantization.cpp
@@ -1,6 +1,8 @@
 // This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
 #include "meshoptimizer.h"
 
+#include <assert.h>
+
 unsigned short meshopt_quantizeHalf(float v)
 {
 	union { float f; unsigned int ui; } u = {v};
@@ -9,16 +11,16 @@ unsigned short meshopt_quantizeHalf(float v)
 	int s = (ui >> 16) & 0x8000;
 	int em = ui & 0x7fffffff;
 
-	/* bias exponent and round to nearest; 112 is relative exponent bias (127-15) */
+	// bias exponent and round to nearest; 112 is relative exponent bias (127-15)
 	int h = (em - (112 << 23) + (1 << 12)) >> 13;
 
-	/* underflow: flush to zero; 113 encodes exponent -14 */
+	// underflow: flush to zero; 113 encodes exponent -14
 	h = (em < (113 << 23)) ? 0 : h;
 
-	/* overflow: infinity; 143 encodes exponent 16 */
+	// overflow: infinity; 143 encodes exponent 16
 	h = (em >= (143 << 23)) ? 0x7c00 : h;
 
-	/* NaN; note that we convert all types of NaN to qNaN */
+	// NaN; note that we convert all types of NaN to qNaN
 	h = (em > (255 << 23)) ? 0x7e00 : h;
 
 	return (unsigned short)(s | h);
@@ -26,6 +28,8 @@ unsigned short meshopt_quantizeHalf(float v)
 
 float meshopt_quantizeFloat(float v, int N)
 {
+	assert(N >= 0 && N <= 23);
+
 	union { float f; unsigned int ui; } u = {v};
 	unsigned int ui = u.ui;
 
@@ -35,10 +39,10 @@ float meshopt_quantizeFloat(float v, int N)
 	int e = ui & 0x7f800000;
 	unsigned int rui = (ui + round) & ~mask;
 
-	/* round all numbers except inf/nan; this is important to make sure nan doesn't overflow into -0 */
+	// round all numbers except inf/nan; this is important to make sure nan doesn't overflow into -0
 	ui = e == 0x7f800000 ? ui : rui;
 
-	/* flush denormals to zero */
+	// flush denormals to zero
 	ui = e == 0 ? 0 : ui;
 
 	u.ui = ui;

From 3bbaee09a09bc8e6b02c8620e0806fa4f9616c03 Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine <arseny.kapoulkine@gmail.com>
Date: Thu, 24 Aug 2023 12:04:11 -0700
Subject: [PATCH 4/7] demo: Add inf/nan quantization tests

---
 demo/tests.cpp | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/demo/tests.cpp b/demo/tests.cpp
index ca31112d8..5ec4a2e98 100644
--- a/demo/tests.cpp
+++ b/demo/tests.cpp
@@ -1231,7 +1231,13 @@ static void quantizeFloat()
 	assert(meshopt_quantizeFloat(1.2345f, 4) == 1.25f);
 	assert(meshopt_quantizeFloat(1.2345f, 1) == 1.0);
 
-	assert(meshopt_quantizeFloat(1.0f, 0) == 1.0f);
+	assert(meshopt_quantizeFloat(1.f, 0) == 1.0f);
+
+	assert(meshopt_quantizeFloat(1.f / 0.f, 0) == 1.f / 0.f);
+	assert(meshopt_quantizeFloat(-1.f / 0.f, 0) == -1.f / 0.f);
+
+	float nanf = meshopt_quantizeFloat(0.f / 0.f, 8);
+	assert(nanf != nanf);
 }
 
 static void quantizeHalf()
@@ -1266,6 +1272,14 @@ static void quantizeHalf()
 	// exponent overflow
 	assert(meshopt_quantizeHalf(1e20f) == 0x7c00);
 	assert(meshopt_quantizeHalf(-1e20f) == 0xfc00);
+
+	// inf
+	assert(meshopt_quantizeHalf(1.f / 0.f) == 0x7c00);
+	assert(meshopt_quantizeHalf(-1.f / 0.f) == 0xfc00);
+
+	// nan
+	unsigned short nanh = meshopt_quantizeHalf(0.f / 0.f);
+	assert(nanh == 0x7e00 || nanh == 0xfe00);
 }
 
 void runTests()

From 282153ea48c069bb4fa68e2cd15a0dbe2e62c0eb Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine <arseny.kapoulkine@gmail.com>
Date: Thu, 24 Aug 2023 13:25:23 -0700
Subject: [PATCH 5/7] quantization: Implement meshopt_dequantizeHalf

This function reverses the transformation performed by
meshopt_quantizeHalf; the reverse conversion is easier since the
exponent and mantissa just need to be expanded into the larger range.
We still need to handle denormals (flushed to zero) and specials;
unlike quantization, here we choose to preserve the NaN payload because
that allows us to fold the inf and nan cases into a single conditional
add.
---
 demo/tests.cpp       | 32 ++++++++++++++++++++++++++++++++
 src/meshoptimizer.h  |  8 +++++++-
 src/quantization.cpp | 20 ++++++++++++++++++++
 3 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/demo/tests.cpp b/demo/tests.cpp
index 5ec4a2e98..5cc459e7e 100644
--- a/demo/tests.cpp
+++ b/demo/tests.cpp
@@ -1282,6 +1282,37 @@ static void quantizeHalf()
 	assert(nanh == 0x7e00 || nanh == 0xfe00);
 }
 
+static void dequantizeHalf()
+{
+	// normal
+	assert(meshopt_dequantizeHalf(0x3cf0) == 1.234375f);
+
+	// large
+	assert(meshopt_dequantizeHalf(0x7bef) == 64992.f);
+	assert(meshopt_dequantizeHalf(0xfbef) == -64992.f);
+
+	// small
+	assert(meshopt_dequantizeHalf(0x3000) == 0.125f);
+	assert(meshopt_dequantizeHalf(0xb000) == -0.125f);
+
+	// very small
+	assert(meshopt_dequantizeHalf(0x068e) == 1.00016594e-4f);
+	assert(meshopt_dequantizeHalf(0x868e) == -1.00016594e-4f);
+
+	// denormal
+	assert(meshopt_dequantizeHalf(0x00ff) == 0.f);
+	assert(meshopt_dequantizeHalf(0x80ff) == 0.f); // actually this is -0.f
+	assert(1.f / meshopt_dequantizeHalf(0x80ff) == -1.f / 0.f);
+
+	// inf
+	assert(meshopt_dequantizeHalf(0x7c00) == 1.f / 0.f);
+	assert(meshopt_dequantizeHalf(0xfc00) == -1.f / 0.f);
+
+	// nan
+	float nanf = meshopt_dequantizeHalf(0x7e00);
+	assert(nanf != nanf);
+}
+
 void runTests()
 {
 	decodeIndexV0();
@@ -1347,4 +1378,5 @@ void runTests()
 
 	quantizeFloat();
 	quantizeHalf();
+	dequantizeHalf();
 }
diff --git a/src/meshoptimizer.h b/src/meshoptimizer.h
index b8a8164a4..53a23686d 100644
--- a/src/meshoptimizer.h
+++ b/src/meshoptimizer.h
@@ -577,7 +577,7 @@ inline int meshopt_quantizeUnorm(float v, int N);
 inline int meshopt_quantizeSnorm(float v, int N);
 
 /**
- * Quantize a float into half-precision floating point value
+ * Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
  * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
  * Representable magnitude range: [6e-5; 65504]
  * Maximum relative reconstruction error: 5e-4
@@ -590,6 +590,12 @@ MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v);
  * Assumes N is in a valid mantissa precision range, which is 1..23
  */
 MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
+
+/**
+ * Reverse quantization of a half-precision (as defined by IEEE-754 fp16) floating point value
+ * Preserves Inf/NaN, flushes denormals to zero
+ */
+MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h);
 #endif
 
 /**
diff --git a/src/quantization.cpp b/src/quantization.cpp
index 7399781ed..09a314d60 100644
--- a/src/quantization.cpp
+++ b/src/quantization.cpp
@@ -48,3 +48,23 @@ float meshopt_quantizeFloat(float v, int N)
 	u.ui = ui;
 	return u.f;
 }
+
+float meshopt_dequantizeHalf(unsigned short h)
+{
+	unsigned int s = unsigned(h & 0x8000) << 16;
+	int em = h & 0x7fff;
+
+	// bias exponent and pad mantissa with 0; 112 is relative exponent bias (127-15)
+	int r = (em + (112 << 10)) << 13;
+
+	// denormal: flush to zero
+	r = (em < (1 << 10)) ? 0 : r;
+
+	// infinity/NaN; note that we preserve NaN payload as a byproduct of unifying inf/nan cases
+	// 112 is an exponent bias fixup; since we already applied it once, applying it twice converts 31 to 255
+	r += (em >= (31 << 10)) ? (112 << 23) : 0;
+
+	union { float f; unsigned int ui; } u;
+	u.ui = s | r;
+	return u.f;
+}

From b4486e16e06f4358f9d5ef0969532211a80266f7 Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine <arseny.kapoulkine@gmail.com>
Date: Thu, 24 Aug 2023 14:15:55 -0700
Subject: [PATCH 6/7] Document dequantization a little better

meshopt_quantizeFloat does not require dequantization;
meshopt_quantizeHalf now has a dequantize function; Snorm/Unorm variants
have a trivial (divide-by-scale) dequantizer.
---
 README.md           | 2 ++
 src/meshoptimizer.h | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d96ffa836..7fe171322 100644
--- a/README.md
+++ b/README.md
@@ -134,6 +134,8 @@ unsigned short py = meshopt_quantizeHalf(v.y);
 unsigned short pz = meshopt_quantizeHalf(v.z);
 ```
 
+Since quantized vertex attributes often need to remain in their compact representations for efficient transfer and storage, they are usually dequantized during vertex processing by configuring the GPU vertex input correctly to expect normalized integers or half precision floats, which often requires minimal or no changes to the shader code. When CPU dequantization is required instead, `meshopt_dequantizeHalf` can be used to convert half precision values back to single precision. For normalized integer formats, dequantization just requires dividing by 2^N-1 for unorm and 2^(N-1)-1 for snorm variants; for example, `meshopt_quantizeUnorm(v, 10)` can be reversed manually by dividing by 1023.
+
 ## Vertex/index buffer compression
 
 In case storage size or transmission bandwidth is of importance, you might want to additionally compress vertex and index data. While several mesh compression libraries, like Google Draco, are available, they typically are designed to maximize the compression ratio at the cost of disturbing the vertex/index order (which makes the meshes inefficient to render on GPU) or decompression performance. They also frequently don't support custom game-ready quantized vertex formats and thus require to re-quantize the data after loading it, introducing extra quantization errors and making decoding slower.
diff --git a/src/meshoptimizer.h b/src/meshoptimizer.h
index 53a23686d..b763148a2 100644
--- a/src/meshoptimizer.h
+++ b/src/meshoptimizer.h
@@ -585,7 +585,7 @@ inline int meshopt_quantizeSnorm(float v, int N);
 MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v);
 
 /**
- * Quantize a float into a floating point value with a limited number of significant mantissa bits
+ * Quantize a float into a floating point value with a limited number of significant mantissa bits, preserving the IEEE-754 fp32 binary representation
  * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
  * Assumes N is in a valid mantissa precision range, which is 1..23
  */

From d49f1a54e956d21d877b535a1b7baf833fff8820 Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine <arseny.kapoulkine@gmail.com>
Date: Thu, 24 Aug 2023 14:21:43 -0700
Subject: [PATCH 7/7] demo: Fix MSVC warnings about divide-by-zero

---
 demo/tests.cpp | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/demo/tests.cpp b/demo/tests.cpp
index 5cc459e7e..e7eb02392 100644
--- a/demo/tests.cpp
+++ b/demo/tests.cpp
@@ -1224,6 +1224,8 @@ static void tessellation()
 
 static void quantizeFloat()
 {
+	volatile float zero = 0.f; // avoids div-by-zero warnings
+
 	assert(meshopt_quantizeFloat(1.2345f, 23) == 1.2345f);
 
 	assert(meshopt_quantizeFloat(1.2345f, 16) == 1.2344971f);
@@ -1233,15 +1235,17 @@ static void quantizeFloat()
 
 	assert(meshopt_quantizeFloat(1.f, 0) == 1.0f);
 
-	assert(meshopt_quantizeFloat(1.f / 0.f, 0) == 1.f / 0.f);
-	assert(meshopt_quantizeFloat(-1.f / 0.f, 0) == -1.f / 0.f);
+	assert(meshopt_quantizeFloat(1.f / zero, 0) == 1.f / zero);
+	assert(meshopt_quantizeFloat(-1.f / zero, 0) == -1.f / zero);
 
-	float nanf = meshopt_quantizeFloat(0.f / 0.f, 8);
+	float nanf = meshopt_quantizeFloat(zero / zero, 8);
 	assert(nanf != nanf);
 }
 
 static void quantizeHalf()
 {
+	volatile float zero = 0.f; // avoids div-by-zero warnings
+
 	// normal
 	assert(meshopt_quantizeHalf(1.2345f) == 0x3cf0);
 
@@ -1274,16 +1278,18 @@ static void quantizeHalf()
 	assert(meshopt_quantizeHalf(-1e20f) == 0xfc00);
 
 	// inf
-	assert(meshopt_quantizeHalf(1.f / 0.f) == 0x7c00);
-	assert(meshopt_quantizeHalf(-1.f / 0.f) == 0xfc00);
+	assert(meshopt_quantizeHalf(1.f / zero) == 0x7c00);
+	assert(meshopt_quantizeHalf(-1.f / zero) == 0xfc00);
 
 	// nan
-	unsigned short nanh = meshopt_quantizeHalf(0.f / 0.f);
+	unsigned short nanh = meshopt_quantizeHalf(zero / zero);
 	assert(nanh == 0x7e00 || nanh == 0xfe00);
 }
 
 static void dequantizeHalf()
 {
+	volatile float zero = 0.f; // avoids div-by-zero warnings
+
 	// normal
 	assert(meshopt_dequantizeHalf(0x3cf0) == 1.234375f);
 
@@ -1302,11 +1308,11 @@ static void dequantizeHalf()
 	// denormal
 	assert(meshopt_dequantizeHalf(0x00ff) == 0.f);
 	assert(meshopt_dequantizeHalf(0x80ff) == 0.f); // actually this is -0.f
-	assert(1.f / meshopt_dequantizeHalf(0x80ff) == -1.f / 0.f);
+	assert(1.f / meshopt_dequantizeHalf(0x80ff) == -1.f / zero);
 
 	// inf
-	assert(meshopt_dequantizeHalf(0x7c00) == 1.f / 0.f);
-	assert(meshopt_dequantizeHalf(0xfc00) == -1.f / 0.f);
+	assert(meshopt_dequantizeHalf(0x7c00) == 1.f / zero);
+	assert(meshopt_dequantizeHalf(0xfc00) == -1.f / zero);
 
 	// nan
 	float nanf = meshopt_dequantizeHalf(0x7e00);