Update GGUF quantization types (#729)

Bring `GGMLQuantizationType` up to date by adding `I8`, `I16`, `I32`, `I64`, `F64`, `IQ1_M` and `BF16`. These types were added upstream in:
* ggerganov/llama.cpp#6045
* ggerganov/llama.cpp#6062
* ggerganov/llama.cpp#6302
* ggerganov/llama.cpp#6412
huggingface · Jun 3, 2024 · d00b389 · d00b389
1 parent 8df8f6a
commit d00b389
Show file tree

Hide file tree

Showing 2 changed files with 35 additions and 0 deletions.
diff --git a/packages/gguf/src/quant-descriptions.ts b/packages/gguf/src/quant-descriptions.ts
@@ -96,4 +96,32 @@ export const GGUF_QUANT_DESCRIPTIONS: Record<GGMLQuantizationType, { txt: string
 		txt: "4-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix.",
 		src_url: "https://github.com/ggerganov/llama.cpp/pull/5590",
 	},
+	[GGMLQuantizationType.I8]: {
+		txt: "8-bit fixed-width integer number.",
+		src_url: "https://github.com/ggerganov/llama.cpp/pull/6045",
+	},
+	[GGMLQuantizationType.I16]: {
+		txt: "16-bit fixed-width integer number.",
+		src_url: "https://github.com/ggerganov/llama.cpp/pull/6045",
+	},
+	[GGMLQuantizationType.I32]: {
+		txt: "32-bit fixed-width integer number.",
+		src_url: "https://github.com/ggerganov/llama.cpp/pull/6045",
+	},
+	[GGMLQuantizationType.I64]: {
+		txt: "64-bit fixed-width integer number.",
+		src_url: "https://github.com/ggerganov/llama.cpp/pull/6062",
+	},
+	[GGMLQuantizationType.F64]: {
+		txt: "64-bit standard IEEE 754 double-precision floating-point number.",
+		src_url: "https://en.wikipedia.org/wiki/Double-precision_floating-point_format",
+	},
+	[GGMLQuantizationType.IQ1_M]: {
+		txt: "1-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 1.75 bits-per-weight.",
+		src_url: "https://github.com/ggerganov/llama.cpp/pull/6302",
+	},
+	[GGMLQuantizationType.BF16]: {
+		txt: "16-bit shortened version of the 32-bit IEEE 754 single-precision floating-point number.",
+		src_url: "https://en.wikipedia.org/wiki/Bfloat16_floating-point_format",
+	},
 };
diff --git a/packages/gguf/src/types.ts b/packages/gguf/src/types.ts
@@ -29,6 +29,13 @@ export enum GGMLQuantizationType {
 	IQ3_S = 21,
 	IQ2_S = 22,
 	IQ4_XS = 23,
+	I8 = 24,
+	I16 = 25,
+	I32 = 26,
+	I64 = 27,
+	F64 = 28,
+	IQ1_M = 29,
+	BF16 = 30,
 }
 
 export enum GGUFValueType {