Skip to content

Commit

Permalink
Update GGUF quantization types (#729)
Browse files Browse the repository at this point in the history
Bring `GGMLQuantizationType` up to date by adding the `I8`, `I16`, `I32`, `I64`,
`F64`, `IQ1_M` and `BF16` types.

Added in:
* ggerganov/llama.cpp#6045
* ggerganov/llama.cpp#6062
* ggerganov/llama.cpp#6302
* ggerganov/llama.cpp#6412
  • Loading branch information
CISC authored Jun 3, 2024
1 parent 8df8f6a commit d00b389
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 0 deletions.
28 changes: 28 additions & 0 deletions packages/gguf/src/quant-descriptions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -96,4 +96,32 @@ export const GGUF_QUANT_DESCRIPTIONS: Record<GGMLQuantizationType, { txt: string
txt: "4-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/5590",
},
[GGMLQuantizationType.I8]: {
txt: "8-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6045",
},
[GGMLQuantizationType.I16]: {
txt: "16-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6045",
},
[GGMLQuantizationType.I32]: {
txt: "32-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6045",
},
[GGMLQuantizationType.I64]: {
txt: "64-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6062",
},
[GGMLQuantizationType.F64]: {
txt: "64-bit standard IEEE 754 double-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Double-precision_floating-point_format",
},
[GGMLQuantizationType.IQ1_M]: {
txt: "1-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 1.75 bits-per-weight.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6302",
},
[GGMLQuantizationType.BF16]: {
txt: "16-bit shortened version of the 32-bit IEEE 754 single-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Bfloat16_floating-point_format",
},
};
7 changes: 7 additions & 0 deletions packages/gguf/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@ export enum GGMLQuantizationType {
IQ3_S = 21,
IQ2_S = 22,
IQ4_XS = 23,
I8 = 24,
I16 = 25,
I32 = 26,
I64 = 27,
F64 = 28,
IQ1_M = 29,
BF16 = 30,
}

export enum GGUFValueType {
Expand Down

0 comments on commit d00b389

Please sign in to comment.