From 874c3411c2ff918f744b139756c52ab580b17352 Mon Sep 17 00:00:00 2001 From: Christian Azinn Date: Sat, 27 Apr 2024 00:08:55 -0400 Subject: [PATCH 01/66] support splits in convert.py --- convert.py | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 4 deletions(-) diff --git a/convert.py b/convert.py index 1c700cf6a3d65..ec9aa9a566ee9 100755 --- a/convert.py +++ b/convert.py @@ -44,9 +44,16 @@ DEFAULT_CONCURRENCY = 8 +DEFAULT_SPLIT_TENSORS = 128 + ADDED_TOKENS_FILE = 'added_tokens.json' FAST_TOKENIZER_FILE = 'tokenizer.json' +LLM_KV_SPLIT_NO = "split.no" +LLM_KV_SPLIT_COUNT = "split.count" +LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count" +SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf" + # # data types # @@ -1235,6 +1242,49 @@ def write_all( of.close() + @staticmethod + def write_split( + fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab, + total_tensors: int, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, + pad_vocab: bool = False, tensors_per_shard: int = DEFAULT_SPLIT_TENSORS, small_first_shard: bool = True, + ) -> None: + check_vocab_size(params, vocab, pad_vocab=pad_vocab) + + model_list = list(model.items()) + total_shards = math.ceil(total_tensors / tensors_per_shard) + small_first_shard + shard_files = [fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + 1, total_shards)) for i in range(total_shards)] + + for i, shard in enumerate(shard_files): + of = OutputFile(shard, endianess=endianess) + + if i == 0: + of.add_meta_arch(params) + if isinstance(vocab, Vocab): + of.add_meta_vocab(vocab) + of.add_meta_special_vocab(svocab) + else: # NoVocab + of.gguf.add_tokenizer_model(vocab.tokenizer_model) + + of.gguf.add_uint16(LLM_KV_SPLIT_NO, i) + of.gguf.add_uint16(LLM_KV_SPLIT_COUNT, total_shards) + of.gguf.add_int32(LLM_KV_SPLIT_TENSORS_COUNT, total_tensors) + + # have the option to write a first shard with only the metadata + if small_first_shard and i == 0: + of.write_meta() + of.close() + continue + + stop = min((i + 1 - small_first_shard) * tensors_per_shard, total_tensors) + shard_models = model_list[(i - small_first_shard) * tensors_per_shard:stop] + for name, lazy_tensor in shard_models: + of.add_tensor_info(name, lazy_tensor) + + of.write_meta() + of.write_tensor_info() + of.write_tensor_data(ftype, dict(shard_models), concurrency) + of.close() + def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType: wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type @@ -1473,6 +1523,9 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine") parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") + parser.add_argument("--split", action="store_true", help="split the converted model into multiple files") + parser.add_argument("--split-max-tensors", type=int, help=f"maximum number of tensors per file when splitting (default: {DEFAULT_SPLIT_TENSORS})", default=DEFAULT_SPLIT_TENSORS) + parser.add_argument("--large-first-shard", action="store_true", help="include tensors in the first shard when splitting (default is to only include metadata)") args = 
parser.parse_args(args_in) if args.no_vocab and args.vocab_only: @@ -1544,11 +1597,28 @@ def main(args_in: list[str] | None = None) -> None: outfile = args.outfile or default_outfile(model_plus.paths, ftype) params.ftype = ftype - print(f"Writing {outfile}, format {ftype}") - OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, - concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab) - print(f"Wrote {outfile}") + if args.split: + total_tensors = len(model) + if total_tensors < args.split_max_tensors: + + print("Model has fewer tensors than the split threshold, not splitting") + print(f"Writing {outfile}, format {ftype}") + OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, + concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab) + else: + print(f"Writing {outfile} as shards, format {ftype}") + OutputFile.write_split(outfile, ftype, params, model, vocab, special_vocab, total_tensors, + concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab, + tensors_per_shard=args.split_max_tensors, small_first_shard=not args.large_first_shard) + print(f"Wrote {outfile}") + + else: + print(f"Writing {outfile}, format {ftype}") + + OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, + concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab) + print(f"Wrote {outfile}") if __name__ == '__main__': From 72cbd4e0142f11e3bc25c6050f961a5d89c6ff25 Mon Sep 17 00:00:00 2001 From: Christian Azinn Date: Sat, 27 Apr 2024 21:14:41 -0400 Subject: [PATCH 02/66] Support split by size and dry run to write estimated shards/filesizes --- convert.py | 213 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 146 insertions(+), 67 deletions(-) diff --git a/convert.py b/convert.py index ec9aa9a566ee9..693f31a8f1b74 100755 --- a/convert.py +++ b/convert.py @@ -44,8 +44,6 @@ DEFAULT_CONCURRENCY = 8 -DEFAULT_SPLIT_TENSORS = 128 - ADDED_TOKENS_FILE = 'added_tokens.json' FAST_TOKENIZER_FILE = 'tokenizer.json' @@ -54,6 +52,10 @@ LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count" SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf" +SPLIT_STYLE_NONE = 0 +SPLIT_STYLE_BY_TENSORS = 1 +SPLIT_STYLE_BY_SIZE = 2 + # # data types # @@ -1215,49 +1217,46 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray: @staticmethod def write_all( fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab, - concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, - pad_vocab: bool = False, + tensors_per_shard: int, tensors_max_size: int, dry_run: bool = False, concurrency: int = DEFAULT_CONCURRENCY, + endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, small_first_shard: bool = True, ) -> None: check_vocab_size(params, vocab, pad_vocab=pad_vocab) - of = OutputFile(fname_out, endianess=endianess) - - # meta data - of.add_meta_arch(params) - if isinstance(vocab, Vocab): - of.add_meta_vocab(vocab) - of.add_meta_special_vocab(svocab) - else: # NoVocab - of.gguf.add_tokenizer_model(vocab.tokenizer_model) + total_tensors = len(model) + total_size = sum(get_tensor_size(lazy_tensor) for lazy_tensor in model.values()) - # tensor info - for name, lazy_tensor in model.items(): - of.add_tensor_info(name, lazy_tensor) + if tensors_per_shard: + split_style = SPLIT_STYLE_BY_TENSORS + elif tensors_max_size: + split_style = SPLIT_STYLE_BY_SIZE + else: + split_style = 
SPLIT_STYLE_NONE - of.write_meta() - of.write_tensor_info() + if tensors_per_shard and total_tensors < tensors_per_shard: + print("Model has fewer tensors than the split threshold, not splitting") + split_style = SPLIT_STYLE_NONE - # tensor data - of.write_tensor_data(ftype, model, concurrency) + if tensors_max_size and total_size < tensors_max_size: + print("Model has smaller size than the split threshold, not splitting") + split_style = SPLIT_STYLE_NONE - of.close() + split_strategy = create_split_strategy(split_style, fname_out, model, tensors_per_shard, tensors_max_size, small_first_shard) + total_shards = len(split_strategy) - @staticmethod - def write_split( - fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab, - total_tensors: int, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, - pad_vocab: bool = False, tensors_per_shard: int = DEFAULT_SPLIT_TENSORS, small_first_shard: bool = True, - ) -> None: - check_vocab_size(params, vocab, pad_vocab=pad_vocab) + print("Writing the following files:") + for shard_path, shard_tensors in split_strategy: + size = format_n_bytes_to_str(sum(get_tensor_size(t[1]) for t in shard_tensors)) if shard_tensors else "negligible - metadata only" + print(f" {shard_path}: n_tensors = {len(shard_tensors) if shard_tensors else 0}, total_size = {size}") - model_list = list(model.items()) - total_shards = math.ceil(total_tensors / tensors_per_shard) + small_first_shard - shard_files = [fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + 1, total_shards)) for i in range(total_shards)] + if dry_run: + print("Dry run, not writing files") + return - for i, shard in enumerate(shard_files): - of = OutputFile(shard, endianess=endianess) + for i, (shard_path, shard_tensors) in enumerate(split_strategy): + of = OutputFile(shard_path, endianess=endianess) if i == 0: + # meta data of.add_meta_arch(params) if isinstance(vocab, Vocab): of.add_meta_vocab(vocab) @@ -1265,27 +1264,111 @@ def write_split( else: # NoVocab of.gguf.add_tokenizer_model(vocab.tokenizer_model) - of.gguf.add_uint16(LLM_KV_SPLIT_NO, i) - of.gguf.add_uint16(LLM_KV_SPLIT_COUNT, total_shards) - of.gguf.add_int32(LLM_KV_SPLIT_TENSORS_COUNT, total_tensors) - # have the option to write a first shard with only the metadata - if small_first_shard and i == 0: - of.write_meta() - of.close() - continue + if split_style != SPLIT_STYLE_NONE: + + of.gguf.add_uint16(LLM_KV_SPLIT_NO, i) + of.gguf.add_uint16(LLM_KV_SPLIT_COUNT, total_shards) + of.gguf.add_int32(LLM_KV_SPLIT_TENSORS_COUNT, total_tensors) - stop = min((i + 1 - small_first_shard) * tensors_per_shard, total_tensors) - shard_models = model_list[(i - small_first_shard) * tensors_per_shard:stop] - for name, lazy_tensor in shard_models: + if small_first_shard and i == 0: + of.write_meta() + of.close() + continue + + print(f"Writing shard {i + 1}/{total_shards} with {len(shard_tensors)} tensors") + + # tensor info + for name, lazy_tensor in shard_tensors: of.add_tensor_info(name, lazy_tensor) of.write_meta() of.write_tensor_info() - of.write_tensor_data(ftype, dict(shard_models), concurrency) + of.write_tensor_data(ftype, dict(shard_tensors), concurrency) + of.close() +def split_str_to_n_bytes(split_str: str) -> int: + if split_str.endswith("K"): + n = int(split_str[:-1]) * 1024 + elif split_str.endswith("M"): + n = int(split_str[:-1]) * 1024 * 1024 + elif split_str.endswith("G"): + n = int(split_str[:-1]) * 1024 * 1024 * 1024 + elif 
split_str.isnumeric(): + n = int(split_str) + else: + raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G") + + if n <= 0: + raise ValueError(f"Invalid split size: {split_str}, must be positive") + + return n + + +def format_n_bytes_to_str(num: int) -> str: + num = float(num) + for unit in ("", "K", "M", "G"): + if abs(num) < 1024.0: + return f"{num:3.1f}{unit}" + num /= 1024.0 + return f"{num:.1f}T - over 1TB, --split recommended" + + +def get_tensor_size(tensor: LazyTensor) -> int: + return tensor.data_type.elements_to_bytes(np.prod(tensor.shape)) + + +SplitStrategy: TypeAlias = 'list[tuple[Path, list[tuple[str, LazyTensor]]]]' + + +def create_split_strategy(split_style: int, fname_out: Path, model: LazyModel, tensors_per_shard: int, tensors_max_size: int, small_first_shard: bool) -> SplitStrategy: + if split_style == SPLIT_STYLE_NONE: + return [(fname_out, list(model.items()))] + + elif split_style == SPLIT_STYLE_BY_TENSORS: + total_shards = math.ceil(len(model) / tensors_per_shard) + small_first_shard + shard_files = [fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + 1, total_shards)) for i in range(total_shards)] + splits = [] + + if small_first_shard: + splits.append((shard_files[0], None)) + + for i, shard in enumerate(shard_files[small_first_shard:]): + start = i * tensors_per_shard + stop = min((i + 1) * tensors_per_shard, len(model)) + splits.append((shard, list(model.items())[start:stop])) + + return splits + + elif split_style == SPLIT_STYLE_BY_SIZE: + shards = [] + + # we have to determine the shards first to determine how many shards there will be in total - two passes + for i, shard in enumerate(list(model.items())): + if i == 0: + shards.append([shard]) + continue + if get_tensor_size(shard[1]) + sum(get_tensor_size(t[1]) for t in shards[-1]) > tensors_max_size: + shards.append([shard]) + else: + shards[-1].append(shard) + + total_shards = len(shards) + small_first_shard + shard_offset = 1 + splits = [] + + if small_first_shard: + splits.append((fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, shard_offset, total_shards)), None)) + shard_offset += 1 + + for i, shard in enumerate(shards): + splits.append((fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + shard_offset, total_shards)), shard)) + + return splits + + def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType: wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type @@ -1524,8 +1607,10 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") parser.add_argument("--split", action="store_true", help="split the converted model into multiple files") - parser.add_argument("--split-max-tensors", type=int, help=f"maximum number of tensors per file when splitting (default: {DEFAULT_SPLIT_TENSORS})", default=DEFAULT_SPLIT_TENSORS) - parser.add_argument("--large-first-shard", action="store_true", help="include tensors in the first shard when splitting (default is to only include metadata)") + parser.add_argument("--split-max-tensors", type=int, help=f"max tensors in each split") + parser.add_argument("--split-max-size", type=str, help=f"max size per split") + parser.add_argument("--dry-run", action="store_true", 
help="only print out a split plan and exit, without writing any new files") + parser.add_argument("--large-first-shard", action="store_true", help="include tensors in the first shard when splitting (default: metadata only)") args = parser.parse_args(args_in) if args.no_vocab and args.vocab_only: @@ -1536,6 +1621,15 @@ def main(args_in: list[str] | None = None) -> None: do_dump_model(model_plus) return + if args.split and not (args.split_max_tensors or args.split_max_size): + raise ValueError("Need to specify one of --split-max-tensors or --split-max-size when splitting") + + if args.split_max_tensors and args.split_max_size: + raise ValueError("Can't specify both --split-max-tensors and --split-max-size") + + if args.split_max_size: + args.split_max_size = split_str_to_n_bytes(args.split_max_size) + if not args.vocab_only: model_plus = load_some_model(args.model) else: @@ -1598,26 +1692,11 @@ def main(args_in: list[str] | None = None) -> None: params.ftype = ftype - if args.split: - total_tensors = len(model) - if total_tensors < args.split_max_tensors: - - print("Model has fewer tensors than the split threshold, not splitting") - print(f"Writing {outfile}, format {ftype}") - OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, - concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab) - else: - print(f"Writing {outfile} as shards, format {ftype}") - OutputFile.write_split(outfile, ftype, params, model, vocab, special_vocab, total_tensors, - concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab, - tensors_per_shard=args.split_max_tensors, small_first_shard=not args.large_first_shard) - print(f"Wrote {outfile}") - - else: - print(f"Writing {outfile}, format {ftype}") - - OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, - concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab) + print(f"Writing {outfile}, format {ftype}") + OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, args.split_max_tensors, + args.split_max_size, dry_run=args.dry_run, concurrency=args.concurrency, + endianess=endianess, pad_vocab=args.pad_vocab, small_first_shard=not args.large_first_shard) + if not args.dry_run: print(f"Wrote {outfile}") From 702a7446700d4c2fc7bf9e71c24ccc9ac6fbbc24 Mon Sep 17 00:00:00 2001 From: Christian Azinn Date: Sun, 28 Apr 2024 18:22:32 -0400 Subject: [PATCH 03/66] Move split functionality to new GGUFManager class --- convert.py | 208 +++----------- gguf-py/gguf/__init__.py | 1 + gguf-py/gguf/gguf_manager.py | 523 +++++++++++++++++++++++++++++++++++ 3 files changed, 556 insertions(+), 176 deletions(-) create mode 100644 gguf-py/gguf/gguf_manager.py diff --git a/convert.py b/convert.py index 693f31a8f1b74..31c96e5ade55e 100755 --- a/convert.py +++ b/convert.py @@ -24,13 +24,15 @@ from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable +import importlib +gguf = importlib.import_module("gguf-py.gguf") import numpy as np from sentencepiece import SentencePieceProcessor if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) -import gguf +# import gguf if TYPE_CHECKING: from typing_extensions import Self, TypeAlias @@ -47,15 +49,6 @@ ADDED_TOKENS_FILE = 'added_tokens.json' FAST_TOKENIZER_FILE = 'tokenizer.json' -LLM_KV_SPLIT_NO = "split.no" -LLM_KV_SPLIT_COUNT = "split.count" -LLM_KV_SPLIT_TENSORS_COUNT = 
"split.tensors.count" -SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf" - -SPLIT_STYLE_NONE = 0 -SPLIT_STYLE_BY_TENSORS = 1 -SPLIT_STYLE_BY_SIZE = 2 - # # data types # @@ -1066,8 +1059,8 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) class OutputFile: - def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE): - self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) + def __init__(self, fname_out: Path, args: argparse.Namespace, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE): + self.gguf = gguf.GGUFManager(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], args, endianess=endianess) def add_meta_arch(self, params: Params) -> None: name = "LLaMA" @@ -1146,21 +1139,15 @@ def add_meta_vocab(self, vocab: Vocab) -> None: def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None: svocab.add_to_gguf(self.gguf) - def add_tensor_info(self, name: str, tensor: LazyTensor) -> None: - n_elements = int(np.prod(tensor.shape)) - raw_dtype = getattr(tensor.data_type, 'ggml_type', None) - data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype - data_nbytes = tensor.data_type.elements_to_bytes(n_elements) - self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype) - def write_meta(self) -> None: - self.gguf.write_header_to_file() - self.gguf.write_kv_data_to_file() + self.gguf.write_to_file(meta_only=True) - def write_tensor_info(self) -> None: - self.gguf.write_ti_data_to_file() + def write_tensors(self, ftype: GGMLFileType, concurrency: int) -> None: + self.gguf.write_to_file(ftype=ftype, concurrency=concurrency, write_tensor_data=OutputFile.write_tensor_data) - def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None: + # really awkward with how this is managed with gguf_manager.py: maybe refactor at some point? 
+ @staticmethod + def write_tensor_data(ftype: GGMLFileType, model: LazyModel, concurrency: int, writer: gguf.GGUFWriter) -> None: ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency) if ftype == GGMLFileType.MostlyQ8_0: ndarrays = bounded_parallel_map( @@ -1178,7 +1165,7 @@ def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: print( f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" ) - self.gguf.write_tensor_data(ndarray) + writer.write_tensor_data(ndarray) def close(self) -> None: self.gguf.close() @@ -1217,156 +1204,26 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray: @staticmethod def write_all( fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab, - tensors_per_shard: int, tensors_max_size: int, dry_run: bool = False, concurrency: int = DEFAULT_CONCURRENCY, - endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, small_first_shard: bool = True, + args: argparse.Namespace, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE ) -> None: - check_vocab_size(params, vocab, pad_vocab=pad_vocab) - - total_tensors = len(model) - total_size = sum(get_tensor_size(lazy_tensor) for lazy_tensor in model.values()) - - if tensors_per_shard: - split_style = SPLIT_STYLE_BY_TENSORS - elif tensors_max_size: - split_style = SPLIT_STYLE_BY_SIZE - else: - split_style = SPLIT_STYLE_NONE - - if tensors_per_shard and total_tensors < tensors_per_shard: - print("Model has fewer tensors than the split threshold, not splitting") - split_style = SPLIT_STYLE_NONE - - if tensors_max_size and total_size < tensors_max_size: - print("Model has smaller size than the split threshold, not splitting") - split_style = SPLIT_STYLE_NONE - - split_strategy = create_split_strategy(split_style, fname_out, model, tensors_per_shard, tensors_max_size, small_first_shard) - total_shards = len(split_strategy) - - print("Writing the following files:") - for shard_path, shard_tensors in split_strategy: - size = format_n_bytes_to_str(sum(get_tensor_size(t[1]) for t in shard_tensors)) if shard_tensors else "negligible - metadata only" - print(f" {shard_path}: n_tensors = {len(shard_tensors) if shard_tensors else 0}, total_size = {size}") - - if dry_run: - print("Dry run, not writing files") - return - - for i, (shard_path, shard_tensors) in enumerate(split_strategy): - of = OutputFile(shard_path, endianess=endianess) - - if i == 0: - # meta data - of.add_meta_arch(params) - if isinstance(vocab, Vocab): - of.add_meta_vocab(vocab) - of.add_meta_special_vocab(svocab) - else: # NoVocab - of.gguf.add_tokenizer_model(vocab.tokenizer_model) - - # have the option to write a first shard with only the metadata - if split_style != SPLIT_STYLE_NONE: - - of.gguf.add_uint16(LLM_KV_SPLIT_NO, i) - of.gguf.add_uint16(LLM_KV_SPLIT_COUNT, total_shards) - of.gguf.add_int32(LLM_KV_SPLIT_TENSORS_COUNT, total_tensors) - - if small_first_shard and i == 0: - of.write_meta() - of.close() - continue - - print(f"Writing shard {i + 1}/{total_shards} with {len(shard_tensors)} tensors") - - # tensor info - for name, lazy_tensor in shard_tensors: - of.add_tensor_info(name, lazy_tensor) - - of.write_meta() - of.write_tensor_info() - of.write_tensor_data(ftype, dict(shard_tensors), concurrency) - - of.close() - - -def split_str_to_n_bytes(split_str: str) -> int: - if 
split_str.endswith("K"): - n = int(split_str[:-1]) * 1024 - elif split_str.endswith("M"): - n = int(split_str[:-1]) * 1024 * 1024 - elif split_str.endswith("G"): - n = int(split_str[:-1]) * 1024 * 1024 * 1024 - elif split_str.isnumeric(): - n = int(split_str) - else: - raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G") - - if n <= 0: - raise ValueError(f"Invalid split size: {split_str}, must be positive") - - return n - - -def format_n_bytes_to_str(num: int) -> str: - num = float(num) - for unit in ("", "K", "M", "G"): - if abs(num) < 1024.0: - return f"{num:3.1f}{unit}" - num /= 1024.0 - return f"{num:.1f}T - over 1TB, --split recommended" - - -def get_tensor_size(tensor: LazyTensor) -> int: - return tensor.data_type.elements_to_bytes(np.prod(tensor.shape)) + check_vocab_size(params, vocab, pad_vocab=args.pad_vocab) + of = OutputFile(fname_out, args, endianess=endianess) + # meta data + of.add_meta_arch(params) + if isinstance(vocab, Vocab): + of.add_meta_vocab(vocab) + of.add_meta_special_vocab(svocab) + else: # NoVocab + of.gguf.add_tokenizer_model(vocab.tokenizer_model) -SplitStrategy: TypeAlias = 'list[tuple[Path, list[tuple[str, LazyTensor]]]]' - - -def create_split_strategy(split_style: int, fname_out: Path, model: LazyModel, tensors_per_shard: int, tensors_max_size: int, small_first_shard: bool) -> SplitStrategy: - if split_style == SPLIT_STYLE_NONE: - return [(fname_out, list(model.items()))] - - elif split_style == SPLIT_STYLE_BY_TENSORS: - total_shards = math.ceil(len(model) / tensors_per_shard) + small_first_shard - shard_files = [fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + 1, total_shards)) for i in range(total_shards)] - splits = [] - - if small_first_shard: - splits.append((shard_files[0], None)) - - for i, shard in enumerate(shard_files[small_first_shard:]): - start = i * tensors_per_shard - stop = min((i + 1) * tensors_per_shard, len(model)) - splits.append((shard, list(model.items())[start:stop])) - - return splits - - elif split_style == SPLIT_STYLE_BY_SIZE: - shards = [] - - # we have to determine the shards first to determine how many shards there will be in total - two passes - for i, shard in enumerate(list(model.items())): - if i == 0: - shards.append([shard]) - continue - if get_tensor_size(shard[1]) + sum(get_tensor_size(t[1]) for t in shards[-1]) > tensors_max_size: - shards.append([shard]) - else: - shards[-1].append(shard) - - total_shards = len(shards) + small_first_shard - shard_offset = 1 - splits = [] - - if small_first_shard: - splits.append((fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, shard_offset, total_shards)), None)) - shard_offset += 1 + # tensor info + for name, lazy_tensor in model.items(): + of.gguf.add_tensor_info(name, lazy_tensor) - for i, shard in enumerate(shards): - splits.append((fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + shard_offset, total_shards)), shard)) + of.write_tensors(ftype, concurrency) - return splits + of.close() def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType: @@ -1607,8 +1464,8 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") parser.add_argument("--split", action="store_true", help="split the converted model into multiple 
files") - parser.add_argument("--split-max-tensors", type=int, help=f"max tensors in each split") - parser.add_argument("--split-max-size", type=str, help=f"max size per split") + parser.add_argument("--split-max-tensors", type=int, help="max tensors in each split") + parser.add_argument("--split-max-size", type=str, help="max size per split N(M|G)+") parser.add_argument("--dry-run", action="store_true", help="only print out a split plan and exit, without writing any new files") parser.add_argument("--large-first-shard", action="store_true", help="include tensors in the first shard when splitting (default: metadata only)") @@ -1628,7 +1485,7 @@ def main(args_in: list[str] | None = None) -> None: raise ValueError("Can't specify both --split-max-tensors and --split-max-size") if args.split_max_size: - args.split_max_size = split_str_to_n_bytes(args.split_max_size) + args.split_max_size = gguf.SplitStrategy.split_str_to_n_bytes(args.split_max_size) if not args.vocab_only: model_plus = load_some_model(args.model) @@ -1693,9 +1550,8 @@ def main(args_in: list[str] | None = None) -> None: params.ftype = ftype print(f"Writing {outfile}, format {ftype}") - OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, args.split_max_tensors, - args.split_max_size, dry_run=args.dry_run, concurrency=args.concurrency, - endianess=endianess, pad_vocab=args.pad_vocab, small_first_shard=not args.large_first_shard) + OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, args, + concurrency=args.concurrency, endianess=endianess) if not args.dry_run: print(f"Wrote {outfile}") diff --git a/gguf-py/gguf/__init__.py b/gguf-py/gguf/__init__.py index 110ab342ccd71..bd904fa2ab1ba 100644 --- a/gguf-py/gguf/__init__.py +++ b/gguf-py/gguf/__init__.py @@ -1,5 +1,6 @@ from .constants import * from .gguf_reader import * from .gguf_writer import * +from .gguf_manager import * from .tensor_mapping import * from .vocab import * diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_manager.py new file mode 100644 index 0000000000000..b1e680810a642 --- /dev/null +++ b/gguf-py/gguf/gguf_manager.py @@ -0,0 +1,523 @@ +from __future__ import annotations + +import os +import shutil +import struct +import tempfile +from enum import IntEnum +from typing import TYPE_CHECKING, Any, Sequence, Mapping +from string import ascii_letters, digits +from argparse import Namespace +from math import ceil + +import numpy as np + +if TYPE_CHECKING: + from typing_extensions import TypeAlias + +from .constants import ( + GGMLQuantizationType, + GGUFEndian, + GGUFValueType, + Keys, + RopeScalingType, + PoolingType, + TokenType, +) +from .gguf_writer import GGUFWriter + + +SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf" + +LLM_KV_SPLIT_NO = "split.no" +LLM_KV_SPLIT_COUNT = "split.count" +LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count" + +SplitTensorsPerFile: TypeAlias = list[tuple[os.PathLike[str], list[tuple[str, Any]], GGUFWriter]] # [(outfile name, [(tensor name, tensor data)] for each tensor in file, filewriter)] +KVTempData: TypeAlias = dict[str, tuple[Any, GGUFValueType]] # {key: (value, type)} +TensorTempData: TypeAlias = tuple[str, np.ndarray[Any, Any]] # (tensor name, tensor data), aka LazyModel + + +class SplitStyle(IntEnum): + NONE = 0 + TENSORS = 1 + SIZE = 2 + + +class SplitStrategy: + data: SplitTensorsPerFile + + def __init__(self, split_style: SplitStyle, fname_out: os.PathLike[str], model: list[TensorTempData], + args: Namespace, arch: str, use_temp_file: bool = True, endianess: GGUFEndian 
= GGUFEndian.LITTLE, small_first_shard: bool = True + ): + self.data = [] + + if split_style == SplitStyle.NONE: + self.append((fname_out, model, GGUFWriter(fname_out, arch, use_temp_file=use_temp_file, endianess=endianess))) + + elif split_style == SplitStyle.TENSORS: + total_shards = ceil(len(model) / args.split_max_tensors) + small_first_shard + shard_files = [fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + 1, total_shards)) for i in range(total_shards)] + + if small_first_shard: + self.append((shard_files[0], None, GGUFWriter(shard_files[0], arch, use_temp_file=use_temp_file, endianess=endianess))) + + for i, shard in enumerate(shard_files[small_first_shard:]): + start = i * args.split_max_tensors + stop = min((i + 1) * args.split_max_tensors, len(model)) + self.append((shard, model[start:stop], GGUFWriter(shard, arch, use_temp_file=use_temp_file, endianess=endianess))) + + elif split_style == SplitStyle.SIZE: + shards = [] + + # we have to determine the shards first to determine how many shards there will be in total - two passes + for i, shard in enumerate(model): + if i == 0: + shards.append([shard]) + continue + if SplitStrategy.get_tensor_size(shard[1]) + sum(SplitStrategy.get_tensor_size(t[1]) for t in shards[-1]) > args.split_max_size: + shards.append([shard]) + else: + shards[-1].append(shard) + + total_shards = len(shards) + small_first_shard + shard_offset = 1 + + if small_first_shard: + outname = fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, shard_offset, total_shards)) + self.append((outname, None, GGUFWriter(outname, arch, use_temp_file=use_temp_file, endianess=endianess))) + shard_offset += 1 + + for i, shard in enumerate(shards): + outname = fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + shard_offset, total_shards)) + self.append((outname, shard, GGUFWriter(outname, arch, use_temp_file=use_temp_file, endianess=endianess))) + + def __getitem__(self, index): + return self.data[index] + + def __setitem__(self, index, value): + self.data[index] = value + + def __len__(self): + return len(self.data) + + def append(self, value: TensorTempData): + self.data.append(value) + + def remove(self, item: TensorTempData): + self.data.remove(item) + + @staticmethod + def get_tensor_size(tensor) -> int: + # we don't have the LazyTensor class here from convert.py but we can try + try: + return tensor.data_type.elements_to_bytes(np.prod(tensor.shape)) + except AttributeError: # numpy ndarray[Any, Any] + return tensor.nbytes + except: # this should never happen + raise ValueError(f"Invalid tensor type: {type(tensor)}") + + @staticmethod + def split_str_to_n_bytes(split_str: str) -> int: + if split_str.endswith("K"): + n = int(split_str[:-1]) * 1024 + elif split_str.endswith("M"): + n = int(split_str[:-1]) * 1024 * 1024 + elif split_str.endswith("G"): + n = int(split_str[:-1]) * 1024 * 1024 * 1024 + elif split_str.isnumeric(): + n = int(split_str) + else: + raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G") + + if n <= 0: + raise ValueError(f"Invalid split size: {split_str}, must be positive") + + return n + + @staticmethod + def format_n_bytes_to_str(num: int) -> str: + num = float(num) + for unit in ("", "K", "M", "G"): + if abs(num) < 1024.0: + return f"{num:3.1f}{unit}" + num /= 1024.0 + return f"{num:.1f}T - over 1TB, --split recommended" + + +# ideally this has most of the same signatures as GGUFWriter so it's nearly a drop-in replacement +class GGUFManager: + kv_data: KVTempData + 
tensors: list[TensorTempData] + split_style: SplitStyle + split_strategy: SplitStrategy + + def __init__(self, path: os.PathLike[str] | str, arch: str, args: Namespace, use_temp_file: bool = True, + endianess: GGUFEndian = GGUFEndian.LITTLE) -> None: + self.arch = arch + self.path = path + self.endianess = endianess + self.offset_tensor = 0 + self.kv_data = {} + self.tensors = [] + self.args = args + self.split_style = SplitStyle.NONE if not args.split \ + else SplitStyle.TENSORS if args.split_max_tensors \ + else SplitStyle.SIZE + self.split_strategy = None + self.total_shards = None + self.total_tensors = None + self.use_temp_file = use_temp_file + + self.add_architecture() + + # have to consolidate because we need to know kv data count and tensor count before we can write the header + # and we need to write tensor info before we can write metadata + # these all kinda show up around the same places anyway so it's not a huge deal? + def write_to_file(self, meta_only: bool = False, ftype: int = 0, concurrency: int = 8, write_tensor_data: function = None) -> None: + + # here is the first place you can assume you have all tensors written and you can establish the size of the file - so logic goes here + self.total_tensors = len(self.tensors) + total_size = sum(SplitStrategy.get_tensor_size(tensor[1]) for tensor in self.tensors) + + if self.args.split_max_tensors and self.total_tensors < self.args.split_max_tensors: + print("Model has fewer tensors than the split threshold, not splitting") + self.split_style = SplitStyle.NONE + + if self.args.split_max_size and total_size < self.args.split_max_size: + print("Model has smaller size than the split threshold, not splitting") + self.split_style = SplitStyle.NONE + + self.split_strategy = SplitStrategy(self.split_style, self.path, self.tensors, self.args, not self.args.large_first_shard) + self.total_shards = len(self.split_strategy) + + # only the first shard needs all the KV data + for key, (value, etype) in self.kv_data.items(): + self.split_strategy[0][2].add_key(key) + self.split_strategy[0][2].add_val(value, etype) + + if self.split_style != SplitStyle.NONE: + for i, (_, _, writer) in enumerate(self.split_strategy): + writer.add_uint16(LLM_KV_SPLIT_NO, i) + writer.add_uint16(LLM_KV_SPLIT_COUNT, self.total_shards) + writer.add_int32(LLM_KV_SPLIT_TENSORS_COUNT, self.total_tensors) + + # metadata/vocab only can write and return here + if meta_only: + for i, (_, _, writer) in enumerate(self.split_strategy): + writer.write_header_to_file() + writer.write_kv_data_to_file() + return + + # tensor writing code starts here + + print("\nWriting the following files:") + for (shard_path, shard_tensors, _) in self.split_strategy: + size = SplitStrategy.format_n_bytes_to_str(sum(SplitStrategy.get_tensor_size(t[1]) for t in shard_tensors)) if shard_tensors else "negligible - metadata only" + print(f" {shard_path}: n_tensors = {len(shard_tensors) if shard_tensors else 0}, total_size = {size}") + + if self.args.dry_run: + print("\nDry run, not writing files") + return + + # run add_tensor_info, write data, then write_tensor_data - taken from convert.py + running_total = self.total_tensors + for i, (_, tensors, writer) in enumerate(self.split_strategy): + + if tensors: + for name, tensor in tensors: + n_elements = int(np.prod(tensor.shape)) + raw_dtype = getattr(tensor.data_type, 'ggml_type', None) + data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype + data_nbytes = tensor.data_type.elements_to_bytes(n_elements) + 
writer.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype) + + writer.write_header_to_file() + writer.write_kv_data_to_file() + writer.write_tensors_to_file() + + if tensors: + print(f"\nWriting to shard {i + 1}/{self.total_shards} with {len(tensors)}/{running_total} remaining tensors (of {self.total_tensors} total)") + running_total -= len(tensors) + + # convert.py's write_tensor_data is dependent on so many objects in convert.py itself that it's easier to pass the function as a parameter and call it here + write_tensor_data(ftype, dict(tensors), concurrency, writer) + + def add_uint8(self, key: str, val: int) -> None: + self.kv_data[key] = (val, GGUFValueType.UINT8) + + def add_int8(self, key: str, val: int) -> None: + self.kv_data[key] = (val, GGUFValueType.INT8) + + def add_uint16(self, key: str, val: int) -> None: + self.kv_data[key] = (val, GGUFValueType.UINT16) + + def add_int16(self, key: str, val: int) -> None: + self.kv_data[key] = (val, GGUFValueType.INT16) + + def add_uint32(self, key: str, val: int) -> None: + self.kv_data[key] = (val, GGUFValueType.UINT32) + + def add_int32(self, key: str, val: int) -> None: + self.kv_data[key] = (val, GGUFValueType.INT32) + + def add_float32(self, key: str, val: float) -> None: + self.kv_data[key] = (val, GGUFValueType.FLOAT32) + + def add_uint64(self, key: str, val: int) -> None: + self.kv_data[key] = (val, GGUFValueType.UINT64) + + def add_int64(self, key: str, val: int) -> None: + self.kv_data[key] = (val, GGUFValueType.INT64) + + def add_float64(self, key: str, val: float) -> None: + self.kv_data[key] = (val, GGUFValueType.FLOAT64) + + def add_bool(self, key: str, val: bool) -> None: + self.kv_data[key] = (val, GGUFValueType.BOOL) + + def add_string(self, key: str, val: str) -> None: + if not val: + return + self.kv_data[key] = (val, GGUFValueType.STRING) + + def add_array(self, key: str, val: Sequence[Any]) -> None: + if not isinstance(val, Sequence): + raise ValueError(f'Expected a sequence for {key}, got {type(val)}') + self.kv_data[key] = (val, GGUFValueType.ARRAY) + + # this method is exclusive to convert.py - we don't have LazyTensor so Any type is used + def add_tensor_info(self, name: str, tensor: Any) -> None: + self.tensors.append((name, tensor)) + + # these methods are everywhere but convert.py (and convert-lora-to-ggml.py since that doesn't use the class) + def add_tensor( + self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, + raw_dtype: GGMLQuantizationType | None = None, + ) -> None: + # TODO WRITE + pass + + def write_tensors_to_file(self) -> None: + # TODO WRITE + pass + + def close(self) -> None: + for _, _, writer in self.split_strategy: + writer.close() + + def add_architecture(self) -> None: + self.add_string(Keys.General.ARCHITECTURE, self.arch) + + def add_author(self, author: str) -> None: + self.add_string(Keys.General.AUTHOR, author) + + def add_version(self, version: str) -> None: + self.add_string(Keys.General.VERSION, version) + + def add_tensor_data_layout(self, layout: str) -> None: + self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout) + + def add_url(self, url: str) -> None: + self.add_string(Keys.General.URL, url) + + def add_description(self, description: str) -> None: + self.add_string(Keys.General.DESCRIPTION, description) + + def add_licence(self, licence: str) -> None: + self.add_string(Keys.General.LICENSE, licence) + + def add_source_url(self, url: str) -> None: + self.add_string(Keys.General.SOURCE_URL, url) + 
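The size-based splitting a few hunks up is a simple greedy pack: the first tensor opens a shard, and any tensor that would push the running total past the limit opens the next one. A self-contained sketch of just that loop, with tensor sizes as plain integers (illustrative only, not part of gguf-py):

def pack_by_size(tensor_sizes: list[int], max_size: int) -> list[list[int]]:
    # mirrors the SplitStyle.SIZE branch above: a tensor larger than max_size
    # still gets a shard of its own rather than being rejected
    shards: list[list[int]] = []
    for i, size in enumerate(tensor_sizes):
        if i == 0 or size + sum(shards[-1]) > max_size:
            shards.append([size])
        else:
            shards[-1].append(size)
    return shards

# pack_by_size([300, 300, 300, 900], 1000) -> [[300, 300, 300], [900]]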
+ def add_source_hf_repo(self, repo: str) -> None: + self.add_string(Keys.General.SOURCE_HF_REPO, repo) + + def add_file_type(self, ftype: int) -> None: + self.add_uint32(Keys.General.FILE_TYPE, ftype) + + def add_name(self, name: str) -> None: + self.add_string(Keys.General.NAME, name) + + def add_quantization_version(self, quantization_version: GGMLQuantizationType) -> None: + self.add_uint32(Keys.General.QUANTIZATION_VERSION, quantization_version) + + def add_custom_alignment(self, alignment: int) -> None: + self.data_alignment = alignment + self.add_uint32(Keys.General.ALIGNMENT, alignment) + + def add_vocab_size(self, size: int) -> None: + self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size) + + def add_context_length(self, length: int) -> None: + self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length) + + def add_embedding_length(self, length: int) -> None: + self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length) + + def add_block_count(self, length: int) -> None: + self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length) + + def add_feed_forward_length(self, length: int) -> None: + self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length) + + def add_parallel_residual(self, use: bool) -> None: + self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use) + + def add_head_count(self, count: int) -> None: + self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count) + + def add_head_count_kv(self, count: int) -> None: + self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count) + + def add_key_length(self, length: int) -> None: + self.add_uint32(Keys.Attention.KEY_LENGTH.format(arch=self.arch), length) + + def add_value_length(self, length: int) -> None: + self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length) + + def add_max_alibi_bias(self, bias: float) -> None: + self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias) + + def add_clamp_kqv(self, value: float) -> None: + self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value) + + def add_logit_scale(self, value: float) -> None: + self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value) + + def add_expert_count(self, count: int) -> None: + self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count) + + def add_expert_used_count(self, count: int) -> None: + self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count) + + def add_layer_norm_eps(self, value: float) -> None: + self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value) + + def add_layer_norm_rms_eps(self, value: float) -> None: + self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value) + + def add_causal_attention(self, value: bool) -> None: + self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value) + + def add_pooling_type(self, value: PoolingType) -> None: + self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value) + + def add_rope_dimension_count(self, count: int) -> None: + self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) + + def add_rope_freq_base(self, value: float) -> None: + self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value) + + def add_rope_scaling_type(self, value: RopeScalingType) -> None: + self.add_string(Keys.Rope.SCALING_TYPE.format(arch=self.arch), value.value) + + def add_rope_scaling_factor(self, value: float) -> None: + 
self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value) + + def add_rope_scaling_orig_ctx_len(self, value: int) -> None: + self.add_uint32(Keys.Rope.SCALING_ORIG_CTX_LEN.format(arch=self.arch), value) + + def add_rope_scaling_finetuned(self, value: bool) -> None: + self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value) + + def add_ssm_conv_kernel(self, value: int) -> None: + self.add_uint32(Keys.SSM.CONV_KERNEL.format(arch=self.arch), value) + + def add_ssm_inner_size(self, value: int) -> None: + self.add_uint32(Keys.SSM.INNER_SIZE.format(arch=self.arch), value) + + def add_ssm_state_size(self, value: int) -> None: + self.add_uint32(Keys.SSM.STATE_SIZE.format(arch=self.arch), value) + + def add_ssm_time_step_rank(self, value: int) -> None: + self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value) + + def add_tokenizer_model(self, model: str) -> None: + self.add_string(Keys.Tokenizer.MODEL, model) + + def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None: + self.add_array(Keys.Tokenizer.LIST, tokens) + + def add_token_merges(self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None: + self.add_array(Keys.Tokenizer.MERGES, merges) + + def add_token_types(self, types: Sequence[TokenType] | Sequence[int]) -> None: + self.add_array(Keys.Tokenizer.TOKEN_TYPE, types) + + def add_token_type_count(self, value: int) -> None: + self.add_uint32(Keys.Tokenizer.TOKEN_TYPE_COUNT, value) + + def add_token_scores(self, scores: Sequence[float]) -> None: + self.add_array(Keys.Tokenizer.SCORES, scores) + + def add_bos_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.BOS_ID, id) + + def add_eos_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.EOS_ID, id) + + def add_unk_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.UNK_ID, id) + + def add_sep_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.SEP_ID, id) + + def add_pad_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.PAD_ID, id) + + def add_cls_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.CLS_ID, id) + + def add_mask_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.MASK_ID, id) + + def add_add_bos_token(self, value: bool) -> None: + self.add_bool(Keys.Tokenizer.ADD_BOS, value) + + def add_add_eos_token(self, value: bool) -> None: + self.add_bool(Keys.Tokenizer.ADD_EOS, value) + + def add_add_space_prefix(self, value: bool) -> None: + self.add_bool(Keys.Tokenizer.ADD_PREFIX, value) + + def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None: + if isinstance(value, list): + template_default = None + template_names = set() + + for choice in value: + name = choice.get('name', '') + template = choice.get('template') + + # Allowing non-alphanumerical characters in template name is probably not a good idea, so filter it + name = ''.join((c if c in ascii_letters + digits else '_' for c in name)) + + if name and template is not None: + if name == 'default': + template_default = template + else: + template_names.add(name) + self.add_string(Keys.Tokenizer.CHAT_TEMPLATE_N.format(name=name), template) + + if template_names: + self.add_array(Keys.Tokenizer.CHAT_TEMPLATES, list(template_names)) + + if template_default is None: + return + + value = template_default + + self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) + + def add_prefix_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.PREFIX_ID, id) + + 
def add_suffix_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id) + + def add_middle_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id) + + def add_eot_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.EOT_ID, id) \ No newline at end of file From c33bdf397de671cfc863439a8def7994d9516df3 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sun, 28 Apr 2024 21:52:33 -0400 Subject: [PATCH 04/66] fix improper function signature --- gguf-py/gguf/gguf_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_manager.py index b1e680810a642..5218f3a809834 100644 --- a/gguf-py/gguf/gguf_manager.py +++ b/gguf-py/gguf/gguf_manager.py @@ -188,7 +188,7 @@ def write_to_file(self, meta_only: bool = False, ftype: int = 0, concurrency: in print("Model has smaller size than the split threshold, not splitting") self.split_style = SplitStyle.NONE - self.split_strategy = SplitStrategy(self.split_style, self.path, self.tensors, self.args, not self.args.large_first_shard) + self.split_strategy = SplitStrategy(self.split_style, self.path, self.tensors, self.args, self.arch) self.total_shards = len(self.split_strategy) # only the first shard needs all the KV data From b7c612088fd769c7e2fb476ee9c855245b343dbe Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sun, 5 May 2024 15:43:24 -0400 Subject: [PATCH 05/66] tentative push of convert-hf-to-gguf support --- convert-hf-to-gguf.py | 39 ++++++++++++++++-------- convert.py | 2 +- gguf-py/gguf/gguf_manager.py | 57 +++++++++++++++++++++++++++++------- 3 files changed, 74 insertions(+), 24 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 5763b6664e832..1d6fee89f0505 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -21,7 +21,9 @@ if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) -import gguf +import importlib +gguf = importlib.import_module("gguf-py.gguf") +# import gguf from convert import LlamaHfVocab, permute @@ -43,18 +45,18 @@ class SentencePieceTokenTypes(IntEnum): class Model(ABC): _model_classes: dict[str, type[Model]] = {} - def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool): + def __init__(self, dir_model: Path, ftype: int, fname_out: Path, args: argparse.Namespace): self.dir_model = dir_model self.ftype = ftype self.fname_out = fname_out - self.is_big_endian = is_big_endian - self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE - self.use_temp_file = use_temp_file + self.is_big_endian = args.bigendian + self.endianess = gguf.GGUFEndian.BIG if args.bigendian else gguf.GGUFEndian.LITTLE + self.use_temp_file = args.use_temp_file self.is_safetensors = self._is_model_safetensors() self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin") self.part_names = self._get_part_names() self.hparams = Model.load_hparams(self.dir_model) - self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file) + self.gguf_writer = gguf.GGUFManager(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], args, endianess=self.endianess, use_temp_file=self.use_temp_file) self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) @property @@ -174,14 +176,11 @@ def write_tensors(self): def write(self): 
self.write_tensors() - self.gguf_writer.write_header_to_file() - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file() + self.gguf_writer.write_to_file() self.gguf_writer.close() def write_vocab(self): - self.gguf_writer.write_header_to_file() - self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_to_file(meta_only=True) self.gguf_writer.close() @staticmethod @@ -1711,7 +1710,7 @@ def write_tensors(self): self.gguf_writer.add_tensor(new_name, data) - +# TODO what the hell is this? @Model.register("QWenLMHeadModel") class QwenModel(Model): model_arch = gguf.MODEL_ARCH.QWEN @@ -2843,6 +2842,11 @@ def parse_args() -> argparse.Namespace: help="directory containing model file", ) parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)") + parser.add_argument("--split", action="store_true", help="split the converted model into multiple files") + parser.add_argument("--split-max-tensors", type=int, help="max tensors in each split") + parser.add_argument("--split-max-size", type=str, help="max size per split N(M|G)") + parser.add_argument("--dry-run", action="store_true", help="only print out a split plan and exit, without writing any new files") + parser.add_argument("--large-first-shard", action="store_true", help="include tensors in the first shard when splitting (default: metadata only)") return parser.parse_args() @@ -2869,6 +2873,15 @@ def main() -> None: print(f'Error: {args.model} is not a directory', file=sys.stderr) sys.exit(1) + if args.split and not (args.split_max_tensors or args.split_max_size): + raise ValueError("Need to specify one of --split-max-tensors or --split-max-size when splitting") + + if args.split_max_tensors and args.split_max_size: + raise ValueError("Can't specify both --split-max-tensors and --split-max-size") + + if args.split_max_size: + args.split_max_size = gguf.SplitStrategy.split_str_to_n_bytes(args.split_max_size) + ftype_map = { "f32": gguf.GGMLQuantizationType.F32, "f16": gguf.GGMLQuantizationType.F16, @@ -2886,7 +2899,7 @@ def main() -> None: with torch.inference_mode(): model_class = Model.from_model_architecture(hparams["architectures"][0]) - model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file) + model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args) print("Set model parameters") model_instance.set_gguf_parameters() diff --git a/convert.py b/convert.py index 31c96e5ade55e..c6d9e2b2eba5a 100755 --- a/convert.py +++ b/convert.py @@ -1465,7 +1465,7 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") parser.add_argument("--split", action="store_true", help="split the converted model into multiple files") parser.add_argument("--split-max-tensors", type=int, help="max tensors in each split") - parser.add_argument("--split-max-size", type=str, help="max size per split N(M|G)+") + parser.add_argument("--split-max-size", type=str, help="max size per split N(M|G)") parser.add_argument("--dry-run", action="store_true", help="only print out a split plan and exit, without writing any new files") parser.add_argument("--large-first-shard", action="store_true", help="include tensors in the first shard when splitting (default: metadata only)") diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_manager.py index 
5218f3a809834..5963aa036b215 100644 --- a/gguf-py/gguf/gguf_manager.py +++ b/gguf-py/gguf/gguf_manager.py @@ -4,6 +4,7 @@ import shutil import struct import tempfile +import time from enum import IntEnum from typing import TYPE_CHECKING, Any, Sequence, Mapping from string import ascii_letters, digits @@ -174,7 +175,9 @@ def __init__(self, path: os.PathLike[str] | str, arch: str, args: Namespace, use # have to consolidate because we need to know kv data count and tensor count before we can write the header # and we need to write tensor info before we can write metadata # these all kinda show up around the same places anyway so it's not a huge deal? - def write_to_file(self, meta_only: bool = False, ftype: int = 0, concurrency: int = 8, write_tensor_data: function = None) -> None: + def write_to_file(self, meta_only: bool = False, ftype: int = 0, concurrency: int = 8, + write_tensor_data: function = None + ) -> None: # here is the first place you can assume you have all tensors written and you can establish the size of the file - so logic goes here self.total_tensors = len(self.tensors) @@ -218,19 +221,37 @@ def write_to_file(self, meta_only: bool = False, ftype: int = 0, concurrency: in if self.args.dry_run: print("\nDry run, not writing files") + # instantiating GGUFWriters creates files + for name, _, _ in self.split_strategy: + os.remove(name) return # run add_tensor_info, write data, then write_tensor_data - taken from convert.py running_total = self.total_tensors + start = time.time() for i, (_, tensors, writer) in enumerate(self.split_strategy): if tensors: - for name, tensor in tensors: + for j, (name, tensor) in enumerate(tensors): n_elements = int(np.prod(tensor.shape)) - raw_dtype = getattr(tensor.data_type, 'ggml_type', None) - data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype - data_nbytes = tensor.data_type.elements_to_bytes(n_elements) - writer.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype) + # logic from convert.py + if getattr(tensor, 'data_type', None): + raw_dtype = getattr(tensor.data_type, 'ggml_type', None) + data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype + data_nbytes = tensor.data_type.elements_to_bytes(n_elements) + writer.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype) + # logic from convert-hf-to-gguf.py + else: + # stolen from write_tensor_data because that doesn't get called with this logic + elapsed = time.time() - start + size = ' x '.join(f"{dim:6d}" for dim in tensor.shape) + padi = len(str(self.total_tensors)) + dtype = str(tensor.dtype) + print( + f"[{j + 1:{padi}d}/{len(tensors)}] Writing tensor {name:38s} | size {size:16} | type {dtype:8} | T+{int(elapsed):4}" + ) + writer.add_tensor(name, tensor) + writer.write_header_to_file() writer.write_kv_data_to_file() @@ -240,8 +261,9 @@ def write_to_file(self, meta_only: bool = False, ftype: int = 0, concurrency: in print(f"\nWriting to shard {i + 1}/{self.total_shards} with {len(tensors)}/{running_total} remaining tensors (of {self.total_tensors} total)") running_total -= len(tensors) - # convert.py's write_tensor_data is dependent on so many objects in convert.py itself that it's easier to pass the function as a parameter and call it here - write_tensor_data(ftype, dict(tensors), concurrency, writer) + if write_tensor_data: + # convert.py's write_tensor_data is dependent on so many objects in convert.py itself that it's easier to pass the function as a parameter and 
call it here + write_tensor_data(ftype, dict(tensors), concurrency, writer) def add_uint8(self, key: str, val: int) -> None: self.kv_data[key] = (val, GGUFValueType.UINT8) @@ -295,8 +317,23 @@ def add_tensor( self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None, ) -> None: - # TODO WRITE - pass + if self.endianess == GGUFEndian.BIG: + tensor.byteswap(inplace=True) + + # TODO reimplement temp file + #if self.use_temp_file and self.temp_file is None: + # fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256 * 1024 * 1024) + # fp.seek(0) + # self.temp_file = fp + + self.add_tensor_info(name, tensor) + + #if self.temp_file is None: + # self.tensors.append(tensor) + # return + + #tensor.tofile(self.temp_file) + #self.write_padding(self.temp_file, tensor.nbytes) def write_tensors_to_file(self) -> None: # TODO WRITE From 87a98a5b6d42bc6b566d51a5d717ed65866afc44 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Thu, 9 May 2024 21:22:55 -0400 Subject: [PATCH 06/66] resolve merge + SplitArguments for easier parsing --- convert-hf-to-gguf.py | 18 ++++---- convert.py | 21 +++------ gguf-py/gguf/gguf_manager.py | 83 +++++++++++++++++++++++++----------- 3 files changed, 73 insertions(+), 49 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 8e8ecfe3d2b47..4ba681473c71b 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -58,7 +58,7 @@ class Model: part_names: list[str] is_safetensors: bool hparams: dict[str, Any] - gguf_writer: gguf.GGUFWriter + gguf_writer: gguf.GGUFManager block_count: int tensor_map: gguf.TensorNameMap tensor_names: set[str] | None @@ -66,7 +66,8 @@ class Model: # subclasses should define this! model_arch: gguf.MODEL_ARCH - def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool): + def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, + split_arguments: gguf.SplitArguments): if self.__class__ == Model: raise TypeError(f"{self.__class__.__name__!r} should not be directly instantiated") self.dir_model = dir_model @@ -83,7 +84,8 @@ def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: self.part_names = Model.get_model_part_names(self.dir_model, ".bin") self.hparams = Model.load_hparams(self.dir_model) - self.gguf_writer = gguf.GGUFManager(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file) + self.gguf_writer = gguf.GGUFManager(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], split_arguments, + endianess=self.endianess, use_temp_file=self.use_temp_file) self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_names = None @@ -275,9 +277,7 @@ def write_tensors(self): def write(self): self.write_tensors() - self.gguf_writer.write_header_to_file() - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file(progress=True) + self.gguf_writer.write_to_file() self.gguf_writer.close() def write_vocab(self): @@ -2501,8 +2501,7 @@ def main() -> None: if args.split_max_tensors and args.split_max_size: raise ValueError("Can't specify both --split-max-tensors and --split-max-size") - if args.split_max_size: - args.split_max_size = gguf.SplitStrategy.split_str_to_n_bytes(args.split_max_size) + split_arguments = 
gguf.SplitArguments(args) if args.split else gguf.SplitArguments() ftype_map = { "f32": gguf.GGMLQuantizationType.F32, @@ -2521,7 +2520,8 @@ def main() -> None: with torch.inference_mode(): model_class = Model.from_model_architecture(hparams["architectures"][0]) - model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy) + model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, + args.no_lazy, split_arguments) logger.info("Set model parameters") model_instance.set_gguf_parameters() diff --git a/convert.py b/convert.py index cc133576fff34..9ee0f1ce757ad 100755 --- a/convert.py +++ b/convert.py @@ -1065,8 +1065,8 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) class OutputFile: - def __init__(self, fname_out: Path, args: argparse.Namespace, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE): - self.gguf = gguf.GGUFManager(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], args, endianess=endianess) + def __init__(self, fname_out: Path, split_arguments: gguf.SplitArguments, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE): + self.gguf = gguf.GGUFManager(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], split_arguments, endianess=endianess) def add_meta_arch(self, params: Params) -> None: name = "LLaMA" @@ -1183,7 +1183,7 @@ def write_vocab_only( ) -> None: check_vocab_size(params, vocab, pad_vocab=pad_vocab) - of = OutputFile(fname_out, endianess=endianess) + of = OutputFile(fname_out, gguf.SplitArguments(), endianess=endianess) # meta data of.add_meta_arch(params) @@ -1210,10 +1210,10 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray: @staticmethod def write_all( fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab, - args: argparse.Namespace, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE + split_arguments: gguf.SplitArguments, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE ) -> None: check_vocab_size(params, vocab, pad_vocab=args.pad_vocab) - of = OutputFile(fname_out, args, endianess=endianess) + of = OutputFile(fname_out, split_arguments, endianess=endianess) # meta data of.add_meta_arch(params) @@ -1500,8 +1500,7 @@ def main(args_in: list[str] | None = None) -> None: if args.split_max_tensors and args.split_max_size: raise ValueError("Can't specify both --split-max-tensors and --split-max-size") - if args.split_max_size: - args.split_max_size = gguf.SplitStrategy.split_str_to_n_bytes(args.split_max_size) + split_arguments = gguf.SplitArguments(args) if args.split else gguf.SplitArguments() if not args.vocab_only: model_plus = load_some_model(args.model) @@ -1578,15 +1577,9 @@ def main(args_in: list[str] | None = None) -> None: params.ftype = ftype - print(f"Writing {outfile}, format {ftype}") - OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, args, - concurrency=args.concurrency, endianess=endianess) - if not args.dry_run: - print(f"Wrote {outfile}") - logger.info(f"Writing {outfile}, format {ftype}") - OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, + OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, split_arguments, concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab) if not args.dry_run: logger.info(f"Wrote {outfile}") diff --git a/gguf-py/gguf/gguf_manager.py 
b/gguf-py/gguf/gguf_manager.py index 5963aa036b215..f36b0173eafae 100644 --- a/gguf-py/gguf/gguf_manager.py +++ b/gguf-py/gguf/gguf_manager.py @@ -45,30 +45,57 @@ class SplitStyle(IntEnum): SIZE = 2 +class SplitArguments: + split: bool + dry_run: bool + small_first_shard: bool + split_max_tensors: int + split_max_size: int + split_style: SplitStyle + + def __init__(self) -> None: + self.split = False + self.dry_run = False + self.small_first_shard = False + self.split_max_tensors = 0 + self.split_max_size = 0 + self.split_style = SplitStyle.NONE + + def __init__(self, args: Namespace) -> None: + self.split = args.split + self.split_max_tensors = args.split_max_tensors + self.split_max_size = SplitStrategy.split_str_to_n_bytes(args.split_max_size) if args.split_max_size else None + self.dry_run = args.dry_run + self.small_first_shard = not args.large_first_shard + self.split_style = SplitStyle.NONE if not self.split \ + else SplitStyle.TENSORS if self.split_max_tensors \ + else SplitStyle.SIZE + + class SplitStrategy: data: SplitTensorsPerFile - def __init__(self, split_style: SplitStyle, fname_out: os.PathLike[str], model: list[TensorTempData], - args: Namespace, arch: str, use_temp_file: bool = True, endianess: GGUFEndian = GGUFEndian.LITTLE, small_first_shard: bool = True + def __init__(self, fname_out: os.PathLike[str], model: list[TensorTempData], arch: str, + split_arguments: SplitArguments, use_temp_file: bool = True, endianess: GGUFEndian = GGUFEndian.LITTLE, ): self.data = [] - if split_style == SplitStyle.NONE: + if split_arguments.split_style == SplitStyle.NONE: self.append((fname_out, model, GGUFWriter(fname_out, arch, use_temp_file=use_temp_file, endianess=endianess))) - elif split_style == SplitStyle.TENSORS: - total_shards = ceil(len(model) / args.split_max_tensors) + small_first_shard + elif split_arguments.split_style == SplitStyle.TENSORS: + total_shards = ceil(len(model) / split_arguments.split_max_tensors) + split_arguments.small_first_shard shard_files = [fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + 1, total_shards)) for i in range(total_shards)] - if small_first_shard: + if split_arguments.small_first_shard: self.append((shard_files[0], None, GGUFWriter(shard_files[0], arch, use_temp_file=use_temp_file, endianess=endianess))) - for i, shard in enumerate(shard_files[small_first_shard:]): - start = i * args.split_max_tensors - stop = min((i + 1) * args.split_max_tensors, len(model)) + for i, shard in enumerate(shard_files[split_arguments.small_first_shard:]): + start = i * split_arguments.split_max_tensors + stop = min((i + 1) * split_arguments.split_max_tensors, len(model)) self.append((shard, model[start:stop], GGUFWriter(shard, arch, use_temp_file=use_temp_file, endianess=endianess))) - elif split_style == SplitStyle.SIZE: + elif split_arguments.split_style == SplitStyle.SIZE: shards = [] # we have to determine the shards first to determine how many shards there will be in total - two passes @@ -76,15 +103,15 @@ def __init__(self, split_style: SplitStyle, fname_out: os.PathLike[str], model: if i == 0: shards.append([shard]) continue - if SplitStrategy.get_tensor_size(shard[1]) + sum(SplitStrategy.get_tensor_size(t[1]) for t in shards[-1]) > args.split_max_size: + if SplitStrategy.get_tensor_size(shard[1]) + sum(SplitStrategy.get_tensor_size(t[1]) for t in shards[-1]) > split_arguments.split_max_size: shards.append([shard]) else: shards[-1].append(shard) - total_shards = len(shards) + small_first_shard + total_shards = len(shards) + 
split_arguments.small_first_shard shard_offset = 1 - if small_first_shard: + if split_arguments.small_first_shard: outname = fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, shard_offset, total_shards)) self.append((outname, None, GGUFWriter(outname, arch, use_temp_file=use_temp_file, endianess=endianess))) shard_offset += 1 @@ -150,25 +177,23 @@ def format_n_bytes_to_str(num: int) -> str: class GGUFManager: kv_data: KVTempData tensors: list[TensorTempData] - split_style: SplitStyle + split_arguments: SplitArguments split_strategy: SplitStrategy - def __init__(self, path: os.PathLike[str] | str, arch: str, args: Namespace, use_temp_file: bool = True, - endianess: GGUFEndian = GGUFEndian.LITTLE) -> None: + def __init__(self, path: os.PathLike[str] | str, arch: str, split_arguments: SplitArguments, + use_temp_file: bool = True, endianess: GGUFEndian = GGUFEndian.LITTLE + ) -> None: self.arch = arch self.path = path self.endianess = endianess self.offset_tensor = 0 self.kv_data = {} self.tensors = [] - self.args = args - self.split_style = SplitStyle.NONE if not args.split \ - else SplitStyle.TENSORS if args.split_max_tensors \ - else SplitStyle.SIZE self.split_strategy = None self.total_shards = None self.total_tensors = None self.use_temp_file = use_temp_file + self.split_arguments = split_arguments self.add_architecture() @@ -183,15 +208,16 @@ def write_to_file(self, meta_only: bool = False, ftype: int = 0, concurrency: in self.total_tensors = len(self.tensors) total_size = sum(SplitStrategy.get_tensor_size(tensor[1]) for tensor in self.tensors) - if self.args.split_max_tensors and self.total_tensors < self.args.split_max_tensors: + if self.split_arguments.split_max_tensors and self.total_tensors < self.split_arguments.split_max_tensors: print("Model has fewer tensors than the split threshold, not splitting") self.split_style = SplitStyle.NONE - if self.args.split_max_size and total_size < self.args.split_max_size: + if self.split_arguments.split_max_size and total_size < self.split_arguments.split_max_size: print("Model has smaller size than the split threshold, not splitting") self.split_style = SplitStyle.NONE - self.split_strategy = SplitStrategy(self.split_style, self.path, self.tensors, self.args, self.arch) + self.split_strategy = SplitStrategy(self.path, self.tensors, self.arch, self.split_arguments, + use_temp_file=self.use_temp_file, endianess=self.endianess) self.total_shards = len(self.split_strategy) # only the first shard needs all the KV data @@ -199,7 +225,7 @@ def write_to_file(self, meta_only: bool = False, ftype: int = 0, concurrency: in self.split_strategy[0][2].add_key(key) self.split_strategy[0][2].add_val(value, etype) - if self.split_style != SplitStyle.NONE: + if self.split_arguments.split_style != SplitStyle.NONE: for i, (_, _, writer) in enumerate(self.split_strategy): writer.add_uint16(LLM_KV_SPLIT_NO, i) writer.add_uint16(LLM_KV_SPLIT_COUNT, self.total_shards) @@ -219,7 +245,7 @@ def write_to_file(self, meta_only: bool = False, ftype: int = 0, concurrency: in size = SplitStrategy.format_n_bytes_to_str(sum(SplitStrategy.get_tensor_size(t[1]) for t in shard_tensors)) if shard_tensors else "negligible - metadata only" print(f" {shard_path}: n_tensors = {len(shard_tensors) if shard_tensors else 0}, total_size = {size}") - if self.args.dry_run: + if self.split_arguments.dry_run: print("\nDry run, not writing files") # instantiating GGUFWriters creates files for name, _, _ in self.split_strategy: @@ -232,6 +258,7 @@ def write_to_file(self, meta_only: bool 
= False, ftype: int = 0, concurrency: in for i, (_, tensors, writer) in enumerate(self.split_strategy): if tensors: + print(f"\nWriting to shard {i + 1}/{self.total_shards} with {len(tensors)}/{running_total} remaining tensors (of {self.total_tensors} total)") for j, (name, tensor) in enumerate(tensors): n_elements = int(np.prod(tensor.shape)) # logic from convert.py @@ -251,6 +278,7 @@ def write_to_file(self, meta_only: bool = False, ftype: int = 0, concurrency: in f"[{j + 1:{padi}d}/{len(tensors)}] Writing tensor {name:38s} | size {size:16} | type {dtype:8} | T+{int(elapsed):4}" ) writer.add_tensor(name, tensor) + print(f"Writing to shard {i + 1}/{self.total_shards} with {len(tensors)}/{running_total} remaining tensors (of {self.total_tensors} total)") writer.write_header_to_file() @@ -258,7 +286,7 @@ def write_to_file(self, meta_only: bool = False, ftype: int = 0, concurrency: in writer.write_tensors_to_file() if tensors: - print(f"\nWriting to shard {i + 1}/{self.total_shards} with {len(tensors)}/{running_total} remaining tensors (of {self.total_tensors} total)") + # TODO this shows up AFTER writing which we don't really want - move it running_total -= len(tensors) if write_tensor_data: @@ -473,6 +501,9 @@ def add_ssm_time_step_rank(self, value: int) -> None: def add_tokenizer_model(self, model: str) -> None: self.add_string(Keys.Tokenizer.MODEL, model) + def add_tokenizer_pre(self, pre: str) -> None: + self.add_string(Keys.Tokenizer.PRE, pre) + def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None: self.add_array(Keys.Tokenizer.LIST, tokens) From 3ff27efa89a42afebf51cbfbc0964f81b479babd Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Thu, 23 May 2024 18:50:21 -0400 Subject: [PATCH 07/66] Fix eager tensor memory leak and remove convert.py changes Removed a memory leak caused by unexpected reference retention to eager tensors. Also removed GGUFManager functionality in convert.py in favor of specializing for convert-hf-to-gguf.py. 
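
The retention pattern is the usual one: a container that keeps every eager
tensor reachable until the whole write loop has finished, so peak memory
stays near the size of the full model. A minimal sketch of the before/after
shape, assuming simplified names (write_shard is a hypothetical stand-in for
the GGUFWriter output path, not the real API):

    from collections import deque
    import numpy as np

    def write_shard(name: str, tensor: np.ndarray) -> None:
        # stand-in for writing one tensor out through GGUFWriter
        tensor.tofile(f"{name}.bin")

    def write_all_leaky(tensors: list[tuple[str, np.ndarray]]) -> None:
        # the list still references every tensor until the loop ends,
        # so nothing written so far can be garbage collected
        for name, tensor in tensors:
            write_shard(name, tensor)

    def write_all_streaming(tensors: deque[tuple[str, np.ndarray]]) -> None:
        # popleft() drops the container's reference before the write and
        # del drops the local one, so each tensor can be freed as soon
        # as it has been written
        while tensors:
            name, tensor = tensors.popleft()
            write_shard(name, tensor)
            del tensor

Draining the containers with popleft() and deleting local references once a
tensor is written is the same idea the diffs below apply to SplitStrategy,
GGUFManager.write_to_file and GGUFWriter.write_tensors_to_file.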
--- convert-hf-to-gguf.py | 2 +- convert.py | 70 +++++++++--------- gguf-py/gguf/gguf_manager.py | 136 ++++++++++++++--------------------- gguf-py/gguf/gguf_writer.py | 1 + 4 files changed, 88 insertions(+), 121 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 7c8d7a8ac75ea..24da4ebdd941e 100644 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2570,7 +2570,7 @@ def main() -> None: if args.split_max_tensors and args.split_max_size: raise ValueError("Can't specify both --split-max-tensors and --split-max-size") - split_arguments = gguf.SplitArguments(args) if args.split else gguf.SplitArguments() + split_arguments = gguf.SplitArguments(args=args) if args.split else gguf.SplitArguments() ftype_map = { "f32": gguf.LlamaFileType.ALL_F32, diff --git a/convert.py b/convert.py index 26c0641250b0c..da1247957780c 100644 --- a/convert.py +++ b/convert.py @@ -24,17 +24,14 @@ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable -# TEMPORARY IMPORT - TODO REMOVE -import importlib -gguf = importlib.import_module("gguf-py.gguf") +from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional import numpy as np from sentencepiece import SentencePieceProcessor if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) -# import gguf +import gguf if TYPE_CHECKING: from typing_extensions import Self, TypeAlias @@ -1103,8 +1100,8 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) class OutputFile: - def __init__(self, fname_out: Path, split_arguments: gguf.SplitArguments, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE): - self.gguf = gguf.GGUFManager(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], split_arguments, endianess=endianess) + def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE): + self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) def add_meta_model(self, params: Params, metadata: Metadata) -> None: # Metadata About The Model And Its Provenence @@ -1204,15 +1201,21 @@ def add_meta_vocab(self, vocab: Vocab) -> None: def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None: svocab.add_to_gguf(self.gguf) + def add_tensor_info(self, name: str, tensor: LazyTensor) -> None: + n_elements = int(np.prod(tensor.shape)) + raw_dtype = getattr(tensor.data_type, 'ggml_type', None) + data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype + data_nbytes = tensor.data_type.elements_to_bytes(n_elements) + self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype) + def write_meta(self) -> None: - self.gguf.write_to_file(meta_only=True) + self.gguf.write_header_to_file() + self.gguf.write_kv_data_to_file() - def write_tensors(self, ftype: GGMLFileType, concurrency: int) -> None: - self.gguf.write_to_file(ftype=ftype, concurrency=concurrency, write_tensor_data=OutputFile.write_tensor_data) + def write_tensor_info(self) -> None: + self.gguf.write_ti_data_to_file() - # really awkward with how this is managed with gguf_manager.py: maybe refactor at some point? 
- @staticmethod - def write_tensor_data(ftype: GGMLFileType, model: LazyModel, concurrency: int, writer: gguf.GGUFWriter) -> None: + def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None: ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency) if ftype == GGMLFileType.MostlyQ8_0: ndarrays = bounded_parallel_map( @@ -1230,7 +1233,7 @@ def write_tensor_data(ftype: GGMLFileType, model: LazyModel, concurrency: int, w logger.info( f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" ) - writer.write_tensor_data(ndarray) + self.gguf.write_tensor_data(ndarray) def close(self) -> None: self.gguf.close() @@ -1242,7 +1245,7 @@ def write_vocab_only( ) -> None: check_vocab_size(params, vocab, pad_vocab=pad_vocab) - of = OutputFile(fname_out, gguf.SplitArguments(), endianess=endianess) + of = OutputFile(fname_out, endianess=endianess) # meta data of.add_meta_model(params, metadata) @@ -1270,11 +1273,13 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray: @staticmethod def write_all( fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab, - split_arguments: gguf.SplitArguments, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, - pad_vocab: bool = False, metadata: Metadata = None, + concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, + pad_vocab: bool = False, + metadata: Metadata = None, ) -> None: check_vocab_size(params, vocab, pad_vocab=pad_vocab) - of = OutputFile(fname_out, split_arguments, endianess=endianess) + + of = OutputFile(fname_out, endianess=endianess) # meta data of.add_meta_model(params, metadata) @@ -1287,9 +1292,13 @@ def write_all( # tensor info for name, lazy_tensor in model.items(): - of.gguf.add_tensor_info(name, lazy_tensor) + of.add_tensor_info(name, lazy_tensor) + + of.write_meta() + of.write_tensor_info() - of.write_tensors(ftype, concurrency) + # tensor data + of.write_tensor_data(ftype, model, concurrency) of.close() @@ -1364,7 +1373,7 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> experts.append(model[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"]) del tmp[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"] else: - raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.model_classweight") + raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight") tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts) # HF models permut or pack some of the tensors, so we need to undo that @@ -1584,11 +1593,6 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine") parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") - parser.add_argument("--split", action="store_true", help="split the converted model into multiple files") - parser.add_argument("--split-max-tensors", type=int, help="max tensors in each split") - parser.add_argument("--split-max-size", type=str, help="max size per split N(M|G)") - 
parser.add_argument("--dry-run", action="store_true", help="only print out a split plan and exit, without writing any new files") - parser.add_argument("--large-first-shard", action="store_true", help="include tensors in the first shard when splitting (default: metadata only)") parser.add_argument("--verbose", action="store_true", help="increase output verbosity") parser.add_argument("--metadata", type=Path, help="Specify the path for a metadata file") parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name") @@ -1622,14 +1626,6 @@ def main(args_in: list[str] | None = None) -> None: do_dump_model(model_plus) return - if args.split and not (args.split_max_tensors or args.split_max_size): - raise ValueError("Need to specify one of --split-max-tensors or --split-max-size when splitting") - - if args.split_max_tensors and args.split_max_size: - raise ValueError("Can't specify both --split-max-tensors and --split-max-size") - - split_arguments = gguf.SplitArguments(args) if args.split else gguf.SplitArguments() - if not args.vocab_only: model_plus = load_some_model(args.model) else: @@ -1707,13 +1703,11 @@ def main(args_in: list[str] | None = None) -> None: outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata) params.ftype = ftype - logger.info(f"Writing {outfile}, format {ftype}") - OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, split_arguments, + OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata) - if not args.dry_run: - logger.info(f"Wrote {outfile}") + logger.info(f"Wrote {outfile}") if __name__ == '__main__': diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_manager.py index f36b0173eafae..4a51b717e23e6 100644 --- a/gguf-py/gguf/gguf_manager.py +++ b/gguf-py/gguf/gguf_manager.py @@ -10,6 +10,7 @@ from string import ascii_letters, digits from argparse import Namespace from math import ceil +from collections import deque import numpy as np @@ -34,7 +35,7 @@ LLM_KV_SPLIT_COUNT = "split.count" LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count" -SplitTensorsPerFile: TypeAlias = list[tuple[os.PathLike[str], list[tuple[str, Any]], GGUFWriter]] # [(outfile name, [(tensor name, tensor data)] for each tensor in file, filewriter)] +SplitTensorsPerFile: TypeAlias = deque[tuple[os.PathLike[str], deque[tuple[str, Any]], GGUFWriter]] # [(outfile name, [(tensor name, tensor data)] for each tensor in file, filewriter)] KVTempData: TypeAlias = dict[str, tuple[Any, GGUFValueType]] # {key: (value, type)} TensorTempData: TypeAlias = tuple[str, np.ndarray[Any, Any]] # (tensor name, tensor data), aka LazyModel @@ -53,23 +54,23 @@ class SplitArguments: split_max_size: int split_style: SplitStyle - def __init__(self) -> None: - self.split = False - self.dry_run = False - self.small_first_shard = False - self.split_max_tensors = 0 - self.split_max_size = 0 - self.split_style = SplitStyle.NONE - - def __init__(self, args: Namespace) -> None: - self.split = args.split - self.split_max_tensors = args.split_max_tensors - self.split_max_size = SplitStrategy.split_str_to_n_bytes(args.split_max_size) if args.split_max_size else None - self.dry_run = args.dry_run - self.small_first_shard = not args.large_first_shard - self.split_style = SplitStyle.NONE if not self.split \ - else SplitStyle.TENSORS if self.split_max_tensors \ - else SplitStyle.SIZE + def __init__(self, args: 
Namespace = None) -> None: + if args is None: + self.split = False + self.dry_run = False + self.small_first_shard = False + self.split_max_tensors = 0 + self.split_max_size = 0 + self.split_style = SplitStyle.NONE + else: + self.split = args.split + self.split_max_tensors = args.split_max_tensors + self.split_max_size = SplitStrategy.split_str_to_n_bytes(args.split_max_size) if args.split_max_size else None + self.dry_run = args.dry_run + self.small_first_shard = not args.large_first_shard + self.split_style = SplitStyle.NONE if not self.split \ + else SplitStyle.TENSORS if self.split_max_tensors \ + else SplitStyle.SIZE class SplitStrategy: @@ -78,7 +79,7 @@ class SplitStrategy: def __init__(self, fname_out: os.PathLike[str], model: list[TensorTempData], arch: str, split_arguments: SplitArguments, use_temp_file: bool = True, endianess: GGUFEndian = GGUFEndian.LITTLE, ): - self.data = [] + self.data = deque() if split_arguments.split_style == SplitStyle.NONE: self.append((fname_out, model, GGUFWriter(fname_out, arch, use_temp_file=use_temp_file, endianess=endianess))) @@ -96,7 +97,7 @@ def __init__(self, fname_out: os.PathLike[str], model: list[TensorTempData], arc self.append((shard, model[start:stop], GGUFWriter(shard, arch, use_temp_file=use_temp_file, endianess=endianess))) elif split_arguments.split_style == SplitStyle.SIZE: - shards = [] + shards = deque() # we have to determine the shards first to determine how many shards there will be in total - two passes for i, shard in enumerate(model): @@ -118,13 +119,7 @@ def __init__(self, fname_out: os.PathLike[str], model: list[TensorTempData], arc for i, shard in enumerate(shards): outname = fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + shard_offset, total_shards)) - self.append((outname, shard, GGUFWriter(outname, arch, use_temp_file=use_temp_file, endianess=endianess))) - - def __getitem__(self, index): - return self.data[index] - - def __setitem__(self, index, value): - self.data[index] = value + self.append((outname, deque(shard), GGUFWriter(outname, arch, use_temp_file=use_temp_file, endianess=endianess))) def __len__(self): return len(self.data) @@ -176,7 +171,7 @@ def format_n_bytes_to_str(num: int) -> str: # ideally this has most of the same signatures as GGUFWriter so it's nearly a drop-in replacement class GGUFManager: kv_data: KVTempData - tensors: list[TensorTempData] + tensors: deque[TensorTempData] split_arguments: SplitArguments split_strategy: SplitStrategy @@ -188,7 +183,7 @@ def __init__(self, path: os.PathLike[str] | str, arch: str, split_arguments: Spl self.endianess = endianess self.offset_tensor = 0 self.kv_data = {} - self.tensors = [] + self.tensors = deque() self.split_strategy = None self.total_shards = None self.total_tensors = None @@ -200,9 +195,7 @@ def __init__(self, path: os.PathLike[str] | str, arch: str, split_arguments: Spl # have to consolidate because we need to know kv data count and tensor count before we can write the header # and we need to write tensor info before we can write metadata # these all kinda show up around the same places anyway so it's not a huge deal? 
- def write_to_file(self, meta_only: bool = False, ftype: int = 0, concurrency: int = 8, - write_tensor_data: function = None - ) -> None: + def write_to_file(self, meta_only: bool = False) -> None: # here is the first place you can assume you have all tensors written and you can establish the size of the file - so logic goes here self.total_tensors = len(self.tensors) @@ -218,22 +211,23 @@ def write_to_file(self, meta_only: bool = False, ftype: int = 0, concurrency: in self.split_strategy = SplitStrategy(self.path, self.tensors, self.arch, self.split_arguments, use_temp_file=self.use_temp_file, endianess=self.endianess) + del self.tensors self.total_shards = len(self.split_strategy) # only the first shard needs all the KV data for key, (value, etype) in self.kv_data.items(): - self.split_strategy[0][2].add_key(key) - self.split_strategy[0][2].add_val(value, etype) + self.split_strategy.data[0][2].add_key(key) + self.split_strategy.data[0][2].add_val(value, etype) if self.split_arguments.split_style != SplitStyle.NONE: - for i, (_, _, writer) in enumerate(self.split_strategy): + for i, (_, _, writer) in enumerate(self.split_strategy.data): writer.add_uint16(LLM_KV_SPLIT_NO, i) writer.add_uint16(LLM_KV_SPLIT_COUNT, self.total_shards) writer.add_int32(LLM_KV_SPLIT_TENSORS_COUNT, self.total_tensors) # metadata/vocab only can write and return here if meta_only: - for i, (_, _, writer) in enumerate(self.split_strategy): + for i, (_, _, writer) in enumerate(self.split_strategy.data): writer.write_header_to_file() writer.write_kv_data_to_file() return @@ -241,57 +235,44 @@ def write_to_file(self, meta_only: bool = False, ftype: int = 0, concurrency: in # tensor writing code starts here print("\nWriting the following files:") - for (shard_path, shard_tensors, _) in self.split_strategy: + for (shard_path, shard_tensors, _) in self.split_strategy.data: size = SplitStrategy.format_n_bytes_to_str(sum(SplitStrategy.get_tensor_size(t[1]) for t in shard_tensors)) if shard_tensors else "negligible - metadata only" print(f" {shard_path}: n_tensors = {len(shard_tensors) if shard_tensors else 0}, total_size = {size}") if self.split_arguments.dry_run: print("\nDry run, not writing files") # instantiating GGUFWriters creates files - for name, _, _ in self.split_strategy: + for name, _, _ in self.split_strategy.data: os.remove(name) return # run add_tensor_info, write data, then write_tensor_data - taken from convert.py running_total = self.total_tensors - start = time.time() - for i, (_, tensors, writer) in enumerate(self.split_strategy): - + ct = 0 + while True: + try: + (_, tensors, writer) = self.split_strategy.data.popleft() + except IndexError: + break + + shard_num_tensors = len(tensors) if tensors else 0 + if tensors: - print(f"\nWriting to shard {i + 1}/{self.total_shards} with {len(tensors)}/{running_total} remaining tensors (of {self.total_tensors} total)") - for j, (name, tensor) in enumerate(tensors): - n_elements = int(np.prod(tensor.shape)) - # logic from convert.py - if getattr(tensor, 'data_type', None): - raw_dtype = getattr(tensor.data_type, 'ggml_type', None) - data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype - data_nbytes = tensor.data_type.elements_to_bytes(n_elements) - writer.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype) - # logic from convert-hf-to-gguf.py - else: - # stolen from write_tensor_data because that doesn't get called with this logic - elapsed = time.time() - start - size = ' x '.join(f"{dim:6d}" for 
dim in tensor.shape) - padi = len(str(self.total_tensors)) - dtype = str(tensor.dtype) - print( - f"[{j + 1:{padi}d}/{len(tensors)}] Writing tensor {name:38s} | size {size:16} | type {dtype:8} | T+{int(elapsed):4}" - ) - writer.add_tensor(name, tensor) - print(f"Writing to shard {i + 1}/{self.total_shards} with {len(tensors)}/{running_total} remaining tensors (of {self.total_tensors} total)") + while True: + try: + (name, tensor) = tensors.popleft() + except IndexError: + break + writer.add_tensor(name, tensor) + print(f"Writing to shard {ct + 1}/{self.total_shards} with {shard_num_tensors}/{running_total} remaining tensors (of {self.total_tensors} total)") + running_total -= shard_num_tensors writer.write_header_to_file() writer.write_kv_data_to_file() - writer.write_tensors_to_file() - - if tensors: - # TODO this shows up AFTER writing which we don't really want - move it - running_total -= len(tensors) - - if write_tensor_data: - # convert.py's write_tensor_data is dependent on so many objects in convert.py itself that it's easier to pass the function as a parameter and call it here - write_tensor_data(ftype, dict(tensors), concurrency, writer) + writer.write_tensors_to_file(progress=True) + ct = ct + 1 + del tensors def add_uint8(self, key: str, val: int) -> None: self.kv_data[key] = (val, GGUFValueType.UINT8) @@ -336,11 +317,6 @@ def add_array(self, key: str, val: Sequence[Any]) -> None: raise ValueError(f'Expected a sequence for {key}, got {type(val)}') self.kv_data[key] = (val, GGUFValueType.ARRAY) - # this method is exclusive to convert.py - we don't have LazyTensor so Any type is used - def add_tensor_info(self, name: str, tensor: Any) -> None: - self.tensors.append((name, tensor)) - - # these methods are everywhere but convert.py (and convert-lora-to-ggml.py since that doesn't use the class) def add_tensor( self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None, @@ -354,7 +330,7 @@ def add_tensor( # fp.seek(0) # self.temp_file = fp - self.add_tensor_info(name, tensor) + self.tensors.append((name, tensor)) #if self.temp_file is None: # self.tensors.append(tensor) @@ -363,12 +339,8 @@ def add_tensor( #tensor.tofile(self.temp_file) #self.write_padding(self.temp_file, tensor.nbytes) - def write_tensors_to_file(self) -> None: - # TODO WRITE - pass - def close(self) -> None: - for _, _, writer in self.split_strategy: + for _, _, writer in self.split_strategy.data: writer.close() def add_architecture(self) -> None: diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 8b41b54eaa5a6..964bf849c079a 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -301,6 +301,7 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: tensor.tofile(self.fout) bar.update(tensor.nbytes) self.write_padding(self.fout, tensor.nbytes) + del tensor return while True: try: From 6b5c3753c8b827866b98e5a401387bca0a091e93 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Fri, 24 May 2024 00:28:48 -0400 Subject: [PATCH 08/66] refactor SplitStrategy to be a deque Instead of having SplitStrategy have a `data` field that is a deque, just have SplitStrategy be a subclass of deque itself. 
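
A minimal sketch of the shape of that change, reduced to the container
plumbing (class names and fields simplified, not the real signatures):

    from collections import deque
    from typing import Any

    class WrappedStrategy:
        # before: a wrapper holding a deque, re-implementing the
        # container protocol by hand
        def __init__(self) -> None:
            self.data: deque[Any] = deque()

        def append(self, item: Any) -> None:
            self.data.append(item)

        def __len__(self) -> int:
            return len(self.data)

    class SubclassedStrategy(deque):
        # after: the strategy *is* a deque, so append, popleft,
        # iteration and len() are inherited from the base class
        def __init__(self, items: Any = ()) -> None:
            super().__init__(items)

Since deque already provides the container protocol, the forwarding methods
(__len__, append, remove) can simply be deleted, which is what the diff below
does, and the call sites switch from going through the old `data` field to
using the strategy object directly.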
--- gguf-py/gguf/gguf_manager.py | 53 ++++++++++++------------------------ 1 file changed, 18 insertions(+), 35 deletions(-) diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_manager.py index 4a51b717e23e6..95b16aee153f4 100644 --- a/gguf-py/gguf/gguf_manager.py +++ b/gguf-py/gguf/gguf_manager.py @@ -55,31 +55,23 @@ class SplitArguments: split_style: SplitStyle def __init__(self, args: Namespace = None) -> None: - if args is None: - self.split = False - self.dry_run = False - self.small_first_shard = False - self.split_max_tensors = 0 - self.split_max_size = 0 - self.split_style = SplitStyle.NONE - else: - self.split = args.split - self.split_max_tensors = args.split_max_tensors - self.split_max_size = SplitStrategy.split_str_to_n_bytes(args.split_max_size) if args.split_max_size else None - self.dry_run = args.dry_run - self.small_first_shard = not args.large_first_shard - self.split_style = SplitStyle.NONE if not self.split \ - else SplitStyle.TENSORS if self.split_max_tensors \ - else SplitStyle.SIZE + self.split = args.split if args else False + self.split_max_tensors = args.split_max_tensors if args else 0 + self.split_max_size = SplitStrategy.split_str_to_n_bytes(args.split_max_size) if args and args.split_max_size else 0 + self.dry_run = args.dry_run if args else False + self.small_first_shard = not args.large_first_shard if args else False + self.split_style = SplitStyle.NONE if not self.split or not args \ + else SplitStyle.TENSORS if self.split_max_tensors \ + else SplitStyle.SIZE -class SplitStrategy: +class SplitStrategy(deque): data: SplitTensorsPerFile def __init__(self, fname_out: os.PathLike[str], model: list[TensorTempData], arch: str, split_arguments: SplitArguments, use_temp_file: bool = True, endianess: GGUFEndian = GGUFEndian.LITTLE, ): - self.data = deque() + super().__init__() if split_arguments.split_style == SplitStyle.NONE: self.append((fname_out, model, GGUFWriter(fname_out, arch, use_temp_file=use_temp_file, endianess=endianess))) @@ -121,15 +113,6 @@ def __init__(self, fname_out: os.PathLike[str], model: list[TensorTempData], arc outname = fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + shard_offset, total_shards)) self.append((outname, deque(shard), GGUFWriter(outname, arch, use_temp_file=use_temp_file, endianess=endianess))) - def __len__(self): - return len(self.data) - - def append(self, value: TensorTempData): - self.data.append(value) - - def remove(self, item: TensorTempData): - self.data.remove(item) - @staticmethod def get_tensor_size(tensor) -> int: # we don't have the LazyTensor class here from convert.py but we can try @@ -216,18 +199,18 @@ def write_to_file(self, meta_only: bool = False) -> None: # only the first shard needs all the KV data for key, (value, etype) in self.kv_data.items(): - self.split_strategy.data[0][2].add_key(key) - self.split_strategy.data[0][2].add_val(value, etype) + self.split_strategy[0][2].add_key(key) + self.split_strategy[0][2].add_val(value, etype) if self.split_arguments.split_style != SplitStyle.NONE: - for i, (_, _, writer) in enumerate(self.split_strategy.data): + for i, (_, _, writer) in enumerate(self.split_strategy): writer.add_uint16(LLM_KV_SPLIT_NO, i) writer.add_uint16(LLM_KV_SPLIT_COUNT, self.total_shards) writer.add_int32(LLM_KV_SPLIT_TENSORS_COUNT, self.total_tensors) # metadata/vocab only can write and return here if meta_only: - for i, (_, _, writer) in enumerate(self.split_strategy.data): + for i, (_, _, writer) in enumerate(self.split_strategy): writer.write_header_to_file() 
writer.write_kv_data_to_file() return @@ -235,14 +218,14 @@ def write_to_file(self, meta_only: bool = False) -> None: # tensor writing code starts here print("\nWriting the following files:") - for (shard_path, shard_tensors, _) in self.split_strategy.data: + for (shard_path, shard_tensors, _) in self.split_strategy: size = SplitStrategy.format_n_bytes_to_str(sum(SplitStrategy.get_tensor_size(t[1]) for t in shard_tensors)) if shard_tensors else "negligible - metadata only" print(f" {shard_path}: n_tensors = {len(shard_tensors) if shard_tensors else 0}, total_size = {size}") if self.split_arguments.dry_run: print("\nDry run, not writing files") # instantiating GGUFWriters creates files - for name, _, _ in self.split_strategy.data: + for name, _, _ in self.split_strategy: os.remove(name) return @@ -251,7 +234,7 @@ def write_to_file(self, meta_only: bool = False) -> None: ct = 0 while True: try: - (_, tensors, writer) = self.split_strategy.data.popleft() + (_, tensors, writer) = self.split_strategy.popleft() except IndexError: break @@ -340,7 +323,7 @@ def add_tensor( #self.write_padding(self.temp_file, tensor.nbytes) def close(self) -> None: - for _, _, writer in self.split_strategy.data: + for _, _, writer in self.split_strategy: writer.close() def add_architecture(self) -> None: From 09baf2f3b5f8a8d3e779ced639b9c16f6c22dc4e Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Mon, 3 Jun 2024 08:58:29 -0400 Subject: [PATCH 09/66] fix Q8 quantization --- gguf-py/gguf/gguf_manager.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_manager.py index 95b16aee153f4..5696f88209706 100644 --- a/gguf-py/gguf/gguf_manager.py +++ b/gguf-py/gguf/gguf_manager.py @@ -37,7 +37,7 @@ SplitTensorsPerFile: TypeAlias = deque[tuple[os.PathLike[str], deque[tuple[str, Any]], GGUFWriter]] # [(outfile name, [(tensor name, tensor data)] for each tensor in file, filewriter)] KVTempData: TypeAlias = dict[str, tuple[Any, GGUFValueType]] # {key: (value, type)} -TensorTempData: TypeAlias = tuple[str, np.ndarray[Any, Any]] # (tensor name, tensor data), aka LazyModel +TensorTempData: TypeAlias = tuple[str, np.ndarray[Any, Any], GGMLQuantizationType] # (tensor name, tensor data, tensor dtype), aka LazyModel class SplitStyle(IntEnum): @@ -157,6 +157,7 @@ class GGUFManager: tensors: deque[TensorTempData] split_arguments: SplitArguments split_strategy: SplitStrategy + dtype: GGMLQuantizationType def __init__(self, path: os.PathLike[str] | str, arch: str, split_arguments: SplitArguments, use_temp_file: bool = True, endianess: GGUFEndian = GGUFEndian.LITTLE @@ -243,10 +244,10 @@ def write_to_file(self, meta_only: bool = False) -> None: if tensors: while True: try: - (name, tensor) = tensors.popleft() + (name, tensor, dtype) = tensors.popleft() except IndexError: break - writer.add_tensor(name, tensor) + writer.add_tensor(name, tensor, raw_dtype=dtype) print(f"Writing to shard {ct + 1}/{self.total_shards} with {shard_num_tensors}/{running_total} remaining tensors (of {self.total_tensors} total)") running_total -= shard_num_tensors @@ -313,7 +314,7 @@ def add_tensor( # fp.seek(0) # self.temp_file = fp - self.tensors.append((name, tensor)) + self.tensors.append((name, tensor, raw_dtype)) #if self.temp_file is None: # self.tensors.append(tensor) From 240243e63f380f31772a47b0646048e0a2c0bae9 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Mon, 3 Jun 2024 09:01:42 -0400 Subject: [PATCH 10/66] remove unnecessary imports in gguf_manager --- 
gguf-py/gguf/gguf_manager.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_manager.py index 5696f88209706..a60ce9867f34c 100644 --- a/gguf-py/gguf/gguf_manager.py +++ b/gguf-py/gguf/gguf_manager.py @@ -1,10 +1,6 @@ from __future__ import annotations import os -import shutil -import struct -import tempfile -import time from enum import IntEnum from typing import TYPE_CHECKING, Any, Sequence, Mapping from string import ascii_letters, digits From a9c7703c12b10085c2a68879fb8a116e82b7512f Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Mon, 3 Jun 2024 09:18:19 -0400 Subject: [PATCH 11/66] fix final? merge issue --- convert-hf-to-gguf.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index b6867fdea5fe6..ff9c74ea9ed8a 100644 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -27,8 +27,6 @@ gguf = importlib.import_module("gguf-py.gguf") # import gguf -from convert import LlamaHfVocab - logger = logging.getLogger("hf-to-gguf") From efead0408c211600412be5598bb23ca7129007fa Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Mon, 3 Jun 2024 19:34:01 -0400 Subject: [PATCH 12/66] fix gguf_writer placement and remove comments --- convert-hf-to-gguf.py | 2 +- gguf-py/gguf/gguf_manager.py | 12 +----------- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index ff9c74ea9ed8a..3d8cdc8119255 100644 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -56,11 +56,11 @@ class Model: part_names: list[str] is_safetensors: bool hparams: dict[str, Any] - gguf_writer: gguf.GGUFManager block_count: int tensor_map: gguf.TensorNameMap tensor_names: set[str] | None fname_out: Path + gguf_writer: gguf.GGUFManager # subclasses should define this! model_arch: gguf.MODEL_ARCH diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_manager.py index a60ce9867f34c..cafe8abffc4e4 100644 --- a/gguf-py/gguf/gguf_manager.py +++ b/gguf-py/gguf/gguf_manager.py @@ -305,20 +305,10 @@ def add_tensor( tensor.byteswap(inplace=True) # TODO reimplement temp file - #if self.use_temp_file and self.temp_file is None: - # fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256 * 1024 * 1024) - # fp.seek(0) - # self.temp_file = fp + # I'm pretty sure it gets handled per shard? self.tensors.append((name, tensor, raw_dtype)) - #if self.temp_file is None: - # self.tensors.append(tensor) - # return - - #tensor.tofile(self.temp_file) - #self.write_padding(self.temp_file, tensor.nbytes) - def close(self) -> None: for _, _, writer in self.split_strategy: writer.close() From c8ecbc67e2b65623070cab8dde69394b3cf0b384 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Mon, 3 Jun 2024 19:34:37 -0400 Subject: [PATCH 13/66] oops, actually fix gguf_writer placement --- convert-hf-to-gguf-update.py | 7 +++++++ convert-hf-to-gguf.py | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index 84b72348dc579..8ea2d82e3f953 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -81,7 +81,14 @@ class TOKENIZER_TYPE(IntEnum): {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM! 
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", }, {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, +<<<<<<< Updated upstream {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", }, +======= + {"name": "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom-7b1", }, + {"name": "gptbigcode", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/gpt_bigcode-santacoder", }, + {"name": "phi2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", }, + {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B-Chat", }, +>>>>>>> Stashed changes ] diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 3d8cdc8119255..e415692babd03 100644 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -82,8 +82,6 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.part_names = Model.get_model_part_names(self.dir_model, ".bin") self.hparams = Model.load_hparams(self.dir_model) - self.gguf_writer = gguf.GGUFManager(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], split_arguments, - endianess=self.endianess, use_temp_file=self.use_temp_file) self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_names = None @@ -100,6 +98,8 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, ftype_lw: str = ftype_up.lower() # allow templating the file name with the output ftype, useful with the "auto" ftype self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) + self.gguf_writer = gguf.GGUFManager(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], split_arguments, + endianess=self.endianess, use_temp_file=self.use_temp_file) @classmethod def __init_subclass__(cls): From 3e9430df33c1c0f63087365b10aaa2284e1d4b5a Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Wed, 5 Jun 2024 09:29:33 -0400 Subject: [PATCH 14/66] reduce duplicated code from gguf_writer --- gguf-py/gguf/gguf_manager.py | 310 +++-------------------------------- 1 file changed, 24 insertions(+), 286 deletions(-) diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_manager.py index cafe8abffc4e4..13a2f0eead459 100644 --- a/gguf-py/gguf/gguf_manager.py +++ b/gguf-py/gguf/gguf_manager.py @@ -16,11 +16,7 @@ from .constants import ( GGMLQuantizationType, GGUFEndian, - GGUFValueType, - Keys, - RopeScalingType, - PoolingType, - TokenType, + GGUFValueType ) from .gguf_writer import GGUFWriter @@ -33,7 +29,7 @@ SplitTensorsPerFile: TypeAlias = deque[tuple[os.PathLike[str], deque[tuple[str, Any]], GGUFWriter]] # [(outfile name, [(tensor name, tensor data)] for each tensor in file, filewriter)] KVTempData: TypeAlias = dict[str, tuple[Any, GGUFValueType]] # {key: (value, type)} -TensorTempData: TypeAlias = tuple[str, np.ndarray[Any, Any], GGMLQuantizationType] # (tensor name, tensor data, tensor dtype), aka LazyModel +TensorTempData: TypeAlias = tuple[str, np.ndarray[Any, Any], GGMLQuantizationType] # (tensor name, tensor data, tensor dtype) class SplitStyle(IntEnum): @@ -43,13 +39,6 @@ class SplitStyle(IntEnum): class SplitArguments: - split: bool - dry_run: 
bool - small_first_shard: bool - split_max_tensors: int - split_max_size: int - split_style: SplitStyle - def __init__(self, args: Namespace = None) -> None: self.split = args.split if args else False self.split_max_tensors = args.split_max_tensors if args else 0 @@ -107,7 +96,7 @@ def __init__(self, fname_out: os.PathLike[str], model: list[TensorTempData], arc for i, shard in enumerate(shards): outname = fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + shard_offset, total_shards)) - self.append((outname, deque(shard), GGUFWriter(outname, arch, use_temp_file=use_temp_file, endianess=endianess))) + self.append((outname, shard, GGUFWriter(outname, arch, use_temp_file=use_temp_file, endianess=endianess))) @staticmethod def get_tensor_size(tensor) -> int: @@ -146,35 +135,34 @@ def format_n_bytes_to_str(num: int) -> str: num /= 1024.0 return f"{num:.1f}T - over 1TB, --split recommended" - -# ideally this has most of the same signatures as GGUFWriter so it's nearly a drop-in replacement -class GGUFManager: +# TODO fall back to normal GGUFWriter in convert-hf-to-gguf.py if no --split +class GGUFManager(GGUFWriter): kv_data: KVTempData - tensors: deque[TensorTempData] + tensors: list[TensorTempData] split_arguments: SplitArguments split_strategy: SplitStrategy - dtype: GGMLQuantizationType def __init__(self, path: os.PathLike[str] | str, arch: str, split_arguments: SplitArguments, use_temp_file: bool = True, endianess: GGUFEndian = GGUFEndian.LITTLE ) -> None: + # TODO be able to use superclass constructor + # super().__init__(path, arch, use_temp_file=use_temp_file, endianess=endianess) self.arch = arch self.path = path self.endianess = endianess self.offset_tensor = 0 self.kv_data = {} - self.tensors = deque() + self.tensors = [] + # TODO how many of these do you need self.split_strategy = None self.total_shards = None self.total_tensors = None self.use_temp_file = use_temp_file self.split_arguments = split_arguments - + self.recent_key = None self.add_architecture() - # have to consolidate because we need to know kv data count and tensor count before we can write the header - # and we need to write tensor info before we can write metadata - # these all kinda show up around the same places anyway so it's not a huge deal? 
+ # TODO split back into write_header_to_file, write_kv_data_to_file, write_ti_data_to_file def write_to_file(self, meta_only: bool = False) -> None: # here is the first place you can assume you have all tensors written and you can establish the size of the file - so logic goes here @@ -232,11 +220,12 @@ def write_to_file(self, meta_only: bool = False) -> None: while True: try: (_, tensors, writer) = self.split_strategy.popleft() + tensors = deque(tensors) if tensors else None except IndexError: break shard_num_tensors = len(tensors) if tensors else 0 - + if tensors: while True: try: @@ -254,44 +243,16 @@ def write_to_file(self, meta_only: bool = False) -> None: ct = ct + 1 del tensors - def add_uint8(self, key: str, val: int) -> None: - self.kv_data[key] = (val, GGUFValueType.UINT8) - - def add_int8(self, key: str, val: int) -> None: - self.kv_data[key] = (val, GGUFValueType.INT8) - - def add_uint16(self, key: str, val: int) -> None: - self.kv_data[key] = (val, GGUFValueType.UINT16) - - def add_int16(self, key: str, val: int) -> None: - self.kv_data[key] = (val, GGUFValueType.INT16) - - def add_uint32(self, key: str, val: int) -> None: - self.kv_data[key] = (val, GGUFValueType.UINT32) - - def add_int32(self, key: str, val: int) -> None: - self.kv_data[key] = (val, GGUFValueType.INT32) - - def add_float32(self, key: str, val: float) -> None: - self.kv_data[key] = (val, GGUFValueType.FLOAT32) - - def add_uint64(self, key: str, val: int) -> None: - self.kv_data[key] = (val, GGUFValueType.UINT64) - - def add_int64(self, key: str, val: int) -> None: - self.kv_data[key] = (val, GGUFValueType.INT64) - - def add_float64(self, key: str, val: float) -> None: - self.kv_data[key] = (val, GGUFValueType.FLOAT64) - - def add_bool(self, key: str, val: bool) -> None: - self.kv_data[key] = (val, GGUFValueType.BOOL) - - def add_string(self, key: str, val: str) -> None: - if not val: - return - self.kv_data[key] = (val, GGUFValueType.STRING) + # override add_key, add_val to handle kv data separately + def add_key(self, key: str) -> None: + self.recent_key = key + + def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True) -> None: + if self.recent_key is None: + raise ValueError("No key set for value") + self.kv_data[self.recent_key] = (val, vtype) + # need to handle arrays separately def add_array(self, key: str, val: Sequence[Any]) -> None: if not isinstance(val, Sequence): raise ValueError(f'Expected a sequence for {key}, got {type(val)}') @@ -303,231 +264,8 @@ def add_tensor( ) -> None: if self.endianess == GGUFEndian.BIG: tensor.byteswap(inplace=True) - - # TODO reimplement temp file - # I'm pretty sure it gets handled per shard? 
- self.tensors.append((name, tensor, raw_dtype)) def close(self) -> None: for _, _, writer in self.split_strategy: - writer.close() - - def add_architecture(self) -> None: - self.add_string(Keys.General.ARCHITECTURE, self.arch) - - def add_author(self, author: str) -> None: - self.add_string(Keys.General.AUTHOR, author) - - def add_version(self, version: str) -> None: - self.add_string(Keys.General.VERSION, version) - - def add_tensor_data_layout(self, layout: str) -> None: - self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout) - - def add_url(self, url: str) -> None: - self.add_string(Keys.General.URL, url) - - def add_description(self, description: str) -> None: - self.add_string(Keys.General.DESCRIPTION, description) - - def add_licence(self, licence: str) -> None: - self.add_string(Keys.General.LICENSE, licence) - - def add_source_url(self, url: str) -> None: - self.add_string(Keys.General.SOURCE_URL, url) - - def add_source_hf_repo(self, repo: str) -> None: - self.add_string(Keys.General.SOURCE_HF_REPO, repo) - - def add_file_type(self, ftype: int) -> None: - self.add_uint32(Keys.General.FILE_TYPE, ftype) - - def add_name(self, name: str) -> None: - self.add_string(Keys.General.NAME, name) - - def add_quantization_version(self, quantization_version: GGMLQuantizationType) -> None: - self.add_uint32(Keys.General.QUANTIZATION_VERSION, quantization_version) - - def add_custom_alignment(self, alignment: int) -> None: - self.data_alignment = alignment - self.add_uint32(Keys.General.ALIGNMENT, alignment) - - def add_vocab_size(self, size: int) -> None: - self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size) - - def add_context_length(self, length: int) -> None: - self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length) - - def add_embedding_length(self, length: int) -> None: - self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length) - - def add_block_count(self, length: int) -> None: - self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length) - - def add_feed_forward_length(self, length: int) -> None: - self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length) - - def add_parallel_residual(self, use: bool) -> None: - self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use) - - def add_head_count(self, count: int) -> None: - self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count) - - def add_head_count_kv(self, count: int) -> None: - self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count) - - def add_key_length(self, length: int) -> None: - self.add_uint32(Keys.Attention.KEY_LENGTH.format(arch=self.arch), length) - - def add_value_length(self, length: int) -> None: - self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length) - - def add_max_alibi_bias(self, bias: float) -> None: - self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias) - - def add_clamp_kqv(self, value: float) -> None: - self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value) - - def add_logit_scale(self, value: float) -> None: - self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value) - - def add_expert_count(self, count: int) -> None: - self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count) - - def add_expert_used_count(self, count: int) -> None: - self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count) - - def add_layer_norm_eps(self, value: float) -> None: - 
self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value) - - def add_layer_norm_rms_eps(self, value: float) -> None: - self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value) - - def add_causal_attention(self, value: bool) -> None: - self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value) - - def add_pooling_type(self, value: PoolingType) -> None: - self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value) - - def add_rope_dimension_count(self, count: int) -> None: - self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) - - def add_rope_freq_base(self, value: float) -> None: - self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value) - - def add_rope_scaling_type(self, value: RopeScalingType) -> None: - self.add_string(Keys.Rope.SCALING_TYPE.format(arch=self.arch), value.value) - - def add_rope_scaling_factor(self, value: float) -> None: - self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value) - - def add_rope_scaling_orig_ctx_len(self, value: int) -> None: - self.add_uint32(Keys.Rope.SCALING_ORIG_CTX_LEN.format(arch=self.arch), value) - - def add_rope_scaling_finetuned(self, value: bool) -> None: - self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value) - - def add_ssm_conv_kernel(self, value: int) -> None: - self.add_uint32(Keys.SSM.CONV_KERNEL.format(arch=self.arch), value) - - def add_ssm_inner_size(self, value: int) -> None: - self.add_uint32(Keys.SSM.INNER_SIZE.format(arch=self.arch), value) - - def add_ssm_state_size(self, value: int) -> None: - self.add_uint32(Keys.SSM.STATE_SIZE.format(arch=self.arch), value) - - def add_ssm_time_step_rank(self, value: int) -> None: - self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value) - - def add_tokenizer_model(self, model: str) -> None: - self.add_string(Keys.Tokenizer.MODEL, model) - - def add_tokenizer_pre(self, pre: str) -> None: - self.add_string(Keys.Tokenizer.PRE, pre) - - def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None: - self.add_array(Keys.Tokenizer.LIST, tokens) - - def add_token_merges(self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None: - self.add_array(Keys.Tokenizer.MERGES, merges) - - def add_token_types(self, types: Sequence[TokenType] | Sequence[int]) -> None: - self.add_array(Keys.Tokenizer.TOKEN_TYPE, types) - - def add_token_type_count(self, value: int) -> None: - self.add_uint32(Keys.Tokenizer.TOKEN_TYPE_COUNT, value) - - def add_token_scores(self, scores: Sequence[float]) -> None: - self.add_array(Keys.Tokenizer.SCORES, scores) - - def add_bos_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.BOS_ID, id) - - def add_eos_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.EOS_ID, id) - - def add_unk_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.UNK_ID, id) - - def add_sep_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.SEP_ID, id) - - def add_pad_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.PAD_ID, id) - - def add_cls_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.CLS_ID, id) - - def add_mask_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.MASK_ID, id) - - def add_add_bos_token(self, value: bool) -> None: - self.add_bool(Keys.Tokenizer.ADD_BOS, value) - - def add_add_eos_token(self, value: bool) -> None: - self.add_bool(Keys.Tokenizer.ADD_EOS, value) - - def 
add_add_space_prefix(self, value: bool) -> None: - self.add_bool(Keys.Tokenizer.ADD_PREFIX, value) - - def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None: - if isinstance(value, list): - template_default = None - template_names = set() - - for choice in value: - name = choice.get('name', '') - template = choice.get('template') - - # Allowing non-alphanumerical characters in template name is probably not a good idea, so filter it - name = ''.join((c if c in ascii_letters + digits else '_' for c in name)) - - if name and template is not None: - if name == 'default': - template_default = template - else: - template_names.add(name) - self.add_string(Keys.Tokenizer.CHAT_TEMPLATE_N.format(name=name), template) - - if template_names: - self.add_array(Keys.Tokenizer.CHAT_TEMPLATES, list(template_names)) - - if template_default is None: - return - - value = template_default - - self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) - - def add_prefix_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.PREFIX_ID, id) - - def add_suffix_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id) - - def add_middle_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id) - - def add_eot_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.EOT_ID, id) \ No newline at end of file + writer.close() \ No newline at end of file From f6fd3ea4e9a0a68dadbbd3956778672b7735e2d5 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Wed, 5 Jun 2024 12:28:40 -0400 Subject: [PATCH 15/66] further simplify GGUFManager --- convert-hf-to-gguf-update.py | 7 ---- convert-hf-to-gguf.py | 12 ++++-- gguf-py/gguf/gguf_manager.py | 80 +++++++++++++++++++----------------- 3 files changed, 51 insertions(+), 48 deletions(-) diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index 8ea2d82e3f953..84b72348dc579 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -81,14 +81,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM! {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", }, {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, -<<<<<<< Updated upstream {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", }, -======= - {"name": "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom-7b1", }, - {"name": "gptbigcode", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/gpt_bigcode-santacoder", }, - {"name": "phi2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", }, - {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B-Chat", }, ->>>>>>> Stashed changes ] diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index e415692babd03..4b3dfdd707acd 100644 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -60,7 +60,7 @@ class Model: tensor_map: gguf.TensorNameMap tensor_names: set[str] | None fname_out: Path - gguf_writer: gguf.GGUFManager + gguf_writer: gguf.GGUFWriter # subclasses should define this! 
model_arch: gguf.MODEL_ARCH @@ -329,11 +329,16 @@ def write_tensors(self): def write(self): self.write_tensors() - self.gguf_writer.write_to_file() + self.gguf_writer.write_header_to_file() + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_ti_data_to_file() self.gguf_writer.close() def write_vocab(self): - self.gguf_writer.write_to_file(meta_only=True) + if self.gguf_writer.split_arguments.split: + raise ValueError('Splitting the vocabulary is not supported') + self.gguf_writer.write_header_to_file() + self.gguf_writer.write_kv_data_to_file() self.gguf_writer.close() @staticmethod @@ -1563,7 +1568,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -# TODO what the hell is this? @Model.register("QWenLMHeadModel") class QwenModel(Model): model_arch = gguf.MODEL_ARCH.QWEN diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_manager.py index 13a2f0eead459..aeec9642cde6d 100644 --- a/gguf-py/gguf/gguf_manager.py +++ b/gguf-py/gguf/gguf_manager.py @@ -2,8 +2,7 @@ import os from enum import IntEnum -from typing import TYPE_CHECKING, Any, Sequence, Mapping -from string import ascii_letters, digits +from typing import TYPE_CHECKING, Any, Sequence from argparse import Namespace from math import ceil from collections import deque @@ -18,7 +17,7 @@ GGUFEndian, GGUFValueType ) -from .gguf_writer import GGUFWriter +from .gguf_writer import GGUFWriter, WriterState SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf" @@ -74,7 +73,7 @@ def __init__(self, fname_out: os.PathLike[str], model: list[TensorTempData], arc self.append((shard, model[start:stop], GGUFWriter(shard, arch, use_temp_file=use_temp_file, endianess=endianess))) elif split_arguments.split_style == SplitStyle.SIZE: - shards = deque() + shards = [] # we have to determine the shards first to determine how many shards there will be in total - two passes for i, shard in enumerate(model): @@ -135,7 +134,6 @@ def format_n_bytes_to_str(num: int) -> str: num /= 1024.0 return f"{num:.1f}T - over 1TB, --split recommended" -# TODO fall back to normal GGUFWriter in convert-hf-to-gguf.py if no --split class GGUFManager(GGUFWriter): kv_data: KVTempData tensors: list[TensorTempData] @@ -145,27 +143,25 @@ class GGUFManager(GGUFWriter): def __init__(self, path: os.PathLike[str] | str, arch: str, split_arguments: SplitArguments, use_temp_file: bool = True, endianess: GGUFEndian = GGUFEndian.LITTLE ) -> None: - # TODO be able to use superclass constructor - # super().__init__(path, arch, use_temp_file=use_temp_file, endianess=endianess) + # we intentionally don't call superclass constructor self.arch = arch self.path = path self.endianess = endianess - self.offset_tensor = 0 self.kv_data = {} self.tensors = [] - # TODO how many of these do you need self.split_strategy = None - self.total_shards = None - self.total_tensors = None + self.total_shards = 0 + self.total_tensors = 0 self.use_temp_file = use_temp_file self.split_arguments = split_arguments self.recent_key = None + self.state = WriterState.EMPTY self.add_architecture() - # TODO split back into write_header_to_file, write_kv_data_to_file, write_ti_data_to_file - def write_to_file(self, meta_only: bool = False) -> None: + def write_header_to_file(self) -> None: + if self.state is not WriterState.EMPTY: + raise ValueError(f'Expected GGUFManager state to be EMPTY, got {self.state}') - # here is the first place you can assume you have all tensors written and you can establish the size of the file - 
so logic goes here self.total_tensors = len(self.tensors) total_size = sum(SplitStrategy.get_tensor_size(tensor[1]) for tensor in self.tensors) @@ -182,42 +178,50 @@ def write_to_file(self, meta_only: bool = False) -> None: del self.tensors self.total_shards = len(self.split_strategy) + print("\nWriting the following files:") + for (shard_path, shard_tensors, _) in self.split_strategy: + size = SplitStrategy.format_n_bytes_to_str(sum(SplitStrategy.get_tensor_size(t[1]) for t in shard_tensors)) if shard_tensors else "negligible - metadata only" + print(f" {shard_path}: n_tensors = {len(shard_tensors) if shard_tensors else 0}, total_size = {size}") + + if self.split_arguments.dry_run: + print("\nDry run, not writing files") + # instantiating GGUFWriters creates files + for name, _, _ in self.split_strategy: + os.remove(name) + return + + self.state = WriterState.HEADER + + def write_kv_data_to_file(self) -> None: + if self.split_arguments.dry_run: + return + + if self.state is not WriterState.HEADER: + raise ValueError(f'Expected GGUFManager state to be HEADER, got {self.state}') + # only the first shard needs all the KV data for key, (value, etype) in self.kv_data.items(): self.split_strategy[0][2].add_key(key) self.split_strategy[0][2].add_val(value, etype) + # the other shards need shard data if self.split_arguments.split_style != SplitStyle.NONE: for i, (_, _, writer) in enumerate(self.split_strategy): writer.add_uint16(LLM_KV_SPLIT_NO, i) writer.add_uint16(LLM_KV_SPLIT_COUNT, self.total_shards) writer.add_int32(LLM_KV_SPLIT_TENSORS_COUNT, self.total_tensors) - # metadata/vocab only can write and return here - if meta_only: - for i, (_, _, writer) in enumerate(self.split_strategy): - writer.write_header_to_file() - writer.write_kv_data_to_file() - return - - # tensor writing code starts here - - print("\nWriting the following files:") - for (shard_path, shard_tensors, _) in self.split_strategy: - size = SplitStrategy.format_n_bytes_to_str(sum(SplitStrategy.get_tensor_size(t[1]) for t in shard_tensors)) if shard_tensors else "negligible - metadata only" - print(f" {shard_path}: n_tensors = {len(shard_tensors) if shard_tensors else 0}, total_size = {size}") + self.state = WriterState.KV_DATA + def write_ti_data_to_file(self) -> None: if self.split_arguments.dry_run: - print("\nDry run, not writing files") - # instantiating GGUFWriters creates files - for name, _, _ in self.split_strategy: - os.remove(name) return - # run add_tensor_info, write data, then write_tensor_data - taken from convert.py + if self.state is not WriterState.KV_DATA: + raise ValueError(f'Expected GGUFManager state to be KV_DATA, got {self.state}') + running_total = self.total_tensors - ct = 0 - while True: + for ct in range(self.total_shards): try: (_, tensors, writer) = self.split_strategy.popleft() tensors = deque(tensors) if tensors else None @@ -234,15 +238,17 @@ def write_to_file(self, meta_only: bool = False) -> None: break writer.add_tensor(name, tensor, raw_dtype=dtype) - print(f"Writing to shard {ct + 1}/{self.total_shards} with {shard_num_tensors}/{running_total} remaining tensors (of {self.total_tensors} total)") - running_total -= shard_num_tensors + print(f"Writing to shard {ct}/{self.total_shards} with {shard_num_tensors}/{running_total} remaining tensors (of {self.total_tensors} total)") + running_total -= shard_num_tensors + # need to write everything down here writer.write_header_to_file() writer.write_kv_data_to_file() writer.write_tensors_to_file(progress=True) - ct = ct + 1 del tensors + self.state 
= WriterState.TI_DATA + # override add_key, add_val to handle kv data separately def add_key(self, key: str) -> None: self.recent_key = key From bb5ee0209621ee9343554b9ab5907c968ae47e98 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Wed, 5 Jun 2024 12:49:08 -0400 Subject: [PATCH 16/66] simplify even further and standardize with GGUFWriter --- convert-hf-to-gguf.py | 2 +- gguf-py/gguf/gguf_manager.py | 41 ++++++++++-------------------------- 2 files changed, 12 insertions(+), 31 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 4b3dfdd707acd..d12373c4136fb 100644 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -331,7 +331,7 @@ def write(self): self.write_tensors() self.gguf_writer.write_header_to_file() self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_ti_data_to_file() + self.gguf_writer.write_tensors_to_file() self.gguf_writer.close() def write_vocab(self): diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_manager.py index aeec9642cde6d..002d138776849 100644 --- a/gguf-py/gguf/gguf_manager.py +++ b/gguf-py/gguf/gguf_manager.py @@ -73,33 +73,24 @@ def __init__(self, fname_out: os.PathLike[str], model: list[TensorTempData], arc self.append((shard, model[start:stop], GGUFWriter(shard, arch, use_temp_file=use_temp_file, endianess=endianess))) elif split_arguments.split_style == SplitStyle.SIZE: - shards = [] + shards = [[model[0]]] # we have to determine the shards first to determine how many shards there will be in total - two passes - for i, shard in enumerate(model): - if i == 0: - shards.append([shard]) - continue + for i, shard in enumerate(model[1:]): if SplitStrategy.get_tensor_size(shard[1]) + sum(SplitStrategy.get_tensor_size(t[1]) for t in shards[-1]) > split_arguments.split_max_size: shards.append([shard]) else: shards[-1].append(shard) - total_shards = len(shards) + split_arguments.small_first_shard - shard_offset = 1 - if split_arguments.small_first_shard: - outname = fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, shard_offset, total_shards)) - self.append((outname, None, GGUFWriter(outname, arch, use_temp_file=use_temp_file, endianess=endianess))) - shard_offset += 1 + shards.insert(0, None) for i, shard in enumerate(shards): - outname = fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + shard_offset, total_shards)) + outname = fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + 1, len(shards))) self.append((outname, shard, GGUFWriter(outname, arch, use_temp_file=use_temp_file, endianess=endianess))) @staticmethod def get_tensor_size(tensor) -> int: - # we don't have the LazyTensor class here from convert.py but we can try try: return tensor.data_type.elements_to_bytes(np.prod(tensor.shape)) except AttributeError: # numpy ndarray[Any, Any] @@ -213,7 +204,7 @@ def write_kv_data_to_file(self) -> None: self.state = WriterState.KV_DATA - def write_ti_data_to_file(self) -> None: + def write_tensors_to_file(self) -> None: if self.split_arguments.dry_run: return @@ -222,25 +213,17 @@ def write_ti_data_to_file(self) -> None: running_total = self.total_tensors for ct in range(self.total_shards): - try: - (_, tensors, writer) = self.split_strategy.popleft() - tensors = deque(tensors) if tensors else None - except IndexError: - break + (_, tensors, writer) = self.split_strategy.popleft() + tensors = deque(tensors) if tensors else None shard_num_tensors = len(tensors) if tensors else 0 - - if tensors: - while True: - try: - (name, tensor, dtype) = tensors.popleft() - except 
IndexError: - break - writer.add_tensor(name, tensor, raw_dtype=dtype) - print(f"Writing to shard {ct}/{self.total_shards} with {shard_num_tensors}/{running_total} remaining tensors (of {self.total_tensors} total)") running_total -= shard_num_tensors + for _ in range(shard_num_tensors): + (name, tensor, dtype) = tensors.popleft() + writer.add_tensor(name, tensor, raw_dtype=dtype) + # need to write everything down here writer.write_header_to_file() writer.write_kv_data_to_file() @@ -268,8 +251,6 @@ def add_tensor( self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None, ) -> None: - if self.endianess == GGUFEndian.BIG: - tensor.byteswap(inplace=True) self.tensors.append((name, tensor, raw_dtype)) def close(self) -> None: From 5ad397d6104cf21156c8cf9730c81e74e5c413a9 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Wed, 5 Jun 2024 13:49:20 -0400 Subject: [PATCH 17/66] reduce diffs with master --- convert-hf-to-gguf.py | 5 ++--- gguf-py/gguf/gguf_manager.py | 5 +++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index d12373c4136fb..b4399f6803e6d 100644 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -77,10 +77,8 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.lazy = not eager self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors") self.is_safetensors = len(self.part_names) > 0 - if not self.is_safetensors: self.part_names = Model.get_model_part_names(self.dir_model, ".bin") - self.hparams = Model.load_hparams(self.dir_model) self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) @@ -331,7 +329,7 @@ def write(self): self.write_tensors() self.gguf_writer.write_header_to_file() self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file() + self.gguf_writer.write_tensors_to_file(progress=True) self.gguf_writer.close() def write_vocab(self): @@ -1568,6 +1566,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] + @Model.register("QWenLMHeadModel") class QwenModel(Model): model_arch = gguf.MODEL_ARCH.QWEN diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_manager.py index 002d138776849..2605b816a76d4 100644 --- a/gguf-py/gguf/gguf_manager.py +++ b/gguf-py/gguf/gguf_manager.py @@ -125,6 +125,7 @@ def format_n_bytes_to_str(num: int) -> str: num /= 1024.0 return f"{num:.1f}T - over 1TB, --split recommended" + class GGUFManager(GGUFWriter): kv_data: KVTempData tensors: list[TensorTempData] @@ -204,7 +205,7 @@ def write_kv_data_to_file(self) -> None: self.state = WriterState.KV_DATA - def write_tensors_to_file(self) -> None: + def write_tensors_to_file(self, progress: bool = False) -> None: if self.split_arguments.dry_run: return @@ -227,7 +228,7 @@ def write_tensors_to_file(self) -> None: # need to write everything down here writer.write_header_to_file() writer.write_kv_data_to_file() - writer.write_tensors_to_file(progress=True) + writer.write_tensors_to_file(progress=progress) del tensors self.state = WriterState.TI_DATA From ce7e6985d2b233f0bbd62969689962635aca0898 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Wed, 5 Jun 2024 18:29:39 -0400 Subject: [PATCH 18/66] form shards while adding tensors, SHA256 sums agree with master --- convert-hf-to-gguf.py | 
8 +- gguf-py/gguf/gguf_manager.py | 285 +++++++++++++++++------------------ gguf-py/gguf/gguf_writer.py | 5 +- 3 files changed, 148 insertions(+), 150 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index b4399f6803e6d..b6fd4bc4b972d 100644 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -327,6 +327,7 @@ def write_tensors(self): def write(self): self.write_tensors() + self.gguf_writer.init_shards() self.gguf_writer.write_header_to_file() self.gguf_writer.write_kv_data_to_file() self.gguf_writer.write_tensors_to_file(progress=True) @@ -335,6 +336,7 @@ def write(self): def write_vocab(self): if self.gguf_writer.split_arguments.split: raise ValueError('Splitting the vocabulary is not supported') + self.gguf_writer.init_shards() self.gguf_writer.write_header_to_file() self.gguf_writer.write_kv_data_to_file() self.gguf_writer.close() @@ -2816,8 +2818,8 @@ def parse_args() -> argparse.Namespace: help="only print out a split plan and exit, without writing any new files" ) parser.add_argument( - "--large-first-shard", action="store_true", - help="include tensors in the first shard when splitting (default: metadata only)" + "--small-first-shard", action="store_true", + help="do not add tensors to the first shard (disabled by default)" ) return parser.parse_args() @@ -2853,7 +2855,7 @@ def main() -> None: if args.split_max_tensors and args.split_max_size: raise ValueError("Can't specify both --split-max-tensors and --split-max-size") - split_arguments = gguf.SplitArguments(args=args) if args.split else gguf.SplitArguments() + split_arguments = gguf.SplitArguments(args) ftype_map = { "f32": gguf.LlamaFileType.ALL_F32, diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_manager.py index 2605b816a76d4..2fcaf3edfc34f 100644 --- a/gguf-py/gguf/gguf_manager.py +++ b/gguf-py/gguf/gguf_manager.py @@ -4,7 +4,6 @@ from enum import IntEnum from typing import TYPE_CHECKING, Any, Sequence from argparse import Namespace -from math import ceil from collections import deque import numpy as np @@ -21,14 +20,15 @@ SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf" +METADATA_ONLY_INDICATOR = -1 LLM_KV_SPLIT_NO = "split.no" LLM_KV_SPLIT_COUNT = "split.count" LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count" -SplitTensorsPerFile: TypeAlias = deque[tuple[os.PathLike[str], deque[tuple[str, Any]], GGUFWriter]] # [(outfile name, [(tensor name, tensor data)] for each tensor in file, filewriter)] KVTempData: TypeAlias = dict[str, tuple[Any, GGUFValueType]] # {key: (value, type)} TensorTempData: TypeAlias = tuple[str, np.ndarray[Any, Any], GGMLQuantizationType] # (tensor name, tensor data, tensor dtype) +Shard: TypeAlias = list[os.PathLike[str], int, int, deque[TensorTempData]] # [shard filename, shard tensor count, shard size, [tensor data]] class SplitStyle(IntEnum): @@ -38,99 +38,23 @@ class SplitStyle(IntEnum): class SplitArguments: - def __init__(self, args: Namespace = None) -> None: - self.split = args.split if args else False - self.split_max_tensors = args.split_max_tensors if args else 0 - self.split_max_size = SplitStrategy.split_str_to_n_bytes(args.split_max_size) if args and args.split_max_size else 0 - self.dry_run = args.dry_run if args else False - self.small_first_shard = not args.large_first_shard if args else False - self.split_style = SplitStyle.NONE if not self.split or not args \ + def __init__(self, args: Namespace) -> None: + self.split = args.split + self.split_max_tensors = args.split_max_tensors if args.split else 0 + self.split_max_size = 
GGUFManager.split_str_to_n_bytes(args.split_max_size) if args.split and args.split_max_size else 0 + self.split_style = SplitStyle.NONE if not self.split \ else SplitStyle.TENSORS if self.split_max_tensors \ else SplitStyle.SIZE - - -class SplitStrategy(deque): - data: SplitTensorsPerFile - - def __init__(self, fname_out: os.PathLike[str], model: list[TensorTempData], arch: str, - split_arguments: SplitArguments, use_temp_file: bool = True, endianess: GGUFEndian = GGUFEndian.LITTLE, - ): - super().__init__() - - if split_arguments.split_style == SplitStyle.NONE: - self.append((fname_out, model, GGUFWriter(fname_out, arch, use_temp_file=use_temp_file, endianess=endianess))) - - elif split_arguments.split_style == SplitStyle.TENSORS: - total_shards = ceil(len(model) / split_arguments.split_max_tensors) + split_arguments.small_first_shard - shard_files = [fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + 1, total_shards)) for i in range(total_shards)] - - if split_arguments.small_first_shard: - self.append((shard_files[0], None, GGUFWriter(shard_files[0], arch, use_temp_file=use_temp_file, endianess=endianess))) - - for i, shard in enumerate(shard_files[split_arguments.small_first_shard:]): - start = i * split_arguments.split_max_tensors - stop = min((i + 1) * split_arguments.split_max_tensors, len(model)) - self.append((shard, model[start:stop], GGUFWriter(shard, arch, use_temp_file=use_temp_file, endianess=endianess))) - - elif split_arguments.split_style == SplitStyle.SIZE: - shards = [[model[0]]] - - # we have to determine the shards first to determine how many shards there will be in total - two passes - for i, shard in enumerate(model[1:]): - if SplitStrategy.get_tensor_size(shard[1]) + sum(SplitStrategy.get_tensor_size(t[1]) for t in shards[-1]) > split_arguments.split_max_size: - shards.append([shard]) - else: - shards[-1].append(shard) - - if split_arguments.small_first_shard: - shards.insert(0, None) - - for i, shard in enumerate(shards): - outname = fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + 1, len(shards))) - self.append((outname, shard, GGUFWriter(outname, arch, use_temp_file=use_temp_file, endianess=endianess))) - - @staticmethod - def get_tensor_size(tensor) -> int: - try: - return tensor.data_type.elements_to_bytes(np.prod(tensor.shape)) - except AttributeError: # numpy ndarray[Any, Any] - return tensor.nbytes - except: # this should never happen - raise ValueError(f"Invalid tensor type: {type(tensor)}") - - @staticmethod - def split_str_to_n_bytes(split_str: str) -> int: - if split_str.endswith("K"): - n = int(split_str[:-1]) * 1024 - elif split_str.endswith("M"): - n = int(split_str[:-1]) * 1024 * 1024 - elif split_str.endswith("G"): - n = int(split_str[:-1]) * 1024 * 1024 * 1024 - elif split_str.isnumeric(): - n = int(split_str) - else: - raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G") - - if n <= 0: - raise ValueError(f"Invalid split size: {split_str}, must be positive") - - return n - - @staticmethod - def format_n_bytes_to_str(num: int) -> str: - num = float(num) - for unit in ("", "K", "M", "G"): - if abs(num) < 1024.0: - return f"{num:3.1f}{unit}" - num /= 1024.0 - return f"{num:.1f}T - over 1TB, --split recommended" + self.dry_run = args.dry_run + self.small_first_shard = args.small_first_shard class GGUFManager(GGUFWriter): kv_data: KVTempData tensors: list[TensorTempData] split_arguments: SplitArguments - split_strategy: SplitStrategy + shards: list[Shard] + shard_writers: 
list[GGUFWriter] def __init__(self, path: os.PathLike[str] | str, arch: str, split_arguments: SplitArguments, use_temp_file: bool = True, endianess: GGUFEndian = GGUFEndian.LITTLE @@ -140,23 +64,22 @@ def __init__(self, path: os.PathLike[str] | str, arch: str, split_arguments: Spl self.path = path self.endianess = endianess self.kv_data = {} - self.tensors = [] - self.split_strategy = None - self.total_shards = 0 + self.shards = [] + self.shard_writers = [] self.total_tensors = 0 self.use_temp_file = use_temp_file self.split_arguments = split_arguments self.recent_key = None self.state = WriterState.EMPTY - self.add_architecture() - def write_header_to_file(self) -> None: - if self.state is not WriterState.EMPTY: - raise ValueError(f'Expected GGUFManager state to be EMPTY, got {self.state}') + if self.split_arguments.small_first_shard: + self.shards.append(["", 0, METADATA_ONLY_INDICATOR, None]) - self.total_tensors = len(self.tensors) - total_size = sum(SplitStrategy.get_tensor_size(tensor[1]) for tensor in self.tensors) + def init_shards(self) -> None: + self.total_tensors = sum(shard[1] for shard in self.shards) + total_size = sum(shard[2] for shard in self.shards) + # check if we need to split if self.split_arguments.split_max_tensors and self.total_tensors < self.split_arguments.split_max_tensors: print("Model has fewer tensors than the split threshold, not splitting") self.split_style = SplitStyle.NONE @@ -165,71 +88,88 @@ def write_header_to_file(self) -> None: print("Model has smaller size than the split threshold, not splitting") self.split_style = SplitStyle.NONE - self.split_strategy = SplitStrategy(self.path, self.tensors, self.arch, self.split_arguments, - use_temp_file=self.use_temp_file, endianess=self.endianess) - del self.tensors - self.total_shards = len(self.split_strategy) + # no shards are created when writing vocab so make one + if not self.shards: + self.shards.append(["", 0, METADATA_ONLY_INDICATOR, None]) + # format shard names + if len(self.shards) == 1: + self.shards[0][0] = self.path + else: + for i in range(len(self.shards)): + self.shards[i][0] = self.path.with_name(SHARD_NAME_FORMAT.format(self.path.stem, i + 1, len(self.shards))) + + # print shard info print("\nWriting the following files:") - for (shard_path, shard_tensors, _) in self.split_strategy: - size = SplitStrategy.format_n_bytes_to_str(sum(SplitStrategy.get_tensor_size(t[1]) for t in shard_tensors)) if shard_tensors else "negligible - metadata only" - print(f" {shard_path}: n_tensors = {len(shard_tensors) if shard_tensors else 0}, total_size = {size}") + for (path, tensor_ct, size, _) in self.shards: + print(f" {path}: n_tensors = {tensor_ct}, total_size = {GGUFManager.format_n_bytes_to_str(size)}") + print() if self.split_arguments.dry_run: print("\nDry run, not writing files") - # instantiating GGUFWriters creates files - for name, _, _ in self.split_strategy: - os.remove(name) - return + exit() + + # we don't want to initialize GGUFWriters until now because they create files + for i, (path, _, _, tensors) in enumerate(self.shards): + # dont_add_architecture is used for consistency - examples/gguf_split doesn't add arch to all shards + writer = GGUFWriter(path, self.arch, use_temp_file=self.use_temp_file, + endianess=self.endianess, dont_add_architecture=not (i == 0)) + + # only the first shard needs all the KV data + if i == 0: + for key, (value, etype) in self.kv_data.items(): + writer.add_key(key) + writer.add_val(value, etype) + + # add split metadata unless it's one file - small first shard 
splits even with SplitStyle.NONE + if self.split_arguments.split_style != SplitStyle.NONE or self.split_arguments.small_first_shard: + writer.add_uint16(LLM_KV_SPLIT_NO, i) + writer.add_uint16(LLM_KV_SPLIT_COUNT, len(self.shards)) + writer.add_int32(LLM_KV_SPLIT_TENSORS_COUNT, self.total_tensors) + + # add tensors, deque popleft() ensures references to eager tensors are not kept + while True: + try: + (name, tensor, dtype) = tensors.popleft() + writer.add_tensor(name, tensor, raw_dtype=dtype) + except: + break + + self.shard_writers.append(writer) + + def write_header_to_file(self) -> None: + if self.state is not WriterState.EMPTY: + raise ValueError(f'Expected GGUFManager state to be EMPTY, got {self.state}') + + for writer in self.shard_writers: + writer.write_header_to_file() self.state = WriterState.HEADER def write_kv_data_to_file(self) -> None: - if self.split_arguments.dry_run: - return - if self.state is not WriterState.HEADER: raise ValueError(f'Expected GGUFManager state to be HEADER, got {self.state}') - - # only the first shard needs all the KV data - for key, (value, etype) in self.kv_data.items(): - self.split_strategy[0][2].add_key(key) - self.split_strategy[0][2].add_val(value, etype) - - # the other shards need shard data - if self.split_arguments.split_style != SplitStyle.NONE: - for i, (_, _, writer) in enumerate(self.split_strategy): - writer.add_uint16(LLM_KV_SPLIT_NO, i) - writer.add_uint16(LLM_KV_SPLIT_COUNT, self.total_shards) - writer.add_int32(LLM_KV_SPLIT_TENSORS_COUNT, self.total_tensors) + + for writer in self.shard_writers: + writer.write_kv_data_to_file() self.state = WriterState.KV_DATA def write_tensors_to_file(self, progress: bool = False) -> None: - if self.split_arguments.dry_run: - return - if self.state is not WriterState.KV_DATA: raise ValueError(f'Expected GGUFManager state to be KV_DATA, got {self.state}') running_total = self.total_tensors - for ct in range(self.total_shards): - (_, tensors, writer) = self.split_strategy.popleft() - tensors = deque(tensors) if tensors else None - - shard_num_tensors = len(tensors) if tensors else 0 - print(f"Writing to shard {ct}/{self.total_shards} with {shard_num_tensors}/{running_total} remaining tensors (of {self.total_tensors} total)") - running_total -= shard_num_tensors - - for _ in range(shard_num_tensors): - (name, tensor, dtype) = tensors.popleft() - writer.add_tensor(name, tensor, raw_dtype=dtype) - - # need to write everything down here - writer.write_header_to_file() - writer.write_kv_data_to_file() - writer.write_tensors_to_file(progress=progress) - del tensors + for i in range(len(self.shard_writers)): + writer = self.shard_writers[i] + is_metadata = writer.ti_data_count == 0 + if is_metadata: + print(f"Writing to shard {i + 1}/{len(self.shards)} with metadata only") + else: + print(f"Writing to shard {i + 1}/{len(self.shards)} with {writer.ti_data_count}/{running_total} remaining tensors (of {self.total_tensors} total)") + running_total -= writer.ti_data_count + writer.write_tensors_to_file(progress=(progress and not is_metadata)) + del writer self.state = WriterState.TI_DATA @@ -252,8 +192,63 @@ def add_tensor( self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None, ) -> None: - self.tensors.append((name, tensor, raw_dtype)) + # we build splits as tensors are added so we need logic to figure out when to split + # logic is all in the conditional because it short-circuits, otherwise accessing self.shards[-1] would throw an 
error + + # create a first shard to start it off + if (len(self.shards) == self.split_arguments.small_first_shard \ + # or split when over tensor limit + or (self.split_arguments.split_style == SplitStyle.TENSORS \ + and self.shards[-1][1] >= self.split_arguments.split_max_tensors) \ + # or split when over size limit + or (self.split_arguments.split_style == SplitStyle.SIZE \ + and self.shards[-1][2] + GGUFManager.get_tensor_size(tensor) > self.split_arguments.split_max_size)): + + # we fill in the name later when we know how many shards there are + self.shards.append(["", 1, GGUFManager.get_tensor_size(tensor), deque([(name, tensor, raw_dtype)])]) + else: + self.shards[-1][1] += 1 + self.shards[-1][2] += GGUFManager.get_tensor_size(tensor) + self.shards[-1][3].append((name, tensor, raw_dtype)) def close(self) -> None: - for _, _, writer in self.split_strategy: - writer.close() \ No newline at end of file + for writer in self.shard_writers: + writer.close() + + @staticmethod + def get_tensor_size(tensor) -> int: + try: + return tensor.data_type.elements_to_bytes(np.prod(tensor.shape)) + except AttributeError: # numpy ndarray[Any, Any] + return tensor.nbytes + except: # this should never happen + raise ValueError(f"Invalid tensor type: {type(tensor)}") + + @staticmethod + def split_str_to_n_bytes(split_str: str) -> int: + if split_str.endswith("K"): + n = int(split_str[:-1]) * 1024 + elif split_str.endswith("M"): + n = int(split_str[:-1]) * 1024 * 1024 + elif split_str.endswith("G"): + n = int(split_str[:-1]) * 1024 * 1024 * 1024 + elif split_str.isnumeric(): + n = int(split_str) + else: + raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G") + + if n <= 0: + raise ValueError(f"Invalid split size: {split_str}, must be positive") + + return n + + @staticmethod + def format_n_bytes_to_str(num: int) -> str: + if num == METADATA_ONLY_INDICATOR: + return "negligible - metadata only" + num = float(num) + for unit in ("", "K", "M", "G"): + if abs(num) < 1024.0: + return f"{num:3.1f}{unit}" + num /= 1024.0 + return f"{num:.1f}T - over 1TB, --split recommended" \ No newline at end of file diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 7ef321b91a1ef..294f4d06dbb70 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -57,7 +57,7 @@ class GGUFWriter: def __init__( self, path: os.PathLike[str] | str, arch: str, use_temp_file: bool = True, - endianess: GGUFEndian = GGUFEndian.LITTLE, + endianess: GGUFEndian = GGUFEndian.LITTLE, dont_add_architecture: bool = False ): self.fout = open(path, "wb") self.arch = arch @@ -77,7 +77,8 @@ def __init__( )) self.state = WriterState.EMPTY - self.add_architecture() + if not dont_add_architecture: + self.add_architecture() def write_header_to_file(self) -> None: if self.state is not WriterState.EMPTY: From 706bd69023a5d93f678a0d59337a24d2565b4d68 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Date: Thu, 6 Jun 2024 08:27:25 -0400 Subject: [PATCH 19/66] re-add type hint Co-authored-by: compilade --- convert-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index b6fd4bc4b972d..6436028831e9f 100644 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2857,7 +2857,7 @@ def main() -> None: split_arguments = gguf.SplitArguments(args) - ftype_map = { + ftype_map: dict[str, gguf.LlamaFileType] = { "f32": gguf.LlamaFileType.ALL_F32, 
"f16": gguf.LlamaFileType.MOSTLY_F16, "bf16": gguf.LlamaFileType.MOSTLY_BF16, From 6a05183b97992d63ccb396249caba54a981f3497 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Date: Thu, 6 Jun 2024 08:28:10 -0400 Subject: [PATCH 20/66] GGUFWriter compatibility fix Co-authored-by: compilade --- gguf-py/gguf/gguf_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_manager.py index 2fcaf3edfc34f..5d6133fe6ea18 100644 --- a/gguf-py/gguf/gguf_manager.py +++ b/gguf-py/gguf/gguf_manager.py @@ -155,7 +155,7 @@ def write_kv_data_to_file(self) -> None: self.state = WriterState.KV_DATA - def write_tensors_to_file(self, progress: bool = False) -> None: + def write_tensors_to_file(self, *, progress: bool = False) -> None: if self.state is not WriterState.KV_DATA: raise ValueError(f'Expected GGUFManager state to be KV_DATA, got {self.state}') From 3328b0a99141fc5680abfca6bff01ce7c31f868c Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Thu, 6 Jun 2024 08:37:35 -0400 Subject: [PATCH 21/66] Shard dataclass and un-negative dont_add_architecture --- gguf-py/gguf/gguf_manager.py | 49 +++++++++++++++++++++--------------- gguf-py/gguf/gguf_writer.py | 4 +-- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_manager.py index 5d6133fe6ea18..e7d2ef096cd49 100644 --- a/gguf-py/gguf/gguf_manager.py +++ b/gguf-py/gguf/gguf_manager.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Any, Sequence from argparse import Namespace from collections import deque +from dataclasses import dataclass import numpy as np @@ -28,7 +29,14 @@ KVTempData: TypeAlias = dict[str, tuple[Any, GGUFValueType]] # {key: (value, type)} TensorTempData: TypeAlias = tuple[str, np.ndarray[Any, Any], GGMLQuantizationType] # (tensor name, tensor data, tensor dtype) -Shard: TypeAlias = list[os.PathLike[str], int, int, deque[TensorTempData]] # [shard filename, shard tensor count, shard size, [tensor data]] + + +@dataclass +class Shard: + path: str + tensor_count: int + size: int + tensors: deque[TensorTempData] class SplitStyle(IntEnum): @@ -73,11 +81,11 @@ def __init__(self, path: os.PathLike[str] | str, arch: str, split_arguments: Spl self.state = WriterState.EMPTY if self.split_arguments.small_first_shard: - self.shards.append(["", 0, METADATA_ONLY_INDICATOR, None]) + self.shards.append(Shard("", 0, METADATA_ONLY_INDICATOR, deque())) def init_shards(self) -> None: - self.total_tensors = sum(shard[1] for shard in self.shards) - total_size = sum(shard[2] for shard in self.shards) + self.total_tensors = sum(shard.tensor_count for shard in self.shards) + total_size = sum(shard.size for shard in self.shards) # check if we need to split if self.split_arguments.split_max_tensors and self.total_tensors < self.split_arguments.split_max_tensors: @@ -90,19 +98,20 @@ def init_shards(self) -> None: # no shards are created when writing vocab so make one if not self.shards: - self.shards.append(["", 0, METADATA_ONLY_INDICATOR, None]) + self.shards.append(Shard("", 0, METADATA_ONLY_INDICATOR, deque())) # format shard names if len(self.shards) == 1: - self.shards[0][0] = self.path + self.shards[0].path = self.path else: for i in range(len(self.shards)): - self.shards[i][0] = self.path.with_name(SHARD_NAME_FORMAT.format(self.path.stem, i + 1, len(self.shards))) + # TODO with_name is not explicit - import pathlib + self.shards[i].path = 
self.path.with_name(SHARD_NAME_FORMAT.format(self.path.stem, i + 1, len(self.shards))) # print shard info print("\nWriting the following files:") - for (path, tensor_ct, size, _) in self.shards: - print(f" {path}: n_tensors = {tensor_ct}, total_size = {GGUFManager.format_n_bytes_to_str(size)}") + for shard in self.shards: + print(f" {shard.path}: n_tensors = {shard.tensor_count}, total_size = {GGUFManager.format_n_bytes_to_str(shard.size)}") print() if self.split_arguments.dry_run: @@ -110,10 +119,10 @@ def init_shards(self) -> None: exit() # we don't want to initialize GGUFWriters until now because they create files - for i, (path, _, _, tensors) in enumerate(self.shards): - # dont_add_architecture is used for consistency - examples/gguf_split doesn't add arch to all shards - writer = GGUFWriter(path, self.arch, use_temp_file=self.use_temp_file, - endianess=self.endianess, dont_add_architecture=not (i == 0)) + for i, shard in enumerate(self.shards): + # add_architecture is used for consistency - examples/gguf_split doesn't add arch to all shards + writer = GGUFWriter(shard.path, self.arch, use_temp_file=self.use_temp_file, + endianess=self.endianess, add_architecture=(i == 0)) # only the first shard needs all the KV data if i == 0: @@ -130,7 +139,7 @@ def init_shards(self) -> None: # add tensors, deque popleft() ensures references to eager tensors are not kept while True: try: - (name, tensor, dtype) = tensors.popleft() + (name, tensor, dtype) = shard.tensors.popleft() writer.add_tensor(name, tensor, raw_dtype=dtype) except: break @@ -199,17 +208,17 @@ def add_tensor( if (len(self.shards) == self.split_arguments.small_first_shard \ # or split when over tensor limit or (self.split_arguments.split_style == SplitStyle.TENSORS \ - and self.shards[-1][1] >= self.split_arguments.split_max_tensors) \ + and self.shards[-1].tensor_count >= self.split_arguments.split_max_tensors) \ # or split when over size limit or (self.split_arguments.split_style == SplitStyle.SIZE \ - and self.shards[-1][2] + GGUFManager.get_tensor_size(tensor) > self.split_arguments.split_max_size)): + and self.shards[-1].size + GGUFManager.get_tensor_size(tensor) > self.split_arguments.split_max_size)): # we fill in the name later when we know how many shards there are - self.shards.append(["", 1, GGUFManager.get_tensor_size(tensor), deque([(name, tensor, raw_dtype)])]) + self.shards.append(Shard("", 1, GGUFManager.get_tensor_size(tensor), deque([(name, tensor, raw_dtype)]))) else: - self.shards[-1][1] += 1 - self.shards[-1][2] += GGUFManager.get_tensor_size(tensor) - self.shards[-1][3].append((name, tensor, raw_dtype)) + self.shards[-1].tensor_count += 1 + self.shards[-1].size += GGUFManager.get_tensor_size(tensor) + self.shards[-1].tensors.append((name, tensor, raw_dtype)) def close(self) -> None: for writer in self.shard_writers: diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 294f4d06dbb70..31ca9eabc9468 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -57,7 +57,7 @@ class GGUFWriter: def __init__( self, path: os.PathLike[str] | str, arch: str, use_temp_file: bool = True, - endianess: GGUFEndian = GGUFEndian.LITTLE, dont_add_architecture: bool = False + endianess: GGUFEndian = GGUFEndian.LITTLE, add_architecture: bool = True ): self.fout = open(path, "wb") self.arch = arch @@ -77,7 +77,7 @@ def __init__( )) self.state = WriterState.EMPTY - if not dont_add_architecture: + if add_architecture: self.add_architecture() def write_header_to_file(self) -> None: From 
1cbab222250999382cc1ab902e1f72bcad8e55b5 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Thu, 6 Jun 2024 08:43:26 -0400 Subject: [PATCH 22/66] type consistency in format_n_bytes_to_str --- gguf-py/gguf/gguf_manager.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_manager.py index e7d2ef096cd49..523a5f500a466 100644 --- a/gguf-py/gguf/gguf_manager.py +++ b/gguf-py/gguf/gguf_manager.py @@ -255,9 +255,9 @@ def split_str_to_n_bytes(split_str: str) -> int: def format_n_bytes_to_str(num: int) -> str: if num == METADATA_ONLY_INDICATOR: return "negligible - metadata only" - num = float(num) + fnum = float(num) for unit in ("", "K", "M", "G"): - if abs(num) < 1024.0: - return f"{num:3.1f}{unit}" - num /= 1024.0 - return f"{num:.1f}T - over 1TB, --split recommended" \ No newline at end of file + if abs(fnum) < 1024.0: + return f"{fnum:3.1f}{unit}" + fnum /= 1024.0 + return f"{fnum:.1f}T - over 1TB, --split recommended" \ No newline at end of file From 2037eabb644873400f6cdf0514f66af2a10076d2 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Thu, 6 Jun 2024 08:49:46 -0400 Subject: [PATCH 23/66] move kv keys to constants.py --- gguf-py/gguf/constants.py | 5 +++++ gguf-py/gguf/gguf_manager.py | 11 ++++------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index a3c024c8975f5..a5bab4de6b183 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -72,6 +72,11 @@ class Rope: SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier" + class Split: + LLM_KV_SPLIT_NO = "split.no" + LLM_KV_SPLIT_COUNT = "split.count" + LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count" + class SSM: CONV_KERNEL = "{arch}.ssm.conv_kernel" INNER_SIZE = "{arch}.ssm.inner_size" diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_manager.py index 523a5f500a466..f4411e752cb7b 100644 --- a/gguf-py/gguf/gguf_manager.py +++ b/gguf-py/gguf/gguf_manager.py @@ -18,15 +18,12 @@ GGUFValueType ) from .gguf_writer import GGUFWriter, WriterState +from .constants import Keys SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf" METADATA_ONLY_INDICATOR = -1 -LLM_KV_SPLIT_NO = "split.no" -LLM_KV_SPLIT_COUNT = "split.count" -LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count" - KVTempData: TypeAlias = dict[str, tuple[Any, GGUFValueType]] # {key: (value, type)} TensorTempData: TypeAlias = tuple[str, np.ndarray[Any, Any], GGMLQuantizationType] # (tensor name, tensor data, tensor dtype) @@ -132,9 +129,9 @@ def init_shards(self) -> None: # add split metadata unless it's one file - small first shard splits even with SplitStyle.NONE if self.split_arguments.split_style != SplitStyle.NONE or self.split_arguments.small_first_shard: - writer.add_uint16(LLM_KV_SPLIT_NO, i) - writer.add_uint16(LLM_KV_SPLIT_COUNT, len(self.shards)) - writer.add_int32(LLM_KV_SPLIT_TENSORS_COUNT, self.total_tensors) + writer.add_uint16(Keys.Split.LLM_KV_SPLIT_NO, i) + writer.add_uint16(Keys.Split.LLM_KV_SPLIT_COUNT, len(self.shards)) + writer.add_int32(Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT, self.total_tensors) # add tensors, deque popleft() ensures references to eager tensors are not kept while True: From 83e4a3f5cce4c32feedfb0687743cc06556443b2 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Thu, 6 Jun 2024 09:00:59 -0400 Subject: [PATCH 24/66] make pathlib explicit --- gguf-py/gguf/gguf_manager.py | 13 ++++++------- 1 file 
changed, 6 insertions(+), 7 deletions(-) diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_manager.py index f4411e752cb7b..f74b24117d149 100644 --- a/gguf-py/gguf/gguf_manager.py +++ b/gguf-py/gguf/gguf_manager.py @@ -6,6 +6,7 @@ from argparse import Namespace from collections import deque from dataclasses import dataclass +from pathlib import Path import numpy as np @@ -30,7 +31,7 @@ @dataclass class Shard: - path: str + path: Path tensor_count: int size: int tensors: deque[TensorTempData] @@ -56,7 +57,6 @@ def __init__(self, args: Namespace) -> None: class GGUFManager(GGUFWriter): kv_data: KVTempData - tensors: list[TensorTempData] split_arguments: SplitArguments shards: list[Shard] shard_writers: list[GGUFWriter] @@ -66,7 +66,7 @@ def __init__(self, path: os.PathLike[str] | str, arch: str, split_arguments: Spl ) -> None: # we intentionally don't call superclass constructor self.arch = arch - self.path = path + self.path = Path(path) self.endianess = endianess self.kv_data = {} self.shards = [] @@ -78,7 +78,7 @@ def __init__(self, path: os.PathLike[str] | str, arch: str, split_arguments: Spl self.state = WriterState.EMPTY if self.split_arguments.small_first_shard: - self.shards.append(Shard("", 0, METADATA_ONLY_INDICATOR, deque())) + self.shards.append(Shard(Path(), 0, METADATA_ONLY_INDICATOR, deque())) def init_shards(self) -> None: self.total_tensors = sum(shard.tensor_count for shard in self.shards) @@ -95,14 +95,13 @@ def init_shards(self) -> None: # no shards are created when writing vocab so make one if not self.shards: - self.shards.append(Shard("", 0, METADATA_ONLY_INDICATOR, deque())) + self.shards.append(Shard(Path(), 0, METADATA_ONLY_INDICATOR, deque())) # format shard names if len(self.shards) == 1: self.shards[0].path = self.path else: for i in range(len(self.shards)): - # TODO with_name is not explicit - import pathlib self.shards[i].path = self.path.with_name(SHARD_NAME_FORMAT.format(self.path.stem, i + 1, len(self.shards))) # print shard info @@ -211,7 +210,7 @@ def add_tensor( and self.shards[-1].size + GGUFManager.get_tensor_size(tensor) > self.split_arguments.split_max_size)): # we fill in the name later when we know how many shards there are - self.shards.append(Shard("", 1, GGUFManager.get_tensor_size(tensor), deque([(name, tensor, raw_dtype)]))) + self.shards.append(Shard(Path(), 1, GGUFManager.get_tensor_size(tensor), deque([(name, tensor, raw_dtype)]))) else: self.shards[-1].tensor_count += 1 self.shards[-1].size += GGUFManager.get_tensor_size(tensor) From 13ffe22ca77678b6285e0b8f2a80563f57bc9496 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Thu, 6 Jun 2024 10:24:11 -0400 Subject: [PATCH 25/66] base-1024 bytes to base-1000 --- gguf-py/gguf/gguf_manager.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_manager.py index f74b24117d149..fcf0ad8aa4e1c 100644 --- a/gguf-py/gguf/gguf_manager.py +++ b/gguf-py/gguf/gguf_manager.py @@ -232,11 +232,11 @@ def get_tensor_size(tensor) -> int: @staticmethod def split_str_to_n_bytes(split_str: str) -> int: if split_str.endswith("K"): - n = int(split_str[:-1]) * 1024 + n = int(split_str[:-1]) * 1000 elif split_str.endswith("M"): - n = int(split_str[:-1]) * 1024 * 1024 + n = int(split_str[:-1]) * 1000 * 1000 elif split_str.endswith("G"): - n = int(split_str[:-1]) * 1024 * 1024 * 1024 + n = int(split_str[:-1]) * 1000 * 1000 * 1000 elif split_str.isnumeric(): n = int(split_str) else: @@ -253,7 +253,7 @@ def 
format_n_bytes_to_str(num: int) -> str: return "negligible - metadata only" fnum = float(num) for unit in ("", "K", "M", "G"): - if abs(fnum) < 1024.0: + if abs(fnum) < 1000.0: return f"{fnum:3.1f}{unit}" - fnum /= 1024.0 + fnum /= 1000.0 return f"{fnum:.1f}T - over 1TB, --split recommended" \ No newline at end of file From 6d3a256d1de552c76eab459659228fbed678906d Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Fri, 7 Jun 2024 09:12:44 -0400 Subject: [PATCH 26/66] rename GGUFManager to GGUFWriterSplit --- convert-hf-to-gguf.py | 2 +- gguf-py/gguf/__init__.py | 2 +- .../{gguf_manager.py => gguf_writer_split.py} | 18 +++++++++--------- 3 files changed, 11 insertions(+), 11 deletions(-) rename gguf-py/gguf/{gguf_manager.py => gguf_writer_split.py} (91%) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 6436028831e9f..b4ea11a15f760 100644 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -96,7 +96,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, ftype_lw: str = ftype_up.lower() # allow templating the file name with the output ftype, useful with the "auto" ftype self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) - self.gguf_writer = gguf.GGUFManager(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], split_arguments, + self.gguf_writer = gguf.GGUFWriterSplit(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], split_arguments, endianess=self.endianess, use_temp_file=self.use_temp_file) @classmethod diff --git a/gguf-py/gguf/__init__.py b/gguf-py/gguf/__init__.py index 6900e3dd927fe..a2197255a556e 100644 --- a/gguf-py/gguf/__init__.py +++ b/gguf-py/gguf/__init__.py @@ -2,7 +2,7 @@ from .lazy import * from .gguf_reader import * from .gguf_writer import * -from .gguf_manager import * +from .gguf_writer_split import * from .quants import * from .tensor_mapping import * from .vocab import * diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_writer_split.py similarity index 91% rename from gguf-py/gguf/gguf_manager.py rename to gguf-py/gguf/gguf_writer_split.py index fcf0ad8aa4e1c..c115679993ab5 100644 --- a/gguf-py/gguf/gguf_manager.py +++ b/gguf-py/gguf/gguf_writer_split.py @@ -47,7 +47,7 @@ class SplitArguments: def __init__(self, args: Namespace) -> None: self.split = args.split self.split_max_tensors = args.split_max_tensors if args.split else 0 - self.split_max_size = GGUFManager.split_str_to_n_bytes(args.split_max_size) if args.split and args.split_max_size else 0 + self.split_max_size = GGUFWriterSplit.split_str_to_n_bytes(args.split_max_size) if args.split and args.split_max_size else 0 self.split_style = SplitStyle.NONE if not self.split \ else SplitStyle.TENSORS if self.split_max_tensors \ else SplitStyle.SIZE @@ -55,7 +55,7 @@ def __init__(self, args: Namespace) -> None: self.small_first_shard = args.small_first_shard -class GGUFManager(GGUFWriter): +class GGUFWriterSplit(GGUFWriter): kv_data: KVTempData split_arguments: SplitArguments shards: list[Shard] @@ -107,7 +107,7 @@ def init_shards(self) -> None: # print shard info print("\nWriting the following files:") for shard in self.shards: - print(f" {shard.path}: n_tensors = {shard.tensor_count}, total_size = {GGUFManager.format_n_bytes_to_str(shard.size)}") + print(f" {shard.path}: n_tensors = {shard.tensor_count}, total_size = {GGUFWriterSplit.format_n_bytes_to_str(shard.size)}") print() if self.split_arguments.dry_run: @@ -144,7 +144,7 @@ def init_shards(self) -> None: def 
write_header_to_file(self) -> None: if self.state is not WriterState.EMPTY: - raise ValueError(f'Expected GGUFManager state to be EMPTY, got {self.state}') + raise ValueError(f'Expected GGUFWriterSplit state to be EMPTY, got {self.state}') for writer in self.shard_writers: writer.write_header_to_file() @@ -153,7 +153,7 @@ def write_header_to_file(self) -> None: def write_kv_data_to_file(self) -> None: if self.state is not WriterState.HEADER: - raise ValueError(f'Expected GGUFManager state to be HEADER, got {self.state}') + raise ValueError(f'Expected GGUFWriterSplit state to be HEADER, got {self.state}') for writer in self.shard_writers: writer.write_kv_data_to_file() @@ -162,7 +162,7 @@ def write_kv_data_to_file(self) -> None: def write_tensors_to_file(self, *, progress: bool = False) -> None: if self.state is not WriterState.KV_DATA: - raise ValueError(f'Expected GGUFManager state to be KV_DATA, got {self.state}') + raise ValueError(f'Expected GGUFWriterSplit state to be KV_DATA, got {self.state}') running_total = self.total_tensors for i in range(len(self.shard_writers)): @@ -207,13 +207,13 @@ def add_tensor( and self.shards[-1].tensor_count >= self.split_arguments.split_max_tensors) \ # or split when over size limit or (self.split_arguments.split_style == SplitStyle.SIZE \ - and self.shards[-1].size + GGUFManager.get_tensor_size(tensor) > self.split_arguments.split_max_size)): + and self.shards[-1].size + GGUFWriterSplit.get_tensor_size(tensor) > self.split_arguments.split_max_size)): # we fill in the name later when we know how many shards there are - self.shards.append(Shard(Path(), 1, GGUFManager.get_tensor_size(tensor), deque([(name, tensor, raw_dtype)]))) + self.shards.append(Shard(Path(), 1, GGUFWriterSplit.get_tensor_size(tensor), deque([(name, tensor, raw_dtype)]))) else: self.shards[-1].tensor_count += 1 - self.shards[-1].size += GGUFManager.get_tensor_size(tensor) + self.shards[-1].size += GGUFWriterSplit.get_tensor_size(tensor) self.shards[-1].tensors.append((name, tensor, raw_dtype)) def close(self) -> None: From 1312e287ec8a0502586d278eb4cc31a7f99a070c Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Date: Fri, 7 Jun 2024 17:10:51 -0400 Subject: [PATCH 27/66] Update gguf-py/gguf/constants.py Co-authored-by: compilade --- gguf-py/gguf/constants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index a5bab4de6b183..eb82bd70639db 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -73,8 +73,8 @@ class Rope: SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier" class Split: - LLM_KV_SPLIT_NO = "split.no" - LLM_KV_SPLIT_COUNT = "split.count" + LLM_KV_SPLIT_NO = "split.no" + LLM_KV_SPLIT_COUNT = "split.count" LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count" class SSM: From 5f29d4a6172948b130f5f9b8fab3cf7ee9778425 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Fri, 7 Jun 2024 17:19:01 -0400 Subject: [PATCH 28/66] fix convert-hf-to-gguf.py permissions --- convert-hf-to-gguf.py | 5814 ++++++++++++++++++++--------------------- 1 file changed, 2907 insertions(+), 2907 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index b4ea11a15f760..62e02472148f7 100644 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1,2907 +1,2907 @@ -#!/usr/bin/env python3 - -from __future__ import annotations - -import logging -import argparse -import contextlib -import json -import os -import re 
-import sys -from enum import IntEnum -from pathlib import Path -from hashlib import sha256 -from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast - -import math -import numpy as np -import torch - -if TYPE_CHECKING: - from torch import Tensor - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) -import importlib -gguf = importlib.import_module("gguf-py.gguf") -# import gguf - -logger = logging.getLogger("hf-to-gguf") - - -###### MODEL DEFINITIONS ###### - -class SentencePieceTokenTypes(IntEnum): - NORMAL = 1 - UNKNOWN = 2 - CONTROL = 3 - USER_DEFINED = 4 - UNUSED = 5 - BYTE = 6 - - -AnyModel = TypeVar("AnyModel", bound="type[Model]") - - -class Model: - _model_classes: dict[str, type[Model]] = {} - - dir_model: Path - ftype: int - is_big_endian: bool - endianess: gguf.GGUFEndian - use_temp_file: bool - lazy: bool - part_names: list[str] - is_safetensors: bool - hparams: dict[str, Any] - block_count: int - tensor_map: gguf.TensorNameMap - tensor_names: set[str] | None - fname_out: Path - gguf_writer: gguf.GGUFWriter - - # subclasses should define this! - model_arch: gguf.MODEL_ARCH - - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, - split_arguments: gguf.SplitArguments): - if type(self) is Model: - raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") - self.dir_model = dir_model - self.ftype = ftype - self.is_big_endian = is_big_endian - self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE - self.use_temp_file = use_temp_file - self.lazy = not eager - self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors") - self.is_safetensors = len(self.part_names) > 0 - if not self.is_safetensors: - self.part_names = Model.get_model_part_names(self.dir_model, ".bin") - self.hparams = Model.load_hparams(self.dir_model) - self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - self.tensor_names = None - if self.ftype == gguf.LlamaFileType.GUESSED: - # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. 
- _, first_tensor = next(self.get_tensors()) - if first_tensor.dtype == torch.float16: - logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})") - self.ftype = gguf.LlamaFileType.MOSTLY_F16 - else: - logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})") - self.ftype = gguf.LlamaFileType.MOSTLY_BF16 - ftype_up: str = self.ftype.name.partition("_")[2].upper() - ftype_lw: str = ftype_up.lower() - # allow templating the file name with the output ftype, useful with the "auto" ftype - self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) - self.gguf_writer = gguf.GGUFWriterSplit(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], split_arguments, - endianess=self.endianess, use_temp_file=self.use_temp_file) - - @classmethod - def __init_subclass__(cls): - # can't use an abstract property, because overriding it without type errors - # would require using decorated functions instead of simply defining the property - if "model_arch" not in cls.__dict__: - raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") - - def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: - key = next((k for k in keys if k in self.hparams), None) - if key is not None: - return self.hparams[key] - if optional: - return None - raise KeyError(f"could not find any of: {keys}") - - def set_vocab(self): - self._set_vocab_gpt2() - - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: - tensor_names_from_parts: set[str] = set() - - if len(self.part_names) > 1: - self.tensor_names = set() - index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin" - index_name += ".index.json" - logger.info(f"gguf: loading model weight map from '{index_name}'") - with open(self.dir_model / index_name, "r", encoding="utf-8") as f: - index: dict[str, Any] = json.load(f) - weight_map = index.get("weight_map") - if weight_map is None or not isinstance(weight_map, dict): - raise ValueError(f"Can't load 'weight_map' from {index_name!r}") - self.tensor_names.update(weight_map.keys()) - else: - self.tensor_names = tensor_names_from_parts - - for part_name in self.part_names: - logger.info(f"gguf: loading model part '{part_name}'") - ctx: ContextManager[Any] - if self.is_safetensors: - from safetensors import safe_open - ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) - else: - ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True)) - - with ctx as model_part: - tensor_names_from_parts.update(model_part.keys()) - - for name in model_part.keys(): - data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] - if self.lazy: - data = LazyTorchTensor.from_eager(data) - yield name, data - - # only verify tensor name presence; it doesn't matter if they are not in the right files - if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: - raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}") - - def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str: - if key not in gguf.MODEL_TENSORS[self.model_arch]: - raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}") - name: str = gguf.TENSOR_NAMES[key] - if "{bid}" in name: - assert bid is not None - name = name.format(bid=bid) - 
return name + suffix - - def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool: - if key not in gguf.MODEL_TENSORS[self.model_arch]: - return False - key_name: str = gguf.TENSOR_NAMES[key] - if "{bid}" in key_name: - if bid is None: - return False - key_name = key_name.format(bid=bid) - else: - if bid is not None: - return False - return name == (key_name + suffix) - - def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: - new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes) - if new_name is None: - raise ValueError(f"Can not map tensor {name!r}") - return new_name - - def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_block_count(self.block_count) - - if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: - self.gguf_writer.add_context_length(n_ctx) - logger.info(f"gguf: context length = {n_ctx}") - - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - self.gguf_writer.add_embedding_length(n_embd) - logger.info(f"gguf: embedding length = {n_embd}") - - if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: - self.gguf_writer.add_feed_forward_length(n_ff) - logger.info(f"gguf: feed forward length = {n_ff}") - - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - self.gguf_writer.add_head_count(n_head) - logger.info(f"gguf: head count = {n_head}") - - if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: - self.gguf_writer.add_head_count_kv(n_head_kv) - logger.info(f"gguf: key-value head count = {n_head_kv}") - - if (rope_theta := self.hparams.get("rope_theta")) is not None: - self.gguf_writer.add_rope_freq_base(rope_theta) - logger.info(f"gguf: rope theta = {rope_theta}") - if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: - self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) - logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") - if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: - self.gguf_writer.add_layer_norm_eps(f_norm_eps) - logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") - if (n_experts := self.hparams.get("num_local_experts")) is not None: - self.gguf_writer.add_expert_count(n_experts) - logger.info(f"gguf: expert count = {n_experts}") - if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: - self.gguf_writer.add_expert_used_count(n_experts_used) - logger.info(f"gguf: experts used count = {n_experts_used}") - - self.gguf_writer.add_file_type(self.ftype) - logger.info(f"gguf: file type = {self.ftype}") - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - return [(self.map_tensor_name(name), data_torch)] - - def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: - del name, new_name, bid, n_dims # unused - - return False - - def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: - del name, new_name, bid, n_dims # unused - - return False - - def write_tensors(self): - max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") - - for name, data_torch in self.get_tensors(): - # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): - continue - - 
old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - # use the first number-like part of the tensor name as the block id - bid = None - for part in name.split("."): - if part.isdecimal(): - bid = int(part) - break - - for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)): - data: np.ndarray = data # type hint - n_dims = len(data.shape) - data_dtype = data.dtype - data_qtype: gguf.GGMLQuantizationType | None = None - - # when both are True, f32 should win - extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims) - extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims) - - # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors - # Conditions should closely match those in llama_model_quantize_internal in llama.cpp - extra_f32 = any(cond for cond in ( - extra_f32, - n_dims == 1, - new_name.endswith("_norm.weight"), - )) - - # Some tensor types are always in float32 - extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in ( - gguf.MODEL_TENSOR.FFN_GATE_INP, - gguf.MODEL_TENSOR.POS_EMBD, - gguf.MODEL_TENSOR.TOKEN_TYPES, - )) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - extra_f16 = any(cond for cond in ( - extra_f16, - (name.endswith(".weight") and n_dims >= 2), - )) - - if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32: - if self.ftype == gguf.LlamaFileType.MOSTLY_BF16: - data = gguf.quantize_bf16(data) - assert data.dtype == np.int16 - data_qtype = gguf.GGMLQuantizationType.BF16 - - elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data): - data = gguf.quantize_q8_0(data) - assert data.dtype == np.uint8 - data_qtype = gguf.GGMLQuantizationType.Q8_0 - - else: # default to float16 for quantized tensors - if data_dtype != np.float16: - data = data.astype(np.float16) - data_qtype = gguf.GGMLQuantizationType.F16 - - if data_qtype is None: # by default, convert to float32 - if data_dtype != np.float32: - data = data.astype(np.float32) - data_qtype = gguf.GGMLQuantizationType.F32 - - shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape - - # reverse shape to make it similar to the internal ggml dimension order - shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" - - # n_dims is implicit in the shape - logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") - - self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) - - def write(self): - self.write_tensors() - self.gguf_writer.init_shards() - self.gguf_writer.write_header_to_file() - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file(progress=True) - self.gguf_writer.close() - - def write_vocab(self): - if self.gguf_writer.split_arguments.split: - raise ValueError('Splitting the vocabulary is not supported') - self.gguf_writer.init_shards() - self.gguf_writer.write_header_to_file() - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.close() - - @staticmethod - def get_model_part_names(dir_model: Path, suffix: str) -> list[str]: - part_names: list[str] = [] - for filename in os.listdir(dir_model): - if filename.endswith(suffix): - part_names.append(filename) - - part_names.sort() - - return part_names - - @staticmethod - def 
load_hparams(dir_model: Path): - with open(dir_model / "config.json", "r", encoding="utf-8") as f: - return json.load(f) - - @classmethod - def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: - assert names - - def func(modelcls: AnyModel) -> AnyModel: - for name in names: - cls._model_classes[name] = modelcls - return modelcls - return func - - @classmethod - def from_model_architecture(cls, arch: str) -> type[Model]: - try: - return cls._model_classes[arch] - except KeyError: - raise NotImplementedError(f'Architecture {arch!r} not supported!') from None - - # used for GPT-2 BPE and WordPiece vocabs - def get_vocab_base(self) -> tuple[list[str], list[int], str]: - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) - assert max(tokenizer.vocab.values()) < vocab_size - - tokpre = self.get_vocab_base_pre(tokenizer) - - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} - added_vocab = tokenizer.get_added_vocab() - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - return tokens, toktypes, tokpre - - # NOTE: this function is generated by convert-hf-to-gguf-update.py - # do not modify it manually! - # ref: https://github.com/ggerganov/llama.cpp/pull/6920 - # Marker: Start get_vocab_base_pre - def get_vocab_base_pre(self, tokenizer) -> str: - # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that - # is specific for the BPE pre-tokenizer used by the model - # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can - # use in llama.cpp to implement the same pre-tokenizer - - chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' - - chktok = tokenizer.encode(chktxt) - chkhsh = sha256(str(chktok).encode()).hexdigest() - - logger.debug(f"chktok: {chktok}") - logger.debug(f"chkhsh: {chkhsh}") - - res = None - - # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script - # or pull the latest version of the model from Huggingface - # don't edit the hashes manually! 
- if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": - # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B - res = "llama-bpe" - if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": - # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base - res = "deepseek-llm" - if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821": - # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base - res = "deepseek-coder" - if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": - # ref: https://huggingface.co/tiiuae/falcon-7b - res = "falcon" - if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": - # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 - res = "bert-bge" - if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": - # ref: https://huggingface.co/mosaicml/mpt-7b - res = "mpt" - if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34": - # ref: https://huggingface.co/bigcode/starcoder2-3b - res = "starcoder" - if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": - # ref: https://huggingface.co/openai-community/gpt2 - res = "gpt-2" - if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3": - # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b - res = "stablelm2" - if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": - # ref: https://huggingface.co/smallcloudai/Refact-1_6-base - res = "refact" - if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8": - # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01 - res = "command-r" - if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea": - # ref: https://huggingface.co/Qwen/Qwen1.5-7B - res = "qwen2" - if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": - # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf - res = "olmo" - if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e": - # ref: https://huggingface.co/databricks/dbrx-base - res = "dbrx" - if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en - res = "jina-v2-en" - if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es - res = "jina-v2-es" - if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de - res = "jina-v2-de" - if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": - # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct - res = "smaug-bpe" - - if res is None: - logger.warning("\n") - logger.warning("**************************************************************************************") - logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") - logger.warning("** There are 2 possible reasons for this:") - logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet") - logger.warning("** - the pre-tokenization config has changed upstream") - logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") - logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") - 
logger.warning("**") - logger.warning(f"** chkhsh: {chkhsh}") - logger.warning("**************************************************************************************") - logger.warning("\n") - raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") - - logger.debug(f"tokenizer.ggml.pre: {repr(res)}") - logger.debug(f"chkhsh: {chkhsh}") - - return res - # Marker: End get_vocab_base_pre - - def _set_vocab_gpt2(self) -> None: - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_qwen(self): - dir_model = self.dir_model - hparams = self.hparams - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams["vocab_size"] - assert max(tokenizer.get_vocab().values()) < vocab_size - - tokpre = self.get_vocab_base_pre(tokenizer) - - merges = [] - vocab = {} - mergeable_ranks = tokenizer.mergeable_ranks - for token, rank in mergeable_ranks.items(): - vocab[QwenModel.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - assert len(merged) == 2 - merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) - - # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined - added_vocab = tokenizer.special_tokens - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()} - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.CONTROL) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) - special_vocab.merges = merges - # only add special tokens when they were not already loaded from config.json - if len(special_vocab.special_token_ids) == 0: - special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) - special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) - # this one is usually not in config.json anyway - special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_sentencepiece(self): - from sentencepiece import SentencePieceProcessor - - tokenizer_path = self.dir_model / 'tokenizer.model' - - tokens: list[bytes] = [] - scores: list[float] = [] - toktypes: list[int] = [] - - if not tokenizer_path.is_file(): - raise FileNotFoundError(f"File not found: {tokenizer_path}") - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: 
list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - for key in added_tokens_json: - token_id = added_tokens_json[key] - if (token_id >= vocab_size): - logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - tokens[token_id] = key.encode("utf-8") - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - - if vocab_size > len(tokens): - pad_count = vocab_size - len(tokens) - logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") - for i in range(1, pad_count + 1): - tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.UNUSED) - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_llama_hf(self): - vocab = gguf.LlamaHfVocab(self.dir_model) - tokens = [] - scores = [] - toktypes = [] - - for text, score, toktype in vocab.all_tokens(): - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - assert len(tokens) == vocab.vocab_size - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - -@Model.register("GPTNeoXForCausalLM") -class GPTNeoXModel(Model): - model_arch = gguf.MODEL_ARCH.GPTNEOX - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count( - int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), - ) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - n_head 
= self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - - tensors: list[tuple[str, Tensor]] = [] - - if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa - qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) - data_torch = torch.cat( - ( - qkv_weights[:, 0, :, :].reshape((-1, n_embed)), - qkv_weights[:, 1, :, :].reshape((-1, n_embed)), - qkv_weights[:, 2, :, :].reshape((-1, n_embed)), - ), - dim=0, - ) - logger.info("re-format attention.linear_qkv.weight") - elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name): - qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) - data_torch = torch.cat( - ( - qkv_bias[:, 0, :].reshape((n_embed,)), - qkv_bias[:, 1, :].reshape((n_embed,)), - qkv_bias[:, 2, :].reshape((n_embed,)), - ), - dim=0, - ) - logger.info("re-format attention.linear_qkv.bias") - - tensors.append((self.map_tensor_name(name), data_torch)) - - return tensors - - -@Model.register("BloomForCausalLM") -class BloomModel(Model): - model_arch = gguf.MODEL_ARCH.BLOOM - - def set_gguf_parameters(self): - self.gguf_writer.add_name("Bloom") - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) - self.gguf_writer.add_embedding_length(n_embed) - self.gguf_writer.add_feed_forward_length(4 * n_embed) - self.gguf_writer.add_block_count(self.hparams["n_layer"]) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - - name = re.sub(r'transformer\.', '', name) - - tensors: list[tuple[str, Tensor]] = [] - - if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa - qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) - data_torch = torch.cat( - ( - qkv_weights[:, 0, :, :].reshape((-1, n_embed)), - qkv_weights[:, 1, :, :].reshape((-1, n_embed)), - qkv_weights[:, 2, :, :].reshape((-1, n_embed)), - ), - dim=0, - ) - logger.info("re-format attention.linear_qkv.weight") - elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): - qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) - data_torch = torch.cat( - ( - qkv_bias[:, 0, :].reshape((n_embed,)), - qkv_bias[:, 1, :].reshape((n_embed,)), - qkv_bias[:, 2, :].reshape((n_embed,)), - ), - dim=0, - ) - logger.info("re-format 
attention.linear_qkv.bias") - - tensors.append((self.map_tensor_name(name), data_torch)) - - if name == "word_embeddings.weight": - assert self.tensor_names is not None - - # TODO: tie them at runtime, don't duplicate in the model file - if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")): - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - - return tensors - - -@Model.register("MPTForCausalLM") -class MPTModel(Model): - model_arch = gguf.MODEL_ARCH.MPT - - def set_vocab(self): - try: - self._set_vocab_gpt2() - except Exception: - # Fallback for SEA-LION model - self._set_vocab_sentencepiece() - self.gguf_writer.add_add_bos_token(False) - self.gguf_writer.add_pad_token_id(3) - self.gguf_writer.add_eos_token_id(1) - self.gguf_writer.add_unk_token_id(0) - - def set_gguf_parameters(self): - block_count = self.hparams["n_layers"] - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"]) - self.gguf_writer.add_head_count(self.hparams["n_heads"]) - if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"): - self.gguf_writer.add_head_count_kv(kv_n_heads) - self.gguf_writer.add_layer_norm_eps(1e-5) - if self.hparams["attn_config"]["clip_qkv"] is not None: - self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) - if self.hparams["attn_config"]["alibi"]: - self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) - else: - self.gguf_writer.add_max_alibi_bias(0.0) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - if "scales" in name: - new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias", ".scales")) - new_name = new_name.replace("scales", "act.scales") - else: - new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias")) - - return [(new_name, data_torch)] - - -@Model.register("OrionForCausalLM") -class OrionModel(Model): - model_arch = gguf.MODEL_ARCH.ORION - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") - - ctx_length = 0 - if "max_sequence_length" in self.hparams: - ctx_length = self.hparams["max_sequence_length"] - elif "max_position_embeddings" in self.hparams: - ctx_length = self.hparams["max_position_embeddings"] - elif "model_max_length" in self.hparams: - ctx_length = self.hparams["model_max_length"] - else: - raise ValueError("gguf: can not find ctx length parameter.") - - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_source_hf_repo(hf_repo) - self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_context_length(ctx_length) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - # note: config provides rms norm but it is actually layer 
norm - # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 - self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) - - -@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM") -class BaichuanModel(Model): - model_arch = gguf.MODEL_ARCH.BAICHUAN - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") - - ctx_length = 0 - if "max_sequence_length" in self.hparams: - ctx_length = self.hparams["max_sequence_length"] - elif "max_position_embeddings" in self.hparams: - ctx_length = self.hparams["max_position_embeddings"] - elif "model_max_length" in self.hparams: - ctx_length = self.hparams["model_max_length"] - else: - raise ValueError("gguf: can not find ctx length parameter.") - - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_source_hf_repo(hf_repo) - self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_context_length(ctx_length) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - tensors: list[tuple[str, Tensor]] = [] - - if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight": - logger.info(f"Unpacking and permuting layer {bid}") - tensors = [ - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), - self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), - self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), - self._reverse_hf_part(data_torch, 2)), - ] - else: - tensors = [(self.map_tensor_name(name), data_torch)] - - return tensors - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - def _reverse_hf_permute_part( - self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, - ) -> Tensor: - r = weights.shape[0] // 3 - return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) - - def 
_reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: - r = weights.shape[0] // 3 - return weights[r * n_part:r * n_part + r, ...] - - -@Model.register("XverseForCausalLM") -class XverseModel(Model): - model_arch = gguf.MODEL_ARCH.XVERSE - - def set_vocab(self): - assert (self.dir_model / "tokenizer.json").is_file() - dir_model = self.dir_model - hparams = self.hparams - - tokens: list[bytes] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model) - vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) - assert max(tokenizer.vocab.values()) < vocab_size - - reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} - added_vocab = tokenizer.get_added_vocab() - - for token_id in range(vocab_size): - token_text = reverse_vocab[token_id].encode('utf-8') - # replace "\x00" to string with length > 0 - if token_text == b"\x00": - toktype = gguf.TokenType.BYTE # special - token_text = f"<{token_text}>".encode('utf-8') - elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): - toktype = gguf.TokenType.BYTE # special - elif reverse_vocab[token_id] in added_vocab: - if tokenizer.added_tokens_decoder[token_id].special: - toktype = gguf.TokenType.CONTROL - else: - toktype = gguf.TokenType.USER_DEFINED - else: - toktype = gguf.TokenType.NORMAL - - tokens.append(token_text) - toktypes.append(toktype) - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") - - ctx_length = 0 - if "max_sequence_length" in self.hparams: - ctx_length = self.hparams["max_sequence_length"] - elif "max_position_embeddings" in self.hparams: - ctx_length = self.hparams["max_position_embeddings"] - elif "model_max_length" in self.hparams: - ctx_length = self.hparams["model_max_length"] - else: - raise ValueError("gguf: can not find ctx length parameter.") - - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_source_hf_repo(hf_repo) - self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_context_length(ctx_length) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - 
- head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - # HF models permute some of the tensors, so we need to undo that - if name.endswith("q_proj.weight"): - data_torch = self._reverse_hf_permute(data_torch, head_count, head_count) - if name.endswith("k_proj.weight"): - data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv) - - return [(self.map_tensor_name(name), data_torch)] - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - -@Model.register("FalconForCausalLM", "RWForCausalLM") -class FalconModel(Model): - model_arch = gguf.MODEL_ARCH.FALCON - - def set_gguf_parameters(self): - block_count = self.hparams.get("num_hidden_layers") - if block_count is None: - block_count = self.hparams["n_layer"] # old name - - n_head = self.hparams.get("num_attention_heads") - if n_head is None: - n_head = self.hparams["n_head"] # old name - - n_head_kv = self.hparams.get("num_kv_heads") - if n_head_kv is None: - n_head_kv = self.hparams.get("n_head_kv", 1) # old name - - self.gguf_writer.add_name("Falcon") - self.gguf_writer.add_context_length(2048) # not in config.json - self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head_kv) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - # QKV tensor transform - # The original query_key_value tensor contains n_head_kv "kv groups", - # each consisting of n_head/n_head_kv query weights followed by one key - # and one value weight (shared by all query heads in the kv group). - # This layout makes it a big pain to work with in GGML. - # So we rearrange them here,, so that we have n_head query weights - # followed by n_head_kv key weights followed by n_head_kv value weights, - # in contiguous fashion. 
- # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py - - if "query_key_value" in name: - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - n_head_kv = self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1 - head_dim = self.hparams["hidden_size"] // n_head - - qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) - q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) - k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) - v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) - data_torch = torch.cat((q, k, v)).reshape_as(data_torch) - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("GPTBigCodeForCausalLM") -class StarCoderModel(Model): - model_arch = gguf.MODEL_ARCH.STARCODER - - def set_gguf_parameters(self): - block_count = self.hparams["n_layer"] - - self.gguf_writer.add_name("StarCoder") - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(1) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - -@Model.register("GPTRefactForCausalLM") -class RefactModel(Model): - model_arch = gguf.MODEL_ARCH.REFACT - - def set_vocab(self): - super().set_vocab() - - # TODO: how to determine special FIM tokens automatically? - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, - special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot']) - special_vocab._set_special_token("prefix", 1) - special_vocab._set_special_token("suffix", 3) - special_vocab._set_special_token("middle", 2) - special_vocab._set_special_token("fsep", 4) # is this correct? - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - hidden_dim = self.hparams["n_embd"] - inner_dim = 4 * hidden_dim - hidden_dim = int(2 * inner_dim / 3) - multiple_of = 256 - ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - - block_count = self.hparams["n_layer"] - - self.gguf_writer.add_name("Refact") - # refact uses Alibi. So this is from config.json which might be used by training. 
- self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - - self.gguf_writer.add_feed_forward_length(ff_dim) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(1) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - hidden_dim = self.hparams["n_embd"] - inner_dim = 4 * hidden_dim - hidden_dim = int(2 * inner_dim / 3) - multiple_of = 256 - ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - n_head = self.hparams["n_head"] - n_head_kv = 1 - head_dim = self.hparams["n_embd"] // n_head - - tensors: list[tuple[str, Tensor]] = [] - - if bid is not None: - if name == f"transformer.h.{bid}.attn.kv.weight": - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), data_torch[:n_head_kv * head_dim])) - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), data_torch[n_head_kv * head_dim:])) - elif name == f"transformer.h.{bid}.attn.q.weight": - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch)) - elif name == f"transformer.h.{bid}.mlp.gate_up_proj.weight": - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])) - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])) - - if len(tensors) == 0: - tensors.append((self.map_tensor_name(name), data_torch)) - - return tensors - - -@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") -class StableLMModel(Model): - model_arch = gguf.MODEL_ARCH.STABLELM - - def set_vocab(self): - if (self.dir_model / "tokenizer.json").is_file(): - self._set_vocab_gpt2() - else: - # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab - self._set_vocab_qwen() - - def set_gguf_parameters(self): - hparams = self.hparams - block_count = hparams["num_hidden_layers"] - - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) - self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) - self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) - self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) - self.gguf_writer.add_file_type(self.ftype) - - _q_norms: list[dict[str, Tensor]] | None = None - _k_norms: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams["num_key_value_heads"] - - if name.find("q_layernorm.norms") != -1: - assert bid is not None - - if self._q_norms is None: - self._q_norms = [{} for _ in range(self.block_count)] - - 
self._q_norms[bid][name] = data_torch - - if len(self._q_norms[bid]) >= n_head: - return self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm") - else: - return [] - - if name.find("k_layernorm.norms") != -1: - assert bid is not None - - if self._k_norms is None: - self._k_norms = [{} for _ in range(self.block_count)] - - self._k_norms[bid][name] = data_torch - - if len(self._k_norms[bid]) >= n_kv_head: - return self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm") - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm"): - datas: list[Tensor] = [] - # extract the norms in order - for xid in range(n_head): - ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight" - datas.append(norms[ename]) - del norms[ename] - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" - new_name = self.map_tensor_name(merged_name) - - return [(new_name, data_torch)] - - def write_tensors(self): - super().write_tensors() - - if self._q_norms is not None or self._k_norms is not None: - # flatten two `list[dict[str, Tensor]]` into a single `list[str]` - norms = ( - [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None else [] - ) + ( - [k for d in self._k_norms for k in d.keys()] if self._k_norms is not None else [] - ) - if len(norms) > 0: - raise ValueError(f"Unprocessed norms: {norms}") - - -@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") -class LlamaModel(Model): - model_arch = gguf.MODEL_ARCH.LLAMA - - def set_vocab(self): - try: - self. _set_vocab_sentencepiece() - except FileNotFoundError: - try: - self._set_vocab_llama_hf() - except (FileNotFoundError, TypeError): - # Llama 3 - self._set_vocab_gpt2() - - # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) - if self.hparams.get("vocab_size", 32000) == 32016: - special_vocab = gguf.SpecialVocab( - self.dir_model, load_merges=False, - special_token_types = ['prefix', 'suffix', 'middle', 'eot'] - ) - special_vocab._set_special_token("prefix", 32007) - special_vocab._set_special_token("suffix", 32008) - special_vocab._set_special_token("middle", 32009) - special_vocab._set_special_token("eot", 32010) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) - - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) - - # Apply to granite small models only - if self.hparams.get("vocab_size", 32000) == 49152: - self.gguf_writer.add_add_bos_token(False) - - @staticmethod - def permute(weights: Tensor, n_head: int, n_head_kv: int | None): - if 
n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - - # process the experts separately - if name.find("block_sparse_moe.experts") != -1: - n_experts = self.hparams["num_local_experts"] - - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for wid in ["w1", "w2", "w3"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - def write_tensors(self): - super().write_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@Model.register("GrokForCausalLM") -class GrokModel(Model): - model_arch = gguf.MODEL_ARCH.GROK - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_name("Grok") - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find(".moe.") != -1: - n_experts = self.hparams["num_local_experts"] - - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for wid in ["linear", "linear_1", "linear_v"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("DbrxForCausalLM") -class DbrxModel(Model): - model_arch = gguf.MODEL_ARCH.DBRX - - def set_gguf_parameters(self): - ffn_config = 
self.hparams["ffn_config"] - attn_config = self.hparams["attn_config"] - self.gguf_writer.add_name(self.hparams["model_type"]) - self.gguf_writer.add_block_count(self.hparams["n_layers"]) - - self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"]) - - self.gguf_writer.add_head_count(self.hparams["n_heads"]) - self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"]) - - self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"]) - - self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"]) - self.gguf_writer.add_file_type(self.ftype) - - self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"]) - self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"]) - - self.gguf_writer.add_layer_norm_eps(1e-5) - - self.gguf_writer.add_file_type(self.ftype) - logger.info(f"gguf: file type = {self.ftype}") - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - n_expert = self.hparams["ffn_config"]["moe_num_experts"] - n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] - n_embd = self.hparams["d_model"] - - # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose - # original implementation expects (n_expert, n_ff, n_embd) for all experts weights - # But llama.cpp moe graph works differently - # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions - # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor - exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} - "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} - "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} - experts = False - - for exp_tensor_name in exp_tensor_names.keys(): - if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: - experts = True - data_torch = data_torch.view(n_expert, n_ff, n_embd) - if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None: - data_torch = data_torch.permute(*permute_tensor) - break - - # map tensor names - # In MoE models the ffn tensors are typically most of the model weights, - # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight. 
- # Every other model has the weight names ending in .weight, - # let's assume that is the convention which is not the case for dbrx: - # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 - new_name = self.map_tensor_name(name if not experts else name + ".weight", try_suffixes=(".weight",)) - - return [(new_name, data_torch)] - - def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: - del name, new_name, bid # unused - - return n_dims > 1 - - -@Model.register("MiniCPMForCausalLM") -class MiniCPMModel(Model): - model_arch = gguf.MODEL_ARCH.MINICPM - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - self.gguf_writer.add_name("MiniCPM") - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - - def set_vocab(self): - self._set_vocab_llama_hf() - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - # HF models permute some of the tensors, so we need to undo that - if name.endswith(("q_proj.weight")): - data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight")): - data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head) - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("QWenLMHeadModel") -class QwenModel(Model): - model_arch = gguf.MODEL_ARCH.QWEN - - @staticmethod - def token_bytes_to_string(b): - from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode - byte_encoder = bytes_to_unicode() - return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) - - @staticmethod - def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: - parts = [bytes([b]) for b in token] - while True: - min_idx = None - min_rank = None - for i, pair in enumerate(zip(parts[:-1], parts[1:])): - rank = mergeable_ranks.get(pair[0] + pair[1]) - if rank is not None and (min_rank is None or rank < min_rank): - min_idx = i - min_rank = rank - if min_rank is None or (max_rank is not None and min_rank >= max_rank): - break - assert min_idx is not None - parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] - return parts - - def set_vocab(self): - self._set_vocab_qwen() - - def set_gguf_parameters(self): - self.gguf_writer.add_name("Qwen") - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - 
self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - -@Model.register("Qwen2ForCausalLM") -class Qwen2Model(Model): - model_arch = gguf.MODEL_ARCH.QWEN2 - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() - - -@Model.register("Qwen2MoeForCausalLM") -class Qwen2MoeModel(Model): - model_arch = gguf.MODEL_ARCH.QWEN2MOE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if (n_experts := self.hparams.get("num_experts")) is not None: - self.gguf_writer.add_expert_count(n_experts) - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find("experts") != -1: - n_experts = self.hparams["num_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - def write_tensors(self): - super().write_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@Model.register("GPT2LMHeadModel") -class GPT2Model(Model): - model_arch = gguf.MODEL_ARCH.GPT2 - - def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_block_count(self.hparams["n_layer"]) - self.gguf_writer.add_context_length(self.hparams["n_ctx"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - tensors: list[tuple[str, Tensor]] = [] - - # we don't need these - if name.endswith((".attn.bias", ".attn.masked_bias")): - return tensors - - if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): - data_torch = data_torch.transpose(1, 0) - - new_name = 
self.map_tensor_name(name) - - tensors.append((new_name, data_torch)) - - # note: GPT2 output is tied to (same as) wte in original model - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - - return tensors - - -@Model.register("PhiForCausalLM") -class Phi2Model(Model): - model_arch = gguf.MODEL_ARCH.PHI2 - - def set_gguf_parameters(self): - block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) - - rot_pct = self.find_hparam(["partial_rotary_factor"]) - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - - self.gguf_writer.add_name("Phi2") - self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) - - self.gguf_writer.add_embedding_length(n_embd) - self.gguf_writer.add_feed_forward_length(4 * n_embd) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"])) - self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_add_bos_token(False) - - -@Model.register("Phi3ForCausalLM") -class Phi3MiniModel(Model): - model_arch = gguf.MODEL_ARCH.PHI3 - - def set_vocab(self): - from sentencepiece import SentencePieceProcessor - - tokenizer_path = self.dir_model / 'tokenizer.model' - - if not tokenizer_path.is_file(): - raise ValueError(f'Error: Missing {tokenizer_path}') - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - - for key in added_tokens_json: - token_id = added_tokens_json[key] - if (token_id >= vocab_size): - logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - tokens[token_id] = key.encode("utf-8") - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) - for token_id, foken_data in added_tokens_decoder.items(): - token_id = int(token_id) - token = 
foken_data["content"].encode("utf-8") - if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN: - assert tokens[token_id] == token - tokens[token_id] = token - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - if foken_data.get("special"): - toktypes[token_id] = SentencePieceTokenTypes.CONTROL - - tokenizer_file = self.dir_model / 'tokenizer.json' - if tokenizer_file.is_file(): - with open(tokenizer_file, "r", encoding="utf-8") as f: - tokenizer_json = json.load(f) - added_tokens = tokenizer_json.get("added_tokens", []) - for foken_data in added_tokens: - token_id = int(foken_data["id"]) - token = foken_data["content"].encode("utf-8") - if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN: - assert tokens[token_id] == token - tokens[token_id] = token - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - if foken_data.get("special"): - toktypes[token_id] = SentencePieceTokenTypes.CONTROL - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) - - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"]) - rms_eps = self.find_hparam(["rms_norm_eps"]) - max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) - orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) - rope_dims = n_embd // n_head - - self.gguf_writer.add_name("Phi3") - self.gguf_writer.add_context_length(max_pos_embds) - self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds) - self.gguf_writer.add_embedding_length(n_embd) - self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"])) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head_kv) - self.gguf_writer.add_layer_norm_rms_eps(rms_eps) - self.gguf_writer.add_rope_dimension_count(rope_dims) - self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) - self.gguf_writer.add_file_type(self.ftype) - - # write rope scaling for long context (128k) model - rope_scaling = self.find_hparam(['rope_scaling'], True) - if (rope_scaling is None): - return - - scale = max_pos_embds / orig_max_pos_embds - - rope_scaling_type = rope_scaling.get('type', '').lower() - if len(rope_scaling_type) == 0: - raise KeyError('Missing the required key rope_scaling.type') - - if rope_scaling_type == 'su': - attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0 - elif rope_scaling_type == 'yarn': - attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0 - else: - raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet') - - self.gguf_writer.add_rope_scaling_attn_factors(attn_factor) - - long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) - - if long_factors is None or short_factors is None: - raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') - - if 
len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: - raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') - - self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32)) - self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32)) - - -@Model.register("PlamoForCausalLM") -class PlamoModel(Model): - model_arch = gguf.MODEL_ARCH.PLAMO - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - hparams = self.hparams - block_count = hparams["num_hidden_layers"] - - self.gguf_writer.add_name("PLaMo") - self.gguf_writer.add_context_length(4096) # not in config.json - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong - self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - - def shuffle_attn_q_weight(self, data_torch): - assert data_torch.size() == (5120, 5120) - data_torch = data_torch.reshape(8, 5, 128, 5120) - data_torch = torch.permute(data_torch, (1, 0, 2, 3)) - data_torch = torch.reshape(data_torch, (5120, 5120)) - return data_torch - - def shuffle_attn_output_weight(self, data_torch): - assert data_torch.size() == (5120, 5120) - data_torch = data_torch.reshape(5120, 8, 5, 128) - data_torch = torch.permute(data_torch, (0, 2, 1, 3)) - data_torch = torch.reshape(data_torch, (5120, 5120)) - return data_torch - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - new_name = self.map_tensor_name(name) - - # shuffle for broadcasting of gqa in ggml_mul_mat - if new_name.endswith("attn_q.weight"): - data_torch = self.shuffle_attn_q_weight(data_torch) - elif new_name.endswith("attn_output.weight"): - data_torch = self.shuffle_attn_output_weight(data_torch) - - return [(new_name, data_torch)] - - -@Model.register("CodeShellForCausalLM") -class CodeShellModel(Model): - model_arch = gguf.MODEL_ARCH.CODESHELL - - def set_gguf_parameters(self): - block_count = self.hparams["n_layer"] - - self.gguf_writer.add_name("CodeShell") - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_rope_freq_base(10000.0) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(1.0) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - new_name = self.map_tensor_name(name) - - tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)] - - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - assert self.tensor_names is not None - - if all(s not in 
self.tensor_names for s in ("lm_head.weight", "output.weight")): - # copy tok_embd.weight to output.weight - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - - return tensors - - -@Model.register("InternLM2ForCausalLM") -class InternLM2Model(Model): - model_arch = gguf.MODEL_ARCH.INTERNLM2 - - def set_vocab(self): - # (TODO): Is there a better way? - # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character - # \x00 specially and convert it into an emoji character to prevent it from being mistakenly - # recognized as an empty string in C++. - from sentencepiece import SentencePieceProcessor - from sentencepiece import sentencepiece_model_pb2 as model - - tokenizer_path = self.dir_model / 'tokenizer.model' - - tokens: list[bytes] = [] - scores: list[float] = [] - toktypes: list[int] = [] - - if not tokenizer_path.is_file(): - logger.error(f'Error: Missing {tokenizer_path}') - sys.exit(1) - - sentencepiece_model = model.ModelProto() - sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - for token_id in range(vocab_size): - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - if text == b"\x00": - # (TODO): fixme - # Hack here and replace the \x00 characters. - logger.warning(f"InternLM2 convert token '{text}' to '🐉'!") - text = "🐉".encode("utf-8") - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - - for key in added_tokens_json: - tokens.append(key.encode("utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.USER_DEFINED) - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_add_space_prefix(add_prefix) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - old_eos = special_vocab.special_token_ids["eos"] - if "chat" in os.path.basename(self.dir_model.absolute()): - # For the chat model, we replace the eos with '<|im_end|>'. 
- # TODO: this is a hack, should be fixed - # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048 - special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer) - logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ -in chat mode so that the conversation can end normally.") - - special_vocab.add_to_gguf(self.gguf_writer) - - def _try_get_sft_eos(self, tokenizer): - unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]') - im_end_list = tokenizer.Encode('<|im_end|>') - eos_token = None - assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1) - if len(unused_145_list) == 1: - eos_token = unused_145_list[0] - if len(im_end_list) == 1: - eos_token = im_end_list[0] - assert eos_token - return eos_token - - def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - def set_gguf_parameters(self): - self.gguf_writer.add_name("InternLM2") - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - num_heads = self.hparams["num_attention_heads"] - num_kv_heads = self.hparams["num_key_value_heads"] - hidden_size = self.hparams["hidden_size"] - q_per_kv = num_heads // num_kv_heads - head_dim = hidden_size // num_heads - num_groups = num_heads // q_per_kv - - qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv" - - if re.match(qkv_pattern, name): - bid = re.findall(qkv_pattern, name)[0] - qkv = data_torch - # qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim) - qkv = qkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim)) - q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :] - # The model weights of q and k equire additional reshape. 
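The reshape/swapaxes used by _hf_permute_qk above (and by the similar permute helpers elsewhere in this file) reorders the per-head rotary dimensions between the HF checkpoint layout and the layout llama.cpp expects. A minimal sketch of that permutation and its explicit inverse, with toy shapes chosen only for illustration:

    import torch

    def permute(w: torch.Tensor, n_head: int) -> torch.Tensor:
        # same reshape/swapaxes pattern as _hf_permute_qk
        return (w.reshape(n_head, 2, w.shape[0] // n_head // 2, *w.shape[1:])
                .swapaxes(1, 2)
                .reshape(w.shape))

    def unpermute(w: torch.Tensor, n_head: int) -> torch.Tensor:
        # explicit inverse: swap the two reshape dimensions back
        return (w.reshape(n_head, w.shape[0] // n_head // 2, 2, *w.shape[1:])
                .swapaxes(1, 2)
                .reshape(w.shape))

    w = torch.arange(16 * 6, dtype=torch.float32).reshape(16, 6)  # 2 toy heads of dimension 8
    assert torch.equal(unpermute(permute(w, n_head=2), n_head=2), w)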
- # q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads) - q = self._hf_permute_qk(q.reshape((q.shape[0], -1)).T, num_heads, num_heads) - # k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads) - k = self._hf_permute_qk(k.reshape((k.shape[0], -1)).T, num_heads, num_kv_heads) - # v = rearrange(v, " o g n i -> o (g n i)").T - v = v.reshape((v.shape[0], -1)).T - return [ - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v), - ] - else: - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("BertModel", "CamembertModel") -class BertModel(Model): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.vocab_size = None - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_causal_attention(False) - - # get pooling path - pooling_path = None - module_path = self.dir_model / "modules.json" - if module_path.is_file(): - with open(module_path, encoding="utf-8") as f: - modules = json.load(f) - for mod in modules: - if mod["type"] == "sentence_transformers.models.Pooling": - pooling_path = mod["path"] - break - - # get pooling type - if pooling_path is not None: - with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f: - pooling = json.load(f) - if pooling["pooling_mode_mean_tokens"]: - pooling_type = gguf.PoolingType.MEAN - elif pooling["pooling_mode_cls_token"]: - pooling_type = gguf.PoolingType.CLS - else: - raise NotImplementedError("Only MEAN and CLS pooling types supported") - self.gguf_writer.add_pooling_type(pooling_type) - - def set_vocab(self): - tokens, toktypes, tokpre = self.get_vocab_base() - self.vocab_size = len(tokens) - - # we need this to validate the size of the token_type embeddings - # though currently we are passing all zeros to the token_type embeddings - self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B" - - # convert to phantom space vocab - def phantom(tok): - if tok.startswith("[") and tok.endswith("]"): - return tok - if tok.startswith("##"): - return tok[2:] - return "\u2581" + tok - tokens = list(map(phantom, tokens)) - - # add vocab to gguf - self.gguf_writer.add_tokenizer_model("bert") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - # handle special tokens - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - # we are only using BERT for embeddings so we don't need the pooling layer - if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): - return [] # we don't need these - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("NomicBertModel") -class NomicBertModel(BertModel): - model_arch = gguf.MODEL_ARCH.NOMIC_BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # the HF config claims n_ctx=8192, but it uses RoPE scaling - self.hparams["n_ctx"] = 2048 - - # SwigLU activation - assert self.hparams["activation_function"] == "swiglu" - # this doesn't do anything in the HF version - assert self.hparams["causal"] is False - # no bias tensors - 
assert self.hparams["qkv_proj_bias"] is False - assert self.hparams["mlp_fc1_bias"] is False - assert self.hparams["mlp_fc2_bias"] is False - # norm at end of layer - assert self.hparams["prenorm"] is False - # standard RoPE - assert self.hparams["rotary_emb_fraction"] == 1.0 - assert self.hparams["rotary_emb_interleaved"] is False - assert self.hparams["rotary_emb_scale_base"] is None - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) - - -@Model.register("GemmaForCausalLM") -class GemmaModel(Model): - model_arch = gguf.MODEL_ARCH.GEMMA - - def set_vocab(self): - self._set_vocab_sentencepiece() - - # TODO: these special tokens should be exported only for the CodeGemma family - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, - special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot']) - special_vocab._set_special_token("prefix", 67) - special_vocab._set_special_token("suffix", 69) - special_vocab._set_special_token("middle", 68) - special_vocab._set_special_token("fsep", 70) - special_vocab._set_special_token("eot", 107) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - hparams = self.hparams - block_count = hparams["num_hidden_layers"] - - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_key_length(hparams["head_dim"]) - self.gguf_writer.add_value_length(hparams["head_dim"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - # lm_head is not used in llama.cpp, while autoawq will include this tensor in model - # To prevent errors, skip loading lm_head.weight. 
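The +1 applied to norm.weight in the Gemma modify_tensors below follows from the referenced GemmaRMSNorm implementation, which stores gamma - 1 and multiplies the normalized activations by (1 + weight); adding 1 during conversion lets llama.cpp's plain RMSNorm, which applies the full gamma, produce the same result. A small equivalence sketch with toy shapes and an assumed epsilon:

    import torch

    def hf_gemma_rmsnorm(x: torch.Tensor, w: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
        # HF stores (gamma - 1) in w and scales the normalized input by (1 + w)
        x = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
        return x * (1.0 + w)

    def plain_rmsnorm(x: torch.Tensor, gamma: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
        # llama.cpp-style RMSNorm applies the full gamma directly
        x = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
        return x * gamma

    x = torch.randn(3, 8)
    w = torch.randn(8)  # weight as stored in the HF checkpoint
    assert torch.allclose(hf_gemma_rmsnorm(x, w), plain_rmsnorm(x, w + 1))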
- if name == "lm_head.weight": - logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") - return [] - - # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 - if name.endswith("norm.weight"): - data_torch = data_torch + 1 - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("Starcoder2ForCausalLM") -class StarCoder2Model(Model): - model_arch = gguf.MODEL_ARCH.STARCODER2 - - -@Model.register("MambaForCausalLM", "MambaLMHeadModel") -class MambaModel(Model): - model_arch = gguf.MODEL_ARCH.MAMBA - - def set_vocab(self): - vocab_size = self.hparams["vocab_size"] - # Round vocab size to next multiple of 8 - pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8) - # pad using ceiling division - # ref: https://stackoverflow.com/a/17511341/22827863 - vocab_size = -(vocab_size // -pad_vocab) * pad_vocab - self.hparams["vocab_size"] = vocab_size - - if (self.dir_model / "tokenizer.json").is_file(): - self._set_vocab_gpt2() - elif (self.dir_model / "tokenizer.model").is_file(): - self._set_vocab_sentencepiece() - else: - # Use the GPT-NeoX tokenizer when no tokenizer files are present - tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf" - logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") - neox_reader = gguf.GGUFReader(tokenizer_path, "r") - - field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL) - self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2") - - field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE) - self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt") - - field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST) - assert field - self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) - assert field - self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES) - assert field - self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID) - self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID) - self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID) - self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID) - self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0) - - def set_gguf_parameters(self): - d_model = self.find_hparam(["hidden_size", "d_model"]) - d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 - d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model - d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 - # ceiling division - # ref: https://stackoverflow.com/a/17511341/22827863 - # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 - dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) - rms_norm_eps = 
self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 - - # Fail early for models which don't have a block expansion factor of 2 - assert d_inner == 2 * d_model - - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default - self.gguf_writer.add_embedding_length(d_model) - self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading - self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading - self.gguf_writer.add_block_count(self.hparams["n_layer"]) - self.gguf_writer.add_ssm_conv_kernel(d_conv) - self.gguf_writer.add_ssm_inner_size(d_inner) - self.gguf_writer.add_ssm_state_size(d_state) - self.gguf_writer.add_ssm_time_step_rank(dt_rank) - self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_file_type(self.ftype) - - _tok_embd = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) - tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) - - new_name = self.map_tensor_name(name) - - if name.endswith(".A_log"): - logger.debug("A_log --> A ==> " + new_name) - data_torch = -torch.exp(data_torch) - - # assuming token_embd.weight is seen before output.weight - if self._tok_embd is not None and new_name == output_name: - if torch.equal(self._tok_embd, data_torch): - logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting") - return [] - elif new_name == tok_embd_name: - self._tok_embd = data_torch - - return [(new_name, data_torch)] - - def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: - del n_dims # unused - - return bid is not None and new_name in ( - self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [ - gguf.MODEL_TENSOR.SSM_CONV1D, - gguf.MODEL_TENSOR.SSM_X, - gguf.MODEL_TENSOR.SSM_DT, - gguf.MODEL_TENSOR.SSM_A, - gguf.MODEL_TENSOR.SSM_D, - ] - ) - - -@Model.register("CohereForCausalLM") -class CommandR2Model(Model): - model_arch = gguf.MODEL_ARCH.COMMAND_R - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # max_position_embeddings = 8192 in config.json but model was actually - # trained on 128k context length - # aya-23 models don't have model_max_length specified - self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"]) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - - -@Model.register("OlmoForCausalLM") -@Model.register("OLMoForCausalLM") -class OlmoModel(Model): - model_arch = gguf.MODEL_ARCH.OLMO - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_layer_norm_eps(1e-5) - clip_qkv = self.hparams.get("clip_qkv") - if clip_qkv is not None: - self.gguf_writer.add_clamp_kqv(clip_qkv) - - # Same as super class, but permuting q_proj, k_proj - # Copied from: LlamaModel - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - if name.endswith("q_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, 
n_head) - if name.endswith("k_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("JinaBertModel", "JinaBertForMaskedLM") -class JinaBertV2Model(BertModel): - model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.intermediate_size = self.hparams["intermediate_size"] - - def get_tensors(self): - for name, data in super().get_tensors(): - if 'gated_layers' in name: - d1 = data[:self.intermediate_size, :] - name1 = name.replace('gated_layers', 'gated_layers_w') - d2 = data[self.intermediate_size:, :] - name2 = name.replace('gated_layers', 'gated_layers_v') - yield name1, d1 - yield name2, d2 - continue - - yield name, data - - def set_vocab(self, *args, **kwargs): - tokenizer_class = 'BertTokenizer' - with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: - tokenizer_class = json.load(f)['tokenizer_class'] - - if tokenizer_class == 'BertTokenizer': - super().set_vocab() - elif tokenizer_class == 'RobertaTokenizer': - self._set_vocab_gpt2() - self.gguf_writer.add_token_type_count(2) - else: - raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel') - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(True) - - -@Model.register("ArcticForCausalLM") -class ArcticModel(Model): - model_arch = gguf.MODEL_ARCH.ARCTIC - - def set_vocab(self): - # The reason for using a custom implementation here is that the - # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from - # tokenizer.model and used them as BOS and EOS instead of adding new tokens. - from sentencepiece import SentencePieceProcessor - - tokenizer_path = self.dir_model / 'tokenizer.model' - - if not tokenizer_path.is_file(): - logger.error(f'Error: Missing {tokenizer_path}') - sys.exit(1) - - # Read the whole vocabulary from the tokenizer.model file - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - # Use the added_tokens_decoder field from tokeniser_config.json as the source - # of information about added/redefined tokens and modify them accordingly. 
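For reference, the tokenizer_config.json shape the loop below consumes, written as a Python literal; the field names match the code, the ids echo the redefined 31998/31999 tokens mentioned above, and the content strings are placeholder values, not the real Arctic tokens:

    # illustrative only; the real data is read from tokenizer_config.json in the model directory
    example_tokenizer_config = {
        "unk_token": "<unk>",
        "added_tokens_decoder": {
            "31998": {"content": "<|placeholder_bos|>", "special": True},
            "31999": {"content": "<|placeholder_eos|>", "special": True},
        },
    }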
- tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - - if "added_tokens_decoder" in tokenizer_config_json: - added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"] - for token_id, token_json in added_tokens_decoder.items(): - token_id = int(token_id) - if (token_id >= vocab_size): - logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - token_content = token_json["content"] - token_type = SentencePieceTokenTypes.USER_DEFINED - token_score = -10000.0 - - # Map unk_token to UNKNOWN, other special tokens to CONTROL - # Set the score to 0.0 as in the original tokenizer.model - if ("special" in token_json) and token_json["special"]: - if token_content == tokenizer_config_json["unk_token"]: - token_type = SentencePieceTokenTypes.UNKNOWN - else: - token_type = SentencePieceTokenTypes.CONTROL - token_score = 0.0 - - logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})") - tokens[token_id] = token_content.encode("utf-8") - toktypes[token_id] = token_type - scores[token_id] = token_score - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - if name.endswith("q_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith("k_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - - # process the experts separately - if name.find("block_sparse_moe.experts") != -1: - n_experts = self.hparams["num_local_experts"] - - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for wid in ["w1", "w2", "w3"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - def write_tensors(self): - super().write_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise 
ValueError(f"Unprocessed experts: {experts}") - - -@Model.register("DeepseekV2ForCausalLM") -class DeepseekV2Model(Model): - model_arch = gguf.MODEL_ARCH.DEEPSEEK2 - - def set_vocab(self): - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - - self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: - self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) - self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) - self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) - self.gguf_writer.add_value_length(hparams["v_head_dim"]) - self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) - self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) - self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) - self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) - self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) - - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "yarn": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) - self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"]) - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find("mlp.experts") != -1: - n_experts = self.hparams["n_routed_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - def write_tensors(self): - super().write_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -###### CONVERSION LOGIC ###### - - -# tree of lazy tensors -class LazyTorchTensor(gguf.LazyBase): - _tensor_type = torch.Tensor - # to keep the type-checker happy - dtype: torch.dtype - shape: torch.Size - - # only used when converting a torch.Tensor to a np.ndarray - _dtype_map: dict[torch.dtype, type] = { - torch.float16: np.float16, - torch.float32: np.float32, - } - - def numpy(self) -> gguf.LazyNumpyTensor: - dtype = self._dtype_map[self.dtype] - return 
gguf.LazyNumpyTensor( - meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), - lazy=self._lazy, - args=(self,), - func=(lambda s: s[0].numpy()) - ) - - @classmethod - def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor: - return torch.empty(size=shape, dtype=dtype, device="meta") - - @classmethod - def __torch_function__(cls, func, types, args=(), kwargs=None): - del types # unused - - if kwargs is None: - kwargs = {} - - if func is torch.Tensor.numpy: - return args[0].numpy() - - return LazyTorchTensor._wrap_fn(func)(*args, **kwargs) - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="Convert a huggingface model to a GGML compatible file") - parser.add_argument( - "--vocab-only", action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--awq-path", type=Path, default=None, - help="Path to scale awq cache file", - ) - parser.add_argument( - "--outfile", type=Path, - help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", - ) - parser.add_argument( - "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16", - help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", - ) - parser.add_argument( - "--bigendian", action="store_true", - help="model is executed on big endian machine", - ) - parser.add_argument( - "model", type=Path, - help="directory containing model file", - ) - parser.add_argument( - "--use-temp-file", action="store_true", - help="use the tempfile library while processing (helpful when running out of memory, process killed)", - ) - parser.add_argument( - "--no-lazy", action="store_true", - help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)", - ) - parser.add_argument( - "--model-name", type=str, default=None, - help="name of the model", - ) - parser.add_argument( - "--verbose", action="store_true", - help="increase output verbosity", - ) - parser.add_argument( - "--split", action="store_true", - help="split the converted model into multiple files" - ) - parser.add_argument( - "--split-max-tensors", type=int, - help="max tensors in each split" - ) - parser.add_argument( - "--split-max-size", type=str, - help="max size per split N(M|G)" - ) - parser.add_argument( - "--dry-run", action="store_true", - help="only print out a split plan and exit, without writing any new files" - ) - parser.add_argument( - "--small-first-shard", action="store_true", - help="do not add tensors to the first shard (disabled by default)" - ) - - return parser.parse_args() - - -def main() -> None: - args = parse_args() - - logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) - - dir_model = args.model - - if args.awq_path: - sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) - from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] - tmp_model_path = args.model / "weighted_model" - dir_model = tmp_model_path - if tmp_model_path.is_dir(): - logger.info(f"{tmp_model_path} exists as a weighted model.") - else: - tmp_model_path.mkdir(parents=True, exist_ok=True) - logger.info("Saving new weighted model ...") - add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) - logger.info(f"Saved weighted model at {tmp_model_path}.") - - if not dir_model.is_dir(): - logger.error(f'Error: 
{args.model} is not a directory') - sys.exit(1) - - if args.split and not (args.split_max_tensors or args.split_max_size): - raise ValueError("Need to specify one of --split-max-tensors or --split-max-size when splitting") - - if args.split_max_tensors and args.split_max_size: - raise ValueError("Can't specify both --split-max-tensors and --split-max-size") - - split_arguments = gguf.SplitArguments(args) - - ftype_map: dict[str, gguf.LlamaFileType] = { - "f32": gguf.LlamaFileType.ALL_F32, - "f16": gguf.LlamaFileType.MOSTLY_F16, - "bf16": gguf.LlamaFileType.MOSTLY_BF16, - "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, - "auto": gguf.LlamaFileType.GUESSED, - } - - if args.outfile is not None: - fname_out = args.outfile - else: - # output in the same directory as the model by default - fname_out = dir_model / 'ggml-model-{ftype}.gguf' - - logger.info(f"Loading model: {dir_model.name}") - - hparams = Model.load_hparams(dir_model) - - with torch.inference_mode(): - try: - model_class = Model.from_model_architecture(hparams["architectures"][0]) - except NotImplementedError: - logger.error(f"Model {hparams['architectures'][0]} is not supported") - sys.exit(1) - - model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, - args.no_lazy, split_arguments) - - logger.info("Set model parameters") - model_instance.set_gguf_parameters() - - logger.info("Set model tokenizer") - model_instance.set_vocab() - - model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) - - if args.vocab_only: - logger.info(f"Exporting model vocab to '{model_instance.fname_out}'") - model_instance.write_vocab() - else: - logger.info(f"Exporting model to '{model_instance.fname_out}'") - model_instance.write() - - logger.info(f"Model successfully exported to '{model_instance.fname_out}'") - - -if __name__ == '__main__': - main() +#!/usr/bin/env python3 + +from __future__ import annotations + +import logging +import argparse +import contextlib +import json +import os +import re +import sys +from enum import IntEnum +from pathlib import Path +from hashlib import sha256 +from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast + +import math +import numpy as np +import torch + +if TYPE_CHECKING: + from torch import Tensor + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) +import importlib +gguf = importlib.import_module("gguf-py.gguf") +# import gguf + +logger = logging.getLogger("hf-to-gguf") + + +###### MODEL DEFINITIONS ###### + +class SentencePieceTokenTypes(IntEnum): + NORMAL = 1 + UNKNOWN = 2 + CONTROL = 3 + USER_DEFINED = 4 + UNUSED = 5 + BYTE = 6 + + +AnyModel = TypeVar("AnyModel", bound="type[Model]") + + +class Model: + _model_classes: dict[str, type[Model]] = {} + + dir_model: Path + ftype: int + is_big_endian: bool + endianess: gguf.GGUFEndian + use_temp_file: bool + lazy: bool + part_names: list[str] + is_safetensors: bool + hparams: dict[str, Any] + block_count: int + tensor_map: gguf.TensorNameMap + tensor_names: set[str] | None + fname_out: Path + gguf_writer: gguf.GGUFWriter + + # subclasses should define this! 
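Each architecture in this file is wired up the same way: an @Model.register(...) decorator keyed on the HF architecture name, plus a class-level model_arch that __init_subclass__ below enforces. A hedged sketch of what a new entry would look like, with a placeholder architecture string and an existing enum value reused purely for illustration:

    @Model.register("MyToyForCausalLM")      # placeholder HF architecture name
    class MyToyModel(Model):
        model_arch = gguf.MODEL_ARCH.LLAMA   # reusing an existing arch value for the sketch

        def set_gguf_parameters(self):
            super().set_gguf_parameters()
            # architecture-specific metadata would be added via self.gguf_writer here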
+ model_arch: gguf.MODEL_ARCH + + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, + split_arguments: gguf.SplitArguments): + if type(self) is Model: + raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") + self.dir_model = dir_model + self.ftype = ftype + self.is_big_endian = is_big_endian + self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE + self.use_temp_file = use_temp_file + self.lazy = not eager + self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors") + self.is_safetensors = len(self.part_names) > 0 + if not self.is_safetensors: + self.part_names = Model.get_model_part_names(self.dir_model, ".bin") + self.hparams = Model.load_hparams(self.dir_model) + self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + self.tensor_names = None + if self.ftype == gguf.LlamaFileType.GUESSED: + # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. + _, first_tensor = next(self.get_tensors()) + if first_tensor.dtype == torch.float16: + logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})") + self.ftype = gguf.LlamaFileType.MOSTLY_F16 + else: + logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})") + self.ftype = gguf.LlamaFileType.MOSTLY_BF16 + ftype_up: str = self.ftype.name.partition("_")[2].upper() + ftype_lw: str = ftype_up.lower() + # allow templating the file name with the output ftype, useful with the "auto" ftype + self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) + self.gguf_writer = gguf.GGUFWriterSplit(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], split_arguments, + endianess=self.endianess, use_temp_file=self.use_temp_file) + + @classmethod + def __init_subclass__(cls): + # can't use an abstract property, because overriding it without type errors + # would require using decorated functions instead of simply defining the property + if "model_arch" not in cls.__dict__: + raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") + + def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: + key = next((k for k in keys if k in self.hparams), None) + if key is not None: + return self.hparams[key] + if optional: + return None + raise KeyError(f"could not find any of: {keys}") + + def set_vocab(self): + self._set_vocab_gpt2() + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + tensor_names_from_parts: set[str] = set() + + if len(self.part_names) > 1: + self.tensor_names = set() + index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin" + index_name += ".index.json" + logger.info(f"gguf: loading model weight map from '{index_name}'") + with open(self.dir_model / index_name, "r", encoding="utf-8") as f: + index: dict[str, Any] = json.load(f) + weight_map = index.get("weight_map") + if weight_map is None or not isinstance(weight_map, dict): + raise ValueError(f"Can't load 'weight_map' from {index_name!r}") + self.tensor_names.update(weight_map.keys()) + else: + self.tensor_names = tensor_names_from_parts + + for part_name in self.part_names: + logger.info(f"gguf: loading model part '{part_name}'") + ctx: ContextManager[Any] + if self.is_safetensors: + from safetensors import safe_open + 
ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) + else: + ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True)) + + with ctx as model_part: + tensor_names_from_parts.update(model_part.keys()) + + for name in model_part.keys(): + data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] + if self.lazy: + data = LazyTorchTensor.from_eager(data) + yield name, data + + # only verify tensor name presence; it doesn't matter if they are not in the right files + if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: + raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}") + + def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str: + if key not in gguf.MODEL_TENSORS[self.model_arch]: + raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}") + name: str = gguf.TENSOR_NAMES[key] + if "{bid}" in name: + assert bid is not None + name = name.format(bid=bid) + return name + suffix + + def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool: + if key not in gguf.MODEL_TENSORS[self.model_arch]: + return False + key_name: str = gguf.TENSOR_NAMES[key] + if "{bid}" in key_name: + if bid is None: + return False + key_name = key_name.format(bid=bid) + else: + if bid is not None: + return False + return name == (key_name + suffix) + + def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: + new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes) + if new_name is None: + raise ValueError(f"Can not map tensor {name!r}") + return new_name + + def set_gguf_parameters(self): + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_block_count(self.block_count) + + if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: + self.gguf_writer.add_context_length(n_ctx) + logger.info(f"gguf: context length = {n_ctx}") + + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + self.gguf_writer.add_embedding_length(n_embd) + logger.info(f"gguf: embedding length = {n_embd}") + + if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: + self.gguf_writer.add_feed_forward_length(n_ff) + logger.info(f"gguf: feed forward length = {n_ff}") + + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + self.gguf_writer.add_head_count(n_head) + logger.info(f"gguf: head count = {n_head}") + + if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: + self.gguf_writer.add_head_count_kv(n_head_kv) + logger.info(f"gguf: key-value head count = {n_head_kv}") + + if (rope_theta := self.hparams.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) + logger.info(f"gguf: rope theta = {rope_theta}") + if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: + self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) + logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") + if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: + self.gguf_writer.add_layer_norm_eps(f_norm_eps) + logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") + if (n_experts := self.hparams.get("num_local_experts")) is not None: + 
self.gguf_writer.add_expert_count(n_experts) + logger.info(f"gguf: expert count = {n_experts}") + if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + logger.info(f"gguf: experts used count = {n_experts_used}") + + self.gguf_writer.add_file_type(self.ftype) + logger.info(f"gguf: file type = {self.ftype}") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + return [(self.map_tensor_name(name), data_torch)] + + def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + del name, new_name, bid, n_dims # unused + + return False + + def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + del name, new_name, bid, n_dims # unused + + return False + + def write_tensors(self): + max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") + + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + # use the first number-like part of the tensor name as the block id + bid = None + for part in name.split("."): + if part.isdecimal(): + bid = int(part) + break + + for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)): + data: np.ndarray = data # type hint + n_dims = len(data.shape) + data_dtype = data.dtype + data_qtype: gguf.GGMLQuantizationType | None = None + + # when both are True, f32 should win + extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims) + extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims) + + # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors + # Conditions should closely match those in llama_model_quantize_internal in llama.cpp + extra_f32 = any(cond for cond in ( + extra_f32, + n_dims == 1, + new_name.endswith("_norm.weight"), + )) + + # Some tensor types are always in float32 + extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in ( + gguf.MODEL_TENSOR.FFN_GATE_INP, + gguf.MODEL_TENSOR.POS_EMBD, + gguf.MODEL_TENSOR.TOKEN_TYPES, + )) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + extra_f16 = any(cond for cond in ( + extra_f16, + (name.endswith(".weight") and n_dims >= 2), + )) + + if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32: + if self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + data = gguf.quantize_bf16(data) + assert data.dtype == np.int16 + data_qtype = gguf.GGMLQuantizationType.BF16 + + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data): + data = gguf.quantize_q8_0(data) + assert data.dtype == np.uint8 + data_qtype = gguf.GGMLQuantizationType.Q8_0 + + else: # default to float16 for quantized tensors + if data_dtype != np.float16: + data = data.astype(np.float16) + data_qtype = gguf.GGMLQuantizationType.F16 + + if data_qtype is None: # by default, convert to float32 + if data_dtype != np.float32: + data = data.astype(np.float32) + data_qtype = gguf.GGMLQuantizationType.F32 + + shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape + + # 
reverse shape to make it similar to the internal ggml dimension order + shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" + + # n_dims is implicit in the shape + logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") + + self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) + + def write(self): + self.write_tensors() + self.gguf_writer.init_shards() + self.gguf_writer.write_header_to_file() + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_tensors_to_file(progress=True) + self.gguf_writer.close() + + def write_vocab(self): + if self.gguf_writer.split_arguments.split: + raise ValueError('Splitting the vocabulary is not supported') + self.gguf_writer.init_shards() + self.gguf_writer.write_header_to_file() + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.close() + + @staticmethod + def get_model_part_names(dir_model: Path, suffix: str) -> list[str]: + part_names: list[str] = [] + for filename in os.listdir(dir_model): + if filename.endswith(suffix): + part_names.append(filename) + + part_names.sort() + + return part_names + + @staticmethod + def load_hparams(dir_model: Path): + with open(dir_model / "config.json", "r", encoding="utf-8") as f: + return json.load(f) + + @classmethod + def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: + assert names + + def func(modelcls: AnyModel) -> AnyModel: + for name in names: + cls._model_classes[name] = modelcls + return modelcls + return func + + @classmethod + def from_model_architecture(cls, arch: str) -> type[Model]: + try: + return cls._model_classes[arch] + except KeyError: + raise NotImplementedError(f'Architecture {arch!r} not supported!') from None + + # used for GPT-2 BPE and WordPiece vocabs + def get_vocab_base(self) -> tuple[list[str], list[int], str]: + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) + assert max(tokenizer.vocab.values()) < vocab_size + + tokpre = self.get_vocab_base_pre(tokenizer) + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} + added_vocab = tokenizer.get_added_vocab() + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + return tokens, toktypes, tokpre + + # NOTE: this function is generated by convert-hf-to-gguf-update.py + # do not modify it manually! 
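+    # (it fingerprints the pre-tokenizer by encoding a fixed check string and hashing
+    #  the resulting token ids; the sha256 digest is matched against the chkhsh values below)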
+ # ref: https://github.com/ggerganov/llama.cpp/pull/6920 + # Marker: Start get_vocab_base_pre + def get_vocab_base_pre(self, tokenizer) -> str: + # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that + # is specific for the BPE pre-tokenizer used by the model + # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can + # use in llama.cpp to implement the same pre-tokenizer + + chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' + + chktok = tokenizer.encode(chktxt) + chkhsh = sha256(str(chktok).encode()).hexdigest() + + logger.debug(f"chktok: {chktok}") + logger.debug(f"chkhsh: {chkhsh}") + + res = None + + # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script + # or pull the latest version of the model from Huggingface + # don't edit the hashes manually! + if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": + # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B + res = "llama-bpe" + if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": + # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base + res = "deepseek-llm" + if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821": + # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base + res = "deepseek-coder" + if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": + # ref: https://huggingface.co/tiiuae/falcon-7b + res = "falcon" + if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": + # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 + res = "bert-bge" + if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": + # ref: https://huggingface.co/mosaicml/mpt-7b + res = "mpt" + if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34": + # ref: https://huggingface.co/bigcode/starcoder2-3b + res = "starcoder" + if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": + # ref: https://huggingface.co/openai-community/gpt2 + res = "gpt-2" + if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3": + # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b + res = "stablelm2" + if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": + # ref: https://huggingface.co/smallcloudai/Refact-1_6-base + res = "refact" + if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8": + # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01 + res = "command-r" + if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea": + # ref: https://huggingface.co/Qwen/Qwen1.5-7B + res = "qwen2" + if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": + # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf + res = "olmo" + if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e": + # ref: https://huggingface.co/databricks/dbrx-base + res = "dbrx" + if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": + # ref: 
https://huggingface.co/jinaai/jina-embeddings-v2-base-en + res = "jina-v2-en" + if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643": + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es + res = "jina-v2-es" + if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6": + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de + res = "jina-v2-de" + if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": + # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct + res = "smaug-bpe" + + if res is None: + logger.warning("\n") + logger.warning("**************************************************************************************") + logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") + logger.warning("** There are 2 possible reasons for this:") + logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet") + logger.warning("** - the pre-tokenization config has changed upstream") + logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") + logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") + logger.warning("**") + logger.warning(f"** chkhsh: {chkhsh}") + logger.warning("**************************************************************************************") + logger.warning("\n") + raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") + + logger.debug(f"tokenizer.ggml.pre: {repr(res)}") + logger.debug(f"chkhsh: {chkhsh}") + + return res + # Marker: End get_vocab_base_pre + + def _set_vocab_gpt2(self) -> None: + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_qwen(self): + dir_model = self.dir_model + hparams = self.hparams + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + vocab_size = hparams["vocab_size"] + assert max(tokenizer.get_vocab().values()) < vocab_size + + tokpre = self.get_vocab_base_pre(tokenizer) + + merges = [] + vocab = {} + mergeable_ranks = tokenizer.mergeable_ranks + for token, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + assert len(merged) == 2 + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + + # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined + added_vocab = tokenizer.special_tokens + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()} + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.CONTROL) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + 
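+        # the token list and token types are parallel arrays indexed by token id,
+        # so both must cover exactly vocab_size entries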
self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) + special_vocab.merges = merges + # only add special tokens when they were not already loaded from config.json + if len(special_vocab.special_token_ids) == 0: + special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) + special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) + # this one is usually not in config.json anyway + special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_sentencepiece(self): + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + tokens: list[bytes] = [] + scores: list[float] = [] + toktypes: list[int] = [] + + if not tokenizer_path.is_file(): + raise FileNotFoundError(f"File not found: {tokenizer_path}") + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + for key in added_tokens_json: + token_id = added_tokens_json[key] + if (token_id >= vocab_size): + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.UNUSED) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_llama_hf(self): + vocab = gguf.LlamaHfVocab(self.dir_model) + tokens = [] + scores = [] + toktypes = [] + + for text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + assert len(tokens) == vocab.vocab_size + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + 
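+        # LlamaHfVocab yields (text, score, toktype) triples, which are written out
+        # as three parallel lists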
self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + +@Model.register("GPTNeoXForCausalLM") +class GPTNeoXModel(Model): + model_arch = gguf.MODEL_ARCH.GPTNEOX + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count( + int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), + ) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + + tensors: list[tuple[str, Tensor]] = [] + + if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name): + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa + qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) + data_torch = torch.cat( + ( + qkv_weights[:, 0, :, :].reshape((-1, n_embed)), + qkv_weights[:, 1, :, :].reshape((-1, n_embed)), + qkv_weights[:, 2, :, :].reshape((-1, n_embed)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.weight") + elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name): + qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) + data_torch = torch.cat( + ( + qkv_bias[:, 0, :].reshape((n_embed,)), + qkv_bias[:, 1, :].reshape((n_embed,)), + qkv_bias[:, 2, :].reshape((n_embed,)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.bias") + + tensors.append((self.map_tensor_name(name), data_torch)) + + return tensors + + +@Model.register("BloomForCausalLM") +class BloomModel(Model): + model_arch = gguf.MODEL_ARCH.BLOOM + + def set_gguf_parameters(self): + self.gguf_writer.add_name("Bloom") + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) + self.gguf_writer.add_embedding_length(n_embed) + self.gguf_writer.add_feed_forward_length(4 * n_embed) + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + n_head = 
self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + + name = re.sub(r'transformer\.', '', name) + + tensors: list[tuple[str, Tensor]] = [] + + if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa + qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) + data_torch = torch.cat( + ( + qkv_weights[:, 0, :, :].reshape((-1, n_embed)), + qkv_weights[:, 1, :, :].reshape((-1, n_embed)), + qkv_weights[:, 2, :, :].reshape((-1, n_embed)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.weight") + elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): + qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) + data_torch = torch.cat( + ( + qkv_bias[:, 0, :].reshape((n_embed,)), + qkv_bias[:, 1, :].reshape((n_embed,)), + qkv_bias[:, 2, :].reshape((n_embed,)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.bias") + + tensors.append((self.map_tensor_name(name), data_torch)) + + if name == "word_embeddings.weight": + assert self.tensor_names is not None + + # TODO: tie them at runtime, don't duplicate in the model file + if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")): + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) + + return tensors + + +@Model.register("MPTForCausalLM") +class MPTModel(Model): + model_arch = gguf.MODEL_ARCH.MPT + + def set_vocab(self): + try: + self._set_vocab_gpt2() + except Exception: + # Fallback for SEA-LION model + self._set_vocab_sentencepiece() + self.gguf_writer.add_add_bos_token(False) + self.gguf_writer.add_pad_token_id(3) + self.gguf_writer.add_eos_token_id(1) + self.gguf_writer.add_unk_token_id(0) + + def set_gguf_parameters(self): + block_count = self.hparams["n_layers"] + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"]) + self.gguf_writer.add_head_count(self.hparams["n_heads"]) + if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"): + self.gguf_writer.add_head_count_kv(kv_n_heads) + self.gguf_writer.add_layer_norm_eps(1e-5) + if self.hparams["attn_config"]["clip_qkv"] is not None: + self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) + if self.hparams["attn_config"]["alibi"]: + self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) + else: + self.gguf_writer.add_max_alibi_bias(0.0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if "scales" in name: + new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias", ".scales")) + new_name = new_name.replace("scales", "act.scales") + else: + new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias")) + + return [(new_name, data_torch)] + + +@Model.register("OrionForCausalLM") +class OrionModel(Model): + model_arch = gguf.MODEL_ARCH.ORION + + def set_vocab(self): + 
self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + hf_repo = self.hparams.get("_name_or_path", "") + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = self.hparams["model_max_length"] + else: + raise ValueError("gguf: can not find ctx length parameter.") + + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_source_hf_repo(hf_repo) + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + # note: config provides rms norm but it is actually layer norm + # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 + self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) + + +@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM") +class BaichuanModel(Model): + model_arch = gguf.MODEL_ARCH.BAICHUAN + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + hf_repo = self.hparams.get("_name_or_path", "") + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = self.hparams["model_max_length"] + else: + raise ValueError("gguf: can not find ctx length parameter.") + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_source_hf_repo(hf_repo) + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_file_type(self.ftype) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + head_count = self.hparams["num_attention_heads"] + head_count_kv = 
self.hparams.get("num_key_value_heads", head_count) + + tensors: list[tuple[str, Tensor]] = [] + + if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight": + logger.info(f"Unpacking and permuting layer {bid}") + tensors = [ + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), + self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), + self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), + self._reverse_hf_part(data_torch, 2)), + ] + else: + tensors = [(self.map_tensor_name(name), data_torch)] + + return tensors + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def _reverse_hf_permute_part( + self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, + ) -> Tensor: + r = weights.shape[0] // 3 + return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) + + def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: + r = weights.shape[0] // 3 + return weights[r * n_part:r * n_part + r, ...] + + +@Model.register("XverseForCausalLM") +class XverseModel(Model): + model_arch = gguf.MODEL_ARCH.XVERSE + + def set_vocab(self): + assert (self.dir_model / "tokenizer.json").is_file() + dir_model = self.dir_model + hparams = self.hparams + + tokens: list[bytes] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model) + vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) + assert max(tokenizer.vocab.values()) < vocab_size + + reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} + added_vocab = tokenizer.get_added_vocab() + + for token_id in range(vocab_size): + token_text = reverse_vocab[token_id].encode('utf-8') + # replace "\x00" to string with length > 0 + if token_text == b"\x00": + toktype = gguf.TokenType.BYTE # special + token_text = f"<{token_text}>".encode('utf-8') + elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): + toktype = gguf.TokenType.BYTE # special + elif reverse_vocab[token_id] in added_vocab: + if tokenizer.added_tokens_decoder[token_id].special: + toktype = gguf.TokenType.CONTROL + else: + toktype = gguf.TokenType.USER_DEFINED + else: + toktype = gguf.TokenType.NORMAL + + tokens.append(token_text) + toktypes.append(toktype) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + hf_repo = self.hparams.get("_name_or_path", "") + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = 
self.hparams["model_max_length"] + else: + raise ValueError("gguf: can not find ctx length parameter.") + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_source_hf_repo(hf_repo) + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_file_type(self.ftype) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + + # HF models permute some of the tensors, so we need to undo that + if name.endswith("q_proj.weight"): + data_torch = self._reverse_hf_permute(data_torch, head_count, head_count) + if name.endswith("k_proj.weight"): + data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv) + + return [(self.map_tensor_name(name), data_torch)] + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + +@Model.register("FalconForCausalLM", "RWForCausalLM") +class FalconModel(Model): + model_arch = gguf.MODEL_ARCH.FALCON + + def set_gguf_parameters(self): + block_count = self.hparams.get("num_hidden_layers") + if block_count is None: + block_count = self.hparams["n_layer"] # old name + + n_head = self.hparams.get("num_attention_heads") + if n_head is None: + n_head = self.hparams["n_head"] # old name + + n_head_kv = self.hparams.get("num_kv_heads") + if n_head_kv is None: + n_head_kv = self.hparams.get("n_head_kv", 1) # old name + + self.gguf_writer.add_name("Falcon") + self.gguf_writer.add_context_length(2048) # not in config.json + self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head_kv) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # QKV tensor transform + # The original query_key_value tensor contains n_head_kv "kv groups", + # each consisting of n_head/n_head_kv query weights followed by one key + # and one value weight (shared by all query heads 
in the kv group). + # This layout makes it a big pain to work with in GGML. + # So we rearrange them here,, so that we have n_head query weights + # followed by n_head_kv key weights followed by n_head_kv value weights, + # in contiguous fashion. + # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py + + if "query_key_value" in name: + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + n_head_kv = self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1 + head_dim = self.hparams["hidden_size"] // n_head + + qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) + q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) + k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) + v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) + data_torch = torch.cat((q, k, v)).reshape_as(data_torch) + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("GPTBigCodeForCausalLM") +class StarCoderModel(Model): + model_arch = gguf.MODEL_ARCH.STARCODER + + def set_gguf_parameters(self): + block_count = self.hparams["n_layer"] + + self.gguf_writer.add_name("StarCoder") + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(1) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + +@Model.register("GPTRefactForCausalLM") +class RefactModel(Model): + model_arch = gguf.MODEL_ARCH.REFACT + + def set_vocab(self): + super().set_vocab() + + # TODO: how to determine special FIM tokens automatically? + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot']) + special_vocab._set_special_token("prefix", 1) + special_vocab._set_special_token("suffix", 3) + special_vocab._set_special_token("middle", 2) + special_vocab._set_special_token("fsep", 4) # is this correct? + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + hidden_dim = self.hparams["n_embd"] + inner_dim = 4 * hidden_dim + hidden_dim = int(2 * inner_dim / 3) + multiple_of = 256 + ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + block_count = self.hparams["n_layer"] + + self.gguf_writer.add_name("Refact") + # refact uses Alibi. So this is from config.json which might be used by training. 
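+        # (ALiBi adds its positional bias at attention time, so no learned position
+        #  embedding is tied to this value)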
+ self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + + self.gguf_writer.add_feed_forward_length(ff_dim) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(1) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + hidden_dim = self.hparams["n_embd"] + inner_dim = 4 * hidden_dim + hidden_dim = int(2 * inner_dim / 3) + multiple_of = 256 + ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + n_head = self.hparams["n_head"] + n_head_kv = 1 + head_dim = self.hparams["n_embd"] // n_head + + tensors: list[tuple[str, Tensor]] = [] + + if bid is not None: + if name == f"transformer.h.{bid}.attn.kv.weight": + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), data_torch[:n_head_kv * head_dim])) + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), data_torch[n_head_kv * head_dim:])) + elif name == f"transformer.h.{bid}.attn.q.weight": + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch)) + elif name == f"transformer.h.{bid}.mlp.gate_up_proj.weight": + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])) + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])) + + if len(tensors) == 0: + tensors.append((self.map_tensor_name(name), data_torch)) + + return tensors + + +@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") +class StableLMModel(Model): + model_arch = gguf.MODEL_ARCH.STABLELM + + def set_vocab(self): + if (self.dir_model / "tokenizer.json").is_file(): + self._set_vocab_gpt2() + else: + # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab + self._set_vocab_qwen() + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) + self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) + self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) + self.gguf_writer.add_file_type(self.ftype) + + _q_norms: list[dict[str, Tensor]] | None = None + _k_norms: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams["num_key_value_heads"] + + if name.find("q_layernorm.norms") != -1: + assert bid is not None + + if self._q_norms is None: + self._q_norms = [{} for _ in range(self.block_count)] + + 
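+            # accumulate the per-head q_layernorm weights for this block; once all
+            # n_head of them have been collected they are stacked into one tensor below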
self._q_norms[bid][name] = data_torch + + if len(self._q_norms[bid]) >= n_head: + return self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm") + else: + return [] + + if name.find("k_layernorm.norms") != -1: + assert bid is not None + + if self._k_norms is None: + self._k_norms = [{} for _ in range(self.block_count)] + + self._k_norms[bid][name] = data_torch + + if len(self._k_norms[bid]) >= n_kv_head: + return self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm") + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm"): + datas: list[Tensor] = [] + # extract the norms in order + for xid in range(n_head): + ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight" + datas.append(norms[ename]) + del norms[ename] + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" + new_name = self.map_tensor_name(merged_name) + + return [(new_name, data_torch)] + + def write_tensors(self): + super().write_tensors() + + if self._q_norms is not None or self._k_norms is not None: + # flatten two `list[dict[str, Tensor]]` into a single `list[str]` + norms = ( + [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None else [] + ) + ( + [k for d in self._k_norms for k in d.keys()] if self._k_norms is not None else [] + ) + if len(norms) > 0: + raise ValueError(f"Unprocessed norms: {norms}") + + +@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") +class LlamaModel(Model): + model_arch = gguf.MODEL_ARCH.LLAMA + + def set_vocab(self): + try: + self. _set_vocab_sentencepiece() + except FileNotFoundError: + try: + self._set_vocab_llama_hf() + except (FileNotFoundError, TypeError): + # Llama 3 + self._set_vocab_gpt2() + + # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) + if self.hparams.get("vocab_size", 32000) == 32016: + special_vocab = gguf.SpecialVocab( + self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'eot'] + ) + special_vocab._set_special_token("prefix", 32007) + special_vocab._set_special_token("suffix", 32008) + special_vocab._set_special_token("middle", 32009) + special_vocab._set_special_token("eot", 32010) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if "add_prefix_space" in tokenizer_config_json: + self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) + + # Apply to granite small models only + if self.hparams.get("vocab_size", 32000) == 49152: + self.gguf_writer.add_add_bos_token(False) + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if 
n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.hparams["num_local_experts"] + + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for wid in ["w1", "w2", "w3"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def write_tensors(self): + super().write_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@Model.register("GrokForCausalLM") +class GrokModel(Model): + model_arch = gguf.MODEL_ARCH.GROK + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_name("Grok") + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find(".moe.") != -1: + n_experts = self.hparams["num_local_experts"] + + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for wid in ["linear", "linear_1", "linear_v"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("DbrxForCausalLM") +class DbrxModel(Model): + model_arch = gguf.MODEL_ARCH.DBRX + + def set_gguf_parameters(self): + ffn_config = 
self.hparams["ffn_config"] + attn_config = self.hparams["attn_config"] + self.gguf_writer.add_name(self.hparams["model_type"]) + self.gguf_writer.add_block_count(self.hparams["n_layers"]) + + self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"]) + + self.gguf_writer.add_head_count(self.hparams["n_heads"]) + self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"]) + + self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"]) + + self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"]) + self.gguf_writer.add_file_type(self.ftype) + + self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"]) + self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"]) + + self.gguf_writer.add_layer_norm_eps(1e-5) + + self.gguf_writer.add_file_type(self.ftype) + logger.info(f"gguf: file type = {self.ftype}") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + n_expert = self.hparams["ffn_config"]["moe_num_experts"] + n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] + n_embd = self.hparams["d_model"] + + # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose + # original implementation expects (n_expert, n_ff, n_embd) for all experts weights + # But llama.cpp moe graph works differently + # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions + # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor + exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} + "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + experts = False + + for exp_tensor_name in exp_tensor_names.keys(): + if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: + experts = True + data_torch = data_torch.view(n_expert, n_ff, n_embd) + if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None: + data_torch = data_torch.permute(*permute_tensor) + break + + # map tensor names + # In MoE models the ffn tensors are typically most of the model weights, + # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight. 
+ # Every other model has the weight names ending in .weight, + # let's assume that is the convention which is not the case for dbrx: + # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 + new_name = self.map_tensor_name(name if not experts else name + ".weight", try_suffixes=(".weight",)) + + return [(new_name, data_torch)] + + def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + del name, new_name, bid # unused + + return n_dims > 1 + + +@Model.register("MiniCPMForCausalLM") +class MiniCPMModel(Model): + model_arch = gguf.MODEL_ARCH.MINICPM + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + self.gguf_writer.add_name("MiniCPM") + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_file_type(self.ftype) + + def set_vocab(self): + self._set_vocab_llama_hf() + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + # HF models permute some of the tensors, so we need to undo that + if name.endswith(("q_proj.weight")): + data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight")): + data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head) + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("QWenLMHeadModel") +class QwenModel(Model): + model_arch = gguf.MODEL_ARCH.QWEN + + @staticmethod + def token_bytes_to_string(b): + from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode + byte_encoder = bytes_to_unicode() + return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) + + @staticmethod + def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: + parts = [bytes([b]) for b in token] + while True: + min_idx = None + min_rank = None + for i, pair in enumerate(zip(parts[:-1], parts[1:])): + rank = mergeable_ranks.get(pair[0] + pair[1]) + if rank is not None and (min_rank is None or rank < min_rank): + min_idx = i + min_rank = rank + if min_rank is None or (max_rank is not None and min_rank >= max_rank): + break + assert min_idx is not None + parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] + return parts + + def set_vocab(self): + self._set_vocab_qwen() + + def set_gguf_parameters(self): + self.gguf_writer.add_name("Qwen") + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + 
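+        # Qwen names the RoPE base "rotary_emb_base" and the RMS norm eps
+        # "layer_norm_epsilon"; both map onto the standard GGUF keys below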
self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + +@Model.register("Qwen2ForCausalLM") +class Qwen2Model(Model): + model_arch = gguf.MODEL_ARCH.QWEN2 + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + +@Model.register("Qwen2MoeForCausalLM") +class Qwen2MoeModel(Model): + model_arch = gguf.MODEL_ARCH.QWEN2MOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (n_experts := self.hparams.get("num_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("experts") != -1: + n_experts = self.hparams["num_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def write_tensors(self): + super().write_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@Model.register("GPT2LMHeadModel") +class GPT2Model(Model): + model_arch = gguf.MODEL_ARCH.GPT2 + + def set_gguf_parameters(self): + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_context_length(self.hparams["n_ctx"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + tensors: list[tuple[str, Tensor]] = [] + + # we don't need these + if name.endswith((".attn.bias", ".attn.masked_bias")): + return tensors + + if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): + data_torch = data_torch.transpose(1, 0) + + new_name = 
self.map_tensor_name(name) + + tensors.append((new_name, data_torch)) + + # note: GPT2 output is tied to (same as) wte in original model + if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) + + return tensors + + +@Model.register("PhiForCausalLM") +class Phi2Model(Model): + model_arch = gguf.MODEL_ARCH.PHI2 + + def set_gguf_parameters(self): + block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) + + rot_pct = self.find_hparam(["partial_rotary_factor"]) + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + + self.gguf_writer.add_name("Phi2") + self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) + + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(4 * n_embd) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"])) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_add_bos_token(False) + + +@Model.register("Phi3ForCausalLM") +class Phi3MiniModel(Model): + model_arch = gguf.MODEL_ARCH.PHI3 + + def set_vocab(self): + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + if not tokenizer_path.is_file(): + raise ValueError(f'Error: Missing {tokenizer_path}') + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + token_id = added_tokens_json[key] + if (token_id >= vocab_size): + logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) + for token_id, foken_data in added_tokens_decoder.items(): + token_id = int(token_id) + token = 
foken_data["content"].encode("utf-8") + if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN: + assert tokens[token_id] == token + tokens[token_id] = token + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + if foken_data.get("special"): + toktypes[token_id] = SentencePieceTokenTypes.CONTROL + + tokenizer_file = self.dir_model / 'tokenizer.json' + if tokenizer_file.is_file(): + with open(tokenizer_file, "r", encoding="utf-8") as f: + tokenizer_json = json.load(f) + added_tokens = tokenizer_json.get("added_tokens", []) + for foken_data in added_tokens: + token_id = int(foken_data["id"]) + token = foken_data["content"].encode("utf-8") + if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN: + assert tokens[token_id] == token + tokens[token_id] = token + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + if foken_data.get("special"): + toktypes[token_id] = SentencePieceTokenTypes.CONTROL + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) + + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"]) + rms_eps = self.find_hparam(["rms_norm_eps"]) + max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) + orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) + rope_dims = n_embd // n_head + + self.gguf_writer.add_name("Phi3") + self.gguf_writer.add_context_length(max_pos_embds) + self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds) + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"])) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head_kv) + self.gguf_writer.add_layer_norm_rms_eps(rms_eps) + self.gguf_writer.add_rope_dimension_count(rope_dims) + self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) + self.gguf_writer.add_file_type(self.ftype) + + # write rope scaling for long context (128k) model + rope_scaling = self.find_hparam(['rope_scaling'], True) + if (rope_scaling is None): + return + + scale = max_pos_embds / orig_max_pos_embds + + rope_scaling_type = rope_scaling.get('type', '').lower() + if len(rope_scaling_type) == 0: + raise KeyError('Missing the required key rope_scaling.type') + + if rope_scaling_type == 'su': + attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0 + elif rope_scaling_type == 'yarn': + attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0 + else: + raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet') + + self.gguf_writer.add_rope_scaling_attn_factors(attn_factor) + + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) + + if long_factors is None or short_factors is None: + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + + if 
len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + + self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32)) + self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32)) + + +@Model.register("PlamoForCausalLM") +class PlamoModel(Model): + model_arch = gguf.MODEL_ARCH.PLAMO + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_name("PLaMo") + self.gguf_writer.add_context_length(4096) # not in config.json + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong + self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) + self.gguf_writer.add_file_type(self.ftype) + + def shuffle_attn_q_weight(self, data_torch): + assert data_torch.size() == (5120, 5120) + data_torch = data_torch.reshape(8, 5, 128, 5120) + data_torch = torch.permute(data_torch, (1, 0, 2, 3)) + data_torch = torch.reshape(data_torch, (5120, 5120)) + return data_torch + + def shuffle_attn_output_weight(self, data_torch): + assert data_torch.size() == (5120, 5120) + data_torch = data_torch.reshape(5120, 8, 5, 128) + data_torch = torch.permute(data_torch, (0, 2, 1, 3)) + data_torch = torch.reshape(data_torch, (5120, 5120)) + return data_torch + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + new_name = self.map_tensor_name(name) + + # shuffle for broadcasting of gqa in ggml_mul_mat + if new_name.endswith("attn_q.weight"): + data_torch = self.shuffle_attn_q_weight(data_torch) + elif new_name.endswith("attn_output.weight"): + data_torch = self.shuffle_attn_output_weight(data_torch) + + return [(new_name, data_torch)] + + +@Model.register("CodeShellForCausalLM") +class CodeShellModel(Model): + model_arch = gguf.MODEL_ARCH.CODESHELL + + def set_gguf_parameters(self): + block_count = self.hparams["n_layer"] + + self.gguf_writer.add_name("CodeShell") + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_rope_freq_base(10000.0) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(1.0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + new_name = self.map_tensor_name(name) + + tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)] + + if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): + assert self.tensor_names is not None + + if all(s not in 
self.tensor_names for s in ("lm_head.weight", "output.weight")): + # copy tok_embd.weight to output.weight + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) + + return tensors + + +@Model.register("InternLM2ForCausalLM") +class InternLM2Model(Model): + model_arch = gguf.MODEL_ARCH.INTERNLM2 + + def set_vocab(self): + # (TODO): Is there a better way? + # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character + # \x00 specially and convert it into an emoji character to prevent it from being mistakenly + # recognized as an empty string in C++. + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'tokenizer.model' + + tokens: list[bytes] = [] + scores: list[float] = [] + toktypes: list[int] = [] + + if not tokenizer_path.is_file(): + logger.error(f'Error: Missing {tokenizer_path}') + sys.exit(1) + + sentencepiece_model = model.ModelProto() + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + for token_id in range(vocab_size): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + if text == b"\x00": + # (TODO): fixme + # Hack here and replace the \x00 characters. + logger.warning(f"InternLM2 convert token '{text}' to '🐉'!") + text = "🐉".encode("utf-8") + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + tokens.append(key.encode("utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + old_eos = special_vocab.special_token_ids["eos"] + if "chat" in os.path.basename(self.dir_model.absolute()): + # For the chat model, we replace the eos with '<|im_end|>'. 
+            # TODO: this is a hack, should be fixed
+            #       https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
+            special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
+            logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
+in chat mode so that the conversation can end normally.")
+
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def _try_get_sft_eos(self, tokenizer):
+        unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
+        im_end_list = tokenizer.Encode('<|im_end|>')
+        eos_token = None
+        assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
+        if len(unused_145_list) == 1:
+            eos_token = unused_145_list[0]
+        if len(im_end_list) == 1:
+            eos_token = im_end_list[0]
+        assert eos_token
+        return eos_token
+
+    def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name("InternLM2")
+        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
+        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        num_heads = self.hparams["num_attention_heads"]
+        num_kv_heads = self.hparams["num_key_value_heads"]
+        hidden_size = self.hparams["hidden_size"]
+        q_per_kv = num_heads // num_kv_heads
+        head_dim = hidden_size // num_heads
+        num_groups = num_heads // q_per_kv
+
+        qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"
+
+        if re.match(qkv_pattern, name):
+            bid = re.findall(qkv_pattern, name)[0]
+            qkv = data_torch
+            # qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
+            qkv = qkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim))
+            q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
+            # The model weights of q and k require additional reshape. 
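+            # Layout of the reshaped qkv tensor: (hidden_size, num_groups, q_per_kv + 2, head_dim),
+            # where each of the num_groups (== num_kv_heads) groups packs q_per_kv query heads
+            # followed by one key head and one value head along dim 2, hence the slicing above.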
+ # q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads) + q = self._hf_permute_qk(q.reshape((q.shape[0], -1)).T, num_heads, num_heads) + # k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads) + k = self._hf_permute_qk(k.reshape((k.shape[0], -1)).T, num_heads, num_kv_heads) + # v = rearrange(v, " o g n i -> o (g n i)").T + v = v.reshape((v.shape[0], -1)).T + return [ + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v), + ] + else: + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("BertModel", "CamembertModel") +class BertModel(Model): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.vocab_size = None + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_causal_attention(False) + + # get pooling path + pooling_path = None + module_path = self.dir_model / "modules.json" + if module_path.is_file(): + with open(module_path, encoding="utf-8") as f: + modules = json.load(f) + for mod in modules: + if mod["type"] == "sentence_transformers.models.Pooling": + pooling_path = mod["path"] + break + + # get pooling type + if pooling_path is not None: + with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f: + pooling = json.load(f) + if pooling["pooling_mode_mean_tokens"]: + pooling_type = gguf.PoolingType.MEAN + elif pooling["pooling_mode_cls_token"]: + pooling_type = gguf.PoolingType.CLS + else: + raise NotImplementedError("Only MEAN and CLS pooling types supported") + self.gguf_writer.add_pooling_type(pooling_type) + + def set_vocab(self): + tokens, toktypes, tokpre = self.get_vocab_base() + self.vocab_size = len(tokens) + + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B" + + # convert to phantom space vocab + def phantom(tok): + if tok.startswith("[") and tok.endswith("]"): + return tok + if tok.startswith("##"): + return tok[2:] + return "\u2581" + tok + tokens = list(map(phantom, tokens)) + + # add vocab to gguf + self.gguf_writer.add_tokenizer_model("bert") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + # handle special tokens + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # we are only using BERT for embeddings so we don't need the pooling layer + if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): + return [] # we don't need these + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("NomicBertModel") +class NomicBertModel(BertModel): + model_arch = gguf.MODEL_ARCH.NOMIC_BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # the HF config claims n_ctx=8192, but it uses RoPE scaling + self.hparams["n_ctx"] = 2048 + + # SwigLU activation + assert self.hparams["activation_function"] == "swiglu" + # this doesn't do anything in the HF version + assert self.hparams["causal"] is False + # no bias tensors + 
assert self.hparams["qkv_proj_bias"] is False + assert self.hparams["mlp_fc1_bias"] is False + assert self.hparams["mlp_fc2_bias"] is False + # norm at end of layer + assert self.hparams["prenorm"] is False + # standard RoPE + assert self.hparams["rotary_emb_fraction"] == 1.0 + assert self.hparams["rotary_emb_interleaved"] is False + assert self.hparams["rotary_emb_scale_base"] is None + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) + + +@Model.register("GemmaForCausalLM") +class GemmaModel(Model): + model_arch = gguf.MODEL_ARCH.GEMMA + + def set_vocab(self): + self._set_vocab_sentencepiece() + + # TODO: these special tokens should be exported only for the CodeGemma family + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot']) + special_vocab._set_special_token("prefix", 67) + special_vocab._set_special_token("suffix", 69) + special_vocab._set_special_token("middle", 68) + special_vocab._set_special_token("fsep", 70) + special_vocab._set_special_token("eot", 107) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(hparams["head_dim"]) + self.gguf_writer.add_value_length(hparams["head_dim"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # To prevent errors, skip loading lm_head.weight. 
+ if name == "lm_head.weight": + logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") + return [] + + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("Starcoder2ForCausalLM") +class StarCoder2Model(Model): + model_arch = gguf.MODEL_ARCH.STARCODER2 + + +@Model.register("MambaForCausalLM", "MambaLMHeadModel") +class MambaModel(Model): + model_arch = gguf.MODEL_ARCH.MAMBA + + def set_vocab(self): + vocab_size = self.hparams["vocab_size"] + # Round vocab size to next multiple of 8 + pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8) + # pad using ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + vocab_size = -(vocab_size // -pad_vocab) * pad_vocab + self.hparams["vocab_size"] = vocab_size + + if (self.dir_model / "tokenizer.json").is_file(): + self._set_vocab_gpt2() + elif (self.dir_model / "tokenizer.model").is_file(): + self._set_vocab_sentencepiece() + else: + # Use the GPT-NeoX tokenizer when no tokenizer files are present + tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf" + logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") + neox_reader = gguf.GGUFReader(tokenizer_path, "r") + + field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL) + self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2") + + field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE) + self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt") + + field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST) + assert field + self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) + + field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) + assert field + self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) + + field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES) + assert field + self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) + + field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID) + self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1) + + field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID) + self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0) + + field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID) + self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0) + + field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID) + self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0) + + def set_gguf_parameters(self): + d_model = self.find_hparam(["hidden_size", "d_model"]) + d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 + d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model + d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 + # ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 + dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) + rms_norm_eps = 
self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 + + # Fail early for models which don't have a block expansion factor of 2 + assert d_inner == 2 * d_model + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default + self.gguf_writer.add_embedding_length(d_model) + self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading + self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_ssm_conv_kernel(d_conv) + self.gguf_writer.add_ssm_inner_size(d_inner) + self.gguf_writer.add_ssm_state_size(d_state) + self.gguf_writer.add_ssm_time_step_rank(dt_rank) + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_file_type(self.ftype) + + _tok_embd = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) + tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) + + new_name = self.map_tensor_name(name) + + if name.endswith(".A_log"): + logger.debug("A_log --> A ==> " + new_name) + data_torch = -torch.exp(data_torch) + + # assuming token_embd.weight is seen before output.weight + if self._tok_embd is not None and new_name == output_name: + if torch.equal(self._tok_embd, data_torch): + logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting") + return [] + elif new_name == tok_embd_name: + self._tok_embd = data_torch + + return [(new_name, data_torch)] + + def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + del n_dims # unused + + return bid is not None and new_name in ( + self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [ + gguf.MODEL_TENSOR.SSM_CONV1D, + gguf.MODEL_TENSOR.SSM_X, + gguf.MODEL_TENSOR.SSM_DT, + gguf.MODEL_TENSOR.SSM_A, + gguf.MODEL_TENSOR.SSM_D, + ] + ) + + +@Model.register("CohereForCausalLM") +class CommandR2Model(Model): + model_arch = gguf.MODEL_ARCH.COMMAND_R + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # max_position_embeddings = 8192 in config.json but model was actually + # trained on 128k context length + # aya-23 models don't have model_max_length specified + self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"]) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + +@Model.register("OlmoForCausalLM") +@Model.register("OLMoForCausalLM") +class OlmoModel(Model): + model_arch = gguf.MODEL_ARCH.OLMO + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_layer_norm_eps(1e-5) + clip_qkv = self.hparams.get("clip_qkv") + if clip_qkv is not None: + self.gguf_writer.add_clamp_kqv(clip_qkv) + + # Same as super class, but permuting q_proj, k_proj + # Copied from: LlamaModel + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith("q_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, 
n_head) + if name.endswith("k_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("JinaBertModel", "JinaBertForMaskedLM") +class JinaBertV2Model(BertModel): + model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.intermediate_size = self.hparams["intermediate_size"] + + def get_tensors(self): + for name, data in super().get_tensors(): + if 'gated_layers' in name: + d1 = data[:self.intermediate_size, :] + name1 = name.replace('gated_layers', 'gated_layers_w') + d2 = data[self.intermediate_size:, :] + name2 = name.replace('gated_layers', 'gated_layers_v') + yield name1, d1 + yield name2, d2 + continue + + yield name, data + + def set_vocab(self, *args, **kwargs): + tokenizer_class = 'BertTokenizer' + with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: + tokenizer_class = json.load(f)['tokenizer_class'] + + if tokenizer_class == 'BertTokenizer': + super().set_vocab() + elif tokenizer_class == 'RobertaTokenizer': + self._set_vocab_gpt2() + self.gguf_writer.add_token_type_count(2) + else: + raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel') + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(True) + + +@Model.register("ArcticForCausalLM") +class ArcticModel(Model): + model_arch = gguf.MODEL_ARCH.ARCTIC + + def set_vocab(self): + # The reason for using a custom implementation here is that the + # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from + # tokenizer.model and used them as BOS and EOS instead of adding new tokens. + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + if not tokenizer_path.is_file(): + logger.error(f'Error: Missing {tokenizer_path}') + sys.exit(1) + + # Read the whole vocabulary from the tokenizer.model file + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + # Use the added_tokens_decoder field from tokeniser_config.json as the source + # of information about added/redefined tokens and modify them accordingly. 
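+        # Rough sketch of the relevant tokenizer_config.json fields (the token contents
+        # shown here are only hypothetical examples, not taken from the actual model):
+        #   "added_tokens_decoder": {
+        #       "31998": {"content": "<|some_bos|>", "special": true},
+        #       "31999": {"content": "<|some_eos|>", "special": true}
+        #   },
+        #   "unk_token": "<unk>"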
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + + if "added_tokens_decoder" in tokenizer_config_json: + added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"] + for token_id, token_json in added_tokens_decoder.items(): + token_id = int(token_id) + if (token_id >= vocab_size): + logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + token_content = token_json["content"] + token_type = SentencePieceTokenTypes.USER_DEFINED + token_score = -10000.0 + + # Map unk_token to UNKNOWN, other special tokens to CONTROL + # Set the score to 0.0 as in the original tokenizer.model + if ("special" in token_json) and token_json["special"]: + if token_content == tokenizer_config_json["unk_token"]: + token_type = SentencePieceTokenTypes.UNKNOWN + else: + token_type = SentencePieceTokenTypes.CONTROL + token_score = 0.0 + + logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})") + tokens[token_id] = token_content.encode("utf-8") + toktypes[token_id] = token_type + scores[token_id] = token_score + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith("q_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith("k_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.hparams["num_local_experts"] + + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for wid in ["w1", "w2", "w3"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def write_tensors(self): + super().write_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise 
ValueError(f"Unprocessed experts: {experts}") + + +@Model.register("DeepseekV2ForCausalLM") +class DeepseekV2Model(Model): + model_arch = gguf.MODEL_ARCH.DEEPSEEK2 + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + + self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: + self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) + self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length(hparams["v_head_dim"]) + self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) + self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) + self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "yarn": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) + self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"]) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("mlp.experts") != -1: + n_experts = self.hparams["n_routed_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def write_tensors(self): + super().write_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +###### CONVERSION LOGIC ###### + + +# tree of lazy tensors +class LazyTorchTensor(gguf.LazyBase): + _tensor_type = torch.Tensor + # to keep the type-checker happy + dtype: torch.dtype + shape: torch.Size + + # only used when converting a torch.Tensor to a np.ndarray + _dtype_map: dict[torch.dtype, type] = { + torch.float16: np.float16, + torch.float32: np.float32, + } + + def numpy(self) -> gguf.LazyNumpyTensor: + dtype = self._dtype_map[self.dtype] + return 
gguf.LazyNumpyTensor( + meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), + lazy=self._lazy, + args=(self,), + func=(lambda s: s[0].numpy()) + ) + + @classmethod + def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor: + return torch.empty(size=shape, dtype=dtype, device="meta") + + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + del types # unused + + if kwargs is None: + kwargs = {} + + if func is torch.Tensor.numpy: + return args[0].numpy() + + return LazyTorchTensor._wrap_fn(func)(*args, **kwargs) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Convert a huggingface model to a GGML compatible file") + parser.add_argument( + "--vocab-only", action="store_true", + help="extract only the vocab", + ) + parser.add_argument( + "--awq-path", type=Path, default=None, + help="Path to scale awq cache file", + ) + parser.add_argument( + "--outfile", type=Path, + help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", + ) + parser.add_argument( + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16", + help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", + ) + parser.add_argument( + "--bigendian", action="store_true", + help="model is executed on big endian machine", + ) + parser.add_argument( + "model", type=Path, + help="directory containing model file", + ) + parser.add_argument( + "--use-temp-file", action="store_true", + help="use the tempfile library while processing (helpful when running out of memory, process killed)", + ) + parser.add_argument( + "--no-lazy", action="store_true", + help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)", + ) + parser.add_argument( + "--model-name", type=str, default=None, + help="name of the model", + ) + parser.add_argument( + "--verbose", action="store_true", + help="increase output verbosity", + ) + parser.add_argument( + "--split", action="store_true", + help="split the converted model into multiple files" + ) + parser.add_argument( + "--split-max-tensors", type=int, + help="max tensors in each split" + ) + parser.add_argument( + "--split-max-size", type=str, + help="max size per split N(M|G)" + ) + parser.add_argument( + "--dry-run", action="store_true", + help="only print out a split plan and exit, without writing any new files" + ) + parser.add_argument( + "--small-first-shard", action="store_true", + help="do not add tensors to the first shard (disabled by default)" + ) + + return parser.parse_args() + + +def main() -> None: + args = parse_args() + + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + + dir_model = args.model + + if args.awq_path: + sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) + from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] + tmp_model_path = args.model / "weighted_model" + dir_model = tmp_model_path + if tmp_model_path.is_dir(): + logger.info(f"{tmp_model_path} exists as a weighted model.") + else: + tmp_model_path.mkdir(parents=True, exist_ok=True) + logger.info("Saving new weighted model ...") + add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) + logger.info(f"Saved weighted model at {tmp_model_path}.") + + if not dir_model.is_dir(): + logger.error(f'Error: 
{args.model} is not a directory') + sys.exit(1) + + if args.split and not (args.split_max_tensors or args.split_max_size): + raise ValueError("Need to specify one of --split-max-tensors or --split-max-size when splitting") + + if args.split_max_tensors and args.split_max_size: + raise ValueError("Can't specify both --split-max-tensors and --split-max-size") + + split_arguments = gguf.SplitArguments(args) + + ftype_map: dict[str, gguf.LlamaFileType] = { + "f32": gguf.LlamaFileType.ALL_F32, + "f16": gguf.LlamaFileType.MOSTLY_F16, + "bf16": gguf.LlamaFileType.MOSTLY_BF16, + "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, + "auto": gguf.LlamaFileType.GUESSED, + } + + if args.outfile is not None: + fname_out = args.outfile + else: + # output in the same directory as the model by default + fname_out = dir_model / 'ggml-model-{ftype}.gguf' + + logger.info(f"Loading model: {dir_model.name}") + + hparams = Model.load_hparams(dir_model) + + with torch.inference_mode(): + try: + model_class = Model.from_model_architecture(hparams["architectures"][0]) + except NotImplementedError: + logger.error(f"Model {hparams['architectures'][0]} is not supported") + sys.exit(1) + + model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, + args.no_lazy, split_arguments) + + logger.info("Set model parameters") + model_instance.set_gguf_parameters() + + logger.info("Set model tokenizer") + model_instance.set_vocab() + + model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + + if args.vocab_only: + logger.info(f"Exporting model vocab to '{model_instance.fname_out}'") + model_instance.write_vocab() + else: + logger.info(f"Exporting model to '{model_instance.fname_out}'") + model_instance.write() + + logger.info(f"Model successfully exported to '{model_instance.fname_out}'") + + +if __name__ == '__main__': + main() From 0283fc1771c3dee0e4579ed6ae1f661927079989 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Fri, 7 Jun 2024 17:24:27 -0400 Subject: [PATCH 29/66] fix line endings --- convert-hf-to-gguf.py | 5814 ++++++++++++++++++++--------------------- 1 file changed, 2907 insertions(+), 2907 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 62e02472148f7..b4ea11a15f760 100644 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1,2907 +1,2907 @@ -#!/usr/bin/env python3 - -from __future__ import annotations - -import logging -import argparse -import contextlib -import json -import os -import re -import sys -from enum import IntEnum -from pathlib import Path -from hashlib import sha256 -from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast - -import math -import numpy as np -import torch - -if TYPE_CHECKING: - from torch import Tensor - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) -import importlib -gguf = importlib.import_module("gguf-py.gguf") -# import gguf - -logger = logging.getLogger("hf-to-gguf") - - -###### MODEL DEFINITIONS ###### - -class SentencePieceTokenTypes(IntEnum): - NORMAL = 1 - UNKNOWN = 2 - CONTROL = 3 - USER_DEFINED = 4 - UNUSED = 5 - BYTE = 6 - - -AnyModel = TypeVar("AnyModel", bound="type[Model]") - - -class Model: - _model_classes: dict[str, type[Model]] = {} - - dir_model: Path - ftype: int - is_big_endian: bool - endianess: gguf.GGUFEndian - use_temp_file: bool - lazy: bool - part_names: list[str] - is_safetensors: bool - hparams: dict[str, Any] - block_count: int - 
tensor_map: gguf.TensorNameMap - tensor_names: set[str] | None - fname_out: Path - gguf_writer: gguf.GGUFWriter - - # subclasses should define this! - model_arch: gguf.MODEL_ARCH - - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, - split_arguments: gguf.SplitArguments): - if type(self) is Model: - raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") - self.dir_model = dir_model - self.ftype = ftype - self.is_big_endian = is_big_endian - self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE - self.use_temp_file = use_temp_file - self.lazy = not eager - self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors") - self.is_safetensors = len(self.part_names) > 0 - if not self.is_safetensors: - self.part_names = Model.get_model_part_names(self.dir_model, ".bin") - self.hparams = Model.load_hparams(self.dir_model) - self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - self.tensor_names = None - if self.ftype == gguf.LlamaFileType.GUESSED: - # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. - _, first_tensor = next(self.get_tensors()) - if first_tensor.dtype == torch.float16: - logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})") - self.ftype = gguf.LlamaFileType.MOSTLY_F16 - else: - logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})") - self.ftype = gguf.LlamaFileType.MOSTLY_BF16 - ftype_up: str = self.ftype.name.partition("_")[2].upper() - ftype_lw: str = ftype_up.lower() - # allow templating the file name with the output ftype, useful with the "auto" ftype - self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) - self.gguf_writer = gguf.GGUFWriterSplit(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], split_arguments, - endianess=self.endianess, use_temp_file=self.use_temp_file) - - @classmethod - def __init_subclass__(cls): - # can't use an abstract property, because overriding it without type errors - # would require using decorated functions instead of simply defining the property - if "model_arch" not in cls.__dict__: - raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") - - def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: - key = next((k for k in keys if k in self.hparams), None) - if key is not None: - return self.hparams[key] - if optional: - return None - raise KeyError(f"could not find any of: {keys}") - - def set_vocab(self): - self._set_vocab_gpt2() - - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: - tensor_names_from_parts: set[str] = set() - - if len(self.part_names) > 1: - self.tensor_names = set() - index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin" - index_name += ".index.json" - logger.info(f"gguf: loading model weight map from '{index_name}'") - with open(self.dir_model / index_name, "r", encoding="utf-8") as f: - index: dict[str, Any] = json.load(f) - weight_map = index.get("weight_map") - if weight_map is None or not isinstance(weight_map, dict): - raise ValueError(f"Can't load 'weight_map' from {index_name!r}") - self.tensor_names.update(weight_map.keys()) - else: - self.tensor_names = tensor_names_from_parts - - for part_name in self.part_names: - 
logger.info(f"gguf: loading model part '{part_name}'") - ctx: ContextManager[Any] - if self.is_safetensors: - from safetensors import safe_open - ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) - else: - ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True)) - - with ctx as model_part: - tensor_names_from_parts.update(model_part.keys()) - - for name in model_part.keys(): - data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] - if self.lazy: - data = LazyTorchTensor.from_eager(data) - yield name, data - - # only verify tensor name presence; it doesn't matter if they are not in the right files - if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: - raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}") - - def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str: - if key not in gguf.MODEL_TENSORS[self.model_arch]: - raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}") - name: str = gguf.TENSOR_NAMES[key] - if "{bid}" in name: - assert bid is not None - name = name.format(bid=bid) - return name + suffix - - def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool: - if key not in gguf.MODEL_TENSORS[self.model_arch]: - return False - key_name: str = gguf.TENSOR_NAMES[key] - if "{bid}" in key_name: - if bid is None: - return False - key_name = key_name.format(bid=bid) - else: - if bid is not None: - return False - return name == (key_name + suffix) - - def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: - new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes) - if new_name is None: - raise ValueError(f"Can not map tensor {name!r}") - return new_name - - def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_block_count(self.block_count) - - if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: - self.gguf_writer.add_context_length(n_ctx) - logger.info(f"gguf: context length = {n_ctx}") - - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - self.gguf_writer.add_embedding_length(n_embd) - logger.info(f"gguf: embedding length = {n_embd}") - - if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: - self.gguf_writer.add_feed_forward_length(n_ff) - logger.info(f"gguf: feed forward length = {n_ff}") - - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - self.gguf_writer.add_head_count(n_head) - logger.info(f"gguf: head count = {n_head}") - - if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: - self.gguf_writer.add_head_count_kv(n_head_kv) - logger.info(f"gguf: key-value head count = {n_head_kv}") - - if (rope_theta := self.hparams.get("rope_theta")) is not None: - self.gguf_writer.add_rope_freq_base(rope_theta) - logger.info(f"gguf: rope theta = {rope_theta}") - if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: - self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) - logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") - if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: - self.gguf_writer.add_layer_norm_eps(f_norm_eps) - 
logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") - if (n_experts := self.hparams.get("num_local_experts")) is not None: - self.gguf_writer.add_expert_count(n_experts) - logger.info(f"gguf: expert count = {n_experts}") - if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: - self.gguf_writer.add_expert_used_count(n_experts_used) - logger.info(f"gguf: experts used count = {n_experts_used}") - - self.gguf_writer.add_file_type(self.ftype) - logger.info(f"gguf: file type = {self.ftype}") - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - return [(self.map_tensor_name(name), data_torch)] - - def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: - del name, new_name, bid, n_dims # unused - - return False - - def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: - del name, new_name, bid, n_dims # unused - - return False - - def write_tensors(self): - max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") - - for name, data_torch in self.get_tensors(): - # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): - continue - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - # use the first number-like part of the tensor name as the block id - bid = None - for part in name.split("."): - if part.isdecimal(): - bid = int(part) - break - - for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)): - data: np.ndarray = data # type hint - n_dims = len(data.shape) - data_dtype = data.dtype - data_qtype: gguf.GGMLQuantizationType | None = None - - # when both are True, f32 should win - extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims) - extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims) - - # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors - # Conditions should closely match those in llama_model_quantize_internal in llama.cpp - extra_f32 = any(cond for cond in ( - extra_f32, - n_dims == 1, - new_name.endswith("_norm.weight"), - )) - - # Some tensor types are always in float32 - extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in ( - gguf.MODEL_TENSOR.FFN_GATE_INP, - gguf.MODEL_TENSOR.POS_EMBD, - gguf.MODEL_TENSOR.TOKEN_TYPES, - )) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - extra_f16 = any(cond for cond in ( - extra_f16, - (name.endswith(".weight") and n_dims >= 2), - )) - - if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32: - if self.ftype == gguf.LlamaFileType.MOSTLY_BF16: - data = gguf.quantize_bf16(data) - assert data.dtype == np.int16 - data_qtype = gguf.GGMLQuantizationType.BF16 - - elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data): - data = gguf.quantize_q8_0(data) - assert data.dtype == np.uint8 - data_qtype = gguf.GGMLQuantizationType.Q8_0 - - else: # default to float16 for quantized tensors - if data_dtype != np.float16: - data = data.astype(np.float16) - data_qtype = gguf.GGMLQuantizationType.F16 - - if data_qtype is None: # by default, convert to float32 - if data_dtype != np.float32: - data = data.astype(np.float32) - data_qtype = 
gguf.GGMLQuantizationType.F32 - - shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape - - # reverse shape to make it similar to the internal ggml dimension order - shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" - - # n_dims is implicit in the shape - logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") - - self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) - - def write(self): - self.write_tensors() - self.gguf_writer.init_shards() - self.gguf_writer.write_header_to_file() - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file(progress=True) - self.gguf_writer.close() - - def write_vocab(self): - if self.gguf_writer.split_arguments.split: - raise ValueError('Splitting the vocabulary is not supported') - self.gguf_writer.init_shards() - self.gguf_writer.write_header_to_file() - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.close() - - @staticmethod - def get_model_part_names(dir_model: Path, suffix: str) -> list[str]: - part_names: list[str] = [] - for filename in os.listdir(dir_model): - if filename.endswith(suffix): - part_names.append(filename) - - part_names.sort() - - return part_names - - @staticmethod - def load_hparams(dir_model: Path): - with open(dir_model / "config.json", "r", encoding="utf-8") as f: - return json.load(f) - - @classmethod - def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: - assert names - - def func(modelcls: AnyModel) -> AnyModel: - for name in names: - cls._model_classes[name] = modelcls - return modelcls - return func - - @classmethod - def from_model_architecture(cls, arch: str) -> type[Model]: - try: - return cls._model_classes[arch] - except KeyError: - raise NotImplementedError(f'Architecture {arch!r} not supported!') from None - - # used for GPT-2 BPE and WordPiece vocabs - def get_vocab_base(self) -> tuple[list[str], list[int], str]: - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) - assert max(tokenizer.vocab.values()) < vocab_size - - tokpre = self.get_vocab_base_pre(tokenizer) - - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} - added_vocab = tokenizer.get_added_vocab() - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - return tokens, toktypes, tokpre - - # NOTE: this function is generated by convert-hf-to-gguf-update.py - # do not modify it manually! 
- # ref: https://github.com/ggerganov/llama.cpp/pull/6920 - # Marker: Start get_vocab_base_pre - def get_vocab_base_pre(self, tokenizer) -> str: - # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that - # is specific for the BPE pre-tokenizer used by the model - # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can - # use in llama.cpp to implement the same pre-tokenizer - - chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' - - chktok = tokenizer.encode(chktxt) - chkhsh = sha256(str(chktok).encode()).hexdigest() - - logger.debug(f"chktok: {chktok}") - logger.debug(f"chkhsh: {chkhsh}") - - res = None - - # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script - # or pull the latest version of the model from Huggingface - # don't edit the hashes manually! - if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": - # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B - res = "llama-bpe" - if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": - # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base - res = "deepseek-llm" - if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821": - # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base - res = "deepseek-coder" - if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": - # ref: https://huggingface.co/tiiuae/falcon-7b - res = "falcon" - if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": - # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 - res = "bert-bge" - if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": - # ref: https://huggingface.co/mosaicml/mpt-7b - res = "mpt" - if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34": - # ref: https://huggingface.co/bigcode/starcoder2-3b - res = "starcoder" - if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": - # ref: https://huggingface.co/openai-community/gpt2 - res = "gpt-2" - if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3": - # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b - res = "stablelm2" - if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": - # ref: https://huggingface.co/smallcloudai/Refact-1_6-base - res = "refact" - if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8": - # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01 - res = "command-r" - if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea": - # ref: https://huggingface.co/Qwen/Qwen1.5-7B - res = "qwen2" - if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": - # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf - res = "olmo" - if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e": - # ref: https://huggingface.co/databricks/dbrx-base - res = "dbrx" - if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": - # ref: 
https://huggingface.co/jinaai/jina-embeddings-v2-base-en - res = "jina-v2-en" - if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es - res = "jina-v2-es" - if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de - res = "jina-v2-de" - if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": - # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct - res = "smaug-bpe" - - if res is None: - logger.warning("\n") - logger.warning("**************************************************************************************") - logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") - logger.warning("** There are 2 possible reasons for this:") - logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet") - logger.warning("** - the pre-tokenization config has changed upstream") - logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") - logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") - logger.warning("**") - logger.warning(f"** chkhsh: {chkhsh}") - logger.warning("**************************************************************************************") - logger.warning("\n") - raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") - - logger.debug(f"tokenizer.ggml.pre: {repr(res)}") - logger.debug(f"chkhsh: {chkhsh}") - - return res - # Marker: End get_vocab_base_pre - - def _set_vocab_gpt2(self) -> None: - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_qwen(self): - dir_model = self.dir_model - hparams = self.hparams - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams["vocab_size"] - assert max(tokenizer.get_vocab().values()) < vocab_size - - tokpre = self.get_vocab_base_pre(tokenizer) - - merges = [] - vocab = {} - mergeable_ranks = tokenizer.mergeable_ranks - for token, rank in mergeable_ranks.items(): - vocab[QwenModel.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - assert len(merged) == 2 - merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) - - # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined - added_vocab = tokenizer.special_tokens - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()} - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.CONTROL) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - 
self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) - special_vocab.merges = merges - # only add special tokens when they were not already loaded from config.json - if len(special_vocab.special_token_ids) == 0: - special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) - special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) - # this one is usually not in config.json anyway - special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_sentencepiece(self): - from sentencepiece import SentencePieceProcessor - - tokenizer_path = self.dir_model / 'tokenizer.model' - - tokens: list[bytes] = [] - scores: list[float] = [] - toktypes: list[int] = [] - - if not tokenizer_path.is_file(): - raise FileNotFoundError(f"File not found: {tokenizer_path}") - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - for key in added_tokens_json: - token_id = added_tokens_json[key] - if (token_id >= vocab_size): - logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - tokens[token_id] = key.encode("utf-8") - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - - if vocab_size > len(tokens): - pad_count = vocab_size - len(tokens) - logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") - for i in range(1, pad_count + 1): - tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.UNUSED) - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_llama_hf(self): - vocab = gguf.LlamaHfVocab(self.dir_model) - tokens = [] - scores = [] - toktypes = [] - - for text, score, toktype in vocab.all_tokens(): - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - assert len(tokens) == vocab.vocab_size - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - 
self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - -@Model.register("GPTNeoXForCausalLM") -class GPTNeoXModel(Model): - model_arch = gguf.MODEL_ARCH.GPTNEOX - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count( - int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), - ) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - - tensors: list[tuple[str, Tensor]] = [] - - if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa - qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) - data_torch = torch.cat( - ( - qkv_weights[:, 0, :, :].reshape((-1, n_embed)), - qkv_weights[:, 1, :, :].reshape((-1, n_embed)), - qkv_weights[:, 2, :, :].reshape((-1, n_embed)), - ), - dim=0, - ) - logger.info("re-format attention.linear_qkv.weight") - elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name): - qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) - data_torch = torch.cat( - ( - qkv_bias[:, 0, :].reshape((n_embed,)), - qkv_bias[:, 1, :].reshape((n_embed,)), - qkv_bias[:, 2, :].reshape((n_embed,)), - ), - dim=0, - ) - logger.info("re-format attention.linear_qkv.bias") - - tensors.append((self.map_tensor_name(name), data_torch)) - - return tensors - - -@Model.register("BloomForCausalLM") -class BloomModel(Model): - model_arch = gguf.MODEL_ARCH.BLOOM - - def set_gguf_parameters(self): - self.gguf_writer.add_name("Bloom") - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) - self.gguf_writer.add_embedding_length(n_embed) - self.gguf_writer.add_feed_forward_length(4 * n_embed) - self.gguf_writer.add_block_count(self.hparams["n_layer"]) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - n_head = 
self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - - name = re.sub(r'transformer\.', '', name) - - tensors: list[tuple[str, Tensor]] = [] - - if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa - qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) - data_torch = torch.cat( - ( - qkv_weights[:, 0, :, :].reshape((-1, n_embed)), - qkv_weights[:, 1, :, :].reshape((-1, n_embed)), - qkv_weights[:, 2, :, :].reshape((-1, n_embed)), - ), - dim=0, - ) - logger.info("re-format attention.linear_qkv.weight") - elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): - qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) - data_torch = torch.cat( - ( - qkv_bias[:, 0, :].reshape((n_embed,)), - qkv_bias[:, 1, :].reshape((n_embed,)), - qkv_bias[:, 2, :].reshape((n_embed,)), - ), - dim=0, - ) - logger.info("re-format attention.linear_qkv.bias") - - tensors.append((self.map_tensor_name(name), data_torch)) - - if name == "word_embeddings.weight": - assert self.tensor_names is not None - - # TODO: tie them at runtime, don't duplicate in the model file - if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")): - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - - return tensors - - -@Model.register("MPTForCausalLM") -class MPTModel(Model): - model_arch = gguf.MODEL_ARCH.MPT - - def set_vocab(self): - try: - self._set_vocab_gpt2() - except Exception: - # Fallback for SEA-LION model - self._set_vocab_sentencepiece() - self.gguf_writer.add_add_bos_token(False) - self.gguf_writer.add_pad_token_id(3) - self.gguf_writer.add_eos_token_id(1) - self.gguf_writer.add_unk_token_id(0) - - def set_gguf_parameters(self): - block_count = self.hparams["n_layers"] - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"]) - self.gguf_writer.add_head_count(self.hparams["n_heads"]) - if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"): - self.gguf_writer.add_head_count_kv(kv_n_heads) - self.gguf_writer.add_layer_norm_eps(1e-5) - if self.hparams["attn_config"]["clip_qkv"] is not None: - self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) - if self.hparams["attn_config"]["alibi"]: - self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) - else: - self.gguf_writer.add_max_alibi_bias(0.0) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - if "scales" in name: - new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias", ".scales")) - new_name = new_name.replace("scales", "act.scales") - else: - new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias")) - - return [(new_name, data_torch)] - - -@Model.register("OrionForCausalLM") -class OrionModel(Model): - model_arch = gguf.MODEL_ARCH.ORION - - def set_vocab(self): - 
self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") - - ctx_length = 0 - if "max_sequence_length" in self.hparams: - ctx_length = self.hparams["max_sequence_length"] - elif "max_position_embeddings" in self.hparams: - ctx_length = self.hparams["max_position_embeddings"] - elif "model_max_length" in self.hparams: - ctx_length = self.hparams["model_max_length"] - else: - raise ValueError("gguf: can not find ctx length parameter.") - - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_source_hf_repo(hf_repo) - self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_context_length(ctx_length) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - # note: config provides rms norm but it is actually layer norm - # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 - self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) - - -@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM") -class BaichuanModel(Model): - model_arch = gguf.MODEL_ARCH.BAICHUAN - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") - - ctx_length = 0 - if "max_sequence_length" in self.hparams: - ctx_length = self.hparams["max_sequence_length"] - elif "max_position_embeddings" in self.hparams: - ctx_length = self.hparams["max_position_embeddings"] - elif "model_max_length" in self.hparams: - ctx_length = self.hparams["model_max_length"] - else: - raise ValueError("gguf: can not find ctx length parameter.") - - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_source_hf_repo(hf_repo) - self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_context_length(ctx_length) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - head_count = self.hparams["num_attention_heads"] - head_count_kv = 
self.hparams.get("num_key_value_heads", head_count) - - tensors: list[tuple[str, Tensor]] = [] - - if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight": - logger.info(f"Unpacking and permuting layer {bid}") - tensors = [ - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), - self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), - self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), - self._reverse_hf_part(data_torch, 2)), - ] - else: - tensors = [(self.map_tensor_name(name), data_torch)] - - return tensors - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - def _reverse_hf_permute_part( - self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, - ) -> Tensor: - r = weights.shape[0] // 3 - return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) - - def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: - r = weights.shape[0] // 3 - return weights[r * n_part:r * n_part + r, ...] - - -@Model.register("XverseForCausalLM") -class XverseModel(Model): - model_arch = gguf.MODEL_ARCH.XVERSE - - def set_vocab(self): - assert (self.dir_model / "tokenizer.json").is_file() - dir_model = self.dir_model - hparams = self.hparams - - tokens: list[bytes] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model) - vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) - assert max(tokenizer.vocab.values()) < vocab_size - - reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} - added_vocab = tokenizer.get_added_vocab() - - for token_id in range(vocab_size): - token_text = reverse_vocab[token_id].encode('utf-8') - # replace "\x00" to string with length > 0 - if token_text == b"\x00": - toktype = gguf.TokenType.BYTE # special - token_text = f"<{token_text}>".encode('utf-8') - elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): - toktype = gguf.TokenType.BYTE # special - elif reverse_vocab[token_id] in added_vocab: - if tokenizer.added_tokens_decoder[token_id].special: - toktype = gguf.TokenType.CONTROL - else: - toktype = gguf.TokenType.USER_DEFINED - else: - toktype = gguf.TokenType.NORMAL - - tokens.append(token_text) - toktypes.append(toktype) - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") - - ctx_length = 0 - if "max_sequence_length" in self.hparams: - ctx_length = self.hparams["max_sequence_length"] - elif "max_position_embeddings" in self.hparams: - ctx_length = self.hparams["max_position_embeddings"] - elif "model_max_length" in self.hparams: - ctx_length = 
self.hparams["model_max_length"] - else: - raise ValueError("gguf: can not find ctx length parameter.") - - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_source_hf_repo(hf_repo) - self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_context_length(ctx_length) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - # HF models permute some of the tensors, so we need to undo that - if name.endswith("q_proj.weight"): - data_torch = self._reverse_hf_permute(data_torch, head_count, head_count) - if name.endswith("k_proj.weight"): - data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv) - - return [(self.map_tensor_name(name), data_torch)] - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - -@Model.register("FalconForCausalLM", "RWForCausalLM") -class FalconModel(Model): - model_arch = gguf.MODEL_ARCH.FALCON - - def set_gguf_parameters(self): - block_count = self.hparams.get("num_hidden_layers") - if block_count is None: - block_count = self.hparams["n_layer"] # old name - - n_head = self.hparams.get("num_attention_heads") - if n_head is None: - n_head = self.hparams["n_head"] # old name - - n_head_kv = self.hparams.get("num_kv_heads") - if n_head_kv is None: - n_head_kv = self.hparams.get("n_head_kv", 1) # old name - - self.gguf_writer.add_name("Falcon") - self.gguf_writer.add_context_length(2048) # not in config.json - self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head_kv) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - # QKV tensor transform - # The original query_key_value tensor contains n_head_kv "kv groups", - # each consisting of n_head/n_head_kv query weights followed by one key - # and one value weight (shared by all query heads 
in the kv group). - # This layout makes it a big pain to work with in GGML. - # So we rearrange them here,, so that we have n_head query weights - # followed by n_head_kv key weights followed by n_head_kv value weights, - # in contiguous fashion. - # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py - - if "query_key_value" in name: - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - n_head_kv = self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1 - head_dim = self.hparams["hidden_size"] // n_head - - qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) - q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) - k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) - v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) - data_torch = torch.cat((q, k, v)).reshape_as(data_torch) - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("GPTBigCodeForCausalLM") -class StarCoderModel(Model): - model_arch = gguf.MODEL_ARCH.STARCODER - - def set_gguf_parameters(self): - block_count = self.hparams["n_layer"] - - self.gguf_writer.add_name("StarCoder") - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(1) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - -@Model.register("GPTRefactForCausalLM") -class RefactModel(Model): - model_arch = gguf.MODEL_ARCH.REFACT - - def set_vocab(self): - super().set_vocab() - - # TODO: how to determine special FIM tokens automatically? - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, - special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot']) - special_vocab._set_special_token("prefix", 1) - special_vocab._set_special_token("suffix", 3) - special_vocab._set_special_token("middle", 2) - special_vocab._set_special_token("fsep", 4) # is this correct? - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - hidden_dim = self.hparams["n_embd"] - inner_dim = 4 * hidden_dim - hidden_dim = int(2 * inner_dim / 3) - multiple_of = 256 - ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - - block_count = self.hparams["n_layer"] - - self.gguf_writer.add_name("Refact") - # refact uses Alibi. So this is from config.json which might be used by training. 
- self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - - self.gguf_writer.add_feed_forward_length(ff_dim) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(1) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - hidden_dim = self.hparams["n_embd"] - inner_dim = 4 * hidden_dim - hidden_dim = int(2 * inner_dim / 3) - multiple_of = 256 - ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - n_head = self.hparams["n_head"] - n_head_kv = 1 - head_dim = self.hparams["n_embd"] // n_head - - tensors: list[tuple[str, Tensor]] = [] - - if bid is not None: - if name == f"transformer.h.{bid}.attn.kv.weight": - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), data_torch[:n_head_kv * head_dim])) - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), data_torch[n_head_kv * head_dim:])) - elif name == f"transformer.h.{bid}.attn.q.weight": - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch)) - elif name == f"transformer.h.{bid}.mlp.gate_up_proj.weight": - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])) - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])) - - if len(tensors) == 0: - tensors.append((self.map_tensor_name(name), data_torch)) - - return tensors - - -@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") -class StableLMModel(Model): - model_arch = gguf.MODEL_ARCH.STABLELM - - def set_vocab(self): - if (self.dir_model / "tokenizer.json").is_file(): - self._set_vocab_gpt2() - else: - # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab - self._set_vocab_qwen() - - def set_gguf_parameters(self): - hparams = self.hparams - block_count = hparams["num_hidden_layers"] - - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) - self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) - self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) - self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) - self.gguf_writer.add_file_type(self.ftype) - - _q_norms: list[dict[str, Tensor]] | None = None - _k_norms: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams["num_key_value_heads"] - - if name.find("q_layernorm.norms") != -1: - assert bid is not None - - if self._q_norms is None: - self._q_norms = [{} for _ in range(self.block_count)] - - 
self._q_norms[bid][name] = data_torch - - if len(self._q_norms[bid]) >= n_head: - return self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm") - else: - return [] - - if name.find("k_layernorm.norms") != -1: - assert bid is not None - - if self._k_norms is None: - self._k_norms = [{} for _ in range(self.block_count)] - - self._k_norms[bid][name] = data_torch - - if len(self._k_norms[bid]) >= n_kv_head: - return self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm") - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm"): - datas: list[Tensor] = [] - # extract the norms in order - for xid in range(n_head): - ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight" - datas.append(norms[ename]) - del norms[ename] - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" - new_name = self.map_tensor_name(merged_name) - - return [(new_name, data_torch)] - - def write_tensors(self): - super().write_tensors() - - if self._q_norms is not None or self._k_norms is not None: - # flatten two `list[dict[str, Tensor]]` into a single `list[str]` - norms = ( - [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None else [] - ) + ( - [k for d in self._k_norms for k in d.keys()] if self._k_norms is not None else [] - ) - if len(norms) > 0: - raise ValueError(f"Unprocessed norms: {norms}") - - -@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") -class LlamaModel(Model): - model_arch = gguf.MODEL_ARCH.LLAMA - - def set_vocab(self): - try: - self. _set_vocab_sentencepiece() - except FileNotFoundError: - try: - self._set_vocab_llama_hf() - except (FileNotFoundError, TypeError): - # Llama 3 - self._set_vocab_gpt2() - - # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) - if self.hparams.get("vocab_size", 32000) == 32016: - special_vocab = gguf.SpecialVocab( - self.dir_model, load_merges=False, - special_token_types = ['prefix', 'suffix', 'middle', 'eot'] - ) - special_vocab._set_special_token("prefix", 32007) - special_vocab._set_special_token("suffix", 32008) - special_vocab._set_special_token("middle", 32009) - special_vocab._set_special_token("eot", 32010) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) - - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) - - # Apply to granite small models only - if self.hparams.get("vocab_size", 32000) == 49152: - self.gguf_writer.add_add_bos_token(False) - - @staticmethod - def permute(weights: Tensor, n_head: int, n_head_kv: int | None): - if 
n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - - # process the experts separately - if name.find("block_sparse_moe.experts") != -1: - n_experts = self.hparams["num_local_experts"] - - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for wid in ["w1", "w2", "w3"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - def write_tensors(self): - super().write_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@Model.register("GrokForCausalLM") -class GrokModel(Model): - model_arch = gguf.MODEL_ARCH.GROK - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_name("Grok") - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find(".moe.") != -1: - n_experts = self.hparams["num_local_experts"] - - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for wid in ["linear", "linear_1", "linear_v"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("DbrxForCausalLM") -class DbrxModel(Model): - model_arch = gguf.MODEL_ARCH.DBRX - - def set_gguf_parameters(self): - ffn_config = 
self.hparams["ffn_config"] - attn_config = self.hparams["attn_config"] - self.gguf_writer.add_name(self.hparams["model_type"]) - self.gguf_writer.add_block_count(self.hparams["n_layers"]) - - self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"]) - - self.gguf_writer.add_head_count(self.hparams["n_heads"]) - self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"]) - - self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"]) - - self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"]) - self.gguf_writer.add_file_type(self.ftype) - - self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"]) - self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"]) - - self.gguf_writer.add_layer_norm_eps(1e-5) - - self.gguf_writer.add_file_type(self.ftype) - logger.info(f"gguf: file type = {self.ftype}") - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - n_expert = self.hparams["ffn_config"]["moe_num_experts"] - n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] - n_embd = self.hparams["d_model"] - - # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose - # original implementation expects (n_expert, n_ff, n_embd) for all experts weights - # But llama.cpp moe graph works differently - # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions - # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor - exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} - "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} - "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} - experts = False - - for exp_tensor_name in exp_tensor_names.keys(): - if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: - experts = True - data_torch = data_torch.view(n_expert, n_ff, n_embd) - if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None: - data_torch = data_torch.permute(*permute_tensor) - break - - # map tensor names - # In MoE models the ffn tensors are typically most of the model weights, - # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight. 
- # Every other model has the weight names ending in .weight, - # let's assume that is the convention which is not the case for dbrx: - # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 - new_name = self.map_tensor_name(name if not experts else name + ".weight", try_suffixes=(".weight",)) - - return [(new_name, data_torch)] - - def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: - del name, new_name, bid # unused - - return n_dims > 1 - - -@Model.register("MiniCPMForCausalLM") -class MiniCPMModel(Model): - model_arch = gguf.MODEL_ARCH.MINICPM - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - self.gguf_writer.add_name("MiniCPM") - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - - def set_vocab(self): - self._set_vocab_llama_hf() - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - # HF models permute some of the tensors, so we need to undo that - if name.endswith(("q_proj.weight")): - data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight")): - data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head) - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("QWenLMHeadModel") -class QwenModel(Model): - model_arch = gguf.MODEL_ARCH.QWEN - - @staticmethod - def token_bytes_to_string(b): - from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode - byte_encoder = bytes_to_unicode() - return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) - - @staticmethod - def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: - parts = [bytes([b]) for b in token] - while True: - min_idx = None - min_rank = None - for i, pair in enumerate(zip(parts[:-1], parts[1:])): - rank = mergeable_ranks.get(pair[0] + pair[1]) - if rank is not None and (min_rank is None or rank < min_rank): - min_idx = i - min_rank = rank - if min_rank is None or (max_rank is not None and min_rank >= max_rank): - break - assert min_idx is not None - parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] - return parts - - def set_vocab(self): - self._set_vocab_qwen() - - def set_gguf_parameters(self): - self.gguf_writer.add_name("Qwen") - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - 
self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - -@Model.register("Qwen2ForCausalLM") -class Qwen2Model(Model): - model_arch = gguf.MODEL_ARCH.QWEN2 - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() - - -@Model.register("Qwen2MoeForCausalLM") -class Qwen2MoeModel(Model): - model_arch = gguf.MODEL_ARCH.QWEN2MOE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if (n_experts := self.hparams.get("num_experts")) is not None: - self.gguf_writer.add_expert_count(n_experts) - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find("experts") != -1: - n_experts = self.hparams["num_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - def write_tensors(self): - super().write_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@Model.register("GPT2LMHeadModel") -class GPT2Model(Model): - model_arch = gguf.MODEL_ARCH.GPT2 - - def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_block_count(self.hparams["n_layer"]) - self.gguf_writer.add_context_length(self.hparams["n_ctx"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - tensors: list[tuple[str, Tensor]] = [] - - # we don't need these - if name.endswith((".attn.bias", ".attn.masked_bias")): - return tensors - - if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): - data_torch = data_torch.transpose(1, 0) - - new_name = 
self.map_tensor_name(name) - - tensors.append((new_name, data_torch)) - - # note: GPT2 output is tied to (same as) wte in original model - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - - return tensors - - -@Model.register("PhiForCausalLM") -class Phi2Model(Model): - model_arch = gguf.MODEL_ARCH.PHI2 - - def set_gguf_parameters(self): - block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) - - rot_pct = self.find_hparam(["partial_rotary_factor"]) - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - - self.gguf_writer.add_name("Phi2") - self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) - - self.gguf_writer.add_embedding_length(n_embd) - self.gguf_writer.add_feed_forward_length(4 * n_embd) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"])) - self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_add_bos_token(False) - - -@Model.register("Phi3ForCausalLM") -class Phi3MiniModel(Model): - model_arch = gguf.MODEL_ARCH.PHI3 - - def set_vocab(self): - from sentencepiece import SentencePieceProcessor - - tokenizer_path = self.dir_model / 'tokenizer.model' - - if not tokenizer_path.is_file(): - raise ValueError(f'Error: Missing {tokenizer_path}') - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - - for key in added_tokens_json: - token_id = added_tokens_json[key] - if (token_id >= vocab_size): - logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - tokens[token_id] = key.encode("utf-8") - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) - for token_id, foken_data in added_tokens_decoder.items(): - token_id = int(token_id) - token = 
foken_data["content"].encode("utf-8") - if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN: - assert tokens[token_id] == token - tokens[token_id] = token - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - if foken_data.get("special"): - toktypes[token_id] = SentencePieceTokenTypes.CONTROL - - tokenizer_file = self.dir_model / 'tokenizer.json' - if tokenizer_file.is_file(): - with open(tokenizer_file, "r", encoding="utf-8") as f: - tokenizer_json = json.load(f) - added_tokens = tokenizer_json.get("added_tokens", []) - for foken_data in added_tokens: - token_id = int(foken_data["id"]) - token = foken_data["content"].encode("utf-8") - if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN: - assert tokens[token_id] == token - tokens[token_id] = token - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - if foken_data.get("special"): - toktypes[token_id] = SentencePieceTokenTypes.CONTROL - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) - - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"]) - rms_eps = self.find_hparam(["rms_norm_eps"]) - max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) - orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) - rope_dims = n_embd // n_head - - self.gguf_writer.add_name("Phi3") - self.gguf_writer.add_context_length(max_pos_embds) - self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds) - self.gguf_writer.add_embedding_length(n_embd) - self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"])) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head_kv) - self.gguf_writer.add_layer_norm_rms_eps(rms_eps) - self.gguf_writer.add_rope_dimension_count(rope_dims) - self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) - self.gguf_writer.add_file_type(self.ftype) - - # write rope scaling for long context (128k) model - rope_scaling = self.find_hparam(['rope_scaling'], True) - if (rope_scaling is None): - return - - scale = max_pos_embds / orig_max_pos_embds - - rope_scaling_type = rope_scaling.get('type', '').lower() - if len(rope_scaling_type) == 0: - raise KeyError('Missing the required key rope_scaling.type') - - if rope_scaling_type == 'su': - attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0 - elif rope_scaling_type == 'yarn': - attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0 - else: - raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet') - - self.gguf_writer.add_rope_scaling_attn_factors(attn_factor) - - long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) - - if long_factors is None or short_factors is None: - raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') - - if 
len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: - raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') - - self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32)) - self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32)) - - -@Model.register("PlamoForCausalLM") -class PlamoModel(Model): - model_arch = gguf.MODEL_ARCH.PLAMO - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - hparams = self.hparams - block_count = hparams["num_hidden_layers"] - - self.gguf_writer.add_name("PLaMo") - self.gguf_writer.add_context_length(4096) # not in config.json - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong - self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - - def shuffle_attn_q_weight(self, data_torch): - assert data_torch.size() == (5120, 5120) - data_torch = data_torch.reshape(8, 5, 128, 5120) - data_torch = torch.permute(data_torch, (1, 0, 2, 3)) - data_torch = torch.reshape(data_torch, (5120, 5120)) - return data_torch - - def shuffle_attn_output_weight(self, data_torch): - assert data_torch.size() == (5120, 5120) - data_torch = data_torch.reshape(5120, 8, 5, 128) - data_torch = torch.permute(data_torch, (0, 2, 1, 3)) - data_torch = torch.reshape(data_torch, (5120, 5120)) - return data_torch - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - new_name = self.map_tensor_name(name) - - # shuffle for broadcasting of gqa in ggml_mul_mat - if new_name.endswith("attn_q.weight"): - data_torch = self.shuffle_attn_q_weight(data_torch) - elif new_name.endswith("attn_output.weight"): - data_torch = self.shuffle_attn_output_weight(data_torch) - - return [(new_name, data_torch)] - - -@Model.register("CodeShellForCausalLM") -class CodeShellModel(Model): - model_arch = gguf.MODEL_ARCH.CODESHELL - - def set_gguf_parameters(self): - block_count = self.hparams["n_layer"] - - self.gguf_writer.add_name("CodeShell") - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_rope_freq_base(10000.0) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(1.0) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - new_name = self.map_tensor_name(name) - - tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)] - - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - assert self.tensor_names is not None - - if all(s not in 
self.tensor_names for s in ("lm_head.weight", "output.weight")): - # copy tok_embd.weight to output.weight - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - - return tensors - - -@Model.register("InternLM2ForCausalLM") -class InternLM2Model(Model): - model_arch = gguf.MODEL_ARCH.INTERNLM2 - - def set_vocab(self): - # (TODO): Is there a better way? - # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character - # \x00 specially and convert it into an emoji character to prevent it from being mistakenly - # recognized as an empty string in C++. - from sentencepiece import SentencePieceProcessor - from sentencepiece import sentencepiece_model_pb2 as model - - tokenizer_path = self.dir_model / 'tokenizer.model' - - tokens: list[bytes] = [] - scores: list[float] = [] - toktypes: list[int] = [] - - if not tokenizer_path.is_file(): - logger.error(f'Error: Missing {tokenizer_path}') - sys.exit(1) - - sentencepiece_model = model.ModelProto() - sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - for token_id in range(vocab_size): - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - if text == b"\x00": - # (TODO): fixme - # Hack here and replace the \x00 characters. - logger.warning(f"InternLM2 convert token '{text}' to '🐉'!") - text = "🐉".encode("utf-8") - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - - for key in added_tokens_json: - tokens.append(key.encode("utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.USER_DEFINED) - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_add_space_prefix(add_prefix) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - old_eos = special_vocab.special_token_ids["eos"] - if "chat" in os.path.basename(self.dir_model.absolute()): - # For the chat model, we replace the eos with '<|im_end|>'. 
- # TODO: this is a hack, should be fixed - # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048 - special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer) - logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ -in chat mode so that the conversation can end normally.") - - special_vocab.add_to_gguf(self.gguf_writer) - - def _try_get_sft_eos(self, tokenizer): - unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]') - im_end_list = tokenizer.Encode('<|im_end|>') - eos_token = None - assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1) - if len(unused_145_list) == 1: - eos_token = unused_145_list[0] - if len(im_end_list) == 1: - eos_token = im_end_list[0] - assert eos_token - return eos_token - - def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - def set_gguf_parameters(self): - self.gguf_writer.add_name("InternLM2") - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - num_heads = self.hparams["num_attention_heads"] - num_kv_heads = self.hparams["num_key_value_heads"] - hidden_size = self.hparams["hidden_size"] - q_per_kv = num_heads // num_kv_heads - head_dim = hidden_size // num_heads - num_groups = num_heads // q_per_kv - - qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv" - - if re.match(qkv_pattern, name): - bid = re.findall(qkv_pattern, name)[0] - qkv = data_torch - # qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim) - qkv = qkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim)) - q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :] - # The model weights of q and k equire additional reshape. 
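# --- editor's sketch, not part of the patch --------------------------------
# The fused `wqkv` split above is easier to follow with concrete (made-up)
# sizes: 8 query heads, 2 KV heads, hidden_size 64, so q_per_kv = 4 query
# heads share each k/v head. All names and numbers below are illustrative.
import torch

num_heads, num_kv_heads, hidden_size = 8, 2, 64
head_dim   = hidden_size // num_heads            # 8
q_per_kv   = num_heads // num_kv_heads           # 4
num_groups = num_heads // q_per_kv               # 2

wqkv = torch.randn(num_groups * (q_per_kv + 2) * head_dim, hidden_size)   # (96, 64)
qkv  = wqkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim))           # (64, 2, 6, 8)
q = qkv[..., :q_per_kv, :]                       # (64, 2, 4, 8): 8 query heads
k = qkv[..., q_per_kv: q_per_kv + 1, :]          # (64, 2, 1, 8): 2 kv heads
v = qkv[..., q_per_kv + 1: q_per_kv + 2, :]      # (64, 2, 1, 8)
print(q.reshape((q.shape[0], -1)).T.shape)       # torch.Size([64, 64])
print(k.reshape((k.shape[0], -1)).T.shape)       # torch.Size([16, 64])
# ----------------------------------------------------------------------------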
- # q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads) - q = self._hf_permute_qk(q.reshape((q.shape[0], -1)).T, num_heads, num_heads) - # k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads) - k = self._hf_permute_qk(k.reshape((k.shape[0], -1)).T, num_heads, num_kv_heads) - # v = rearrange(v, " o g n i -> o (g n i)").T - v = v.reshape((v.shape[0], -1)).T - return [ - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v), - ] - else: - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("BertModel", "CamembertModel") -class BertModel(Model): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.vocab_size = None - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_causal_attention(False) - - # get pooling path - pooling_path = None - module_path = self.dir_model / "modules.json" - if module_path.is_file(): - with open(module_path, encoding="utf-8") as f: - modules = json.load(f) - for mod in modules: - if mod["type"] == "sentence_transformers.models.Pooling": - pooling_path = mod["path"] - break - - # get pooling type - if pooling_path is not None: - with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f: - pooling = json.load(f) - if pooling["pooling_mode_mean_tokens"]: - pooling_type = gguf.PoolingType.MEAN - elif pooling["pooling_mode_cls_token"]: - pooling_type = gguf.PoolingType.CLS - else: - raise NotImplementedError("Only MEAN and CLS pooling types supported") - self.gguf_writer.add_pooling_type(pooling_type) - - def set_vocab(self): - tokens, toktypes, tokpre = self.get_vocab_base() - self.vocab_size = len(tokens) - - # we need this to validate the size of the token_type embeddings - # though currently we are passing all zeros to the token_type embeddings - self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B" - - # convert to phantom space vocab - def phantom(tok): - if tok.startswith("[") and tok.endswith("]"): - return tok - if tok.startswith("##"): - return tok[2:] - return "\u2581" + tok - tokens = list(map(phantom, tokens)) - - # add vocab to gguf - self.gguf_writer.add_tokenizer_model("bert") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - # handle special tokens - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - # we are only using BERT for embeddings so we don't need the pooling layer - if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): - return [] # we don't need these - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("NomicBertModel") -class NomicBertModel(BertModel): - model_arch = gguf.MODEL_ARCH.NOMIC_BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # the HF config claims n_ctx=8192, but it uses RoPE scaling - self.hparams["n_ctx"] = 2048 - - # SwigLU activation - assert self.hparams["activation_function"] == "swiglu" - # this doesn't do anything in the HF version - assert self.hparams["causal"] is False - # no bias tensors - 
assert self.hparams["qkv_proj_bias"] is False - assert self.hparams["mlp_fc1_bias"] is False - assert self.hparams["mlp_fc2_bias"] is False - # norm at end of layer - assert self.hparams["prenorm"] is False - # standard RoPE - assert self.hparams["rotary_emb_fraction"] == 1.0 - assert self.hparams["rotary_emb_interleaved"] is False - assert self.hparams["rotary_emb_scale_base"] is None - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) - - -@Model.register("GemmaForCausalLM") -class GemmaModel(Model): - model_arch = gguf.MODEL_ARCH.GEMMA - - def set_vocab(self): - self._set_vocab_sentencepiece() - - # TODO: these special tokens should be exported only for the CodeGemma family - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, - special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot']) - special_vocab._set_special_token("prefix", 67) - special_vocab._set_special_token("suffix", 69) - special_vocab._set_special_token("middle", 68) - special_vocab._set_special_token("fsep", 70) - special_vocab._set_special_token("eot", 107) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - hparams = self.hparams - block_count = hparams["num_hidden_layers"] - - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_key_length(hparams["head_dim"]) - self.gguf_writer.add_value_length(hparams["head_dim"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - # lm_head is not used in llama.cpp, while autoawq will include this tensor in model - # To prevent errors, skip loading lm_head.weight. 
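# --- editor's sketch, not part of the patch --------------------------------
# Context for the norm-weight "+ 1" a few lines below: the referenced HF
# GemmaRMSNorm scales activations by (1 + weight), while llama.cpp applies
# the stored weight directly, so the converter folds the offset in once.
# Toy values only; `w_hf` is not taken from any real checkpoint.
import torch

def hf_gemma_scale(x, w_hf):        # behaviour of the referenced HF module
    return x * (1.0 + w_hf)

def gguf_scale(x, w_gguf):          # what the runtime applies to the stored weight
    return x * w_gguf

x, w_hf = torch.tensor([2.0, 3.0]), torch.tensor([0.5, -0.25])
assert torch.equal(hf_gemma_scale(x, w_hf), gguf_scale(x, w_hf + 1))
# ----------------------------------------------------------------------------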
- if name == "lm_head.weight": - logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") - return [] - - # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 - if name.endswith("norm.weight"): - data_torch = data_torch + 1 - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("Starcoder2ForCausalLM") -class StarCoder2Model(Model): - model_arch = gguf.MODEL_ARCH.STARCODER2 - - -@Model.register("MambaForCausalLM", "MambaLMHeadModel") -class MambaModel(Model): - model_arch = gguf.MODEL_ARCH.MAMBA - - def set_vocab(self): - vocab_size = self.hparams["vocab_size"] - # Round vocab size to next multiple of 8 - pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8) - # pad using ceiling division - # ref: https://stackoverflow.com/a/17511341/22827863 - vocab_size = -(vocab_size // -pad_vocab) * pad_vocab - self.hparams["vocab_size"] = vocab_size - - if (self.dir_model / "tokenizer.json").is_file(): - self._set_vocab_gpt2() - elif (self.dir_model / "tokenizer.model").is_file(): - self._set_vocab_sentencepiece() - else: - # Use the GPT-NeoX tokenizer when no tokenizer files are present - tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf" - logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") - neox_reader = gguf.GGUFReader(tokenizer_path, "r") - - field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL) - self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2") - - field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE) - self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt") - - field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST) - assert field - self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) - assert field - self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES) - assert field - self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID) - self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID) - self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID) - self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID) - self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0) - - def set_gguf_parameters(self): - d_model = self.find_hparam(["hidden_size", "d_model"]) - d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 - d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model - d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 - # ceiling division - # ref: https://stackoverflow.com/a/17511341/22827863 - # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 - dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) - rms_norm_eps = 
self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 - - # Fail early for models which don't have a block expansion factor of 2 - assert d_inner == 2 * d_model - - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default - self.gguf_writer.add_embedding_length(d_model) - self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading - self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading - self.gguf_writer.add_block_count(self.hparams["n_layer"]) - self.gguf_writer.add_ssm_conv_kernel(d_conv) - self.gguf_writer.add_ssm_inner_size(d_inner) - self.gguf_writer.add_ssm_state_size(d_state) - self.gguf_writer.add_ssm_time_step_rank(dt_rank) - self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_file_type(self.ftype) - - _tok_embd = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) - tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) - - new_name = self.map_tensor_name(name) - - if name.endswith(".A_log"): - logger.debug("A_log --> A ==> " + new_name) - data_torch = -torch.exp(data_torch) - - # assuming token_embd.weight is seen before output.weight - if self._tok_embd is not None and new_name == output_name: - if torch.equal(self._tok_embd, data_torch): - logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting") - return [] - elif new_name == tok_embd_name: - self._tok_embd = data_torch - - return [(new_name, data_torch)] - - def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: - del n_dims # unused - - return bid is not None and new_name in ( - self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [ - gguf.MODEL_TENSOR.SSM_CONV1D, - gguf.MODEL_TENSOR.SSM_X, - gguf.MODEL_TENSOR.SSM_DT, - gguf.MODEL_TENSOR.SSM_A, - gguf.MODEL_TENSOR.SSM_D, - ] - ) - - -@Model.register("CohereForCausalLM") -class CommandR2Model(Model): - model_arch = gguf.MODEL_ARCH.COMMAND_R - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # max_position_embeddings = 8192 in config.json but model was actually - # trained on 128k context length - # aya-23 models don't have model_max_length specified - self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"]) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - - -@Model.register("OlmoForCausalLM") -@Model.register("OLMoForCausalLM") -class OlmoModel(Model): - model_arch = gguf.MODEL_ARCH.OLMO - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_layer_norm_eps(1e-5) - clip_qkv = self.hparams.get("clip_qkv") - if clip_qkv is not None: - self.gguf_writer.add_clamp_kqv(clip_qkv) - - # Same as super class, but permuting q_proj, k_proj - # Copied from: LlamaModel - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - if name.endswith("q_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, 
n_head) - if name.endswith("k_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("JinaBertModel", "JinaBertForMaskedLM") -class JinaBertV2Model(BertModel): - model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.intermediate_size = self.hparams["intermediate_size"] - - def get_tensors(self): - for name, data in super().get_tensors(): - if 'gated_layers' in name: - d1 = data[:self.intermediate_size, :] - name1 = name.replace('gated_layers', 'gated_layers_w') - d2 = data[self.intermediate_size:, :] - name2 = name.replace('gated_layers', 'gated_layers_v') - yield name1, d1 - yield name2, d2 - continue - - yield name, data - - def set_vocab(self, *args, **kwargs): - tokenizer_class = 'BertTokenizer' - with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: - tokenizer_class = json.load(f)['tokenizer_class'] - - if tokenizer_class == 'BertTokenizer': - super().set_vocab() - elif tokenizer_class == 'RobertaTokenizer': - self._set_vocab_gpt2() - self.gguf_writer.add_token_type_count(2) - else: - raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel') - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(True) - - -@Model.register("ArcticForCausalLM") -class ArcticModel(Model): - model_arch = gguf.MODEL_ARCH.ARCTIC - - def set_vocab(self): - # The reason for using a custom implementation here is that the - # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from - # tokenizer.model and used them as BOS and EOS instead of adding new tokens. - from sentencepiece import SentencePieceProcessor - - tokenizer_path = self.dir_model / 'tokenizer.model' - - if not tokenizer_path.is_file(): - logger.error(f'Error: Missing {tokenizer_path}') - sys.exit(1) - - # Read the whole vocabulary from the tokenizer.model file - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - # Use the added_tokens_decoder field from tokeniser_config.json as the source - # of information about added/redefined tokens and modify them accordingly. 
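# --- editor's sketch, not part of the patch --------------------------------
# The classification rule used below, run on a made-up added_tokens_decoder
# block (ids and contents are hypothetical, not from the real Arctic files):
# "special" entries become CONTROL (or UNKNOWN if they match unk_token) with
# score 0.0; everything else stays USER_DEFINED with score -10000.0.
added_tokens_decoder = {
    "31998": {"content": "<|im_start|>", "special": True},
    "31999": {"content": "<extra_0>",    "special": False},
}
unk_token = "<unk>"
for tid, tok in added_tokens_decoder.items():
    if tok.get("special"):
        ttype = "UNKNOWN" if tok["content"] == unk_token else "CONTROL"
        score = 0.0
    else:
        ttype, score = "USER_DEFINED", -10000.0
    print(tid, tok["content"], ttype, score)
# ----------------------------------------------------------------------------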
- tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - - if "added_tokens_decoder" in tokenizer_config_json: - added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"] - for token_id, token_json in added_tokens_decoder.items(): - token_id = int(token_id) - if (token_id >= vocab_size): - logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - token_content = token_json["content"] - token_type = SentencePieceTokenTypes.USER_DEFINED - token_score = -10000.0 - - # Map unk_token to UNKNOWN, other special tokens to CONTROL - # Set the score to 0.0 as in the original tokenizer.model - if ("special" in token_json) and token_json["special"]: - if token_content == tokenizer_config_json["unk_token"]: - token_type = SentencePieceTokenTypes.UNKNOWN - else: - token_type = SentencePieceTokenTypes.CONTROL - token_score = 0.0 - - logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})") - tokens[token_id] = token_content.encode("utf-8") - toktypes[token_id] = token_type - scores[token_id] = token_score - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - if name.endswith("q_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith("k_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - - # process the experts separately - if name.find("block_sparse_moe.experts") != -1: - n_experts = self.hparams["num_local_experts"] - - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for wid in ["w1", "w2", "w3"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - def write_tensors(self): - super().write_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise 
ValueError(f"Unprocessed experts: {experts}") - - -@Model.register("DeepseekV2ForCausalLM") -class DeepseekV2Model(Model): - model_arch = gguf.MODEL_ARCH.DEEPSEEK2 - - def set_vocab(self): - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - - self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: - self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) - self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) - self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) - self.gguf_writer.add_value_length(hparams["v_head_dim"]) - self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) - self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) - self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) - self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) - self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) - - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "yarn": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) - self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"]) - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find("mlp.experts") != -1: - n_experts = self.hparams["n_routed_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - def write_tensors(self): - super().write_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -###### CONVERSION LOGIC ###### - - -# tree of lazy tensors -class LazyTorchTensor(gguf.LazyBase): - _tensor_type = torch.Tensor - # to keep the type-checker happy - dtype: torch.dtype - shape: torch.Size - - # only used when converting a torch.Tensor to a np.ndarray - _dtype_map: dict[torch.dtype, type] = { - torch.float16: np.float16, - torch.float32: np.float32, - } - - def numpy(self) -> gguf.LazyNumpyTensor: - dtype = self._dtype_map[self.dtype] - return 
gguf.LazyNumpyTensor( - meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), - lazy=self._lazy, - args=(self,), - func=(lambda s: s[0].numpy()) - ) - - @classmethod - def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor: - return torch.empty(size=shape, dtype=dtype, device="meta") - - @classmethod - def __torch_function__(cls, func, types, args=(), kwargs=None): - del types # unused - - if kwargs is None: - kwargs = {} - - if func is torch.Tensor.numpy: - return args[0].numpy() - - return LazyTorchTensor._wrap_fn(func)(*args, **kwargs) - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="Convert a huggingface model to a GGML compatible file") - parser.add_argument( - "--vocab-only", action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--awq-path", type=Path, default=None, - help="Path to scale awq cache file", - ) - parser.add_argument( - "--outfile", type=Path, - help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", - ) - parser.add_argument( - "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16", - help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", - ) - parser.add_argument( - "--bigendian", action="store_true", - help="model is executed on big endian machine", - ) - parser.add_argument( - "model", type=Path, - help="directory containing model file", - ) - parser.add_argument( - "--use-temp-file", action="store_true", - help="use the tempfile library while processing (helpful when running out of memory, process killed)", - ) - parser.add_argument( - "--no-lazy", action="store_true", - help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)", - ) - parser.add_argument( - "--model-name", type=str, default=None, - help="name of the model", - ) - parser.add_argument( - "--verbose", action="store_true", - help="increase output verbosity", - ) - parser.add_argument( - "--split", action="store_true", - help="split the converted model into multiple files" - ) - parser.add_argument( - "--split-max-tensors", type=int, - help="max tensors in each split" - ) - parser.add_argument( - "--split-max-size", type=str, - help="max size per split N(M|G)" - ) - parser.add_argument( - "--dry-run", action="store_true", - help="only print out a split plan and exit, without writing any new files" - ) - parser.add_argument( - "--small-first-shard", action="store_true", - help="do not add tensors to the first shard (disabled by default)" - ) - - return parser.parse_args() - - -def main() -> None: - args = parse_args() - - logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) - - dir_model = args.model - - if args.awq_path: - sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) - from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] - tmp_model_path = args.model / "weighted_model" - dir_model = tmp_model_path - if tmp_model_path.is_dir(): - logger.info(f"{tmp_model_path} exists as a weighted model.") - else: - tmp_model_path.mkdir(parents=True, exist_ok=True) - logger.info("Saving new weighted model ...") - add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) - logger.info(f"Saved weighted model at {tmp_model_path}.") - - if not dir_model.is_dir(): - logger.error(f'Error: 
{args.model} is not a directory') - sys.exit(1) - - if args.split and not (args.split_max_tensors or args.split_max_size): - raise ValueError("Need to specify one of --split-max-tensors or --split-max-size when splitting") - - if args.split_max_tensors and args.split_max_size: - raise ValueError("Can't specify both --split-max-tensors and --split-max-size") - - split_arguments = gguf.SplitArguments(args) - - ftype_map: dict[str, gguf.LlamaFileType] = { - "f32": gguf.LlamaFileType.ALL_F32, - "f16": gguf.LlamaFileType.MOSTLY_F16, - "bf16": gguf.LlamaFileType.MOSTLY_BF16, - "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, - "auto": gguf.LlamaFileType.GUESSED, - } - - if args.outfile is not None: - fname_out = args.outfile - else: - # output in the same directory as the model by default - fname_out = dir_model / 'ggml-model-{ftype}.gguf' - - logger.info(f"Loading model: {dir_model.name}") - - hparams = Model.load_hparams(dir_model) - - with torch.inference_mode(): - try: - model_class = Model.from_model_architecture(hparams["architectures"][0]) - except NotImplementedError: - logger.error(f"Model {hparams['architectures'][0]} is not supported") - sys.exit(1) - - model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, - args.no_lazy, split_arguments) - - logger.info("Set model parameters") - model_instance.set_gguf_parameters() - - logger.info("Set model tokenizer") - model_instance.set_vocab() - - model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) - - if args.vocab_only: - logger.info(f"Exporting model vocab to '{model_instance.fname_out}'") - model_instance.write_vocab() - else: - logger.info(f"Exporting model to '{model_instance.fname_out}'") - model_instance.write() - - logger.info(f"Model successfully exported to '{model_instance.fname_out}'") - - -if __name__ == '__main__': - main() +#!/usr/bin/env python3 + +from __future__ import annotations + +import logging +import argparse +import contextlib +import json +import os +import re +import sys +from enum import IntEnum +from pathlib import Path +from hashlib import sha256 +from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast + +import math +import numpy as np +import torch + +if TYPE_CHECKING: + from torch import Tensor + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) +import importlib +gguf = importlib.import_module("gguf-py.gguf") +# import gguf + +logger = logging.getLogger("hf-to-gguf") + + +###### MODEL DEFINITIONS ###### + +class SentencePieceTokenTypes(IntEnum): + NORMAL = 1 + UNKNOWN = 2 + CONTROL = 3 + USER_DEFINED = 4 + UNUSED = 5 + BYTE = 6 + + +AnyModel = TypeVar("AnyModel", bound="type[Model]") + + +class Model: + _model_classes: dict[str, type[Model]] = {} + + dir_model: Path + ftype: int + is_big_endian: bool + endianess: gguf.GGUFEndian + use_temp_file: bool + lazy: bool + part_names: list[str] + is_safetensors: bool + hparams: dict[str, Any] + block_count: int + tensor_map: gguf.TensorNameMap + tensor_names: set[str] | None + fname_out: Path + gguf_writer: gguf.GGUFWriter + + # subclasses should define this! 
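# --- editor's sketch, not part of the patch --------------------------------
# Minimal, self-contained toy of the registry pattern used here: subclasses
# register their HF architecture name(s) and must declare `model_arch`, or
# class creation fails immediately via __init_subclass__. The class and
# architecture names are placeholders, not real converters.
class ToyModel:
    _classes: dict = {}

    def __init_subclass__(cls):
        if "model_arch" not in cls.__dict__:
            raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}")

    @classmethod
    def register(cls, *names):
        def func(modelcls):
            for name in names:
                cls._classes[name] = modelcls
            return modelcls
        return func

@ToyModel.register("FooForCausalLM")
class FooModel(ToyModel):
    model_arch = "foo"

print(ToyModel._classes["FooForCausalLM"] is FooModel)   # True
# ----------------------------------------------------------------------------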
+ model_arch: gguf.MODEL_ARCH + + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, + split_arguments: gguf.SplitArguments): + if type(self) is Model: + raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") + self.dir_model = dir_model + self.ftype = ftype + self.is_big_endian = is_big_endian + self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE + self.use_temp_file = use_temp_file + self.lazy = not eager + self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors") + self.is_safetensors = len(self.part_names) > 0 + if not self.is_safetensors: + self.part_names = Model.get_model_part_names(self.dir_model, ".bin") + self.hparams = Model.load_hparams(self.dir_model) + self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + self.tensor_names = None + if self.ftype == gguf.LlamaFileType.GUESSED: + # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. + _, first_tensor = next(self.get_tensors()) + if first_tensor.dtype == torch.float16: + logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})") + self.ftype = gguf.LlamaFileType.MOSTLY_F16 + else: + logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})") + self.ftype = gguf.LlamaFileType.MOSTLY_BF16 + ftype_up: str = self.ftype.name.partition("_")[2].upper() + ftype_lw: str = ftype_up.lower() + # allow templating the file name with the output ftype, useful with the "auto" ftype + self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) + self.gguf_writer = gguf.GGUFWriterSplit(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], split_arguments, + endianess=self.endianess, use_temp_file=self.use_temp_file) + + @classmethod + def __init_subclass__(cls): + # can't use an abstract property, because overriding it without type errors + # would require using decorated functions instead of simply defining the property + if "model_arch" not in cls.__dict__: + raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") + + def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: + key = next((k for k in keys if k in self.hparams), None) + if key is not None: + return self.hparams[key] + if optional: + return None + raise KeyError(f"could not find any of: {keys}") + + def set_vocab(self): + self._set_vocab_gpt2() + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + tensor_names_from_parts: set[str] = set() + + if len(self.part_names) > 1: + self.tensor_names = set() + index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin" + index_name += ".index.json" + logger.info(f"gguf: loading model weight map from '{index_name}'") + with open(self.dir_model / index_name, "r", encoding="utf-8") as f: + index: dict[str, Any] = json.load(f) + weight_map = index.get("weight_map") + if weight_map is None or not isinstance(weight_map, dict): + raise ValueError(f"Can't load 'weight_map' from {index_name!r}") + self.tensor_names.update(weight_map.keys()) + else: + self.tensor_names = tensor_names_from_parts + + for part_name in self.part_names: + logger.info(f"gguf: loading model part '{part_name}'") + ctx: ContextManager[Any] + if self.is_safetensors: + from safetensors import safe_open + 
ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) + else: + ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True)) + + with ctx as model_part: + tensor_names_from_parts.update(model_part.keys()) + + for name in model_part.keys(): + data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] + if self.lazy: + data = LazyTorchTensor.from_eager(data) + yield name, data + + # only verify tensor name presence; it doesn't matter if they are not in the right files + if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: + raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}") + + def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str: + if key not in gguf.MODEL_TENSORS[self.model_arch]: + raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}") + name: str = gguf.TENSOR_NAMES[key] + if "{bid}" in name: + assert bid is not None + name = name.format(bid=bid) + return name + suffix + + def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool: + if key not in gguf.MODEL_TENSORS[self.model_arch]: + return False + key_name: str = gguf.TENSOR_NAMES[key] + if "{bid}" in key_name: + if bid is None: + return False + key_name = key_name.format(bid=bid) + else: + if bid is not None: + return False + return name == (key_name + suffix) + + def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: + new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes) + if new_name is None: + raise ValueError(f"Can not map tensor {name!r}") + return new_name + + def set_gguf_parameters(self): + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_block_count(self.block_count) + + if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: + self.gguf_writer.add_context_length(n_ctx) + logger.info(f"gguf: context length = {n_ctx}") + + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + self.gguf_writer.add_embedding_length(n_embd) + logger.info(f"gguf: embedding length = {n_embd}") + + if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: + self.gguf_writer.add_feed_forward_length(n_ff) + logger.info(f"gguf: feed forward length = {n_ff}") + + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + self.gguf_writer.add_head_count(n_head) + logger.info(f"gguf: head count = {n_head}") + + if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: + self.gguf_writer.add_head_count_kv(n_head_kv) + logger.info(f"gguf: key-value head count = {n_head_kv}") + + if (rope_theta := self.hparams.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) + logger.info(f"gguf: rope theta = {rope_theta}") + if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: + self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) + logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") + if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: + self.gguf_writer.add_layer_norm_eps(f_norm_eps) + logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") + if (n_experts := self.hparams.get("num_local_experts")) is not None: + 
self.gguf_writer.add_expert_count(n_experts) + logger.info(f"gguf: expert count = {n_experts}") + if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + logger.info(f"gguf: experts used count = {n_experts_used}") + + self.gguf_writer.add_file_type(self.ftype) + logger.info(f"gguf: file type = {self.ftype}") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + return [(self.map_tensor_name(name), data_torch)] + + def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + del name, new_name, bid, n_dims # unused + + return False + + def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + del name, new_name, bid, n_dims # unused + + return False + + def write_tensors(self): + max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") + + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + # use the first number-like part of the tensor name as the block id + bid = None + for part in name.split("."): + if part.isdecimal(): + bid = int(part) + break + + for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)): + data: np.ndarray = data # type hint + n_dims = len(data.shape) + data_dtype = data.dtype + data_qtype: gguf.GGMLQuantizationType | None = None + + # when both are True, f32 should win + extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims) + extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims) + + # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors + # Conditions should closely match those in llama_model_quantize_internal in llama.cpp + extra_f32 = any(cond for cond in ( + extra_f32, + n_dims == 1, + new_name.endswith("_norm.weight"), + )) + + # Some tensor types are always in float32 + extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in ( + gguf.MODEL_TENSOR.FFN_GATE_INP, + gguf.MODEL_TENSOR.POS_EMBD, + gguf.MODEL_TENSOR.TOKEN_TYPES, + )) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + extra_f16 = any(cond for cond in ( + extra_f16, + (name.endswith(".weight") and n_dims >= 2), + )) + + if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32: + if self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + data = gguf.quantize_bf16(data) + assert data.dtype == np.int16 + data_qtype = gguf.GGMLQuantizationType.BF16 + + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data): + data = gguf.quantize_q8_0(data) + assert data.dtype == np.uint8 + data_qtype = gguf.GGMLQuantizationType.Q8_0 + + else: # default to float16 for quantized tensors + if data_dtype != np.float16: + data = data.astype(np.float16) + data_qtype = gguf.GGMLQuantizationType.F16 + + if data_qtype is None: # by default, convert to float32 + if data_dtype != np.float32: + data = data.astype(np.float32) + data_qtype = gguf.GGMLQuantizationType.F32 + + shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape + + # 
reverse shape to make it similar to the internal ggml dimension order + shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" + + # n_dims is implicit in the shape + logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") + + self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) + + def write(self): + self.write_tensors() + self.gguf_writer.init_shards() + self.gguf_writer.write_header_to_file() + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_tensors_to_file(progress=True) + self.gguf_writer.close() + + def write_vocab(self): + if self.gguf_writer.split_arguments.split: + raise ValueError('Splitting the vocabulary is not supported') + self.gguf_writer.init_shards() + self.gguf_writer.write_header_to_file() + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.close() + + @staticmethod + def get_model_part_names(dir_model: Path, suffix: str) -> list[str]: + part_names: list[str] = [] + for filename in os.listdir(dir_model): + if filename.endswith(suffix): + part_names.append(filename) + + part_names.sort() + + return part_names + + @staticmethod + def load_hparams(dir_model: Path): + with open(dir_model / "config.json", "r", encoding="utf-8") as f: + return json.load(f) + + @classmethod + def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: + assert names + + def func(modelcls: AnyModel) -> AnyModel: + for name in names: + cls._model_classes[name] = modelcls + return modelcls + return func + + @classmethod + def from_model_architecture(cls, arch: str) -> type[Model]: + try: + return cls._model_classes[arch] + except KeyError: + raise NotImplementedError(f'Architecture {arch!r} not supported!') from None + + # used for GPT-2 BPE and WordPiece vocabs + def get_vocab_base(self) -> tuple[list[str], list[int], str]: + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) + assert max(tokenizer.vocab.values()) < vocab_size + + tokpre = self.get_vocab_base_pre(tokenizer) + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} + added_vocab = tokenizer.get_added_vocab() + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + return tokens, toktypes, tokpre + + # NOTE: this function is generated by convert-hf-to-gguf-update.py + # do not modify it manually! 
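# --- editor's sketch, not part of the patch --------------------------------
# How the `chkhsh` fingerprints checked in get_vocab_base_pre() below are
# produced: a fixed probe string is tokenized and the resulting token-id
# list is hashed, so two checkpoints share a hash only when their BPE
# pre-tokenizers behave identically. "gpt2" and the short probe string here
# are stand-ins; the real probe text is the `chktxt` defined below.
from hashlib import sha256
from transformers import AutoTokenizer

probe = "Example probe 🚀 3.3 3..3 3...3"
tokenizer = AutoTokenizer.from_pretrained("gpt2")
chktok = tokenizer.encode(probe)
chkhsh = sha256(str(chktok).encode()).hexdigest()
print(chkhsh)   # compare against the hard-coded hashes in get_vocab_base_pre()
# ----------------------------------------------------------------------------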
+ # ref: https://github.com/ggerganov/llama.cpp/pull/6920 + # Marker: Start get_vocab_base_pre + def get_vocab_base_pre(self, tokenizer) -> str: + # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that + # is specific for the BPE pre-tokenizer used by the model + # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can + # use in llama.cpp to implement the same pre-tokenizer + + chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' + + chktok = tokenizer.encode(chktxt) + chkhsh = sha256(str(chktok).encode()).hexdigest() + + logger.debug(f"chktok: {chktok}") + logger.debug(f"chkhsh: {chkhsh}") + + res = None + + # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script + # or pull the latest version of the model from Huggingface + # don't edit the hashes manually! + if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": + # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B + res = "llama-bpe" + if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": + # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base + res = "deepseek-llm" + if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821": + # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base + res = "deepseek-coder" + if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": + # ref: https://huggingface.co/tiiuae/falcon-7b + res = "falcon" + if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": + # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 + res = "bert-bge" + if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": + # ref: https://huggingface.co/mosaicml/mpt-7b + res = "mpt" + if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34": + # ref: https://huggingface.co/bigcode/starcoder2-3b + res = "starcoder" + if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": + # ref: https://huggingface.co/openai-community/gpt2 + res = "gpt-2" + if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3": + # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b + res = "stablelm2" + if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": + # ref: https://huggingface.co/smallcloudai/Refact-1_6-base + res = "refact" + if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8": + # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01 + res = "command-r" + if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea": + # ref: https://huggingface.co/Qwen/Qwen1.5-7B + res = "qwen2" + if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": + # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf + res = "olmo" + if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e": + # ref: https://huggingface.co/databricks/dbrx-base + res = "dbrx" + if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": + # ref: 
https://huggingface.co/jinaai/jina-embeddings-v2-base-en + res = "jina-v2-en" + if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643": + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es + res = "jina-v2-es" + if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6": + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de + res = "jina-v2-de" + if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": + # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct + res = "smaug-bpe" + + if res is None: + logger.warning("\n") + logger.warning("**************************************************************************************") + logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") + logger.warning("** There are 2 possible reasons for this:") + logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet") + logger.warning("** - the pre-tokenization config has changed upstream") + logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") + logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") + logger.warning("**") + logger.warning(f"** chkhsh: {chkhsh}") + logger.warning("**************************************************************************************") + logger.warning("\n") + raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") + + logger.debug(f"tokenizer.ggml.pre: {repr(res)}") + logger.debug(f"chkhsh: {chkhsh}") + + return res + # Marker: End get_vocab_base_pre + + def _set_vocab_gpt2(self) -> None: + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_qwen(self): + dir_model = self.dir_model + hparams = self.hparams + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + vocab_size = hparams["vocab_size"] + assert max(tokenizer.get_vocab().values()) < vocab_size + + tokpre = self.get_vocab_base_pre(tokenizer) + + merges = [] + vocab = {} + mergeable_ranks = tokenizer.mergeable_ranks + for token, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + assert len(merged) == 2 + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + + # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined + added_vocab = tokenizer.special_tokens + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()} + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.CONTROL) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + 
self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) + special_vocab.merges = merges + # only add special tokens when they were not already loaded from config.json + if len(special_vocab.special_token_ids) == 0: + special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) + special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) + # this one is usually not in config.json anyway + special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_sentencepiece(self): + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + tokens: list[bytes] = [] + scores: list[float] = [] + toktypes: list[int] = [] + + if not tokenizer_path.is_file(): + raise FileNotFoundError(f"File not found: {tokenizer_path}") + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + for key in added_tokens_json: + token_id = added_tokens_json[key] + if (token_id >= vocab_size): + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.UNUSED) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_llama_hf(self): + vocab = gguf.LlamaHfVocab(self.dir_model) + tokens = [] + scores = [] + toktypes = [] + + for text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + assert len(tokens) == vocab.vocab_size + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + 
self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + +@Model.register("GPTNeoXForCausalLM") +class GPTNeoXModel(Model): + model_arch = gguf.MODEL_ARCH.GPTNEOX + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count( + int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), + ) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + + tensors: list[tuple[str, Tensor]] = [] + + if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name): + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa + qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) + data_torch = torch.cat( + ( + qkv_weights[:, 0, :, :].reshape((-1, n_embed)), + qkv_weights[:, 1, :, :].reshape((-1, n_embed)), + qkv_weights[:, 2, :, :].reshape((-1, n_embed)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.weight") + elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name): + qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) + data_torch = torch.cat( + ( + qkv_bias[:, 0, :].reshape((n_embed,)), + qkv_bias[:, 1, :].reshape((n_embed,)), + qkv_bias[:, 2, :].reshape((n_embed,)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.bias") + + tensors.append((self.map_tensor_name(name), data_torch)) + + return tensors + + +@Model.register("BloomForCausalLM") +class BloomModel(Model): + model_arch = gguf.MODEL_ARCH.BLOOM + + def set_gguf_parameters(self): + self.gguf_writer.add_name("Bloom") + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) + self.gguf_writer.add_embedding_length(n_embed) + self.gguf_writer.add_feed_forward_length(4 * n_embed) + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + n_head = 
self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + + name = re.sub(r'transformer\.', '', name) + + tensors: list[tuple[str, Tensor]] = [] + + if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa + qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) + data_torch = torch.cat( + ( + qkv_weights[:, 0, :, :].reshape((-1, n_embed)), + qkv_weights[:, 1, :, :].reshape((-1, n_embed)), + qkv_weights[:, 2, :, :].reshape((-1, n_embed)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.weight") + elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): + qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) + data_torch = torch.cat( + ( + qkv_bias[:, 0, :].reshape((n_embed,)), + qkv_bias[:, 1, :].reshape((n_embed,)), + qkv_bias[:, 2, :].reshape((n_embed,)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.bias") + + tensors.append((self.map_tensor_name(name), data_torch)) + + if name == "word_embeddings.weight": + assert self.tensor_names is not None + + # TODO: tie them at runtime, don't duplicate in the model file + if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")): + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) + + return tensors + + +@Model.register("MPTForCausalLM") +class MPTModel(Model): + model_arch = gguf.MODEL_ARCH.MPT + + def set_vocab(self): + try: + self._set_vocab_gpt2() + except Exception: + # Fallback for SEA-LION model + self._set_vocab_sentencepiece() + self.gguf_writer.add_add_bos_token(False) + self.gguf_writer.add_pad_token_id(3) + self.gguf_writer.add_eos_token_id(1) + self.gguf_writer.add_unk_token_id(0) + + def set_gguf_parameters(self): + block_count = self.hparams["n_layers"] + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"]) + self.gguf_writer.add_head_count(self.hparams["n_heads"]) + if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"): + self.gguf_writer.add_head_count_kv(kv_n_heads) + self.gguf_writer.add_layer_norm_eps(1e-5) + if self.hparams["attn_config"]["clip_qkv"] is not None: + self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) + if self.hparams["attn_config"]["alibi"]: + self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) + else: + self.gguf_writer.add_max_alibi_bias(0.0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if "scales" in name: + new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias", ".scales")) + new_name = new_name.replace("scales", "act.scales") + else: + new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias")) + + return [(new_name, data_torch)] + + +@Model.register("OrionForCausalLM") +class OrionModel(Model): + model_arch = gguf.MODEL_ARCH.ORION + + def set_vocab(self): + 
self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + hf_repo = self.hparams.get("_name_or_path", "") + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = self.hparams["model_max_length"] + else: + raise ValueError("gguf: can not find ctx length parameter.") + + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_source_hf_repo(hf_repo) + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + # note: config provides rms norm but it is actually layer norm + # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 + self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) + + +@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM") +class BaichuanModel(Model): + model_arch = gguf.MODEL_ARCH.BAICHUAN + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + hf_repo = self.hparams.get("_name_or_path", "") + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = self.hparams["model_max_length"] + else: + raise ValueError("gguf: can not find ctx length parameter.") + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_source_hf_repo(hf_repo) + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_file_type(self.ftype) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + head_count = self.hparams["num_attention_heads"] + head_count_kv = 
self.hparams.get("num_key_value_heads", head_count) + + tensors: list[tuple[str, Tensor]] = [] + + if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight": + logger.info(f"Unpacking and permuting layer {bid}") + tensors = [ + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), + self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), + self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), + self._reverse_hf_part(data_torch, 2)), + ] + else: + tensors = [(self.map_tensor_name(name), data_torch)] + + return tensors + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def _reverse_hf_permute_part( + self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, + ) -> Tensor: + r = weights.shape[0] // 3 + return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) + + def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: + r = weights.shape[0] // 3 + return weights[r * n_part:r * n_part + r, ...] + + +@Model.register("XverseForCausalLM") +class XverseModel(Model): + model_arch = gguf.MODEL_ARCH.XVERSE + + def set_vocab(self): + assert (self.dir_model / "tokenizer.json").is_file() + dir_model = self.dir_model + hparams = self.hparams + + tokens: list[bytes] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model) + vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) + assert max(tokenizer.vocab.values()) < vocab_size + + reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} + added_vocab = tokenizer.get_added_vocab() + + for token_id in range(vocab_size): + token_text = reverse_vocab[token_id].encode('utf-8') + # replace "\x00" to string with length > 0 + if token_text == b"\x00": + toktype = gguf.TokenType.BYTE # special + token_text = f"<{token_text}>".encode('utf-8') + elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): + toktype = gguf.TokenType.BYTE # special + elif reverse_vocab[token_id] in added_vocab: + if tokenizer.added_tokens_decoder[token_id].special: + toktype = gguf.TokenType.CONTROL + else: + toktype = gguf.TokenType.USER_DEFINED + else: + toktype = gguf.TokenType.NORMAL + + tokens.append(token_text) + toktypes.append(toktype) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + hf_repo = self.hparams.get("_name_or_path", "") + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = 
self.hparams["model_max_length"] + else: + raise ValueError("gguf: can not find ctx length parameter.") + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_source_hf_repo(hf_repo) + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_file_type(self.ftype) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + + # HF models permute some of the tensors, so we need to undo that + if name.endswith("q_proj.weight"): + data_torch = self._reverse_hf_permute(data_torch, head_count, head_count) + if name.endswith("k_proj.weight"): + data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv) + + return [(self.map_tensor_name(name), data_torch)] + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + +@Model.register("FalconForCausalLM", "RWForCausalLM") +class FalconModel(Model): + model_arch = gguf.MODEL_ARCH.FALCON + + def set_gguf_parameters(self): + block_count = self.hparams.get("num_hidden_layers") + if block_count is None: + block_count = self.hparams["n_layer"] # old name + + n_head = self.hparams.get("num_attention_heads") + if n_head is None: + n_head = self.hparams["n_head"] # old name + + n_head_kv = self.hparams.get("num_kv_heads") + if n_head_kv is None: + n_head_kv = self.hparams.get("n_head_kv", 1) # old name + + self.gguf_writer.add_name("Falcon") + self.gguf_writer.add_context_length(2048) # not in config.json + self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head_kv) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # QKV tensor transform + # The original query_key_value tensor contains n_head_kv "kv groups", + # each consisting of n_head/n_head_kv query weights followed by one key + # and one value weight (shared by all query heads 
in the kv group).
+        # This layout makes it a big pain to work with in GGML.
+        # So we rearrange them here, so that we have n_head query weights
+        # followed by n_head_kv key weights followed by n_head_kv value weights,
+        # in contiguous fashion.
+        # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
+
+        if "query_key_value" in name:
+            n_head = self.find_hparam(["num_attention_heads", "n_head"])
+            n_head_kv = self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1
+            head_dim = self.hparams["hidden_size"] // n_head
+
+            qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
+            q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
+            k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
+            v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
+            data_torch = torch.cat((q, k, v)).reshape_as(data_torch)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("GPTBigCodeForCausalLM")
+class StarCoderModel(Model):
+    model_arch = gguf.MODEL_ARCH.STARCODER
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["n_layer"]
+
+        self.gguf_writer.add_name("StarCoder")
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
+        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(self.hparams["n_head"])
+        self.gguf_writer.add_head_count_kv(1)
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+
+@Model.register("GPTRefactForCausalLM")
+class RefactModel(Model):
+    model_arch = gguf.MODEL_ARCH.REFACT
+
+    def set_vocab(self):
+        super().set_vocab()
+
+        # TODO: how to determine special FIM tokens automatically?
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
+                                          special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
+        special_vocab._set_special_token("prefix", 1)
+        special_vocab._set_special_token("suffix", 3)
+        special_vocab._set_special_token("middle", 2)
+        special_vocab._set_special_token("fsep", 4) # is this correct?
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        hidden_dim = self.hparams["n_embd"]
+        inner_dim = 4 * hidden_dim
+        hidden_dim = int(2 * inner_dim / 3)
+        multiple_of = 256
+        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+
+        block_count = self.hparams["n_layer"]
+
+        self.gguf_writer.add_name("Refact")
+        # refact uses Alibi. So this is from config.json which might be used by training.
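[Editor's illustration, not part of the patch: the Refact feed-forward width computed just above follows the LLaMA-style rule of taking 2/3 of 4 * n_embd and rounding up to a multiple of 256. A minimal standalone sketch of that arithmetic, assuming a hypothetical n_embd of 2048:]

    n_embd = 2048                                   # hypothetical embedding width, for illustration only
    inner_dim = 4 * n_embd                          # 8192
    hidden_dim = int(2 * inner_dim / 3)             # 5461, LLaMA-style 2/3 scaling
    multiple_of = 256
    ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
    assert ff_dim == 5632                           # 5461 rounded up to the next multiple of 256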
+ self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + + self.gguf_writer.add_feed_forward_length(ff_dim) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(1) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + hidden_dim = self.hparams["n_embd"] + inner_dim = 4 * hidden_dim + hidden_dim = int(2 * inner_dim / 3) + multiple_of = 256 + ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + n_head = self.hparams["n_head"] + n_head_kv = 1 + head_dim = self.hparams["n_embd"] // n_head + + tensors: list[tuple[str, Tensor]] = [] + + if bid is not None: + if name == f"transformer.h.{bid}.attn.kv.weight": + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), data_torch[:n_head_kv * head_dim])) + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), data_torch[n_head_kv * head_dim:])) + elif name == f"transformer.h.{bid}.attn.q.weight": + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch)) + elif name == f"transformer.h.{bid}.mlp.gate_up_proj.weight": + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])) + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])) + + if len(tensors) == 0: + tensors.append((self.map_tensor_name(name), data_torch)) + + return tensors + + +@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") +class StableLMModel(Model): + model_arch = gguf.MODEL_ARCH.STABLELM + + def set_vocab(self): + if (self.dir_model / "tokenizer.json").is_file(): + self._set_vocab_gpt2() + else: + # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab + self._set_vocab_qwen() + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) + self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) + self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) + self.gguf_writer.add_file_type(self.ftype) + + _q_norms: list[dict[str, Tensor]] | None = None + _k_norms: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams["num_key_value_heads"] + + if name.find("q_layernorm.norms") != -1: + assert bid is not None + + if self._q_norms is None: + self._q_norms = [{} for _ in range(self.block_count)] + + 
self._q_norms[bid][name] = data_torch + + if len(self._q_norms[bid]) >= n_head: + return self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm") + else: + return [] + + if name.find("k_layernorm.norms") != -1: + assert bid is not None + + if self._k_norms is None: + self._k_norms = [{} for _ in range(self.block_count)] + + self._k_norms[bid][name] = data_torch + + if len(self._k_norms[bid]) >= n_kv_head: + return self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm") + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm"): + datas: list[Tensor] = [] + # extract the norms in order + for xid in range(n_head): + ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight" + datas.append(norms[ename]) + del norms[ename] + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" + new_name = self.map_tensor_name(merged_name) + + return [(new_name, data_torch)] + + def write_tensors(self): + super().write_tensors() + + if self._q_norms is not None or self._k_norms is not None: + # flatten two `list[dict[str, Tensor]]` into a single `list[str]` + norms = ( + [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None else [] + ) + ( + [k for d in self._k_norms for k in d.keys()] if self._k_norms is not None else [] + ) + if len(norms) > 0: + raise ValueError(f"Unprocessed norms: {norms}") + + +@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") +class LlamaModel(Model): + model_arch = gguf.MODEL_ARCH.LLAMA + + def set_vocab(self): + try: + self. _set_vocab_sentencepiece() + except FileNotFoundError: + try: + self._set_vocab_llama_hf() + except (FileNotFoundError, TypeError): + # Llama 3 + self._set_vocab_gpt2() + + # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) + if self.hparams.get("vocab_size", 32000) == 32016: + special_vocab = gguf.SpecialVocab( + self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'eot'] + ) + special_vocab._set_special_token("prefix", 32007) + special_vocab._set_special_token("suffix", 32008) + special_vocab._set_special_token("middle", 32009) + special_vocab._set_special_token("eot", 32010) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if "add_prefix_space" in tokenizer_config_json: + self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) + + # Apply to granite small models only + if self.hparams.get("vocab_size", 32000) == 49152: + self.gguf_writer.add_add_bos_token(False) + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if 
n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.hparams["num_local_experts"] + + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for wid in ["w1", "w2", "w3"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def write_tensors(self): + super().write_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@Model.register("GrokForCausalLM") +class GrokModel(Model): + model_arch = gguf.MODEL_ARCH.GROK + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_name("Grok") + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find(".moe.") != -1: + n_experts = self.hparams["num_local_experts"] + + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for wid in ["linear", "linear_1", "linear_v"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("DbrxForCausalLM") +class DbrxModel(Model): + model_arch = gguf.MODEL_ARCH.DBRX + + def set_gguf_parameters(self): + ffn_config = 
self.hparams["ffn_config"] + attn_config = self.hparams["attn_config"] + self.gguf_writer.add_name(self.hparams["model_type"]) + self.gguf_writer.add_block_count(self.hparams["n_layers"]) + + self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"]) + + self.gguf_writer.add_head_count(self.hparams["n_heads"]) + self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"]) + + self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"]) + + self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"]) + self.gguf_writer.add_file_type(self.ftype) + + self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"]) + self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"]) + + self.gguf_writer.add_layer_norm_eps(1e-5) + + self.gguf_writer.add_file_type(self.ftype) + logger.info(f"gguf: file type = {self.ftype}") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + n_expert = self.hparams["ffn_config"]["moe_num_experts"] + n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] + n_embd = self.hparams["d_model"] + + # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose + # original implementation expects (n_expert, n_ff, n_embd) for all experts weights + # But llama.cpp moe graph works differently + # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions + # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor + exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} + "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + experts = False + + for exp_tensor_name in exp_tensor_names.keys(): + if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: + experts = True + data_torch = data_torch.view(n_expert, n_ff, n_embd) + if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None: + data_torch = data_torch.permute(*permute_tensor) + break + + # map tensor names + # In MoE models the ffn tensors are typically most of the model weights, + # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight. 
+ # Every other model has the weight names ending in .weight, + # let's assume that is the convention which is not the case for dbrx: + # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 + new_name = self.map_tensor_name(name if not experts else name + ".weight", try_suffixes=(".weight",)) + + return [(new_name, data_torch)] + + def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + del name, new_name, bid # unused + + return n_dims > 1 + + +@Model.register("MiniCPMForCausalLM") +class MiniCPMModel(Model): + model_arch = gguf.MODEL_ARCH.MINICPM + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + self.gguf_writer.add_name("MiniCPM") + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_file_type(self.ftype) + + def set_vocab(self): + self._set_vocab_llama_hf() + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + # HF models permute some of the tensors, so we need to undo that + if name.endswith(("q_proj.weight")): + data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight")): + data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head) + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("QWenLMHeadModel") +class QwenModel(Model): + model_arch = gguf.MODEL_ARCH.QWEN + + @staticmethod + def token_bytes_to_string(b): + from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode + byte_encoder = bytes_to_unicode() + return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) + + @staticmethod + def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: + parts = [bytes([b]) for b in token] + while True: + min_idx = None + min_rank = None + for i, pair in enumerate(zip(parts[:-1], parts[1:])): + rank = mergeable_ranks.get(pair[0] + pair[1]) + if rank is not None and (min_rank is None or rank < min_rank): + min_idx = i + min_rank = rank + if min_rank is None or (max_rank is not None and min_rank >= max_rank): + break + assert min_idx is not None + parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] + return parts + + def set_vocab(self): + self._set_vocab_qwen() + + def set_gguf_parameters(self): + self.gguf_writer.add_name("Qwen") + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + 
self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + +@Model.register("Qwen2ForCausalLM") +class Qwen2Model(Model): + model_arch = gguf.MODEL_ARCH.QWEN2 + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + +@Model.register("Qwen2MoeForCausalLM") +class Qwen2MoeModel(Model): + model_arch = gguf.MODEL_ARCH.QWEN2MOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (n_experts := self.hparams.get("num_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("experts") != -1: + n_experts = self.hparams["num_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def write_tensors(self): + super().write_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@Model.register("GPT2LMHeadModel") +class GPT2Model(Model): + model_arch = gguf.MODEL_ARCH.GPT2 + + def set_gguf_parameters(self): + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_context_length(self.hparams["n_ctx"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + tensors: list[tuple[str, Tensor]] = [] + + # we don't need these + if name.endswith((".attn.bias", ".attn.masked_bias")): + return tensors + + if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): + data_torch = data_torch.transpose(1, 0) + + new_name = 
self.map_tensor_name(name) + + tensors.append((new_name, data_torch)) + + # note: GPT2 output is tied to (same as) wte in original model + if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) + + return tensors + + +@Model.register("PhiForCausalLM") +class Phi2Model(Model): + model_arch = gguf.MODEL_ARCH.PHI2 + + def set_gguf_parameters(self): + block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) + + rot_pct = self.find_hparam(["partial_rotary_factor"]) + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + + self.gguf_writer.add_name("Phi2") + self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) + + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(4 * n_embd) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"])) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_add_bos_token(False) + + +@Model.register("Phi3ForCausalLM") +class Phi3MiniModel(Model): + model_arch = gguf.MODEL_ARCH.PHI3 + + def set_vocab(self): + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + if not tokenizer_path.is_file(): + raise ValueError(f'Error: Missing {tokenizer_path}') + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + token_id = added_tokens_json[key] + if (token_id >= vocab_size): + logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) + for token_id, foken_data in added_tokens_decoder.items(): + token_id = int(token_id) + token = 
foken_data["content"].encode("utf-8") + if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN: + assert tokens[token_id] == token + tokens[token_id] = token + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + if foken_data.get("special"): + toktypes[token_id] = SentencePieceTokenTypes.CONTROL + + tokenizer_file = self.dir_model / 'tokenizer.json' + if tokenizer_file.is_file(): + with open(tokenizer_file, "r", encoding="utf-8") as f: + tokenizer_json = json.load(f) + added_tokens = tokenizer_json.get("added_tokens", []) + for foken_data in added_tokens: + token_id = int(foken_data["id"]) + token = foken_data["content"].encode("utf-8") + if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN: + assert tokens[token_id] == token + tokens[token_id] = token + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + if foken_data.get("special"): + toktypes[token_id] = SentencePieceTokenTypes.CONTROL + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) + + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"]) + rms_eps = self.find_hparam(["rms_norm_eps"]) + max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) + orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) + rope_dims = n_embd // n_head + + self.gguf_writer.add_name("Phi3") + self.gguf_writer.add_context_length(max_pos_embds) + self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds) + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"])) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head_kv) + self.gguf_writer.add_layer_norm_rms_eps(rms_eps) + self.gguf_writer.add_rope_dimension_count(rope_dims) + self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) + self.gguf_writer.add_file_type(self.ftype) + + # write rope scaling for long context (128k) model + rope_scaling = self.find_hparam(['rope_scaling'], True) + if (rope_scaling is None): + return + + scale = max_pos_embds / orig_max_pos_embds + + rope_scaling_type = rope_scaling.get('type', '').lower() + if len(rope_scaling_type) == 0: + raise KeyError('Missing the required key rope_scaling.type') + + if rope_scaling_type == 'su': + attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0 + elif rope_scaling_type == 'yarn': + attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0 + else: + raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet') + + self.gguf_writer.add_rope_scaling_attn_factors(attn_factor) + + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) + + if long_factors is None or short_factors is None: + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + + if 
len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + + self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32)) + self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32)) + + +@Model.register("PlamoForCausalLM") +class PlamoModel(Model): + model_arch = gguf.MODEL_ARCH.PLAMO + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_name("PLaMo") + self.gguf_writer.add_context_length(4096) # not in config.json + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong + self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) + self.gguf_writer.add_file_type(self.ftype) + + def shuffle_attn_q_weight(self, data_torch): + assert data_torch.size() == (5120, 5120) + data_torch = data_torch.reshape(8, 5, 128, 5120) + data_torch = torch.permute(data_torch, (1, 0, 2, 3)) + data_torch = torch.reshape(data_torch, (5120, 5120)) + return data_torch + + def shuffle_attn_output_weight(self, data_torch): + assert data_torch.size() == (5120, 5120) + data_torch = data_torch.reshape(5120, 8, 5, 128) + data_torch = torch.permute(data_torch, (0, 2, 1, 3)) + data_torch = torch.reshape(data_torch, (5120, 5120)) + return data_torch + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + new_name = self.map_tensor_name(name) + + # shuffle for broadcasting of gqa in ggml_mul_mat + if new_name.endswith("attn_q.weight"): + data_torch = self.shuffle_attn_q_weight(data_torch) + elif new_name.endswith("attn_output.weight"): + data_torch = self.shuffle_attn_output_weight(data_torch) + + return [(new_name, data_torch)] + + +@Model.register("CodeShellForCausalLM") +class CodeShellModel(Model): + model_arch = gguf.MODEL_ARCH.CODESHELL + + def set_gguf_parameters(self): + block_count = self.hparams["n_layer"] + + self.gguf_writer.add_name("CodeShell") + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_rope_freq_base(10000.0) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(1.0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + new_name = self.map_tensor_name(name) + + tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)] + + if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): + assert self.tensor_names is not None + + if all(s not in 
self.tensor_names for s in ("lm_head.weight", "output.weight")): + # copy tok_embd.weight to output.weight + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) + + return tensors + + +@Model.register("InternLM2ForCausalLM") +class InternLM2Model(Model): + model_arch = gguf.MODEL_ARCH.INTERNLM2 + + def set_vocab(self): + # (TODO): Is there a better way? + # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character + # \x00 specially and convert it into an emoji character to prevent it from being mistakenly + # recognized as an empty string in C++. + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'tokenizer.model' + + tokens: list[bytes] = [] + scores: list[float] = [] + toktypes: list[int] = [] + + if not tokenizer_path.is_file(): + logger.error(f'Error: Missing {tokenizer_path}') + sys.exit(1) + + sentencepiece_model = model.ModelProto() + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + for token_id in range(vocab_size): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + if text == b"\x00": + # (TODO): fixme + # Hack here and replace the \x00 characters. + logger.warning(f"InternLM2 convert token '{text}' to '🐉'!") + text = "🐉".encode("utf-8") + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + tokens.append(key.encode("utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + old_eos = special_vocab.special_token_ids["eos"] + if "chat" in os.path.basename(self.dir_model.absolute()): + # For the chat model, we replace the eos with '<|im_end|>'. 
+ # TODO: this is a hack, should be fixed + # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048 + special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer) + logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ +in chat mode so that the conversation can end normally.") + + special_vocab.add_to_gguf(self.gguf_writer) + + def _try_get_sft_eos(self, tokenizer): + unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]') + im_end_list = tokenizer.Encode('<|im_end|>') + eos_token = None + assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1) + if len(unused_145_list) == 1: + eos_token = unused_145_list[0] + if len(im_end_list) == 1: + eos_token = im_end_list[0] + assert eos_token + return eos_token + + def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def set_gguf_parameters(self): + self.gguf_writer.add_name("InternLM2") + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + num_heads = self.hparams["num_attention_heads"] + num_kv_heads = self.hparams["num_key_value_heads"] + hidden_size = self.hparams["hidden_size"] + q_per_kv = num_heads // num_kv_heads + head_dim = hidden_size // num_heads + num_groups = num_heads // q_per_kv + + qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv" + + if re.match(qkv_pattern, name): + bid = re.findall(qkv_pattern, name)[0] + qkv = data_torch + # qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim) + qkv = qkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim)) + q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :] + # The model weights of q and k equire additional reshape. 
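To make the grouped wqkv layout easier to follow, here is a minimal shape-only sketch of the split performed just above (dummy head counts chosen for illustration; torch only, not part of the converter itself):

    import torch

    num_heads, num_kv_heads, head_dim = 32, 8, 128
    hidden_size = num_heads * head_dim            # 4096
    q_per_kv = num_heads // num_kv_heads          # 4 query heads share one kv head
    num_groups = num_heads // q_per_kv            # == num_kv_heads == 8

    # fused wqkv weight; the (q_per_kv + 2) heads-per-group layout matches the reshape above
    wqkv = torch.randn(num_groups * (q_per_kv + 2) * head_dim, hidden_size)

    qkv = wqkv.T.reshape(-1, num_groups, q_per_kv + 2, head_dim)
    q = qkv[..., :q_per_kv, :]
    k = qkv[..., q_per_kv:q_per_kv + 1, :]
    v = qkv[..., q_per_kv + 1:q_per_kv + 2, :]
    print(q.shape, k.shape, v.shape)
    # torch.Size([4096, 8, 4, 128]) torch.Size([4096, 8, 1, 128]) torch.Size([4096, 8, 1, 128])
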
+ # q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads) + q = self._hf_permute_qk(q.reshape((q.shape[0], -1)).T, num_heads, num_heads) + # k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads) + k = self._hf_permute_qk(k.reshape((k.shape[0], -1)).T, num_heads, num_kv_heads) + # v = rearrange(v, " o g n i -> o (g n i)").T + v = v.reshape((v.shape[0], -1)).T + return [ + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v), + ] + else: + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("BertModel", "CamembertModel") +class BertModel(Model): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.vocab_size = None + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_causal_attention(False) + + # get pooling path + pooling_path = None + module_path = self.dir_model / "modules.json" + if module_path.is_file(): + with open(module_path, encoding="utf-8") as f: + modules = json.load(f) + for mod in modules: + if mod["type"] == "sentence_transformers.models.Pooling": + pooling_path = mod["path"] + break + + # get pooling type + if pooling_path is not None: + with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f: + pooling = json.load(f) + if pooling["pooling_mode_mean_tokens"]: + pooling_type = gguf.PoolingType.MEAN + elif pooling["pooling_mode_cls_token"]: + pooling_type = gguf.PoolingType.CLS + else: + raise NotImplementedError("Only MEAN and CLS pooling types supported") + self.gguf_writer.add_pooling_type(pooling_type) + + def set_vocab(self): + tokens, toktypes, tokpre = self.get_vocab_base() + self.vocab_size = len(tokens) + + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B" + + # convert to phantom space vocab + def phantom(tok): + if tok.startswith("[") and tok.endswith("]"): + return tok + if tok.startswith("##"): + return tok[2:] + return "\u2581" + tok + tokens = list(map(phantom, tokens)) + + # add vocab to gguf + self.gguf_writer.add_tokenizer_model("bert") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + # handle special tokens + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # we are only using BERT for embeddings so we don't need the pooling layer + if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): + return [] # we don't need these + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("NomicBertModel") +class NomicBertModel(BertModel): + model_arch = gguf.MODEL_ARCH.NOMIC_BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # the HF config claims n_ctx=8192, but it uses RoPE scaling + self.hparams["n_ctx"] = 2048 + + # SwigLU activation + assert self.hparams["activation_function"] == "swiglu" + # this doesn't do anything in the HF version + assert self.hparams["causal"] is False + # no bias tensors + 
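The "phantom space" conversion in BertModel.set_vocab above can be seen end-to-end with a tiny standalone sketch (a few made-up WordPiece tokens as input):

    # minimal sketch of the WordPiece -> SentencePiece-style mapping used above
    def phantom(tok: str) -> str:
        if tok.startswith("[") and tok.endswith("]"):
            return tok              # special tokens like [CLS], [SEP] pass through
        if tok.startswith("##"):
            return tok[2:]          # continuation pieces lose the '##' marker
        return "\u2581" + tok       # word-initial pieces gain the '▁' prefix

    print(list(map(phantom, ["[CLS]", "un", "##believ", "##able", "[SEP]"])))
    # ['[CLS]', '▁un', 'believ', 'able', '[SEP]']
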
assert self.hparams["qkv_proj_bias"] is False + assert self.hparams["mlp_fc1_bias"] is False + assert self.hparams["mlp_fc2_bias"] is False + # norm at end of layer + assert self.hparams["prenorm"] is False + # standard RoPE + assert self.hparams["rotary_emb_fraction"] == 1.0 + assert self.hparams["rotary_emb_interleaved"] is False + assert self.hparams["rotary_emb_scale_base"] is None + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) + + +@Model.register("GemmaForCausalLM") +class GemmaModel(Model): + model_arch = gguf.MODEL_ARCH.GEMMA + + def set_vocab(self): + self._set_vocab_sentencepiece() + + # TODO: these special tokens should be exported only for the CodeGemma family + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot']) + special_vocab._set_special_token("prefix", 67) + special_vocab._set_special_token("suffix", 69) + special_vocab._set_special_token("middle", 68) + special_vocab._set_special_token("fsep", 70) + special_vocab._set_special_token("eot", 107) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(hparams["head_dim"]) + self.gguf_writer.add_value_length(hparams["head_dim"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # To prevent errors, skip loading lm_head.weight. 
+ if name == "lm_head.weight": + logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") + return [] + + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("Starcoder2ForCausalLM") +class StarCoder2Model(Model): + model_arch = gguf.MODEL_ARCH.STARCODER2 + + +@Model.register("MambaForCausalLM", "MambaLMHeadModel") +class MambaModel(Model): + model_arch = gguf.MODEL_ARCH.MAMBA + + def set_vocab(self): + vocab_size = self.hparams["vocab_size"] + # Round vocab size to next multiple of 8 + pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8) + # pad using ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + vocab_size = -(vocab_size // -pad_vocab) * pad_vocab + self.hparams["vocab_size"] = vocab_size + + if (self.dir_model / "tokenizer.json").is_file(): + self._set_vocab_gpt2() + elif (self.dir_model / "tokenizer.model").is_file(): + self._set_vocab_sentencepiece() + else: + # Use the GPT-NeoX tokenizer when no tokenizer files are present + tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf" + logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") + neox_reader = gguf.GGUFReader(tokenizer_path, "r") + + field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL) + self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2") + + field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE) + self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt") + + field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST) + assert field + self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) + + field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) + assert field + self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) + + field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES) + assert field + self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) + + field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID) + self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1) + + field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID) + self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0) + + field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID) + self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0) + + field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID) + self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0) + + def set_gguf_parameters(self): + d_model = self.find_hparam(["hidden_size", "d_model"]) + d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 + d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model + d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 + # ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 + dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) + rms_norm_eps = 
self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 + + # Fail early for models which don't have a block expansion factor of 2 + assert d_inner == 2 * d_model + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default + self.gguf_writer.add_embedding_length(d_model) + self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading + self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_ssm_conv_kernel(d_conv) + self.gguf_writer.add_ssm_inner_size(d_inner) + self.gguf_writer.add_ssm_state_size(d_state) + self.gguf_writer.add_ssm_time_step_rank(dt_rank) + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_file_type(self.ftype) + + _tok_embd = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) + tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) + + new_name = self.map_tensor_name(name) + + if name.endswith(".A_log"): + logger.debug("A_log --> A ==> " + new_name) + data_torch = -torch.exp(data_torch) + + # assuming token_embd.weight is seen before output.weight + if self._tok_embd is not None and new_name == output_name: + if torch.equal(self._tok_embd, data_torch): + logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting") + return [] + elif new_name == tok_embd_name: + self._tok_embd = data_torch + + return [(new_name, data_torch)] + + def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + del n_dims # unused + + return bid is not None and new_name in ( + self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [ + gguf.MODEL_TENSOR.SSM_CONV1D, + gguf.MODEL_TENSOR.SSM_X, + gguf.MODEL_TENSOR.SSM_DT, + gguf.MODEL_TENSOR.SSM_A, + gguf.MODEL_TENSOR.SSM_D, + ] + ) + + +@Model.register("CohereForCausalLM") +class CommandR2Model(Model): + model_arch = gguf.MODEL_ARCH.COMMAND_R + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # max_position_embeddings = 8192 in config.json but model was actually + # trained on 128k context length + # aya-23 models don't have model_max_length specified + self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"]) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + +@Model.register("OlmoForCausalLM") +@Model.register("OLMoForCausalLM") +class OlmoModel(Model): + model_arch = gguf.MODEL_ARCH.OLMO + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_layer_norm_eps(1e-5) + clip_qkv = self.hparams.get("clip_qkv") + if clip_qkv is not None: + self.gguf_writer.add_clamp_kqv(clip_qkv) + + # Same as super class, but permuting q_proj, k_proj + # Copied from: LlamaModel + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith("q_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, 
n_head) + if name.endswith("k_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("JinaBertModel", "JinaBertForMaskedLM") +class JinaBertV2Model(BertModel): + model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.intermediate_size = self.hparams["intermediate_size"] + + def get_tensors(self): + for name, data in super().get_tensors(): + if 'gated_layers' in name: + d1 = data[:self.intermediate_size, :] + name1 = name.replace('gated_layers', 'gated_layers_w') + d2 = data[self.intermediate_size:, :] + name2 = name.replace('gated_layers', 'gated_layers_v') + yield name1, d1 + yield name2, d2 + continue + + yield name, data + + def set_vocab(self, *args, **kwargs): + tokenizer_class = 'BertTokenizer' + with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: + tokenizer_class = json.load(f)['tokenizer_class'] + + if tokenizer_class == 'BertTokenizer': + super().set_vocab() + elif tokenizer_class == 'RobertaTokenizer': + self._set_vocab_gpt2() + self.gguf_writer.add_token_type_count(2) + else: + raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel') + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(True) + + +@Model.register("ArcticForCausalLM") +class ArcticModel(Model): + model_arch = gguf.MODEL_ARCH.ARCTIC + + def set_vocab(self): + # The reason for using a custom implementation here is that the + # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from + # tokenizer.model and used them as BOS and EOS instead of adding new tokens. + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + if not tokenizer_path.is_file(): + logger.error(f'Error: Missing {tokenizer_path}') + sys.exit(1) + + # Read the whole vocabulary from the tokenizer.model file + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + # Use the added_tokens_decoder field from tokeniser_config.json as the source + # of information about added/redefined tokens and modify them accordingly. 
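For orientation, here is an abridged, hypothetical added_tokens_decoder as it appears in a tokenizer_config.json; only the fields consumed by the code that follows are shown, and the token contents are illustrative rather than taken from the actual snowflake-arctic tokenizer:

    # abridged, hypothetical example of the structure read below
    tokenizer_config_json = {
        "unk_token": "<unk>",
        "added_tokens_decoder": {
            # keys are token ids as strings; values carry the redefined content
            "0":     {"content": "<unk>",        "special": True},
            "31998": {"content": "<|im_start|>", "special": True},
            "31999": {"content": "<|im_end|>",   "special": True},
        },
    }
    # special entries are mapped to CONTROL (or UNKNOWN for the unk_token);
    # anything else becomes USER_DEFINED with a score of -10000.0
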
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + + if "added_tokens_decoder" in tokenizer_config_json: + added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"] + for token_id, token_json in added_tokens_decoder.items(): + token_id = int(token_id) + if (token_id >= vocab_size): + logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + token_content = token_json["content"] + token_type = SentencePieceTokenTypes.USER_DEFINED + token_score = -10000.0 + + # Map unk_token to UNKNOWN, other special tokens to CONTROL + # Set the score to 0.0 as in the original tokenizer.model + if ("special" in token_json) and token_json["special"]: + if token_content == tokenizer_config_json["unk_token"]: + token_type = SentencePieceTokenTypes.UNKNOWN + else: + token_type = SentencePieceTokenTypes.CONTROL + token_score = 0.0 + + logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})") + tokens[token_id] = token_content.encode("utf-8") + toktypes[token_id] = token_type + scores[token_id] = token_score + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith("q_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith("k_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.hparams["num_local_experts"] + + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for wid in ["w1", "w2", "w3"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def write_tensors(self): + super().write_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise 
ValueError(f"Unprocessed experts: {experts}") + + +@Model.register("DeepseekV2ForCausalLM") +class DeepseekV2Model(Model): + model_arch = gguf.MODEL_ARCH.DEEPSEEK2 + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + + self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: + self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) + self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length(hparams["v_head_dim"]) + self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) + self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) + self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "yarn": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) + self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"]) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("mlp.experts") != -1: + n_experts = self.hparams["n_routed_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def write_tensors(self): + super().write_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +###### CONVERSION LOGIC ###### + + +# tree of lazy tensors +class LazyTorchTensor(gguf.LazyBase): + _tensor_type = torch.Tensor + # to keep the type-checker happy + dtype: torch.dtype + shape: torch.Size + + # only used when converting a torch.Tensor to a np.ndarray + _dtype_map: dict[torch.dtype, type] = { + torch.float16: np.float16, + torch.float32: np.float32, + } + + def numpy(self) -> gguf.LazyNumpyTensor: + dtype = self._dtype_map[self.dtype] + return 
gguf.LazyNumpyTensor( + meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), + lazy=self._lazy, + args=(self,), + func=(lambda s: s[0].numpy()) + ) + + @classmethod + def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor: + return torch.empty(size=shape, dtype=dtype, device="meta") + + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + del types # unused + + if kwargs is None: + kwargs = {} + + if func is torch.Tensor.numpy: + return args[0].numpy() + + return LazyTorchTensor._wrap_fn(func)(*args, **kwargs) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Convert a huggingface model to a GGML compatible file") + parser.add_argument( + "--vocab-only", action="store_true", + help="extract only the vocab", + ) + parser.add_argument( + "--awq-path", type=Path, default=None, + help="Path to scale awq cache file", + ) + parser.add_argument( + "--outfile", type=Path, + help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", + ) + parser.add_argument( + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16", + help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", + ) + parser.add_argument( + "--bigendian", action="store_true", + help="model is executed on big endian machine", + ) + parser.add_argument( + "model", type=Path, + help="directory containing model file", + ) + parser.add_argument( + "--use-temp-file", action="store_true", + help="use the tempfile library while processing (helpful when running out of memory, process killed)", + ) + parser.add_argument( + "--no-lazy", action="store_true", + help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)", + ) + parser.add_argument( + "--model-name", type=str, default=None, + help="name of the model", + ) + parser.add_argument( + "--verbose", action="store_true", + help="increase output verbosity", + ) + parser.add_argument( + "--split", action="store_true", + help="split the converted model into multiple files" + ) + parser.add_argument( + "--split-max-tensors", type=int, + help="max tensors in each split" + ) + parser.add_argument( + "--split-max-size", type=str, + help="max size per split N(M|G)" + ) + parser.add_argument( + "--dry-run", action="store_true", + help="only print out a split plan and exit, without writing any new files" + ) + parser.add_argument( + "--small-first-shard", action="store_true", + help="do not add tensors to the first shard (disabled by default)" + ) + + return parser.parse_args() + + +def main() -> None: + args = parse_args() + + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + + dir_model = args.model + + if args.awq_path: + sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) + from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] + tmp_model_path = args.model / "weighted_model" + dir_model = tmp_model_path + if tmp_model_path.is_dir(): + logger.info(f"{tmp_model_path} exists as a weighted model.") + else: + tmp_model_path.mkdir(parents=True, exist_ok=True) + logger.info("Saving new weighted model ...") + add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) + logger.info(f"Saved weighted model at {tmp_model_path}.") + + if not dir_model.is_dir(): + logger.error(f'Error: 
{args.model} is not a directory') + sys.exit(1) + + if args.split and not (args.split_max_tensors or args.split_max_size): + raise ValueError("Need to specify one of --split-max-tensors or --split-max-size when splitting") + + if args.split_max_tensors and args.split_max_size: + raise ValueError("Can't specify both --split-max-tensors and --split-max-size") + + split_arguments = gguf.SplitArguments(args) + + ftype_map: dict[str, gguf.LlamaFileType] = { + "f32": gguf.LlamaFileType.ALL_F32, + "f16": gguf.LlamaFileType.MOSTLY_F16, + "bf16": gguf.LlamaFileType.MOSTLY_BF16, + "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, + "auto": gguf.LlamaFileType.GUESSED, + } + + if args.outfile is not None: + fname_out = args.outfile + else: + # output in the same directory as the model by default + fname_out = dir_model / 'ggml-model-{ftype}.gguf' + + logger.info(f"Loading model: {dir_model.name}") + + hparams = Model.load_hparams(dir_model) + + with torch.inference_mode(): + try: + model_class = Model.from_model_architecture(hparams["architectures"][0]) + except NotImplementedError: + logger.error(f"Model {hparams['architectures'][0]} is not supported") + sys.exit(1) + + model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, + args.no_lazy, split_arguments) + + logger.info("Set model parameters") + model_instance.set_gguf_parameters() + + logger.info("Set model tokenizer") + model_instance.set_vocab() + + model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + + if args.vocab_only: + logger.info(f"Exporting model vocab to '{model_instance.fname_out}'") + model_instance.write_vocab() + else: + logger.info(f"Exporting model to '{model_instance.fname_out}'") + model_instance.write() + + logger.info(f"Model successfully exported to '{model_instance.fname_out}'") + + +if __name__ == '__main__': + main() From dc5cf5fd825a0ffa4964fdebc4f9b42444d39ca2 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Date: Fri, 7 Jun 2024 17:26:30 -0400 Subject: [PATCH 30/66] Update gguf-py/gguf/gguf_writer_split.py Co-authored-by: compilade --- gguf-py/gguf/gguf_writer_split.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gguf-py/gguf/gguf_writer_split.py b/gguf-py/gguf/gguf_writer_split.py index c115679993ab5..a419512ce64e6 100644 --- a/gguf-py/gguf/gguf_writer_split.py +++ b/gguf-py/gguf/gguf_writer_split.py @@ -25,8 +25,8 @@ SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf" METADATA_ONLY_INDICATOR = -1 -KVTempData: TypeAlias = dict[str, tuple[Any, GGUFValueType]] # {key: (value, type)} -TensorTempData: TypeAlias = tuple[str, np.ndarray[Any, Any], GGMLQuantizationType] # (tensor name, tensor data, tensor dtype) +KVTempData: TypeAlias = dict[str, tuple[Any, GGUFValueType | None]] # {key: (value, type)} +TensorTempData: TypeAlias = tuple[str, np.ndarray[Any, Any], GGMLQuantizationType | None] # (tensor name, tensor data, tensor dtype) @dataclass From e093dfba9f2ba2b8f5345c747c8e96f7776789ca Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Fri, 7 Jun 2024 17:31:35 -0400 Subject: [PATCH 31/66] convert-hf : restore executable file permission --- convert-hf-to-gguf.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 convert-hf-to-gguf.py diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py old mode 100644 new mode 100755 From 9576965ce748a685299e3fb95a489804ca6a8c8a Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Fri, 7 Jun 
2024 20:51:22 -0400 Subject: [PATCH 32/66] examples/convert-legacy-llama.py: restore executable file permission --- examples/convert-legacy-llama.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 examples/convert-legacy-llama.py diff --git a/examples/convert-legacy-llama.py b/examples/convert-legacy-llama.py old mode 100644 new mode 100755 From c6ae1d679943994301021f05f0110823ff746440 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Fri, 7 Jun 2024 21:09:03 -0400 Subject: [PATCH 33/66] reinstate original gguf package import and fix type annotation --- convert-hf-to-gguf.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index b4ea11a15f760..20bf67d80207c 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -23,9 +23,7 @@ if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) -import importlib -gguf = importlib.import_module("gguf-py.gguf") -# import gguf +import gguf logger = logging.getLogger("hf-to-gguf") @@ -60,7 +58,7 @@ class Model: tensor_map: gguf.TensorNameMap tensor_names: set[str] | None fname_out: Path - gguf_writer: gguf.GGUFWriter + gguf_writer: gguf.GGUFWriterSplit # subclasses should define this! model_arch: gguf.MODEL_ARCH From 2e70fa10554a527bd5308521f38b4d2753413751 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Fri, 7 Jun 2024 21:18:30 -0400 Subject: [PATCH 34/66] attempt to appease the linter --- convert-hf-to-gguf.py | 2 +- gguf-py/gguf/gguf_writer_split.py | 33 ++++++++++++++++--------------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 20bf67d80207c..c56e416c26751 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -95,7 +95,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, # allow templating the file name with the output ftype, useful with the "auto" ftype self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) self.gguf_writer = gguf.GGUFWriterSplit(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], split_arguments, - endianess=self.endianess, use_temp_file=self.use_temp_file) + endianess=self.endianess, use_temp_file=self.use_temp_file) @classmethod def __init_subclass__(cls): diff --git a/gguf-py/gguf/gguf_writer_split.py b/gguf-py/gguf/gguf_writer_split.py index a419512ce64e6..effa083d5fc27 100644 --- a/gguf-py/gguf/gguf_writer_split.py +++ b/gguf-py/gguf/gguf_writer_split.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import logging from enum import IntEnum from typing import TYPE_CHECKING, Any, Sequence from argparse import Namespace @@ -21,6 +22,8 @@ from .gguf_writer import GGUFWriter, WriterState from .constants import Keys +logger = logging.getLogger(__name__) + SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf" METADATA_ONLY_INDICATOR = -1 @@ -63,7 +66,7 @@ class GGUFWriterSplit(GGUFWriter): def __init__(self, path: os.PathLike[str] | str, arch: str, split_arguments: SplitArguments, use_temp_file: bool = True, endianess: GGUFEndian = GGUFEndian.LITTLE - ) -> None: + ) -> None: # we intentionally don't call superclass constructor self.arch = arch self.path = Path(path) @@ -86,11 +89,11 @@ def init_shards(self) -> None: # check if we need to split if self.split_arguments.split_max_tensors and self.total_tensors < self.split_arguments.split_max_tensors: - print("Model 
has fewer tensors than the split threshold, not splitting") + logger.warning("Model has fewer tensors than the split threshold, not splitting") self.split_style = SplitStyle.NONE if self.split_arguments.split_max_size and total_size < self.split_arguments.split_max_size: - print("Model has smaller size than the split threshold, not splitting") + logger.warning("Model has smaller size than the split threshold, not splitting") self.split_style = SplitStyle.NONE # no shards are created when writing vocab so make one @@ -105,13 +108,12 @@ def init_shards(self) -> None: self.shards[i].path = self.path.with_name(SHARD_NAME_FORMAT.format(self.path.stem, i + 1, len(self.shards))) # print shard info - print("\nWriting the following files:") + logger.info("Writing the following files:") for shard in self.shards: - print(f" {shard.path}: n_tensors = {shard.tensor_count}, total_size = {GGUFWriterSplit.format_n_bytes_to_str(shard.size)}") - print() + logger.info(f" {shard.path}: n_tensors = {shard.tensor_count}, total_size = {GGUFWriterSplit.format_n_bytes_to_str(shard.size)}") if self.split_arguments.dry_run: - print("\nDry run, not writing files") + logger.info("Dry run, not writing files") exit() # we don't want to initialize GGUFWriters until now because they create files @@ -137,7 +139,7 @@ def init_shards(self) -> None: try: (name, tensor, dtype) = shard.tensors.popleft() writer.add_tensor(name, tensor, raw_dtype=dtype) - except: + except IndexError: break self.shard_writers.append(writer) @@ -154,7 +156,7 @@ def write_header_to_file(self) -> None: def write_kv_data_to_file(self) -> None: if self.state is not WriterState.HEADER: raise ValueError(f'Expected GGUFWriterSplit state to be HEADER, got {self.state}') - + for writer in self.shard_writers: writer.write_kv_data_to_file() @@ -169,9 +171,9 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: writer = self.shard_writers[i] is_metadata = writer.ti_data_count == 0 if is_metadata: - print(f"Writing to shard {i + 1}/{len(self.shards)} with metadata only") + logger.info(f"Writing to shard {i + 1}/{len(self.shards)} with metadata only") else: - print(f"Writing to shard {i + 1}/{len(self.shards)} with {writer.ti_data_count}/{running_total} remaining tensors (of {self.total_tensors} total)") + logger.info(f"Writing to shard {i + 1}/{len(self.shards)} with {writer.ti_data_count}/{running_total} remaining tensors (of {self.total_tensors} total)") running_total -= writer.ti_data_count writer.write_tensors_to_file(progress=(progress and not is_metadata)) del writer @@ -181,7 +183,7 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: # override add_key, add_val to handle kv data separately def add_key(self, key: str) -> None: self.recent_key = key - + def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True) -> None: if self.recent_key is None: raise ValueError("No key set for value") @@ -226,9 +228,7 @@ def get_tensor_size(tensor) -> int: return tensor.data_type.elements_to_bytes(np.prod(tensor.shape)) except AttributeError: # numpy ndarray[Any, Any] return tensor.nbytes - except: # this should never happen - raise ValueError(f"Invalid tensor type: {type(tensor)}") - + @staticmethod def split_str_to_n_bytes(split_str: str) -> int: if split_str.endswith("K"): @@ -256,4 +256,5 @@ def format_n_bytes_to_str(num: int) -> str: if abs(fnum) < 1000.0: return f"{fnum:3.1f}{unit}" fnum /= 1000.0 - return f"{fnum:.1f}T - over 1TB, --split recommended" \ No newline at end of file + return 
f"{fnum:.1f}T - over 1TB, --split recommended" + From 891b19cb81947cd9c92cbfae8d8b52c9ac9d9277 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Fri, 7 Jun 2024 21:20:46 -0400 Subject: [PATCH 35/66] attempt 2 to appease the linter --- gguf-py/gguf/gguf_writer_split.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gguf-py/gguf/gguf_writer_split.py b/gguf-py/gguf/gguf_writer_split.py index effa083d5fc27..fca193d8ff663 100644 --- a/gguf-py/gguf/gguf_writer_split.py +++ b/gguf-py/gguf/gguf_writer_split.py @@ -66,7 +66,7 @@ class GGUFWriterSplit(GGUFWriter): def __init__(self, path: os.PathLike[str] | str, arch: str, split_arguments: SplitArguments, use_temp_file: bool = True, endianess: GGUFEndian = GGUFEndian.LITTLE - ) -> None: + ) -> None: # we intentionally don't call superclass constructor self.arch = arch self.path = Path(path) @@ -257,4 +257,3 @@ def format_n_bytes_to_str(num: int) -> str: return f"{fnum:3.1f}{unit}" fnum /= 1000.0 return f"{fnum:.1f}T - over 1TB, --split recommended" - From 02be0dd6543015ca58bcada19a6694507cdb98cd Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Fri, 7 Jun 2024 21:26:40 -0400 Subject: [PATCH 36/66] attempt 3 to appease the linter --- gguf-py/gguf/gguf_writer_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf_writer_split.py b/gguf-py/gguf/gguf_writer_split.py index fca193d8ff663..655cddbfe43b1 100644 --- a/gguf-py/gguf/gguf_writer_split.py +++ b/gguf-py/gguf/gguf_writer_split.py @@ -66,7 +66,7 @@ class GGUFWriterSplit(GGUFWriter): def __init__(self, path: os.PathLike[str] | str, arch: str, split_arguments: SplitArguments, use_temp_file: bool = True, endianess: GGUFEndian = GGUFEndian.LITTLE - ) -> None: + ) -> None: # we intentionally don't call superclass constructor self.arch = arch self.path = Path(path) From f658e91f4a2eadf164494503d1c58c0e40e31fdc Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sat, 8 Jun 2024 08:10:12 -0400 Subject: [PATCH 37/66] comma consistency --- convert-hf-to-gguf.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index c56e416c26751..0edc581426764 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2801,23 +2801,23 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument( "--split", action="store_true", - help="split the converted model into multiple files" + help="split the converted model into multiple files", ) parser.add_argument( "--split-max-tensors", type=int, - help="max tensors in each split" + help="max tensors in each split", ) parser.add_argument( "--split-max-size", type=str, - help="max size per split N(M|G)" + help="max size per split N(M|G)", ) parser.add_argument( "--dry-run", action="store_true", - help="only print out a split plan and exit, without writing any new files" + help="only print out a split plan and exit, without writing any new files", ) parser.add_argument( "--small-first-shard", action="store_true", - help="do not add tensors to the first shard (disabled by default)" + help="do not add tensors to the first shard (disabled by default)", ) return parser.parse_args() From 079dfe3a8c3d71243acfaedeb3f73e1141eeaa2e Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Date: Sat, 8 Jun 2024 15:42:17 -0400 Subject: [PATCH 38/66] Update convert-hf-to-gguf.py Co-authored-by: compilade --- convert-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 0edc581426764..28ecaea7ace11 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -94,7 +94,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, ftype_lw: str = ftype_up.lower() # allow templating the file name with the output ftype, useful with the "auto" ftype self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) - self.gguf_writer = gguf.GGUFWriterSplit(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], split_arguments, + self.gguf_writer = gguf.GGUFWriterSplit(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], split_arguments, endianess=self.endianess, use_temp_file=self.use_temp_file) @classmethod From 282e71fb39eedca8e05f31e72ba9526907de5f08 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sat, 8 Jun 2024 23:00:42 -0400 Subject: [PATCH 39/66] edit cmd line args --- convert-hf-to-gguf.py | 11 ++--------- gguf-py/gguf/gguf_writer_split.py | 13 ++++++------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 28ecaea7ace11..fe0066442a693 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2799,10 +2799,6 @@ def parse_args() -> argparse.Namespace: "--verbose", action="store_true", help="increase output verbosity", ) - parser.add_argument( - "--split", action="store_true", - help="split the converted model into multiple files", - ) parser.add_argument( "--split-max-tensors", type=int, help="max tensors in each split", @@ -2816,8 +2812,8 @@ def parse_args() -> argparse.Namespace: help="only print out a split plan and exit, without writing any new files", ) parser.add_argument( - "--small-first-shard", action="store_true", - help="do not add tensors to the first shard (disabled by default)", + "--no-tensor-first-split", action="store_true", + help="do not add tensors to the first split (disabled by default)" ) return parser.parse_args() @@ -2847,9 +2843,6 @@ def main() -> None: logger.error(f'Error: {args.model} is not a directory') sys.exit(1) - if args.split and not (args.split_max_tensors or args.split_max_size): - raise ValueError("Need to specify one of --split-max-tensors or --split-max-size when splitting") - if args.split_max_tensors and args.split_max_size: raise ValueError("Can't specify both --split-max-tensors and --split-max-size") diff --git a/gguf-py/gguf/gguf_writer_split.py b/gguf-py/gguf/gguf_writer_split.py index 655cddbfe43b1..b4836737a03ef 100644 --- a/gguf-py/gguf/gguf_writer_split.py +++ b/gguf-py/gguf/gguf_writer_split.py @@ -48,14 +48,13 @@ class SplitStyle(IntEnum): class SplitArguments: def __init__(self, args: Namespace) -> None: - self.split = args.split - self.split_max_tensors = args.split_max_tensors if args.split else 0 - self.split_max_size = GGUFWriterSplit.split_str_to_n_bytes(args.split_max_size) if args.split and args.split_max_size else 0 - self.split_style = SplitStyle.NONE if not self.split \ - else SplitStyle.TENSORS if self.split_max_tensors \ - else SplitStyle.SIZE + self.split_max_tensors = args.split_max_tensors if args.split_max_tensors else 0 + self.split_max_size = GGUFWriterSplit.split_str_to_n_bytes(args.split_max_size) if args.split_max_size else 0 + self.split_style = SplitStyle.TENSORS if self.split_max_tensors \ + else SplitStyle.SIZE if self.split_max_size \ + else SplitStyle.NONE self.dry_run = args.dry_run - self.small_first_shard = args.small_first_shard + 
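The flag-to-style mapping introduced by this change can be summarized with a small standalone sketch (names mirror the patch; this is illustrative, not the shipped class):

    from enum import IntEnum

    class SplitStyle(IntEnum):
        NONE = 0
        TENSORS = 1
        SIZE = 2

    def pick_split_style(split_max_tensors: int, split_max_size: int) -> SplitStyle:
        # --split-max-tensors takes precedence, then --split-max-size, else no split
        if split_max_tensors:
            return SplitStyle.TENSORS
        if split_max_size:
            return SplitStyle.SIZE
        return SplitStyle.NONE

    assert pick_split_style(256, 0) == SplitStyle.TENSORS
    assert pick_split_style(0, 4_000_000_000) == SplitStyle.SIZE
    assert pick_split_style(0, 0) == SplitStyle.NONE
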
self.small_first_shard = args.no_tensor_first_split class GGUFWriterSplit(GGUFWriter): From 03cc9bcbe80d957e93a70d4a62e1f878d024b8c8 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sat, 8 Jun 2024 23:14:26 -0400 Subject: [PATCH 40/66] use simplification from #7827 --- gguf-py/gguf/gguf_writer_split.py | 45 +++++++++++-------------------- 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/gguf-py/gguf/gguf_writer_split.py b/gguf-py/gguf/gguf_writer_split.py index b4836737a03ef..bc1e9443a2300 100644 --- a/gguf-py/gguf/gguf_writer_split.py +++ b/gguf-py/gguf/gguf_writer_split.py @@ -61,7 +61,7 @@ class GGUFWriterSplit(GGUFWriter): kv_data: KVTempData split_arguments: SplitArguments shards: list[Shard] - shard_writers: list[GGUFWriter] + shard_writers: list[tuple[GGUFWriter, os.PathLike[str]]] def __init__(self, path: os.PathLike[str] | str, arch: str, split_arguments: SplitArguments, use_temp_file: bool = True, endianess: GGUFEndian = GGUFEndian.LITTLE @@ -115,17 +115,15 @@ def init_shards(self) -> None: logger.info("Dry run, not writing files") exit() - # we don't want to initialize GGUFWriters until now because they create files for i, shard in enumerate(self.shards): # add_architecture is used for consistency - examples/gguf_split doesn't add arch to all shards - writer = GGUFWriter(shard.path, self.arch, use_temp_file=self.use_temp_file, + writer = GGUFWriter(None, self.arch, use_temp_file=self.use_temp_file, endianess=self.endianess, add_architecture=(i == 0)) # only the first shard needs all the KV data if i == 0: for key, (value, etype) in self.kv_data.items(): - writer.add_key(key) - writer.add_val(value, etype) + writer.add_key_value(key, value, etype) # add split metadata unless it's one file - small first shard splits even with SplitStyle.NONE if self.split_arguments.split_style != SplitStyle.NONE or self.split_arguments.small_first_shard: @@ -141,14 +139,14 @@ def init_shards(self) -> None: except IndexError: break - self.shard_writers.append(writer) + self.shard_writers.append((writer, shard.path)) - def write_header_to_file(self) -> None: + def write_header_to_file(self, path: os.PathLike[str] | str | None = None) -> None: if self.state is not WriterState.EMPTY: raise ValueError(f'Expected GGUFWriterSplit state to be EMPTY, got {self.state}') - for writer in self.shard_writers: - writer.write_header_to_file() + for (writer, path) in self.shard_writers: + writer.write_header_to_file(path) self.state = WriterState.HEADER @@ -156,7 +154,7 @@ def write_kv_data_to_file(self) -> None: if self.state is not WriterState.HEADER: raise ValueError(f'Expected GGUFWriterSplit state to be HEADER, got {self.state}') - for writer in self.shard_writers: + for (writer, _) in self.shard_writers: writer.write_kv_data_to_file() self.state = WriterState.KV_DATA @@ -167,32 +165,21 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: running_total = self.total_tensors for i in range(len(self.shard_writers)): - writer = self.shard_writers[i] - is_metadata = writer.ti_data_count == 0 + writer = self.shard_writers[i][0] + is_metadata = len(writer.tensors) == 0 if is_metadata: logger.info(f"Writing to shard {i + 1}/{len(self.shards)} with metadata only") else: - logger.info(f"Writing to shard {i + 1}/{len(self.shards)} with {writer.ti_data_count}/{running_total} remaining tensors (of {self.total_tensors} total)") - running_total -= writer.ti_data_count + logger.info(f"Writing to shard {i + 1}/{len(self.shards)} with {len(writer.tensors)}/{running_total} remaining 
tensors (of {self.total_tensors} total)") + running_total -= len(writer.tensors) writer.write_tensors_to_file(progress=(progress and not is_metadata)) del writer self.state = WriterState.TI_DATA - # override add_key, add_val to handle kv data separately - def add_key(self, key: str) -> None: - self.recent_key = key - - def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True) -> None: - if self.recent_key is None: - raise ValueError("No key set for value") - self.kv_data[self.recent_key] = (val, vtype) - - # need to handle arrays separately - def add_array(self, key: str, val: Sequence[Any]) -> None: - if not isinstance(val, Sequence): - raise ValueError(f'Expected a sequence for {key}, got {type(val)}') - self.kv_data[key] = (val, GGUFValueType.ARRAY) + # override add_key_value to handle kv data separately + def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None: + self.kv_data[key] = (val, vtype) def add_tensor( self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, @@ -218,7 +205,7 @@ def add_tensor( self.shards[-1].tensors.append((name, tensor, raw_dtype)) def close(self) -> None: - for writer in self.shard_writers: + for (writer, _) in self.shard_writers: writer.close() @staticmethod From 97dd416903532c75cd40b4e0a35403ef0014a920 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sun, 9 Jun 2024 00:34:36 -0400 Subject: [PATCH 41/66] kv/ti data are still wrong --- convert-hf-to-gguf.py | 10 +- gguf-py/gguf/__init__.py | 1 - gguf-py/gguf/gguf_writer.py | 295 +++++++++++++++++++++++------- gguf-py/gguf/gguf_writer_split.py | 245 ------------------------- 4 files changed, 235 insertions(+), 316 deletions(-) delete mode 100644 gguf-py/gguf/gguf_writer_split.py diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index b89f8ff2dfd80..b1806e244d33b 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -59,7 +59,7 @@ class Model: tensor_map: gguf.TensorNameMap tensor_names: set[str] | None fname_out: Path - gguf_writer: gguf.GGUFWriterSplit + gguf_writer: gguf.GGUFWriter # subclasses should define this! 
model_arch: gguf.MODEL_ARCH @@ -95,8 +95,8 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, ftype_lw: str = ftype_up.lower() # allow templating the file name with the output ftype, useful with the "auto" ftype self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) - self.gguf_writer = gguf.GGUFWriterSplit(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], split_arguments, - endianess=self.endianess, use_temp_file=self.use_temp_file) + self.gguf_writer = gguf.GGUFWriter(None, gguf.MODEL_ARCH_NAMES[self.model_arch], split_arguments, + endianess=self.endianess, use_temp_file=self.use_temp_file) @classmethod def __init_subclass__(cls): @@ -326,16 +326,14 @@ def write_tensors(self): def write(self): self.write_tensors() - self.gguf_writer.init_shards() self.gguf_writer.write_header_to_file(self.fname_out) self.gguf_writer.write_kv_data_to_file() self.gguf_writer.write_tensors_to_file(progress=True) self.gguf_writer.close() def write_vocab(self): - if self.gguf_writer.split_arguments.split: + if self.gguf_writer.split_arguments.split_style != gguf.SplitStyle.NONE: raise ValueError('Splitting the vocabulary is not supported') - self.gguf_writer.init_shards() self.gguf_writer.write_header_to_file(self.fname_out) self.gguf_writer.write_kv_data_to_file() self.gguf_writer.close() diff --git a/gguf-py/gguf/__init__.py b/gguf-py/gguf/__init__.py index a2197255a556e..ea5146b161bc8 100644 --- a/gguf-py/gguf/__init__.py +++ b/gguf-py/gguf/__init__.py @@ -2,7 +2,6 @@ from .lazy import * from .gguf_reader import * from .gguf_writer import * -from .gguf_writer_split import * from .quants import * from .tensor_mapping import * from .vocab import * diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 4413e90105f3c..84190837d8809 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -5,10 +5,13 @@ import shutil import struct import tempfile +from argparse import Namespace +from collections import deque from dataclasses import dataclass from enum import Enum, auto +from pathlib import Path from io import BufferedWriter -from typing import IO, Any, Sequence, Mapping +from typing import IO, Any, Sequence, Mapping, TypeAlias from string import ascii_letters, digits import numpy as np @@ -27,10 +30,19 @@ ) from .quants import quant_shape_from_byte_shape +from .constants import Keys logger = logging.getLogger(__name__) +SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf" +NUM_SHARD_KV_DATA = 6 +METADATA_ONLY_INDICATOR = -1 + +KVTempData: TypeAlias = dict[str, tuple[Any, GGUFValueType | None]] # {key: (value, type)} +TensorTempData: TypeAlias = tuple[str, np.ndarray[Any, Any], GGMLQuantizationType | None] # (tensor name, tensor data, tensor dtype) + + @dataclass class TensorInfo: shape: Sequence[int] @@ -45,6 +57,25 @@ class GGUFValue: type: GGUFValueType +@dataclass +class Shard: + path: Path + tensor_count: int + size: int + tensors: deque[TensorTempData] + + +class SplitArguments: + def __init__(self, args: Namespace) -> None: + self.split_max_tensors = args.split_max_tensors if args.split_max_tensors else 0 + self.split_max_size = GGUFWriter.split_str_to_n_bytes(args.split_max_size) if args.split_max_size else 0 + self.split_style = SplitStyle.TENSORS if self.split_max_tensors \ + else SplitStyle.SIZE if self.split_max_size \ + else SplitStyle.NONE + self.dry_run = args.dry_run + self.small_first_shard = args.no_tensor_first_split + + class 
WriterState(Enum): NO_FILE = auto() EMPTY = auto() @@ -54,11 +85,17 @@ class WriterState(Enum): WEIGHTS = auto() +class SplitStyle(Enum): + NONE = auto() + TENSORS = auto() + SIZE = auto() + + class GGUFWriter: - fout: BufferedWriter | None + fout: list[BufferedWriter | None] path: os.PathLike[str] | str | None temp_file: tempfile.SpooledTemporaryFile[bytes] | None - tensors: dict[str, TensorInfo] + tensors: list[dict[str, TensorInfo]] kv_data: dict[str, GGUFValue] state: WriterState _simple_value_packing = { @@ -76,25 +113,55 @@ class GGUFWriter: } def __init__( - self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False, - endianess: GGUFEndian = GGUFEndian.LITTLE, add_architecture: bool = True + self, path: os.PathLike[str] | str | None, arch: str, split_arguments: SplitArguments, + use_temp_file: bool = False, endianess: GGUFEndian = GGUFEndian.LITTLE ): - self.fout = None + self.fout = [] self.path = path self.arch = arch self.endianess = endianess self.data_alignment = GGUF_DEFAULT_ALIGNMENT + self.split_arguments = split_arguments self.use_temp_file = use_temp_file self.temp_file = None - self.tensors = dict() + self.tensors = [] self.kv_data = dict() logger.info("gguf: This GGUF file is for {0} Endian only".format( "Big" if self.endianess == GGUFEndian.BIG else "Little", )) self.state = WriterState.NO_FILE - if add_architecture: - self.add_architecture() + if self.split_arguments.small_first_shard: + self.tensors.append(dict()) + + self.add_architecture() + + def verify_arguments(self) -> None: + total_tensors = sum(len(ti) for ti in self.tensors) + total_size = sum(sum(GGUFWriter.get_tensor_size(ti) for ti in t.values()) for t in self.tensors) + + if self.split_arguments.split_max_tensors and total_tensors < self.split_arguments.split_max_tensors: + logger.warning("Model has fewer tensors than the split threshold, not splitting") + self.split_style = SplitStyle.NONE + + if self.split_arguments.split_max_size and total_size < self.split_arguments.split_max_size: + logger.warning("Model has smaller size than the split threshold, not splitting") + self.split_style = SplitStyle.NONE + + # no shards are created when writing vocab so make one + if not self.tensors or len(self.tensors) == 0: + self.tensors.append(dict()) + + def format_shard_names(self) -> list[os.PathLike[str]]: + pathobj = Path(self.path) + if self.split_arguments.split_style == SplitStyle.NONE: + return [pathobj] + + shard_names = [] + for i in range(len(self.tensors)): + shard_names.append(pathobj.with_name(SHARD_NAME_FORMAT.format(pathobj.stem, i + 1, len(self.tensors)))) + + return shard_names def open_output_file(self, path: os.PathLike[str] | str | None = None) -> None: if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path): @@ -107,24 +174,52 @@ def open_output_file(self, path: os.PathLike[str] | str | None = None) -> None: self.path = path if self.path is not None: - if self.fout is not None: - self.fout.close() - self.fout = open(self.path, "wb") + self.fout = [] + for fout in self.format_shard_names(): + self.fout.append(open(fout, "wb")) self.state = WriterState.EMPTY + def print_plan(self) -> None: + logger.info("Writing the following files:") + for i in range(len(self.fout)): + logger.info(f" {self.fout[i].name}: n_tensors = {len(self.tensors[i])}, total_size = {GGUFWriter.format_n_bytes_to_str(GGUFWriter.get_tensors_total_size(self.tensors[i].values()))}") + + if self.split_arguments.dry_run: + logger.info("Dry run, not writing files") 
+ exit() + def write_header_to_file(self, path: os.PathLike[str] | str | None = None) -> None: + self.verify_arguments() self.open_output_file(path) + self.print_plan() if self.state is not WriterState.EMPTY: raise ValueError(f'Expected output file to be empty, got {self.state}') - self._write_packed(" bytearray: + total_tensors = sum(len(t) for t in self.tensors) + kv_data += self._pack_val(Keys.Split.LLM_KV_SPLIT_NO, GGUFValueType.STRING, add_vtype=False) + kv_data += self._pack_val(shard_no, GGUFValueType.UINT16, add_vtype=True) + kv_data += self._pack_val(Keys.Split.LLM_KV_SPLIT_COUNT, GGUFValueType.STRING, add_vtype=False) + kv_data += self._pack_val(len(self.fout), GGUFValueType.UINT16, add_vtype=True) + kv_data += self._pack_val(Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT, GGUFValueType.STRING, add_vtype=False) + kv_data += self._pack_val(total_tensors, GGUFValueType.INT32, add_vtype=True) + return kv_data + def write_kv_data_to_file(self) -> None: if self.state is not WriterState.HEADER: raise ValueError(f'Expected output file to contain the header, got {self.state}') @@ -136,8 +231,16 @@ def write_kv_data_to_file(self) -> None: kv_data += self._pack_val(key, GGUFValueType.STRING, add_vtype=False) kv_data += self._pack_val(val.value, val.type, add_vtype=True) - self.fout.write(kv_data) - self.flush() + if len(self.fout) > 1: + kv_data = self.add_shard_kv_data(kv_data, 0) + + # only the first shard needs kv data + self.fout[0].write(kv_data) + self.fout[0].flush() + + for i in range(1, len(self.fout)): + self.fout[i].write(self.add_shard_kv_data(bytearray(), i)) + self.fout[i].flush() self.state = WriterState.KV_DATA def write_ti_data_to_file(self) -> None: @@ -145,21 +248,23 @@ def write_ti_data_to_file(self) -> None: raise ValueError(f'Expected output file to contain KV data, got {self.state}') assert self.fout is not None - ti_data = bytearray() - offset_tensor = 0 - - for name, ti in self.tensors.items(): - ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False) - n_dims = len(ti.shape) - ti_data += self._pack("I", n_dims) - for i in range(n_dims): - ti_data += self._pack("Q", ti.shape[n_dims - 1 - i]) - ti_data += self._pack("I", ti.dtype) - ti_data += self._pack("Q", offset_tensor) - offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment) - - self.fout.write(ti_data) - self.flush() + for i in range(len(self.fout)): + assert self.fout[i] is not None + ti_data = bytearray() + offset_tensor = 0 + + for name, ti in self.tensors[i].items(): + ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False) + n_dims = len(ti.shape) + ti_data += self._pack("I", n_dims) + for i in range(n_dims): + ti_data += self._pack("Q", ti.shape[n_dims - 1 - i]) + ti_data += self._pack("I", ti.dtype) + ti_data += self._pack("Q", offset_tensor) + offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment) + + self.fout[i].write(ti_data) + self.fout[i].flush() self.state = WriterState.TI_DATA def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None: @@ -248,7 +353,18 @@ def add_tensor_info( if tensor_dtype == np.uint8: tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype) - self.tensors[name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes) + # create splits as necessary, such as to start it off + if (len(self.tensors) == self.split_arguments.small_first_shard \ + # or split when over tensor limit + or (self.split_arguments.split_style == SplitStyle.TENSORS \ + and len(self.tensors[-1]) >= 
self.split_arguments.split_max_tensors) \ + # or split when over size limit + or (self.split_arguments.split_style == SplitStyle.SIZE \ + and GGUFWriter.get_tensors_total_size(self.tensors[-1].values()) + tensor_nbytes > self.split_arguments.split_max_size)): + + self.tensors.append(dict()) + + self.tensors[-1][name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes) def add_tensor( self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, @@ -265,7 +381,7 @@ def add_tensor( self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype) if self.temp_file is None: - self.tensors[name].tensor = tensor + self.tensors[-1][name].tensor = tensor return tensor.tofile(self.temp_file) @@ -283,9 +399,12 @@ def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None: if self.endianess == GGUFEndian.BIG: tensor.byteswap(inplace=True) - self.write_padding(self.fout, self.fout.tell()) - tensor.tofile(self.fout) - self.write_padding(self.fout, tensor.nbytes) + + for fout in self.fout: + assert fout is not None + self.write_padding(fout, fout.tell()) + tensor.tofile(fout) + self.write_padding(fout, tensor.nbytes) self.state = WriterState.WEIGHTS @@ -294,27 +413,31 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: assert self.fout is not None - self.write_padding(self.fout, self.fout.tell()) + for fout in self.fout: + assert fout is not None + self.write_padding(fout, fout.tell()) if self.temp_file is None: - bar = None - - if progress: - from tqdm import tqdm - - total_bytes = sum(t.nbytes for t in self.tensors.values()) - - bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True) - - # relying on the fact that Python dicts preserve insertion order (since 3.7) - for ti in self.tensors.values(): - assert ti.tensor is not None # can only iterate once over the tensors - assert ti.tensor.nbytes == ti.nbytes - ti.tensor.tofile(self.fout) - if bar is not None: - bar.update(ti.nbytes) - self.write_padding(self.fout, ti.nbytes) - ti.tensor = None + for i in range(len(self.fout)): + assert self.fout[i] is not None + bar = None + + if progress: + from tqdm import tqdm + + total_bytes = GGUFWriter.get_tensors_total_size(self.tensors[i].values()) + + bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True) + + # relying on the fact that Python dicts preserve insertion order (since 3.7) + for ti in self.tensors[i].values(): + assert ti.tensor is not None # can only iterate once over the tensors + assert ti.tensor.nbytes == ti.nbytes + ti.tensor.tofile(self.fout[i]) + if bar is not None: + bar.update(ti.nbytes) + self.write_padding(self.fout[i], ti.nbytes) + ti.tensor = None else: self.temp_file.seek(0) @@ -326,12 +449,16 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: def flush(self) -> None: assert self.fout is not None - self.fout.flush() + for fout in self.fout: + assert fout is not None + fout.flush() def close(self) -> None: if self.fout is not None: - self.fout.close() - self.fout = None + for fout in self.fout: + if fout is not None: + fout.close() + self.fout = [] def add_architecture(self) -> None: self.add_string(Keys.General.ARCHITECTURE, self.arch) @@ -609,6 +736,46 @@ def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool) -> bytes: return kv_data - def _write_packed(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> None: - assert self.fout is not None - self.fout.write(self._pack(fmt, value, skip_pack_prefix)) + def 
_write_packed(self, fout: BufferedWriter, fmt: str, value: Any, skip_pack_prefix: bool = False) -> None: + assert fout is not None + fout.write(self._pack(fmt, value, skip_pack_prefix)) + + @staticmethod + def get_tensor_size(tensor) -> int: + try: + return tensor.data_type.elements_to_bytes(np.prod(tensor.shape)) + except AttributeError: # numpy ndarray[Any, Any] + return tensor.nbytes + + @staticmethod + def get_tensors_total_size(tensors) -> int: + return sum(GGUFWriter.get_tensor_size(ti) for ti in tensors) + + @staticmethod + def split_str_to_n_bytes(split_str: str) -> int: + if split_str.endswith("K"): + n = int(split_str[:-1]) * 1000 + elif split_str.endswith("M"): + n = int(split_str[:-1]) * 1000 * 1000 + elif split_str.endswith("G"): + n = int(split_str[:-1]) * 1000 * 1000 * 1000 + elif split_str.isnumeric(): + n = int(split_str) + else: + raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G") + + if n <= 0: + raise ValueError(f"Invalid split size: {split_str}, must be positive") + + return n + + @staticmethod + def format_n_bytes_to_str(num: int) -> str: + if num == METADATA_ONLY_INDICATOR: + return "negligible - metadata only" + fnum = float(num) + for unit in ("", "K", "M", "G"): + if abs(fnum) < 1000.0: + return f"{fnum:3.1f}{unit}" + fnum /= 1000.0 + return f"{fnum:.1f}T - over 1TB, --split recommended" \ No newline at end of file diff --git a/gguf-py/gguf/gguf_writer_split.py b/gguf-py/gguf/gguf_writer_split.py deleted file mode 100644 index bc1e9443a2300..0000000000000 --- a/gguf-py/gguf/gguf_writer_split.py +++ /dev/null @@ -1,245 +0,0 @@ -from __future__ import annotations - -import os -import logging -from enum import IntEnum -from typing import TYPE_CHECKING, Any, Sequence -from argparse import Namespace -from collections import deque -from dataclasses import dataclass -from pathlib import Path - -import numpy as np - -if TYPE_CHECKING: - from typing_extensions import TypeAlias - -from .constants import ( - GGMLQuantizationType, - GGUFEndian, - GGUFValueType -) -from .gguf_writer import GGUFWriter, WriterState -from .constants import Keys - -logger = logging.getLogger(__name__) - - -SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf" -METADATA_ONLY_INDICATOR = -1 - -KVTempData: TypeAlias = dict[str, tuple[Any, GGUFValueType | None]] # {key: (value, type)} -TensorTempData: TypeAlias = tuple[str, np.ndarray[Any, Any], GGMLQuantizationType | None] # (tensor name, tensor data, tensor dtype) - - -@dataclass -class Shard: - path: Path - tensor_count: int - size: int - tensors: deque[TensorTempData] - - -class SplitStyle(IntEnum): - NONE = 0 - TENSORS = 1 - SIZE = 2 - - -class SplitArguments: - def __init__(self, args: Namespace) -> None: - self.split_max_tensors = args.split_max_tensors if args.split_max_tensors else 0 - self.split_max_size = GGUFWriterSplit.split_str_to_n_bytes(args.split_max_size) if args.split_max_size else 0 - self.split_style = SplitStyle.TENSORS if self.split_max_tensors \ - else SplitStyle.SIZE if self.split_max_size \ - else SplitStyle.NONE - self.dry_run = args.dry_run - self.small_first_shard = args.no_tensor_first_split - - -class GGUFWriterSplit(GGUFWriter): - kv_data: KVTempData - split_arguments: SplitArguments - shards: list[Shard] - shard_writers: list[tuple[GGUFWriter, os.PathLike[str]]] - - def __init__(self, path: os.PathLike[str] | str, arch: str, split_arguments: SplitArguments, - use_temp_file: bool = True, endianess: GGUFEndian = GGUFEndian.LITTLE - ) -> None: - # we intentionally don't 
call superclass constructor - self.arch = arch - self.path = Path(path) - self.endianess = endianess - self.kv_data = {} - self.shards = [] - self.shard_writers = [] - self.total_tensors = 0 - self.use_temp_file = use_temp_file - self.split_arguments = split_arguments - self.recent_key = None - self.state = WriterState.EMPTY - - if self.split_arguments.small_first_shard: - self.shards.append(Shard(Path(), 0, METADATA_ONLY_INDICATOR, deque())) - - def init_shards(self) -> None: - self.total_tensors = sum(shard.tensor_count for shard in self.shards) - total_size = sum(shard.size for shard in self.shards) - - # check if we need to split - if self.split_arguments.split_max_tensors and self.total_tensors < self.split_arguments.split_max_tensors: - logger.warning("Model has fewer tensors than the split threshold, not splitting") - self.split_style = SplitStyle.NONE - - if self.split_arguments.split_max_size and total_size < self.split_arguments.split_max_size: - logger.warning("Model has smaller size than the split threshold, not splitting") - self.split_style = SplitStyle.NONE - - # no shards are created when writing vocab so make one - if not self.shards: - self.shards.append(Shard(Path(), 0, METADATA_ONLY_INDICATOR, deque())) - - # format shard names - if len(self.shards) == 1: - self.shards[0].path = self.path - else: - for i in range(len(self.shards)): - self.shards[i].path = self.path.with_name(SHARD_NAME_FORMAT.format(self.path.stem, i + 1, len(self.shards))) - - # print shard info - logger.info("Writing the following files:") - for shard in self.shards: - logger.info(f" {shard.path}: n_tensors = {shard.tensor_count}, total_size = {GGUFWriterSplit.format_n_bytes_to_str(shard.size)}") - - if self.split_arguments.dry_run: - logger.info("Dry run, not writing files") - exit() - - for i, shard in enumerate(self.shards): - # add_architecture is used for consistency - examples/gguf_split doesn't add arch to all shards - writer = GGUFWriter(None, self.arch, use_temp_file=self.use_temp_file, - endianess=self.endianess, add_architecture=(i == 0)) - - # only the first shard needs all the KV data - if i == 0: - for key, (value, etype) in self.kv_data.items(): - writer.add_key_value(key, value, etype) - - # add split metadata unless it's one file - small first shard splits even with SplitStyle.NONE - if self.split_arguments.split_style != SplitStyle.NONE or self.split_arguments.small_first_shard: - writer.add_uint16(Keys.Split.LLM_KV_SPLIT_NO, i) - writer.add_uint16(Keys.Split.LLM_KV_SPLIT_COUNT, len(self.shards)) - writer.add_int32(Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT, self.total_tensors) - - # add tensors, deque popleft() ensures references to eager tensors are not kept - while True: - try: - (name, tensor, dtype) = shard.tensors.popleft() - writer.add_tensor(name, tensor, raw_dtype=dtype) - except IndexError: - break - - self.shard_writers.append((writer, shard.path)) - - def write_header_to_file(self, path: os.PathLike[str] | str | None = None) -> None: - if self.state is not WriterState.EMPTY: - raise ValueError(f'Expected GGUFWriterSplit state to be EMPTY, got {self.state}') - - for (writer, path) in self.shard_writers: - writer.write_header_to_file(path) - - self.state = WriterState.HEADER - - def write_kv_data_to_file(self) -> None: - if self.state is not WriterState.HEADER: - raise ValueError(f'Expected GGUFWriterSplit state to be HEADER, got {self.state}') - - for (writer, _) in self.shard_writers: - writer.write_kv_data_to_file() - - self.state = WriterState.KV_DATA - - def 
write_tensors_to_file(self, *, progress: bool = False) -> None: - if self.state is not WriterState.KV_DATA: - raise ValueError(f'Expected GGUFWriterSplit state to be KV_DATA, got {self.state}') - - running_total = self.total_tensors - for i in range(len(self.shard_writers)): - writer = self.shard_writers[i][0] - is_metadata = len(writer.tensors) == 0 - if is_metadata: - logger.info(f"Writing to shard {i + 1}/{len(self.shards)} with metadata only") - else: - logger.info(f"Writing to shard {i + 1}/{len(self.shards)} with {len(writer.tensors)}/{running_total} remaining tensors (of {self.total_tensors} total)") - running_total -= len(writer.tensors) - writer.write_tensors_to_file(progress=(progress and not is_metadata)) - del writer - - self.state = WriterState.TI_DATA - - # override add_key_value to handle kv data separately - def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None: - self.kv_data[key] = (val, vtype) - - def add_tensor( - self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, - raw_dtype: GGMLQuantizationType | None = None, - ) -> None: - # we build splits as tensors are added so we need logic to figure out when to split - # logic is all in the conditional because it short-circuits, otherwise accessing self.shards[-1] would throw an error - - # create a first shard to start it off - if (len(self.shards) == self.split_arguments.small_first_shard \ - # or split when over tensor limit - or (self.split_arguments.split_style == SplitStyle.TENSORS \ - and self.shards[-1].tensor_count >= self.split_arguments.split_max_tensors) \ - # or split when over size limit - or (self.split_arguments.split_style == SplitStyle.SIZE \ - and self.shards[-1].size + GGUFWriterSplit.get_tensor_size(tensor) > self.split_arguments.split_max_size)): - - # we fill in the name later when we know how many shards there are - self.shards.append(Shard(Path(), 1, GGUFWriterSplit.get_tensor_size(tensor), deque([(name, tensor, raw_dtype)]))) - else: - self.shards[-1].tensor_count += 1 - self.shards[-1].size += GGUFWriterSplit.get_tensor_size(tensor) - self.shards[-1].tensors.append((name, tensor, raw_dtype)) - - def close(self) -> None: - for (writer, _) in self.shard_writers: - writer.close() - - @staticmethod - def get_tensor_size(tensor) -> int: - try: - return tensor.data_type.elements_to_bytes(np.prod(tensor.shape)) - except AttributeError: # numpy ndarray[Any, Any] - return tensor.nbytes - - @staticmethod - def split_str_to_n_bytes(split_str: str) -> int: - if split_str.endswith("K"): - n = int(split_str[:-1]) * 1000 - elif split_str.endswith("M"): - n = int(split_str[:-1]) * 1000 * 1000 - elif split_str.endswith("G"): - n = int(split_str[:-1]) * 1000 * 1000 * 1000 - elif split_str.isnumeric(): - n = int(split_str) - else: - raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G") - - if n <= 0: - raise ValueError(f"Invalid split size: {split_str}, must be positive") - - return n - - @staticmethod - def format_n_bytes_to_str(num: int) -> str: - if num == METADATA_ONLY_INDICATOR: - return "negligible - metadata only" - fnum = float(num) - for unit in ("", "K", "M", "G"): - if abs(fnum) < 1000.0: - return f"{fnum:3.1f}{unit}" - fnum /= 1000.0 - return f"{fnum:.1f}T - over 1TB, --split recommended" From ff2dd7d30dcd7b2dc172fcf448f6a67b42504247 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sun, 9 Jun 2024 10:29:47 -0400 Subject: [PATCH 42/66] try to refactor kv data (still fails) --- 
gguf-py/gguf/gguf_writer.py | 63 ++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 84190837d8809..0a8749d7fb58c 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -36,7 +36,7 @@ SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf" -NUM_SHARD_KV_DATA = 6 +NUM_SHARD_KV_DATA = 3 METADATA_ONLY_INDICATOR = -1 KVTempData: TypeAlias = dict[str, tuple[Any, GGUFValueType | None]] # {key: (value, type)} @@ -92,11 +92,11 @@ class SplitStyle(Enum): class GGUFWriter: - fout: list[BufferedWriter | None] + fout: list[BufferedWriter | None] | None path: os.PathLike[str] | str | None temp_file: tempfile.SpooledTemporaryFile[bytes] | None tensors: list[dict[str, TensorInfo]] - kv_data: dict[str, GGUFValue] + kv_data: list[dict[str, GGUFValue]] state: WriterState _simple_value_packing = { GGUFValueType.UINT8: "B", @@ -125,7 +125,7 @@ def __init__( self.use_temp_file = use_temp_file self.temp_file = None self.tensors = [] - self.kv_data = dict() + self.kv_data = [dict()] logger.info("gguf: This GGUF file is for {0} Endian only".format( "Big" if self.endianess == GGUFEndian.BIG else "Little", )) @@ -188,6 +188,20 @@ def print_plan(self) -> None: logger.info("Dry run, not writing files") exit() + def add_shard_kv_data(self) -> None: + if self.split_arguments.split_style == SplitStyle.NONE: + return + + total_tensors = sum(len(t) for t in self.tensors) + for i in range(len(self.fout)): + try: # TODO better way to do this + self.kv_data[i] + except IndexError: + self.kv_data.append(dict()) + self.kv_data[i][Keys.Split.LLM_KV_SPLIT_NO] = GGUFValue(i, GGUFValueType.UINT16) + self.kv_data[i][Keys.Split.LLM_KV_SPLIT_COUNT] = GGUFValue(len(self.fout), GGUFValueType.UINT16) + self.kv_data[i][Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT] = GGUFValue(total_tensors, GGUFValueType.INT32) + def write_header_to_file(self, path: os.PathLike[str] | str | None = None) -> None: self.verify_arguments() self.open_output_file(path) @@ -197,50 +211,35 @@ def write_header_to_file(self, path: os.PathLike[str] | str | None = None) -> No raise ValueError(f'Expected output file to be empty, got {self.state}') assert len(self.fout) == len(self.tensors) + assert len(self.kv_data) == 1 + + self.add_shard_kv_data() for i in range(len(self.fout)): fout = self.fout[i] + #print(f"writing header: GGUF_VERSION={GGUF_VERSION}, GGUF_MAGIC={GGUF_MAGIC}, n_tensors={len(self.tensors[i])}, n_kv_data={len(self.kv_data[i])}") self._write_packed(fout, " bytearray: - total_tensors = sum(len(t) for t in self.tensors) - kv_data += self._pack_val(Keys.Split.LLM_KV_SPLIT_NO, GGUFValueType.STRING, add_vtype=False) - kv_data += self._pack_val(shard_no, GGUFValueType.UINT16, add_vtype=True) - kv_data += self._pack_val(Keys.Split.LLM_KV_SPLIT_COUNT, GGUFValueType.STRING, add_vtype=False) - kv_data += self._pack_val(len(self.fout), GGUFValueType.UINT16, add_vtype=True) - kv_data += self._pack_val(Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT, GGUFValueType.STRING, add_vtype=False) - kv_data += self._pack_val(total_tensors, GGUFValueType.INT32, add_vtype=True) - return kv_data - def write_kv_data_to_file(self) -> None: if self.state is not WriterState.HEADER: raise ValueError(f'Expected output file to contain the header, got {self.state}') assert self.fout is not None - kv_data = bytearray() - - for key, val in self.kv_data.items(): - kv_data += self._pack_val(key, GGUFValueType.STRING, add_vtype=False) - kv_data += self._pack_val(val.value, 
val.type, add_vtype=True) + for fout, kv_data in zip(self.fout, self.kv_data): + kv_bytes = bytearray() - if len(self.fout) > 1: - kv_data = self.add_shard_kv_data(kv_data, 0) + for key, val in kv_data.items(): + kv_bytes += self._pack_val(key, GGUFValueType.STRING, add_vtype=False) + kv_bytes += self._pack_val(val.value, val.type, add_vtype=True) - # only the first shard needs kv data - self.fout[0].write(kv_data) - self.fout[0].flush() + fout.write(kv_bytes) - for i in range(1, len(self.fout)): - self.fout[i].write(self.add_shard_kv_data(bytearray(), i)) - self.fout[i].flush() + self.flush() self.state = WriterState.KV_DATA def write_ti_data_to_file(self) -> None: @@ -271,7 +270,7 @@ def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None: if key in self.kv_data: raise ValueError(f'Duplicated key name {key!r}') - self.kv_data[key] = GGUFValue(value=val, type=vtype) + self.kv_data[0][key] = GGUFValue(value=val, type=vtype) def add_uint8(self, key: str, val: int) -> None: self.add_key_value(key,val, GGUFValueType.UINT8) From ba1be979ebc25f24c167e77a16840b21ec12c926 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sun, 9 Jun 2024 11:10:33 -0400 Subject: [PATCH 43/66] fix ti data messiness --- gguf-py/gguf/gguf_writer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 0a8749d7fb58c..2b64c6c0d8358 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -194,7 +194,8 @@ def add_shard_kv_data(self) -> None: total_tensors = sum(len(t) for t in self.tensors) for i in range(len(self.fout)): - try: # TODO better way to do this + # just see whether it exists + try: self.kv_data[i] except IndexError: self.kv_data.append(dict()) @@ -217,7 +218,6 @@ def write_header_to_file(self, path: os.PathLike[str] | str | None = None) -> No for i in range(len(self.fout)): fout = self.fout[i] - #print(f"writing header: GGUF_VERSION={GGUF_VERSION}, GGUF_MAGIC={GGUF_MAGIC}, n_tensors={len(self.tensors[i])}, n_kv_data={len(self.kv_data[i])}") self._write_packed(fout, " None: ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False) n_dims = len(ti.shape) ti_data += self._pack("I", n_dims) - for i in range(n_dims): - ti_data += self._pack("Q", ti.shape[n_dims - 1 - i]) + for j in range(n_dims): + ti_data += self._pack("Q", ti.shape[n_dims - 1 - j]) ti_data += self._pack("I", ti.dtype) ti_data += self._pack("Q", offset_tensor) offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment) From 0779f2f74f88db35209298412d03a1ae8d06745b Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sun, 9 Jun 2024 11:20:14 -0400 Subject: [PATCH 44/66] tidy up --- convert-hf-to-gguf.py | 7 ++++--- gguf-py/gguf/gguf_writer.py | 9 +++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 6bb25bd6b4548..de8e10813b53b 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2891,13 +2891,14 @@ def main() -> None: model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) if args.vocab_only: - logger.info(f"Exporting model vocab to '{model_instance.fname_out}'") + logger.info(f"Exporting model vocab...") model_instance.write_vocab() + logger.info(f"Model vocab successfully exported.") else: - logger.info(f"Exporting model to '{model_instance.fname_out}'") + logger.info(f"Exporting model...") model_instance.write() + logger.info(f"Model successfully exported.") - logger.info(f"Model 
successfully exported to '{model_instance.fname_out}'") if __name__ == '__main__': diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 2b64c6c0d8358..4b1025820d9e7 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -95,7 +95,7 @@ class GGUFWriter: fout: list[BufferedWriter | None] | None path: os.PathLike[str] | str | None temp_file: tempfile.SpooledTemporaryFile[bytes] | None - tensors: list[dict[str, TensorInfo]] + tensors: list[dict[str, TensorInfo | np.ndarray[Any, Any]]] kv_data: list[dict[str, GGUFValue]] state: WriterState _simple_value_packing = { @@ -182,7 +182,7 @@ def open_output_file(self, path: os.PathLike[str] | str | None = None) -> None: def print_plan(self) -> None: logger.info("Writing the following files:") for i in range(len(self.fout)): - logger.info(f" {self.fout[i].name}: n_tensors = {len(self.tensors[i])}, total_size = {GGUFWriter.format_n_bytes_to_str(GGUFWriter.get_tensors_total_size(self.tensors[i].values()))}") + logger.info(f"{self.fout[i].name}: n_tensors = {len(self.tensors[i])}, total_size = {GGUFWriter.format_n_bytes_to_str(GGUFWriter.get_tensors_total_size(self.tensors[i].values()))}") if self.split_arguments.dry_run: logger.info("Dry run, not writing files") @@ -327,8 +327,9 @@ def add_tensor_info( if self.state is not WriterState.NO_FILE: raise ValueError(f'Expected output file to be not yet opened, got {self.state}') - if name in self.tensors: - raise ValueError(f'Duplicated tensor name {name!r}') + for i in range(len(self.tensors)): + if name in self.tensors[i]: + raise ValueError(f'Duplicated tensor name {name!r}') if raw_dtype is None: if tensor_dtype == np.float16: From a234bf821b8cad89b7318efed495ad43da5988e3 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sun, 9 Jun 2024 11:23:55 -0400 Subject: [PATCH 45/66] fix linting --- convert-hf-to-gguf.py | 10 ++++------ gguf-py/gguf/gguf_writer.py | 9 ++++----- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index de8e10813b53b..271ded8c411fb 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2891,15 +2891,13 @@ def main() -> None: model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) if args.vocab_only: - logger.info(f"Exporting model vocab...") + logger.info("Exporting model vocab...") model_instance.write_vocab() - logger.info(f"Model vocab successfully exported.") + logger.info("Model vocab successfully exported.") else: - logger.info(f"Exporting model...") + logger.info("Exporting model...") model_instance.write() - logger.info(f"Model successfully exported.") - - + logger.info("Model successfully exported.") if __name__ == '__main__': main() diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 4b1025820d9e7..766ae86b44987 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -30,7 +30,6 @@ ) from .quants import quant_shape_from_byte_shape -from .constants import Keys logger = logging.getLogger(__name__) @@ -191,7 +190,7 @@ def print_plan(self) -> None: def add_shard_kv_data(self) -> None: if self.split_arguments.split_style == SplitStyle.NONE: return - + total_tensors = sum(len(t) for t in self.tensors) for i in range(len(self.fout)): # just see whether it exists @@ -746,11 +745,11 @@ def get_tensor_size(tensor) -> int: return tensor.data_type.elements_to_bytes(np.prod(tensor.shape)) except AttributeError: # numpy ndarray[Any, Any] return tensor.nbytes - + @staticmethod def 
get_tensors_total_size(tensors) -> int: return sum(GGUFWriter.get_tensor_size(ti) for ti in tensors) - + @staticmethod def split_str_to_n_bytes(split_str: str) -> int: if split_str.endswith("K"): @@ -778,4 +777,4 @@ def format_n_bytes_to_str(num: int) -> str: if abs(fnum) < 1000.0: return f"{fnum:3.1f}{unit}" fnum /= 1000.0 - return f"{fnum:.1f}T - over 1TB, --split recommended" \ No newline at end of file + return f"{fnum:.1f}T - over 1TB, --split recommended" From 49b9fbe9420365cbb6f7a8f312a41883a98ea186 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sun, 9 Jun 2024 11:37:56 -0400 Subject: [PATCH 46/66] actually make the linter happy --- convert-hf-to-gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 271ded8c411fb..aded7a48698b0 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2899,5 +2899,6 @@ def main() -> None: model_instance.write() logger.info("Model successfully exported.") + if __name__ == '__main__': main() From 0471f67f4f3e95d21f5b2da7a0e2a2a7b0215aef Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sun, 9 Jun 2024 12:40:02 -0400 Subject: [PATCH 47/66] cleanup round 1 --- gguf-py/gguf/gguf_writer.py | 65 +++++++++++++------------------------ 1 file changed, 23 insertions(+), 42 deletions(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 766ae86b44987..0ef75ff07cf79 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -56,14 +56,6 @@ class GGUFValue: type: GGUFValueType -@dataclass -class Shard: - path: Path - tensor_count: int - size: int - tensors: deque[TensorTempData] - - class SplitArguments: def __init__(self, args: Namespace) -> None: self.split_max_tensors = args.split_max_tensors if args.split_max_tensors else 0 @@ -91,10 +83,10 @@ class SplitStyle(Enum): class GGUFWriter: - fout: list[BufferedWriter | None] | None + fout: list[BufferedWriter] | None path: os.PathLike[str] | str | None temp_file: tempfile.SpooledTemporaryFile[bytes] | None - tensors: list[dict[str, TensorInfo | np.ndarray[Any, Any]]] + tensors: list[dict[str, TensorInfo]] kv_data: list[dict[str, GGUFValue]] state: WriterState _simple_value_packing = { @@ -137,7 +129,7 @@ def __init__( def verify_arguments(self) -> None: total_tensors = sum(len(ti) for ti in self.tensors) - total_size = sum(sum(GGUFWriter.get_tensor_size(ti) for ti in t.values()) for t in self.tensors) + total_size = sum(ti.nbytes for t in self.tensors for ti in t.values()) if self.split_arguments.split_max_tensors and total_tensors < self.split_arguments.split_max_tensors: logger.warning("Model has fewer tensors than the split threshold, not splitting") @@ -149,10 +141,10 @@ def verify_arguments(self) -> None: # no shards are created when writing vocab so make one if not self.tensors or len(self.tensors) == 0: - self.tensors.append(dict()) + self.tensors = [dict()] - def format_shard_names(self) -> list[os.PathLike[str]]: - pathobj = Path(self.path) + def format_shard_names(self, path: os.PathLike[str] | str | None = None) -> list[os.PathLike[str]]: + pathobj = Path(path) if self.split_arguments.split_style == SplitStyle.NONE: return [pathobj] @@ -174,14 +166,15 @@ def open_output_file(self, path: os.PathLike[str] | str | None = None) -> None: if self.path is not None: self.fout = [] - for fout in self.format_shard_names(): + for fout in self.format_shard_names(self.path): self.fout.append(open(fout, "wb")) self.state = WriterState.EMPTY - def print_plan(self) -> None: + def 
print_plan(self, path: os.PathLike[str] | str | None = None) -> None: logger.info("Writing the following files:") - for i in range(len(self.fout)): - logger.info(f"{self.fout[i].name}: n_tensors = {len(self.tensors[i])}, total_size = {GGUFWriter.format_n_bytes_to_str(GGUFWriter.get_tensors_total_size(self.tensors[i].values()))}") + filenames = self.format_shard_names(path) + for i in range(len(filenames)): + logger.info(f"{filenames[i]}: n_tensors = {len(self.tensors[i])}, total_size = {GGUFWriter.format_n_bytes_to_str(sum(ti.nbytes for ti in self.tensors[i].values()))}") if self.split_arguments.dry_run: logger.info("Dry run, not writing files") @@ -204,8 +197,8 @@ def add_shard_kv_data(self) -> None: def write_header_to_file(self, path: os.PathLike[str] | str | None = None) -> None: self.verify_arguments() + self.print_plan(path) self.open_output_file(path) - self.print_plan() if self.state is not WriterState.EMPTY: raise ValueError(f'Expected output file to be empty, got {self.state}') @@ -215,13 +208,12 @@ def write_header_to_file(self, path: os.PathLike[str] | str | None = None) -> No self.add_shard_kv_data() - for i in range(len(self.fout)): - fout = self.fout[i] + for fout, tensors, kv_data in zip(self.fout, self.tensors, self.kv_data): self._write_packed(fout, " None: @@ -246,12 +238,12 @@ def write_ti_data_to_file(self) -> None: raise ValueError(f'Expected output file to contain KV data, got {self.state}') assert self.fout is not None - for i in range(len(self.fout)): - assert self.fout[i] is not None + for fout, tensors in zip(self.fout, self.tensors): + assert fout is not None ti_data = bytearray() offset_tensor = 0 - for name, ti in self.tensors[i].items(): + for name, ti in tensors.items(): ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False) n_dims = len(ti.shape) ti_data += self._pack("I", n_dims) @@ -261,8 +253,8 @@ def write_ti_data_to_file(self) -> None: ti_data += self._pack("Q", offset_tensor) offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment) - self.fout[i].write(ti_data) - self.fout[i].flush() + fout.write(ti_data) + fout.flush() self.state = WriterState.TI_DATA def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None: @@ -359,7 +351,7 @@ def add_tensor_info( and len(self.tensors[-1]) >= self.split_arguments.split_max_tensors) \ # or split when over size limit or (self.split_arguments.split_style == SplitStyle.SIZE \ - and GGUFWriter.get_tensors_total_size(self.tensors[-1].values()) + tensor_nbytes > self.split_arguments.split_max_size)): + and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_arguments.split_max_size)): self.tensors.append(dict()) @@ -424,7 +416,7 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: if progress: from tqdm import tqdm - total_bytes = GGUFWriter.get_tensors_total_size(self.tensors[i].values()) + total_bytes = sum(ti.nbytes for ti in self.tensors[i].values()) bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True) @@ -739,17 +731,6 @@ def _write_packed(self, fout: BufferedWriter, fmt: str, value: Any, skip_pack_pr assert fout is not None fout.write(self._pack(fmt, value, skip_pack_prefix)) - @staticmethod - def get_tensor_size(tensor) -> int: - try: - return tensor.data_type.elements_to_bytes(np.prod(tensor.shape)) - except AttributeError: # numpy ndarray[Any, Any] - return tensor.nbytes - - @staticmethod - def get_tensors_total_size(tensors) -> int: - return sum(GGUFWriter.get_tensor_size(ti) for ti in tensors) - 
@staticmethod def split_str_to_n_bytes(split_str: str) -> int: if split_str.endswith("K"): From 5a96b8f27f9c3890003a9aa7ad94337cdeb1f9c5 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sun, 9 Jun 2024 13:08:06 -0400 Subject: [PATCH 48/66] remove SplitStrategy, SplitArguments --- convert-hf-to-gguf.py | 39 ++++++++++----- gguf-py/gguf/gguf_writer.py | 98 +++++++++++-------------------------- 2 files changed, 55 insertions(+), 82 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index aded7a48698b0..d1ecfbe8ad709 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -66,7 +66,7 @@ class Model: model_arch: gguf.MODEL_ARCH def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, - split_arguments: gguf.SplitArguments, model_name: str | None): + model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = 0, small_first_shard: bool = 0): if type(self) is Model: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") self.dir_model = dir_model @@ -97,8 +97,8 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, ftype_lw: str = ftype_up.lower() # allow templating the file name with the output ftype, useful with the "auto" ftype self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) - self.gguf_writer = gguf.GGUFWriter(None, gguf.MODEL_ARCH_NAMES[self.model_arch], split_arguments, - endianess=self.endianess, use_temp_file=self.use_temp_file) + self.gguf_writer = gguf.GGUFWriter(None, gguf.MODEL_ARCH_NAMES[self.model_arch],endianess=self.endianess, use_temp_file=self.use_temp_file, + split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) @classmethod def __init_subclass__(cls): @@ -334,7 +334,7 @@ def write(self): self.gguf_writer.close() def write_vocab(self): - if self.gguf_writer.split_arguments.split_style != gguf.SplitStyle.NONE: + if len(self.gguf_writer.tensors) != 1: raise ValueError('Splitting the vocabulary is not supported') self.gguf_writer.write_header_to_file(self.fname_out) self.gguf_writer.write_kv_data_to_file() @@ -2806,11 +2806,11 @@ def parse_args() -> argparse.Namespace: help="increase output verbosity", ) parser.add_argument( - "--split-max-tensors", type=int, + "--split-max-tensors", type=int, default=0, help="max tensors in each split", ) parser.add_argument( - "--split-max-size", type=str, + "--split-max-size", type=str, default="0", help="max size per split N(M|G)", ) parser.add_argument( @@ -2825,6 +2825,24 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() +def split_str_to_n_bytes(split_str: str) -> int: + if split_str.endswith("K"): + n = int(split_str[:-1]) * 1000 + elif split_str.endswith("M"): + n = int(split_str[:-1]) * 1000 * 1000 + elif split_str.endswith("G"): + n = int(split_str[:-1]) * 1000 * 1000 * 1000 + elif split_str.isnumeric(): + n = int(split_str) + else: + raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G") + + if n < 0: + raise ValueError(f"Invalid split size: {split_str}, must be positive") + + return n + + def main() -> None: args = parse_args() @@ -2849,11 +2867,6 @@ def main() -> None: logger.error(f'Error: {args.model} is not a directory') sys.exit(1) - if args.split_max_tensors and args.split_max_size: - raise 
ValueError("Can't specify both --split-max-tensors and --split-max-size") - - split_arguments = gguf.SplitArguments(args) - ftype_map: dict[str, gguf.LlamaFileType] = { "f32": gguf.LlamaFileType.ALL_F32, "f16": gguf.LlamaFileType.MOSTLY_F16, @@ -2880,7 +2893,9 @@ def main() -> None: sys.exit(1) model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, - args.no_lazy, split_arguments, args.model_name) + args.no_lazy, args.model_name, split_max_tensors=args.split_max_tensors, + split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, + small_first_shard=args.no_tensor_first_split) logger.info("Set model parameters") model_instance.set_gguf_parameters() diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 0ef75ff07cf79..75ba421016bff 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -5,8 +5,6 @@ import shutil import struct import tempfile -from argparse import Namespace -from collections import deque from dataclasses import dataclass from enum import Enum, auto from pathlib import Path @@ -56,17 +54,6 @@ class GGUFValue: type: GGUFValueType -class SplitArguments: - def __init__(self, args: Namespace) -> None: - self.split_max_tensors = args.split_max_tensors if args.split_max_tensors else 0 - self.split_max_size = GGUFWriter.split_str_to_n_bytes(args.split_max_size) if args.split_max_size else 0 - self.split_style = SplitStyle.TENSORS if self.split_max_tensors \ - else SplitStyle.SIZE if self.split_max_size \ - else SplitStyle.NONE - self.dry_run = args.dry_run - self.small_first_shard = args.no_tensor_first_split - - class WriterState(Enum): NO_FILE = auto() EMPTY = auto() @@ -76,12 +63,6 @@ class WriterState(Enum): WEIGHTS = auto() -class SplitStyle(Enum): - NONE = auto() - TENSORS = auto() - SIZE = auto() - - class GGUFWriter: fout: list[BufferedWriter] | None path: os.PathLike[str] | str | None @@ -104,40 +85,34 @@ class GGUFWriter: } def __init__( - self, path: os.PathLike[str] | str | None, arch: str, split_arguments: SplitArguments, - use_temp_file: bool = False, endianess: GGUFEndian = GGUFEndian.LITTLE - ): + self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False, endianess: GGUFEndian = GGUFEndian.LITTLE, + split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): self.fout = [] self.path = path self.arch = arch self.endianess = endianess self.data_alignment = GGUF_DEFAULT_ALIGNMENT - self.split_arguments = split_arguments self.use_temp_file = use_temp_file self.temp_file = None self.tensors = [] self.kv_data = [dict()] + self.split_max_tensors = split_max_tensors + self.split_max_size = split_max_size + self.dry_run = dry_run + self.small_first_shard = small_first_shard logger.info("gguf: This GGUF file is for {0} Endian only".format( "Big" if self.endianess == GGUFEndian.BIG else "Little", )) self.state = WriterState.NO_FILE - if self.split_arguments.small_first_shard: + if self.small_first_shard: self.tensors.append(dict()) self.add_architecture() def verify_arguments(self) -> None: - total_tensors = sum(len(ti) for ti in self.tensors) - total_size = sum(ti.nbytes for t in self.tensors for ti in t.values()) - - if self.split_arguments.split_max_tensors and total_tensors < self.split_arguments.split_max_tensors: - logger.warning("Model has fewer tensors than the split threshold, not splitting") - self.split_style = SplitStyle.NONE - - if self.split_arguments.split_max_size and 
total_size < self.split_arguments.split_max_size: - logger.warning("Model has smaller size than the split threshold, not splitting") - self.split_style = SplitStyle.NONE + if len(self.tensors) == 1: + logger.warning("Model fails split requirements, not splitting") # no shards are created when writing vocab so make one if not self.tensors or len(self.tensors) == 0: @@ -145,7 +120,7 @@ def verify_arguments(self) -> None: def format_shard_names(self, path: os.PathLike[str] | str | None = None) -> list[os.PathLike[str]]: pathobj = Path(path) - if self.split_arguments.split_style == SplitStyle.NONE: + if len(self.tensors) == 1: return [pathobj] shard_names = [] @@ -173,15 +148,16 @@ def open_output_file(self, path: os.PathLike[str] | str | None = None) -> None: def print_plan(self, path: os.PathLike[str] | str | None = None) -> None: logger.info("Writing the following files:") filenames = self.format_shard_names(path) - for i in range(len(filenames)): - logger.info(f"{filenames[i]}: n_tensors = {len(self.tensors[i])}, total_size = {GGUFWriter.format_n_bytes_to_str(sum(ti.nbytes for ti in self.tensors[i].values()))}") + assert len(filenames) == len(self.tensors) + for name, tensors in zip(filenames, self.tensors): + logger.info(f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(sum(ti.nbytes for ti in tensors.values()))}") - if self.split_arguments.dry_run: + if self.dry_run: logger.info("Dry run, not writing files") exit() def add_shard_kv_data(self) -> None: - if self.split_arguments.split_style == SplitStyle.NONE: + if len(self.tensors) == 1: return total_tensors = sum(len(t) for t in self.tensors) @@ -318,8 +294,8 @@ def add_tensor_info( if self.state is not WriterState.NO_FILE: raise ValueError(f'Expected output file to be not yet opened, got {self.state}') - for i in range(len(self.tensors)): - if name in self.tensors[i]: + for tensors in self.tensors: + if name in tensors: raise ValueError(f'Duplicated tensor name {name!r}') if raw_dtype is None: @@ -345,13 +321,13 @@ def add_tensor_info( tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype) # create splits as necessary, such as to start it off - if (len(self.tensors) == self.split_arguments.small_first_shard \ + if (len(self.tensors) == self.small_first_shard \ # or split when over tensor limit - or (self.split_arguments.split_style == SplitStyle.TENSORS \ - and len(self.tensors[-1]) >= self.split_arguments.split_max_tensors) \ + or self.split_max_tensors != 0 and \ + len(self.tensors[-1]) >= self.split_max_tensors \ # or split when over size limit - or (self.split_arguments.split_style == SplitStyle.SIZE \ - and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_arguments.split_max_size)): + or self.split_max_size != 0 and \ + sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_max_size): self.tensors.append(dict()) @@ -409,25 +385,25 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: self.write_padding(fout, fout.tell()) if self.temp_file is None: - for i in range(len(self.fout)): - assert self.fout[i] is not None + for fout, tensors in zip(self.fout, self.tensors): + assert fout is not None bar = None if progress: from tqdm import tqdm - total_bytes = sum(ti.nbytes for ti in self.tensors[i].values()) + total_bytes = sum(ti.nbytes for ti in tensors.values()) bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True) # relying on the fact that Python dicts preserve insertion order (since 3.7) - 
for ti in self.tensors[i].values(): + for ti in tensors.values(): assert ti.tensor is not None # can only iterate once over the tensors assert ti.tensor.nbytes == ti.nbytes - ti.tensor.tofile(self.fout[i]) + ti.tensor.tofile(fout) if bar is not None: bar.update(ti.nbytes) - self.write_padding(self.fout[i], ti.nbytes) + self.write_padding(fout, ti.nbytes) ti.tensor = None else: self.temp_file.seek(0) @@ -731,24 +707,6 @@ def _write_packed(self, fout: BufferedWriter, fmt: str, value: Any, skip_pack_pr assert fout is not None fout.write(self._pack(fmt, value, skip_pack_prefix)) - @staticmethod - def split_str_to_n_bytes(split_str: str) -> int: - if split_str.endswith("K"): - n = int(split_str[:-1]) * 1000 - elif split_str.endswith("M"): - n = int(split_str[:-1]) * 1000 * 1000 - elif split_str.endswith("G"): - n = int(split_str[:-1]) * 1000 * 1000 * 1000 - elif split_str.isnumeric(): - n = int(split_str) - else: - raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G") - - if n <= 0: - raise ValueError(f"Invalid split size: {split_str}, must be positive") - - return n - @staticmethod def format_n_bytes_to_str(num: int) -> str: if num == METADATA_ONLY_INDICATOR: From f7ecd996910601efdd9ce429f8503b5d04411140 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sun, 9 Jun 2024 13:09:05 -0400 Subject: [PATCH 49/66] appease linter --- gguf-py/gguf/gguf_writer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 75ba421016bff..f5860533d2a34 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -86,7 +86,8 @@ class GGUFWriter: def __init__( self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False, endianess: GGUFEndian = GGUFEndian.LITTLE, - split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): + split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False + ): self.fout = [] self.path = path self.arch = arch From 9d7f694438f347d0bd1f79044ef6fa565caeb236 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sun, 9 Jun 2024 16:02:23 -0400 Subject: [PATCH 50/66] fix typing and clean up --- convert-hf-to-gguf.py | 6 ++- gguf-py/gguf/gguf_writer.py | 95 +++++++++++++------------------------ 2 files changed, 39 insertions(+), 62 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index d1ecfbe8ad709..e1cb05fea8ce4 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -66,7 +66,7 @@ class Model: model_arch: gguf.MODEL_ARCH def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, - model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = 0, small_first_shard: bool = 0): + model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): if type(self) is Model: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") self.dir_model = dir_model @@ -2875,6 +2875,10 @@ def main() -> None: "auto": gguf.LlamaFileType.GUESSED, } + if args.use_temp_file and (args.split_max_tensors > 0 or args.split_max_size != "0"): + logger.error("Error: Cannot use temp file when splitting") + sys.exit(1) + if args.outfile is not None: fname_out = args.outfile else: diff --git a/gguf-py/gguf/gguf_writer.py 
b/gguf-py/gguf/gguf_writer.py index f5860533d2a34..19556c55203f0 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -9,7 +9,7 @@ from enum import Enum, auto from pathlib import Path from io import BufferedWriter -from typing import IO, Any, Sequence, Mapping, TypeAlias +from typing import IO, Any, Sequence, Mapping from string import ascii_letters, digits import numpy as np @@ -33,11 +33,6 @@ SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf" -NUM_SHARD_KV_DATA = 3 -METADATA_ONLY_INDICATOR = -1 - -KVTempData: TypeAlias = dict[str, tuple[Any, GGUFValueType | None]] # {key: (value, type)} -TensorTempData: TypeAlias = tuple[str, np.ndarray[Any, Any], GGMLQuantizationType | None] # (tensor name, tensor data, tensor dtype) @dataclass @@ -65,7 +60,7 @@ class WriterState(Enum): class GGUFWriter: fout: list[BufferedWriter] | None - path: os.PathLike[str] | str | None + path: Path | None temp_file: tempfile.SpooledTemporaryFile[bytes] | None tensors: list[dict[str, TensorInfo]] kv_data: list[dict[str, GGUFValue]] @@ -88,15 +83,15 @@ def __init__( self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False, endianess: GGUFEndian = GGUFEndian.LITTLE, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False ): - self.fout = [] - self.path = path + self.fout = None + self.path = Path(path) if path else None self.arch = arch self.endianess = endianess self.data_alignment = GGUF_DEFAULT_ALIGNMENT self.use_temp_file = use_temp_file self.temp_file = None - self.tensors = [] - self.kv_data = [dict()] + self.tensors = [{}] + self.kv_data = [{}] self.split_max_tensors = split_max_tensors self.split_max_size = split_max_size self.dry_run = dry_run @@ -107,30 +102,16 @@ def __init__( self.state = WriterState.NO_FILE if self.small_first_shard: - self.tensors.append(dict()) + self.tensors.append({}) self.add_architecture() - def verify_arguments(self) -> None: - if len(self.tensors) == 1: - logger.warning("Model fails split requirements, not splitting") - - # no shards are created when writing vocab so make one - if not self.tensors or len(self.tensors) == 0: - self.tensors = [dict()] - - def format_shard_names(self, path: os.PathLike[str] | str | None = None) -> list[os.PathLike[str]]: - pathobj = Path(path) + def format_shard_names(self, path: Path) -> list[Path]: if len(self.tensors) == 1: - return [pathobj] - - shard_names = [] - for i in range(len(self.tensors)): - shard_names.append(pathobj.with_name(SHARD_NAME_FORMAT.format(pathobj.stem, i + 1, len(self.tensors)))) - - return shard_names + return [path] + return [path.with_name(SHARD_NAME_FORMAT.format(path.stem, i + 1, len(self.tensors))) for i in range(len(self.tensors))] - def open_output_file(self, path: os.PathLike[str] | str | None = None) -> None: + def open_output_file(self, path: Path | None = None) -> None: if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path): # allow calling this multiple times as long as the path is the same return @@ -141,14 +122,14 @@ def open_output_file(self, path: os.PathLike[str] | str | None = None) -> None: self.path = path if self.path is not None: - self.fout = [] - for fout in self.format_shard_names(self.path): - self.fout.append(open(fout, "wb")) + self.print_plan() + self.fout = [open(filename, "wb") for filename in self.format_shard_names(self.path)] self.state = WriterState.EMPTY - def print_plan(self, path: os.PathLike[str] | str | None = None) -> None: + def 
print_plan(self) -> None: logger.info("Writing the following files:") - filenames = self.format_shard_names(path) + assert self.path is not None + filenames = self.format_shard_names(self.path) assert len(filenames) == len(self.tensors) for name, tensors in zip(filenames, self.tensors): logger.info(f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(sum(ti.nbytes for ti in tensors.values()))}") @@ -162,24 +143,28 @@ def add_shard_kv_data(self) -> None: return total_tensors = sum(len(t) for t in self.tensors) - for i in range(len(self.fout)): + assert self.fout is not None + total_splits = len(self.fout) + for i in range(total_splits): # just see whether it exists try: self.kv_data[i] except IndexError: self.kv_data.append(dict()) self.kv_data[i][Keys.Split.LLM_KV_SPLIT_NO] = GGUFValue(i, GGUFValueType.UINT16) - self.kv_data[i][Keys.Split.LLM_KV_SPLIT_COUNT] = GGUFValue(len(self.fout), GGUFValueType.UINT16) + self.kv_data[i][Keys.Split.LLM_KV_SPLIT_COUNT] = GGUFValue(total_splits, GGUFValueType.UINT16) self.kv_data[i][Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT] = GGUFValue(total_tensors, GGUFValueType.INT32) - def write_header_to_file(self, path: os.PathLike[str] | str | None = None) -> None: - self.verify_arguments() - self.print_plan(path) + def write_header_to_file(self, path: Path | None = None) -> None: + if len(self.tensors) == 1: + logger.warning("Model fails split requirements, not splitting") + self.open_output_file(path) if self.state is not WriterState.EMPTY: raise ValueError(f'Expected output file to be empty, got {self.state}') + assert self.fout is not None assert len(self.fout) == len(self.tensors) assert len(self.kv_data) == 1 @@ -216,7 +201,6 @@ def write_ti_data_to_file(self) -> None: assert self.fout is not None for fout, tensors in zip(self.fout, self.tensors): - assert fout is not None ti_data = bytearray() offset_tensor = 0 @@ -235,7 +219,7 @@ def write_ti_data_to_file(self) -> None: self.state = WriterState.TI_DATA def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None: - if key in self.kv_data: + if any(key in kv_data for kv_data in self.kv_data): raise ValueError(f'Duplicated key name {key!r}') self.kv_data[0][key] = GGUFValue(value=val, type=vtype) @@ -279,9 +263,6 @@ def add_string(self, key: str, val: str) -> None: self.add_key_value(key, val, GGUFValueType.STRING) def add_array(self, key: str, val: Sequence[Any]) -> None: - if not isinstance(val, Sequence): - raise ValueError("Value must be a sequence for array type") - self.add_key_value(key, val, GGUFValueType.ARRAY) @staticmethod @@ -295,9 +276,8 @@ def add_tensor_info( if self.state is not WriterState.NO_FILE: raise ValueError(f'Expected output file to be not yet opened, got {self.state}') - for tensors in self.tensors: - if name in tensors: - raise ValueError(f'Duplicated tensor name {name!r}') + if any(name in tensors for tensors in self.tensors): + raise ValueError(f'Duplicated tensor name {name!r}') if raw_dtype is None: if tensor_dtype == np.float16: @@ -321,10 +301,8 @@ def add_tensor_info( if tensor_dtype == np.uint8: tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype) - # create splits as necessary, such as to start it off - if (len(self.tensors) == self.small_first_shard \ - # or split when over tensor limit - or self.split_max_tensors != 0 and \ + # split when over tensor limit + if (self.split_max_tensors != 0 and \ len(self.tensors[-1]) >= self.split_max_tensors \ # or split when over size limit or self.split_max_size != 0 and \ @@ 
-369,7 +347,6 @@ def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None: tensor.byteswap(inplace=True) for fout in self.fout: - assert fout is not None self.write_padding(fout, fout.tell()) tensor.tofile(fout) self.write_padding(fout, tensor.nbytes) @@ -382,12 +359,10 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: assert self.fout is not None for fout in self.fout: - assert fout is not None self.write_padding(fout, fout.tell()) if self.temp_file is None: for fout, tensors in zip(self.fout, self.tensors): - assert fout is not None bar = None if progress: @@ -409,7 +384,8 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: else: self.temp_file.seek(0) - shutil.copyfileobj(self.temp_file, self.fout) + assert self.fout is not None + shutil.copyfileobj(self.temp_file, self.fout[0 if not self.small_first_shard else 1]) self.flush() self.temp_file.close() @@ -418,14 +394,12 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: def flush(self) -> None: assert self.fout is not None for fout in self.fout: - assert fout is not None fout.flush() def close(self) -> None: if self.fout is not None: for fout in self.fout: - if fout is not None: - fout.close() + fout.close() self.fout = [] def add_architecture(self) -> None: @@ -705,12 +679,11 @@ def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool) -> bytes: return kv_data def _write_packed(self, fout: BufferedWriter, fmt: str, value: Any, skip_pack_prefix: bool = False) -> None: - assert fout is not None fout.write(self._pack(fmt, value, skip_pack_prefix)) @staticmethod def format_n_bytes_to_str(num: int) -> str: - if num == METADATA_ONLY_INDICATOR: + if num == 0: return "negligible - metadata only" fnum = float(num) for unit in ("", "K", "M", "G"): From 0417104397e54dce71cd2c6e9c23b11f2acf0d60 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sun, 9 Jun 2024 16:05:08 -0400 Subject: [PATCH 51/66] fix linting --- gguf-py/gguf/gguf_writer.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 19556c55203f0..69a4a3ff6601a 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -302,11 +302,9 @@ def add_tensor_info( tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype) # split when over tensor limit - if (self.split_max_tensors != 0 and \ - len(self.tensors[-1]) >= self.split_max_tensors \ - # or split when over size limit - or self.split_max_size != 0 and \ - sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_max_size): + if (self.split_max_tensors != 0 and len(self.tensors[-1]) >= self.split_max_tensors \ + # or split when over size limit + or self.split_max_size != 0 and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_max_size): self.tensors.append(dict()) From 70a6bc91cc0e1f444647ef35896c8b3c4a1701bf Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Date: Sun, 9 Jun 2024 17:08:11 -0400 Subject: [PATCH 52/66] Update gguf-py/gguf/gguf_writer.py Co-authored-by: compilade --- gguf-py/gguf/gguf_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 69a4a3ff6601a..39cdb227626ec 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -398,7 +398,7 @@ def close(self) -> None: if self.fout is not None: for fout in self.fout: 
fout.close() - self.fout = [] + self.fout = None def add_architecture(self) -> None: self.add_string(Keys.General.ARCHITECTURE, self.arch) From 1e2d9cb589dafa8f6a41e68d0efccd31fbecd5f4 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sun, 9 Jun 2024 17:31:25 -0400 Subject: [PATCH 53/66] progress bar, fix split logic --- gguf-py/gguf/gguf_writer.py | 38 +++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 39cdb227626ec..a102cd123c200 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -145,12 +145,8 @@ def add_shard_kv_data(self) -> None: total_tensors = sum(len(t) for t in self.tensors) assert self.fout is not None total_splits = len(self.fout) + self.kv_data.extend({} for _ in range(len(self.kv_data), total_splits)) for i in range(total_splits): - # just see whether it exists - try: - self.kv_data[i] - except IndexError: - self.kv_data.append(dict()) self.kv_data[i][Keys.Split.LLM_KV_SPLIT_NO] = GGUFValue(i, GGUFValueType.UINT16) self.kv_data[i][Keys.Split.LLM_KV_SPLIT_COUNT] = GGUFValue(total_splits, GGUFValueType.UINT16) self.kv_data[i][Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT] = GGUFValue(total_tensors, GGUFValueType.INT32) @@ -301,10 +297,12 @@ def add_tensor_info( if tensor_dtype == np.uint8: tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype) - # split when over tensor limit - if (self.split_max_tensors != 0 and len(self.tensors[-1]) >= self.split_max_tensors \ + # make sure there is at least one tensor before splitting + if (len(self.tensors[-1]) > 0 + # split when over tensor limit + and (self.split_max_tensors != 0 and len(self.tensors[-1]) >= self.split_max_tensors) # or split when over size limit - or self.split_max_size != 0 and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_max_size): + or (self.split_max_size != 0 and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_max_size)): self.tensors.append(dict()) @@ -360,15 +358,25 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: self.write_padding(fout, fout.tell()) if self.temp_file is None: - for fout, tensors in zip(self.fout, self.tensors): - bar = None + bar = None + shard_bar = None - if progress: - from tqdm import tqdm + if progress: + from tqdm import tqdm - total_bytes = sum(ti.nbytes for ti in tensors.values()) + total_bytes = sum(ti.nbytes for t in self.tensors for ti in t.values()) - bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True) + bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True) + shard_bar = tqdm(desc="Shard progress", total=total_bytes, unit="byte", unit_scale=True) + + for i, (fout, tensors) in enumerate(zip(self.fout, self.tensors)): + if bar and len(self.fout) > 1: + bar.desc = f"Writing ({i + 1}/{len(self.fout)})" + if shard_bar and len(self.fout) > 1: + total = sum(ti.nbytes for ti in tensors.values()) + # bar behaves weirdly when total is 0 + if total > 0: + shard_bar.reset(total=total) # relying on the fact that Python dicts preserve insertion order (since 3.7) for ti in tensors.values(): @@ -377,6 +385,8 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: ti.tensor.tofile(fout) if bar is not None: bar.update(ti.nbytes) + if shard_bar is not None: + shard_bar.update(ti.nbytes) self.write_padding(fout, ti.nbytes) ti.tensor = None else: From f7e79839464bd6048eb1bc3c671f6bb42c553e47 Mon Sep 17 
00:00:00 2001 From: Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Date: Sun, 9 Jun 2024 20:17:25 -0400 Subject: [PATCH 54/66] Update gguf-py/gguf/gguf_writer.py Co-authored-by: compilade --- gguf-py/gguf/gguf_writer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index a102cd123c200..7ff14cb88cf06 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -167,10 +167,10 @@ def write_header_to_file(self, path: Path | None = None) -> None: self.add_shard_kv_data() for fout, tensors, kv_data in zip(self.fout, self.tensors, self.kv_data): - self._write_packed(fout, " Date: Sun, 9 Jun 2024 20:22:17 -0400 Subject: [PATCH 55/66] catch oversights --- gguf-py/gguf/gguf_writer.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 7ff14cb88cf06..b76a0ab596778 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -122,11 +122,11 @@ def open_output_file(self, path: Path | None = None) -> None: self.path = path if self.path is not None: - self.print_plan() - self.fout = [open(filename, "wb") for filename in self.format_shard_names(self.path)] + filenames = self.print_plan() + self.fout = [open(filename, "wb") for filename in filenames] self.state = WriterState.EMPTY - def print_plan(self) -> None: + def print_plan(self) -> list[Path]: logger.info("Writing the following files:") assert self.path is not None filenames = self.format_shard_names(self.path) @@ -138,6 +138,8 @@ def print_plan(self) -> None: logger.info("Dry run, not writing files") exit() + return filenames + def add_shard_kv_data(self) -> None: if len(self.tensors) == 1: return @@ -152,7 +154,7 @@ def add_shard_kv_data(self) -> None: self.kv_data[i][Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT] = GGUFValue(total_tensors, GGUFValueType.INT32) def write_header_to_file(self, path: Path | None = None) -> None: - if len(self.tensors) == 1: + if len(self.tensors) == 1 and (self.split_max_tensors != 0 or self.split_max_size != 0): logger.warning("Model fails split requirements, not splitting") self.open_output_file(path) @@ -298,13 +300,15 @@ def add_tensor_info( tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype) # make sure there is at least one tensor before splitting - if (len(self.tensors[-1]) > 0 - # split when over tensor limit - and (self.split_max_tensors != 0 and len(self.tensors[-1]) >= self.split_max_tensors) - # or split when over size limit - or (self.split_max_size != 0 and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_max_size)): - - self.tensors.append(dict()) + if len(self.tensors[-1]) > 0: + if ( # split when over tensor limit + self.split_max_tensors != 0 + and len(self.tensors[-1]) >= self.split_max_tensors + ) or ( # split when over size limit + self.split_max_size != 0 + and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_max_size + ): + self.tensors.append({}) self.tensors[-1][name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes) @@ -367,12 +371,12 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: total_bytes = sum(ti.nbytes for t in self.tensors for ti in t.values()) bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True) - shard_bar = tqdm(desc="Shard progress", total=total_bytes, unit="byte", unit_scale=True) + if 
len(self.fout) > 1: + shard_bar = tqdm(desc=f"Shard (1/{len(self.fout)})", total=total_bytes, unit="byte", unit_scale=True) for i, (fout, tensors) in enumerate(zip(self.fout, self.tensors)): - if bar and len(self.fout) > 1: - bar.desc = f"Writing ({i + 1}/{len(self.fout)})" if shard_bar and len(self.fout) > 1: + shard_bar.set_description(f"Shard ({i + 1}/{len(self.fout)})") total = sum(ti.nbytes for ti in tensors.values()) # bar behaves weirdly when total is 0 if total > 0: @@ -686,9 +690,6 @@ def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool) -> bytes: return kv_data - def _write_packed(self, fout: BufferedWriter, fmt: str, value: Any, skip_pack_prefix: bool = False) -> None: - fout.write(self._pack(fmt, value, skip_pack_prefix)) - @staticmethod def format_n_bytes_to_str(num: int) -> str: if num == 0: From 7eea552db8e85e04a1809d22ca29a5d97ce8be68 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Date: Mon, 10 Jun 2024 07:54:06 -0400 Subject: [PATCH 56/66] Update gguf-py/gguf/gguf_writer.py Co-authored-by: compilade --- gguf-py/gguf/gguf_writer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index b76a0ab596778..7d204a0e7e3ae 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -148,10 +148,10 @@ def add_shard_kv_data(self) -> None: assert self.fout is not None total_splits = len(self.fout) self.kv_data.extend({} for _ in range(len(self.kv_data), total_splits)) - for i in range(total_splits): - self.kv_data[i][Keys.Split.LLM_KV_SPLIT_NO] = GGUFValue(i, GGUFValueType.UINT16) - self.kv_data[i][Keys.Split.LLM_KV_SPLIT_COUNT] = GGUFValue(total_splits, GGUFValueType.UINT16) - self.kv_data[i][Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT] = GGUFValue(total_tensors, GGUFValueType.INT32) + for i, kv_data in enumerate(self.kv_data): + kv_data[Keys.Split.LLM_KV_SPLIT_NO] = GGUFValue(i, GGUFValueType.UINT16) + kv_data[Keys.Split.LLM_KV_SPLIT_COUNT] = GGUFValue(total_splits, GGUFValueType.UINT16) + kv_data[Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT] = GGUFValue(total_tensors, GGUFValueType.INT32) def write_header_to_file(self, path: Path | None = None) -> None: if len(self.tensors) == 1 and (self.split_max_tensors != 0 or self.split_max_size != 0): From 99f9a248056a610e6fda7839037ea705c66c4464 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Date: Mon, 10 Jun 2024 07:54:18 -0400 Subject: [PATCH 57/66] Update gguf-py/gguf/gguf_writer.py Co-authored-by: compilade --- gguf-py/gguf/gguf_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 7d204a0e7e3ae..099f99cd8a377 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -375,7 +375,7 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: shard_bar = tqdm(desc=f"Shard (1/{len(self.fout)})", total=total_bytes, unit="byte", unit_scale=True) for i, (fout, tensors) in enumerate(zip(self.fout, self.tensors)): - if shard_bar and len(self.fout) > 1: + if shard_bar is not None: shard_bar.set_description(f"Shard ({i + 1}/{len(self.fout)})") total = sum(ti.nbytes for ti in tensors.values()) # bar behaves weirdly when total is 0 From ad02c9409a6d3e72130b08a3261320ef01617589 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Date: Mon, 10 Jun 2024 07:54:50 -0400 Subject: 
[PATCH 58/66] Update gguf-py/gguf/gguf_writer.py Co-authored-by: compilade --- gguf-py/gguf/gguf_writer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 099f99cd8a377..ded0a63ebcf05 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -396,7 +396,6 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: else: self.temp_file.seek(0) - assert self.fout is not None shutil.copyfileobj(self.temp_file, self.fout[0 if not self.small_first_shard else 1]) self.flush() self.temp_file.close() From c1b1a29266a555ab43abc003365213fb32bb4667 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Date: Mon, 10 Jun 2024 07:55:01 -0400 Subject: [PATCH 59/66] Update gguf-py/gguf/gguf_writer.py Co-authored-by: compilade --- gguf-py/gguf/gguf_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index ded0a63ebcf05..9d7db73e0e2e6 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -372,7 +372,7 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True) if len(self.fout) > 1: - shard_bar = tqdm(desc=f"Shard (1/{len(self.fout)})", total=total_bytes, unit="byte", unit_scale=True) + shard_bar = tqdm(desc=f"Shard (0/{len(self.fout)})", total=None, unit="byte", unit_scale=True) for i, (fout, tensors) in enumerate(zip(self.fout, self.tensors)): if shard_bar is not None: From 4550826871277e9317914852790ae133672798af Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Date: Mon, 10 Jun 2024 07:55:24 -0400 Subject: [PATCH 60/66] Update gguf-py/gguf/gguf_writer.py Co-authored-by: compilade --- gguf-py/gguf/gguf_writer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 9d7db73e0e2e6..bda817279a6a2 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -379,8 +379,7 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: shard_bar.set_description(f"Shard ({i + 1}/{len(self.fout)})") total = sum(ti.nbytes for ti in tensors.values()) # bar behaves weirdly when total is 0 - if total > 0: - shard_bar.reset(total=total) + shard_bar.reset(total=(total if total > 0 else None)) # relying on the fact that Python dicts preserve insertion order (since 3.7) for ti in tensors.values(): From efa06098a6f3deec6f27ad3814294fa19d84ddbe Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Mon, 10 Jun 2024 07:58:17 -0400 Subject: [PATCH 61/66] swap bar orders --- gguf-py/gguf/gguf_writer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index bda817279a6a2..8042ecee7ca31 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -370,15 +370,14 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: total_bytes = sum(ti.nbytes for t in self.tensors for ti in t.values()) - bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True) if len(self.fout) > 1: shard_bar = tqdm(desc=f"Shard (0/{len(self.fout)})", total=None, unit="byte", unit_scale=True) + bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True) for i, (fout, tensors) in enumerate(zip(self.fout, self.tensors)): if 
shard_bar is not None: shard_bar.set_description(f"Shard ({i + 1}/{len(self.fout)})") total = sum(ti.nbytes for ti in tensors.values()) - # bar behaves weirdly when total is 0 shard_bar.reset(total=(total if total > 0 else None)) # relying on the fact that Python dicts preserve insertion order (since 3.7) @@ -386,10 +385,10 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: assert ti.tensor is not None # can only iterate once over the tensors assert ti.tensor.nbytes == ti.nbytes ti.tensor.tofile(fout) - if bar is not None: - bar.update(ti.nbytes) if shard_bar is not None: shard_bar.update(ti.nbytes) + if bar is not None: + bar.update(ti.nbytes) self.write_padding(fout, ti.nbytes) ti.tensor = None else: From b843445827e22076c7943d509d6207abb0a6d59f Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Date: Mon, 10 Jun 2024 13:54:41 -0400 Subject: [PATCH 62/66] Update gguf-py/gguf/gguf_writer.py Co-authored-by: compilade --- gguf-py/gguf/gguf_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 8042ecee7ca31..bd4971a820bfe 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -696,4 +696,4 @@ def format_n_bytes_to_str(num: int) -> str: if abs(fnum) < 1000.0: return f"{fnum:3.1f}{unit}" fnum /= 1000.0 - return f"{fnum:.1f}T - over 1TB, --split recommended" + return f"{fnum:.1f}T - over 1TB, split recommended" From 854bd64a5dea1cb7be49f82aad39baaac33e8eb0 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Date: Mon, 10 Jun 2024 13:55:08 -0400 Subject: [PATCH 63/66] Update gguf-py/gguf/gguf_writer.py Co-authored-by: compilade --- gguf-py/gguf/gguf_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index bd4971a820bfe..d26e1745128f8 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -362,8 +362,8 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: self.write_padding(fout, fout.tell()) if self.temp_file is None: - bar = None shard_bar = None + bar = None if progress: from tqdm import tqdm From 05b183fe7b9bbe4e79f5c3e51424ae73f0ed5eb2 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Mon, 10 Jun 2024 14:00:13 -0400 Subject: [PATCH 64/66] compatibility fix --- gguf-py/gguf/gguf_writer.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index d26e1745128f8..13f231261917a 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -346,10 +346,25 @@ def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None: if self.endianess == GGUFEndian.BIG: tensor.byteswap(inplace=True) - for fout in self.fout: - self.write_padding(fout, fout.tell()) - tensor.tofile(fout) - self.write_padding(fout, tensor.nbytes) + file_id = -1 + for i, tensors in enumerate(self.tensors): + if len(tensors) > 0: + file_id = i + break + + fout = self.fout[file_id] + + # pop the first tensor info + # TODO: cleaner way to get the first key + first_tensor_name = [name for name, _ in zip(self.tensors[file_id].keys(), range(1))][0] + ti = self.tensors[file_id].pop(first_tensor_name) + assert len(ti.shape) == len(tensor.shape) + assert all(dim1 == dim2 for dim1, dim2 in zip(ti.shape, tensor.shape)) + assert ti.nbytes == tensor.nbytes + + 
self.write_padding(fout, fout.tell()) + tensor.tofile(fout) + self.write_padding(fout, tensor.nbytes) self.state = WriterState.WEIGHTS From e9895d2ce933ff9cc4eedd4ec12cf265ce887926 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Date: Mon, 10 Jun 2024 14:55:14 -0400 Subject: [PATCH 65/66] Update gguf-py/gguf/gguf_writer.py Co-authored-by: compilade --- gguf-py/gguf/gguf_writer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 13f231261917a..85a868a77ca45 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -358,8 +358,6 @@ def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None: # TODO: cleaner way to get the first key first_tensor_name = [name for name, _ in zip(self.tensors[file_id].keys(), range(1))][0] ti = self.tensors[file_id].pop(first_tensor_name) - assert len(ti.shape) == len(tensor.shape) - assert all(dim1 == dim2 for dim1, dim2 in zip(ti.shape, tensor.shape)) assert ti.nbytes == tensor.nbytes self.write_padding(fout, fout.tell()) From 163712e7e34b2bb37ac8db464e7f551eedd12523 Mon Sep 17 00:00:00 2001 From: Brian Date: Sun, 23 Jun 2024 19:41:16 +1000 Subject: [PATCH 66/66] Update convert-hf-to-gguf.py Co-authored-by: compilade --- convert-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index bf26390895e2e..61f456ac76e51 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -97,7 +97,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, ftype_lw: str = ftype_up.lower() # allow templating the file name with the output ftype, useful with the "auto" ftype self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) - self.gguf_writer = gguf.GGUFWriter(None, gguf.MODEL_ARCH_NAMES[self.model_arch],endianess=self.endianess, use_temp_file=self.use_temp_file, + self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) @classmethod
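The final patches above fold shard handling directly into gguf.GGUFWriter: the writer keeps one tensor dict and one kv_data dict per shard, starts a new shard once a tensor-count or byte-size limit would be exceeded, names shard files with SHARD_NAME_FORMAT, stamps the Keys.Split.* key-values into every shard, and reports progress with an overall bar plus a per-shard bar. The sketch below shows how a caller might drive the new options; it is illustrative only and not part of the patch series. The architecture string, tensor names, size limits, and output filename are invented, and the call sequence (add tensors, then write header, KV data, and tensor data) follows the usual gguf-py writer flow, which these diffs assume rather than show.

# Illustrative sketch of the split options added to gguf.GGUFWriter by this series.
# The values below (arch, tensor names, limits, filename) are placeholders.
from pathlib import Path

import numpy as np
import gguf

writer = gguf.GGUFWriter(
    path=None,                     # real path is supplied later, as in convert-hf-to-gguf.py
    arch="llama",
    split_max_tensors=128,         # start a new shard after 128 tensors ...
    split_max_size=2_000_000_000,  # ... or once a shard would exceed ~2 GB of tensor data
    dry_run=False,                 # True only logs the shard plan and exits
    small_first_shard=True,        # shard 1 holds metadata only
)

# Placeholder tensors standing in for real model weights.
for i in range(4):
    writer.add_tensor(f"blk.{i}.ffn_up.weight", np.zeros((32, 32), dtype=np.float32))

# print_plan() runs when the output file is opened and logs one line per shard,
# using the <name>-00001-of-0000N.gguf naming from SHARD_NAME_FORMAT.
writer.write_header_to_file(path=Path("model.gguf"))
writer.write_kv_data_to_file()
writer.write_tensors_to_file(progress=True)  # overall bar plus a per-shard bar when split
writer.close()

Passing path=None and handing the real path only to write_header_to_file() mirrors the convert-hf-to-gguf.py change in the last patch, where the output name is templated on the chosen ftype and is therefore not known when the writer is constructed.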