llama : support models without vocabulary (#5798)
* additional methods to read model and ctx parameters
* vocab size as part of the model metadata
* models without vocabulary, convert.py part
* models without vocabulary, llama.cpp part
* PR clean-up
* converter script fixes
* llama_vocab_type update (renamed the new key)
* PR review fixes
* revert function renaming
* one more NoVocab assert
Xarbirus authored Mar 14, 2024
1 parent 044ec4b commit 69ff613

Showing 5 changed files with 142 additions and 88 deletions.
convert.py (126 changes: 73 additions & 53 deletions)
@@ -332,6 +332,9 @@ def load(model_plus: ModelPlus) -> Params:
 #

 class BpeVocab:
+    tokenizer_model = "gpt2"
+    name = "bpe"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
         if isinstance(self.bpe_tokenizer.get('model'), dict):
@@ -390,6 +393,9 @@ def __repr__(self) -> str:


 class SentencePieceVocab:
+    tokenizer_model = "llama"
+    name = "spm"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
         added_tokens: dict[str, int]
@@ -453,6 +459,9 @@ def __repr__(self) -> str:


 class HfVocab:
+    tokenizer_model = "llama"
+    name = "hfft"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
         try:
             from transformers import AutoTokenizer
@@ -553,7 +562,15 @@ def __repr__(self) -> str:
         return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


-Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"
+class NoVocab:
+    tokenizer_model = "no_vocab"
+    name = "no_vocab"
+
+    def __repr__(self) -> str:
+        return "<NoVocab for a model without integrated vocabulary>"
+
+
+Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab"


 #
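NoVocab is a sentinel type rather than a real tokenizer wrapper: downstream code can branch on isinstance(vocab, NoVocab) instead of threading string flags around. A minimal standalone sketch of the pattern (the describe() helper is illustrative, not part of convert.py):

    class NoVocab:
        tokenizer_model = "no_vocab"
        name = "no_vocab"

    def describe(vocab) -> str:
        # The sentinel carries no token data, only a tokenizer label.
        if isinstance(vocab, NoVocab):
            return "model ships without an integrated vocabulary"
        return f"model carries a {vocab.name!r} vocabulary"

    print(describe(NoVocab()))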
@@ -935,8 +952,10 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
     # Handle special case where the model's vocab size is not set
     if params.n_vocab == -1:
         raise ValueError(
-            f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?"
+            f"The model's vocab size is set to -1 in params.json. Please update it manually.{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}"
         )
+    if isinstance(vocab, NoVocab):
+        return  # model has no vocab

     # Check for a vocab size mismatch
     if params.n_vocab == vocab.vocab_size:
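The reworked message uses an f-string nested inside a conditional expression, guarded by hasattr, so it still renders for vocab objects such as NoVocab that have no vocab_size. A small sketch of the idiom (both classes are illustrative):

    class WithSize:
        vocab_size = 32000

    class WithoutSize:
        pass

    for v in (WithSize(), WithoutSize()):
        # The inner f-string is only evaluated when the attribute exists.
        hint = f" Maybe {v.vocab_size}?" if hasattr(v, "vocab_size") else ""
        print(f"Please update it manually.{hint}")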
@@ -977,6 +996,7 @@ def add_meta_arch(self, params: Params) -> None:
             name = str(params.path_model.parent).split('/')[-1]

         self.gguf.add_name                (name)
+        self.gguf.add_vocab_size          (params.n_vocab)
         self.gguf.add_context_length      (params.n_ctx)
         self.gguf.add_embedding_length    (params.n_embd)
         self.gguf.add_block_count         (params.n_layer)
@@ -1013,21 +1033,9 @@ def add_meta_arch(self, params: Params) -> None:
         if params.ftype is not None:
             self.gguf.add_file_type(params.ftype)

-    def handle_tokenizer_model(self, vocab: Vocab) -> str:
-        # Map the vocab types to the supported tokenizer models
-        tokenizer_model = {
-            SentencePieceVocab: "llama",
-            HfVocab: "llama",
-            BpeVocab: "gpt2",
-        }.get(type(vocab))
-
-        # Block if vocab type is not predefined
-        if tokenizer_model is None:
-            raise ValueError("Unknown vocab type: Not supported")
-
-        return tokenizer_model
-
     def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
+        assert not isinstance(vocab, NoVocab)
+
         tokens = []
         scores = []
         toktypes = []
@@ -1043,11 +1051,8 @@ def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
         return tokens, scores, toktypes

     def add_meta_vocab(self, vocab: Vocab) -> None:
-        # Handle the tokenizer model
-        tokenizer_model = self.handle_tokenizer_model(vocab)
-
         # Ensure that tokenizer_model is added to the GGUF model
-        self.gguf.add_tokenizer_model(tokenizer_model)
+        self.gguf.add_tokenizer_model(vocab.tokenizer_model)

         # Extract model vocabulary for model conversion
         tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
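The old handle_tokenizer_model() mapped vocab types to tokenizer names through a dict keyed by type; with tokenizer_model stored as a class attribute, plain attribute lookup does the same job, and a new vocab class only needs to declare the attribute. A sketch of the difference (stand-in classes, not the real ones):

    class SpmLike:
        tokenizer_model = "llama"

    class BpeLike:
        tokenizer_model = "gpt2"

    # Before: {SpmLike: "llama", BpeLike: "gpt2"}.get(type(vocab))
    # After: the instance itself knows its tokenizer model.
    for vocab in (SpmLike(), BpeLike()):
        print(vocab.tokenizer_model)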
@@ -1074,6 +1079,26 @@ def write_meta(self) -> None:
     def write_tensor_info(self) -> None:
         self.gguf.write_ti_data_to_file()

+    def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None:
+        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
+        if ftype == GGMLFileType.MostlyQ8_0:
+            ndarrays = bounded_parallel_map(
+                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
+                use_processpool_executor=True,
+            )
+        else:
+            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
+
+        start = time.time()
+        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
+            elapsed = time.time() - start
+            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
+            padi = len(str(len(model)))
+            print(
+                f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
+            )
+            self.gguf.write_tensor_data(ndarray)
+
     def close(self) -> None:
         self.gguf.close()
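write_tensor_data() is the existing streaming loop from write_all(), moved onto OutputFile. bounded_parallel_map (defined elsewhere in convert.py, not shown in this diff) keeps only a fixed number of tensors in flight, so peak memory stays bounded while conversion and quantization overlap. A standalone sketch of that idea, assuming a simple thread pool:

    import concurrent.futures
    from collections import deque
    from typing import Callable, Iterable, Iterator, TypeVar

    T = TypeVar("T")
    U = TypeVar("U")

    def bounded_map(func: Callable[[T], U], items: Iterable[T], concurrency: int) -> Iterator[U]:
        with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as pool:
            pending: deque = deque()
            for item in items:
                pending.append(pool.submit(func, item))
                # Cap the number of in-flight tasks, yielding oldest first.
                if len(pending) >= concurrency:
                    yield pending.popleft().result()
            while pending:
                yield pending.popleft().result()

    print(list(bounded_map(lambda x: x * x, range(8), concurrency=2)))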

@@ -1082,7 +1107,7 @@ def write_vocab_only(
         fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
         endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
     ) -> None:
-        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
+        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

         of = OutputFile(fname_out, endianess=endianess)

@@ -1120,8 +1145,11 @@ def write_all(

         # meta data
         of.add_meta_arch(params)
-        of.add_meta_vocab(vocab)
-        of.add_meta_special_vocab(svocab)
+        if isinstance(vocab, NoVocab):
+            of.gguf.add_tokenizer_model(vocab.tokenizer_model)
+        else:
+            of.add_meta_vocab(vocab)
+            of.add_meta_special_vocab(svocab)

         # tensor info
         for name, lazy_tensor in model.items():
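For a NoVocab model the metadata block therefore records only the tokenizer model name. Roughly, the resulting GGUF carries (key names from gguf-py, values illustrative):

    tokenizer.ggml.model = "no_vocab"
    llama.vocab_size     = <n_vocab from the model parameters>
    (no tokenizer.ggml.tokens / tokenizer.ggml.scores / tokenizer.ggml.token_type arrays)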
@@ -1131,24 +1159,7 @@ def write_all(
         of.write_tensor_info()

         # tensor data
-        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
-        if ftype == GGMLFileType.MostlyQ8_0:
-            ndarrays = bounded_parallel_map(
-                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
-                use_processpool_executor=True,
-            )
-        else:
-            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
-
-        start = time.time()
-        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
-            elapsed = time.time() - start
-            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
-            padi = len(str(len(model)))
-            print(
-                f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
-            )
-            of.gguf.write_tensor_data(ndarray)
+        of.write_tensor_data(ftype, model, concurrency)

         of.close()

@@ -1309,8 +1320,8 @@ def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
                 return vtype, path
         raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")

-    def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
-        load_merges = vocabtype == "bpe"
+    def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab:
+        load_merges = vocab.name == "bpe"
         n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
         return gguf.SpecialVocab(
             model_parent_path,
@@ -1319,30 +1330,34 @@ def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path:
             n_vocab=n_vocab,
         )

-    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+    def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
         vocab_type, path = self._select_file(vocab_types)
         print(f"Loading vocab file {path!r}, type {vocab_type!r}")

         added_tokens_path = path.parent / "added_tokens.json"
-        vocab: Vocab
         if vocab_type == "bpe":
-            vocab = BpeVocab(
+            return BpeVocab(
                 path, added_tokens_path if added_tokens_path.exists() else None
             )
-        elif vocab_type == "spm":
-            vocab = SentencePieceVocab(
+        if vocab_type == "spm":
+            return SentencePieceVocab(
                 path, added_tokens_path if added_tokens_path.exists() else None
             )
-        elif vocab_type == "hfft":
-            vocab = HfVocab(
+        if vocab_type == "hfft":
+            return HfVocab(
                 path.parent, added_tokens_path if added_tokens_path.exists() else None
             )
+        raise ValueError(vocab_type)
+
+    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+        vocab: Vocab
+        if len(vocab_types) == 1 and "no_vocab" in vocab_types:
+            vocab = NoVocab()
         else:
-            raise ValueError(vocab_type)
+            vocab = self._create_vocab_by_path(vocab_types)
         # FIXME: Respect --vocab-dir?
         special_vocab = self._create_special_vocab(
             vocab,
-            vocab_type,
             model_parent_path,
         )
         return vocab, special_vocab
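A hedged usage sketch of the refactored factory (VocabFactory construction and path setup live elsewhere in convert.py and are assumed here):

    factory = VocabFactory(model_parent_path)
    # Path-based lookup, trying each requested type in order:
    vocab, special_vocab = factory.load_vocab(["spm", "hfft"], model_parent_path)
    # The new vocab-less mode short-circuits the file search entirely:
    vocab, special_vocab = factory.load_vocab(["no_vocab"], model_parent_path)
    print(vocab)  # <NoVocab for a model without integrated vocabulary>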
@@ -1380,6 +1395,7 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
     parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
     parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+    parser.add_argument("--no-vocab", action="store_true", help="store model without the vocab")
     parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
     parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
     parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
@@ -1392,6 +1408,10 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")

     args = parser.parse_args(args_in)
+    if args.no_vocab:
+        if args.vocab_only:
+            raise ValueError("no need to specify --vocab-only if using --no-vocab")
+        args.vocab_type = "no_vocab"

     if args.dump_single:
         model_plus = lazy_load_file(args.model)
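Example invocation of the new flag (the path is a placeholder): --no-vocab rewrites args.vocab_type to "no_vocab" and is rejected in combination with --vocab-only, since a vocab-only file with no vocab would be empty.

    python convert.py /path/to/model --no-vocab --outtype f16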
@@ -1442,7 +1462,7 @@ def main(args_in: list[str] | None = None) -> None:
         print(f"Wrote {outfile}")
         return

-    if model_plus.vocab is not None and args.vocab_dir is None:
+    if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
         vocab = model_plus.vocab

     print(f"Vocab info: {vocab}")
gguf-py/gguf/constants.py (2 changes: 2 additions & 0 deletions)
@@ -32,6 +32,7 @@ class General:
         FILE_TYPE            = "general.file_type"

     class LLM:
+        VOCAB_SIZE        = "{arch}.vocab_size"
         CONTEXT_LENGTH    = "{arch}.context_length"
         EMBEDDING_LENGTH  = "{arch}.embedding_length"
         BLOCK_COUNT       = "{arch}.block_count"
@@ -752,6 +753,7 @@ def get_type(val: Any) -> GGUFValueType:
 KEY_GENERAL_FILE_TYPE    = Keys.General.FILE_TYPE

 # LLM
+KEY_VOCAB_SIZE        = Keys.LLM.VOCAB_SIZE
 KEY_CONTEXT_LENGTH    = Keys.LLM.CONTEXT_LENGTH
 KEY_EMBEDDING_LENGTH  = Keys.LLM.EMBEDDING_LENGTH
 KEY_BLOCK_COUNT       = Keys.LLM.BLOCK_COUNT
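The new key is templated on the architecture name; for example (import path as in gguf-py):

    from gguf.constants import Keys

    assert Keys.LLM.VOCAB_SIZE.format(arch="llama") == "llama.vocab_size"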
gguf-py/gguf/gguf_writer.py (3 changes: 3 additions & 0 deletions)
@@ -321,6 +321,9 @@ def add_custom_alignment(self, alignment: int) -> None:
         self.data_alignment = alignment
         self.add_uint32(Keys.General.ALIGNMENT, alignment)

+    def add_vocab_size(self, size: int) -> None:
+        self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size)
+
     def add_context_length(self, length: int) -> None:
         self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length)
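A minimal sketch of writing the new field through gguf-py (file name and values are placeholders; API as of this commit):

    import gguf

    writer = gguf.GGUFWriter("vocabless.gguf", arch="llama")
    writer.add_vocab_size(32000)  # stored as uint32 under "llama.vocab_size"
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.close()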

[diffs for the remaining 2 changed files not shown]