Merge branch 'ggerganov:master' into master
apicalshark authored Nov 2, 2024
2 parents e0f3017 + 1926d6e commit 73069a9
Showing 19 changed files with 666 additions and 477 deletions.
6 changes: 6 additions & 0 deletions Makefile
@@ -34,6 +34,7 @@ BUILD_TARGETS = \
llama-save-load-state \
llama-server \
llama-simple \
llama-simple-chat \
llama-speculative \
llama-tokenize \
llama-vdot \
@@ -1287,6 +1288,11 @@ llama-simple: examples/simple/simple.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-simple-chat: examples/simple-chat/simple-chat.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-tokenize: examples/tokenize/tokenize.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
3 changes: 2 additions & 1 deletion README.md
@@ -17,7 +17,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

## Hot topics

- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669**
- **Introducing GGUF-my-LoRA** https://github.com/ggerganov/llama.cpp/discussions/10123
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)

----
182 changes: 91 additions & 91 deletions ci/run.sh

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion common/common.h
@@ -155,7 +155,7 @@ struct common_sampler_params {

struct common_params {
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_ctx = 4096; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
27 changes: 14 additions & 13 deletions convert_hf_to_gguf.py
@@ -72,7 +72,8 @@ class Model:
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
use_temp_file: bool = False, eager: bool = False,
metadata_override: Path | None = None, model_name: str | None = None,
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
if type(self) is Model:
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")

@@ -87,7 +88,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
self.is_safetensors = len(self.part_names) > 0
if not self.is_safetensors:
self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
self.hparams = Model.load_hparams(self.dir_model)
self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
self.tensor_names = None
@@ -1541,6 +1542,17 @@ def set_vocab(self):
special_vocab._set_special_token("eot", 32010)
special_vocab.add_to_gguf(self.gguf_writer)

tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
if tokenizer_config_file.is_file():
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
tokenizer_config_json = json.load(f)
if "add_prefix_space" in tokenizer_config_json:
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])

# Apply to granite small models only
if self.hparams.get("vocab_size", 32000) == 49152:
self.gguf_writer.add_add_bos_token(False)

def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
@@ -1557,17 +1569,6 @@ def set_gguf_parameters(self):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
if tokenizer_config_file.is_file():
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
tokenizer_config_json = json.load(f)
if "add_prefix_space" in tokenizer_config_json:
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])

# Apply to granite small models only
if self.hparams.get("vocab_size", 32000) == 49152:
self.gguf_writer.add_add_bos_token(False)

@staticmethod
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
if n_head_kv is not None and n_head != n_head_kv:
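The new optional `hparams` argument lets a caller inject an already-loaded config dict instead of having `Model` read it from `dir_model`; the LoRA converter further down uses this to pass a config fetched from the Hugging Face Hub. A toy Python sketch of the pattern, using a hypothetical stand-in class rather than the real `Model`:

```python
# Toy illustration of the optional-hparams pattern added to Model.__init__ above.
# ModelSketch is a hypothetical stand-in, not the real convert_hf_to_gguf.Model.
import json
from pathlib import Path
from typing import Any


class ModelSketch:
    def __init__(self, dir_model: Path, hparams: dict[str, Any] | None = None):
        self.dir_model = dir_model
        # read config.json from disk only when the caller did not inject hparams
        self.hparams = self._load_hparams(dir_model) if hparams is None else hparams

    @staticmethod
    def _load_hparams(dir_model: Path) -> dict[str, Any]:
        with open(dir_model / "config.json", "r", encoding="utf-8") as f:
            return json.load(f)


# a caller that already holds a config dict (e.g. fetched from the Hub) skips the disk read
m = ModelSketch(Path("unused"), hparams={"architectures": ["LlamaForCausalLM"]})
print(m.hparams["architectures"][0])  # LlamaForCausalLM
```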
47 changes: 37 additions & 10 deletions convert_lora_to_gguf.py
@@ -12,6 +12,7 @@
from math import prod
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
from transformers import AutoConfig

import torch

@@ -256,8 +257,8 @@ def parse_args() -> argparse.Namespace:
help="only print out what will be done, without writing any new files",
)
parser.add_argument(
"--base", type=Path, required=True,
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required",
"--base", type=Path,
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
)
parser.add_argument(
"lora_path", type=Path,
@@ -267,6 +268,12 @@
return parser.parse_args()


def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
# normally, the adapter does not come with the base model config, so we need to load it from AutoConfig
config = AutoConfig.from_pretrained(hf_model_id)
return config.to_dict()


if __name__ == '__main__':
args = parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
@@ -281,7 +288,7 @@ def parse_args() -> argparse.Namespace:

ftype = ftype_map[args.outtype]

dir_base_model: Path = args.base
dir_base_model: Path | None = args.base
dir_lora: Path = args.lora_path
lora_config = dir_lora / "adapter_config.json"
input_model = dir_lora / "adapter_model.safetensors"
@@ -301,9 +308,29 @@ def parse_args() -> argparse.Namespace:
input_model = os.path.join(dir_lora, "adapter_model.bin")
lora_model = torch.load(input_model, map_location="cpu", weights_only=True)

# load LoRA config
with open(lora_config, "r") as f:
lparams: dict[str, Any] = json.load(f)

# load base model
logger.info(f"Loading base model: {dir_base_model.name}")
hparams = Model.load_hparams(dir_base_model)
if dir_base_model is None:
if "base_model_name_or_path" in lparams:
model_id = lparams["base_model_name_or_path"]
logger.info(f"Loading base model from Hugging Face: {model_id}")
try:
hparams = load_hparams_from_hf(model_id)
except OSError as e:
logger.error(f"Failed to load base model config: {e}")
logger.error("Please try downloading the base model and add its path to --base")
sys.exit(1)
else:
logger.error("'base_model_name_or_path' is not found in adapter_config.json")
logger.error("Base model config is required. Please download the base model and add its path to --base")
sys.exit(1)
else:
logger.info(f"Loading base model: {dir_base_model.name}")
hparams = Model.load_hparams(dir_base_model)

with torch.inference_mode():
try:
model_class = Model.from_model_architecture(hparams["architectures"][0])
@@ -323,13 +350,15 @@ def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
self.dir_model_card = dir_lora_model
self.lora_alpha = float(lora_alpha)

def set_vocab(self):
pass

def set_type(self):
self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")

def set_gguf_parameters(self):
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
super().set_gguf_parameters()

def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
# Never add extra tensors (e.g. rope_freqs) for LoRA adapters
@@ -350,7 +379,7 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
logger.error("Hint: if you are using TRL, make sure not to call setup_chat_format()")
logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
sys.exit(1)

if base_name in tensor_map:
@@ -384,9 +413,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
yield (dest_name + ".lora_a", lora_a)
yield (dest_name + ".lora_b", lora_b)

with open(lora_config, "r") as f:
lparams: dict[str, Any] = json.load(f)

alpha: float = lparams["lora_alpha"]

model_instance = LoraModel(
Expand All @@ -399,6 +425,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
dry_run=args.dry_run,
dir_lora_model=dir_lora,
lora_alpha=alpha,
hparams=hparams,
)

logger.info("Exporting model...")
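With these changes `--base` becomes optional: when it is omitted, the converter reads `base_model_name_or_path` from the adapter's `adapter_config.json` and fetches the matching config from the Hugging Face Hub through `transformers.AutoConfig`, mirroring `load_hparams_from_hf` above. A condensed Python sketch of that fallback (error handling simplified, and the local branch reads `config.json` directly where the real script calls `Model.load_hparams`):

```python
# Condensed sketch of the new --base fallback; paths below are hypothetical.
import json
from pathlib import Path
from typing import Any

from transformers import AutoConfig


def resolve_base_hparams(dir_lora: Path, dir_base_model: Path | None) -> dict[str, Any]:
    if dir_base_model is not None:
        # --base given: read the local base model config
        with open(dir_base_model / "config.json", "r", encoding="utf-8") as f:
            return json.load(f)

    # no --base: PEFT records the base model id in the adapter config
    with open(dir_lora / "adapter_config.json", "r", encoding="utf-8") as f:
        lparams: dict[str, Any] = json.load(f)
    model_id = lparams["base_model_name_or_path"]  # missing key means --base is required
    return AutoConfig.from_pretrained(model_id).to_dict()


# hypothetical usage: adapter directory only, base config pulled from the Hub
# hparams = resolve_base_hparams(Path("./my-lora-adapter"), None)
```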
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -59,6 +59,7 @@ else()
endif()
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(simple-chat)
add_subdirectory(speculative)
add_subdirectory(tokenize)
endif()
35 changes: 12 additions & 23 deletions examples/server/server.cpp
@@ -725,12 +725,12 @@ struct server_context {
return nullptr;
}

server_slot * get_available_slot(const std::string & prompt) {
server_slot * get_available_slot(const server_task & task) {
server_slot * ret = nullptr;

// find the slot that has at least n% prompt similarity
if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) {
int max_lcp_len = 0;
if (ret == nullptr && slot_prompt_similarity != 0.0f) {
int max_lcs_len = 0;
float similarity = 0;

for (server_slot & slot : slots) {
Expand All @@ -740,25 +740,25 @@ struct server_context {
}

// skip the slot if it does not contain cached tokens
if (slot.prompt_tokens.empty()) {
if (slot.cache_tokens.empty()) {
continue;
}

// length of the Longest Common Prefix between the current slot's prompt and the input prompt
int lcp_len = longest_common_prefix(slot.cache_tokens, slot.prompt_tokens);
// length of the Longest Common Subsequence between the current slot's prompt and the input prompt
int lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);

// fraction of the common substring length compared to the current slot's prompt length
similarity = static_cast<float>(lcp_len) / static_cast<int>(slot.prompt_tokens.size());
// fraction of the common subsequence length compared to the current slot's prompt length
similarity = static_cast<float>(lcs_len) / static_cast<int>(slot.cache_tokens.size());

// select the current slot if the criteria match
if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) {
max_lcp_len = lcp_len;
if (lcs_len > max_lcs_len && similarity > slot_prompt_similarity) {
max_lcs_len = lcs_len;
ret = &slot;
}
}

if (ret != nullptr) {
SLT_DBG(*ret, "selected slot by lcp similarity, max_lcp_len = %d, similarity = %f\n", max_lcp_len, similarity);
SLT_DBG(*ret, "selected slot by lcs similarity, max_lcs_len = %d, similarity = %f\n", max_lcs_len, similarity);
}
}

@@ -1514,18 +1514,7 @@ struct server_context {
{
const int id_slot = json_value(task.data, "id_slot", -1);

server_slot * slot;

if (id_slot != -1) {
slot = get_slot_by_id(id_slot);
} else {
std::string prompt;
if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
prompt = json_value(task.data, "prompt", std::string());
}

slot = get_available_slot(prompt);
}
server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);

if (slot == nullptr) {
// if no slot is available, we defer this task for processing later
52 changes: 47 additions & 5 deletions examples/server/utils.hpp
@@ -439,18 +439,60 @@ static std::string gen_chatcmplid() {
// other common utils
//

static size_t longest_common_prefix(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) {
size_t i;
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}

return i;
}

static size_t longest_common_prefix(const std::string & a, const std::string & b) {
size_t i;
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) {
// check for empty sequences
if (a.empty() || b.empty()) {
return 0;
}

return i;
// get the lengths of the input sequences
int a_len = a.size();
int b_len = b.size();

// initialize the maximum length of the longest common subsequence (LCS)
int max_length = 0;

// use two rows instead of a 2D matrix to optimize space
std::vector<int> prev_row(b_len + 1, 0);
std::vector<int> curr_row(b_len + 1, 0);

// iterate through the elements of a
for (int i = 1; i <= a_len; i++) {
// iterate through the elements of b
for (int j = 1; j <= b_len; j++) {
// if elements at the current positions match
if (a[i - 1] == b[j - 1]) {
// if it's the first element of either sequence, set the LCS length to 1
if (i == 1 || j == 1) {
curr_row[j] = 1;
} else {
// increment LCS length by 1 compared to the previous element
curr_row[j] = prev_row[j - 1] + 1;
}

// update max_length if necessary
if (curr_row[j] > max_length) {
max_length = curr_row[j];
}
} else {
// reset LCS length if elements don't match
curr_row[j] = 0;
}
}

// update the previous row for the next iteration
prev_row = curr_row;
}

// return the maximum length of the LCS
return max_length;
}

static bool ends_with(const std::string & str, const std::string & suffix) {
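Taken together, the server now scores each idle slot with the `longest_common_subsequence` helper above, comparing the slot's cached tokens against the incoming task's prompt tokens, and reuses a slot only when the match is a sufficiently large fraction of the cached prompt. A small Python sketch of the selection rule; the token ids and threshold below are made up for illustration:

```python
# Illustrative sketch of the slot-selection rule; token ids and threshold are hypothetical.
def lcs_len(a: list[int], b: list[int]) -> int:
    # same two-row dynamic programming as longest_common_subsequence() in utils.hpp
    best = 0
    prev = [0] * (len(b) + 1)
    for i in range(1, len(a) + 1):
        curr = [0] * (len(b) + 1)
        for j in range(1, len(b) + 1):
            if a[i - 1] == b[j - 1]:
                curr[j] = prev[j - 1] + 1
                best = max(best, curr[j])
        prev = curr
    return best


def pick_slot(cached: list[list[int]], prompt: list[int], threshold: float) -> int | None:
    # mirrors get_available_slot(): pick the idle slot with the longest match,
    # as long as match / len(cache_tokens) exceeds the similarity threshold
    best_idx, best_len = None, 0
    for idx, cache_tokens in enumerate(cached):
        if not cache_tokens:
            continue  # slot has nothing cached
        match = lcs_len(cache_tokens, prompt)
        similarity = match / len(cache_tokens)
        if match > best_len and similarity > threshold:
            best_idx, best_len = idx, match
    return best_idx


# the second slot shares more tokens with the new prompt, so it is selected
print(pick_slot([[1, 2, 3], [4, 5, 6, 7]], [4, 5, 6, 7, 8], threshold=0.5))  # 1
```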
5 changes: 5 additions & 0 deletions examples/simple-chat/CMakeLists.txt
@@ -0,0 +1,5 @@
set(TARGET llama-simple-chat)
add_executable(${TARGET} simple-chat.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
7 changes: 7 additions & 0 deletions examples/simple-chat/README.md
@@ -0,0 +1,7 @@
# llama.cpp/example/simple-chat

The purpose of this example is to demonstrate a minimal usage of llama.cpp to create a simple chat program using the chat template from the GGUF file.

```bash
./llama-simple-chat -m Meta-Llama-3.1-8B-Instruct.gguf -c 2048
...
```