Merge pull request #18 from LostRuins/concedo_experimental
Concedo experimental
Nexesenex authored Dec 1, 2023
2 parents eb42c73 + a195cde commit fdeb516
Showing 53 changed files with 2,317 additions and 214 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -36,6 +36,7 @@ models-mnt
/libllama.so
/llama-bench
/llava-cli
/lookahead
/main
/metal
/perplexity
2 changes: 1 addition & 1 deletion colab.ipynb
@@ -71,7 +71,7 @@
"!cp koboldcpp_cublas.so koboldcpp_cublas.dat\r\n",
"!apt install aria2 -y\r\n",
"!aria2c -x 10 -o model.ggml --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $Model\r\n",
"!python koboldcpp.py model.ggml --usecublas 0 mmq --multiuser --gpulayers $Layers --contextsize $ContextSize --hordeconfig concedo 1 1 --remotetunnel\r\n"
"!python koboldcpp.py model.ggml --usecublas 0 mmq --multiuser --gpulayers $Layers --contextsize $ContextSize --quiet --remotetunnel\r\n"
]
}
],
2 changes: 1 addition & 1 deletion common/CMakeLists.txt
@@ -26,7 +26,7 @@ add_custom_command(
COMMENT "Generating build details from Git"
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/build-info.cmake"
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
VERBATIM
79 changes: 79 additions & 0 deletions common/common.cpp
@@ -13,6 +13,7 @@
#include <regex>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <cinttypes>
@@ -496,6 +497,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.chatml = true;
} else if (arg == "--infill") {
params.infill = true;
} else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
params.dump_kv_cache = true;
} else if (arg == "--multiline-input") {
params.multiline_input = true;
} else if (arg == "--simple-io") {
@@ -836,6 +839,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
#endif // GGML_USE_CUBLAS
#endif
printf(" --verbose-prompt print prompt before generation\n");
printf(" -dkvc, --dump-kv-cache\n");
printf(" verbose print of the KV cache\n");
printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
@@ -1387,3 +1392,77 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
}

//
// KV cache utils
//

void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";

printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);

llama_kv_cache_view_cell * c_curr = view.cells;
llama_seq_id * cs_curr = view.cells_sequences;

for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
if (i % row_size == 0) {
printf("\n%5d: ", i);
}
int seq_count = 0;
for (int j = 0; j < view.n_max_seq; j++) {
if (cs_curr[j] >= 0) { seq_count++; }
}
putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
}

printf("\n=== Done dumping\n");
}

void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);

std::unordered_map<llama_seq_id, size_t> seqs;
llama_kv_cache_view_cell * c_curr = view.cells;
llama_seq_id * cs_curr = view.cells_sequences;

for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
for (int j = 0; j < view.n_max_seq; j++) {
if (cs_curr[j] < 0) { continue; }
if (seqs.find(cs_curr[j]) == seqs.end()) {
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
seqs[cs_curr[j]] = seqs.size();
}
}
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
}

printf("=== Sequence legend: ");
for (const auto & it : seqs) {
printf("%zu=%d, ", it.second, it.first);
}
printf("'+'=other sequence ids");

c_curr = view.cells;
cs_curr = view.cells_sequences;
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
if (i % row_size == 0) {
printf("\n%5d: ", i);
}
for (int j = 0; j < view.n_max_seq; j++) {
if (cs_curr[j] >= 0) {
const auto & it = seqs.find(cs_curr[j]);
putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
} else {
putchar('.');
}
}
putchar(' ');
}

printf("\n=== Done dumping\n");
}
11 changes: 11 additions & 0 deletions common/common.h
@@ -130,6 +130,7 @@ struct gpt_params {
bool numa = false; // attempt optimizations that help on some NUMA systems
bool verbose_prompt = false; // print prompt tokens before generation
bool infill = false; // use infill mode
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes

// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
@@ -226,3 +227,13 @@ std::string get_sortable_timestamp();
void dump_non_result_info_yaml(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);

//
// KV cache utils
//

// Dump the KV cache view with the number of sequences per cell.
void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);

// Dump the KV cache view showing individual sequences in each cell (long output).
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
29 changes: 15 additions & 14 deletions convert-hf-to-gguf.py
@@ -59,7 +59,7 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
from safetensors import safe_open
ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
else:
ctx = contextlib.nullcontext(torch.load(self.dir_model / part_name, map_location="cpu"))
ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))

with ctx as model_part:
for name in model_part.keys():
@@ -880,20 +880,21 @@ def parse_args() -> argparse.Namespace:

hparams = Model.load_hparams(dir_model)

model_class = Model.from_model_architecture(hparams["architectures"][0])
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
with torch.inference_mode():
model_class = Model.from_model_architecture(hparams["architectures"][0])
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)

print("Set model parameters")
model_instance.set_gguf_parameters()
print("Set model parameters")
model_instance.set_gguf_parameters()

print("Set model tokenizer")
model_instance.set_vocab()
print("Set model tokenizer")
model_instance.set_vocab()

if args.vocab_only:
print(f"Exporting model vocab to '{fname_out}'")
model_instance.write_vocab()
else:
print(f"Exporting model to '{fname_out}'")
model_instance.write()
if args.vocab_only:
print(f"Exporting model vocab to '{fname_out}'")
model_instance.write_vocab()
else:
print(f"Exporting model to '{fname_out}'")
model_instance.write()

print(f"Model successfully exported to '{fname_out}'")
print(f"Model successfully exported to '{fname_out}'")
File mode changed: convert.py 100644 → 100755 (no content changes)
Binary file added docs/llama-star/idea-arch.key
Binary file not shown.
Binary file added docs/llama-star/idea-arch.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -32,6 +32,7 @@ else()
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(lookahead)
add_subdirectory(train-text-from-scratch)
if (LLAMA_METAL)
add_subdirectory(metal)
2 changes: 1 addition & 1 deletion examples/batched.swift/Sources/main.swift
@@ -153,7 +153,7 @@ while n_cur <= n_len {
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

// is it an end of stream? -> mark the stream as finished
if new_token_id == llama_token_eos(context) || n_cur == n_len {
if new_token_id == llama_token_eos(model) || n_cur == n_len {
i_batch[i] = -1
// print("")
if n_parallel > 1 {
2 changes: 1 addition & 1 deletion examples/finetune/README.md
@@ -21,7 +21,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
```

Finetune output files will be saved every N iterations (config with `--save-every N`).
**Only llama-based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output.
So in the above example, after 10 iterations these files will be written:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
1 change: 1 addition & 0 deletions examples/llama.swiftui/.gitignore
@@ -0,0 +1 @@
xcuserdata
7 changes: 7 additions & 0 deletions examples/llama.swiftui/README.md
@@ -0,0 +1,7 @@
# llama.swiftui

Local inference of llama.cpp on an iPhone.
So far I have only tested with the starcoder 1B model, but it can most likely handle 7B models as well.

https://github.com/bachittle/llama.cpp/assets/39804642/e290827a-4edb-4093-9642-2a5e399ec545
