From a6a1f7c9d4d4c84667036471909b01987bdf0f3b Mon Sep 17 00:00:00 2001 From: icpp Date: Wed, 25 Sep 2024 20:09:50 -0400 Subject: [PATCH] qwen2.5 --- README.md | 237 ++++++++++++------------------------------ canister_ids.json | 5 + native/main.cpp | 2 +- native/test_qwen2.cpp | 52 ++++----- scripts/upload.py | 48 +++++++-- src/llama_cpp.did | 11 +- src/main_.cpp | 122 +++++++++++++++++----- src/main_.h | 4 +- src/max_tokens.h | 5 +- src/model.cpp | 17 +-- src/run.cpp | 34 +++--- 11 files changed, 274 insertions(+), 263 deletions(-) create mode 100644 canister_ids.json diff --git a/README.md b/README.md index c3b1594..e030258 100644 --- a/README.md +++ b/README.md @@ -9,38 +9,6 @@ This repo allows you to deploy llama.cpp as a Smart Contract to the Internet Computer. - -# Overview - -The following models work: - -| Model | File Size | Location | Status | Notes | -| ----- | --------- | -------- | ------ | ------| -| stories260Ktok512.gguf | 2.00 Mb | ./models | ✅ | Testing only | -| stories15Mtok4096.gguf | 32.00 Mb | ./models | ✅ | Ok | -| storiesICP42Mtok4096.gguf | 113.00 Mb | ./models | ✅ | Works great | -| gpt2.Q8_0.gguf | 176.00 Mb | https://huggingface.co/igorbkz/gpt2-Q8_0-GGUF | ✅ | Not very good| - -

- ---- - -The following models load, but hit instruction limit after a few tokens, making it unusable: -| Model | File Size | Location | Status | Notes | -| ----- | --------- | -------- | ------ | ------| -| tinyllama-1.1b-chat-v1.0.Q8_0.gguf | 1.17 Gb | https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF | ✅ | 4 tokens max | - -

- ---- -The following models do not load, because they do not fit in wasm32 memory - Model | File Size | Location | Status | Notes | -| ----- | --------- | -------- | ------ | ------| -| Phi-3-mini-4k-instruct-q4.gguf | 2.39 Gb | https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf | 🚫 | Needs wasm64 | - -

- - # WARNING ⚠️ This repo is under heavy development. 🚧 @@ -61,7 +29,7 @@ Please join our [OpenChat C++ community](https://oc.app/community/cklkv-3aaaa-aa WARNING: Currently, the canister can only be build on a `mac` ! -- Use Python 3.11 ❗❗❗ +- VERY IMPORTANT: Use Python 3.11 ❗❗❗ - Install [icpp-pro](https://docs.icpp.world/installation.html), the C++ Canister Development Kit (CDK) for the Internet Computer @@ -140,73 +108,7 @@ WARNING: Currently, the canister can only be build on a `mac` ! # Build & Test models -## storiesICP42Mtok4096.gguf (113.0 Mb) - - This is a fine-tuned model that generates funny stories about ICP & ckBTC. - - The context window for the model is 128 tokens, and that is the maximum length llama.cpp allows for token generation. - - The same deployment & test procedures can be used for the really small test models `stories260Ktok512.gguf` & `stories15Mtok4096.gguf`. Those two models are great for fleshing out the deployment, but the LLMs themselves are too small to create comprehensive stories. - - - Download the model from huggingface: https://huggingface.co/onicai/llama_cpp_canister_models - - Store it in: `models/storiesICP42Mtok4096.gguf` - - - Upload the model: - ```bash - python -m scripts.upload --network local --canister llama_cpp --canister-filename models/storiesICP42Mtok4096.gguf models/storiesICP42Mtok4096.gguf - ``` - - - Load the model into OP memory - - This command will load a model into working memory (Orthogonal Persisted): - ```bash - dfx canister call llama_cpp load_model '(record { args = vec {"--model"; "models/storiesICP42Mtok4096.gguf";} })' - ``` - - - Ensure the canister is ready for Inference, with the model loaded - ```bash - dfx canister call llama_cpp ready - ``` - - - Chat with the LLM: - - ```bash - # Start a new chat - this resets the prompt-cache for this conversation - dfx canister call llama_cpp new_chat '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"} })' - - # Create 50 tokens from a prompt, with caching - dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all";"--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "50"; "-p"; "Dominic loves writing stories"} })' - - # Create another 50 tokens, using the cache - just continue, no new prompt provided - # Repeat until the LLM says it is done or until you hit the context limit with the error: - # `main_: error: prompt is too long` - # - dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all";"--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "50";} })' - - # After a couple of calls, you will get something like this as output, unless you hit the context limit error: - ( - variant { - Ok = record { - status_code = 200 : nat16; - output = ""; - error = ""; - input = " Dominic loves writing stories. He wanted to share his love with others, so he built a fun website on the Internet Computer. With his ckBTC, he bought a cool new book with new characters. Every night before bed, Dominic read his favorite stories with his favorite characters. The end."; - } - }, - ) - - - ######################################## - # Tip. 
Add this to the args vec if you #
-    # want to see how many tokens the      #
-    # canister can generate before it      #
-    # hits the instruction limit           #
-    #                                      #
-    #  ;"--print-token-count"; "1"         #
-    ########################################
-
-## qwen2.5-0,5b-instruct-q8_0.gguf (676 Mb; ~14 tokens max)
+## qwen2.5-0.5b-instruct-q8_0.gguf (676 Mb; ~14 tokens max)

   - Download the model from huggingface: https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF

@@ -236,51 +138,34 @@ WARNING: Currently, the canister can only be build on a `mac` !

   - Chat with the LLM:

-    Different ways to use this model with llama.cpp:
+    Details on how to use the Qwen models with llama.cpp:
     https://qwen.readthedocs.io/en/latest/run_locally/llama.cpp.html

     ```bash
     # Start a new chat - this resets the prompt-cache for this conversation
     dfx canister call llama_cpp new_chat '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"} })'

-    # Next, in a sequence of update calls, process the equivalent of this llama-cli command:
-    # Note though that the canister is not a GPU, so the -fa and -ngl are not passed to the canister
-    ./llama-cli -m /models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf -sp -p "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\ngive me a short introduction to LLMs.<|im_end|>\n<|im_start|>assistant\n" -fa -ngl 80 -n 512 --prompt-cache prompt.cache --prompt-cache-all
-
-    # First call, which will ingest the prompt up to max_tokens_update, which was set above
+    # Repeat this call until prompt_remaining comes back empty. Keep sending the original prompt (or the full conversation) on every call.
     dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\ngive me a short introduction to LLMs.<|im_end|>\n<|im_start|>assistant\n"; "-n"; "512" } })'
+    ...
+    # Once prompt_remaining is empty, repeat this call, with an empty prompt, until the generation is complete:
+    dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; ""; "-n"; "512" } })'
+
+    ...
+
+    # Once generated_eog = true, the LLM is done generating
+
+    # This is the output after several update calls, once eog has been reached:

-    # But... This model can generate 14 tokens, including initial prompt, before hitting the instruction limit
-    # So, split it up...
-    # TODO: build the initial prompt in multiple steps...
-    # For now, skip this step, so, NO system instructions
-    dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"; "-n"; "1" } })'
-    # And for now, just start here.
-    # The user prompt:
-    # (-) sandwiched in between:
-    #     <|im_start|>user\n ... 
<|im_end|>\n - # Tell the LLM that next up is the assistant to generate tokens: - # <|im_start|>assistant\n - # (-) generate 1 token, which is NOT stored in the cache...: - dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; "<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"; "-n"; "1" } })' - - # Continue generating, 10 tokens at a time - dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "10" } })' - # At some point, you will get an <|im_end|> special section back, which indicates the end of the assistant token generation: - ``` - % dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "10" } })' - ( - variant { - Ok = record { - status_code = 200 : nat16; - output = " and recommendation systems.<|im_end|>"; - error = ""; - input = "<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\nLarge Language Models (LLMs) are artificial intelligence models that can generate human-like text or answer questions based on large amounts of text data. They are commonly used in natural language processing (NLP) tasks such as question answering, text summarization, and natural language generation. LLMs can be trained on large amounts of text data, making them highly efficient and scalable for a wide range of applications. They can also be trained on unstructured or semi-structured data, making them useful for tasks such as sentiment analysis, topic modeling,"; - } - }, - ) + + + + # NOTE: This is the equivalent llama-cli call, when running llama.cpp locally + ./llama-cli -m /models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf -sp -p "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\ngive me a short introduction to LLMs.<|im_end|>\n<|im_start|>assistant\n" -fa -ngl 80 -n 512 --prompt-cache prompt.cache --prompt-cache-all + + ``` ######################################## @@ -292,21 +177,50 @@ WARNING: Currently, the canister can only be build on a `mac` ! # ;"--print-token-count"; "1" # ######################################## + - Deployed to mainnet at canister: 6uwoh-vaaaa-aaaag-amema-cai -## TODO: REMOVE IF 2.5 WORKS OUT... qwen2-0_5b-instruct-q8_0.gguf (531 Mb; ~14 tokens max) + To be able to upload the model, I had to change the [compute allocation](https://internetcomputer.org/docs/current/developer-docs/smart-contracts/maintain/settings#compute-allocation) - - Download the model from huggingface: https://huggingface.co/Qwen/Qwen2-0.5B-Instruct-GGUF + ``` + # check the settings + dfx canister status --ic llama_cpp - Store it in: `models/Qwen/Qwen2-0.5B-Instruct-GGUF/qwen2-0_5b-instruct-q8_0.gguf` + # Set a compute allocation (costs a rental fee) + dfx canister update-settings --ic llama_cpp --compute-allocation 50 + ``` + - Cost of uploading the 676 Mb model, using the compute-allocation of 50, cost about: + + ~3 TCycle = $4 + + - Cost of 10 tokens = 31_868_339_839 Cycles = ~0.03 TCycles = ~$0.04 + So, 1 Token = ~$0.004 + +--- +--- + +## storiesICP42Mtok4096.gguf (113.0 Mb) + + This is a fine-tuned model that generates funny stories about ICP & ckBTC. + + The context window for the model is 128 tokens, and that is the maximum length llama.cpp allows for token generation. 
+ + The same deployment & test procedures can be used for the really small test models `stories260Ktok512.gguf` & `stories15Mtok4096.gguf`. Those two models are great for fleshing out the deployment, but the LLMs themselves are too small to create comprehensive stories. + + - Download the model from huggingface: https://huggingface.co/onicai/llama_cpp_canister_models + + Store it in: `models/storiesICP42Mtok4096.gguf` + - Upload the model: ```bash - python -m scripts.upload --network local --canister llama_cpp --canister-filename models/qwen2-0_5b-instruct-q8_0.gguf models/Qwen/Qwen2-0.5B-Instruct-GGUF/qwen2-0_5b-instruct-q8_0.gguf + python -m scripts.upload --network local --canister llama_cpp --canister-filename models/storiesICP42Mtok4096.gguf models/storiesICP42Mtok4096.gguf ``` - - Load the model into OP memory (Do once, and note that it is already done by scripts.upload above) + - Load the model into OP memory + + This command will load a model into working memory (Orthogonal Persisted): ```bash - dfx canister call llama_cpp load_model '(record { args = vec {"--model"; "models/qwen2-0_5b-instruct-q8_0.gguf";} })' + dfx canister call llama_cpp load_model '(record { args = vec {"--model"; "models/storiesICP42Mtok4096.gguf";} })' ``` - Ensure the canister is ready for Inference, with the model loaded @@ -316,50 +230,31 @@ WARNING: Currently, the canister can only be build on a `mac` ! - Chat with the LLM: - Different ways to use this model with llama.cpp: - https://qwen.readthedocs.io/en/latest/run_locally/llama.cpp.html - ```bash # Start a new chat - this resets the prompt-cache for this conversation dfx canister call llama_cpp new_chat '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"} })' - # This is how you would do it if there was no instructions limit... - dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\ngive me a short introduction to LLMs.<|im_end|>\n<|im_start|>assistant\n"; "-n"; "512" } })' + # Create 50 tokens from a prompt, with caching + dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all";"--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "50"; "-p"; "Dominic loves writing stories"} })' - # NOTE: Equivalent direct call to llama-cli - ./llama-cli -m /models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf -sp -p "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\ngive me a short introduction to LLMs.<|im_end|>\n<|im_start|>assistant\n" -fa -ngl 80 -n 512 --prompt-cache prompt.cache --prompt-cache-all + # Create another 50 tokens, using the cache - just continue, no new prompt provided + # Repeat until the LLM says it is done or until you hit the context limit with the error: + # `main_: error: prompt is too long` + # + dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all";"--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "50";} })' - # But... This model can generate 14 tokens, including initial prompt, before hitting the instruction limit - # So, split it up... - # TODO: build the initial prompt in multiple steps... 
- # For now, skip this step, so, NO system instructions - dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"; "-n"; "1" } })' - # And for now, just start here. - # The user prompt: - # (-) sandwiched in between: - # <|im_start|>user\n ... <|im_end|>\n - # Tell the LLM that next up is the assistant to generate tokens: - # <|im_start|>assistant\n - # (-) generate 1 token, which is NOT stored in the cache...: - dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; "<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"; "-n"; "1" } })' - - # Continue generating, 10 tokens at a time - dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "10" } })' - - # At some point, you will get an <|im_end|> special section back, which indicates the end of the assistant token generation: - ``` - % dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "10" } })' + # After a couple of calls, you will get something like this as output, unless you hit the context limit error: ( variant { Ok = record { status_code = 200 : nat16; - output = " and recommendation systems.<|im_end|>"; + output = ""; error = ""; - input = "<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\nLarge Language Models (LLMs) are artificial intelligence models that can generate human-like text or answer questions based on large amounts of text data. They are commonly used in natural language processing (NLP) tasks such as question answering, text summarization, and natural language generation. LLMs can be trained on large amounts of text data, making them highly efficient and scalable for a wide range of applications. They can also be trained on unstructured or semi-structured data, making them useful for tasks such as sentiment analysis, topic modeling,"; + input = " Dominic loves writing stories. He wanted to share his love with others, so he built a fun website on the Internet Computer. With his ckBTC, he bought a cool new book with new characters. Every night before bed, Dominic read his favorite stories with his favorite characters. The end."; } }, ) - ``` + ######################################## # Tip. 
Add this to the args vec if you # diff --git a/canister_ids.json b/canister_ids.json new file mode 100644 index 0000000..e39853d --- /dev/null +++ b/canister_ids.json @@ -0,0 +1,5 @@ +{ + "llama_cpp": { + "ic": "6uwoh-vaaaa-aaaag-amema-cai" + } +} \ No newline at end of file diff --git a/native/main.cpp b/native/main.cpp index b30c7d9..f21ec67 100644 --- a/native/main.cpp +++ b/native/main.cpp @@ -22,8 +22,8 @@ int main() { bool exit_on_fail = true; MockIC mockIC(exit_on_fail); - test_qwen2(mockIC); test_canister_functions(mockIC); + test_qwen2(mockIC); test_tiny_stories(mockIC); // returns 1 if any tests failed diff --git a/native/test_qwen2.cpp b/native/test_qwen2.cpp index 8ad6dca..ade10a8 100644 --- a/native/test_qwen2.cpp +++ b/native/test_qwen2.cpp @@ -42,7 +42,7 @@ void test_qwen2(MockIC &mockIC) { // '(record { args = vec {"--model"; "models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf";} })' candid_in = "4449444c026c01dd9ad28304016d71010002072d2d6d6f64656c466d6f64656c732f5177656e2f5177656e322e352d302e35422d496e7374727563742d474755462f7177656e322e352d302e35622d696e7374727563742d71385f302e67677566"; - // '(variant { Ok = record { status_code = 200 : nat16; input=""; prompt_remaining=""; output="Model succesfully loaded into memory."; error="" } })' + // '(variant { Ok = record { status_code = 200 : nat16; input=""; prompt_remaining=""; output="Model succesfully loaded into memory."; error=""; generated_eog=false : bool } })' candid_out = "4449444c026c05819e846471c897a79907718a88f7f00b719aa1b2f90c7adb92a2c90d716b01bc8a0100010100254d6f64656c2073756363657366756c6c79206c6f6164656420696e746f206d656d6f72792e0000c80000"; @@ -94,49 +94,39 @@ void test_qwen2(MockIC &mockIC) { silent_on_trap, my_principal); // ----------------------------------------------------------------------------- - // Feed the system prompt into the cache: - // (NOTE: for long system prompts, this must be done in a loop) // -sp : special token output enabled - // -n 1 : let it generate 1 token. 
- // -> this is NOT stored in the cache, because the last token never is - // '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "1"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"} })' -> - // '(variant { Ok = record { status_code = 200 : nat16; output = "TODO" } })' + // '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "512"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"} })' -> + // -> '(variant { Ok = record { status_code = 200 : nat16; error = ""; output = ""; input = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>" ; prompt_remaining = "user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"; } })' mockIC.run_test( - "run_update for chat " + std::to_string(i) + " - " + model, run_update, - "4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e0131022d703a3c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a", - "44444", silent_on_trap, my_principal); + "run_update prompt step 1 for chat " + std::to_string(i) + " - " + model, run_update, + "4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e03353132022d708a013c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a", + "4449444c026c05819e846471c897a79907718a88f7f00b719aa1b2f90c7adb92a2c90d716b01bc8a01000101000000463c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3ec80044757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a", silent_on_trap, my_principal); // ----------------------------------------------------------------------------- - // Feed the user prompt into the cache & indicate it is now the turn of the assistant: - // (NOTE: for long user prompts, this must be done in a loop) // -sp : special token output enabled - // -n 1 : let it generate 1 token. 
- // -> this is NOT stored in the cache, because the last token never is - // '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "1"; "-p"; "<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"} })' -> - // '(variant { Ok = record { status_code = 200 : nat16; output = "TODO" } })' + // '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "512"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"} })' -> + // -> '(variant { Ok = record { status_code = 200 : nat16; error = ""; output = ""; input = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant"; prompt_remaining = "\n";} })' mockIC.run_test( - "run_update for chat " + std::to_string(i) + " - " + model, run_update, - "4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e0131022d70503c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a", - "44444", silent_on_trap, my_principal); + "run_update prompt step 2 for chat " + std::to_string(i) + " - " + model, run_update, + "4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e03353132022d708a013c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a", + "4449444c026c05819e846471c897a79907718a88f7f00b719aa1b2f90c7adb92a2c90d716b01bc8a0100010100000089013c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e74c800010a", silent_on_trap, my_principal); // ----------------------------------------------------------------------------- - // Generate tokens from prompt while saving everything to cache, - // without re-reading the model ! 
- // '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "20"; "-p"; "Joe loves writing stories"} })' -> - // '(variant { Ok = record { status_code = 200 : nat16; output = "TODO" } })' + // '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "512"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"} })' -> + // -> can no longer check it, because the LLM generated tokens mockIC.run_test( - "run_update for chat " + std::to_string(i) + " - " + model, run_update, - "4449444c026c01dd9ad28304016d7101000d0e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c0a2d2d73616d706c65727305746f705f70062d2d74656d7003302e31072d2d746f702d7003302e39022d6e023230022d70194a6f65206c6f7665732077726974696e672073746f72696573", + "run_update prompt step 3 for chat " + std::to_string(i) + " - " + model, run_update, + "4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e03353132022d708a013c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a", "", silent_on_trap, my_principal); // ----------------------------------------------------------------------------- - // Continue generating tokens while using & saving the cache, without re-reading the model - // '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "20"; "-p"; ""} })' -> - // '(variant { Ok = record { status_code = 200 : nat16; output = "TODO" } })' + // Once there is no prompt_remaining, it is totally ok to send an empty prompt, and just let it generate new tokens + // '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "512"; "-p"; ""} })' -> + // -> can no longer check it, because the LLM generated tokens mockIC.run_test( - "run_update for chat " + std::to_string(i) + " continued - " + model, - run_update, - "4449444c026c01dd9ad28304016d7101000d0e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c0a2d2d73616d706c65727305746f705f70062d2d74656d7003302e31072d2d746f702d7003302e39022d6e023230022d7000", + "run_update prompt step 4 for chat " + std::to_string(i) + " - " + model, run_update, + "4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e03353132022d7000", "", silent_on_trap, my_principal); + } } \ No newline at end of file diff --git a/scripts/upload.py b/scripts/upload.py index cd1492b..a9eaab5 100644 --- a/scripts/upload.py +++ b/scripts/upload.py @@ -12,6 +12,7 @@ # pylint: disable=invalid-name, too-few-public-methods, no-member, too-many-statements import sys +import time from pathlib import Path from typing import Generator from .ic_py_canister import get_canister @@ -122,14 +123,45 @@ def main() -> int: print(f"- chunk[0] = {chunk[0]}") print(f"- chunk[-1] = {chunk[-1]}") - response = 
canister_instance.file_upload_chunk( - { - "filename": canister_filename, - "chunk": chunk, - "chunksize": chunksize, - "offset": offset, - } - ) # pylint: disable=no-member + """Handle exceptions in case the Ingress is busy and it throws this message: + Traceback (most recent call last): + File "", line 198, in _run_module_as_main + File "", line 88, in _run_code + File "llama_cpp_canister/scripts/upload.py", line 186, in + sys.exit(main()) + ^^^^^^ + File "llama_cpp_canister/scripts/upload.py", line 125, in main + response = canister_instance.file_upload_chunk( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File ".../miniconda3/envs/icpp_pro_w_llama_cpp_canister/lib/python3.11/site-packages/ic/canister.py", line 65, in __call__ + res = self.agent.update_raw( + ^^^^^^^^^^^^^^^^^^^^^^ + File ".../miniconda3/envs/icpp_pro_w_llama_cpp_canister/lib/python3.11/site-packages/ic/agent.py", line 116, in update_raw + raise Exception('Rejected: ' + result.decode()) + Exception: Rejected: Ingress message 0x33efc6c16bb217df26b8940c5ae4f3f7beaa795a252227923f77b711f4b61fe3 timed out waiting to start executing. + """ + max_retries = 5 + retry_delay = 2 # seconds + for attempt in range(1, max_retries + 1): + try: + response = canister_instance.file_upload_chunk( + { + "filename": canister_filename, + "chunk": chunk, + "chunksize": chunksize, + "offset": offset, + } + ) # pylint: disable=no-member + break # Exit the loop if the request is successful + except Exception as e: + print(f"Attempt {attempt} failed: {e}") + if attempt == max_retries: + print("Max retries reached. Failing.") + raise # Re-raise the exception if max retries are reached, which will exit the program + else: + print(f"Retrying in {retry_delay} seconds...") + time.sleep(retry_delay) # Wait before retrying + if "Ok" in response[0].keys(): print(f"OK! 
filesize = {response[0]['Ok']['filesize']}") else: diff --git a/src/llama_cpp.did b/src/llama_cpp.did index 0ec1125..f59f8d9 100644 --- a/src/llama_cpp.did +++ b/src/llama_cpp.did @@ -15,11 +15,12 @@ type MaxTokensRecord = record { }; type RunOutputRecord = record { - status_code: StatusCode; - input: text; + status_code: StatusCode; + output: text; + conversation: text; + error: text; prompt_remaining: text; - output: text; - error: text + generated_eog: bool; }; type OutputRecordResult = variant { Ok : RunOutputRecord; @@ -80,7 +81,7 @@ service : { // model endpoints load_model : (InputRecord) -> (OutputRecordResult); set_max_tokens : (MaxTokensRecord) -> (StatusCodeRecordResult); - get_max_tokens : () -> (MaxTokensRecord); + get_max_tokens : () -> (MaxTokensRecord) query; // up & down load of files file_download_chunk : (FileDownloadInputRecord) -> (FileDownloadRecordResult) query; diff --git a/src/main_.cpp b/src/main_.cpp index 26ff904..60f815e 100644 --- a/src/main_.cpp +++ b/src/main_.cpp @@ -136,7 +136,12 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector 0) { + prompt_remaining = prompt; + } + // ICPP-PATCH-END + + LOG_TEE("prompt: \"%s\"\n", log_tostr(prompt)); + LOG_TEE("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + LOG_TEE("# tokens: %s\n", std::to_string(embd_inp.size()).c_str()); } // Should not run without any tokens @@ -567,9 +576,20 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only int n_session_consumed = 0; int n_past_guidance = 0; + // ICPP-PATCH-START + // We can only handle max_tokens evaluations per call + int n_eval_total = 0; + // We break out of the while loop below a little bit different at end of generation + // we actually first go back one more time, to store the eog token in the conversation & cache, + // while llama.cpp does not do that + // We do want to break the while loop cleanly, to go through the memory cleanup at the end + generated_eog = false; + bool break_while_loop = false; + // ICPP-PATCH-END + std::vector input_tokens; g_input_tokens = &input_tokens; std::vector output_tokens; g_output_tokens = &output_tokens; - // std::ostringstream output_ss; g_output_ss = &output_ss; + // std::ostringstream output_ss; g_output_ss = &output_ss; // ICPP_PATCH g_output_ss = &output_ss; // ICPP-PATCH: we pass this in via argument, // so we can return it to canister caller std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode @@ -704,6 +724,17 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only n_past++; n_session_consumed++; + // ICPP-PATCH-START + // Keep track of the processed conversation tokens and the remaining prompt + int id = embd[i]; + const std::string token_str = llama_token_to_piece(ctx, id, params.special); + conversation_ss << token_str; + + if (prompt_remaining.find(token_str) == 0) { + prompt_remaining.erase(0, token_str.length()); + } + // ICPP-PATCH-END + if (n_session_consumed >= (int) session_tokens.size()) { ++i; break; @@ -763,6 +794,13 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only n_eval = params.n_batch; } + // ICPP-PATCH-START + // We must process the predictions in multiple calls due to IC's instruction limit + if (max_tokens > 0 && n_eval >= max_tokens) { + n_eval = max_tokens; + } + // ICPP-PATCH-END + LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { 
@@ -780,13 +818,50 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only if (params.n_print > 0 && n_past % params.n_print == 0) { LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); } + + // ICPP-PATCH-START + // Keep track of the processed conversation tokens and the remaining prompt + for (int j=0; j 0 && n_eval_total >= max_tokens)) { + if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { + session_tokens.insert(session_tokens.end(), embd.begin(), embd.begin() + n_eval); + n_session_consumed = session_tokens.size(); + } + break_while_loop = true; + break; + } + // ICPP-PATCH-END + } + // ICPP-PATCH-START + if (break_while_loop) { + break; } + // ICPP-PATCH-END if (!embd.empty() && !path_session.empty()) { session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); n_session_consumed = session_tokens.size(); } } + // ICPP-PATCH-START + if (break_while_loop) { + break; + } + // ICPP-PATCH-END embd.clear(); embd_guidance.clear(); @@ -794,6 +869,9 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only if ((int) embd_inp.size() <= n_consumed && !is_interacting) { // optionally save the session on first sample (for faster prompt loading next time) if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) { + // ICPP-PATCH-START + std::cout << "saving " << std::to_string(session_tokens.size()) << " tokens to session file " << path_session << std::endl; + // ICPP-PATCH-END need_to_save_session = false; llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); @@ -817,7 +895,7 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only LOG("n_remain: %d\n", n_remain); } else { // some user input remains from prompt or interaction, forward it to processing - LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); + LOG_TEE("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); @@ -845,9 +923,6 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only if (embd.size() > 1) { // Incoming Requested Tokens input_tokens.push_back(id); - // ICPP-PATCH-START - input_ss << token_str; - // ICPP-PATCH-END } else { // Outgoing Generated Tokens output_tokens.push_back(id); @@ -1034,7 +1109,11 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only // end of generation if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) { LOG_TEE(" [end of text]\n"); - break; + // break; // we do not break the loop here, but we do it above + // once the eog token has been decoded and added to conversation_ss & session_tokens + // ICPP-PATCH-START + generated_eog = true; + // ICPP-PATCH-END } // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. @@ -1045,17 +1124,10 @@ int main_(int argc, char ** argv, std::string principal_id, bool load_model_only } } - // ICPP-PATCH-START - // The last token is not yet stored in session_tokens - // Don't do this in general... Revisit when building:w a prompt in multiple steps.. 
- // if (!embd.empty() && !path_session.empty()) { - // session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); - // n_session_consumed = session_tokens.size(); - // } - - // ICPP-PATCH-END - if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { + // ICPP-PATCH-START + std::cout << "\nSaving " << std::to_string(session_tokens.size()) << " tokens to session file " << path_session << std::endl; + // ICPP-PATCH-END LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); } diff --git a/src/main_.h b/src/main_.h index 4fd3631..46675c0 100644 --- a/src/main_.h +++ b/src/main_.h @@ -3,7 +3,7 @@ #include int main_(int argc, char **argv, std::string principal_id, bool load_model_only, - std::string &icpp_error_msg, std::ostringstream &input_ss, - std::ostringstream &output_ss); + std::string &icpp_error_msg, std::ostringstream &conversation_ss, + std::ostringstream &output_ss, const uint64_t &max_tokens, std::string &prompt_remaining, bool &generated_eog); void free_model(); void reset_static_memory(); \ No newline at end of file diff --git a/src/max_tokens.h b/src/max_tokens.h index 8194159..29757f4 100644 --- a/src/max_tokens.h +++ b/src/max_tokens.h @@ -5,4 +5,7 @@ #include void set_max_tokens() WASM_SYMBOL_EXPORTED("canister_update set_max_tokens"); -void get_max_tokens() WASM_SYMBOL_EXPORTED("canister_query get_max_tokens"); \ No newline at end of file +void get_max_tokens() WASM_SYMBOL_EXPORTED("canister_query get_max_tokens"); + +extern uint64_t max_tokens_update; +extern uint64_t max_tokens_query; \ No newline at end of file diff --git a/src/model.cpp b/src/model.cpp index eaad3f5..4aeee5c 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -7,6 +7,7 @@ #include "ready.h" #include "upload.h" #include "utils.h" +#include "max_tokens.h" #include #include @@ -31,21 +32,24 @@ void load_model() { // Call main_, just like it is called in the llama-cli app std::string icpp_error_msg; - std::ostringstream input_ss; + std::ostringstream conversation_ss; std::ostringstream output_ss; bool load_model_only = true; + std::string prompt_remaining; + bool generated_eog = false; int result = main_(argc, argv.data(), principal_id, load_model_only, - icpp_error_msg, input_ss, output_ss); + icpp_error_msg, conversation_ss, output_ss, max_tokens_update, prompt_remaining, generated_eog); // Exit if there was an error if (result != 0) { CandidTypeRecord r_out; r_out.append("status_code", CandidTypeNat16{Http::StatusCode::InternalServerError}); // 500 - r_out.append("input", CandidTypeText{""}); - r_out.append("prompt_remaining", CandidTypeText{""}); + r_out.append("conversation", CandidTypeText{""}); r_out.append("output", CandidTypeText{""}); r_out.append("error", CandidTypeText{icpp_error_msg}); + r_out.append("prompt_remaining", CandidTypeText{""}); + r_out.append("generated_eog", CandidTypeBool{generated_eog}); ic_api.to_wire(CandidTypeVariant{"Err", r_out}); return; } @@ -55,10 +59,11 @@ void load_model() { CandidTypeRecord r_out; r_out.append("status_code", CandidTypeNat16{Http::StatusCode::OK}); // 200 - r_out.append("input", CandidTypeText{""}); - r_out.append("prompt_remaining", CandidTypeText{""}); + r_out.append("conversation", CandidTypeText{""}); r_out.append("output", CandidTypeText{"Model succesfully loaded into memory."}); r_out.append("error", CandidTypeText{""}); + r_out.append("prompt_remaining", CandidTypeText{""}); + 
r_out.append("generated_eog", CandidTypeBool{generated_eog}); ic_api.to_wire(CandidTypeVariant{"Ok", r_out}); } \ No newline at end of file diff --git a/src/run.cpp b/src/run.cpp index 3e2962f..efc8d85 100644 --- a/src/run.cpp +++ b/src/run.cpp @@ -3,6 +3,7 @@ #include "http.h" #include "main_.h" #include "utils.h" +#include "max_tokens.h" #include #include @@ -94,10 +95,11 @@ void new_chat() { r_out.append( "status_code", CandidTypeNat16{Http::StatusCode::InternalServerError}); // 500 - r_out.append("input", CandidTypeText{""}); - r_out.append("prompt_remaining", CandidTypeText{""}); + r_out.append("conversation", CandidTypeText{""}); r_out.append("output", CandidTypeText{""}); r_out.append("error", CandidTypeText{msg}); + r_out.append("prompt_remaining", CandidTypeText{""}); + r_out.append("generated_eog", CandidTypeBool{false}); ic_api.to_wire(CandidTypeVariant{"Err", r_out}); return; } @@ -113,14 +115,15 @@ void new_chat() { // Return output over the wire CandidTypeRecord r_out; r_out.append("status_code", CandidTypeNat16{Http::StatusCode::OK}); // 200 - r_out.append("input", CandidTypeText{""}); - r_out.append("prompt_remaining", CandidTypeText{""}); + r_out.append("conversation", CandidTypeText{""}); r_out.append("output", CandidTypeText{msg}); r_out.append("error", CandidTypeText{""}); + r_out.append("prompt_remaining", CandidTypeText{""}); + r_out.append("generated_eog", CandidTypeBool{false}); ic_api.to_wire(CandidTypeVariant{"Ok", r_out}); } -void run(IC_API &ic_api) { +void run(IC_API &ic_api, const uint64_t &max_tokens) { CandidTypePrincipal caller = ic_api.get_caller(); std::string principal_id = caller.get_text(); @@ -129,20 +132,24 @@ void run(IC_API &ic_api) { // Call main_, just like it is called in the llama-cli app std::string icpp_error_msg; - std::ostringstream input_ss; // input tokens (from prompt or session cache) - std::ostringstream output_ss; // output tokens (generated during this call) + std::ostringstream conversation_ss; // input tokens (from session cache + prompt) + std::ostringstream output_ss; // output tokens (generated during this call) + std::string prompt_remaining; // part of the prompt not processed due to max_tokens + bool generated_eog = false; // this is set to true if llama.cpp is generating new tokens, and it generated an eog (End Of Generation) bool load_model_only = false; int result = main_(argc, argv.data(), principal_id, load_model_only, - icpp_error_msg, input_ss, output_ss); + icpp_error_msg, conversation_ss, output_ss, max_tokens, prompt_remaining, generated_eog); // Exit if there was an error if (result != 0) { CandidTypeRecord r_out; r_out.append("status_code", CandidTypeNat16{Http::StatusCode::InternalServerError}); // 500 - r_out.append("input", CandidTypeText{input_ss.str()}); + r_out.append("conversation", CandidTypeText{conversation_ss.str()}); r_out.append("output", CandidTypeText{output_ss.str()}); r_out.append("error", CandidTypeText{icpp_error_msg}); + r_out.append("prompt_remaining", CandidTypeText{prompt_remaining}); + r_out.append("generated_eog", CandidTypeBool{generated_eog}); ic_api.to_wire(CandidTypeVariant{"Err", r_out}); return; } @@ -150,18 +157,19 @@ void run(IC_API &ic_api) { // Return output over the wire CandidTypeRecord r_out; r_out.append("status_code", CandidTypeNat16{Http::StatusCode::OK}); // 200 - r_out.append("input", CandidTypeText{input_ss.str()}); - r_out.append("prompt_remaining", CandidTypeText{"--TODO--"}); + r_out.append("conversation", CandidTypeText{conversation_ss.str()}); r_out.append("output", 
CandidTypeText{output_ss.str()}); r_out.append("error", CandidTypeText{""}); + r_out.append("prompt_remaining", CandidTypeText{prompt_remaining}); + r_out.append("generated_eog", CandidTypeBool{generated_eog}); ic_api.to_wire(CandidTypeVariant{"Ok", r_out}); } void run_query() { IC_API ic_api(CanisterQuery{std::string(__func__)}, false); - run(ic_api); + run(ic_api, max_tokens_query); } void run_update() { IC_API ic_api(CanisterUpdate{std::string(__func__)}, false); - run(ic_api); + run(ic_api, max_tokens_update); }
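
The update-call flow introduced in the README section of this patch (keep re-sending the full prompt until `prompt_remaining` comes back empty, then call with an empty prompt until `generated_eog` is true) can be driven by a small client-side loop. Below is a minimal sketch of such a loop; it is illustrative only and not part of this patch, and it assumes the textual Candid output printed by `dfx` can be checked with `grep` for the `prompt_remaining` and `generated_eog` fields defined in `src/llama_cpp.did`.

```bash
#!/usr/bin/env bash
# Illustrative driver loop (not part of this patch).
# Assumes: canister name llama_cpp, prompt cache my_cache/prompt.cache,
# and that dfx prints the RunOutputRecord fields in textual Candid form.

PROMPT='<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\ngive me a short introduction to LLMs.<|im_end|>\n<|im_start|>assistant\n'

# Start a new chat - this resets the prompt-cache for this conversation
dfx canister call llama_cpp new_chat '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"} })'

# Phase 1: keep re-sending the full prompt until prompt_remaining comes back empty
while true; do
  OUT="$(dfx canister call llama_cpp run_update "(record { args = vec {\"--prompt-cache\"; \"my_cache/prompt.cache\"; \"--prompt-cache-all\"; \"-sp\"; \"-p\"; \"$PROMPT\"; \"-n\"; \"512\" } })")"
  echo "$OUT"
  if echo "$OUT" | grep -q 'prompt_remaining = ""'; then
    break
  fi
done

# Phase 2: generate with an empty prompt until the canister reports generated_eog = true
while true; do
  OUT="$(dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; ""; "-n"; "512" } })')"
  echo "$OUT"
  if echo "$OUT" | grep -q 'generated_eog = true'; then
    break
  fi
done
```

Each iteration is bounded by the canister's `max_tokens_update` setting, so both loops simply retry until the canister reports that the prompt has been fully ingested and generation has reached end-of-generation.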