
Commit

qwen2.5
icppWorld committed Sep 26, 2024
1 parent 5729ebe commit a6a1f7c
Showing 11 changed files with 274 additions and 263 deletions.
237 changes: 66 additions & 171 deletions README.md

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions canister_ids.json
@@ -0,0 +1,5 @@
{
"llama_cpp": {
"ic": "6uwoh-vaaaa-aaaag-amema-cai"
}
}
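
The new canister_ids.json pins the mainnet principal of the deployed llama_cpp canister. As a minimal sketch (not part of this commit), a Python script could read that id straight from the file:

import json
from pathlib import Path

# Sketch only: read the IC mainnet canister id that canister_ids.json
# assigns to the "llama_cpp" canister.
canister_ids = json.loads(Path("canister_ids.json").read_text())
llama_cpp_id = canister_ids["llama_cpp"]["ic"]  # "6uwoh-vaaaa-aaaag-amema-cai"
print(f"llama_cpp mainnet canister id: {llama_cpp_id}")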
2 changes: 1 addition & 1 deletion native/main.cpp
@@ -22,8 +22,8 @@ int main() {
bool exit_on_fail = true;
MockIC mockIC(exit_on_fail);

test_qwen2(mockIC);
test_canister_functions(mockIC);
test_qwen2(mockIC);
test_tiny_stories(mockIC);

// returns 1 if any tests failed
52 changes: 21 additions & 31 deletions native/test_qwen2.cpp
@@ -42,7 +42,7 @@ void test_qwen2(MockIC &mockIC) {
// '(record { args = vec {"--model"; "models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf";} })'
candid_in =
"4449444c026c01dd9ad28304016d71010002072d2d6d6f64656c466d6f64656c732f5177656e2f5177656e322e352d302e35422d496e7374727563742d474755462f7177656e322e352d302e35622d696e7374727563742d71385f302e67677566";
// '(variant { Ok = record { status_code = 200 : nat16; input=""; prompt_remaining=""; output="Model succesfully loaded into memory."; error="" } })'
// '(variant { Ok = record { status_code = 200 : nat16; input=""; prompt_remaining=""; output="Model succesfully loaded into memory."; error=""; generated_eog=false : bool } })'
candid_out =
"4449444c026c05819e846471c897a79907718a88f7f00b719aa1b2f90c7adb92a2c90d716b01bc8a0100010100254d6f64656c2073756363657366756c6c79206c6f6164656420696e746f206d656d6f72792e0000c80000";

@@ -94,49 +94,39 @@ void test_qwen2(MockIC &mockIC) {
silent_on_trap, my_principal);

// -----------------------------------------------------------------------------
// Feed the system prompt into the cache:
// (NOTE: for long system prompts, this must be done in a loop)
// -sp : special token output enabled
// -n 1 : let it generate 1 token.
// -> this is NOT stored in the cache, because the last token never is
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "1"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"} })' ->
// '(variant { Ok = record { status_code = 200 : nat16; output = "TODO" } })'
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "512"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"} })' ->
// -> '(variant { Ok = record { status_code = 200 : nat16; error = ""; output = ""; input = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>" ; prompt_remaining = "user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"; } })'
mockIC.run_test(
"run_update for chat " + std::to_string(i) + " - " + model, run_update,
"4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e0131022d703a3c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a",
"44444", silent_on_trap, my_principal);
"run_update prompt step 1 for chat " + std::to_string(i) + " - " + model, run_update,
"4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e03353132022d708a013c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a",
"4449444c026c05819e846471c897a79907718a88f7f00b719aa1b2f90c7adb92a2c90d716b01bc8a01000101000000463c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3ec80044757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a", silent_on_trap, my_principal);

// -----------------------------------------------------------------------------
// Feed the user prompt into the cache & indicate it is now the turn of the assistant:
// (NOTE: for long user prompts, this must be done in a loop)
// -sp : special token output enabled
// -n 1 : let it generate 1 token.
// -> this is NOT stored in the cache, because the last token never is
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "1"; "-p"; "<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"} })' ->
// '(variant { Ok = record { status_code = 200 : nat16; output = "TODO" } })'
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "512"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"} })' ->
// -> '(variant { Ok = record { status_code = 200 : nat16; error = ""; output = ""; input = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant"; prompt_remaining = "\n";} })'
mockIC.run_test(
"run_update for chat " + std::to_string(i) + " - " + model, run_update,
"4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e0131022d70503c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a",
"44444", silent_on_trap, my_principal);
"run_update prompt step 2 for chat " + std::to_string(i) + " - " + model, run_update,
"4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e03353132022d708a013c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a",
"4449444c026c05819e846471c897a79907718a88f7f00b719aa1b2f90c7adb92a2c90d716b01bc8a0100010100000089013c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e74c800010a", silent_on_trap, my_principal);

// -----------------------------------------------------------------------------
// Generate tokens from prompt while saving everything to cache,
// without re-reading the model !
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "20"; "-p"; "Joe loves writing stories"} })' ->
// '(variant { Ok = record { status_code = 200 : nat16; output = "TODO" } })'
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "512"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"} })' ->
// -> can no longer check it, because the LLM generated tokens
mockIC.run_test(
"run_update for chat " + std::to_string(i) + " - " + model, run_update,
"4449444c026c01dd9ad28304016d7101000d0e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c0a2d2d73616d706c65727305746f705f70062d2d74656d7003302e31072d2d746f702d7003302e39022d6e023230022d70194a6f65206c6f7665732077726974696e672073746f72696573",
"run_update prompt step 3 for chat " + std::to_string(i) + " - " + model, run_update,
"4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e03353132022d708a013c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a",
"", silent_on_trap, my_principal);

// -----------------------------------------------------------------------------
// Continue generating tokens while using & saving the cache, without re-reading the model
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "20"; "-p"; ""} })' ->
// '(variant { Ok = record { status_code = 200 : nat16; output = "TODO" } })'
// Once there is no prompt_remaining, it is totally ok to send an empty prompt, and just let it generate new tokens
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "512"; "-p"; ""} })' ->
// -> can no longer check it, because the LLM generated tokens
mockIC.run_test(
"run_update for chat " + std::to_string(i) + " continued - " + model,
run_update,
"4449444c026c01dd9ad28304016d7101000d0e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c0a2d2d73616d706c65727305746f705f70062d2d74656d7003302e31072d2d746f702d7003302e39022d6e023230022d7000",
"run_update prompt step 4 for chat " + std::to_string(i) + " - " + model, run_update,
"4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e03353132022d7000",
"", silent_on_trap, my_principal);

}
}
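
The four prompt steps above exercise the qwen2.5 prompt-cache workflow: the full chat-template prompt is resent with --prompt-cache until prompt_remaining comes back empty, after which an empty -p lets the canister keep generating until generated_eog is true. A client-side sketch of that loop in Python, assuming an ic-py canister_instance whose run_update method accepts the same record-style argument used in scripts/upload.py (agent, identity and candid wiring omitted; names are illustrative, not part of this commit):

# Sketch only: mirrors the response[0]["Ok"] access pattern of scripts/upload.py.
# The model is assumed to already be loaded via load_model.
PROMPT = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\nExplain Large Language Models.<|im_end|>\n"
    "<|im_start|>assistant\n"
)

def chat(canister_instance, prompt: str = PROMPT, max_steps: int = 50) -> str:
    """Ingest the prompt into the cache step by step, then generate until eog."""
    output = ""
    for _ in range(max_steps):
        response = canister_instance.run_update(
            {
                "args": [
                    "--prompt-cache", "my_cache/prompt.cache",
                    "--prompt-cache-all",
                    "-sp",
                    "-n", "512",
                    "-p", prompt,
                ]
            }
        )
        if "Ok" not in response[0]:
            raise RuntimeError(f"run_update failed: {response[0]}")
        result = response[0]["Ok"]
        output += result["output"]
        if result["generated_eog"]:
            break
        # While prompt_remaining is non-empty, resend the same full prompt so
        # the canister keeps ingesting it into the cache; once it is empty,
        # switch to an empty prompt and just let the model generate.
        if not result["prompt_remaining"]:
            prompt = ""
    return output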
48 changes: 40 additions & 8 deletions scripts/upload.py
@@ -12,6 +12,7 @@
# pylint: disable=invalid-name, too-few-public-methods, no-member, too-many-statements

import sys
import time
from pathlib import Path
from typing import Generator
from .ic_py_canister import get_canister
@@ -122,14 +123,45 @@ def main() -> int:
print(f"- chunk[0] = {chunk[0]}")
print(f"- chunk[-1] = {chunk[-1]}")

response = canister_instance.file_upload_chunk(
{
"filename": canister_filename,
"chunk": chunk,
"chunksize": chunksize,
"offset": offset,
}
) # pylint: disable=no-member
"""Handle exceptions in case the Ingress is busy and it throws this message:
Traceback (most recent call last):
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "llama_cpp_canister/scripts/upload.py", line 186, in <module>
sys.exit(main())
^^^^^^
File "llama_cpp_canister/scripts/upload.py", line 125, in main
response = canister_instance.file_upload_chunk(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".../miniconda3/envs/icpp_pro_w_llama_cpp_canister/lib/python3.11/site-packages/ic/canister.py", line 65, in __call__
res = self.agent.update_raw(
^^^^^^^^^^^^^^^^^^^^^^
File ".../miniconda3/envs/icpp_pro_w_llama_cpp_canister/lib/python3.11/site-packages/ic/agent.py", line 116, in update_raw
raise Exception('Rejected: ' + result.decode())
Exception: Rejected: Ingress message 0x33efc6c16bb217df26b8940c5ae4f3f7beaa795a252227923f77b711f4b61fe3 timed out waiting to start executing.
"""
max_retries = 5
retry_delay = 2 # seconds
for attempt in range(1, max_retries + 1):
try:
response = canister_instance.file_upload_chunk(
{
"filename": canister_filename,
"chunk": chunk,
"chunksize": chunksize,
"offset": offset,
}
) # pylint: disable=no-member
break # Exit the loop if the request is successful
except Exception as e:
print(f"Attempt {attempt} failed: {e}")
if attempt == max_retries:
print("Max retries reached. Failing.")
raise # Re-raise the exception if max retries are reached, which will exit the program
else:
print(f"Retrying in {retry_delay} seconds...")
time.sleep(retry_delay) # Wait before retrying

if "Ok" in response[0].keys():
print(f"OK! filesize = {response[0]['Ok']['filesize']}")
else:
11 changes: 6 additions & 5 deletions src/llama_cpp.did
@@ -15,11 +15,12 @@ type MaxTokensRecord = record {
};

type RunOutputRecord = record {
status_code: StatusCode;
input: text;
status_code: StatusCode;
output: text;
conversation: text;
error: text;
prompt_remaining: text;
output: text;
error: text;
generated_eog: bool;
};
type OutputRecordResult = variant {
Ok : RunOutputRecord;
@@ -80,7 +81,7 @@ service : {
// model endpoints
load_model : (InputRecord) -> (OutputRecordResult);
set_max_tokens : (MaxTokensRecord) -> (StatusCodeRecordResult);
get_max_tokens : () -> (MaxTokensRecord);
get_max_tokens : () -> (MaxTokensRecord) query;

// up & down load of files
file_download_chunk : (FileDownloadInputRecord) -> (FileDownloadRecordResult) query;
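
The service change above also declares get_max_tokens as a query method, so reading the token limits is a read-only call that does not go through consensus and returns faster than an update. With the same ic-py canister-instance pattern as scripts/upload.py, a read could look like this (sketch only; the MaxTokensRecord field names are not shown in this diff):

# Sketch only: ic-py dispatches methods declared as query in the .did file
# as read-only query calls, so this does not modify canister state.
response = canister_instance.get_max_tokens()
print(response[0])  # the MaxTokensRecord returned by the canister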
