
Commit

qwen2.5
icppWorld committed Sep 26, 2024
1 parent 5729ebe commit a6a1f7c
Showing 11 changed files with 274 additions and 263 deletions.
237 changes: 66 additions & 171 deletions README.md

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions canister_ids.json
@@ -0,0 +1,5 @@
{
"llama_cpp": {
"ic": "6uwoh-vaaaa-aaaag-amema-cai"
}
}
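
The new canister_ids.json pins the mainnet principal of the deployed llama_cpp canister. As a minimal sketch (not part of this commit), a Python script could read that id straight from the file:

import json
from pathlib import Path

# Sketch only: read the IC mainnet canister id that canister_ids.json
# assigns to the "llama_cpp" canister.
canister_ids = json.loads(Path("canister_ids.json").read_text())
llama_cpp_id = canister_ids["llama_cpp"]["ic"]  # "6uwoh-vaaaa-aaaag-amema-cai"
print(f"llama_cpp mainnet canister id: {llama_cpp_id}")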
2 changes: 1 addition & 1 deletion native/main.cpp
@@ -22,8 +22,8 @@ int main() {
bool exit_on_fail = true;
MockIC mockIC(exit_on_fail);

test_qwen2(mockIC);
test_canister_functions(mockIC);
test_qwen2(mockIC);
test_tiny_stories(mockIC);

// returns 1 if any tests failed
52 changes: 21 additions & 31 deletions native/test_qwen2.cpp
@@ -42,7 +42,7 @@ void test_qwen2(MockIC &mockIC) {
// '(record { args = vec {"--model"; "models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf";} })'
candid_in =
"4449444c026c01dd9ad28304016d71010002072d2d6d6f64656c466d6f64656c732f5177656e2f5177656e322e352d302e35422d496e7374727563742d474755462f7177656e322e352d302e35622d696e7374727563742d71385f302e67677566";
// '(variant { Ok = record { status_code = 200 : nat16; input=""; prompt_remaining=""; output="Model succesfully loaded into memory."; error="" } })'
// '(variant { Ok = record { status_code = 200 : nat16; input=""; prompt_remaining=""; output="Model succesfully loaded into memory."; error=""; generated_eog=false : bool } })'
candid_out =
"4449444c026c05819e846471c897a79907718a88f7f00b719aa1b2f90c7adb92a2c90d716b01bc8a0100010100254d6f64656c2073756363657366756c6c79206c6f6164656420696e746f206d656d6f72792e0000c80000";

@@ -94,49 +94,39 @@ void test_qwen2(MockIC &mockIC) {
silent_on_trap, my_principal);

// -----------------------------------------------------------------------------
// Feed the system prompt into the cache:
// (NOTE: for long system prompts, this must be done in a loop)
// -sp : special token output enabled
// -n 1 : let it generate 1 token.
// -> this is NOT stored in the cache, because the last token never is
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "1"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"} })' ->
// '(variant { Ok = record { status_code = 200 : nat16; output = "TODO" } })'
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "512"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"} })' ->
// -> '(variant { Ok = record { status_code = 200 : nat16; error = ""; output = ""; input = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>" ; prompt_remaining = "user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"; } })'
mockIC.run_test(
"run_update for chat " + std::to_string(i) + " - " + model, run_update,
"4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e0131022d703a3c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a",
"44444", silent_on_trap, my_principal);
"run_update prompt step 1 for chat " + std::to_string(i) + " - " + model, run_update,
"4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e03353132022d708a013c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a",
"4449444c026c05819e846471c897a79907718a88f7f00b719aa1b2f90c7adb92a2c90d716b01bc8a01000101000000463c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3ec80044757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a", silent_on_trap, my_principal);

// -----------------------------------------------------------------------------
// Feed the user prompt into the cache & indicate it is now the turn of the assistant:
// (NOTE: for long user prompts, this must be done in a loop)
// -sp : special token output enabled
// -n 1 : let it generate 1 token.
// -> this is NOT stored in the cache, because the last token never is
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "1"; "-p"; "<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"} })' ->
// '(variant { Ok = record { status_code = 200 : nat16; output = "TODO" } })'
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "512"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"} })' ->
// -> '(variant { Ok = record { status_code = 200 : nat16; error = ""; output = ""; input = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant"; prompt_remaining = "\n";} })'
mockIC.run_test(
"run_update for chat " + std::to_string(i) + " - " + model, run_update,
"4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e0131022d70503c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a",
"44444", silent_on_trap, my_principal);
"run_update prompt step 2 for chat " + std::to_string(i) + " - " + model, run_update,
"4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e03353132022d708a013c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a",
"4449444c026c05819e846471c897a79907718a88f7f00b719aa1b2f90c7adb92a2c90d716b01bc8a0100010100000089013c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e74c800010a", silent_on_trap, my_principal);

// -----------------------------------------------------------------------------
// Generate tokens from prompt while saving everything to cache,
// without re-reading the model !
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "20"; "-p"; "Joe loves writing stories"} })' ->
// '(variant { Ok = record { status_code = 200 : nat16; output = "TODO" } })'
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "512"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"} })' ->
// -> can no longer check it, because the LLM generated tokens
mockIC.run_test(
"run_update for chat " + std::to_string(i) + " - " + model, run_update,
"4449444c026c01dd9ad28304016d7101000d0e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c0a2d2d73616d706c65727305746f705f70062d2d74656d7003302e31072d2d746f702d7003302e39022d6e023230022d70194a6f65206c6f7665732077726974696e672073746f72696573",
"run_update prompt step 3 for chat " + std::to_string(i) + " - " + model, run_update,
"4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e03353132022d708a013c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a",
"", silent_on_trap, my_principal);

// -----------------------------------------------------------------------------
// Continue generating tokens while using & saving the cache, without re-reading the model
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "--samplers"; "top_p"; "--temp"; "0.1"; "--top-p"; "0.9"; "-n"; "20"; "-p"; ""} })' ->
// '(variant { Ok = record { status_code = 200 : nat16; output = "TODO" } })'
// Once there is no prompt_remaining, it is totally ok to send an empty prompt, and just let it generate new tokens
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "512"; "-p"; ""} })' ->
// -> can no longer check it, because the LLM generated tokens
mockIC.run_test(
"run_update for chat " + std::to_string(i) + " continued - " + model,
run_update,
"4449444c026c01dd9ad28304016d7101000d0e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c0a2d2d73616d706c65727305746f705f70062d2d74656d7003302e31072d2d746f702d7003302e39022d6e023230022d7000",
"run_update prompt step 4 for chat " + std::to_string(i) + " - " + model, run_update,
"4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e03353132022d7000",
"", silent_on_trap, my_principal);

}
}
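
The four prompt steps above exercise the qwen2.5 prompt-cache workflow: the full chat-template prompt is resent with --prompt-cache until prompt_remaining comes back empty, after which an empty -p lets the canister keep generating until generated_eog is true. A client-side sketch of that loop in Python, assuming an ic-py canister_instance whose run_update method accepts the same record-style argument used in scripts/upload.py (agent, identity and candid wiring omitted; names are illustrative, not part of this commit):

# Sketch only: mirrors the response[0]["Ok"] access pattern of scripts/upload.py.
# The model is assumed to already be loaded via load_model.
PROMPT = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\nExplain Large Language Models.<|im_end|>\n"
    "<|im_start|>assistant\n"
)

def chat(canister_instance, prompt: str = PROMPT, max_steps: int = 50) -> str:
    """Ingest the prompt into the cache step by step, then generate until eog."""
    output = ""
    for _ in range(max_steps):
        response = canister_instance.run_update(
            {
                "args": [
                    "--prompt-cache", "my_cache/prompt.cache",
                    "--prompt-cache-all",
                    "-sp",
                    "-n", "512",
                    "-p", prompt,
                ]
            }
        )
        if "Ok" not in response[0]:
            raise RuntimeError(f"run_update failed: {response[0]}")
        result = response[0]["Ok"]
        output += result["output"]
        if result["generated_eog"]:
            break
        # While prompt_remaining is non-empty, resend the same full prompt so
        # the canister keeps ingesting it into the cache; once it is empty,
        # switch to an empty prompt and just let the model generate.
        if not result["prompt_remaining"]:
            prompt = ""
    return output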
48 changes: 40 additions & 8 deletions scripts/upload.py
@@ -12,6 +12,7 @@
# pylint: disable=invalid-name, too-few-public-methods, no-member, too-many-statements

import sys
import time
from pathlib import Path
from typing import Generator
from .ic_py_canister import get_canister
@@ -122,14 +123,45 @@ def main() -> int:
print(f"- chunk[0] = {chunk[0]}")
print(f"- chunk[-1] = {chunk[-1]}")

response = canister_instance.file_upload_chunk(
{
"filename": canister_filename,
"chunk": chunk,
"chunksize": chunksize,
"offset": offset,
}
) # pylint: disable=no-member
"""Handle exceptions in case the Ingress is busy and it throws this message:
Traceback (most recent call last):
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "llama_cpp_canister/scripts/upload.py", line 186, in <module>
sys.exit(main())
^^^^^^
File "llama_cpp_canister/scripts/upload.py", line 125, in main
response = canister_instance.file_upload_chunk(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".../miniconda3/envs/icpp_pro_w_llama_cpp_canister/lib/python3.11/site-packages/ic/canister.py", line 65, in __call__
res = self.agent.update_raw(
^^^^^^^^^^^^^^^^^^^^^^
File ".../miniconda3/envs/icpp_pro_w_llama_cpp_canister/lib/python3.11/site-packages/ic/agent.py", line 116, in update_raw
raise Exception('Rejected: ' + result.decode())
Exception: Rejected: Ingress message 0x33efc6c16bb217df26b8940c5ae4f3f7beaa795a252227923f77b711f4b61fe3 timed out waiting to start executing.
"""
max_retries = 5
retry_delay = 2 # seconds
for attempt in range(1, max_retries + 1):
try:
response = canister_instance.file_upload_chunk(
{
"filename": canister_filename,
"chunk": chunk,
"chunksize": chunksize,
"offset": offset,
}
) # pylint: disable=no-member
break # Exit the loop if the request is successful
except Exception as e:
print(f"Attempt {attempt} failed: {e}")
if attempt == max_retries:
print("Max retries reached. Failing.")
raise # Re-raise the exception if max retries are reached, which will exit the program
else:
print(f"Retrying in {retry_delay} seconds...")
time.sleep(retry_delay) # Wait before retrying

if "Ok" in response[0].keys():
print(f"OK! filesize = {response[0]['Ok']['filesize']}")
else:
11 changes: 6 additions & 5 deletions src/llama_cpp.did
@@ -15,11 +15,12 @@ type MaxTokensRecord = record {
};

type RunOutputRecord = record {
status_code: StatusCode;
input: text;
status_code: StatusCode;
output: text;
conversation: text;
error: text;
prompt_remaining: text;
output: text;
error: text;
generated_eog: bool;
};
type OutputRecordResult = variant {
Ok : RunOutputRecord;
@@ -80,7 +81,7 @@ service : {
// model endpoints
load_model : (InputRecord) -> (OutputRecordResult);
set_max_tokens : (MaxTokensRecord) -> (StatusCodeRecordResult);
get_max_tokens : () -> (MaxTokensRecord);
get_max_tokens : () -> (MaxTokensRecord) query;

// up & down load of files
file_download_chunk : (FileDownloadInputRecord) -> (FileDownloadRecordResult) query;
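
The service change above also declares get_max_tokens as a query method, so reading the token limits is a read-only call that does not go through consensus and returns faster than an update. With the same ic-py canister-instance pattern as scripts/upload.py, a read could look like this (sketch only; the MaxTokensRecord field names are not shown in this diff):

# Sketch only: ic-py dispatches methods declared as query in the .did file
# as read-only query calls, so this does not modify canister state.
response = canister_instance.get_max_tokens()
print(response[0])  # the MaxTokensRecord returned by the canister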
