diff --git a/.github/workflows/cicd-mac.yml b/.github/workflows/cicd-mac.yml new file mode 100644 index 0000000..ac4f6e3 --- /dev/null +++ b/.github/workflows/cicd-mac.yml @@ -0,0 +1,113 @@ +name: cicd-mac + +on: + workflow_dispatch: + push: + paths: + - "scripts/**" + - "native/**" + - "src/**" + - "test/**" + - "Makefile" + - ".github/trigger.txt" + - ".github/workflows/cicd-mac.yml" + +jobs: + all-mac: + name: all-mac + strategy: + matrix: + python-version: [3.8, 3.11] + os: [macos-latest] + + runs-on: ${{ matrix.os }} + + steps: + # https://github.com/marketplace/actions/setup-miniconda + - uses: conda-incubator/setup-miniconda@v3 + with: + python-version: ${{ matrix.python-version }} + + # ------------------------------------------------------------------- + # Checkout llama_cpp_canister & llama_cpp_onicai_fork as nested directory + - name: checkout llama_cpp_canister + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: checkout llama_cpp_onicai_fork + uses: actions/checkout@v4 + with: + repository: onicai/llama_cpp_onicai_fork + ref: onicai # Specify the branch name here + path: src/llama_cpp_onicai_fork + fetch-depth: 1 # Get just the last commit + submodules: 'recursive' + # ------------------------------------------------------------------- + + - name: install + shell: bash -l {0} # activates the default conda environment ('test') + run: | + echo "Installing tool chains & dependencies" + pwd + make summary + make install-dfx + make install-python + make install-homebrew-mac + make install-jp-mac + + - name: versions + shell: bash -l {0} + run: | + echo "icpp --version: $(icpp --version)" + echo "clang++ --version: $(clang++ --version)" + echo "g++ --version: $(g++ --version)" + #echo "wasm2wat version: $(wasm2wat --version)" + echo "pip version : $(pip --version)" + echo "python version : $(python --version)" + echo "jp version : $(jp --version)" + #echo "rustc version : $(rustc --version)" + echo "dfx version : $(dfx --version)" + echo "Ensure conda works properly" + conda info + which pip + which python + which icpp + + - name: install-wasi-sdk + shell: bash -l {0} + run: | + echo "Installing wasi-sdk" + icpp install-wasi-sdk + + - name: install-rust + shell: bash -l {0} + run: | + echo "Installing rust" + icpp install-rust + + - name: build-info-cpp + shell: bash -l {0} + run: | + make build-info-cpp + + - name: all-static + shell: bash -l {0} + run: | + make all-static + + - name: test-llm-wasm + shell: bash -l {0} + run: | + make test-llm-wasm + + - name: test-llm-native + shell: bash -l {0} + run: | + make test-llm-native + + # TODO + # - name: all-tests + # shell: bash -l {0} + # run: | + # make all-tests diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd-ubuntu.yml-TODO similarity index 60% rename from .github/workflows/cicd.yml rename to .github/workflows/cicd-ubuntu.yml-TODO index 16d4409..4d636ed 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd-ubuntu.yml-TODO @@ -1,33 +1,47 @@ -name: cicd +name: cicd-ubuntu on: workflow_dispatch: push: paths: - - "icpp_llama2/**" + - "scripts/**" + - "native/**" + - "src/**" + - "test/**" - "Makefile" - ".github/trigger.txt" - - ".github/workflows/cicd.yml" - -env: - PYTHON_VERSION: 3.11 + - ".github/workflows/cicd-ubuntu.yml" jobs: - all: - name: all - runs-on: ubuntu-latest + all-ubuntu: + name: all-ubuntu + strategy: + matrix: + python-version: [3.8, 3.11] + os: [ubuntu-latest] + + runs-on: ${{ matrix.os }} + steps: # https://github.com/marketplace/actions/setup-miniconda - - uses: 
conda-incubator/setup-miniconda@v2 + - uses: conda-incubator/setup-miniconda@v3 with: - python-version: ${{ env.PYTHON_VERSION }} + python-version: ${{ matrix.python-version }} # ------------------------------------------------------------------- - # Checkout icpp-llm - - name: checkout icpp-llm - uses: actions/checkout@v3 + # Checkout llama_cpp_canister & llama_cpp_onicai_fork as nested directory + - name: checkout llama_cpp_canister + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: checkout llama_cpp_onicai_fork + uses: actions/checkout@v4 with: + repository: onicai/llama_cpp_onicai_fork + path: src/llama_cpp_onicai_fork fetch-depth: 0 + submodules: 'recursive' # ------------------------------------------------------------------- - name: install @@ -41,7 +55,7 @@ jobs: make install-dfx make install-python make install-clang-ubuntu - make install-jp + make install-jp-ubuntu - name: versions shell: bash -l {0} @@ -49,9 +63,11 @@ jobs: echo "icpp --version: $(icpp --version)" echo "clang++ --version: $(clang++ --version)" echo "g++ --version: $(g++ --version)" + #echo "wasm2wat version: $(wasm2wat --version)" echo "pip version : $(pip --version)" echo "python version : $(python --version)" echo "jp version : $(jp --version)" + #echo "rustc version : $(rustc --version)" echo "dfx version : $(dfx --version)" echo "Ensure conda works properly" conda info @@ -65,11 +81,11 @@ jobs: echo "Installing wasi-sdk" icpp install-wasi-sdk - - name: download models + - name: install-rust shell: bash -l {0} run: | - make icpp_llama2_get_stories15M - make icpp_llama2_get_stories260K + echo "Installing rust" + icpp install-rust - name: all-tests shell: bash -l {0} diff --git a/Makefile b/Makefile index 6c8b3d8..2e69e96 100644 --- a/Makefile +++ b/Makefile @@ -47,22 +47,12 @@ CLANG_TIDY = $(ICPP_COMPILER_ROOT)/bin/clang-tidy # CI/CD - Phony Makefile targets # .PHONY: all-tests -all-tests: all-static test-all-llms - -# TODO: change to gguf from llama_cpp_... of onicai's huggingface repo -.PHONY: icpp_llama2_get_stories260K -icpp_llama2_get_stories260K: - cd icpp_llama2 && \ - mkdir -p stories260K && \ - wget -P stories260K https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin && \ - wget -P stories260K https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin - -# TODO: change to gguf from llama_cpp_... 
of onicai's huggingface repo -.PHONY: icpp_llama2_get_stories15M -icpp_llama2_get_stories15M: - cd icpp_llama2 && \ - mkdir -p models && \ - wget -P models https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin +all-tests: all-static test-llm-wasm test-llm-native + +.PHONY: build-info-cpp +build-info-cpp: + echo "Creating src/llama_cpp_onicai_fork/common/build-info.cpp" + @sh src/llama_cpp_onicai_fork/scripts/build-info.sh clang > src/llama_cpp_onicai_fork/common/build-info.cpp .PHONY: summary summary: @@ -75,19 +65,24 @@ summary: @echo ICPP_COMPILER_ROOT=$(ICPP_COMPILER_ROOT) @echo "-------------------------------------------------------------" -# TODO: change to testing llama_cpp -.PHONY: test-all-llms -test-all-llms: +.PHONY: test-llm-native +test-llm-native: + dfx identity use default + build-info-cpp + icpp build-native + ./build-native/mockic.exe + +.PHONY: test-llm-wasm +test-llm-wasm: dfx identity use default - @echo "#########################################" - @echo "####### testing icpp_llama2 #############" - @echo "#########################################" - cd icpp_llama2 && \ - icpp build-native && \ - ./build-native/mockic.exe && \ - ./demo.sh && \ - pytest && \ - dfx stop + build-info-cpp + icpp build-wasm + dfx stop + dfx start --clean --background + dfx deploy + python -m scripts.upload models/stories260Ktok512.gguf + pytest -vv + dfx stop .PHONY: all-static all-static: \ @@ -164,11 +159,18 @@ install-didc: @echo "Installed successfully in:" @echo /usr/local/bin/didc -# TODO: update as in icpp-pro, for ubuntu & mac -.PHONY: install-jp -install-jp: +.PHONY: install-jp-ubuntu +install-jp-ubuntu: sudo apt-get update && sudo apt-get install jp +.PHONY: install-jp-mac +install-jp-mac: + brew install jp + +.PHONY: install-homebrew-mac +install-homebrew-mac: + /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" + .PHONY: install-python install-python: pip install --upgrade pip diff --git a/README-NOTES-ON-PORT.md b/README-NOTES-ON-PORT.md deleted file mode 100644 index 131879d..0000000 --- a/README-NOTES-ON-PORT.md +++ /dev/null @@ -1,141 +0,0 @@ -# Port of llama.cpp to the Internet Computer - -THIS IS OUT OF DATE -- WE NOW USE wasi2ic, AND MANY THINGS CAN REMAIN UNCHANGED - -## Files required for the llama2 LLM inference engine - -First, we determined what files are needed, and listed them in icpp.toml: - -Notes: - - main_.cpp is the canister equivalent of llama.cpp main.cpp - - All files: src/llama_cpp_onicai_fork/unicode-data.cpp, src/llama_cpp_onicai_fork/unicode.cpp, src/llama_cpp_onicai_fork/common/json-schema-to-grammar.cpp, src/llama_cpp_onicai_fork/common/build-info.cpp, , src/llama_cpp_onicai_fork/common/grammar-parser.cpp, src/llama_cpp_onicai_fork/common/sampling.cpp, src/llama_cpp_onicai_fork/common/common.cpp, , src/llama_cpp_onicai_fork/llama.cpp, src/*.cpp, src/llama_cpp_onicai_fork/ggml.c, src/llama_cpp_onicai_fork/ggml-alloc.c, src/llama_cpp_onicai_fork/ggml-backend.c, src/llama_cpp_onicai_fork/ggml-quants.c - - -We made sure it worked properly using a native build, and then proceeded to port it to the iC. - -## Porting to IC - -When porting the llama.cpp application to a Smart Contract running on the IC, the following had to be changed: -2. No exceptions -3. No curl (to dowload a model file) -4. No OS or machine specific capabilities (APPLE vs WIN32 vs ...) -5. No CUDA (GPU acceleration) -6. No threading (Multi Threading acceleration) -7. No Microsoft Visual C++ compiler -8. 
No main()+console() program -> instead use canister request/response API - -Because llama.cpp is designed to be super efficient when running local, by default it does a -lot of checking against the hardware it is being compiled on. There are no build-in options -to bypass those checks, and the only option is to patch it. - - -## 1. No file IO -Wherever there was a file-io or directory creation code, we simply outcommented it. - -## 2. No exceptions -The IC does not handle exceptions. Our approach is to: -- replace all `throw` statements with an IC_API::trap -- outcomment try - catch -- outcomment `// #include ` - -For example: -``` -# In function: bool gpt_params_parse_ex -# replace: -throw std::invalid_argument("error: unknown argument: " + arg); -# with: -IC_API::trap(std::string("INVALID ARGUMENT: ") + "error: unknown argument: " + arg); - -# Then no need for a try-catch, since canister already trapped: -bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { - bool result = true; - // try { - if (!gpt_params_parse_ex(argc, argv, params)) { - gpt_print_usage(argc, argv, gpt_params()); - exit(0); - } - // } - // catch (const std::invalid_argument & ex) { - // fprintf(stderr, "%s\n", ex.what()); - // gpt_print_usage(argc, argv, gpt_params()); - // exit(1); - // } - return result; -} -``` - -## 3. No curl (to dowload a model file) -We do not use curl to dowload model files, so outcommented all sections with: -``` -#if defined(LLAMA_USE_CURL) -... -#endif -``` - -## 4. No OS or machine specific capabilities (APPLE vs WIN32 vs ...) -Since we're building the wasm on a Linux/Mac/Windows machine, we need to -outcomment preprocessor sections like these, else the compiler will throw errors: -``` -#if defined(__APPLE__) && defined(__MACH__) -... -# - -#ifdef __linux__ -... -#endif - -#if defined(_WIN32) -... -#endif -``` - -## 5. No CUDA (GPU acceleration) - - -## 6. No threading (Multi Threading acceleration) -The llama.cpp application does not provide an option to compile without threading. -This made it quite involved to port, and these are patches applied: - -- patch use of `std::thread::hardware_concurrency()` - ``` - // AB PATCH - // unsigned int n_threads = std::thread::hardware_concurrency(); - unsigned int n_threads = 1; - ``` - -- patch and outcomment code that checks cpu details, eg: - ``` - /** - * Returns number of CPUs on system that are useful for math. - */ - int get_math_cpu_count() { - // #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__) - // int cpu_count = sysconf(_SC_NPROCESSORS_ONLN); - // if (cpu_count < 1) { - // return get_num_physical_cores(); - // } - // if (is_hybrid_cpu()) { - // cpu_set_t affinity; - // if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) { - // int result = count_math_cpus(cpu_count); - // pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity); - // if (result > 0) { - // return result; - // } - // } - // } - // #endif - return get_num_physical_cores(); - } - ``` - -## 7. No Microsoft Visual C++ compiler -Nothing needs to be changed, but just mentioning it here that we -use the clang++ compiler, and statements like this will be ignored: -``` -#if defined(_MSC_VER) -... -#endif -``` - -## 8. 
No main() program -> instead use canister endpoints API \ No newline at end of file diff --git a/README.md b/README.md index 216c930..7fa2fb3 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![llama_cpp_canister](https://github.com/onicai/llama_cpp_canister/actions/workflows/cicd.yml/badge.svg)](https://github.com/onicai/llama_cpp_canister/actions/workflows/cicd.yml) +[![llama_cpp_canister](https://github.com/onicai/llama_cpp_canister/actions/workflows/cicd-mac.yml/badge.svg)](https://github.com/onicai/llama_cpp_canister/actions/workflows/cicd-mac.yml) ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) [ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp) for the Internet Computer. @@ -6,6 +6,8 @@ # Getting Started +Currently, the canister can only be build on a `mac` ! + - Install the C++ development environment for the Internet Computer ([docs](https://docs.icpp.world/installation.html)): - Clone the repo and it's children: @@ -33,6 +35,7 @@ conda activate llama_cpp_canister # Install the python dependencies + # From root of llama_cpp_canister repo: pip install -r requirements.txt ``` @@ -46,6 +49,7 @@ ``` _(Note: On Windows, just install dfx in wsl, and icpp-pro in PowerShell will know where to find it. )_ + _(Note 2: It does not yet work on Windows... Stay tuned... )_ - Build & Deploy a pre-trained model to canister `llama_cpp`: @@ -72,21 +76,21 @@ (variant { Ok = record { status_code = 200 : nat16 } }) ``` - Upload the 260K parameter model: - _(We have included this tiny fine-tuned model)_ + _(We included this fine-tuned model in the repo)_ ```bash python -m scripts.upload --network local --canister llama_cpp models/stories260Ktok512.gguf ``` - Test it with dfx. - - Generate a story, using the `run_query` or `run_update` call: + - Generate 20 tokens, using the `run_query` or `run_update` call: ```bash - $ dfx canister call llama_cpp run_query '(record { args = vec {"--model"; "models/stories260Ktok512.gguf"; "--prompt"; "Patrick loves ice-cream. On a hot day "; "--n-predict"; "600"; "--ctx-size"; "128"} })' + $ dfx canister call llama_cpp run_query '(record { args = vec {"--model"; "models/stories260Ktok512.gguf"; "--prompt"; "Patrick loves ice-cream. On a hot day "; "--n-predict"; "20"; "--ctx-size"; "128"} })' - $ dfx canister call llama_cpp run_update '(record { args = vec {"--model"; "models/stories260Ktok512.gguf"; "--prompt"; "Patrick loves ice-cream. On a hot day "; "--n-predict"; "600"; "--ctx-size"; "128"} })' + $ dfx canister call llama_cpp run_update '(record { args = vec {"--model"; "models/stories260Ktok512.gguf"; "--prompt"; "Patrick loves ice-cream. On a hot day "; "--n-predict"; "20"; "--ctx-size"; "128"} })' - -> See token generation in the dfx log window, until it hits the instruction limit + -> See token generation in the dfx log window # Models @@ -111,13 +115,11 @@ We will start by expanding our tests to the tiny stories models: | stories42Mtok32000.guff | todo | | stories110Mtok32000.guff | todo | -# Next Steps - -## Expand models that can run +# TODO -The LLM must be able generate tokens until it runs into the instructions limit. +## Run larger models -I want to be able to run variations of two types of models: +We focus on two types of models: 1. 
the TinyStories models: @@ -135,36 +137,40 @@ I want to be able to run variations of two types of models: ## Optimizations +In order to run the larger models, following optimizations are planned: + - Don't read model as part of `run_query` or `run_update`, but read it only once - Use SIMD, to reduce number of instructions required to generate a token - Use quantized models +- ... other items as IC capabilities grow ... ## Canister with sequence of update calls Because a single update call is never enough, due to the instructions limit, a sequence of update calls is required. This is non-trivial, because the state of the LLM at the end of each update call must be saved. -I did an initial study on the data structures of the llama.cpp code, but it is not immediately clear what data must be preserved to be able to continue the token generation in a sub-sequent call. The internal data structure is very different from llama2.c, which had a very nice `RunState` data-structure that contained everything needed. +The llama.cpp code has a caching mechanism that likely can be used for this purpose. -The llama.cpp code has a caching mechanism that potentially can be used for this purpose, but this is not yet proven. More research will be needed. +## Completions endpoint -Even so, the steps required for this task are clear: +Implement an endpoint that is very similar to the industry standard completions API. This will ensure that the LLM canister can be easily integrated into both Web3 and Web2 applications. -- Save LLM state after update call -- Expand API endpoint to allow a sequence of update calls for token generation until done +## Integrate into ICGPT -## Completions endpoint +Once finished, we will integrate it all into [ICGPT](https://icgpt.icpp.world/). -Implement an endpoint that is very similar to the industry standard completions API. This will ensure that the LLM canister can be easily integrated into both Web3 and Web2 applications. +## Support build on Ubuntu & Windows -# Integrate into ICGPT +Currently, the build process only works on a mac. +We will expand it to also work on Ubuntu & Windows -Once finished, we will integrate it into [ICGPT](https://icgpt.icpp.world/). +# Appendix A: DFINITY DeAI grant project +This project is sponsored by a DFINITY DeAI grant. -# Appendix A: Status of DFINITY DeAI grant project: ICGPT V2 +The status for milestone 1 & 2 are summarized below. -# Milestone 1 (Status on July 21, 2024) +# Milestone 2 (Status on July 21, 2024) The following tasks were completed: @@ -177,8 +183,10 @@ The following tasks were completed: - Write python scripts to upload the model weights and tokenizer - Test it locally - We were able to run the `stories260Ktok512.gguf` model + - Token generation works, until it hits the instruction limit +- Implement CI/CD pipeline using GitHub actions -# 30 day sprint (COMPLETED on May 12, 2024) +# Milestone 1 - 30 day sprint (COMPLETED May 12, 2024) As part of an initial 30 day sprint, the following tasks were completed: @@ -200,6 +208,4 @@ All models are stored on [huggingface/onicai/llama_cpp_canister_models](https:// ## study code and create implementation plan -We dug deep into the code and studied it by stepping through it in the debugger with VS Code. We have gained sufficient understanding to create a solid implementation plan. - -Work on this has already begun. \ No newline at end of file +We dug deep into the code and studied it by stepping through it in the debugger with VS Code. 
We have gained sufficient understanding to create a solid implementation plan. \ No newline at end of file diff --git a/demo.ps1 b/demo.ps1 index 47e7a1b..399451b 100644 --- a/demo.ps1 +++ b/demo.ps1 @@ -18,7 +18,7 @@ wsl dfx stop Write-Host " " Write-Host "--------------------------------------------------" Write-Host "Starting the local network in wsl as a PowerShell background job" -$jobName = "greet" +$jobName = "llama_cpp" # Stop the job if it already exist # Get a list of all the background jobs with a specific name @@ -53,30 +53,34 @@ icpp build-wasm --to-compile all Write-Host " " Write-Host "--------------------------------------------------" Write-Host "Deploying the wasm to a canister on the local network" -wsl --% dfx deploy +wsl --% . ~/.local/share/dfx/env; dfx deploy + +####################################################################### +echo " " +echo "--------------------------------------------------" +echo "Uploading the *.gguf model file" +python -m scripts.upload models/stories260Ktok512.gguf +# python -m scripts.upload models/stories15Mtok4096.gguf +# python -m scripts.upload ../../repos_hf/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-q4.gguf --canister-file models/Phi-3-mini-4k-instruct-q4.gguf ####################################################################### Write-Host " " Write-Host "--------------------------------------------------" Write-Host "Running some manual tests with dfx" -wsl --% dfx canister call greet greet_0 -wsl --% dfx canister call greet greet_1 -wsl --% dfx canister call greet greet_2 '("C++ Developer")' -wsl --% dfx canister call greet greet_3 '(record { "icpp version" = 1 : int; OS = "Linux" : text })' -wsl --% dfx canister call greet greet_4 '(record { 6 = 42 : int; 9 = 43 : int }, record { 7 = 44 : int; 10 = 45 : int })' -wsl --% dfx canister call greet greet_json '("{\"name\": \"AJ\"}")' +wsl --% . ~/.local/share/dfx/env; dfx canister call llama_cpp run_query '(record { args = vec {"--model"; "models/stories260Ktok512.gguf"; "--prompt"; "Patrick loves ice-cream. On a hot day "; "--n-predict"; "25"; "--ctx-size"; "128"} })' +wsl --% . ~/.local/share/dfx/env; dfx canister call llama_cpp run_update '(record { args = vec {"--model"; "models/stories260Ktok512.gguf"; "--prompt"; "Patrick loves ice-cream. On a hot day "; "--n-predict"; "25"; "--ctx-size"; "128"} })' ####################################################################### Write-Host " " Write-Host "--------------------------------------------------" Write-Host "Running the full smoketests with pytest" -pytest --network=local +pytest -vv --network=local ####################################################################### Write-Host " " Write-Host "--------------------------------------------------" Write-Host "Stopping the local network in wsl" -wsl dfx stop +wsl --% . ~/.local/share/dfx/env; dfx stop Native build on Windows is temporarily broken.. 
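For reference, the smoke tests invoked by `pytest -vv --network=local` drive the same `run_query` / `run_update` endpoints from Python via ic-py. Below is a minimal sketch of the `run_query` call made above with dfx; the `get_canister` helper (modeled on the one in scripts/ic_py_canister.py) and the `src/llama_cpp.did` candid path are assumptions, not verified names from this repo.

```python
# Minimal sketch: mirror the dfx `run_query` call from the demo scripts using ic-py.
# Assumptions (not verified against this repo): a scripts/ic_py_canister.py module
# exposing get_canister(), and a candid file at src/llama_cpp.did.
from pathlib import Path

from scripts.ic_py_canister import get_canister  # assumed helper module

canister = get_canister(
    canister_name="llama_cpp",
    candid_path=Path("src/llama_cpp.did"),  # assumed candid path
    network="local",
)

# The candid record `(record { args = vec {...} })` maps onto a plain Python dict
# holding a list of strings; ic-py encodes it using the canister's candid file.
response = canister.run_query(
    {
        "args": [
            "--model", "models/stories260Ktok512.gguf",
            "--prompt", "Patrick loves ice-cream. On a hot day ",
            "--n-predict", "25",
            "--ctx-size", "128",
        ]
    }
)
print(response)
```

Swapping `run_query` for `run_update` exercises the update path, which is what the pytest suite and the demo scripts above do with dfx.
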
####################################################################### diff --git a/demo.sh b/demo.sh index f524f76..20f34bb 100755 --- a/demo.sh +++ b/demo.sh @@ -36,21 +36,22 @@ dfx deploy echo " " echo "--------------------------------------------------" echo "Uploading the *.gguf model file" -python -m scripts.upload ../../repos_hf/llama_cpp_canister_models/stories260Ktok512.gguf --canister-file stories260Ktok512.gguf -python -m scripts.upload ../../repos_hf/llama_cpp_canister_models/stories15Mtok4096.gguf --canister-file stories15Mtok4096.gguf -python -m scripts.upload ../../repos_hf/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-q4.gguf --canister-file Phi-3-mini-4k-instruct-q4.gguf +python -m scripts.upload models/stories260Ktok512.gguf +# python -m scripts.upload models/stories15Mtok4096.gguf +# python -m scripts.upload ../../repos_hf/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-q4.gguf --canister-file models/Phi-3-mini-4k-instruct-q4.gguf ####################################################################### echo " " echo "--------------------------------------------------" echo "Running some manual tests with dfx" -dfx canister call llama_cpp run_query '(record { args = vec {"--model"; "stories260Ktok512.gguf"; "--prompt"; "Patrick loves ice-cream. On a hot day "; "--n-predict"; "600"; "--ctx-size"; "128"} })' +dfx canister call llama_cpp run_query '(record { args = vec {"--model"; "models/stories260Ktok512.gguf"; "--prompt"; "Patrick loves ice-cream. On a hot day "; "--n-predict"; "25"; "--ctx-size"; "128"} })' +dfx canister call llama_cpp run_update '(record { args = vec {"--model"; "models/stories260Ktok512.gguf"; "--prompt"; "Patrick loves ice-cream. On a hot day "; "--n-predict"; "25"; "--ctx-size"; "128"} })' ####################################################################### echo " " echo "--------------------------------------------------" echo "Running the full smoketests with pytest" -pytest --network=local +pytest -vv --network=local ####################################################################### echo "--------------------------------------------------" diff --git a/icpp.toml b/icpp.toml index 7ed6015..72d233f 100644 --- a/icpp.toml +++ b/icpp.toml @@ -15,7 +15,6 @@ cpp_paths = [ cpp_include_dirs = [ "src/llama_cpp_onicai_fork", "src/llama_cpp_onicai_fork/common", - "src/vendors/*", ] # NOTE: Adding compile flag "-msimd128" might be too much. It will compile everything with simd # Alternative is to add it at granular level in the code, like: diff --git a/native/main.cpp b/native/main.cpp index 1809987..ad4a128 100644 --- a/native/main.cpp +++ b/native/main.cpp @@ -29,8 +29,7 @@ int main() { mockIC.run_test( "run_query", run_query, "4449444c026c01dd9ad28304016d71010008072d2d6d6f64656c1d6d6f64656c732f73746f726965733236304b746f6b3531322e67677566082d2d70726f6d7074265061747269636b206c6f766573206963652d637265616d2e204f6e206120686f7420646179200b2d2d6e2d70726564696374033630300a2d2d6374782d73697a6503313238", - "", - silent_on_trap, my_principal); + "", silent_on_trap, my_principal); // ----------------------------------------------------------------------------- // '(record { args = vec {"--model"; "models/stories260Ktok512.gguf"; "--prompt"; "Patrick loves ice-cream. 
On a hot day "; "--n-predict"; "600"; "--ctx-size"; "128"} })' -> @@ -38,8 +37,7 @@ int main() { mockIC.run_test( "run_update", run_update, "4449444c026c01dd9ad28304016d71010008072d2d6d6f64656c1d6d6f64656c732f73746f726965733236304b746f6b3531322e67677566082d2d70726f6d7074265061747269636b206c6f766573206963652d637265616d2e204f6e206120686f7420646179200b2d2d6e2d70726564696374033630300a2d2d6374782d73697a6503313238", - "", - silent_on_trap, my_principal); + "", silent_on_trap, my_principal); // returns 1 if any tests failed return mockIC.test_summary(); diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..c050fbd --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +testpaths = + test \ No newline at end of file diff --git a/scripts/TODO/ic_py_canister.py b/scripts/TODO/ic_py_canister.py deleted file mode 100644 index efbf06b..0000000 --- a/scripts/TODO/ic_py_canister.py +++ /dev/null @@ -1,105 +0,0 @@ -"""Returns the ic-py Canister instance, for calling the endpoints.""" - -import sys -import platform -import subprocess -from pathlib import Path -from typing import Optional -from ic.canister import Canister # type: ignore -from ic.client import Client # type: ignore -from ic.identity import Identity # type: ignore -from ic.agent import Agent # type: ignore -from icpp.run_shell_cmd import run_shell_cmd - -ROOT_PATH = Path(__file__).parent.parent - -# We use dfx to get some information. -# On Windows, dfx must be installed in wsl. -DFX = "dfx" -RUN_IN_POWERSHELL = False -if platform.win32_ver()[0]: - DFX = "wsl --% dfx" - RUN_IN_POWERSHELL = True - - -def run_dfx_command(cmd: str) -> Optional[str]: - """Runs dfx command as a subprocess""" - try: - return run_shell_cmd( - cmd, - capture_output=True, - run_in_powershell=RUN_IN_POWERSHELL, - ).rstrip("\n") - except subprocess.CalledProcessError as e: - print(f"Failed dfx command: '{cmd}' with error: \n{e.output}") - sys.exit(1) - return None - - -def get_canister( - canister_name: str, - candid_path: Path, - network: str = "local", - canister_id: Optional[str] = "", -) -> Canister: - """Returns an ic_py Canister instance""" - - # Check if the network is up - print(f"--\nChecking if the {network} network is up...") - run_dfx_command(f"{DFX} ping {network} ") - print("Ok!") - - # Set the network URL - if network == "local": - replica_port = run_dfx_command(f"{DFX} info replica-port ") - replica_rev = run_dfx_command(f"{DFX} info replica-rev ") - webserver_port = run_dfx_command(f"{DFX} info webserver-port ") - networks_json_path = run_dfx_command(f"{DFX} info networks-json-path ") - print(f"replica-port = {replica_port}") - print(f"replica-rev = {replica_rev}") - print(f"webserver-port = {webserver_port}") - print(f"networks-json-path = {networks_json_path}") - - network_url = f"http://localhost:{replica_port}" - else: - # https://smartcontracts.org/docs/interface-spec/index.html#http-interface - network_url = "https://ic0.app" - - print(f"Network URL = {network_url}") - - # Get the name of the current identity - identity_whoami = run_dfx_command(f"{DFX} identity whoami ") - print(f"Using identity = {identity_whoami}") - - # Try to get the id of the canister if not provided explicitly - # This only works from the same directory as where you deployed from. 
- # So we also provide the option to just pass in the canister_id directly - if canister_id == "": - canister_id = run_dfx_command( - f"{DFX} canister --network {network} id {canister_name} " - ) - print(f"Canister ID = {canister_id}") - - # Get the private key of the current identity - private_key = run_dfx_command(f"{DFX} identity export {identity_whoami} ") - - # Create an Identity instance using the private key - identity = Identity.from_pem(private_key) - - # Create an HTTP client instance for making HTTPS calls to the IC - # https://smartcontracts.org/docs/interface-spec/index.html#http-interface - client = Client(url=network_url) - - # Create an IC agent to communicate with IC canisters - agent = Agent(identity, client) - - # Read canister's candid from file - with open( - candid_path, - "r", - encoding="utf-8", - ) as f: - canister_did = f.read() - - # Create a Canister instance - return Canister(agent=agent, canister_id=canister_id, candid=canister_did) diff --git a/scripts/TODO/llama2_c_sizer.py b/scripts/TODO/llama2_c_sizer.py deleted file mode 100644 index 0a01bd2..0000000 --- a/scripts/TODO/llama2_c_sizer.py +++ /dev/null @@ -1,267 +0,0 @@ -"""Calculates the require resources to deploy a Llama2 model to an IC canister""" - -# pylint: disable=invalid-name -import sys -import struct -from pathlib import Path -from typing import TextIO - -ROOT_PATH = Path(__file__).parent.parent - -# For 32 bit system -SIZE_OF_FLOAT = 4 # bytes -SIZE_OF_POINTER = 4 # bytes -SIZE_OF_BYTE_PIECES = 512 # bytes (static size) - - -def read_config_from_file(file_path: Path) -> dict[str, int]: - """ - Reads the Config structure from a binary file and returns it as a dictionary. - """ - with open(file_path, "rb") as f: - # Read the data corresponding to the Config struct - data: bytes = f.read(struct.calcsize("7i")) - config_values = struct.unpack("7i", data) - - config: dict[str, int] = { - "dim": config_values[0], - "hidden_dim": config_values[1], - "n_layers": config_values[2], - "n_heads": config_values[3], - "n_kv_heads": config_values[4], - "vocab_size": abs( - config_values[5] - ), # account for possible negative vocab_size - "seq_len": config_values[6], - } - return config - - -def calculate_memory(config: dict[str, int]) -> dict[str, dict[str, float]]: - """Calculate required memory for all the LLM components""" - # Tokenizer - vocab_memory = config["vocab_size"] * SIZE_OF_POINTER - vocab_scores_memory = config["vocab_size"] * SIZE_OF_FLOAT - - # TransformerWeights - head_size = config["dim"] / config["n_heads"] - n_layers = config["n_layers"] - - token_embedding_table = config["vocab_size"] * config["dim"] * SIZE_OF_FLOAT - rms_att_weight = n_layers * config["dim"] * SIZE_OF_FLOAT - wq = n_layers * config["dim"] * (config["n_heads"] * head_size) * SIZE_OF_FLOAT - wk = n_layers * config["dim"] * (config["n_kv_heads"] * head_size) * SIZE_OF_FLOAT - wv = wk # Same as wk - wo = n_layers * (config["n_heads"] * head_size) * config["dim"] * SIZE_OF_FLOAT - rms_ffn_weight = n_layers * config["dim"] * SIZE_OF_FLOAT - w1 = n_layers * config["dim"] * config["hidden_dim"] * SIZE_OF_FLOAT - w2 = n_layers * config["hidden_dim"] * config["dim"] * SIZE_OF_FLOAT - w3 = w1 # Same as w1 - rms_final_weight = config["dim"] * SIZE_OF_FLOAT - wcls = config["vocab_size"] * config["dim"] * SIZE_OF_FLOAT - - # RunState - kv_dim = (config["dim"] * config["n_kv_heads"]) / config["n_heads"] - x = config["dim"] * SIZE_OF_FLOAT - xb = x # Same as x - xb2 = x # Same as x - hb = config["hidden_dim"] * SIZE_OF_FLOAT - hb2 = hb 
# Same as hb - q = x # Same as x - k = kv_dim * SIZE_OF_FLOAT - v = k # Same as k - att = config["n_heads"] * config["seq_len"] * SIZE_OF_FLOAT - logits = config["vocab_size"] * SIZE_OF_FLOAT - key_cache = n_layers * config["seq_len"] * kv_dim * SIZE_OF_FLOAT - value_cache = key_cache # Same as key_cache - - # Calculate total memory usage for Tokenizer, TransformerWeights and RunState - total_tokenizer = vocab_memory + vocab_scores_memory + SIZE_OF_BYTE_PIECES - - total_transformer_weights = sum( - [ - token_embedding_table, - rms_att_weight, - wq, - wk, - wv, - wo, - rms_ffn_weight, - w1, - w2, - w3, - rms_final_weight, - wcls, - ] - ) - total_run_state = sum( - [x, xb, xb2, hb, hb2, q, k, v, att, logits, key_cache, value_cache] - ) - - # Collate the results in a dictionary - data: dict[str, dict[str, float]] = { - "Tokenizer Memory (per model)": { - "vocab_memory": vocab_memory / (1024 * 1024), - "vocab_scores_memory": vocab_scores_memory / (1024 * 1024), - }, - "TransformerWeights Memory (per model)": { - "token_embedding_table": token_embedding_table / (1024 * 1024), - "rms_att_weight": rms_att_weight / (1024 * 1024), - "wq": wq / (1024 * 1024), - "wk": wk / (1024 * 1024), - "wv": wv / (1024 * 1024), - "wo": wo / (1024 * 1024), - "rms_ffn_weight": rms_ffn_weight / (1024 * 1024), - "w1": w1 / (1024 * 1024), - "w2": w2 / (1024 * 1024), - "w3": w3 / (1024 * 1024), - "rms_final_weight": rms_final_weight / (1024 * 1024), - "wcls": wcls / (1024 * 1024), - }, - "RunState Memory (per user)": { - "x": x / (1024 * 1024), - "xb": xb / (1024 * 1024), - "xb2": xb2 / (1024 * 1024), - "hb": hb / (1024 * 1024), - "hb2": hb2 / (1024 * 1024), - "q": q / (1024 * 1024), - "k": k / (1024 * 1024), - "v": v / (1024 * 1024), - "att": att / (1024 * 1024), - "logits": logits / (1024 * 1024), - "key_cache": key_cache / (1024 * 1024), - "value_cache": value_cache / (1024 * 1024), - }, - "Total Memory": { - "Total Tokenizer Memory (per model)": total_tokenizer / (1024 * 1024), - "Total TransformerWeights Memory (per model)": total_transformer_weights - / (1024 * 1024), - "Total RunState Memory (per user)": total_run_state / (1024 * 1024), - "Overall Total Memory": (total_transformer_weights + total_run_state) - / (1024 * 1024), - }, - } - return data - - -def write_data(file: TextIO, title: str, data: dict[str, dict[str, float]]) -> None: - """Writes it all to a Markdown file""" - # Get the models for headers - headers = ["Memory Type"] + [f"{model}
(MB)" for model in data.keys()] - - # Write the table name - file.write(f"### {title}\n\n") - - # Write the headers - file.write(" | ".join(headers) + "\n") - file.write(" | ".join(["---"] * len(headers)) + "\n") - - # Assuming that all models have the same memory types, - # using the first model to get the list of memory types - memory_types = list(data[next(iter(data))].keys()) - - totals = {model: 0.0 for model in data.keys()} - - for mtype in memory_types: - row_data = [mtype] + [ - f"{model_data[mtype]:.2f}" for model_data in data.values() - ] - file.write(" | ".join(row_data) + "\n") - - # Accumulate totals for the first three tables - if title in [ - "Tokenizer Memory (per model)", - "TransformerWeights Memory (per model)", - "RunState Memory (per user)", - ]: - for model, value in zip( - data.keys(), - [model_data[mtype] for model_data in data.values()], - ): - totals[model] += value - - if title in [ - "Tokenizer Memory (per model)", - "TransformerWeights Memory (per model)", - "RunState Memory (per user)", - ]: - # Add the totals to the table - total_row = ["Total"] + [f"{totals[model]:.2f}" for model in data.keys()] - file.write(" | ".join(total_row) + "\n") - else: - # Calculate max users for each model - # Calculate number of users for each model and add it to the data - number_of_users = {} - for model, values in data.items(): - total_available_memory = 4 * 1024 # Available canister memory in MB - total_tokenizer_memory = values["Total Tokenizer Memory (per model)"] - total_transformer_weights_memory = values[ - "Total TransformerWeights Memory (per model)" - ] - total_runstate_memory = values["Total RunState Memory (per user)"] - - number_of_users[model] = int( - ( - total_available_memory - - total_tokenizer_memory - - total_transformer_weights_memory - ) - / total_runstate_memory - ) - - # Write the markdown table for number of users - file.write("\n\n") - # Get the models for headers - headers = ["Canister Metrics"] + [f"{model}
(MB)" for model in data.keys()] - - # Write the table name - file.write("### Canister Metrics\n\n") - - # Write the headers - file.write(" | ".join(headers) + "\n") - file.write(" | ".join(["---"] * len(headers)) + "\n") - - row_data = ["Max number of concurrent users"] + [ - f"{number_of_users[model]}" for model in data.keys() - ] - file.write(" | ".join(row_data) + "\n") - - file.write("\n\n") - - -def main() -> int: - """Reads the model.bin files and summarizes the resource requirements.""" - file_paths: dict[str, Path] = { - "260K": ROOT_PATH / "stories260K/stories260K.bin", - "15M": ROOT_PATH / "models/stories15Mtok4096.bin", - "42M": ROOT_PATH / "models/stories42M.bin", - "110M": ROOT_PATH / "models/stories110M.bin", - } - - data = {} - for key, file_path in file_paths.items(): - config: dict[str, int] = read_config_from_file(file_path) - data[key] = calculate_memory(config) - - output_path = ROOT_PATH / "README_icpp_llama2_resource_requirements.md" - with open(output_path, "w", encoding="utf-8") as file: - file.write("# Canister resource requirements for llama2_c.") - file.write("\n") - file.write("\nDo not edit this file. It is created with the command: ") - file.write("\n```bash") - file.write("\npython -m scripts.icpp_llama2_sizer") - file.write("\n```\n\n") - for key in [ - "Tokenizer Memory (per model)", - "TransformerWeights Memory (per model)", - "RunState Memory (per user)", - "Total Memory", - ]: - subset_data = {k: v[key] for k, v in data.items()} - write_data(file, key, subset_data) - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/scripts/TODO/nft_config.py b/scripts/TODO/nft_config.py deleted file mode 100644 index 1f14c2d..0000000 --- a/scripts/TODO/nft_config.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Reads a toml file and returns a dictionary with the data.""" - -import sys -from pathlib import Path -from typing import Any, Dict -import typer - -# `tomllib` was introduced in python 3.11 -# for earlier versions, use `tomli` instead -if sys.version_info >= (3, 11): - import tomllib -else: - try: - import tomli as tomllib - except ImportError: - typer.echo("ERROR: cannot find python module tomli") - sys.exit(1) - - -def read_toml(toml_path: Path) -> Dict[Any, Any]: - """Reads the inference toml file""" - with open(toml_path, "rb") as f: - data = tomllib.load(f) - - return data diff --git a/scripts/TODO/nft_init.py b/scripts/TODO/nft_init.py deleted file mode 100644 index 4c02b11..0000000 --- a/scripts/TODO/nft_init.py +++ /dev/null @@ -1,108 +0,0 @@ -"""Initializes the NFT Collection. 
- -Run with: - - python -m scripts.nft_init -""" - -# pylint: disable=invalid-name, too-few-public-methods, no-member, too-many-statements - -import sys -from pathlib import Path -from pprint import pprint -from .ic_py_canister import get_canister -from .parse_args_nft_init import parse_args - -ROOT_PATH = Path(__file__).parent.parent - -# 0 - none -# 1 - minimal -# 2 - a lot -DEBUG_VERBOSE = 1 - - -def main() -> int: - """Initializes the NFT Collection.""" - - args = parse_args() - - network = args.network - canister_name = args.canister - canister_id = args.canister_id - candid_path = ROOT_PATH / args.candid - - nft_supply_cap = args.nft_supply_cap - nft_symbol = args.nft_symbol - nft_name = args.nft_name - nft_description = args.nft_description - - dfx_json_path = ROOT_PATH / "dfx.json" - - print( - f"Summary of canister & NFT Collection:" - f"\n - network = {network}" - f"\n - canister = {canister_name}" - f"\n - canister_id = {canister_id}" - f"\n - dfx_json_path = {dfx_json_path}" - f"\n - candid_path = {candid_path}" - f"\n - nft_supply_cap = {nft_supply_cap}" - f"\n - nft_symbol = {nft_symbol}" - f"\n - nft_name = {nft_name}" - f"\n - nft_description = {nft_description}" - ) - - # --------------------------------------------------------------------------- - # get ic-py based Canister instance - canister_llama2 = get_canister(canister_name, candid_path, network, canister_id) - - # check health (liveness) - print("--\nChecking liveness of canister (did we deploy it!)") - response = canister_llama2.health() - if "Ok" in response[0].keys(): - print("Ok!") - else: - print("Not OK, response is:") - print(response) - - # --------------------------------------------------------------------------- - # Initialize the NFT Collection - print("--\nInitializing the NFT Collection, getting it ready for minting.") - record_in = { - "nft_supply_cap": nft_supply_cap, - "nft_total_supply": 0, - "nft_symbol": nft_symbol, - "nft_name": nft_name, - "nft_description": nft_description, - } - response = canister_llama2.nft_init(record_in) - if "Ok" in response[0].keys(): - if DEBUG_VERBOSE >= 2: - print("OK!") - else: - print("Something went wrong:") - print(response) - sys.exit(1) - - # --------------------------------------------------------------------------- - # Summarize the NFT Collection - print("--\nSummary of the NFT Collection.") - response = canister_llama2.nft_metadata() - pprint(response[0]) - - # --------------------------------------------------------------------------- - print( - f"--\nCongratulations, NFT Collection {nft_symbol} in canister {canister_name} " - f"is set up." - f"\nMake sure to deploy the LLM prior to minting." - ) - try: - print("💯 🎉 🏁") - except UnicodeEncodeError: - print(" ") - - # --------------------------------------------------------------------------- - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/scripts/TODO/nft_metadata.py b/scripts/TODO/nft_metadata.py deleted file mode 100644 index 5b2c7ab..0000000 --- a/scripts/TODO/nft_metadata.py +++ /dev/null @@ -1,69 +0,0 @@ -"""Summarizes the NFT Collection. 
- -Run with: - - python -m scripts.nft_metadata -""" - -# pylint: disable=invalid-name, too-few-public-methods, no-member, too-many-statements - -import sys -from pathlib import Path -from pprint import pprint -from .ic_py_canister import get_canister -from .parse_args_nft_init import parse_args - -ROOT_PATH = Path(__file__).parent.parent - -# 0 - none -# 1 - minimal -# 2 - a lot -DEBUG_VERBOSE = 1 - - -def main() -> int: - """Summarize the NFT Collection.""" - - args = parse_args() - - network = args.network - canister_name = args.canister - canister_id = args.canister_id - candid_path = ROOT_PATH / args.candid - - dfx_json_path = ROOT_PATH / "dfx.json" - - print( - f"Summary of canister & NFT Collection:" - f"\n - network = {network}" - f"\n - canister = {canister_name}" - f"\n - canister_id = {canister_id}" - f"\n - dfx_json_path = {dfx_json_path}" - f"\n - candid_path = {candid_path}" - ) - - # --------------------------------------------------------------------------- - # get ic-py based Canister instance - canister_llama2 = get_canister(canister_name, candid_path, network, canister_id) - - # check health (liveness) - print("--\nChecking liveness of canister (did we deploy it!)") - response = canister_llama2.health() - if "Ok" in response[0].keys(): - print("Ok!") - else: - print("Not OK, response is:") - print(response) - - # --------------------------------------------------------------------------- - # Summarize the NFT Collection - print("--\nSummary of the NFT Collection.") - response = canister_llama2.nft_metadata() - pprint(response[0]) - - # --------------------------------------------------------------------------- - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/scripts/TODO/nft_mint.py b/scripts/TODO/nft_mint.py deleted file mode 100644 index 39ac18e..0000000 --- a/scripts/TODO/nft_mint.py +++ /dev/null @@ -1,122 +0,0 @@ -"""Mint an NFT owned by a bitcoin ordinal, from a toml file - -Run with: - - python -m scripts.nft_mint --network ic --canister --nft-config --token-ids # pylint: disable=line-too-long -""" - -# pylint: disable=invalid-name, too-few-public-methods, no-member, too-many-statements, broad-except - -import sys -from pathlib import Path -from pprint import pprint -from typing import Dict, Any -from .ic_py_canister import get_canister -from .parse_args_nft_mint import parse_args -from .nft_config import read_toml - -ROOT_PATH = Path(__file__).parent.parent - -# 0 - none -# 1 - minimal -# 2 - a lot -DEBUG_VERBOSE = 1 - - -def main() -> int: - """Summarize the NFT Collection.""" - - args = parse_args() - - network = args.network - canister_name = args.canister - canister_id = args.canister_id - candid_path = ROOT_PATH / args.candid - - nft_config_path = Path(args.nft_config) - token_ids_path = Path(args.token_ids) - - dfx_json_path = ROOT_PATH / "dfx.json" - - print( - f"Summary of canister & NFT Collection:" - f"\n - network = {network}" - f"\n - canister = {canister_name}" - f"\n - canister_id = {canister_id}" - f"\n - dfx_json_path = {dfx_json_path}" - f"\n - candid_path = {candid_path}" - f"\n - nft_config_path = {nft_config_path}" - f"\n - token_ids_path = {token_ids_path}" - ) - - nft_config: Dict[Any, Any] = read_toml(nft_config_path) - token_ids: Dict[Any, Any] = read_toml(token_ids_path) - - # --------------------------------------------------------------------------- - # get ic-py based Canister instance - canister_llama2 = get_canister(canister_name, candid_path, network, canister_id) - - # check health (liveness) - 
print("--\nChecking liveness of canister (did we deploy it!)") - response = canister_llama2.health() - if "Ok" in response[0].keys(): - print("Ok!") - else: - print("Not OK, response is:") - print(response) - - # --------------------------------------------------------------------------- - # check readiness for inference - print("--\nChecking if the canister is ready for inference.") - response = canister_llama2.ready() - if "Ok" in response[0].keys(): - if DEBUG_VERBOSE >= 2: - print("OK!") - else: - print("Something went wrong:") - print(response) - sys.exit(1) - - # --------------------------------------------------------------------------- - # Mint the NFT - # - nft_id = nft_config["nft_id"] - - NFT = {} - NFT["token_id"] = token_ids["token_ids"][str(nft_id)] - - print(f'--\nMinting NFT for token_id = {NFT["token_id"]}') - error: bool = False - try: - response = canister_llama2.nft_mint(NFT) - print(response) - if "Ok" in response[0].keys(): - if DEBUG_VERBOSE >= 2: - print("OK!") - else: - error = True - except Exception as e: - print(f"An error occurred: {e}") - if "already exists" in str(e): - print("Accepting this error, will continue...") - else: - error = True - - if error: - print("Something went wrong") - user_input = input("Do you still want to continue? [yes/no] ") - if user_input.lower() not in ("yes", "y"): - sys.exit(1) - - # --------------------------------------------------------------------------- - # Summarize the NFT Collection - print("--\nSummary of the NFT Collection.") - response = canister_llama2.nft_metadata() - pprint(response[0]) - - # --------------------------------------------------------------------------- - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/scripts/TODO/nft_update_story.py b/scripts/TODO/nft_update_story.py deleted file mode 100644 index c40a066..0000000 --- a/scripts/TODO/nft_update_story.py +++ /dev/null @@ -1,184 +0,0 @@ -"""Update an NFT owned by a bitcoin ordinal, from a toml file - -Run with: - - python -m scripts.nft_update_story --network ic --canister --nft-config --token-ids # pylint: disable=line-too-long -""" - -# pylint: disable=invalid-name, too-few-public-methods, no-member, too-many-statements, broad-except - -import sys -from pathlib import Path -from pprint import pprint -from typing import Dict, Any -from .ic_py_canister import get_canister -from .parse_args_nft_mint import parse_args -from .nft_config import read_toml - -ROOT_PATH = Path(__file__).parent.parent - -# 0 - none -# 1 - minimal -# 2 - a lot -DEBUG_VERBOSE = 1 - - -def main() -> int: - """Summarize the NFT Collection.""" - - args = parse_args() - - network = args.network - canister_name = args.canister - canister_id = args.canister_id - candid_path = ROOT_PATH / args.candid - - nft_config_path = Path(args.nft_config) - token_ids_path = Path(args.token_ids) - - dfx_json_path = ROOT_PATH / "dfx.json" - - print( - f"Summary of canister & NFT Collection:" - f"\n - network = {network}" - f"\n - canister = {canister_name}" - f"\n - canister_id = {canister_id}" - f"\n - dfx_json_path = {dfx_json_path}" - f"\n - candid_path = {candid_path}" - f"\n - nft_config_path = {nft_config_path}" - f"\n - token_ids_path = {token_ids_path}" - ) - - nft_config: Dict[Any, Any] = read_toml(nft_config_path) - token_ids: Dict[Any, Any] = read_toml(token_ids_path) - - # --------------------------------------------------------------------------- - # get ic-py based Canister instance - canister_llama2 = get_canister(canister_name, candid_path, network, 
canister_id) - - # check health (liveness) - print("--\nChecking liveness of canister (did we deploy it!)") - response = canister_llama2.health() - if "Ok" in response[0].keys(): - print("Ok!") - else: - print("Not OK, response is:") - print(response) - - # --------------------------------------------------------------------------- - # check readiness for inference - print("--\nChecking if the canister is ready for inference.") - response = canister_llama2.ready() - if "Ok" in response[0].keys(): - if DEBUG_VERBOSE >= 2: - print("OK!") - else: - print("Something went wrong:") - print(response) - sys.exit(1) - - # --------------------------------------------------------------------------- - # Start the story - nft_id = nft_config["nft_id"] - - NFT = {} - NFT["token_id"] = token_ids["token_ids"][str(nft_id)] - - prompt = nft_config["prompt"] - print( - "--\nGenerating a story with:" - f"\n - token_id = {NFT['token_id']}" - f"\n - prompt = {prompt['prompt']}" - f"\n - temperature = {prompt['temperature']}" - f"\n - topp = {prompt['topp']}" - f"\n - steps = {prompt['steps']}" - f"\n - rng_seed = {prompt['rng_seed']}" - ) - - use_full_prompt = True - - if use_full_prompt: - response = canister_llama2.nft_story_start(NFT, prompt) - print(response) - if "Ok" in response[0].keys(): - if DEBUG_VERBOSE >= 2: - print("OK!") - else: - print("Something went wrong:") - sys.exit(1) - else: - words = prompt["prompt"].split() # Splitting the initial prompt into words - - # First, use nft_story_start for the first word - first_word = words[0] - print(f"--\nStarting a new story with the first word: {first_word}") - prompt["prompt"] = first_word # Update the prompt with the first word - response = canister_llama2.nft_story_start(NFT, prompt) - print(response) - if "Ok" in response[0].keys(): - if DEBUG_VERBOSE >= 2: - print("OK!") - else: - print("Something went wrong:") - sys.exit(1) - - # --------------------------------------------------------------------------- - # Then, use nft_story_continue for the rest of the words - for word in words[1:]: # Starting from the second word - print(f"--\nContinuing the story with the word: {word}") - prompt["prompt"] = word # Update the prompt with the current word - response = canister_llama2.nft_story_continue(NFT, prompt) - print(response) - if "Ok" in response[0].keys(): - if DEBUG_VERBOSE >= 2: - print("OK!") - else: - print("Something went wrong:") - sys.exit(1) - - response = canister_llama2.nft_story_start(NFT, prompt) - print(response) - if "Ok" in response[0].keys(): - if DEBUG_VERBOSE >= 2: - print("OK!") - else: - print("Something went wrong:") - sys.exit(1) - - # --------------------------------------------------------------------------- - # Continue the story with empty prompt until we reached the end. - - print("--\nContinuing the story until done...") - - prompt["prompt"] = "" - while True: - response = canister_llama2.nft_story_continue(NFT, prompt) - print(response) - if "Ok" in response[0].keys(): - # Check if the number of generated tokens is less than the requested tokens - if response[0]["Ok"]["num_tokens"] < prompt["steps"]: - print(f'The end! - num_tokens = {response[0]["Ok"]["num_tokens"]}') - break - # Check if the response is an empty string. If it is, break out of the loop. - if response[0]["Ok"]["inference"] == "": - print( - "The end! - we got an empty string. THIS IS AN ERROR ACTUALLY. WE SHOULD NOT GET HERE.." 
- ) - print("Something went wrong:") - sys.exit(1) - else: - print("Something went wrong:") - sys.exit(1) - - # --------------------------------------------------------------------------- - # Summarize the NFT Collection - print("--\nSummary of the NFT Collection.") - response = canister_llama2.nft_metadata() - pprint(response[0]) - - # --------------------------------------------------------------------------- - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/scripts/TODO/parse_args_nft_init.py b/scripts/TODO/parse_args_nft_init.py deleted file mode 100644 index 0d0a041..0000000 --- a/scripts/TODO/parse_args_nft_init.py +++ /dev/null @@ -1,60 +0,0 @@ -"""Import command line arguments for the scripts.""" - -import argparse - - -def parse_args() -> argparse.Namespace: - """Returns the command line arguments""" - parser = argparse.ArgumentParser(description="Initialize the NFT collection") - parser.add_argument( - "--network", - type=str, - default="local", - help="Network: ic or local", - ) - parser.add_argument( - "--canister", - type=str, - default="no-default", - help="canister name in dfx.json", - ) - parser.add_argument( - "--canister-id", - type=str, - default="", - help="canister-id name canister_ids.json", - ) - parser.add_argument( - "--candid", - type=str, - default="src/llama2.did", - help="canister's candid file", - ) - - parser.add_argument( - "--nft-supply-cap", - type=int, - default=25, - help="The max number of NFTs that will ever be minted.", - ) - parser.add_argument( - "--nft-symbol", - type=str, - default="no-default", - help="Symbol of the NFT Collection", - ) - parser.add_argument( - "--nft-name", - type=str, - default="no-default", - help="Name of the NFT Collection", - ) - parser.add_argument( - "--nft-description", - type=str, - default=("no-default"), - help="Description of the NFT Collection", - ) - - args = parser.parse_args() - return args diff --git a/scripts/TODO/parse_args_nft_metadata.py b/scripts/TODO/parse_args_nft_metadata.py deleted file mode 100644 index 4683b38..0000000 --- a/scripts/TODO/parse_args_nft_metadata.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Import command line arguments for the scripts.""" - -import argparse - - -def parse_args() -> argparse.Namespace: - """Returns the command line arguments""" - parser = argparse.ArgumentParser(description="Summarize the NFT collection") - parser.add_argument( - "--network", - type=str, - default="local", - help="Network: ic or local", - ) - parser.add_argument( - "--canister", - type=str, - default="no-default", - help="canister name in dfx.json", - ) - parser.add_argument( - "--canister-id", - type=str, - default="", - help="canister-id name canister_ids.json", - ) - parser.add_argument( - "--candid", - type=str, - default="src/llama2.did", - help="canister's candid file", - ) - - args = parser.parse_args() - return args diff --git a/scripts/TODO/parse_args_nft_mint.py b/scripts/TODO/parse_args_nft_mint.py deleted file mode 100644 index 237abed..0000000 --- a/scripts/TODO/parse_args_nft_mint.py +++ /dev/null @@ -1,51 +0,0 @@ -"""Import command line arguments for the scripts.""" - -import argparse - - -def parse_args() -> argparse.Namespace: - """Returns the command line arguments""" - parser = argparse.ArgumentParser( - description="Generate a story and optionally mint it." 
- ) - parser.add_argument( - "--network", - type=str, - default="local", - help="Network: ic or local", - ) - parser.add_argument( - "--canister", - type=str, - default="no-default", - help="canister name in dfx.json", - ) - parser.add_argument( - "--canister-id", - type=str, - default="", - help="canister-id name canister_ids.json", - ) - parser.add_argument( - "--candid", - type=str, - default="src/llama2.did", - help="canister's candid file", - ) - - parser.add_argument( - "--nft-config", - type=str, - default="no-default", - help="A toml file with prompt and inference params for your NFTs' story", - ) - - parser.add_argument( - "--token-ids", - type=str, - default="no-default", - help="A toml file with all the bitcoin ordinal ids for the collection", - ) - - args = parser.parse_args() - return args diff --git a/scripts/TODO/parse_args_upload.py b/scripts/TODO/parse_args_upload.py deleted file mode 100644 index 543a17d..0000000 --- a/scripts/TODO/parse_args_upload.py +++ /dev/null @@ -1,76 +0,0 @@ -"""Import command line arguments for the scripts.""" - -import argparse - - -def parse_args() -> argparse.Namespace: - """Returns the command line arguments""" - parser = argparse.ArgumentParser(description="Load a model and set parameters") - parser.add_argument( - "--network", - type=str, - default="local", - help="Network: ic or local", - ) - parser.add_argument( - "--canister", - type=str, - default="no-default", - help="canister name in dfx.json", - ) - parser.add_argument( - "--canister-id", - type=str, - default="", - help="canister-id name canister_ids.json", - ) - parser.add_argument( - "--candid", - type=str, - default="src/llama2.did", - help="canister's candid file", - ) - parser.add_argument( - "--model", - type=str, - default="models/stories15Mtok4096.bin", - help="Model file (e.g. models/stories15Mtok4096.bin)", - ) - parser.add_argument( - "--tokenizer", - type=str, - default="tokenizers/tok4096.bin", - help="Tokenizer file (e.g. tokenizers/tok4096.bin)", - ) - parser.add_argument( - "--chunksize", - type=float, - default=1.9, - help="Chunk Size for upload, in Mb", - ) - parser.add_argument( - "--temperature", - type=float, - default=0.9, - help="Temperature (e.g. 1.0, or 0.0)", - ) - parser.add_argument( - "--topp", - type=float, - default=1.0, - help="p value in top-p (nucleus) sampling. default 1.0 (=off)", - ) - parser.add_argument( - "--steps", - type=int, - default=256, - help="Max number of steps to run for, 0: use seq_len", - ) - parser.add_argument( - "--rng-seed", - type=int, - default=0, - help="seed, 0: use random seed based on time", - ) - args = parser.parse_args() - return args diff --git a/scripts/TODO/upload.py b/scripts/TODO/upload.py deleted file mode 100644 index 8f1132f..0000000 --- a/scripts/TODO/upload.py +++ /dev/null @@ -1,226 +0,0 @@ -"""Uploads model & tokenizer. 
-
-Run with:
-
-    python -m scripts.upload
-"""
-
-# pylint: disable=invalid-name, too-few-public-methods, no-member, too-many-statements
-
-import sys
-from pathlib import Path
-from typing import Generator
-from .ic_py_canister import get_canister
-from .parse_args_upload import parse_args
-
-ROOT_PATH = Path(__file__).parent.parent
-
-# 0 - none
-# 1 - minimal
-# 2 - a lot
-DEBUG_VERBOSE = 1
-
-
-# ------------------------------------------------------------------------------
-def read_file_bytes(file_path: Path) -> bytes:
-    """Returns the stories15Mtok4096.bin file as a bytes array"""
-    file_bytes = b""
-    try:
-        with open(file_path, "rb") as file:
-            file_bytes = file.read()
-
-    except FileNotFoundError:
-        print(f"ERROR: Unable to open the file {file_path}!")
-        sys.exit(1)
-
-    return file_bytes
-
-
-def generate_chunks(data: bytes, chunk_size: int) -> Generator[bytes, None, None]:
-    """Generator function to iterate over chunks"""
-    for i in range(0, len(data), chunk_size):
-        yield data[i : i + chunk_size]
-
-
-def main() -> int:
-    """Uploads the tokenizer & model, and initializes NFT Collection."""
-
-    args = parse_args()
-
-    network = args.network
-    canister_name = args.canister
-    canister_id = args.canister_id
-    candid_path = ROOT_PATH / args.candid
-    model_path = ROOT_PATH / args.model
-    tokenizer_path = ROOT_PATH / args.tokenizer
-    chunk_size_mb = args.chunksize
-
-    dfx_json_path = ROOT_PATH / "dfx.json"
-
-    print(
-        f"Summary of model & NFT Collection:"
-        f"\n - network = {network}"
-        f"\n - canister = {canister_name}"
-        f"\n - canister_id = {canister_id}"
-        f"\n - dfx_json_path = {dfx_json_path}"
-        f"\n - candid_path = {candid_path}"
-        f"\n - model_path = {model_path}"
-        f"\n - tokenizer_path = {tokenizer_path}"
-    )
-
-    # ---------------------------------------------------------------------------
-    # get ic-py based Canister instance
-    canister_llama2 = get_canister(canister_name, candid_path, network, canister_id)
-
-    # check health (liveness)
-    print("--\nChecking liveness of canister (did we deploy it!)")
-    response = canister_llama2.health()
-    if "Ok" in response[0].keys():
-        print("Ok!")
-    else:
-        print("Not OK, response is:")
-        print(response)
-
-    # ---------------------------------------------------------------------------
-    # THE TOKENIZER FILE
-
-    # Read the tokenizer from disk
-    print(f"--\nReading the tokenizer file into a bytes object: {tokenizer_path}")
-    tokenizer_bytes = read_file_bytes(tokenizer_path)
-
-    # Reset the tokenizer
-    print("--\nResetting the tokenizer in canister")
-    response = canister_llama2.reset_tokenizer()  # pylint: disable=no-member
-    if "Ok" in response[0].keys():
-        if DEBUG_VERBOSE >= 2:
-            print("OK!")
-    else:
-        print("Something went wrong:")
-        print(response)
-        sys.exit(1)
-
-    # Upload tokenizer_bytes to the canister
-    print("--\nUploading the tokenizer bytes")
-
-    # converting MB to bytes
-    chunk_size = int(chunk_size_mb * 1024 * 1024)
-
-    # Iterate over all chunks
-    count_bytes = 0
-    for i, chunk in enumerate(generate_chunks(tokenizer_bytes, chunk_size)):
-        count_bytes += len(chunk)
-        if DEBUG_VERBOSE == 0:
-            pass
-        elif DEBUG_VERBOSE == 1:
-            print(
-                f"chunk size = {len(chunk)} bytes "
-                f"({count_bytes / len(tokenizer_bytes) * 100:.1f}%)"
-            )
-        else:
-            print("+++++++++++++++++++++++++++++++++++++++++++++++++++++")
-            print(f"Sending candid for {len(chunk)} bytes :")
-            print(f"- i = {i}")
-            print(f"- progress = {count_bytes / len(tokenizer_bytes) * 100:.1f} % ")
-            print(f"- chunk[0] = {chunk[0]}")
-            print(f"- chunk[-1] = {chunk[-1]}")
-
-        response = canister_llama2.upload_tokenizer_bytes_chunk(
-            chunk
-        )  # pylint: disable=no-member
-        if "Ok" in response[0].keys():
-            print("OK!")
-        else:
-            print("Something went wrong:")
-            print(response)
-            sys.exit(1)
-
-    # ---------------------------------------------------------------------------
-    # THE MODEL FILE
-
-    # Read the model from disk
-    print(f"--\nReading the model file into a bytes object: {model_path}")
-    model_bytes = read_file_bytes(model_path)
-
-    # Reset the model
-    print("--\nResetting the model in canister")
-    response = canister_llama2.reset_model()  # pylint: disable=no-member
-    if "Ok" in response[0].keys():
-        if DEBUG_VERBOSE >= 2:
-            print("OK!")
-    else:
-        print("Something went wrong:")
-        print(response)
-        sys.exit(1)
-
-    # Upload model_bytes to the canister
-    print(f"--\nUploading the model bytes, in {chunk_size_mb}Mb chunks")
-
-    # converting MB to bytes
-    chunk_size = int(chunk_size_mb * 1024 * 1024)
-
-    # Iterate over all chunks
-    count_bytes = 0
-    for i, chunk in enumerate(generate_chunks(model_bytes, chunk_size)):
-        count_bytes += len(chunk)
-        if DEBUG_VERBOSE == 0:
-            pass
-        elif DEBUG_VERBOSE == 1:
-            print(
-                f"chunk size = {len(chunk)} bytes "
-                f"({count_bytes / len(model_bytes) * 100:.1f}%)"
-            )
-        else:
-            print("+++++++++++++++++++++++++++++++++++++++++++++++++++++")
-            print(f"Sending candid for {len(chunk)} bytes :")
-            print(f"- i = {i}")
-            print(f"- progress = {count_bytes / len(model_bytes) * 100:.1f}% ")
-            print(f"- chunk[0] = {chunk[0]}")
-            print(f"- chunk[-1] = {chunk[-1]}")
-
-        response = canister_llama2.upload_model_bytes_chunk(
-            chunk
-        )  # pylint: disable=no-member
-        if "Ok" in response[0].keys():
-            if DEBUG_VERBOSE >= 2:
-                print("OK!")
-        else:
-            print("Something went wrong:")
-            print(response)
-            sys.exit(1)
-
-    # ---------------------------------------------------------------------------
-    # Initialize the canister
-    print("--\nInitializing the canister, getting it ready for inference.")
-    response = canister_llama2.initialize()
-    if "Ok" in response[0].keys():
-        if DEBUG_VERBOSE >= 2:
-            print("OK!")
-    else:
-        print("Something went wrong:")
-        print(response)
-        sys.exit(1)
-
-    # ---------------------------------------------------------------------------
-    # check readiness for inference
-    print("--\nChecking if the canister is ready for inference.")
-    response = canister_llama2.ready()
-    if "Ok" in response[0].keys():
-        if DEBUG_VERBOSE >= 2:
-            print("OK!")
-    else:
-        print("Something went wrong:")
-        print(response)
-        sys.exit(1)
-    # ---------------------------------------------------------------------------
-    print(f"--\nCongratulations, canister {canister_name} is ready for inference!")
-    try:
-        print("💯 🎉 🏁")
-    except UnicodeEncodeError:
-        print(" ")
-
-    # ---------------------------------------------------------------------------
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/scripts/upload.py b/scripts/upload.py
index 7a51598..3c5c16a 100644
--- a/scripts/upload.py
+++ b/scripts/upload.py
@@ -106,7 +106,7 @@ def main() -> int:
         elif DEBUG_VERBOSE == 1:
             print(
                 f"chunk size = {len(chunk)} "
-                f"len(file_bytes) = {len(file_bytes)} "
+                f"len(file_bytes) = {len(file_bytes)} "
                 f"offset = {offset} bytes "
                 f"({((offset+len(chunk)) / len(file_bytes) * 100):.1f}%)"
             )
diff --git a/src/run.cpp b/src/run.cpp
index 14a4204..ff3f04f 100644
--- a/src/run.cpp
+++ b/src/run.cpp
@@ -33,9 +33,9 @@ void run(IC_API &ic_api) {
   int argc = args.size();
 
   // Construct argv
-  std::vector<char *> argv(argc);
+  std::vector<char *> argv(argc);
   for (int i = 0; i < argc; ++i) {
-    argv[i] = &args[i][0]; // Convert std::string to char*
+    argv[i] = &args[i][0]; // Convert std::string to char*
   }
 
   // Call main_, just like it is called in the console app
@@ -43,8 +43,9 @@ void run(IC_API &ic_api) {
 
   // Return output over the wire
   CandidTypeRecord r_out;
-  r_out.append("StatusCode", CandidTypeNat16{200}); // TODO: set the status code
-  r_out.append("output", CandidTypeText{"TODO: add output here.... "});
+  r_out.append("status", CandidTypeNat16{200}); // TODO: set the status code
+  r_out.append("output",
+               CandidTypeText{"TODO: we need to add some output here.... "});
 
   ic_api.to_wire(CandidTypeVariant{"Ok", r_out});
 }
diff --git a/test.ps1 b/test.ps1
deleted file mode 100644
index a51aee9..0000000
--- a/test.ps1
+++ /dev/null
@@ -1,22 +0,0 @@
-#######################################################################
-# This is a Windows PowerShell script,
-#
-# (-) equivalent to the Makefile target `test-all-llms:
-# (-) In a Windows PowerShell (*):
-#
-#     .\test_all_llms.ps1
-#
-# (*) The Miniconda Powershell is highly recommended
-#
-#######################################################################
-
-wsl --% dfx identity use default
-
-Write-Host " "
-Write-Host "--------------------------------------------------"
-Write-Host "Testing icpp_llama2"
-Set-Location -Path .\icpp_llama2
-.\demo.ps1
-
-# Change directory back to the root
-Set-Location -Path ..\..
\ No newline at end of file
diff --git a/test/test_apis.py b/test/test_apis.py
index 9a5229f..5527bc1 100644
--- a/test/test_apis.py
+++ b/test/test_apis.py
@@ -16,114 +16,38 @@
 DFX_JSON_PATH = Path(__file__).parent / "../dfx.json"
 
 # Canister in the dfx.json file we want to test
-CANISTER_NAME = "greet"
+CANISTER_NAME = "llama_cpp"
 
-
-def test__greet_0(network: str) -> None:
+def test__health(network: str) -> None:
     response = call_canister_api(
         dfx_json_path=DFX_JSON_PATH,
         canister_name=CANISTER_NAME,
-        canister_method="greet_0",
+        canister_method="health",
         canister_argument="()",
         network=network,
     )
-    expected_response = '("hello!")'
+    expected_response = '(variant { Ok = record { status_code = 200 : nat16;} })'
     assert response == expected_response
 
-
-# Run this test with anonymous identity
-def test__greet_0_auth_err(identity_anonymous: Dict[str, str], network: str) -> None:
-    # double check the identity_anonymous fixture worked
-    assert identity_anonymous["identity"] == "anonymous"
-    assert identity_anonymous["principal"] == "2vxsx-fae"
-
+def test__run_query(network: str) -> None:
     response = call_canister_api(
         dfx_json_path=DFX_JSON_PATH,
         canister_name=CANISTER_NAME,
-        canister_method="greet_0_auth",
-        canister_argument="()",
+        canister_method="run_query",
+        canister_argument='(record { args = vec {"--model"; "models/stories260Ktok512.gguf"; "--prompt"; "Patrick loves ice-cream. On a hot day "; "--n-predict"; "20"; "--ctx-size"; "128"} })',
         network=network,
     )
-    expected_response = "(variant { err = 401 : nat16 })"
+    expected_response = '(variant { Ok = record { status = 200 : nat16; output = "TODO: we need to add some output here.... ";} })'
     assert response == expected_response
 
-
-# Run this test with a logged in default identity
-def test__greet_0_auth_ok(identity_default: Dict[str, str], network: str) -> None:
-    # double check the identity_anonymous fixture worked
-    assert identity_default["identity"] == "default"
-
+def test__run_update(network: str) -> None:
     response = call_canister_api(
         dfx_json_path=DFX_JSON_PATH,
         canister_name=CANISTER_NAME,
-        canister_method="greet_0_auth",
-        canister_argument="()",
+        canister_method="run_update",
+        canister_argument='(record { args = vec {"--model"; "models/stories260Ktok512.gguf"; "--prompt"; "Patrick loves ice-cream. On a hot day "; "--n-predict"; "20"; "--ctx-size"; "128"} })',
         network=network,
    )
-    principal = identity_default["principal"]
-    expected_response = f'(variant {{ ok = "Hello {principal}" }})'
-    assert response == expected_response
-
-
-def test__greet_1(network: str) -> None:
-    response = call_canister_api(
-        dfx_json_path=DFX_JSON_PATH,
-        canister_name=CANISTER_NAME,
-        canister_method="greet_1",
-        canister_argument="()",
-        network=network,
-    )
-    expected_response = "(2_023 : int)"
-    assert response == expected_response
-
-
-def test__greet_2(network: str, principal: str) -> None:
-    response = call_canister_api(
-        dfx_json_path=DFX_JSON_PATH,
-        canister_name=CANISTER_NAME,
-        canister_method="greet_2",
-        canister_argument='("C++ Developer")',
-        network=network,
-    )
-    expected_response = f'("hello C++ Developer!\\nYour principal is: {principal}")'
-    assert response == expected_response
-
-
-def test__greet_3(network: str) -> None:
-    response = call_canister_api(
-        dfx_json_path=DFX_JSON_PATH,
-        canister_name=CANISTER_NAME,
-        canister_method="greet_3",
-        canister_argument='(record { "icpp version" = 1 : int; OS = "Linux" : text })',
-        network=network,
-    )
-    expected_response = '(record { "icpp Release Details" = "Version = 1 & Operating System = Linux"; "release year" = 2_023 : int;})'
-    assert response == expected_response
-
-
-def test__greet_4(network: str) -> None:
-    response = call_canister_api(
-        dfx_json_path=DFX_JSON_PATH,
-        canister_name=CANISTER_NAME,
-        canister_method="greet_4",
-        canister_argument="(record { 6 = 42 : int; 9 = 43 : int }, record { 7 = 44 : int; 10 = 45 : int })",
-        network=network,
-    )
-    expected_response = (
-        '("Hello!", "Your secret numbers are:", 42 : int, 43 : int, 44 : int, 45 : int)'
-    )
-    assert response == expected_response
-
-
-def test__greet_json(network: str, principal: str) -> None:
-    d = {"name": "AJB"}
-    text_in = dict_to_candid_text(d)
-    response = call_canister_api(
-        dfx_json_path=DFX_JSON_PATH,
-        canister_name=CANISTER_NAME,
-        canister_method="greet_json",
-        canister_argument=text_in,
-        network=network,
-    )
-    expected_response = '("{\\"greet\\":\\"Hello AJB!\\"}")'
-    assert response == expected_response
+    expected_response = '(variant { Ok = record { status = 200 : nat16; output = "TODO: we need to add some output here.... ";} })'
+    assert response == expected_response
\ No newline at end of file
diff --git a/version_clang.txt b/version_clang.txt
index da2d398..25bf17f 100644
--- a/version_clang.txt
+++ b/version_clang.txt
@@ -1 +1 @@
-14
\ No newline at end of file
+18
\ No newline at end of file