Merge pull request #34 from fengsh27/wip/fengsh/merge-upstream-0814
merge from upstream main
fengsh27 authored Aug 14, 2024
2 parents be01ca6 + 7e0aa65 commit d50a245
Showing 66 changed files with 1,742 additions and 1,546 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.6.1
current_version = 0.6.4
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)
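For reference, a minimal sketch (not part of the diff) of how the `parse` pattern above decomposes the new version string:

```python
# Quick check of the bumpversion `parse` pattern against the new version string.
import re

pattern = r"(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)"
match = re.match(pattern, "0.6.4")
print(match.groupdict())  # {'major': '0', 'minor': '6', 'patch': '4'}
```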
2 changes: 1 addition & 1 deletion .github/workflows/docs.yaml
@@ -36,7 +36,7 @@ jobs:
mkdocs-material-
- name: Packages
run: pip install mkdocs-material[imaging] mkdocstrings[python] pandas mkdocs-table-reader-plugin seaborn matplotlib scipy colorcet
run: pip install mkdocs-material[imaging] mkdocstrings[python] pandas mkdocs-table-reader-plugin seaborn matplotlib scipy colorcet mkdocs-redirects

- name: Build
run: mkdocs build
6 changes: 3 additions & 3 deletions benchmark/data/benchmark_data.yaml
@@ -228,7 +228,7 @@ medical_exam:

# Math relevant questions

- case: single_answer:math:en
- case: single_choice:math:en
input:
prompt: A hollow organ of a patient contains 0.5 kg of liquid whose (mass) density is approximately the same as that of water. What is the approximate volume of this hollow organ? (A) 500 mm3 (B) 500 cm3 (C) 0,05 m3 (D) 0,5 m3 (E) 50 m3
system_messages:
@@ -238,7 +238,7 @@
expected:
answer: "b"
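As a quick check of the expected answer, assuming the density of water is roughly $1000\ \mathrm{kg/m^3}$:

$$ V = \frac{m}{\rho} = \frac{0.5\ \mathrm{kg}}{1000\ \mathrm{kg/m^3}} = 5 \times 10^{-4}\ \mathrm{m^3} = 500\ \mathrm{cm^3}, $$

which corresponds to option (B), consistent with the expected answer "b".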

- case: single_answer:math:en
- case: single_choice:math:en
input:
prompt: In a car accident, an occupant weighing 75 kg is restrained by the tight-fitting seat belt, whereby a belt force limiter limits the force acting on the occupant to a maximum of 5,000 N. The acceleration acting on the occupant is thus limited (in terms of amount) to approx. (A) 0.067 m/s2 (B) 15 m/s2 (C) 33 m/s2 (D) 67 m/s2 (E) 375 m/s2
system_messages:
@@ -248,7 +248,7 @@
expected:
answer: "d"
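Here the expected answer follows from Newton's second law:

$$ a = \frac{F}{m} = \frac{5000\ \mathrm{N}}{75\ \mathrm{kg}} \approx 67\ \mathrm{m/s^2}, $$

i.e. option (D), consistent with the expected answer "d".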

- case: single_answer:math:en
- case: single_choice:math:en
input:
prompt: Medical oxygen is supplied in pressurised gas cylinders with an internal pressure of usually 200 bar. Manufacturers state that these cylinders should not be heated to over 50 °C and must be stored in a protected environment. By approximately what percentage does this internal pressure increase when such a pressurised gas cylinder is heated from 30 °C to 90 °C? (A) 3 % (B) 20 % (C) 40 % (D) 60 % (E) 300 %
system_messages:
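For the gas-cylinder question, Gay-Lussac's law at constant volume (temperatures in kelvin) gives

$$ \frac{p_2}{p_1} = \frac{T_2}{T_1} = \frac{363.15\ \mathrm{K}}{303.15\ \mathrm{K}} \approx 1.20, $$

i.e. an increase of roughly 20 %, corresponding to option (B).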
3 changes: 3 additions & 0 deletions benchmark/load_dataset.py
@@ -115,6 +115,9 @@ def _delete_outdated_benchmark_results(data_dict: dict) -> None:

# delete outdated results
for file in result_files:
continue
# turn off deletion for now

if "multimodal_answer" in file:
continue
result_file = pd.read_csv(file, header=0)
493 changes: 118 additions & 375 deletions benchmark/results/medical_exam.csv


206 changes: 103 additions & 103 deletions benchmark/results/medical_exam_failure_modes.csv


8 changes: 4 additions & 4 deletions benchmark/results/processed/correlations.txt
@@ -1,4 +1,4 @@
Size vs accuracy Pearson correlation: 0.2196874410865915
Size vs accuracy Pearson correlation p-value: 8.674722515042285e-09
Quantisation vs accuracy Pearson correlation: 0.2427859964015104
Quantisation vs accuracy Pearson correlation p-value: 1.797917250633135e-10
Size vs accuracy Pearson correlation: 0.22108985916662796
Size vs accuracy Pearson correlation p-value: 7.1171375567780455e-09
Quantisation vs accuracy Pearson correlation: 0.2455410361574719
Quantisation vs accuracy Pearson correlation p-value: 1.1369970729884332e-10
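Values of this form are what `scipy.stats.pearsonr` reports. A minimal sketch of such a computation follows; the DataFrame, column names, and numbers are illustrative assumptions, not the benchmark's actual processing code or data:

```python
# Minimal sketch of computing a Pearson correlation and p-value with SciPy.
# The toy DataFrame below is illustrative only; it is not the benchmark data.
import pandas as pd
from scipy.stats import pearsonr

df = pd.DataFrame(
    {
        "size": [7, 7, 13, 13, 46.7],          # model size (billions of parameters)
        "accuracy": [0.54, 0.40, 0.43, 0.41, 0.37],
    }
)
r, p = pearsonr(df["size"], df["accuracy"])
print(f"Size vs accuracy Pearson correlation: {r}")
print(f"Size vs accuracy Pearson correlation p-value: {p}")
```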
7 changes: 3 additions & 4 deletions benchmark/results/processed/medical_exam.csv
@@ -17,9 +17,9 @@ openhermes-2.5:7:ggufv2:Q3_K_M,604.0,1071.0,1.7320508075688772,0.563958916900093
openhermes-2.5:7:ggufv2:Q2_K,576.0,1071.0,0.0,0.5378151260504201,3
llama-2-chat:13:ggufv2:Q8_0,462.0,1071.0,0.0,0.43137254901960786,3
llama-2-chat:13:ggufv2:Q5_K_M,462.0,1071.0,0.0,0.43137254901960786,3
llama-2-chat:13:ggufv2:Q6_K,459.0,1071.0,0.0,0.42857142857142855,3
llama-2-chat:13:ggufv2:Q4_K_M,459.0,1071.0,0.0,0.42857142857142855,3
llama-2-chat:13:ggufv2:Q3_K_M,459.0,1071.0,0.0,0.42857142857142855,3
llama-2-chat:13:ggufv2:Q6_K,459.0,1071.0,0.0,0.42857142857142855,3
chatglm3:6:ggmlv3:q4_0,457.0,1071.0,21.616171041461605,0.42670401493930904,3
llama-2-chat:13:ggufv2:Q2_K,444.0,1071.0,0.0,0.41456582633053224,3
llama-2-chat:7:ggufv2:Q6_K,435.0,1071.0,0.0,0.4061624649859944,3
@@ -29,14 +29,13 @@ llama-2-chat:7:ggufv2:Q5_K_M,429.0,1071.0,0.0,0.4005602240896359,3
llama-2-chat:7:ggufv2:Q3_K_M,423.0,1071.0,0.0,0.3949579831932773,3
llama-2-chat:7:ggufv2:Q2_K,396.0,1071.0,0.0,0.3697478991596639,3
mixtral-instruct-v0.1:46_7:ggufv2:Q4_K_M,395.0,1071.0,0.5773502691896257,0.36881419234360413,3
mistral-instruct-v0.2:7:ggufv2:Q8_0,393.0,1071.0,0.0,0.36694677871148457,3
mistral-instruct-v0.2:7:ggufv2:Q6_K,393.0,1071.0,0.0,0.36694677871148457,3
mistral-instruct-v0.2:7:ggufv2:Q8_0,393.0,1071.0,0.0,0.36694677871148457,3
mistral-instruct-v0.2:7:ggufv2:Q4_K_M,391.0,1071.0,0.5773502691896258,0.36507936507936506,3
mistral-instruct-v0.2:7:ggufv2:Q5_K_M,390.0,1071.0,0.0,0.3641456582633053,3
mistral-instruct-v0.2:7:ggufv2:Q3_K_M,386.0,1071.0,0.5773502691896258,0.3604108309990663,3
mixtral-instruct-v0.1:46_7:ggufv2:Q8_0,384.0,1071.0,0.0,0.3585434173669468,3
mixtral-instruct-v0.1:46_7:ggufv2:Q5_K_M,378.0,1071.0,0.0,0.35294117647058826,3
mistral-instruct-v0.2:7:ggufv2:Q2_K,378.0,1071.0,0.0,0.35294117647058826,3
mixtral-instruct-v0.1:46_7:ggufv2:Q5_K_M,378.0,1071.0,0.0,0.35294117647058826,3
mixtral-instruct-v0.1:46_7:ggufv2:Q6_K,367.0,1071.0,0.5773502691896257,0.3426704014939309,3
mixtral-instruct-v0.1:46_7:ggufv2:Q2_K,353.0,1071.0,0.5773502691896258,0.3295985060690943,3
gpt-3.5-turbo-0613,312.0,1071.0,0.0,0.2913165266106443,3
6 changes: 3 additions & 3 deletions benchmark/results/processed/naive_vs_biochatter.txt
@@ -1,4 +1,4 @@
mean: 0.8269791666666666 vs 0.4603125
mean: 0.8269791666666667 vs 0.4603125
std: 0.11106303523411587 vs 0.11423809220552653
p-value: 1.5661453605577185e-37
t-statistic: 18.410684234461385
p-value: 1.5661453605576738e-37
t-statistic: 18.410684234461392
2 changes: 1 addition & 1 deletion benchmark/results/processed/overview-model.csv
@@ -2,8 +2,8 @@ Model name,Size,Median Accuracy,SD
gpt-3.5-turbo-0125,175,0.87,0.21
gpt-4-turbo-2024-04-09,Unknown,0.83,0.3
gpt-4-0613,Unknown,0.78,0.18
gpt-3.5-turbo-0613,175,0.76,0.21
gpt-4o-2024-05-13,Unknown,0.73,0.35
gpt-3.5-turbo-0613,175,0.73,0.24
gpt-4-0125-preview,Unknown,0.73,0.3
gpt-4o-mini-2024-07-18,Unknown,0.7,0.27
openhermes-2.5,7,0.7,0.32
2 changes: 1 addition & 1 deletion benchmark/results/processed/overview-quantisation.csv
@@ -2,10 +2,10 @@ Model name,Size,Version,Quantisation,Median Accuracy,SD
gpt-3.5-turbo-0125,175,,,0.87,0.21
gpt-4-turbo-2024-04-09,Unknown,,,0.83,0.3
gpt-4-0613,Unknown,,,0.78,0.18
gpt-3.5-turbo-0613,175,,,0.76,0.21
gpt-4-0125-preview,Unknown,,,0.73,0.3
gpt-4o-2024-05-13,Unknown,,,0.73,0.35
openhermes-2.5,7,ggufv2,Q5_K_M,0.73,0.32
gpt-3.5-turbo-0613,175,,,0.73,0.24
openhermes-2.5,7,ggufv2,Q8_0,0.71,0.32
openhermes-2.5,7,ggufv2,Q4_K_M,0.71,0.33
gpt-4o-mini-2024-07-18,Unknown,,,0.7,0.27
2 changes: 1 addition & 1 deletion benchmark/results/processed/overview.csv
@@ -2,10 +2,10 @@ Full model name,property_selection,query_generation,multimodal_answer,api_callin
gpt-3.5-turbo-0125,0.35625,0.9666666666666668,,0.7464788732394366,0.5100318961757607,1.0,1.0,1.0,0.9266666666666666,0.4866666666666667,0.9,0.6704014939309056,0.8666666666666667,0.7858190775010643,0.8666666666666667,0.2113612585450092
gpt-4-turbo-2024-04-09,0.303125,0.8266666666666667,0.99,,0.6503689591957673,1.0,1.0,0.0,0.6,0.5,1.0,0.8395061728395061,1.0,0.725805566558495,0.8266666666666667,0.30141137449353134
gpt-4-0613,0.359375,0.9666666666666668,,0.6190476190476191,0.6689027994568157,1.0,0.8888888888888888,0.65,0.88,0.68,1.0,0.7301587301587301,0.8888888888888888,0.7776607160923007,0.7776607160923007,0.17755825856670066
gpt-3.5-turbo-0613,0.3625,0.9466666666666668,,,0.5753814654033865,1.0,0.8888888888888888,0.5,0.8333333333333334,0.5,1.0,,0.7555555555555555,0.7362325909847831,0.7555555555555555,0.21192578872727258
gpt-4-0125-preview,0.0,0.8333333333333334,,0.7936507936507936,0.6897052189771663,1.0,0.7777777777777778,0.75,0.0,0.44,0.5,0.7759103641456583,0.7333333333333333,0.6078092351015052,0.7333333333333333,0.2951294777600531
gpt-4o-2024-05-13,0.0,0.8,0.96,0.8095238095238095,0.6539462799425529,1.0,1.0,0.0,0.0,0.5333333333333333,0.7,0.7628384687208216,0.85,0.6207416839631167,0.7314192343604108,0.3510912882717199
openhermes-2.5:7:ggufv2:Q5_K_M,0.125,0.9133333333333332,,,0.5799163100443383,1.0,0.8888888888888888,1.0,0.0,0.5866666666666667,1.0,0.5714285714285714,0.7777777777777778,0.6766374134672342,0.7272075956225059,0.3185927867352887
gpt-3.5-turbo-0613,0.3625,0.9466666666666668,,,0.5753814654033865,1.0,0.8888888888888888,0.5,0.8333333333333334,0.5,1.0,0.2913165266106443,0.7555555555555555,0.6957856760416795,0.7256706157986175,0.23707855730412225
openhermes-2.5:7:ggufv2:Q8_0,0.125,0.88,,,0.6008286779833671,1.0,0.8888888888888888,1.0,0.0,0.4666666666666667,1.0,0.5770308123249299,0.7555555555555555,0.6630882364926735,0.7093218960241146,0.3199188844960537
openhermes-2.5:7:ggufv2:Q4_K_M,0.046875,0.8733333333333333,,,0.5972813161390413,1.0,0.8888888888888888,1.0,0.0,0.4666666666666667,1.0,0.5863678804855276,0.7555555555555555,0.655906240097183,0.7057308978263692,0.3309323511149121
gpt-4o-mini-2024-07-18,0.365625,0.9666666666666668,0.98,0.7142857142857143,0.6845534288609352,0.8333333333333334,1.0,0.0,0.66,0.5333333333333333,0.5,0.8404984423676013,0.925,0.6925612245267373,0.7034234694062258,0.2670836626973781
2 changes: 1 addition & 1 deletion biochatter/api_agent/abc.py
@@ -93,7 +93,7 @@ class BaseFetcher(ABC):
@abstractmethod
def fetch_results(
self,
query_model,
query_model: BaseModel,
retries: Optional[int] = 3,
):
"""
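To illustrate the annotated interface, a minimal sketch of a concrete fetcher implementing the abstract method above; the class name and body are assumptions for illustration, not code from the repository, and the import path assumes the package layout matches the file path:

```python
# Illustrative subclass of the BaseFetcher ABC shown above.
# Assumes pydantic's BaseModel, as the new type annotation suggests.
from typing import Optional

from pydantic import BaseModel

from biochatter.api_agent.abc import BaseFetcher


class EchoFetcher(BaseFetcher):
    """Toy fetcher that echoes the query model instead of calling an API."""

    def fetch_results(
        self,
        query_model: BaseModel,
        retries: Optional[int] = 3,
    ) -> str:
        # A real fetcher would submit query_model to an external service and
        # retry up to `retries` times; here we just return its string form.
        return str(query_model)
```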
1 change: 0 additions & 1 deletion biochatter/api_agent/blast.py
@@ -299,7 +299,6 @@ def summarise_results(
question (str): The question to be answered.
conversation_factory: A BioChatter conversation object.
response_text (str): The response.text returned by NCBI.
n_lines (int): The number of lines to read from the file.
Returns:
str: The extracted answer from the BLAST results.
1 change: 0 additions & 1 deletion biochatter/api_agent/oncokb.py
@@ -301,7 +301,6 @@ def summarise_results(
question (str): The question to be answered.
conversation_factory: A BioChatter conversation object.
response_text (str): The response.text returned by OncoKB.
n_lines (int): The number of lines to read from the file.
Returns:
str: The extracted answer from the BLAST results.
18 changes: 8 additions & 10 deletions biochatter/llm_connect.py
@@ -11,7 +11,7 @@
st = None

from abc import ABC, abstractmethod
from typing import Optional
from typing import Optional, Tuple
import json
import base64
import logging
@@ -333,7 +333,8 @@ def _inject_context_by_ragagent_selector(self, text: str):
agent for agent in self.rag_agents if agent.use_prompt
]
decider_agent = RagAgentSelector(
rag_agents=rag_agents, conversation_factory=lambda: self,
rag_agents=rag_agents,
conversation_factory=lambda: self,
)
result = decider_agent.execute(text)
if result.tool_result is not None and len(result.tool_result) > 0:
@@ -415,7 +416,7 @@ def get_last_injected_context(self) -> list[dict]:
)
return last_context

def get_msg_json(self):
def get_msg_json(self) -> str:
"""
Return a JSON representation (of a list of dicts) of the messages in
the conversation. The keys of the dicts are the roles, the values are
@@ -476,17 +477,14 @@ def __init__(
split_correction=split_correction,
)

def query(self, text: str):
def query(self, text: str) -> Tuple:
"""
Return the entire message history as a single string. This is the
message that is sent to the wasm model.
Args:
text (str): The user query.
collection_name (str): The name of the collection to use for
retrieval-augmented generation.
Returns:
tuple: A tuple containing the message history as a single string,
and `None` for the second and third elements of the tuple.
@@ -835,7 +833,7 @@ def _update_usage_stats(self, model: str, token_usage: dict):
"""
pass

def set_api_key(self):
def set_api_key(self) -> bool:
"""
Try to get the Xinference model from the client API. If the model is
found, initialise the conversational agent. If the model is not found,
@@ -1122,7 +1120,7 @@ def __init__(
self.ca_model_name = "gpt-3.5-turbo"
# TODO make accessible by drop-down

def set_api_key(self, api_key: str, user: str):
def set_api_key(self, api_key: str, user: str) -> bool:
"""
Set the API key for the OpenAI API. If the key is valid, initialise the
conversational agent. Set the user for usage statistics.
@@ -1294,7 +1292,7 @@ def __init__(
self.base_url = base_url
self.deployment_name = deployment_name

def set_api_key(self, api_key: str, user: Optional[str] = None):
def set_api_key(self, api_key: str, user: Optional[str] = None) -> bool:
"""
Set the API key for the Azure API. If the key is valid, initialise the
conversational agent. No user stats on Azure.
12 changes: 5 additions & 7 deletions biochatter/vectorstore.py
@@ -1,4 +1,4 @@
from typing import Optional
from typing import List, Optional

from transformers import GPT2TokenizerFast
from langchain.schema import Document
@@ -334,18 +334,16 @@ def __init__(
documentids_workspace=documentids_workspace,
)

def load_models(self):
def load_models(self) -> None:
"""
Return all models that are currently available on the Xinference server.
Returns:
dict: dict of models
Get all models that are currently available on the Xinference server and
write them to `self.models`.
"""
for id, model in self.client.list_models().items():
model["id"] = id
self.models[model["model_name"]] = model

def list_models_by_type(self, type: str):
def list_models_by_type(self, type: str) -> List[str]:
"""
Return all models of a certain type that are currently available on the
Xinference server.
15 changes: 8 additions & 7 deletions biochatter/vectorstore_agent.py
@@ -522,7 +522,7 @@ def _build_meta_col_query_expr_for_all_documents(
return expr.replace('"', "").replace("'", "")

def similarity_search(
self, query: str, k: int = 3, doc_ids: list[str] = None
self, query: str, k: int = 3, doc_ids: Optional[list[str]] = None
) -> list[Document]:
"""
Perform similarity search inside the currently active database
@@ -539,8 +539,8 @@ def similarity_search(
k (int): the number of results to return
doc_ids(List[str] optional): the list of document ids, do similarity search across the
specified documents
doc_ids (Optional[list[str]]): the list of document ids, do
similarity search across the specified documents
Returns:
List[Document]: search results
@@ -569,8 +569,8 @@ def remove_document(
Args:
doc_id (str): the document to be deleted
doc_ids(List[str] optional): the list of document ids, defines documents scope
within which remove operation occurs.
doc_ids (Optional[list[str]]): the list of document ids, defines
documents scope within which remove operation occurs.
Returns:
bool: True if the document is deleted, False otherwise
@@ -610,8 +610,9 @@ def get_all_documents(
Get all non-deleted documents from the currently active database.
Args:
doc_ids(List[str] optional): the list of document ids, defines documents scope within
which the operation of obaining all documents occurs
doc_ids (List[str] optional): the list of document ids, defines
documents scope within which the operation of obtaining all
documents occurs
Returns:
List[Dict]: the metadata of all non-deleted documents in the form
File renamed without changes.
4 changes: 4 additions & 0 deletions docs/api-docs/index.md
@@ -0,0 +1,4 @@
# BioChatter API Reference Documentation

Here we collect documentation of BioChatter module APIs. For detailed
information on each module, please refer to the navigation side bar.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
10 changes: 5 additions & 5 deletions docs/benchmark.md → docs/benchmark/overview.md
@@ -1,8 +1,8 @@
# Benchmark Results - Overview

Here we collect the results of the living BioChatter benchmark. For an
explanation, see the [benchmarking documentation](benchmarking.md) and the
[developer docs](benchmark-developer.md) for further reading.
explanation, see the [benchmarking documentation](../features/benchmark.md) and the
[developer docs](developer.md) for further reading.

## Scores per model

@@ -11,8 +11,8 @@ Click the column names to reorder.

{{ read_csv('benchmark/results/processed/overview-model.csv', colalign=("left","right")) }}

![Scatter Quantisation Name](images/scatter-per-quantisation-name.png)
![Boxplot Model](images/stripplot-per-model.png)
![Scatter Quantisation Name](../images/scatter-per-quantisation-name.png)
![Boxplot Model](../images/stripplot-per-model.png)

## Scores per quantisation

@@ -21,7 +21,7 @@ Click the column names to reorder.

{{ read_csv('benchmark/results/processed/overview-quantisation.csv', colalign=("left","right")) }}

![Boxplot Quantisation](images/boxplot-per-quantisation.png)
![Boxplot Quantisation](../images/boxplot-per-quantisation.png)

## Scores of all tasks

10 changes: 9 additions & 1 deletion docs/benchmark-results.md → docs/benchmark/results.md
@@ -104,4 +104,12 @@ In this set of tasks, we test LLM abilities to extract text from a given documen

{{ read_csv('benchmark/results/processed/multimodal_answer.csv', colalign=("center","center","center","center","center")) }}

![Stripplot Extraction Subtask](images/stripplot-extraction-tasks.png)
![Stripplot Extraction Subtask](../images/stripplot-extraction-tasks.png)

## Medical Exam Question Answering

In this set of tasks, we test LLM abilities to answer medical exam questions.

=== "Overall Performance"

{{ read_csv('benchmark/results/processed/medical_exam.csv', colalign=("center","center","center","center","center")) }}
6 changes: 3 additions & 3 deletions docs/benchmarking.md → docs/features/benchmark.md
@@ -1,4 +1,4 @@
# Benchmarking
# The BioChatter Living Benchmark

For trustworthy application of LLMs to real-world and biomedical problems, it is imperative to understand their performance and limitations.
We need to constantly evaluate the multitude of combinations of individual models and versions, their parameters (e.g., temperature), prompt sets, databases and vector databases, and diverse application scenarios.
@@ -10,12 +10,12 @@ By tracking model performance on these tests over time, we can gain insights int
## Running the benchmark

The benchmark uses the pytest framework to orchestrate the evaluation of a number of models on a number of tasks.
The benchmark is run on a regular basis, and the results are published in the [benchmark section](benchmark.md).
The benchmark is run on a regular basis, and the results are published in the [benchmark section](../benchmark/overview.md).
The benchmarking suite can be found in the `benchmark` directory of the BioChatter repository.
It can be executed using standard pytest syntax, e.g., `poetry run pytest benchmark`.
By default, it checks which test cases have already been executed and only runs the tests that have not been executed yet.
To run all benchmarks again, use `poetry run pytest benchmark --run-all`.
If you want to develop and debug the benchmark instead of just running it, please refer to the [developer docs](benchmark-developer.md).
If you want to develop and debug the benchmark instead of just running it, please refer to the [developer docs](../benchmark/developer.md).

To allow flexible extension of the benchmark, we have implemented a modular test framework that uses pytest fixtures to make adding new models and tasks easy.
All setup is done in the `conftest.py` file in the `benchmark` directory.
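A minimal sketch of the fixture-based pattern described above (the fixture and test names are illustrative assumptions, not the contents of the actual `conftest.py`):

```python
# Illustrative sketch of parameterising a benchmark over models and tasks
# with pytest fixtures; not the actual BioChatter conftest.py.
import pytest

MODELS = ["gpt-3.5-turbo-0125", "openhermes-2.5"]  # models to evaluate
TASKS = ["medical_exam", "query_generation"]       # benchmark task names


@pytest.fixture(params=MODELS)
def model(request):
    """Yield one model name per parametrised benchmark run."""
    return request.param


@pytest.mark.parametrize("task", TASKS)
def test_benchmark_case(model, task):
    """Each (model, task) combination becomes one collectable test case."""
    # A real benchmark test would run the task against the model and
    # record the result; here we only assert the parameters exist.
    assert model and task
```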
File renamed without changes.
5 changes: 5 additions & 0 deletions docs/features/index.md
@@ -0,0 +1,5 @@
# Features

Here we describe the features of the BioChatter framework and their standard
usage scenarios. For individual components, please check the side bar
navigation.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion docs/rag.md → docs/features/rag.md
@@ -135,7 +135,7 @@ process. This is partly to account for the limited token input space of some
models, and partly to better be able to test and compare the individual steps.
The steps can also be wrapped in a single function, `generate_query`, which
handles the entire process. In addition, we can use the [Reflexion
Agent](../reflexion-agent) to iteratively improve a query based on its results.
Agent](reflexion-agent.md) to iteratively improve a query based on its results.

#### Setup

File renamed without changes.
File renamed without changes.
Binary file added docs/images/boxplot-medical-exam-domain.png
Binary file added docs/images/boxplot-medical-exam-language.png
Binary file added docs/images/boxplot-medical-exam-task.png
Binary file modified docs/images/boxplot-naive-vs-biochatter.pdf
Binary file modified docs/images/boxplot-per-quantisation.png
Binary file modified docs/images/boxplot-tasks.png
Binary file added docs/images/boxplot-text2cypher.png
Binary file modified docs/images/dotplot-per-task.pdf
Binary file modified docs/images/dotplot-per-task.png
Binary file modified docs/images/scatter-per-quantisation-name.pdf
Binary file modified docs/images/scatter-per-quantisation-name.png
Binary file modified docs/images/scatter-quantisation-accuracy.pdf
Binary file modified docs/images/scatter-quantisation-accuracy.png
Binary file modified docs/images/scatter-size-accuracy.pdf
Binary file modified docs/images/scatter-size-accuracy.png
Binary file modified docs/images/stripplot-extraction-tasks.png
Binary file modified docs/images/stripplot-per-model.png
Binary file modified docs/images/stripplot-rag-tasks.pdf
Binary file modified docs/images/stripplot-rag-tasks.png