feat(adaptiverag): start implementing adaptive RAG

1. create docker compose to host chroma; 2. create docker container to upsert sample data; 3. start implementation + unit test. Work on #6
bsorrentino · Jun 17, 2024 · 538c5d7 · 538c5d7
1 parent e34f815
commit 538c5d7
Show file tree

Hide file tree

Showing 44 changed files with 919 additions and 0 deletions.
diff --git a/adaptive-rag/README.md b/adaptive-rag/README.md
@@ -0,0 +1,9 @@
+# Langgraph4j - Adaptive RAG
+
+Java implementation of [Adaptive Rag]
+
+
+[Adaptive Rag]:https://github.com/langchain-ai/langgraph/blob/main/examples/rag/langgraph_adaptive_rag.ipynb
+
+
+
diff --git a/adaptive-rag/agentexecutor.puml.png b/adaptive-rag/agentexecutor.puml.png
diff --git a/adaptive-rag/correction_process.puml.png b/adaptive-rag/correction_process.puml.png
diff --git a/adaptive-rag/image_to_diagram.puml.png b/adaptive-rag/image_to_diagram.puml.png
diff --git a/adaptive-rag/image_to_diagram_with_correction.puml.png b/adaptive-rag/image_to_diagram_with_correction.puml.png
diff --git a/adaptive-rag/logging.properties b/adaptive-rag/logging.properties
@@ -0,0 +1,6 @@
+# java.util.logging configuration: the root logger writes to the console.
+handlers=java.util.logging.ConsoleHandler
+# Default level for every logger without an explicit override below.
+.level=INFO
+# Per-logger overrides.
+# NOTE(review): no class with these names is visible in this module; the two entries may have
+# been carried over from another module - confirm they are still needed.
+DiagramCorrectionProcess.level=FINEST
+ImageToDiagramProcess.level=FINEST
+# The handler accepts every level, so the logger levels above decide what is printed.
+java.util.logging.ConsoleHandler.level=ALL
+java.util.logging.ConsoleHandler.formatter=java.util.logging.SimpleFormatter
diff --git a/adaptive-rag/pom.xml b/adaptive-rag/pom.xml
@@ -0,0 +1,99 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <!-- Module POM of the Adaptive RAG sample; inherits its configuration from langgraph4j-parent. -->
+    <parent>
+        <groupId>org.bsc.langgraph4j</groupId>
+        <artifactId>langgraph4j-parent</artifactId>
+        <version>1.0-SNAPSHOT</version>
+    </parent>
+
+    <artifactId>langgraph4j-adaptive-rag</artifactId>
+    <packaging>jar</packaging>
+
+    <name>langgraph4j::adaptive-rag</name>
+
+    <properties>
+    </properties>
+
+    <dependencies>
+
+        <!-- Core langgraph4j library, always at the same version as this module. -->
+        <dependency>
+            <groupId>org.bsc.langgraph4j</groupId>
+            <artifactId>langgraph4j-jdk8</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+
+        <!-- No version declared here: presumably managed by the parent POM (TODO confirm). -->
+        <dependency>
+            <groupId>org.projectlombok</groupId>
+            <artifactId>lombok</artifactId>
+            <scope>provided</scope>
+        </dependency>
+
+        <!-- NOTE(review): the property is spelled "langchai4j" (without the second "n"). It is not
+             defined in this POM, so the spelling must match the declaration it is inherited from
+             (TODO confirm against the parent POM). -->
+        <dependency>
+            <groupId>dev.langchain4j</groupId>
+            <artifactId>langchain4j</artifactId>
+            <version>${langchai4j.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>dev.langchain4j</groupId>
+            <artifactId>langchain4j-open-ai</artifactId>
+            <version>${langchai4j.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>dev.langchain4j</groupId>
+            <artifactId>langchain4j-chroma</artifactId>
+            <version>${langchai4j.version}</version>
+        </dependency>
+
+        <!-- NOTE(review): no PlantUML usage is visible in the Java sources added with this module;
+             the dependency may have been carried over from another module (TODO confirm). -->
+        <dependency>
+            <groupId>net.sourceforge.plantuml</groupId>
+            <artifactId>plantuml-mit</artifactId>
+            <version>1.2024.4</version>
+        </dependency>
+
+        <!-- Logging: SLF4J API bound to java.util.logging. -->
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-api</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-jdk14</artifactId>
+        </dependency>
+
+        <dependency>
+            <groupId>org.junit.jupiter</groupId>
+            <artifactId>junit-jupiter</artifactId>
+            <scope>test</scope>
+        </dependency>
+
+    </dependencies>
+
+    <build>
+        <plugins>
+            <!-- This module is excluded from deployment. -->
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-deploy-plugin</artifactId>
+                <configuration>
+                    <skip>true</skip>
+                </configuration>
+            </plugin>
+
+            <!-- Test execution is switched off for this module.
+                 NOTE(review): presumably because the tests need external services (OpenAI, Chroma);
+                 confirm, and consider a profile instead of an unconditional skip. -->
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-surefire-plugin</artifactId>
+                <configuration>
+                    <skipTests>true</skipTests>
+                </configuration>
+            </plugin>
+
+            <!-- NOTE(review): no executions are declared, so this configuration only applies when a
+                 lombok-maven-plugin goal is invoked explicitly (TODO confirm this is intended). -->
+            <plugin>
+                <groupId>org.projectlombok</groupId>
+                <artifactId>lombok-maven-plugin</artifactId>
+                <version>1.18.20.0</version>
+                <configuration>
+                    <sourceDirectory>src/main/java</sourceDirectory>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+</project>
diff --git a/adaptive-rag/src/main/docker/docker-compose.yml b/adaptive-rag/src/main/docker/docker-compose.yml
@@ -0,0 +1,19 @@
+# Local development stack: a Chroma vector store plus a one-shot job that loads sample data into it.
+services:
+  chroma:
+    container_name: chromadb
+    image: chromadb/chroma:latest
+    command: "--workers 1 --host 0.0.0.0 --port 8000 --proxy-headers --log-config chromadb/log_config.yml --timeout-keep-alive 30"
+    ports:
+      - "8000:8000"
+    environment:
+      - IS_PERSISTENT=TRUE
+      # Needed by the upsert job, which resets the database before loading the documents.
+      - ALLOW_RESET=TRUE
+  upsert:
+    container_name: upsert
+    build:
+      context: ./upsert
+      dockerfile: Dockerfile
+    # NOTE: depends_on only orders container start-up; it does not wait until Chroma accepts requests.
+    depends_on:
+      - chroma
+    environment:
+      # Never commit API keys. The value is substituted from the shell environment or from a
+      # `.env` file in the project directory (keep that file out of version control).
+      # Any key that was committed here in the past must be treated as leaked and revoked.
+      - OPENAI_API_KEY=${OPENAI_API_KEY}
diff --git a/adaptive-rag/src/main/docker/upsert/Dockerfile b/adaptive-rag/src/main/docker/upsert/Dockerfile
@@ -0,0 +1,11 @@
+# Image for the one-shot job that loads the sample documents into Chroma.
+# The interpreter version is pinned so rebuilds are reproducible (`latest` moves over time).
+FROM python:3.11
+
+# Copy the requirements first so the dependency layer is reused when only index.py changes.
+COPY requirements.txt /tmp/
+
+# The build runs as root, so a system-wide install is used (`--user` would only relocate
+# the packages under /root/.local).
+RUN pip install --no-cache-dir -r /tmp/requirements.txt
+
+WORKDIR /workspace
+
+COPY index.py /workspace/
+
+CMD ["python", "index.py"]
diff --git a/adaptive-rag/src/main/docker/upsert/index.py b/adaptive-rag/src/main/docker/upsert/index.py
@@ -0,0 +1,84 @@
+### Build Index
+
+import chromadb
+from chromadb.config import Settings
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import WebBaseLoader
+from langchain_community.vectorstores import Chroma
+from langchain_openai import OpenAIEmbeddings
+
+## see https://stackoverflow.com/a/77925278/521197
+class CustomOpenAIEmbeddings(OpenAIEmbeddings):
+    """Adapter that makes OpenAIEmbeddings compliant with the chromadb API.
+
+    Instances are callable: calling one with the texts to embed returns the result of
+    OpenAIEmbeddings.embed_documents for those texts.
+    """
+
+    def __init__(self, *args, **kwargs):
+        # Plain pass-through to the OpenAIEmbeddings constructor.
+        super().__init__(*args, **kwargs)
+
+    def _embed_documents(self, texts):
+        """Return the embeddings of `texts` as computed by OpenAIEmbeddings.embed_documents."""
+        return super().embed_documents(texts)  # delegate to the parent's embedding function
+
+    def __call__(self, input):
+        """Callable entry point; `input` is forwarded unchanged to `_embed_documents`."""
+        return self._embed_documents(input)    # return the embeddings
+
+
+def upsert_chroma_via_http( client, docs, embedding_function ):
+    import uuid
+
+    client.reset()  # resets the database
+    collection = client.create_collection("rag-chroma", embedding_function=CustomOpenAIEmbeddings())
+    for doc in docs:
+        collection.add(
+            ids=[str(uuid.uuid1())],
+            metadatas=doc.metadata,
+            documents=doc.page_content
+        )
+    return client
+
+def upsert_chroma_local():
+    "Add to vectorstore"
+
+    vectorstore = Chroma.from_documents(
+        documents=doc_splits,
+        collection_name="rag-chroma",
+        embedding=embd,
+     )
+    retriever = vectorstore.as_retriever()
+
+def test_query( client, embedding_function ):
+
+    db4 = Chroma(
+        client=client,
+        collection_name="rag-chroma",
+        embedding_function=embedding_function,
+    )
+    query = "What are the types of agent memory?"
+    docs = db4.similarity_search(query)
+    print(docs[0].page_content)
+
+
+# Set embeddings
+embd = OpenAIEmbeddings()
+
+# Docs to index
+urls = [
+    "https://lilianweng.github.io/posts/2023-06-23-agent/",
+    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
+    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
+]
+
+# Load
+docs = [WebBaseLoader(url).load() for url in urls]
+docs_list = [item for sublist in docs for item in sublist]
+
+# Split
+text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+    chunk_size=500, chunk_overlap=0
+)
+doc_splits = text_splitter.split_documents(docs_list)
+
+client = chromadb.HttpClient( host="chromadb", port=8000, settings=Settings(allow_reset=True))
+
+upsert_chroma_via_http( client, doc_splits, embd )
+
+test_query( client, embd)
diff --git a/adaptive-rag/src/main/docker/upsert/requirements.txt b/adaptive-rag/src/main/docker/upsert/requirements.txt
@@ -0,0 +1,10 @@
+# Python dependencies of the upsert job (index.py).
+# NOTE(review): no versions are pinned, so every image build may resolve different releases.
+langchain_community
+langchain-openai
+tiktoken
+langchain
+chromadb
+beautifulsoup4
+# Currently disabled:
+#langgraph
+#tavily-python
+#langchain-cohere
+#langchainhub
diff --git a/adaptive-rag/src/main/java/dev/langchain4j/adaptiverag/Grader.java b/adaptive-rag/src/main/java/dev/langchain4j/adaptiverag/Grader.java
@@ -0,0 +1,33 @@
+package dev.langchain4j.adaptiverag;
+
+import dev.langchain4j.model.input.structured.StructuredPrompt;
+import dev.langchain4j.model.output.structured.Description;
+import dev.langchain4j.service.SystemMessage;
+
+/**
+ * Types used to grade whether a retrieved document is relevant to a user question.
+ */
+public class Grader {
+
+    /**
+     * Binary score for relevance check on retrieved documents.
+     */
+    public static class Documents {
+
+        @Description("Documents are relevant to the question, 'yes' or 'no'")
+        public String binaryScore;
+    }
+
+    /**
+     * Structured prompt combining a retrieved document with the user question.
+     * <p>
+     * NOTE(review): the class name does not relate to grading, both fields are private with no
+     * constructor or setter declared here, and the type is not referenced by {@link Retrieval};
+     * confirm the intended usage.
+     */
+    @StructuredPrompt("Retrieved document: \n\n {{document}} \n\n User question: {{question}}")
+    public static class CreateRecipePrompt {
+
+        private String document;
+        private String question;
+    }
+
+    /**
+     * Service contract for the relevance grader; the system message defines the grading rules.
+     * <p>
+     * NOTE(review): {@code invoke} receives only the question, while the system message talks
+     * about a retrieved document; confirm how the document is meant to be supplied.
+     */
+    public interface Retrieval {
+
+        @SystemMessage("You are a grader assessing relevance of a retrieved document to a user question. \n" +
+                "    If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n" +
+                "    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n" +
+                "    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.")
+        Documents invoke(String question);
+    }
+}
diff --git a/adaptive-rag/src/main/java/dev/langchain4j/adaptiverag/Router.java b/adaptive-rag/src/main/java/dev/langchain4j/adaptiverag/Router.java
@@ -0,0 +1,32 @@
+package dev.langchain4j.adaptiverag;
+
+import dev.langchain4j.model.output.structured.Description;
+import dev.langchain4j.service.SystemMessage;
+
+/**
+ * Router for user queries to the most relevant datasource.
+ */
+public class Router {
+
+    /**
+     * Datasources a question can be routed to.
+     */
+    public enum Type {
+        vectorstore,
+        websearch
+    }
+
+    /**
+     * Route a user query to the most relevant datasource.
+     */
+    public static class DataSource {
+
+        // Public, like Grader.Documents.binaryScore, so that the routing result can also be
+        // read from outside this package.
+        @Description("Given a user question choose to route it to web search or a vectorstore.")
+        public Type datasource;
+    }
+
+
+    /**
+     * Service contract that maps a user question to a {@link DataSource}; the system message
+     * describes which topics belong to the vectorstore.
+     */
+    public interface Extractor {
+
+        @SystemMessage("You are an expert at routing a user question to a vectorstore or web search.\n" +
+                "The vectorstore contains documents related to agents, prompt engineering, and adversarial attacks.\n" +
+                "Use the vectorstore for questions on these topics. Otherwise, use web-search.")
+        DataSource route(String question);
+    }
+}
diff --git a/adaptive-rag/src/main/java/resources/convert_generic_diagram_to_plantuml.txt b/adaptive-rag/src/main/java/resources/convert_generic_diagram_to_plantuml.txt
@@ -0,0 +1,20 @@
+Translate the json data representing the diagram data into a plantuml script considering:
+
+1. The participants' shape must be translated in their plantuml counterpart using the following conversion rules :
+    - "rectangle" shape  must be translated into  plantuml's "rectangle"
+    - "circle" shape  must be translated into  plantuml's "circle"
+    - "person" or  "stickman" shape  must be translated into  plantuml's  "actor"
+    - "oval" or "ellipse"  shape  must be translated into  plantuml's  "usecase"
+    - "cylinder" shape  must be translated into  plantuml's  "database"
+    - "diamond" shape  must be translated into  plantuml's "hexagon"
+2. Each recognised participant must be written in the form: "<participant plantuml shape>"  "<name>" as <camel case name><<description>>
+3. Relations must be the arrows that connect participants
+4. Put diagram description in the legend of the diagram in the form:
+    legend
+    <description with a bullet point for each step>
+    end legend
+5. Put  diagram title in the form:
+     title "<diagram title>"
+
+diagram data:
+ {{diagram_description}}
diff --git a/adaptive-rag/src/main/java/resources/convert_sequence_diagram_to_plantuml.txt b/adaptive-rag/src/main/java/resources/convert_sequence_diagram_to_plantuml.txt
@@ -0,0 +1,8 @@
+Translate the diagram description into plantUML syntax.
+Also put the diagram description in the legend in the form:
+legend
+<description with a bullet point for each step>
+end legend
+
+diagram description with title {{diagram_title}}:
+{{diagram_description}}
diff --git a/adaptive-rag/src/main/java/resources/describe_diagram_image.txt b/adaptive-rag/src/main/java/resources/describe_diagram_image.txt
@@ -0,0 +1,36 @@
+Describe the diagram in the image step by step so we can translate it into diagram-as-code syntax.
+
+ Return a markdown code snippet with a JSON object formatted to look like:
+```json
+{
+  "type": string // Diagram typology (one word). E.g. "sequence", "class", "process", etc.
+  "title": string // Diagram summary (max one line) or title (if any)
+  "participants": array[
+    {
+      "name": string // participant name
+      "shape": string // participant shape
+      "description": string // participant description
+    }
+  ]  // list of participants in the diagram 
+  "relations": array[
+    {
+      "source": string // source participant
+      "target": string // target participant
+      "description": string // relation description
+    }
+  ]  // list of relations in the diagram
+  "containers": array[
+    {
+      "name": string // container name
+      "children": array[
+        string
+      ]  // list of contained elements name
+      "description": string // container description
+    }
+  ]  // list of participants that contain other ones in the diagram
+  "description": array[
+    string
+  ]  // Step by step description of the diagram with clear indication of participants and actions between them.
+}
+```
+Must not include the "JSON schema" in the response