Merge branch 'pre/beta' into main

ScrapeGraphAI · May 2, 2024 · 96ebcfc · 96ebcfc
2 parents faa3498 + 75a4042
commit 96ebcfc
Show file tree

Hide file tree

Showing 8 changed files with 71 additions and 8 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,10 +1,12 @@
 ## [0.6.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.6.0...v0.6.1) (2024-05-02)
 
 
+
 ### Bug Fixes
 
 * gemini error ([2ea54ea](https://github.com/VinciGit00/Scrapegraph-ai/commit/2ea54eab1d070e177c7d5ecfcc032b325fbd7c12))
 
+
 ## [0.6.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.5.2...v0.6.0) (2024-05-02)
 
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -3,7 +3,6 @@ name = "scrapegraphai"
 
 version = "0.6.1"
 
-
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
 authors = [
     "Marco Vinciguerra <[email protected]>",
@@ -41,6 +40,8 @@ minify-html = "0.15.0"
 free-proxy = "1.1.1"
 langchain-groq = "0.1.3"
 playwright = "^1.43.0"
+langchain-aws = "^0.1.2"
+
 
 [tool.poetry.dev-dependencies]
 pytest = "8.0.0"

diff --git a/requirements.txt b/requirements.txt
@@ -13,4 +13,5 @@ google==3.0.0
 minify-html==0.15.0
 free-proxy==1.1.1
 langchain-groq==0.1.3
-playwright==1.43.0
+playwright==1.43.0
+langchain-aws==0.1.2
diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py
@@ -4,7 +4,8 @@
 
 from abc import ABC, abstractmethod
 from typing import Optional
-from ..models import OpenAI, Gemini, Ollama, AzureOpenAI, HuggingFace, Groq
+
+from ..models import OpenAI, Gemini, Ollama, AzureOpenAI, HuggingFace, Groq, Bedrock
 from ..helpers import models_tokens
 
 
@@ -47,7 +48,8 @@ def __init__(self, prompt: str, config: dict, source: Optional[str] = None):
 
         # Set common configuration parameters
         self.verbose = True if config is None else config.get("verbose", False)
-        self.headless = True if config is None else config.get("headless", True)
+        self.headless = True if config is None else config.get(
+            "headless", True)
 
         # Create the graph
         self.graph = self._create_graph()
@@ -140,12 +142,26 @@ def _create_llm(self, llm_config: dict, chat=False) -> object:
             return HuggingFace(llm_params)
         elif "groq" in llm_params["model"]:
             llm_params["model"] = llm_params["model"].split("/")[-1]
-            
+
             try:
                 self.model_token = models_tokens["groq"][llm_params["model"]]
             except KeyError:
                 raise KeyError("Model not supported")
             return Groq(llm_params)
+        elif "bedrock" in llm_params["model"]:
+            llm_params["model"] = llm_params["model"].split("/")[-1]
+            model_id = llm_params["model"]
+
+            try:
+                self.model_token = models_tokens["bedrock"][llm_params["model"]]
+            except KeyError:
+                raise KeyError("Model not supported")
+            return Bedrock({
+                "model_id": model_id,
+                "model_kwargs": {
+                    "temperature": llm_params["temperature"],
+                }
+            })
         else:
             raise ValueError(
                 "Model provided by the configuration not supported")

diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py
@@ -48,5 +48,22 @@
         "claude2": 9000,
         "claude2.1": 200000,
         "claude3": 200000
+    },
+    "bedrock": {
+        "anthropic.claude-3-haiku-20240307-v1:0": 200000,
+        "anthropic.claude-3-sonnet-20240229-v1:0": 200000,
+        "anthropic.claude-3-opus-20240229-v1:0": 200000,
+        "anthropic.claude-v2:1": 200000,
+        "anthropic.claude-v2": 100000,
+        "anthropic.claude-instant-v1": 100000,
+        "meta.llama3-8b-instruct-v1:0": 8192,
+        "meta.llama3-70b-instruct-v1:0": 8192,
+        "meta.llama2-13b-chat-v1": 4096,
+        "meta.llama2-70b-chat-v1": 4096,
+        "mistral.mistral-7b-instruct-v0:2": 32768,
+        "mistral.mixtral-8x7b-instruct-v0:1": 32768,
+        "mistral.mistral-large-2402-v1:0": 32768,
+        "cohere.embed-english-v3": 512,
+        "cohere.embed-multilingual-v3": 512
     }
 }
diff --git a/scrapegraphai/models/__init__.py b/scrapegraphai/models/__init__.py
@@ -10,3 +10,4 @@
 from .ollama import Ollama
 from .hugging_face import HuggingFace
 from .groq import Groq
+from .bedrock import Bedrock
diff --git a/scrapegraphai/models/bedrock.py b/scrapegraphai/models/bedrock.py
@@ -0,0 +1,19 @@
+""" 
+bedrock configuration wrapper
+"""
+from langchain_aws import ChatBedrock
+
+
+class Bedrock(ChatBedrock):
+    """Class for wrapping bedrock module"""
+
+    def __init__(self, llm_config: dict):
+        """
+        A wrapper for the ChatBedrock class that provides default configuration
+        and could be extended with additional methods if needed.
+
+        Args:
+            llm_config (dict): Configuration parameters for the language model.
+        """
+        # Initialize the superclass (ChatBedrock) with provided config parameters
+        super().__init__(**llm_config)
diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py
@@ -6,12 +6,14 @@
 from langchain.docstore.document import Document
 from langchain.retrievers import ContextualCompressionRetriever
 from langchain.retrievers.document_compressors import EmbeddingsFilter, DocumentCompressorPipeline
+from langchain_aws.embeddings.bedrock import BedrockEmbeddings
 from langchain_community.document_transformers import EmbeddingsRedundantFilter
 from langchain_community.embeddings import HuggingFaceHubEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import OllamaEmbeddings
 from langchain_openai import OpenAIEmbeddings, AzureOpenAIEmbeddings
-from ..models import OpenAI, Ollama, AzureOpenAI, HuggingFace
+
+from ..models import OpenAI, Ollama, AzureOpenAI, HuggingFace, Bedrock
 from .base_node import BaseNode
 
 
@@ -39,7 +41,8 @@ def __init__(self, input: str, output: List[str], node_config: dict, node_name:
 
         self.llm_model = node_config["llm"]
         self.embedder_model = node_config.get("embedder_model", None)
-        self.verbose = True if node_config is None else node_config.get("verbose", False)
+        self.verbose = True if node_config is None else node_config.get(
+            "verbose", False)
 
     def execute(self, state: dict) -> dict:
         """
@@ -80,7 +83,7 @@ def execute(self, state: dict) -> dict:
                 },
             )
             chunked_docs.append(doc)
-        
+
         if self.verbose:
             print("--- (updated chunks metadata) ---")
 
@@ -104,6 +107,9 @@ def execute(self, state: dict) -> dict:
             embeddings = OllamaEmbeddings(**params)
         elif isinstance(embedding_model, HuggingFace):
             embeddings = HuggingFaceHubEmbeddings(model=embedding_model.model)
+        elif isinstance(embedding_model, Bedrock):
+            embeddings = BedrockEmbeddings(
+                client=None, model_id=embedding_model.model_id)
         else:
             raise ValueError("Embedding Model missing or not supported")