diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4a29f231..17505fcd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,10 +1,12 @@
 ## [0.6.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.6.0...v0.6.1) (2024-05-02)
 
+
 ### Bug Fixes
 
 * gemini errror ([2ea54ea](https://github.com/VinciGit00/Scrapegraph-ai/commit/2ea54eab1d070e177c7d5ecfcc032b325fbd7c12))
 
+
 ## [0.6.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.5.2...v0.6.0) (2024-05-02)
diff --git a/pyproject.toml b/pyproject.toml
index 800e5c9f..19c41f4d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,6 @@
 name = "scrapegraphai"
 version = "0.6.1"
-
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
 authors = [
     "Marco Vinciguerra ",
@@ -41,6 +40,8 @@
 minify-html = "0.15.0"
 free-proxy = "1.1.1"
 langchain-groq = "0.1.3"
 playwright = "^1.43.0"
+langchain-aws = "^0.1.2"
+
 [tool.poetry.dev-dependencies]
 pytest = "8.0.0"
diff --git a/requirements.txt b/requirements.txt
index 02aadac4..b7c642d1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,4 +13,5 @@ google==3.0.0
 minify-html==0.15.0
 free-proxy==1.1.1
 langchain-groq==0.1.3
-playwright==1.43.0
\ No newline at end of file
+playwright==1.43.0
+langchain-aws==0.1.2
diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py
index 7949c114..b8a9efe9 100644
--- a/scrapegraphai/graphs/abstract_graph.py
+++ b/scrapegraphai/graphs/abstract_graph.py
@@ -4,7 +4,8 @@
 from abc import ABC, abstractmethod
 from typing import Optional
-from ..models import OpenAI, Gemini, Ollama, AzureOpenAI, HuggingFace, Groq
+
+from ..models import OpenAI, Gemini, Ollama, AzureOpenAI, HuggingFace, Groq, Bedrock
 from ..helpers import models_tokens
@@ -47,7 +48,8 @@ def __init__(self, prompt: str, config: dict, source: Optional[str] = None):
 
         # Set common configuration parameters
         self.verbose = True if config is None else config.get("verbose", False)
-        self.headless = True if config is None else config.get("headless", True)
+        self.headless = True if config is None else config.get(
+            "headless", True)
 
         # Create the graph
         self.graph = self._create_graph()
@@ -140,12 +142,26 @@ def _create_llm(self, llm_config: dict, chat=False) -> object:
             return HuggingFace(llm_params)
         elif "groq" in llm_params["model"]:
             llm_params["model"] = llm_params["model"].split("/")[-1]
-
+
             try:
                 self.model_token = models_tokens["groq"][llm_params["model"]]
             except KeyError:
                 raise KeyError("Model not supported")
             return Groq(llm_params)
+        elif "bedrock" in llm_params["model"]:
+            llm_params["model"] = llm_params["model"].split("/")[-1]
+            model_id = llm_params["model"]
+
+            try:
+                self.model_token = models_tokens["bedrock"][llm_params["model"]]
+            except KeyError:
+                raise KeyError("Model not supported")
+            return Bedrock({
+                "model_id": model_id,
+                "model_kwargs": {
+                    "temperature": llm_params["temperature"],
+                }
+            })
         else:
             raise ValueError(
                 "Model provided by the configuration not supported")
diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py
index 55d3e689..a9bab3fc 100644
--- a/scrapegraphai/helpers/models_tokens.py
+++ b/scrapegraphai/helpers/models_tokens.py
@@ -48,5 +48,22 @@
         "claude2": 9000,
         "claude2.1": 200000,
         "claude3": 200000
+    },
+    "bedrock": {
+        "anthropic.claude-3-haiku-20240307-v1:0": 200000,
+        "anthropic.claude-3-sonnet-20240229-v1:0": 200000,
+        "anthropic.claude-3-opus-20240229-v1:0": 200000,
+        "anthropic.claude-v2:1": 200000,
+        "anthropic.claude-v2": 100000,
+        "anthropic.claude-instant-v1": 100000,
+        "meta.llama3-8b-instruct-v1:0": 8192,
+        "meta.llama3-70b-instruct-v1:0": 8192,
+        "meta.llama2-13b-chat-v1": 4096,
+        "meta.llama2-70b-chat-v1": 4096,
+        "mistral.mistral-7b-instruct-v0:2": 32768,
+        "mistral.mixtral-8x7b-instruct-v0:1": 32768,
+        "mistral.mistral-large-2402-v1:0": 32768,
+        "cohere.embed-english-v3": 512,
+        "cohere.embed-multilingual-v3": 512
     }
 }
diff --git a/scrapegraphai/models/__init__.py b/scrapegraphai/models/__init__.py
index b81e376f..19751e4c 100644
--- a/scrapegraphai/models/__init__.py
+++ b/scrapegraphai/models/__init__.py
@@ -10,3 +10,4 @@
 from .ollama import Ollama
 from .hugging_face import HuggingFace
 from .groq import Groq
+from .bedrock import Bedrock
diff --git a/scrapegraphai/models/bedrock.py b/scrapegraphai/models/bedrock.py
new file mode 100644
index 00000000..b7cbe288
--- /dev/null
+++ b/scrapegraphai/models/bedrock.py
@@ -0,0 +1,19 @@
+"""
+bedrock configuration wrapper
+"""
+from langchain_aws import ChatBedrock
+
+
+class Bedrock(ChatBedrock):
+    """Class for wrapping bedrock module"""
+
+    def __init__(self, llm_config: dict):
+        """
+        A wrapper for the ChatBedrock class that provides default configuration
+        and could be extended with additional methods if needed.
+
+        Args:
+            llm_config (dict): Configuration parameters for the language model.
+        """
+        # Initialize the superclass (ChatBedrock) with provided config parameters
+        super().__init__(**llm_config)
diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py
index d3842742..92e7011f 100644
--- a/scrapegraphai/nodes/rag_node.py
+++ b/scrapegraphai/nodes/rag_node.py
@@ -6,12 +6,14 @@
 from langchain.docstore.document import Document
 from langchain.retrievers import ContextualCompressionRetriever
 from langchain.retrievers.document_compressors import EmbeddingsFilter, DocumentCompressorPipeline
+from langchain_aws.embeddings.bedrock import BedrockEmbeddings
 from langchain_community.document_transformers import EmbeddingsRedundantFilter
 from langchain_community.embeddings import HuggingFaceHubEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import OllamaEmbeddings
 from langchain_openai import OpenAIEmbeddings, AzureOpenAIEmbeddings
-from ..models import OpenAI, Ollama, AzureOpenAI, HuggingFace
+
+from ..models import OpenAI, Ollama, AzureOpenAI, HuggingFace, Bedrock
 from .base_node import BaseNode
@@ -39,7 +41,8 @@ def __init__(self, input: str, output: List[str], node_config: dict, node_name:
 
         self.llm_model = node_config["llm"]
         self.embedder_model = node_config.get("embedder_model", None)
-        self.verbose = True if node_config is None else node_config.get("verbose", False)
+        self.verbose = True if node_config is None else node_config.get(
+            "verbose", False)
@@ -80,7 +83,7 @@ def execute(self, state: dict) -> dict:
                 },
             )
             chunked_docs.append(doc)
-
+
         if self.verbose:
             print("--- (updated chunks metadata) ---")
@@ -104,6 +107,9 @@ def execute(self, state: dict) -> dict:
             embeddings = OllamaEmbeddings(**params)
         elif isinstance(embedding_model, HuggingFace):
             embeddings = HuggingFaceHubEmbeddings(model=embedding_model.model)
+        elif isinstance(embedding_model, Bedrock):
+            embeddings = BedrockEmbeddings(
+                client=None, model_id=embedding_model.model_id)
         else:
             raise ValueError("Embedding Model missing or not supported")