
Commit

Merge branch 'pre/beta' into pdf_scraper_refactoring
PeriniM authored May 24, 2024
2 parents 8d5eb0b + e1006f3 commit a4ee757
Showing 37 changed files with 1,440 additions and 342 deletions.
1 change: 1 addition & 0 deletions .python-version
@@ -0,0 +1 @@
3.10.14
34 changes: 34 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,37 @@
## [1.5.0-beta.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.4.0...v1.5.0-beta.1) (2024-05-24)


### Features

* **knowledgegraph:** add knowledge graph node ([0196423](https://github.com/VinciGit00/Scrapegraph-ai/commit/0196423bdeea6568086aae6db8fc0f5652fc4e87))
* add logger integration ([e53766b](https://github.com/VinciGit00/Scrapegraph-ai/commit/e53766b16e89254f945f9b54b38445a24f8b81f2))
* **smart-scraper-multi:** add schema to graphs and created SmartScraperMultiGraph ([fc58e2d](https://github.com/VinciGit00/Scrapegraph-ai/commit/fc58e2d3a6f05efa72b45c9e68c6bb41a1eee755))
* **base_graph:** alligned with main ([73fa31d](https://github.com/VinciGit00/Scrapegraph-ai/commit/73fa31db0f791d1fd63b489ac88cc6e595aa07f9))
* **verbose:** centralized graph logging on debug or warning depending on verbose ([c807695](https://github.com/VinciGit00/Scrapegraph-ai/commit/c807695720a85c74a0b4365afb397bbbcd7e2889))
* **node:** knowledge graph node ([8c33ea3](https://github.com/VinciGit00/Scrapegraph-ai/commit/8c33ea3fbce18f74484fe7bd9469ab95c985ad0b))
* **multiple:** quick fix working ([58cc903](https://github.com/VinciGit00/Scrapegraph-ai/commit/58cc903d556d0b8db10284493b05bed20992c339))
* **kg:** removed import ([a338383](https://github.com/VinciGit00/Scrapegraph-ai/commit/a338383399b669ae2dd7bfcec168b791e8206816))
* **docloaders:** undetected-playwright ([7b3ee4e](https://github.com/VinciGit00/Scrapegraph-ai/commit/7b3ee4e71e4af04edeb47999d70d398b67c93ac4))
* **multiple_search:** working multiple example ([bed3eed](https://github.com/VinciGit00/Scrapegraph-ai/commit/bed3eed50c1678cfb07cba7b451ac28d38c87d7c))
* **kg:** working rag kg ([c75e6a0](https://github.com/VinciGit00/Scrapegraph-ai/commit/c75e6a06b1a647f03e6ac6eeacdc578a85baa25b))


### Bug Fixes

* error in jsons ([ca436ab](https://github.com/VinciGit00/Scrapegraph-ai/commit/ca436abf3cbff21d752a71969e787e8f8c98c6a8))
* **logger:** set up centralized root logger in base node ([4348d4f](https://github.com/VinciGit00/Scrapegraph-ai/commit/4348d4f4db6f30213acc1bbccebc2b143b4d2636))
* **logging:** source code citation ([d139480](https://github.com/VinciGit00/Scrapegraph-ai/commit/d1394809d704bee4085d494ddebab772306b3b17))
* template names ([b82f33a](https://github.com/VinciGit00/Scrapegraph-ai/commit/b82f33aee72515e4258e6f508fce15028eba5cbe))
* **node-logging:** use centralized logger in each node for logging ([c251cc4](https://github.com/VinciGit00/Scrapegraph-ai/commit/c251cc45d3694f8e81503e38a6d2b362452b740e))
* **web-loader:** use sublogger ([0790ecd](https://github.com/VinciGit00/Scrapegraph-ai/commit/0790ecd2083642af9f0a84583216ababe351cd76))


### CI

* **release:** 1.2.0-beta.1 [skip ci] ([fd3e0aa](https://github.com/VinciGit00/Scrapegraph-ai/commit/fd3e0aa5823509dfb46b4f597521c24d4eb345f1))
* **release:** 1.3.0-beta.1 [skip ci] ([191db0b](https://github.com/VinciGit00/Scrapegraph-ai/commit/191db0bc779e4913713b47b68ec4162a347da3ea))
* **release:** 1.4.0-beta.1 [skip ci] ([2caddf9](https://github.com/VinciGit00/Scrapegraph-ai/commit/2caddf9a99b5f3aedc1783216f21d23cd35b3a8c))
* **release:** 1.4.0-beta.2 [skip ci] ([f1a2523](https://github.com/VinciGit00/Scrapegraph-ai/commit/f1a25233d650010e1932e0ab80938079a22a296d))

## [1.4.0-beta.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.4.0-beta.1...v1.4.0-beta.2) (2024-05-19)

1 change: 1 addition & 0 deletions examples/local_models/smart_scraper_ollama.py
@@ -20,6 +20,7 @@
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
},
"verbose": True,
"headless": False
}

# ************************************************
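For context, a graph_config dict like the one above is passed directly to a graph constructor, and the new "headless": False entry makes the browser window visible during scraping. A minimal usage sketch following the pattern of the repo's example scripts (the model name, prompt, and target URL below are placeholders, not part of this diff):

from scrapegraphai.graphs import SmartScraperGraph

graph_config = {
    "llm": {
        "model": "ollama/mistral",  # placeholder model served by a local Ollama instance
        "temperature": 0,
        # "base_url": "http://localhost:11434",  # set only if Ollama runs elsewhere
    },
    "verbose": True,
    "headless": False,  # run the browser with a visible window
}

smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the projects with their descriptions",
    source="https://example.com/projects",  # placeholder target page
    config=graph_config,
)

print(smart_scraper_graph.run())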
4 changes: 2 additions & 2 deletions examples/openai/smart_scraper_openai.py
@@ -18,10 +18,10 @@

graph_config = {
"llm": {
"api_key":openai_key,
"api_key": openai_key,
"model": "gpt-3.5-turbo",
},
"verbose": True,
"verbose": False,
"headless": False,
}

2 changes: 1 addition & 1 deletion examples/single_node/robot_node.py
@@ -11,7 +11,7 @@

graph_config = {
"llm": {
"model": "ollama/llama3",
"model_name": "ollama/llama3",
"temperature": 0,
"streaming": True
},
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -2,7 +2,7 @@
name = "scrapegraphai"


version = "1.4.0b2"
version = "1.5.0b1"


description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
5 changes: 2 additions & 3 deletions scrapegraphai/docloaders/chromium.py
@@ -1,14 +1,13 @@
import asyncio
import logging
from typing import Any, AsyncIterator, Iterator, List, Optional

from langchain_community.document_loaders.base import BaseLoader
from langchain_core.documents import Document

from ..utils import Proxy, dynamic_import, parse_or_search_proxy
from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy


logger = logging.getLogger(__name__)
logger = get_logger("web-loader")


class ChromiumLoader(BaseLoader):
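The change above swaps the module-local logging.getLogger(__name__) for a centralized get_logger("web-loader") imported from the package's utils. The helper's implementation is not part of this diff; a plausible minimal sketch, assuming the library parents all of its loggers under one root so the verbosity helpers sketched later can control them in one place (the root name here is hypothetical):

import logging

_ROOT_NAME = "scrapegraphai"  # hypothetical library root logger name

def get_logger(name: str | None = None) -> logging.Logger:
    # Return the library root logger, or a named child of it, so that a
    # single level change on the root propagates to every module.
    if name is None:
        return logging.getLogger(_ROOT_NAME)
    return logging.getLogger(f"{_ROOT_NAME}.{name}")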
97 changes: 58 additions & 39 deletions scrapegraphai/graphs/abstract_graph.py
@@ -1,15 +1,28 @@
"""
AbstractGraph Module
"""

from abc import ABC, abstractmethod
from typing import Optional

from langchain_aws import BedrockEmbeddings
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from ..helpers import models_tokens
from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic, DeepSeek
from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings

from ..helpers import models_tokens
from ..models import (
Anthropic,
AzureOpenAI,
Bedrock,
Gemini,
Groq,
HuggingFace,
Ollama,
OpenAI,
)
from ..utils.logging import set_verbosity_debug, set_verbosity_warning

from ..helpers import models_tokens
from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic, DeepSeek
@@ -67,10 +80,15 @@ def __init__(self, prompt: str, config: dict, source: Optional[str] = None, sche
self.execution_info = None

# Set common configuration parameters
self.verbose = False if config is None else config.get(
"verbose", False)
self.headless = True if config is None else config.get(
"headless", True)

verbose = bool(config and config.get("verbose"))

if verbose:
set_verbosity_debug()
else:
set_verbosity_warning()

self.headless = True if config is None else config.get("headless", True)
self.loader_kwargs = config.get("loader_kwargs", {})

common_params = {
@@ -96,22 +114,22 @@ def set_common_params(self, params: dict, overwrite=False):

def _set_model_token(self, llm):

if 'Azure' in str(type(llm)):
if "Azure" in str(type(llm)):
try:
self.model_token = models_tokens["azure"][llm.model_name]
except KeyError:
raise KeyError("Model not supported")

elif 'HuggingFaceEndpoint' in str(type(llm)):
if 'mistral' in llm.repo_id:
elif "HuggingFaceEndpoint" in str(type(llm)):
if "mistral" in llm.repo_id:
try:
self.model_token = models_tokens['mistral'][llm.repo_id]
self.model_token = models_tokens["mistral"][llm.repo_id]
except KeyError:
raise KeyError("Model not supported")
elif 'Google' in str(type(llm)):
elif "Google" in str(type(llm)):
try:
if 'gemini' in llm.model:
self.model_token = models_tokens['gemini'][llm.model]
if "gemini" in llm.model:
self.model_token = models_tokens["gemini"][llm.model]
except KeyError:
raise KeyError("Model not supported")

@@ -129,17 +147,14 @@ def _create_llm(self, llm_config: dict, chat=False) -> object:
KeyError: If the model is not supported.
"""

llm_defaults = {
"temperature": 0,
"streaming": False
}
llm_defaults = {"temperature": 0, "streaming": False}
llm_params = {**llm_defaults, **llm_config}

# If model instance is passed directly instead of the model details
if 'model_instance' in llm_params:
if "model_instance" in llm_params:
if chat:
self._set_model_token(llm_params['model_instance'])
return llm_params['model_instance']
self._set_model_token(llm_params["model_instance"])
return llm_params["model_instance"]

# Instantiate the language model based on the model name
if "gpt-" in llm_params["model"]:
@@ -208,19 +223,21 @@ def _create_llm(self, llm_config: dict, chat=False) -> object:
elif "bedrock" in llm_params["model"]:
llm_params["model"] = llm_params["model"].split("/")[-1]
model_id = llm_params["model"]
client = llm_params.get('client', None)
client = llm_params.get("client", None)
try:
self.model_token = models_tokens["bedrock"][llm_params["model"]]
except KeyError:
print("model not found, using default token size (8192)")
self.model_token = 8192
return Bedrock({
"client": client,
"model_id": model_id,
"model_kwargs": {
"temperature": llm_params["temperature"],
return Bedrock(
{
"client": client,
"model_id": model_id,
"model_kwargs": {
"temperature": llm_params["temperature"],
},
}
})
)
elif "claude-3-" in llm_params["model"]:
try:
self.model_token = models_tokens["claude"]["claude3"]
@@ -236,8 +253,7 @@ def _create_llm(self, llm_config: dict, chat=False) -> object:
self.model_token = 8192
return DeepSeek(llm_params)
else:
raise ValueError(
"Model provided by the configuration not supported")
raise ValueError("Model provided by the configuration not supported")

def _create_default_embedder(self, llm_config=None) -> object:
"""
@@ -250,8 +266,9 @@ def _create_default_embedder(self, llm_config=None) -> object:
ValueError: If the model is not supported.
"""
if isinstance(self.llm_model, Gemini):
return GoogleGenerativeAIEmbeddings(google_api_key=llm_config['api_key'],
model="models/embedding-001")
return GoogleGenerativeAIEmbeddings(
google_api_key=llm_config["api_key"], model="models/embedding-001"
)
if isinstance(self.llm_model, OpenAI):
return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key)
elif isinstance(self.llm_model, DeepSeek):
@@ -288,8 +305,8 @@ def _create_embedder(self, embedder_config: dict) -> object:
Raises:
KeyError: If the model is not supported.
"""
if 'model_instance' in embedder_config:
return embedder_config['model_instance']
if "model_instance" in embedder_config:
return embedder_config["model_instance"]
# Instantiate the embedding model based on the model name
if "openai" in embedder_config["model"]:
return OpenAIEmbeddings(api_key=embedder_config["api_key"])
@@ -306,25 +323,27 @@ def _create_embedder(self, embedder_config: dict) -> object:
try:
models_tokens["hugging_face"][embedder_config["model"]]
except KeyError as exc:
raise KeyError("Model not supported")from exc
raise KeyError("Model not supported") from exc
return HuggingFaceHubEmbeddings(model=embedder_config["model"])
elif "gemini" in embedder_config["model"]:
try:
models_tokens["gemini"][embedder_config["model"]]
except KeyError as exc:
raise KeyError("Model not supported")from exc
raise KeyError("Model not supported") from exc
return GoogleGenerativeAIEmbeddings(model=embedder_config["model"])
elif "bedrock" in embedder_config["model"]:
embedder_config["model"] = embedder_config["model"].split("/")[-1]
client = embedder_config.get('client', None)
client = embedder_config.get("client", None)
try:
models_tokens["bedrock"][embedder_config["model"]]
except KeyError as exc:
raise KeyError("Model not supported") from exc
return BedrockEmbeddings(client=client, model_id=embedder_config["model"])
return BedrockEmbeddings(client=client, model_id=embedder_config["model"])
else:
raise ValueError("Model provided by the configuration not supported")

def get_state(self, key=None) -> dict:
"""""
""" ""
Get the final state of the graph.
Args:
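Note how the constructor now routes the verbose flag through set_verbosity_debug() / set_verbosity_warning() instead of keeping a per-graph boolean. The bodies of those helpers are not shown in this diff; a minimal sketch of the usual pattern, assuming they adjust a shared library root logger (the function split is from the import at the top of this file, the root name is an assumption):

import logging

def set_verbosity(level: int, root_name: str = "scrapegraphai") -> None:
    # Assumed helper: one place to raise or lower the whole library's log level.
    logging.getLogger(root_name).setLevel(level)

def set_verbosity_debug() -> None:
    set_verbosity(logging.DEBUG)

def set_verbosity_warning() -> None:
    set_verbosity(logging.WARNING)

With something like this in place, every node and loader that logs through the centralized logger inherits the level chosen from the graph's verbose flag.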
1 change: 1 addition & 0 deletions scrapegraphai/helpers/models_tokens.py
@@ -5,6 +5,7 @@
models_tokens = {
"openai": {
"gpt-3.5-turbo-0125": 16385,
"gpt-3.5": 4096,
"gpt-3.5-turbo": 4096,
"gpt-3.5-turbo-1106": 16385,
"gpt-3.5-turbo-instruct": 4096,
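The new "gpt-3.5": 4096 entry feeds the same lookup that _create_llm performs above: try the provider/model pair in models_tokens and fall back to a default context size when the model is unknown. A small sketch of that pattern (the helper function is illustrative, not part of the library):

from scrapegraphai.helpers import models_tokens

DEFAULT_TOKENS = 8192  # same fallback _create_llm uses for unknown models

def lookup_context_window(provider: str, model: str) -> int:
    # Mirrors the try/except KeyError pattern in AbstractGraph._create_llm.
    try:
        return models_tokens[provider][model]
    except KeyError:
        print("model not found, using default token size (8192)")
        return DEFAULT_TOKENS

print(lookup_context_window("openai", "gpt-3.5"))  # -> 4096, the entry added here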