Merge pull request #743 from aziz-ullah-khan/pre/beta
Replacement of Google search with googlesearch-python and integration of proxy support.
VinciGit00 authored Oct 11, 2024
2 parents 528a974 + e828c70 commit 0a275d5
Showing 6 changed files with 91 additions and 10 deletions.
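
In practice, the proxy added by this commit travels from the loader_kwargs entry of the graph configuration into SearchInternetNode and from there into search_on_web (see the diffs below). A minimal, hedged sketch of a SearchGraph configuration that would exercise it — the llm block, prompt, and proxy values are placeholder assumptions, not part of this commit:

from scrapegraphai.graphs import SearchGraph

graph_config = {
    "llm": {
        "model": "ollama/llama3.1",             # placeholder model
        "base_url": "http://localhost:11434",   # placeholder Ollama endpoint
    },
    "max_results": 5,
    "loader_kwargs": {
        # forwarded to SearchInternetNode and then to search_on_web()
        "proxy": {
            "server": "127.0.0.1:3128",   # placeholder proxy endpoint
            "username": "user",           # placeholder credentials
            "password": "pass",
        },
    },
    "verbose": True,
}

search_graph = SearchGraph(
    prompt="List the projects on https://perinim.github.io/projects/",
    config=graph_config,
)
print(search_graph.run())
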
60 changes: 60 additions & 0 deletions examples/together/code_generator_graph_togehter.py
@@ -0,0 +1,60 @@
"""
Basic example of scraping pipeline using Code Generator with schema
"""

import os
from typing import List
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from scrapegraphai.graphs import CodeGeneratorGraph

load_dotenv()

# ************************************************
# Define the output schema for the graph
# ************************************************

class Project(BaseModel):
title: str = Field(description="The title of the project")
description: str = Field(description="The description of the project")

class Projects(BaseModel):
projects: List[Project]

# ************************************************
# Define the configuration for the graph
# ************************************************

together_key = os.getenv("TOGETHER_KEY")

graph_config = {
"llm": {
"model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
"api_key": together_key,
},
"verbose": True,
"headless": False,
"reduction": 2,
"max_iterations": {
"overall": 10,
"syntax": 3,
"execution": 3,
"validation": 3,
"semantic": 3
},
"output_file_name": "extracted_data.py"
}

# ************************************************
# Create the CodeGeneratorGraph instance and run it
# ************************************************

code_generator_graph = CodeGeneratorGraph(
prompt="List me all the projects with their description",
source="https://perinim.github.io/projects/",
schema=Projects,
config=graph_config
)

result = code_generator_graph.run()
print(result)
7 changes: 3 additions & 4 deletions pyproject.toml
@@ -28,13 +28,12 @@ dependencies = [
"free-proxy>=1.1.1",
"playwright>=1.43.0",
"undetected-playwright>=0.3.0",
"google>=3.0.0",
"langchain-ollama>=0.1.3",
"simpleeval>=1.0.0",
"semchunk>=2.2.0",
"transformers>=4.44.2",
"qdrant-client>=1.11.3",
"fastembed>=0.3.6"
"semchunk>=2.2.0",
"transformers>=4.44.2",
"googlesearch-python>=1.2.5"
]

license = "MIT"
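
Both the dropped google package and the new googlesearch-python dependency expose a googlesearch module, so the existing import in scrapegraphai/utils/research_web.py (presumably aliased as google_search, as used in the diff further down) keeps resolving, while the call signature changes: stop= becomes num_results= and an optional proxy argument becomes available. A hedged sketch of the new backend call, with a placeholder query:

from googlesearch import search as google_search

# googlesearch-python: num_results caps the result count, proxy is optional
urls = list(google_search("scrapegraphai", num_results=5, proxy=None))
print(urls)
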
4 changes: 2 additions & 2 deletions requirements.txt
@@ -15,7 +15,7 @@ minify-html>=0.15.0
free-proxy>=1.1.1
playwright>=1.43.0
undetected-playwright>=0.3.0
google>=3.0.0
semchunk>=1.0.1
langchain-ollama>=0.1.3
simpleeval>=0.9.13
simpleeval>=0.9.13
googlesearch-python>=1.2.5
1 change: 1 addition & 0 deletions scrapegraphai/graphs/search_graph.py
@@ -65,6 +65,7 @@ def _create_graph(self) -> BaseGraph:
node_config={
"llm_model": self.llm_model,
"max_results": self.max_results,
"loader_kwargs": self.loader_kwargs,
"search_engine": self.copy_config.get("search_engine")
}
)
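
With loader_kwargs forwarded here, the proxy also reaches SearchInternetNode when the node is wired up outside of SearchGraph. A hedged sketch of constructing and running the node directly — the input/output keys mirror how SearchGraph uses the node, while the ChatOllama model, the execute-state keys, and the proxy string are placeholder assumptions:

from langchain_ollama import ChatOllama
from scrapegraphai.nodes import SearchInternetNode

llm_model = ChatOllama(model="llama3.1")  # placeholder local model

search_node = SearchInternetNode(
    input="user_prompt",
    output=["urls"],
    node_config={
        "llm_model": llm_model,
        "max_results": 5,
        "search_engine": "google",
        "loader_kwargs": {"proxy": "http://user:pass@127.0.0.1:3128"},  # placeholder proxy
    },
)

state = search_node.execute({"user_prompt": "What is ScrapeGraphAI?"})
print(state["urls"])
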
5 changes: 3 additions & 2 deletions scrapegraphai/nodes/search_internet_node.py
@@ -41,6 +41,7 @@ def __init__(
self.verbose = (
False if node_config is None else node_config.get("verbose", False)
)
self.proxy = node_config.get("loader_kwargs", {}).get("proxy", None)
self.search_engine = (
node_config["search_engine"]
if node_config.get("search_engine")
@@ -93,8 +94,8 @@ def execute(self, state: dict) -> dict:

self.logger.info(f"Search Query: {search_query}")

answer = search_on_web(query=search_query, max_results=self.max_results,
search_engine=self.search_engine)
answer = search_on_web(query=search_query, num_results=self.max_results,
search_engine=self.search_engine, proxy=self.proxy)

if len(answer) == 0:
raise ValueError("Zero results found for the search query.")
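
Because the lookup uses chained .get() calls with defaults, the proxy stays optional: when loader_kwargs is absent or has no proxy entry, self.proxy is None and the search runs without one. A tiny, hedged illustration of that lookup in isolation (the dictionaries are placeholders):

node_config = {"loader_kwargs": {"proxy": "http://user:pass@127.0.0.1:3128"}}
print(node_config.get("loader_kwargs", {}).get("proxy", None))  # the proxy string

node_config = {}
print(node_config.get("loader_kwargs", {}).get("proxy", None))  # None, so no proxy is used
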
24 changes: 22 additions & 2 deletions scrapegraphai/utils/research_web.py
@@ -10,7 +10,7 @@

def search_on_web(query: str, search_engine: str = "Google",
max_results: int = 10, port: int = 8080,
timeout: int = 10) -> List[str]:
timeout: int = 10, proxy: str | dict = None) -> List[str]:
"""
Searches the web for a given query using specified search
engine options and filters out PDF links.
@@ -23,6 +23,7 @@ def search_on_web(query: str, search_engine: str = "Google",
port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
timeout (int, optional): The number of seconds to wait
for a response from a request. Default is 10 seconds.
proxy (dict or string, optional): The proxy server to use for the request. Default is None.
Returns:
List[str]: A list of URLs as strings that are the search results, excluding any PDF links.
@@ -36,6 +37,22 @@ def search_on_web(query: str, search_engine: str = "Google",
['http://example.com', 'http://example.org', ...]
"""

def format_proxy(proxy):
if isinstance(proxy, dict):
server = proxy.get('server')
username = proxy.get('username')
password = proxy.get('password')

if all([username, password, server]):
proxy_url = f"http://{username}:{password}@{server}"
return proxy_url
else:
raise ValueError("Proxy dictionary is missing required fields.")
elif isinstance(proxy, str):
return proxy # "https://username:password@ip:port"
else:
raise TypeError("Proxy should be a dictionary or a string.")

def filter_pdf_links(links: List[str]) -> List[str]:
"""
Filters out any links that point to PDF files.
@@ -48,9 +65,12 @@ def filter_pdf_links(links: List[str]) -> List[str]:
"""
return [link for link in links if not link.lower().endswith('.pdf')]

if proxy:
proxy = format_proxy(proxy)

if search_engine.lower() == "google":
res = []
for url in google_search(query, stop=max_results):
for url in google_search(query, num_results=max_results, proxy=proxy):
res.append(url)
return filter_pdf_links(res)

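
format_proxy is a nested helper inside search_on_web and is not importable on its own; to make the normalization concrete, here is a standalone copy of the same logic with a placeholder proxy dictionary, followed by a hedged example of calling search_on_web with it (query and proxy values are placeholders):

from scrapegraphai.utils.research_web import search_on_web

def format_proxy(proxy):
    # dict input: build an authenticated HTTP proxy URL
    if isinstance(proxy, dict):
        server = proxy.get("server")
        username = proxy.get("username")
        password = proxy.get("password")
        if all([username, password, server]):
            return f"http://{username}:{password}@{server}"
        raise ValueError("Proxy dictionary is missing required fields.")
    # string input: assumed to already be "http://username:password@ip:port"
    if isinstance(proxy, str):
        return proxy
    raise TypeError("Proxy should be a dictionary or a string.")

print(format_proxy({"server": "127.0.0.1:3128", "username": "user", "password": "pass"}))
# -> http://user:pass@127.0.0.1:3128

links = search_on_web(
    query="ScrapeGraphAI documentation",   # placeholder query
    search_engine="google",
    max_results=5,
    proxy={"server": "127.0.0.1:3128", "username": "user", "password": "pass"},
)
print(links)  # PDF links are filtered out of the results
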
