Merge pull request #743 from aziz-ullah-khan/pre/beta
Replacement of Google search with googlesearch-python and integration of proxy support.
VinciGit00 authored Oct 11, 2024
2 parents 528a974 + e828c70 commit 0a275d5
Showing 6 changed files with 91 additions and 10 deletions.
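
In practice, the proxy added by this commit travels from the loader_kwargs entry of the graph configuration into SearchInternetNode and from there into search_on_web (see the diffs below). A minimal, hedged sketch of a SearchGraph configuration that would exercise it — the llm block, prompt, and proxy values are placeholder assumptions, not part of this commit:

from scrapegraphai.graphs import SearchGraph

graph_config = {
    "llm": {
        "model": "ollama/llama3.1",             # placeholder model
        "base_url": "http://localhost:11434",   # placeholder Ollama endpoint
    },
    "max_results": 5,
    "loader_kwargs": {
        # forwarded to SearchInternetNode and then to search_on_web()
        "proxy": {
            "server": "127.0.0.1:3128",   # placeholder proxy endpoint
            "username": "user",           # placeholder credentials
            "password": "pass",
        },
    },
    "verbose": True,
}

search_graph = SearchGraph(
    prompt="List the projects on https://perinim.github.io/projects/",
    config=graph_config,
)
print(search_graph.run())
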
60 changes: 60 additions & 0 deletions examples/together/code_generator_graph_togehter.py
@@ -0,0 +1,60 @@
"""
Basic example of scraping pipeline using Code Generator with schema
"""

import os
from typing import List
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from scrapegraphai.graphs import CodeGeneratorGraph

load_dotenv()

# ************************************************
# Define the output schema for the graph
# ************************************************

class Project(BaseModel):
title: str = Field(description="The title of the project")
description: str = Field(description="The description of the project")

class Projects(BaseModel):
projects: List[Project]

# ************************************************
# Define the configuration for the graph
# ************************************************

together_key = os.getenv("TOGETHER_KEY")

graph_config = {
"llm": {
"model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
"api_key": together_key,
},
"verbose": True,
"headless": False,
"reduction": 2,
"max_iterations": {
"overall": 10,
"syntax": 3,
"execution": 3,
"validation": 3,
"semantic": 3
},
"output_file_name": "extracted_data.py"
}

# ************************************************
# Create the CodeGeneratorGraph instance and run it
# ************************************************

code_generator_graph = CodeGeneratorGraph(
prompt="List me all the projects with their description",
source="https://perinim.github.io/projects/",
schema=Projects,
config=graph_config
)

result = code_generator_graph.run()
print(result)
7 changes: 3 additions & 4 deletions pyproject.toml
@@ -28,13 +28,12 @@ dependencies = [
"free-proxy>=1.1.1",
"playwright>=1.43.0",
"undetected-playwright>=0.3.0",
"google>=3.0.0",
"langchain-ollama>=0.1.3",
"simpleeval>=1.0.0",
"semchunk>=2.2.0",
"transformers>=4.44.2",
"qdrant-client>=1.11.3",
"fastembed>=0.3.6"
"semchunk>=2.2.0",
"transformers>=4.44.2",
"googlesearch-python>=1.2.5"
]

license = "MIT"
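
Both the dropped google package and the new googlesearch-python dependency expose a googlesearch module, so the existing import in scrapegraphai/utils/research_web.py (presumably aliased as google_search, as used in the diff further down) keeps resolving, while the call signature changes: stop= becomes num_results= and an optional proxy argument becomes available. A hedged sketch of the new backend call, with a placeholder query:

from googlesearch import search as google_search

# googlesearch-python: num_results caps the result count, proxy is optional
urls = list(google_search("scrapegraphai", num_results=5, proxy=None))
print(urls)
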
4 changes: 2 additions & 2 deletions requirements.txt
@@ -15,7 +15,7 @@ minify-html>=0.15.0
free-proxy>=1.1.1
playwright>=1.43.0
undetected-playwright>=0.3.0
google>=3.0.0
semchunk>=1.0.1
langchain-ollama>=0.1.3
simpleeval>=0.9.13
simpleeval>=0.9.13
googlesearch-python>=1.2.5
1 change: 1 addition & 0 deletions scrapegraphai/graphs/search_graph.py
@@ -65,6 +65,7 @@ def _create_graph(self) -> BaseGraph:
node_config={
"llm_model": self.llm_model,
"max_results": self.max_results,
"loader_kwargs": self.loader_kwargs,
"search_engine": self.copy_config.get("search_engine")
}
)
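
With loader_kwargs forwarded here, the proxy also reaches SearchInternetNode when the node is wired up outside of SearchGraph. A hedged sketch of constructing and running the node directly — the input/output keys mirror how SearchGraph uses the node, while the ChatOllama model, the execute-state keys, and the proxy string are placeholder assumptions:

from langchain_ollama import ChatOllama
from scrapegraphai.nodes import SearchInternetNode

llm_model = ChatOllama(model="llama3.1")  # placeholder local model

search_node = SearchInternetNode(
    input="user_prompt",
    output=["urls"],
    node_config={
        "llm_model": llm_model,
        "max_results": 5,
        "search_engine": "google",
        "loader_kwargs": {"proxy": "http://user:pass@127.0.0.1:3128"},  # placeholder proxy
    },
)

state = search_node.execute({"user_prompt": "What is ScrapeGraphAI?"})
print(state["urls"])
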
5 changes: 3 additions & 2 deletions scrapegraphai/nodes/search_internet_node.py
@@ -41,6 +41,7 @@ def __init__(
self.verbose = (
False if node_config is None else node_config.get("verbose", False)
)
self.proxy = node_config.get("loader_kwargs", {}).get("proxy", None)
self.search_engine = (
node_config["search_engine"]
if node_config.get("search_engine")
@@ -93,8 +94,8 @@ def execute(self, state: dict) -> dict:

self.logger.info(f"Search Query: {search_query}")

answer = search_on_web(query=search_query, max_results=self.max_results,
search_engine=self.search_engine)
answer = search_on_web(query=search_query, num_results=self.max_results,
search_engine=self.search_engine, proxy=self.proxy)

if len(answer) == 0:
raise ValueError("Zero results found for the search query.")
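
Because the lookup uses chained .get() calls with defaults, the proxy stays optional: when loader_kwargs is absent or has no proxy entry, self.proxy is None and the search runs without one. A tiny, hedged illustration of that lookup in isolation (the dictionaries are placeholders):

node_config = {"loader_kwargs": {"proxy": "http://user:pass@127.0.0.1:3128"}}
print(node_config.get("loader_kwargs", {}).get("proxy", None))  # the proxy string

node_config = {}
print(node_config.get("loader_kwargs", {}).get("proxy", None))  # None, so no proxy is used
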
24 changes: 22 additions & 2 deletions scrapegraphai/utils/research_web.py
@@ -10,7 +10,7 @@

def search_on_web(query: str, search_engine: str = "Google",
max_results: int = 10, port: int = 8080,
timeout: int = 10) -> List[str]:
timeout: int = 10, proxy: str | dict = None) -> List[str]:
"""
Searches the web for a given query using specified search
engine options and filters out PDF links.
@@ -23,6 +23,7 @@ def search_on_web(query: str, search_engine: str = "Google",
port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
timeout (int, optional): The number of seconds to wait
for a response from a request. Default is 10 seconds.
proxy (dict or string, optional): The proxy server to use for the request. Default is None.
Returns:
List[str]: A list of URLs as strings that are the search results, excluding any PDF links.
@@ -36,6 +37,22 @@ def search_on_web(query: str, search_engine: str = "Google",
['http://example.com', 'http://example.org', ...]
"""

def format_proxy(proxy):
if isinstance(proxy, dict):
server = proxy.get('server')
username = proxy.get('username')
password = proxy.get('password')

if all([username, password, server]):
proxy_url = f"http://{username}:{password}@{server}"
return proxy_url
else:
raise ValueError("Proxy dictionary is missing required fields.")
elif isinstance(proxy, str):
return proxy # "https://username:password@ip:port"
else:
raise TypeError("Proxy should be a dictionary or a string.")

def filter_pdf_links(links: List[str]) -> List[str]:
"""
Filters out any links that point to PDF files.
@@ -48,9 +65,12 @@ def filter_pdf_links(links: List[str]) -> List[str]:
"""
return [link for link in links if not link.lower().endswith('.pdf')]

if proxy:
proxy = format_proxy(proxy)

if search_engine.lower() == "google":
res = []
for url in google_search(query, stop=max_results):
for url in google_search(query, num_results=max_results, proxy=proxy):
res.append(url)
return filter_pdf_links(res)

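
format_proxy is a nested helper inside search_on_web and is not importable on its own; to make the normalization concrete, here is a standalone copy of the same logic with a placeholder proxy dictionary, followed by a hedged example of calling search_on_web with it (query and proxy values are placeholders):

from scrapegraphai.utils.research_web import search_on_web

def format_proxy(proxy):
    # dict input: build an authenticated HTTP proxy URL
    if isinstance(proxy, dict):
        server = proxy.get("server")
        username = proxy.get("username")
        password = proxy.get("password")
        if all([username, password, server]):
            return f"http://{username}:{password}@{server}"
        raise ValueError("Proxy dictionary is missing required fields.")
    # string input: assumed to already be "http://username:password@ip:port"
    if isinstance(proxy, str):
        return proxy
    raise TypeError("Proxy should be a dictionary or a string.")

print(format_proxy({"server": "127.0.0.1:3128", "username": "user", "password": "pass"}))
# -> http://user:pass@127.0.0.1:3128

links = search_on_web(
    query="ScrapeGraphAI documentation",   # placeholder query
    search_engine="google",
    max_results=5,
    proxy={"server": "127.0.0.1:3128", "username": "user", "password": "pass"},
)
print(links)  # PDF links are filtered out of the results
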
