Skip to content

Commit

Permalink
feat: refactoring search function
Browse files Browse the repository at this point in the history
  • Loading branch information
VinciGit00 committed May 3, 2024
1 parent 2abe05a commit aeb1acb
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 7 deletions.
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ free-proxy = "1.1.1"
langchain-groq = "0.1.3"
playwright = "^1.43.0"
langchain-aws = "^0.1.2"

langchain-anthropic = "^0.1.11"
yahoo-search-py = "^0.3"

[tool.poetry.dev-dependencies]
pytest = "8.0.0"
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ free-proxy==1.1.1
langchain-groq==0.1.3
playwright==1.43.0
langchain-aws==0.1.2
langchain-anthropic==0.1.11
yahoo-search-py==0.3
24 changes: 18 additions & 6 deletions scrapegraphai/utils/research_web.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
"""
"""
Module for making the request on the web
"""
import re
from typing import List
from langchain_community.tools import DuckDuckGoSearchResults
from googlesearch import search
from googlesearch import search as google_search
from yahoo_search import search as yahoo_search


def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]:
Expand All @@ -29,18 +30,29 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int =
This function allows switching between Google, DuckDuckGo and Yahoo to perform internet searches, returning a list of result URLs.
"""

if search_engine == "Google":
if search_engine.lower() == "google":
res = []

for url in search(query, stop=max_results):
for url in google_search(query, stop=max_results):
res.append(url)
return res
elif search_engine == "DuckDuckGo":
elif search_engine.lower() == "duckduckgo":
research = DuckDuckGoSearchResults(max_results=max_results)
res = research.run(query)

links = re.findall(r'https?://[^\s,\]]+', res)

return links
elif search_engine.lower() == "yahoo":
list_result = yahoo_search(query)
results = []
for page in list_result.pages:
if len(results) >= max_results: # Check if max_results has already been reached
break # Exit loop if max_results has been reached
try:
results.append(page.link)
except AttributeError:
continue
return results
raise ValueError(
"The only search engines avaiable are DuckDuckGo or Google")
"The only search engines available are DuckDuckGo or Google")

0 comments on commit aeb1acb

Please sign in to comment.