From d056c439cd4582b4c6b4bf6efc5ebd057cd5a3a1 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 4 Oct 2024 14:16:13 +0200 Subject: [PATCH 1/3] Create code_generator_graph_togehter.py --- .../together/code_generator_graph_togehter.py | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 examples/together/code_generator_graph_togehter.py diff --git a/examples/together/code_generator_graph_togehter.py b/examples/together/code_generator_graph_togehter.py new file mode 100644 index 00000000..aefbeba4 --- /dev/null +++ b/examples/together/code_generator_graph_togehter.py @@ -0,0 +1,60 @@ +""" +Basic example of scraping pipeline using Code Generator with schema +""" + +import os, json +from typing import List +from dotenv import load_dotenv +from pydantic import BaseModel, Field +from scrapegraphai.graphs import CodeGeneratorGraph + +load_dotenv() + +# ************************************************ +# Define the output schema for the graph +# ************************************************ + +class Project(BaseModel): + title: str = Field(description="The title of the project") + description: str = Field(description="The description of the project") + +class Projects(BaseModel): + projects: List[Project] + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +together_key = os.getenv("TOGETHER_KEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + }, + "verbose": True, + "headless": False, + "reduction": 2, + "max_iterations": { + "overall": 10, + "syntax": 3, + "execution": 3, + "validation": 3, + "semantic": 3 + }, + "output_file_name": "extracted_data.py" +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +code_generator_graph = CodeGeneratorGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io/projects/", + schema=Projects, + config=graph_config +) + +result = code_generator_graph.run() +print(result) From 46195d65ca103dce76caa2cad99627f4b18a2416 Mon Sep 17 00:00:00 2001 From: aziz-ullah-khan Date: Fri, 11 Oct 2024 10:04:28 +0500 Subject: [PATCH 2/3] Update: proxy integration in googlesearch --- =1.2.5 | 8 + notebook.ipynb | 274 ++++++++++++++++++++ pyproject.toml | 4 +- requirements.txt | 4 +- scrapegraphai/graphs/search_graph.py | 1 + scrapegraphai/nodes/search_internet_node.py | 3 +- scrapegraphai/utils/research_web.py | 25 +- 7 files changed, 312 insertions(+), 7 deletions(-) create mode 100644 =1.2.5 create mode 100644 notebook.ipynb diff --git a/=1.2.5 b/=1.2.5 new file mode 100644 index 00000000..2da54259 --- /dev/null +++ b/=1.2.5 @@ -0,0 +1,8 @@ +Requirement already satisfied: googlesearch-python in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (1.2.5) +Requirement already satisfied: beautifulsoup4>=4.9 in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from googlesearch-python) (4.12.3) +Requirement already satisfied: requests>=2.20 in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from googlesearch-python) (2.32.3) +Requirement already satisfied: soupsieve>1.2 in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from beautifulsoup4>=4.9->googlesearch-python) (2.6) +Requirement already satisfied: charset-normalizer<4,>=2 in 
/home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from requests>=2.20->googlesearch-python) (3.3.2) +Requirement already satisfied: idna<4,>=2.5 in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from requests>=2.20->googlesearch-python) (3.10) +Requirement already satisfied: urllib3<3,>=1.21.1 in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from requests>=2.20->googlesearch-python) (2.2.3) +Requirement already satisfied: certifi>=2017.4.17 in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from requests>=2.20->googlesearch-python) (2024.8.30) diff --git a/notebook.ipynb b/notebook.ipynb new file mode 100644 index 00000000..9c80e935 --- /dev/null +++ b/notebook.ipynb @@ -0,0 +1,274 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pip install -e ." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from scrapegraphai.graphs import SearchGraph" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PROXY: https://vzktqema:btngo4nn7n6l@63.141.62.30:6323\n" + ] + }, + { + "ename": "TypeError", + "evalue": "search() got an unexpected keyword argument 'num_results'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 36\u001b[0m\n\u001b[1;32m 31\u001b[0m prompt \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mList the top 5 companies in the world by market capitalization.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 32\u001b[0m search_graph \u001b[38;5;241m=\u001b[39m SearchGraph(\n\u001b[1;32m 33\u001b[0m prompt\u001b[38;5;241m=\u001b[39mprompt,\n\u001b[1;32m 34\u001b[0m config\u001b[38;5;241m=\u001b[39mgraph_config\n\u001b[1;32m 35\u001b[0m )\n\u001b[0;32m---> 36\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43msearch_graph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/AUK/Colloborations_Scrapegraphai/Scrapegraph-ai/scrapegraphai/graphs/search_graph.py:121\u001b[0m, in \u001b[0;36mSearchGraph.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 114\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;124;03mExecutes the web scraping and searching process.\u001b[39;00m\n\u001b[1;32m 116\u001b[0m \n\u001b[1;32m 117\u001b[0m \u001b[38;5;124;03mReturns:\u001b[39;00m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;124;03m str: The answer to the prompt.\u001b[39;00m\n\u001b[1;32m 119\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 120\u001b[0m inputs \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muser_prompt\u001b[39m\u001b[38;5;124m\"\u001b[39m: 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprompt}\n\u001b[0;32m--> 121\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfinal_state, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_info \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgraph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 123\u001b[0m \u001b[38;5;66;03m# Store the URLs after execution\u001b[39;00m\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124murls\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfinal_state:\n", + "File \u001b[0;32m~/AUK/Colloborations_Scrapegraphai/Scrapegraph-ai/scrapegraphai/graphs/base_graph.py:281\u001b[0m, in \u001b[0;36mBaseGraph.execute\u001b[0;34m(self, initial_state)\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m (result[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_state\u001b[39m\u001b[38;5;124m\"\u001b[39m], [])\n\u001b[1;32m 280\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 281\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_standard\u001b[49m\u001b[43m(\u001b[49m\u001b[43minitial_state\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/AUK/Colloborations_Scrapegraphai/Scrapegraph-ai/scrapegraphai/graphs/base_graph.py:197\u001b[0m, in \u001b[0;36mBaseGraph._execute_standard\u001b[0;34m(self, initial_state)\u001b[0m\n\u001b[1;32m 184\u001b[0m graph_execution_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m start_time\n\u001b[1;32m 185\u001b[0m log_graph_execution(\n\u001b[1;32m 186\u001b[0m graph_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgraph_name,\n\u001b[1;32m 187\u001b[0m source\u001b[38;5;241m=\u001b[39msource,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 195\u001b[0m exception\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mstr\u001b[39m(e)\n\u001b[1;32m 196\u001b[0m )\n\u001b[0;32m--> 197\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m 198\u001b[0m node_exec_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m curr_time\n\u001b[1;32m 199\u001b[0m total_exec_time \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m node_exec_time\n", + "File \u001b[0;32m~/AUK/Colloborations_Scrapegraphai/Scrapegraph-ai/scrapegraphai/graphs/base_graph.py:181\u001b[0m, in \u001b[0;36mBaseGraph._execute_standard\u001b[0;34m(self, initial_state)\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_manager\u001b[38;5;241m.\u001b[39mexclusive_get_callback(llm_model, llm_model_name) \u001b[38;5;28;01mas\u001b[39;00m cb:\n\u001b[1;32m 180\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 181\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mcurrent_node\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstate\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 183\u001b[0m error_node \u001b[38;5;241m=\u001b[39m 
current_node\u001b[38;5;241m.\u001b[39mnode_name\n", + "File \u001b[0;32m~/AUK/Colloborations_Scrapegraphai/Scrapegraph-ai/scrapegraphai/nodes/search_internet_node.py:97\u001b[0m, in \u001b[0;36mSearchInternetNode.execute\u001b[0;34m(self, state)\u001b[0m\n\u001b[1;32m 93\u001b[0m search_query \u001b[38;5;241m=\u001b[39m search_answer\u001b[38;5;241m.\u001b[39minvoke({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muser_prompt\u001b[39m\u001b[38;5;124m\"\u001b[39m: user_prompt})[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 95\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlogger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSearch Query: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msearch_query\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 97\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[43msearch_on_web\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msearch_query\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_results\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_results\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 98\u001b[0m \u001b[43m \u001b[49m\u001b[43msearch_engine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msearch_engine\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mproxy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mproxy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(answer) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mZero results found for the search query.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/AUK/Colloborations_Scrapegraphai/Scrapegraph-ai/scrapegraphai/utils/research_web.py:74\u001b[0m, in \u001b[0;36msearch_on_web\u001b[0;34m(query, search_engine, max_results, port, timeout, proxy)\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPROXY: \u001b[39m\u001b[38;5;124m\"\u001b[39m, proxy)\n\u001b[1;32m 73\u001b[0m res \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m---> 74\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m url \u001b[38;5;129;01min\u001b[39;00m \u001b[43mgoogle_search\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_results\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_results\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mproxy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mproxy\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 75\u001b[0m res\u001b[38;5;241m.\u001b[39mappend(url)\n\u001b[1;32m 76\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m filter_pdf_links(res)\n", + "\u001b[0;31mTypeError\u001b[0m: search() got an unexpected keyword argument 'num_results'" + ] + } + ], + "source": [ + "import os\n", + "os.environ['AZURE_OPENAI_GPT4O_SERVICE']=\"dwtc-openai-gpt4o\"\n", + "os.environ['AZURE_OPENAI_GPT4O_DEPLOYMENT']=\"gpt4o\"\n", + "os.environ['AZURE_OPENAI_GPT4O_KEY']=\"3cb3875145ec425880c6974d74e10cd7\"\n", + 
"os.environ['AZURE_OPENAI_GPT4O_API_VERSION']=\"2024-02-15-preview\"\n", + "\n", + "\n", + "graph_config = {\n", + " \"llm\": {\n", + " \"model\": \"azure_openai/gpt-4o\",\n", + " \"api_key\": os.environ['AZURE_OPENAI_GPT4O_KEY'],\n", + " \"azure_endpoint\": f\"https://{os.environ['AZURE_OPENAI_GPT4O_SERVICE']}.openai.azure.com\",\n", + " \"azure_deployment\": os.environ['AZURE_OPENAI_GPT4O_DEPLOYMENT'],\n", + " \"api_version\": os.environ['AZURE_OPENAI_GPT4O_API_VERSION'],\n", + " \"temperature\": 0.0,\n", + " },\n", + "\n", + " \"loader_kwargs\": {\n", + " \"proxy\" : {\n", + " \"server\": '63.141.62.30:6323', \n", + " \"username\": \"vzktqema\", \n", + " \"password\": \"btngo4nn7n6l\",\n", + " },\n", + " },\n", + "\n", + " \"verbose\": False,\n", + " \"headless\": True,\n", + " \"max_sites\": 1\n", + " }\n", + "\n", + "prompt = \"List the top 5 companies in the world by market capitalization.\"\n", + "search_graph = SearchGraph(\n", + " prompt=prompt,\n", + " config=graph_config\n", + " )\n", + "result = search_graph.run()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "ImportError", + "evalue": "cannot import name 'search' from 'googlesearch' (unknown location)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgooglesearch\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m search\n", + "\u001b[0;31mImportError\u001b[0m: cannot import name 'search' from 'googlesearch' (unknown location)" + ] + } + ], + "source": [ + "from googlesearch import search" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "102.77s - pydevd: Sending message related to process being replaced timed-out after 5 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found existing installation: google 3.0.0\n", + "Uninstalling google-3.0.0:\n", + " Successfully uninstalled google-3.0.0\n" + ] + } + ], + "source": [ + "!pip uninstall google -y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "search()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'/home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages/googlesearch/__init__.py'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import inspect\n", + "inspect.getfile(search)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "ImportError", + "evalue": "cannot import name 'search' from 'googlesearch' (unknown location)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgooglesearch\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m search\n\u001b[1;32m 2\u001b[0m 
search(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGoogle\u001b[39m\u001b[38;5;124m\"\u001b[39m, num_results\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m100\u001b[39m)\n", + "\u001b[0;31mImportError\u001b[0m: cannot import name 'search' from 'googlesearch' (unknown location)" + ] + } + ], + "source": [ + "from googlesearch import search\n", + "search(\"Google\", num_results=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from scrapegraphai.utils import research_web" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import concurrent.futures\n", + "import time\n", + "from googlesearch import search # Ensure you have the googlesearch package installed\n", + "\n", + "def fetch_url(query):\n", + " # Fetch the URLs from the search query\n", + " return list(search(query, stop=10)) # Fetch 10 URLs for each query\n", + "\n", + "def main():\n", + " query = \"Weather in Pakistan\"\n", + " batch_size = 50 # Number of requests to send concurrently\n", + "\n", + " res = []\n", + " # Create a ThreadPoolExecutor to manage threads\n", + " with concurrent.futures.ThreadPoolExecutor(max_workers=batch_size) as executor:\n", + " # Submit multiple fetch requests to the executor\n", + " future_to_url = {executor.submit(fetch_url, query): i for i in range(batch_size)}\n", + " \n", + " for future in concurrent.futures.as_completed(future_to_url):\n", + " try:\n", + " urls = future.result()\n", + " res.append(urls) # Extend the results with the fetched URLs\n", + " except Exception as e:\n", + " print(f\"Error fetching data: {e}\")\n", + "\n", + " return res\n", + "\n", + "if __name__ == \"__main__\":\n", + " result = main()\n", + " print(len(result))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "colscrap-env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pyproject.toml b/pyproject.toml index 7b1c0913..1da7f2dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,13 +28,13 @@ dependencies = [ "free-proxy>=1.1.1", "playwright>=1.43.0", "undetected-playwright>=0.3.0", - "google>=3.0.0", "langchain-ollama>=0.1.3", "simpleeval>=1.0.0", "semchunk==2.2.0", "transformers==4.44.2", "qdrant-client>=1.11.3", - "fastembed>=0.3.6" + "fastembed>=0.3.6", + "googlesearch-python>=1.2.5" ] license = "MIT" diff --git a/requirements.txt b/requirements.txt index e6e5d4d7..c72ad1bb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ minify-html>=0.15.0 free-proxy>=1.1.1 playwright>=1.43.0 undetected-playwright>=0.3.0 -google>=3.0.0 semchunk>=1.0.1 langchain-ollama>=0.1.3 -simpleeval>=0.9.13 \ No newline at end of file +simpleeval>=0.9.13 +googlesearch-python>=1.2.5 \ No newline at end of file diff --git a/scrapegraphai/graphs/search_graph.py b/scrapegraphai/graphs/search_graph.py index 8086caa6..d3577d32 100644 --- a/scrapegraphai/graphs/search_graph.py +++ b/scrapegraphai/graphs/search_graph.py @@ -72,6 +72,7 @@ def _create_graph(self) -> BaseGraph: node_config={ "llm_model": self.llm_model, "max_results": self.max_results, + 
"loader_kwargs": self.loader_kwargs, "search_engine": self.copy_config.get("search_engine") } ) diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py index 14ce3207..e318f923 100644 --- a/scrapegraphai/nodes/search_internet_node.py +++ b/scrapegraphai/nodes/search_internet_node.py @@ -41,6 +41,7 @@ def __init__( self.verbose = ( False if node_config is None else node_config.get("verbose", False) ) + self.proxy = node_config.get("loader_kwargs", {}).get("proxy", None) self.search_engine = ( node_config["search_engine"] if node_config.get("search_engine") @@ -94,7 +95,7 @@ def execute(self, state: dict) -> dict: self.logger.info(f"Search Query: {search_query}") answer = search_on_web(query=search_query, max_results=self.max_results, - search_engine=self.search_engine) + search_engine=self.search_engine, proxy=self.proxy) if len(answer) == 0: raise ValueError("Zero results found for the search query.") diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index edab3005..88a4566a 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -10,7 +10,7 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10, port: int = 8080, - timeout: int = 10) -> List[str]: + timeout: int = 10, proxy: str | dict = None) -> List[str]: """ Searches the web for a given query using specified search engine options and filters out PDF links. @@ -23,6 +23,7 @@ def search_on_web(query: str, search_engine: str = "Google", port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080. timeout (int, optional): The number of seconds to wait for a response from a request. Default is 10 seconds. + proxy (dict or string, optional): The proxy server to use for the request. Default is None. Returns: List[str]: A list of URLs as strings that are the search results, excluding any PDF links. @@ -36,6 +37,22 @@ def search_on_web(query: str, search_engine: str = "Google", ['http://example.com', 'http://example.org', ...] """ + def format_proxy(proxy): + if isinstance(proxy, dict): + server = proxy.get('server') + username = proxy.get('username') + password = proxy.get('password') + + if all([username, password, server]): + proxy_url = f"https://{username}:{password}@{server}" + return proxy_url + else: + raise ValueError("Proxy dictionary is missing required fields.") + elif isinstance(proxy, str): + return proxy # "https://username:password@ip:port" + else: + raise TypeError("Proxy should be a dictionary or a string.") + def filter_pdf_links(links: List[str]) -> List[str]: """ Filters out any links that point to PDF files. 
@@ -48,9 +65,13 @@ def filter_pdf_links(links: List[str]) -> List[str]: """ return [link for link in links if not link.lower().endswith('.pdf')] + if proxy: + proxy = format_proxy(proxy) + if search_engine.lower() == "google": + print("PROXY: ", proxy) res = [] - for url in google_search(query, stop=max_results): + for url in google_search(query, num_results=max_results, proxy=proxy): res.append(url) return filter_pdf_links(res) From e828c7010acb1bd04498e027da69f35d53a37890 Mon Sep 17 00:00:00 2001 From: aziz-ullah-khan Date: Fri, 11 Oct 2024 12:30:58 +0500 Subject: [PATCH 3/3] perf: Proxy integration in googlesearch --- =1.2.5 | 8 - notebook.ipynb | 274 -------------------- scrapegraphai/nodes/search_internet_node.py | 2 +- scrapegraphai/utils/research_web.py | 3 +- 4 files changed, 2 insertions(+), 285 deletions(-) delete mode 100644 =1.2.5 delete mode 100644 notebook.ipynb diff --git a/=1.2.5 b/=1.2.5 deleted file mode 100644 index 2da54259..00000000 --- a/=1.2.5 +++ /dev/null @@ -1,8 +0,0 @@ -Requirement already satisfied: googlesearch-python in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (1.2.5) -Requirement already satisfied: beautifulsoup4>=4.9 in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from googlesearch-python) (4.12.3) -Requirement already satisfied: requests>=2.20 in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from googlesearch-python) (2.32.3) -Requirement already satisfied: soupsieve>1.2 in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from beautifulsoup4>=4.9->googlesearch-python) (2.6) -Requirement already satisfied: charset-normalizer<4,>=2 in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from requests>=2.20->googlesearch-python) (3.3.2) -Requirement already satisfied: idna<4,>=2.5 in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from requests>=2.20->googlesearch-python) (3.10) -Requirement already satisfied: urllib3<3,>=1.21.1 in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from requests>=2.20->googlesearch-python) (2.2.3) -Requirement already satisfied: certifi>=2017.4.17 in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from requests>=2.20->googlesearch-python) (2024.8.30) diff --git a/notebook.ipynb b/notebook.ipynb deleted file mode 100644 index 9c80e935..00000000 --- a/notebook.ipynb +++ /dev/null @@ -1,274 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pip install -e ." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "from scrapegraphai.graphs import SearchGraph" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PROXY: https://vzktqema:btngo4nn7n6l@63.141.62.30:6323\n" - ] - }, - { - "ename": "TypeError", - "evalue": "search() got an unexpected keyword argument 'num_results'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[3], line 36\u001b[0m\n\u001b[1;32m 31\u001b[0m prompt \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mList the top 5 companies in the world by market capitalization.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 32\u001b[0m search_graph \u001b[38;5;241m=\u001b[39m SearchGraph(\n\u001b[1;32m 33\u001b[0m prompt\u001b[38;5;241m=\u001b[39mprompt,\n\u001b[1;32m 34\u001b[0m config\u001b[38;5;241m=\u001b[39mgraph_config\n\u001b[1;32m 35\u001b[0m )\n\u001b[0;32m---> 36\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43msearch_graph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/AUK/Colloborations_Scrapegraphai/Scrapegraph-ai/scrapegraphai/graphs/search_graph.py:121\u001b[0m, in \u001b[0;36mSearchGraph.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 114\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;124;03mExecutes the web scraping and searching process.\u001b[39;00m\n\u001b[1;32m 116\u001b[0m \n\u001b[1;32m 117\u001b[0m \u001b[38;5;124;03mReturns:\u001b[39;00m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;124;03m str: The answer to the prompt.\u001b[39;00m\n\u001b[1;32m 119\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 120\u001b[0m inputs \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muser_prompt\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprompt}\n\u001b[0;32m--> 121\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfinal_state, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_info \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgraph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 123\u001b[0m \u001b[38;5;66;03m# Store the URLs after execution\u001b[39;00m\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124murls\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfinal_state:\n", - "File \u001b[0;32m~/AUK/Colloborations_Scrapegraphai/Scrapegraph-ai/scrapegraphai/graphs/base_graph.py:281\u001b[0m, in \u001b[0;36mBaseGraph.execute\u001b[0;34m(self, initial_state)\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m (result[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_state\u001b[39m\u001b[38;5;124m\"\u001b[39m], [])\n\u001b[1;32m 280\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 281\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_standard\u001b[49m\u001b[43m(\u001b[49m\u001b[43minitial_state\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/AUK/Colloborations_Scrapegraphai/Scrapegraph-ai/scrapegraphai/graphs/base_graph.py:197\u001b[0m, in \u001b[0;36mBaseGraph._execute_standard\u001b[0;34m(self, initial_state)\u001b[0m\n\u001b[1;32m 184\u001b[0m graph_execution_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m start_time\n\u001b[1;32m 185\u001b[0m log_graph_execution(\n\u001b[1;32m 186\u001b[0m graph_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgraph_name,\n\u001b[1;32m 187\u001b[0m source\u001b[38;5;241m=\u001b[39msource,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 195\u001b[0m exception\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mstr\u001b[39m(e)\n\u001b[1;32m 196\u001b[0m )\n\u001b[0;32m--> 197\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m 198\u001b[0m node_exec_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m curr_time\n\u001b[1;32m 199\u001b[0m total_exec_time \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m node_exec_time\n", - "File \u001b[0;32m~/AUK/Colloborations_Scrapegraphai/Scrapegraph-ai/scrapegraphai/graphs/base_graph.py:181\u001b[0m, in \u001b[0;36mBaseGraph._execute_standard\u001b[0;34m(self, initial_state)\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_manager\u001b[38;5;241m.\u001b[39mexclusive_get_callback(llm_model, llm_model_name) \u001b[38;5;28;01mas\u001b[39;00m cb:\n\u001b[1;32m 180\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 181\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mcurrent_node\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstate\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 183\u001b[0m error_node \u001b[38;5;241m=\u001b[39m current_node\u001b[38;5;241m.\u001b[39mnode_name\n", - "File \u001b[0;32m~/AUK/Colloborations_Scrapegraphai/Scrapegraph-ai/scrapegraphai/nodes/search_internet_node.py:97\u001b[0m, in \u001b[0;36mSearchInternetNode.execute\u001b[0;34m(self, state)\u001b[0m\n\u001b[1;32m 93\u001b[0m search_query \u001b[38;5;241m=\u001b[39m search_answer\u001b[38;5;241m.\u001b[39minvoke({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muser_prompt\u001b[39m\u001b[38;5;124m\"\u001b[39m: user_prompt})[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 95\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlogger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSearch Query: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msearch_query\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 97\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[43msearch_on_web\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msearch_query\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_results\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_results\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 98\u001b[0m \u001b[43m 
\u001b[49m\u001b[43msearch_engine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msearch_engine\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mproxy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mproxy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(answer) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mZero results found for the search query.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[0;32m~/AUK/Colloborations_Scrapegraphai/Scrapegraph-ai/scrapegraphai/utils/research_web.py:74\u001b[0m, in \u001b[0;36msearch_on_web\u001b[0;34m(query, search_engine, max_results, port, timeout, proxy)\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPROXY: \u001b[39m\u001b[38;5;124m\"\u001b[39m, proxy)\n\u001b[1;32m 73\u001b[0m res \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m---> 74\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m url \u001b[38;5;129;01min\u001b[39;00m \u001b[43mgoogle_search\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_results\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_results\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mproxy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mproxy\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 75\u001b[0m res\u001b[38;5;241m.\u001b[39mappend(url)\n\u001b[1;32m 76\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m filter_pdf_links(res)\n", - "\u001b[0;31mTypeError\u001b[0m: search() got an unexpected keyword argument 'num_results'" - ] - } - ], - "source": [ - "import os\n", - "os.environ['AZURE_OPENAI_GPT4O_SERVICE']=\"dwtc-openai-gpt4o\"\n", - "os.environ['AZURE_OPENAI_GPT4O_DEPLOYMENT']=\"gpt4o\"\n", - "os.environ['AZURE_OPENAI_GPT4O_KEY']=\"3cb3875145ec425880c6974d74e10cd7\"\n", - "os.environ['AZURE_OPENAI_GPT4O_API_VERSION']=\"2024-02-15-preview\"\n", - "\n", - "\n", - "graph_config = {\n", - " \"llm\": {\n", - " \"model\": \"azure_openai/gpt-4o\",\n", - " \"api_key\": os.environ['AZURE_OPENAI_GPT4O_KEY'],\n", - " \"azure_endpoint\": f\"https://{os.environ['AZURE_OPENAI_GPT4O_SERVICE']}.openai.azure.com\",\n", - " \"azure_deployment\": os.environ['AZURE_OPENAI_GPT4O_DEPLOYMENT'],\n", - " \"api_version\": os.environ['AZURE_OPENAI_GPT4O_API_VERSION'],\n", - " \"temperature\": 0.0,\n", - " },\n", - "\n", - " \"loader_kwargs\": {\n", - " \"proxy\" : {\n", - " \"server\": '63.141.62.30:6323', \n", - " \"username\": \"vzktqema\", \n", - " \"password\": \"btngo4nn7n6l\",\n", - " },\n", - " },\n", - "\n", - " \"verbose\": False,\n", - " \"headless\": True,\n", - " \"max_sites\": 1\n", - " }\n", - "\n", - "prompt = \"List the top 5 companies in the world by market capitalization.\"\n", - "search_graph = SearchGraph(\n", - " prompt=prompt,\n", - " config=graph_config\n", - " )\n", - "result = search_graph.run()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "ename": "ImportError", - "evalue": "cannot import name 'search' from 'googlesearch' (unknown location)", - "output_type": "error", - "traceback": [ - 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgooglesearch\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m search\n", - "\u001b[0;31mImportError\u001b[0m: cannot import name 'search' from 'googlesearch' (unknown location)" - ] - } - ], - "source": [ - "from googlesearch import search" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "102.77s - pydevd: Sending message related to process being replaced timed-out after 5 seconds\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found existing installation: google 3.0.0\n", - "Uninstalling google-3.0.0:\n", - " Successfully uninstalled google-3.0.0\n" - ] - } - ], - "source": [ - "!pip uninstall google -y" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "search()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'/home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages/googlesearch/__init__.py'" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import inspect\n", - "inspect.getfile(search)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "ename": "ImportError", - "evalue": "cannot import name 'search' from 'googlesearch' (unknown location)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgooglesearch\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m search\n\u001b[1;32m 2\u001b[0m search(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGoogle\u001b[39m\u001b[38;5;124m\"\u001b[39m, num_results\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m100\u001b[39m)\n", - "\u001b[0;31mImportError\u001b[0m: cannot import name 'search' from 'googlesearch' (unknown location)" - ] - } - ], - "source": [ - "from googlesearch import search\n", - "search(\"Google\", num_results=100)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "from scrapegraphai.utils import research_web" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import concurrent.futures\n", - "import time\n", - "from googlesearch import search # Ensure you have the googlesearch package installed\n", - "\n", - "def fetch_url(query):\n", - " # Fetch the URLs from the search query\n", - " return list(search(query, stop=10)) # Fetch 10 URLs for each query\n", - "\n", - "def main():\n", - " query = \"Weather in Pakistan\"\n", - " batch_size = 50 # Number of requests to send concurrently\n", - "\n", - " res = []\n", - " # Create a ThreadPoolExecutor to manage threads\n", - " with concurrent.futures.ThreadPoolExecutor(max_workers=batch_size) as executor:\n", - " # Submit multiple fetch requests to the executor\n", - " future_to_url = 
{executor.submit(fetch_url, query): i for i in range(batch_size)}\n", - " \n", - " for future in concurrent.futures.as_completed(future_to_url):\n", - " try:\n", - " urls = future.result()\n", - " res.append(urls) # Extend the results with the fetched URLs\n", - " except Exception as e:\n", - " print(f\"Error fetching data: {e}\")\n", - "\n", - " return res\n", - "\n", - "if __name__ == \"__main__\":\n", - " result = main()\n", - " print(len(result))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "colscrap-env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.10" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py index e318f923..5688518e 100644 --- a/scrapegraphai/nodes/search_internet_node.py +++ b/scrapegraphai/nodes/search_internet_node.py @@ -94,7 +94,7 @@ def execute(self, state: dict) -> dict: self.logger.info(f"Search Query: {search_query}") - answer = search_on_web(query=search_query, max_results=self.max_results, + answer = search_on_web(query=search_query, num_results=self.max_results, search_engine=self.search_engine, proxy=self.proxy) if len(answer) == 0: diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index 88a4566a..af351ad4 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -44,7 +44,7 @@ def format_proxy(proxy): password = proxy.get('password') if all([username, password, server]): - proxy_url = f"https://{username}:{password}@{server}" + proxy_url = f"http://{username}:{password}@{server}" return proxy_url else: raise ValueError("Proxy dictionary is missing required fields.") @@ -69,7 +69,6 @@ def filter_pdf_links(links: List[str]) -> List[str]: proxy = format_proxy(proxy) if search_engine.lower() == "google": - print("PROXY: ", proxy) res = [] for url in google_search(query, num_results=max_results, proxy=proxy): res.append(url)
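
For reference, a minimal usage sketch of the proxy support this series adds. It is a sketch only: it assumes the search_on_web signature declared in scrapegraphai/utils/research_web.py after the patches are applied, and the proxy server, username, and password shown are placeholders rather than working credentials.

    from scrapegraphai.utils.research_web import search_on_web

    # The proxy may be supplied as a dict; format_proxy() inside search_on_web
    # assembles it into "http://username:password@server" before handing it to
    # googlesearch-python.
    proxy_dict = {
        "server": "203.0.113.10:6323",   # placeholder host:port
        "username": "proxy_user",        # placeholder credential
        "password": "proxy_pass",        # placeholder credential
    }

    # ... or as an already formatted URL string, which is passed through as-is.
    proxy_str = "http://proxy_user:proxy_pass@203.0.113.10:6323"

    urls = search_on_web(
        query="top 5 companies in the world by market capitalization",
        search_engine="Google",
        max_results=5,
        proxy=proxy_dict,   # proxy=proxy_str behaves the same way
    )
    print(urls)

At the graph level the same dictionary travels through graph_config["loader_kwargs"]["proxy"]: SearchGraph forwards loader_kwargs to SearchInternetNode, which reads the proxy and passes it on to search_on_web, as the notebook experiment added and later removed in this series illustrates.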