Skip to content

Commit

Permalink
fix: fix robot node
Browse files Browse the repository at this point in the history
  • Loading branch information
VinciGit00 committed Jun 16, 2024
1 parent a87702f commit 2419003
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 58 deletions.
7 changes: 6 additions & 1 deletion examples/single_node/robot_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,15 @@

graph_config = {
"llm": {
"model_name": "ollama/llama3",
"model": "ollama/llama3",
"temperature": 0,
"streaming": True
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
}
}

# ************************************************
Expand Down
10 changes: 5 additions & 5 deletions scrapegraphai/nodes/robots_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,11 @@ def execute(self, state: dict) -> dict:
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
loader = AsyncChromiumLoader(f"{base_url}/robots.txt")
document = loader.load()
if "ollama" in self.llm_model.model_name:
self.llm_model.model_name = self.llm_model.model_name.split("/")[-1]
model = self.llm_model.model_name.split("/")[-1]
if "ollama" in self.llm_model.model:
self.llm_model.model = self.llm_model.model.split("/")[-1]
model = self.llm_model.model.split("/")[-1]
else:
model = self.llm_model.model_name
model = self.llm_model.model
try:
agent = robots_dictionary[model]

Expand Down Expand Up @@ -146,4 +146,4 @@ def execute(self, state: dict) -> dict:
self.logger.warning("\033[32m(Scraping this website is allowed)\033[0m")

state.update({self.output[0]: is_scrapable})
return state
return state
117 changes: 65 additions & 52 deletions tests/nodes/robot_node_test.py
Original file line number Diff line number Diff line change
@@ -1,61 +1,74 @@
"""
Module for the tests
"""
import os
import pytest
from scrapegraphai.graphs import SmartScraperGraph
from unittest.mock import MagicMock

from scrapegraphai.models import Ollama
from scrapegraphai.nodes import RobotsNode

@pytest.fixture
def sample_text():
"""
Example of text fixture.
"""
file_name = "inputs/plain_html_example.txt"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, file_name)
def mock_llm_model():
mock_model = MagicMock()
mock_model.model = "ollama/llama3"
mock_model.__call__ = MagicMock(return_value=["yes"])
return mock_model

with open(file_path, 'r', encoding="utf-8") as file:
text = file.read()
@pytest.fixture
def robots_node(mock_llm_model):
return RobotsNode(
input="url",
output=["is_scrapable"],
node_config={"llm_model": mock_llm_model, "headless": False}
)

return text
def test_robots_node_scrapable(robots_node):
state = {
"url": "https://perinim.github.io/robots.txt"
}

@pytest.fixture
def graph_config():
"""
Configuration of the graph fixture.
"""
return {
"llm": {
"model": "ollama/mistral",
"temperature": 0,
"format": "json",
"base_url": "http://localhost:11434",
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434",
}
# Mocking AsyncChromiumLoader to return a fake robots.txt content
robots_node.AsyncChromiumLoader = MagicMock(return_value=MagicMock(load=MagicMock(return_value="User-agent: *\nAllow: /")))

# Execute the node
result_state, result = robots_node.execute(state)

# Check the updated state
assert result_state["is_scrapable"] == "yes"
assert result == ("is_scrapable", "yes")

def test_robots_node_not_scrapable(robots_node):
state = {
"url": "https://twitter.com/home"
}

def test_scraping_pipeline(sample_text, graph_config):
"""
Test the SmartScraperGraph scraping pipeline.
"""
smart_scraper_graph = SmartScraperGraph(
prompt="List me all the news with their description.",
source=sample_text,
config=graph_config
)
# Mocking AsyncChromiumLoader to return a fake robots.txt content
robots_node.AsyncChromiumLoader = MagicMock(return_value=MagicMock(load=MagicMock(return_value="User-agent: *\nDisallow: /")))

# Mock the LLM response to return "no"
robots_node.llm_model.__call__.return_value = ["no"]

# Execute the node and expect a ValueError because force_scraping is False by default
with pytest.raises(ValueError):
robots_node.execute(state)

def test_robots_node_force_scrapable(robots_node):
state = {
"url": "https://twitter.com/home"
}

# Mocking AsyncChromiumLoader to return a fake robots.txt content
robots_node.AsyncChromiumLoader = MagicMock(return_value=MagicMock(load=MagicMock(return_value="User-agent: *\nDisallow: /")))

# Mock the LLM response to return "no"
robots_node.llm_model.__call__.return_value = ["no"]

# Set force_scraping to True
robots_node.force_scraping = True

# Execute the node
result_state, result = robots_node.execute(state)

# Check the updated state
assert result_state["is_scrapable"] == "no"
assert result == ("is_scrapable", "no")

result = smart_scraper_graph.run()

assert result is not None
# Additional assertions to check the structure of the result
assert isinstance(result, dict) # Assuming the result is a dictionary
assert "news" in result # Assuming the result should contain a key "news"
assert "is_scrapable" in result
assert isinstance(result["is_scrapable"], bool)
assert result["is_scrapable"] is True
# Ensure the execute method was called once
mock_execute.assert_called_once_with(initial_state)
if __name__ == "__main__":
pytest.main()

0 comments on commit 2419003

Please sign in to comment.