Skip to content

Commit

Permalink
feat: add Parse_Node
Browse files: browse the repository at this point in the history
  • Loading branch information
VinciGit00 committed Jun 12, 2024
1 parent 79b8326 commit e6c7940
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 8 deletions.
14 changes: 13 additions & 1 deletion scrapegraphai/graphs/pdf_scraper_graph.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -11,6 +11,7 @@

from ..nodes import (
FetchNode,
ParseNode,
RAGNode,
GenerateAnswerPDFNode
)
Expand Down Expand Up @@ -66,6 +67,15 @@ def _create_graph(self) -> BaseGraph:
output=["doc"],
)

parse_node = ParseNode(
input="doc",
output=["parsed_doc"],
node_config={
"parse_html": False,
"chunk_size": self.model_token
}
)

rag_node = RAGNode(
input="user_prompt & (parsed_doc | doc)",
output=["relevant_chunks"],
Expand All @@ -86,11 +96,13 @@ def _create_graph(self) -> BaseGraph:
return BaseGraph(
nodes=[
fetch_node,
parse_node,
rag_node,
generate_answer_node_pdf,
],
edges=[
(fetch_node, rag_node),
(fetch_node, parse_node),
(parse_node, rag_node),
(rag_node, generate_answer_node_pdf)
],
entry_point=fetch_node
Expand Down
3 changes: 2 additions & 1 deletion scrapegraphai/graphs/smart_scraper_graph.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,8 +3,8 @@
"""

from typing import Optional
import logging
from pydantic import BaseModel

from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph

Expand Down Expand Up @@ -70,6 +70,7 @@ def _create_graph(self) -> BaseGraph:
}
)
logging.info("FetchNode configured with headless: %s", self.config.get("headless", True))

parse_node = ParseNode(
input="doc",
output=["parsed_doc"],
Expand Down
20 changes: 14 additions & 6 deletions scrapegraphai/nodes/parse_node.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -70,12 +70,20 @@ def execute(self, state: dict) -> dict:
docs_transformed = input_data[0]
if self.parse_html:
docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
docs_transformed = docs_transformed[0]

chunks = chunk(text=docs_transformed.page_content,
chunk_size= self.node_config.get("chunk_size", 4096),
token_counter=lambda x: len(x.split()),
memoize=False)
docs_transformed = docs_transformed[0]

chunks = chunk(text=docs_transformed.page_content,
chunk_size= self.node_config.get("chunk_size", 4096),
token_counter=lambda x: len(x.split()),
memoize=False)
else:
docs_transformed = docs_transformed[0]

chunks = chunk(text=docs_transformed,
chunk_size= self.node_config.get("chunk_size", 4096),
token_counter=lambda x: len(x.split()),
memoize=False)

state.update({self.output[0]: chunks})

return state

0 comments on commit e6c7940

Please sign in to comment.