From 0ba3a594fc07127403cb716b86cbf8d55147319b Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Thu, 23 May 2024 11:43:54 +0200
Subject: [PATCH 1/5] Update models_tokens.py

---
 scrapegraphai/helpers/models_tokens.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py
index d84e1094..d05c166f 100644
--- a/scrapegraphai/helpers/models_tokens.py
+++ b/scrapegraphai/helpers/models_tokens.py
@@ -23,7 +23,10 @@
     "azure": {
         "gpt-3.5-turbo": 4096,
         "gpt-4": 8192,
-        "gpt-4-32k": 32768
+        "gpt-4-0613": 8192,
+        "gpt-4-32k": 32768,
+        "gpt-4-32k-0613": 32768,
+        "gpt-4o": 128000,
     },
     "gemini": {
         "gemini-pro": 128000,
@@ -131,7 +134,8 @@
         "cognitivecomputations/dolphin-2.5-mixtral-8x7b": 32768,
         "TheBloke/dolphin-2.7-mixtral-8x7b-GGUF": 32768,
         "deepseek-ai/DeepSeek-V2": 131072,
-        "deepseek-ai/DeepSeek-V2-Chat": 131072
+        "deepseek-ai/DeepSeek-V2-Chat": 131072,
+        "claude-3-haiku": 200000
     },
     "deepseek": {
         "deepseek-chat": 32768,

From 1774b18059900b0bf6e8e9ee08bb38d2cd6606dd Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Thu, 23 May 2024 12:32:49 +0200
Subject: [PATCH 2/5] refactor of embeddings

---
 examples/example.py                    | 64 ++++++++++++++++++++++++++
 pyproject.toml                         |  2 +-
 requirements-dev.lock                  | 12 +++--
 requirements.lock                      | 11 +++--
 scrapegraphai/graphs/abstract_graph.py | 15 +++---
 5 files changed, 85 insertions(+), 19 deletions(-)
 create mode 100644 examples/example.py

diff --git a/examples/example.py b/examples/example.py
new file mode 100644
index 00000000..322b6a81
--- /dev/null
+++ b/examples/example.py
@@ -0,0 +1,64 @@
+from scrapegraphai.graphs import PDFScraperGraph
+
+graph_config = {
+    "llm": {
+        "model": "ollama/llama3",
+        "temperature": 0,
+        "format": "json",  # Ollama needs the format to be specified explicitly
+        "model_tokens": 4000,
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+    },
+    "verbose": True,
+    "headless": False,
+}
+
+# Convert to list
+sources = [
+    "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.",
+    "The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. In this paper, we provide quasi-experimental estimates of the impact of social media on mental health by leveraging a unique natural experiment: the staggered introduction of Facebook across U.S. colleges. Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons.",
+    "Hollywood films are generally released first in the United States and then later abroad, with some variation in lags across films and countries. With the growth in movie piracy since the appearance of BitTorrent in 2003, films have become available through illegal piracy immediately after release in the US, while they are not available for legal viewing abroad until their foreign premieres in each country. We make use of this variation in international release lags to ask whether longer lags – which facilitate more local pre-release piracy – depress theatrical box office receipts, particularly after the widespread adoption of BitTorrent. We find that longer release windows are associated with decreased box office returns, even after controlling for film and country fixed effects. This relationship is much stronger in contexts where piracy is more prevalent: after BitTorrent’s adoption and in heavily-pirated genres. Our findings indicate that, as a lower bound, international box office returns in our sample were at least 7% lower than they would have been in the absence of pre-release piracy. By contrast, we do not see evidence of elevated sales displacement in US box office revenue following the adoption of BitTorrent, and we suggest that delayed legal availability of the content abroad may drive the losses to piracy."
+    # Add more sources here
+]
+
+prompt = """
+You are an expert in reviewing academic manuscripts. Please analyze the abstracts provided from an academic journal article to extract and clearly identify the following elements:
+
+Independent Variable (IV): The variable that is manipulated or considered as the primary cause affecting other variables.
+Dependent Variable (DV): The variable that is measured or observed, which is expected to change as a result of variations in the Independent Variable.
+Exogenous Shock: Identify any external or unexpected events used in the study that serve as a natural experiment or provide a unique setting for observing the effects on the IV and DV.
+Response Format: For each abstract, present your response in the following structured format:
+
+Independent Variable (IV):
+Dependent Variable (DV):
+Exogenous Shock:
+
+Example Queries and Responses:
+
+Query: This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.
+
+Response:
+
+Independent Variable (IV): Employee happiness.
+Dependent Variable (DV): Overall firm productivity.
+Exogenous Shock: Variation in worker mood driven by visual exposure to weather, via the interaction between call center architecture and outdoor weather conditions.
+
+Query: The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. In this paper, we provide quasi-experimental estimates of the impact of social media on mental health by leveraging a unique natural experiment: the staggered introduction of Facebook across U.S. colleges. Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons.
+
+Response:
+
+Independent Variable (IV): Exposure to social media.
+Dependent Variable (DV): Mental health outcomes.
+Exogenous Shock: staggered introduction of Facebook across U.S. colleges.
+""" +results = [] +for source in sources: + pdf_scraper_graph = PDFScraperGraph( + prompt=prompt, + source=source, + config=graph_config + ) + result = pdf_scraper_graph.run() + results.append(result) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 8b51660e..2c61f4df 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,7 +67,7 @@ classifiers = [ "Programming Language :: Python :: 3", "Operating System :: OS Independent", ] -requires-python = ">= 3.9" +requires-python = ">=3.9,<3.12" [build-system] requires = ["hatchling"] diff --git a/requirements-dev.lock b/requirements-dev.lock index 84a8a445..5c7c7dcb 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -45,10 +45,6 @@ certifi==2024.2.2 # via requests charset-normalizer==3.3.2 # via requests -colorama==0.4.6 - # via ipython - # via pytest - # via tqdm dataclasses-json==0.6.6 # via langchain # via langchain-community @@ -104,7 +100,6 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright - # via sqlalchemy groq==0.5.0 # via langchain-groq grpcio==1.63.0 @@ -217,8 +212,11 @@ pandas==2.2.2 # via scrapegraphai parso==0.8.4 # via jedi +pexpect==4.9.0 + # via ipython playwright==1.43.0 # via scrapegraphai + # via undetected-playwright pluggy==1.5.0 # via pytest prompt-toolkit==3.0.43 @@ -233,6 +231,8 @@ protobuf==4.25.3 # via googleapis-common-protos # via grpcio-status # via proto-plus +ptyprocess==0.7.0 + # via pexpect pure-eval==0.2.2 # via stack-data pyasn1==0.6.0 @@ -342,6 +342,8 @@ typing-inspect==0.9.0 # via dataclasses-json tzdata==2024.1 # via pandas +undetected-playwright==0.3.0 + # via scrapegraphai uritemplate==4.1.1 # via google-api-python-client urllib3==2.2.1 diff --git a/requirements.lock b/requirements.lock index f33598cf..3c1cbedf 100644 --- a/requirements.lock +++ b/requirements.lock @@ -45,9 +45,6 @@ certifi==2024.2.2 # via requests charset-normalizer==3.3.2 # via requests -colorama==0.4.6 - # via ipython - # via tqdm dataclasses-json==0.6.6 # via langchain # via langchain-community @@ -102,7 +99,6 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright - # via sqlalchemy groq==0.5.0 # via langchain-groq grpcio==1.63.0 @@ -212,8 +208,11 @@ pandas==2.2.2 # via scrapegraphai parso==0.8.4 # via jedi +pexpect==4.9.0 + # via ipython playwright==1.43.0 # via scrapegraphai + # via undetected-playwright prompt-toolkit==3.0.43 # via ipython proto-plus==1.23.0 @@ -226,6 +225,8 @@ protobuf==4.25.3 # via googleapis-common-protos # via grpcio-status # via proto-plus +ptyprocess==0.7.0 + # via pexpect pure-eval==0.2.2 # via stack-data pyasn1==0.6.0 @@ -330,6 +331,8 @@ typing-inspect==0.9.0 # via dataclasses-json tzdata==2024.1 # via pandas +undetected-playwright==0.3.0 + # via scrapegraphai uritemplate==4.1.1 # via google-api-python-client urllib3==2.2.1 diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 0377506a..f5922938 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -282,30 +282,31 @@ def _create_embedder(self, embedder_config: dict) -> object: if 'model_instance' in embedder_config: return embedder_config['model_instance'] # Instantiate the embedding model based on the model name - if "openai" in embedder_config["model"]: + if "openai" in embedder_config["model"].split("/")[0]: return OpenAIEmbeddings(api_key=embedder_config["api_key"]) elif "azure" in embedder_config["model"]: return AzureOpenAIEmbeddings() - elif "ollama" in embedder_config["model"]: 
+ elif "ollama" in embedder_config["model"].split("/")[0]: + print("ciao") embedder_config["model"] = embedder_config["model"].split("ollama/")[-1] try: models_tokens["ollama"][embedder_config["model"]] except KeyError as exc: raise KeyError("Model not supported") from exc return OllamaEmbeddings(**embedder_config) - elif "hugging_face" in embedder_config["model"]: + elif "hugging_face" in embedder_config["model"].split("/")[0]: try: models_tokens["hugging_face"][embedder_config["model"]] except KeyError as exc: raise KeyError("Model not supported")from exc return HuggingFaceHubEmbeddings(model=embedder_config["model"]) - elif "gemini" in embedder_config["model"]: + elif "gemini" in embedder_config["model"].split("/")[0]: try: models_tokens["gemini"][embedder_config["model"]] except KeyError as exc: raise KeyError("Model not supported")from exc return GoogleGenerativeAIEmbeddings(model=embedder_config["model"]) - elif "bedrock" in embedder_config["model"]: + elif "bedrock" in embedder_config["model"].split("/")[0]: embedder_config["model"] = embedder_config["model"].split("/")[-1] client = embedder_config.get('client', None) try: @@ -313,10 +314,6 @@ def _create_embedder(self, embedder_config: dict) -> object: except KeyError as exc: raise KeyError("Model not supported") from exc return BedrockEmbeddings(client=client, model_id=embedder_config["model"]) - else: - raise ValueError( - "Model provided by the configuration not supported") - def get_state(self, key=None) -> dict: """"" Get the final state of the graph. From 909af8d9128200ab99831a20e14f7646ce04a4bd Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 23 May 2024 13:45:23 +0200 Subject: [PATCH 3/5] refactor gen answ node --- scrapegraphai/graphs/abstract_graph.py | 49 ++++++------------- scrapegraphai/graphs/pdf_scraper_graph.py | 30 ++---------- .../nodes/generate_answer_pdf_node.py | 2 +- 3 files changed, 22 insertions(+), 59 deletions(-) diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index f5922938..31945ec2 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -8,13 +8,9 @@ from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings from langchain_google_genai import GoogleGenerativeAIEmbeddings from ..helpers import models_tokens -from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic, DeepSeek +from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings -from ..helpers import models_tokens -from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic, DeepSeek - - class AbstractGraph(ABC): """ Scaffolding class for creating a graph representation and executing it. @@ -22,7 +18,6 @@ class AbstractGraph(ABC): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -33,7 +28,6 @@ class AbstractGraph(ABC): prompt (str): The prompt for the graph. config (dict): Configuration parameters for the graph. source (str, optional): The source of the graph. - schema (str, optional): The schema for the graph output. 
     Example:
         >>> class MyGraph(AbstractGraph):
@@ -45,21 +39,15 @@ class AbstractGraph(ABC):
         >>> result = my_graph.run()
     """

-    def __init__(self, prompt: str, config: dict, source: Optional[str] = None, schema: Optional[str] = None):
+    def __init__(self, prompt: str, config: dict, source: Optional[str] = None):
         self.prompt = prompt
         self.source = source
         self.config = config
-        self.schema = schema
         self.llm_model = self._create_llm(config["llm"], chat=True)
         self.embedder_model = self._create_default_embedder(llm_config=config["llm"]
                                                             ) if "embeddings" not in config else self._create_embedder(
             config["embeddings"])
-        self.verbose = False if config is None else config.get(
-            "verbose", False)
-        self.headless = True if config is None else config.get(
-            "headless", True)
-        self.loader_kwargs = config.get("loader_kwargs", {})

         # Create the graph
         self.graph = self._create_graph()
@@ -67,20 +55,18 @@ def __init__(self, prompt: str, config: dict, source: Optional[str] = None, sche
         self.execution_info = None

         # Set common configuration parameters
+
         self.verbose = False if config is None else config.get(
             "verbose", False)
         self.headless = True if config is None else config.get(
             "headless", True)
         self.loader_kwargs = config.get("loader_kwargs", {})

-        common_params = {
-            "headless": self.headless,
-            "verbose": self.verbose,
-            "loader_kwargs": self.loader_kwargs,
-            "llm_model": self.llm_model,
-            "embedder_model": self.embedder_model
-        }
-
+        common_params = {"headless": self.headless,
+
+                         "loader_kwargs": self.loader_kwargs,
+                         "llm_model": self.llm_model,
+                         "embedder_model": self.embedder_model}
         self.set_common_params(common_params, overwrite=False)

     def set_common_params(self, params: dict, overwrite=False):
@@ -93,7 +79,7 @@ def set_common_params(self, params: dict, overwrite=False):

         for node in self.graph.nodes:
             node.update_config(params, overwrite)
-
+    
     def _set_model_token(self, llm):

         if 'Azure' in str(type(llm)):
@@ -171,7 +157,7 @@ def _create_llm(self, llm_config: dict, chat=False) -> object:
             raise KeyError("Model not supported") from exc
             return Anthropic(llm_params)
         elif "ollama" in llm_params["model"]:
-            llm_params["model"] = llm_params["model"].split("ollama/")[-1]
+            llm_params["model"] = llm_params["model"].split("/")[-1]

             # allow user to set model_tokens in config
             try:
@@ -245,8 +231,6 @@ def _create_default_embedder(self, llm_config=None) -> object:
                 model="models/embedding-001")
         if isinstance(self.llm_model, OpenAI):
             return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key)
-        elif isinstance(self.llm_model, DeepSeek):
-            return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key)
         elif isinstance(self.llm_model, AzureOpenAIEmbeddings):
             return self.llm_model
         elif isinstance(self.llm_model, AzureOpenAI):
@@ -282,31 +266,30 @@ def _create_embedder(self, embedder_config: dict) -> object:
         if 'model_instance' in embedder_config:
             return embedder_config['model_instance']
         # Instantiate the embedding model based on the model name
-        if "openai" in embedder_config["model"].split("/")[0]:
+        if "openai" in embedder_config["model"]:
            return OpenAIEmbeddings(api_key=embedder_config["api_key"])
         elif "azure" in embedder_config["model"]:
             return AzureOpenAIEmbeddings()
-        elif "ollama" in embedder_config["model"].split("/")[0]:
-            print("ciao")
-            embedder_config["model"] = embedder_config["model"].split("ollama/")[-1]
+        elif "ollama" in embedder_config["model"]:
+            embedder_config["model"] = embedder_config["model"].split("/")[-1]
             try:
                 models_tokens["ollama"][embedder_config["model"]]
             except KeyError as exc:
                 raise KeyError("Model not supported") from exc
             return OllamaEmbeddings(**embedder_config)
-        elif "hugging_face" in embedder_config["model"].split("/")[0]:
+        elif "hugging_face" in embedder_config["model"]:
             try:
                 models_tokens["hugging_face"][embedder_config["model"]]
             except KeyError as exc:
                 raise KeyError("Model not supported")from exc
             return HuggingFaceHubEmbeddings(model=embedder_config["model"])
-        elif "gemini" in embedder_config["model"].split("/")[0]:
+        elif "gemini" in embedder_config["model"]:
             try:
                 models_tokens["gemini"][embedder_config["model"]]
             except KeyError as exc:
                 raise KeyError("Model not supported")from exc
             return GoogleGenerativeAIEmbeddings(model=embedder_config["model"])
-        elif "bedrock" in embedder_config["model"].split("/")[0]:
+        elif "bedrock" in embedder_config["model"]:
             embedder_config["model"] = embedder_config["model"].split("/")[-1]
             client = embedder_config.get('client', None)
             try:
diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py
index af9fe7d4..39278ab7 100644
--- a/scrapegraphai/graphs/pdf_scraper_graph.py
+++ b/scrapegraphai/graphs/pdf_scraper_graph.py
@@ -11,7 +11,7 @@
     FetchNode,
     ParseNode,
     RAGNode,
-    GenerateAnswerNode
+    GenerateAnswerPDFNode
 )


@@ -48,7 +48,7 @@ class PDFScraperGraph(AbstractGraph):
     """

     def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None):
-        super().__init__(prompt, config, source, schema)
+        super().__init__(prompt, config, source)

         self.input_key = "pdf" if source.endswith("pdf") else "pdf_dir"

@@ -64,41 +64,21 @@ def _create_graph(self) -> BaseGraph:
             input='pdf | pdf_dir',
             output=["doc", "link_urls", "img_urls"],
         )
-        parse_node = ParseNode(
-            input="doc",
-            output=["parsed_doc"],
-            node_config={
-                "chunk_size": self.model_token,
-            }
-        )
-        rag_node = RAGNode(
-            input="user_prompt & (parsed_doc | doc)",
-            output=["relevant_chunks"],
-            node_config={
-                "llm_model": self.llm_model,
-                "embedder_model": self.embedder_model,
-            }
-        )
-        generate_answer_node = GenerateAnswerNode(
+        generate_answer_node_pdf = GenerateAnswerPDFNode(
             input="user_prompt & (relevant_chunks | parsed_doc | doc)",
             output=["answer"],
             node_config={
                 "llm_model": self.llm_model,
-                "schema": self.schema,
             }
         )

         return BaseGraph(
             nodes=[
                 fetch_node,
-                parse_node,
-                rag_node,
-                generate_answer_node,
+                generate_answer_node_pdf,
             ],
             edges=[
-                (fetch_node, parse_node),
-                (parse_node, rag_node),
-                (rag_node, generate_answer_node)
+                (fetch_node, generate_answer_node_pdf)
             ],
             entry_point=fetch_node
         )
diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py
index fcad5b5a..b64ca763 100644
--- a/scrapegraphai/nodes/generate_answer_pdf_node.py
+++ b/scrapegraphai/nodes/generate_answer_pdf_node.py
@@ -49,7 +49,7 @@ def __init__(self, input: str, output: List[str], node_config: Optional[dict] =
             node_name (str): name of the node
         """
         super().__init__(node_name, "node", input, output, 2, node_config)
-        self.llm_model = node_config["llm"]
+        self.llm_model = node_config["llm_model"]
         self.verbose = False if node_config is None else node_config.get(
             "verbose", False)

From 6d33a8a25ef97180784af6409ae872bad65331dc Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Thu, 23 May 2024 18:44:04 +0200
Subject: [PATCH 4/5] rollback

---
 examples/example.py                         | 64 ---------------------
 scrapegraphai/graphs/abstract_graph.py      | 44 ++++++++++----
 scrapegraphai/graphs/pdf_scraper_graph.py   | 32 +++++++++--
 scrapegraphai/graphs/smart_scraper_graph.py | 14 ++---
 4 files changed, 62 insertions(+), 92 deletions(-)
 delete mode 100644 examples/example.py

diff --git a/examples/example.py b/examples/example.py
deleted file mode 100644
index 322b6a81..00000000
--- a/examples/example.py
+++ /dev/null
@@ -1,64 +0,0 @@
-from scrapegraphai.graphs import PDFScraperGraph
-
-graph_config = {
-    "llm": {
-        "model": "ollama/llama3",
-        "temperature": 0,
-        "format": "json",  # Ollama needs the format to be specified explicitly
-        "model_tokens": 4000,
-    },
-    "embeddings": {
-        "model": "ollama/nomic-embed-text",
-        "temperature": 0,
-    },
-    "verbose": True,
-    "headless": False,
-}
-
-# Convert to list
-sources = [
-    "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.",
-    "The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. In this paper, we provide quasi-experimental estimates of the impact of social media on mental health by leveraging a unique natural experiment: the staggered introduction of Facebook across U.S. colleges. Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons.",
-    "Hollywood films are generally released first in the United States and then later abroad, with some variation in lags across films and countries. With the growth in movie piracy since the appearance of BitTorrent in 2003, films have become available through illegal piracy immediately after release in the US, while they are not available for legal viewing abroad until their foreign premieres in each country. We make use of this variation in international release lags to ask whether longer lags – which facilitate more local pre-release piracy – depress theatrical box office receipts, particularly after the widespread adoption of BitTorrent. We find that longer release windows are associated with decreased box office returns, even after controlling for film and country fixed effects. This relationship is much stronger in contexts where piracy is more prevalent: after BitTorrent’s adoption and in heavily-pirated genres. Our findings indicate that, as a lower bound, international box office returns in our sample were at least 7% lower than they would have been in the absence of pre-release piracy. By contrast, we do not see evidence of elevated sales displacement in US box office revenue following the adoption of BitTorrent, and we suggest that delayed legal availability of the content abroad may drive the losses to piracy."
-    # Add more sources here
-]
-
-prompt = """
-You are an expert in reviewing academic manuscripts. Please analyze the abstracts provided from an academic journal article to extract and clearly identify the following elements:
-
-Independent Variable (IV): The variable that is manipulated or considered as the primary cause affecting other variables.
-Dependent Variable (DV): The variable that is measured or observed, which is expected to change as a result of variations in the Independent Variable.
-Exogenous Shock: Identify any external or unexpected events used in the study that serve as a natural experiment or provide a unique setting for observing the effects on the IV and DV.
-Response Format: For each abstract, present your response in the following structured format:
-
-Independent Variable (IV):
-Dependent Variable (DV):
-Exogenous Shock:
-
-Example Queries and Responses:
-
-Query: This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.
-
-Response:
-
-Independent Variable (IV): Employee happiness.
-Dependent Variable (DV): Overall firm productivity.
-Exogenous Shock: Variation in worker mood driven by visual exposure to weather, via the interaction between call center architecture and outdoor weather conditions.
-
-Query: The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. In this paper, we provide quasi-experimental estimates of the impact of social media on mental health by leveraging a unique natural experiment: the staggered introduction of Facebook across U.S. colleges. Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons.
-
-Response:
-
-Independent Variable (IV): Exposure to social media.
-Dependent Variable (DV): Mental health outcomes.
-Exogenous Shock: staggered introduction of Facebook across U.S. colleges.
-"""
-results = []
-for source in sources:
-    pdf_scraper_graph = PDFScraperGraph(
-        prompt=prompt,
-        source=source,
-        config=graph_config
-    )
-    result = pdf_scraper_graph.run()
-    results.append(result)
\ No newline at end of file
diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py
index 31945ec2..6a0c7a4c 100644
--- a/scrapegraphai/graphs/abstract_graph.py
+++ b/scrapegraphai/graphs/abstract_graph.py
@@ -8,9 +8,13 @@
 from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings
 from langchain_google_genai import GoogleGenerativeAIEmbeddings
 from ..helpers import models_tokens
-from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic
+from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic, DeepSeek
 from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings

+from ..helpers import models_tokens
+from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic, DeepSeek
+
+
 class AbstractGraph(ABC):
     """
     Scaffolding class for creating a graph representation and executing it.
@@ -18,6 +22,7 @@ class AbstractGraph(ABC):
         prompt (str): The prompt for the graph.
         source (str): The source of the graph.
         config (dict): Configuration parameters for the graph.
+        schema (str): The schema for the graph output.
         llm_model: An instance of a language model client,
         configured for generating answers.
         embedder_model: An instance of an embedding model client,
         configured for generating embeddings.
@@ -28,6 +33,7 @@ class AbstractGraph(ABC):
         prompt (str): The prompt for the graph.
         config (dict): Configuration parameters for the graph.
         source (str, optional): The source of the graph.
+        schema (str, optional): The schema for the graph output.
Example: >>> class MyGraph(AbstractGraph): @@ -39,15 +45,21 @@ class AbstractGraph(ABC): >>> result = my_graph.run() """ - def __init__(self, prompt: str, config: dict, source: Optional[str] = None): + def __init__(self, prompt: str, config: dict, source: Optional[str] = None, schema: Optional[str] = None): self.prompt = prompt self.source = source self.config = config + self.schema = schema self.llm_model = self._create_llm(config["llm"], chat=True) self.embedder_model = self._create_default_embedder(llm_config=config["llm"] ) if "embeddings" not in config else self._create_embedder( config["embeddings"]) + self.verbose = False if config is None else config.get( + "verbose", False) + self.headless = True if config is None else config.get( + "headless", True) + self.loader_kwargs = config.get("loader_kwargs", {}) # Create the graph self.graph = self._create_graph() @@ -55,18 +67,20 @@ def __init__(self, prompt: str, config: dict, source: Optional[str] = None): self.execution_info = None # Set common configuration parameters - self.verbose = False if config is None else config.get( "verbose", False) self.headless = True if config is None else config.get( "headless", True) self.loader_kwargs = config.get("loader_kwargs", {}) - common_params = {"headless": self.headless, - - "loader_kwargs": self.loader_kwargs, - "llm_model": self.llm_model, - "embedder_model": self.embedder_model} + common_params = { + "headless": self.headless, + "verbose": self.verbose, + "loader_kwargs": self.loader_kwargs, + "llm_model": self.llm_model, + "embedder_model": self.embedder_model + } + self.set_common_params(common_params, overwrite=False) def set_common_params(self, params: dict, overwrite=False): @@ -79,7 +93,7 @@ def set_common_params(self, params: dict, overwrite=False): for node in self.graph.nodes: node.update_config(params, overwrite) - + def _set_model_token(self, llm): if 'Azure' in str(type(llm)): @@ -157,7 +171,7 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: raise KeyError("Model not supported") from exc return Anthropic(llm_params) elif "ollama" in llm_params["model"]: - llm_params["model"] = llm_params["model"].split("/")[-1] + llm_params["model"] = llm_params["model"].split("ollama/")[-1] # allow user to set model_tokens in config try: @@ -231,6 +245,8 @@ def _create_default_embedder(self, llm_config=None) -> object: model="models/embedding-001") if isinstance(self.llm_model, OpenAI): return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key) + elif isinstance(self.llm_model, DeepSeek): + return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key) elif isinstance(self.llm_model, AzureOpenAIEmbeddings): return self.llm_model elif isinstance(self.llm_model, AzureOpenAI): @@ -271,7 +287,7 @@ def _create_embedder(self, embedder_config: dict) -> object: elif "azure" in embedder_config["model"]: return AzureOpenAIEmbeddings() elif "ollama" in embedder_config["model"]: - embedder_config["model"] = embedder_config["model"].split("/")[-1] + embedder_config["model"] = embedder_config["model"].split("ollama/")[-1] try: models_tokens["ollama"][embedder_config["model"]] except KeyError as exc: @@ -297,6 +313,10 @@ def _create_embedder(self, embedder_config: dict) -> object: except KeyError as exc: raise KeyError("Model not supported") from exc return BedrockEmbeddings(client=client, model_id=embedder_config["model"]) + else: + raise ValueError( + "Model provided by the configuration not supported") + def get_state(self, key=None) -> dict: """"" Get the final state of the graph. 
@@ -334,4 +354,4 @@ def run(self) -> str: """ Abstract method to execute the graph and return the result. """ - pass + pass \ No newline at end of file diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index 39278ab7..86ab2a49 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -11,7 +11,7 @@ FetchNode, ParseNode, RAGNode, - GenerateAnswerPDFNode + GenerateAnswerNode ) @@ -48,7 +48,7 @@ class PDFScraperGraph(AbstractGraph): """ def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None): - super().__init__(prompt, config, source) + super().__init__(prompt, config, source, schema) self.input_key = "pdf" if source.endswith("pdf") else "pdf_dir" @@ -64,21 +64,41 @@ def _create_graph(self) -> BaseGraph: input='pdf | pdf_dir', output=["doc", "link_urls", "img_urls"], ) - generate_answer_node_pdf = GenerateAnswerPDFNode( + parse_node = ParseNode( + input="doc", + output=["parsed_doc"], + node_config={ + "chunk_size": self.model_token, + } + ) + rag_node = RAGNode( + input="user_prompt & (parsed_doc | doc)", + output=["relevant_chunks"], + node_config={ + "llm_model": self.llm_model, + "embedder_model": self.embedder_model, + } + ) + generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], node_config={ "llm_model": self.llm_model, + "schema": self.schema, } ) return BaseGraph( nodes=[ fetch_node, - generate_answer_node_pdf, + parse_node, + rag_node, + generate_answer_node, ], edges=[ - (fetch_node, generate_answer_node_pdf) + (fetch_node, parse_node), + (parse_node, rag_node), + (rag_node, generate_answer_node) ], entry_point=fetch_node ) @@ -94,4 +114,4 @@ def run(self) -> str: inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) - return self.final_state.get("answer", "No answer found.") + return self.final_state.get("answer", "No answer found.") \ No newline at end of file diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index ee230695..4093e49f 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -2,17 +2,14 @@ SmartScraperGraph Module """ -from typing import Optional - from .base_graph import BaseGraph -from .abstract_graph import AbstractGraph - from ..nodes import ( FetchNode, ParseNode, RAGNode, GenerateAnswerNode ) +from .abstract_graph import AbstractGraph class SmartScraperGraph(AbstractGraph): @@ -25,7 +22,6 @@ class SmartScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -36,7 +32,6 @@ class SmartScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. 
Example: >>> smart_scraper = SmartScraperGraph( @@ -48,8 +43,8 @@ class SmartScraperGraph(AbstractGraph): ) """ - def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None): - super().__init__(prompt, config, source, schema) + def __init__(self, prompt: str, source: str, config: dict): + super().__init__(prompt, config, source) self.input_key = "url" if source.startswith("http") else "local_dir" @@ -86,8 +81,7 @@ def _create_graph(self) -> BaseGraph: input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], node_config={ - "llm_model": self.llm_model, - "schema": self.schema, + "llm_model": self.llm_model } ) From c93dbe0a949ab41f250a1377c996552415c45f03 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 23 May 2024 18:44:52 +0200 Subject: [PATCH 5/5] Update smart_scraper_graph.py --- scrapegraphai/graphs/smart_scraper_graph.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 4093e49f..ee230695 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -2,14 +2,17 @@ SmartScraperGraph Module """ +from typing import Optional + from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph + from ..nodes import ( FetchNode, ParseNode, RAGNode, GenerateAnswerNode ) -from .abstract_graph import AbstractGraph class SmartScraperGraph(AbstractGraph): @@ -22,6 +25,7 @@ class SmartScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. + schema (str): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -32,6 +36,7 @@ class SmartScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. + schema (str): The schema for the graph output. Example: >>> smart_scraper = SmartScraperGraph( @@ -43,8 +48,8 @@ class SmartScraperGraph(AbstractGraph): ) """ - def __init__(self, prompt: str, source: str, config: dict): - super().__init__(prompt, config, source) + def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None): + super().__init__(prompt, config, source, schema) self.input_key = "url" if source.startswith("http") else "local_dir" @@ -81,7 +86,8 @@ def _create_graph(self) -> BaseGraph: input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], node_config={ - "llm_model": self.llm_model + "llm_model": self.llm_model, + "schema": self.schema, } )
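
---

Notes on the series (not part of the patches above; all helper names below are illustrative, not scrapegraphai API):

On [PATCH 1/5]: models_tokens.py is a plain registry mapping model identifiers to context-window sizes; the patch extends the "azure" block (gpt-4-0613, gpt-4-32k-0613, gpt-4o) and registers claude-3-haiku. A minimal sketch of how such a registry is typically consumed when sizing chunks; the values are copied from the patch, while get_context_window and its fallback behaviour are assumptions:

    # Sketch: consuming a models_tokens-style registry. The dict values come
    # from PATCH 1/5; the lookup helper itself is hypothetical.
    models_tokens = {
        "azure": {
            "gpt-3.5-turbo": 4096,
            "gpt-4": 8192,
            "gpt-4-0613": 8192,
            "gpt-4-32k": 32768,
            "gpt-4-32k-0613": 32768,
            "gpt-4o": 128000,
        },
    }

    def get_context_window(provider: str, model: str, default: int = 4096) -> int:
        """Return the registered context window, or a conservative default."""
        return models_tokens.get(provider, {}).get(model, default)

    assert get_context_window("azure", "gpt-4o") == 128000
    assert get_context_window("azure", "unknown-model") == 4096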
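On the embedder refactor and its rollback ([PATCH 2/5] vs. [PATCH 4/5]): patch 2 switched provider detection in _create_embedder from substring checks ("ollama" in model) to prefix checks on the text before the first slash (model.split("/")[0]), and patch 4 reverted to substring matching. A hedged sketch of the prefix-based dispatch the refactor was aiming at; parse_model_id is an assumed helper, not the library's API:

    # Sketch: prefix-based parsing for ids like "ollama/nomic-embed-text".
    # Matching the prefix avoids the false positives substring checks allow,
    # e.g. a model whose *name* merely contains the word "openai".
    def parse_model_id(model_id: str) -> tuple[str, str]:
        provider, sep, name = model_id.partition("/")
        if not sep:
            # No explicit provider prefix; leave interpretation to the caller.
            return "", model_id
        return provider, name

    assert parse_model_id("ollama/nomic-embed-text") == ("ollama", "nomic-embed-text")
    assert parse_model_id("gpt-3.5-turbo") == ("", "gpt-3.5-turbo")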
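On the one-line fix in generate_answer_pdf_node.py ([PATCH 3/5]): AbstractGraph propagates shared settings to every node under the key "llm_model" (see common_params in abstract_graph.py), so a node reading node_config["llm"] raises KeyError at construction time; the patch aligns the node with the key the graph actually writes. A minimal reproduction of the mismatch, with placeholder objects standing in for the real model clients:

    # Sketch: why node_config["llm"] failed while node_config["llm_model"] works.
    # The keys mirror common_params in abstract_graph.py; the values here are
    # placeholders, not real LLM or embedder instances.
    common_params = {
        "headless": True,
        "verbose": False,
        "loader_kwargs": {},
        "llm_model": object(),       # stands in for the LLM client
        "embedder_model": object(),  # stands in for the embedder client
    }

    node_config = dict(common_params)
    llm = node_config["llm_model"]       # the key the graph writes; succeeds
    try:
        node_config["llm"]               # the key the node used before the fix
    except KeyError as exc:
        print(f"KeyError, as before the fix: {exc}")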