Modify prompts, openai and rename arxiv summary to abstract

kstathou · Dec 4, 2023 · 4e7cdd2
1 parent e6d214c
commit 4e7cdd2
Show file tree

Hide file tree

Showing 10 changed files with 494 additions and 16 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -27,6 +27,10 @@ accelerate = "^0.24.1"
 torch = "2.0.1"
 feedparser = "^6.0.10"
 python-dotenv = "^1.0.0"
+httpx = "^0.25.2"
+openai = "^1.3.7"
+tenacity = "^8.2.3"
+tqdm = "^4.66.1"
 
 
 [tool.poetry.group.test]

diff --git a/src/llm_stack/build_dataset/arxiv.py b/src/llm_stack/build_dataset/arxiv.py
@@ -1,4 +1,5 @@
 from datetime import datetime
+from typing import Optional
 
 import feedparser
 import pandas as pd
@@ -100,7 +101,7 @@ def search(
         return response
 
     @staticmethod
-    def _construct_query(query: str, fields: list[str] = None) -> str:
+    def _construct_query(query: str, fields: Optional[list] = None) -> str:
         """Construct query string for arXiv API."""
         if fields is None:
             fields = ["all"]
@@ -132,12 +133,12 @@ def _get_entries(feed: feedparser.FeedParserDict) -> list:
             raise ValueError("No entries found in feed.") from e
 
     @staticmethod
-    def _parse_entry(entry: feedparser.util.FeedParserDict) -> dict[str, str]:
+    def _parse_entry(entry: feedparser.util.FeedParserDict) -> dict:
         """Parse entry from arXiv API feed."""
         return {
             "arxiv_url": entry["id"],
-            "title": entry["title"],
-            "summary": entry["summary"],
+            "title": entry["title"].replace("\n", " "),
+            "abstract": entry["summary"].replace("\n", " "),
             "published": datetime.strftime(parser.parse(entry["published"]), "%Y-%m-%d"),
             "pdf_url": [item["href"] for item in entry["links"] if all(w in item["href"] for w in ["arxiv", "pdf"])][
                 0

diff --git a/src/llm_stack/build_dataset/openai.py b/src/llm_stack/build_dataset/openai.py
diff --git a/src/llm_stack/build_dataset/prompts/openai_system.json b/src/llm_stack/build_dataset/prompts/openai_system.json
@@ -0,0 +1 @@
+{"role": "system", "content": "You are an excellent copywriter and an expert in machine learning."}
diff --git a/src/llm_stack/build_dataset/prompts/openai_user.json b/src/llm_stack/build_dataset/prompts/openai_user.json
@@ -0,0 +1 @@
+{"role": "user", "content": "###Instructions###\nYour task is to summarize the following academic abstract in one or two sentences.\n###Context###\n{text}"}
diff --git a/src/llm_stack/openai/__init__.py b/src/llm_stack/openai/__init__.py
@@ -0,0 +1,5 @@
+from .openai_api import OpenAILLM
+from .prompt_template import MessageTemplate
+
+
+__all__ = ["OpenAILLM", "MessageTemplate"]