Skip to content

Commit

Permalink
Modify prompts, openai and rename arxiv summary to abstract
Browse files: browse the repository at this point in the history
  • Loading branch information
kstathou committed Dec 4, 2023
1 parent e6d214c commit 4e7cdd2
Show file tree
Hide file tree
Showing 10 changed files with 494 additions and 16 deletions.
120 changes: 112 additions & 8 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ accelerate = "^0.24.1"
torch = "2.0.1"
feedparser = "^6.0.10"
python-dotenv = "^1.0.0"
httpx = "^0.25.2"
openai = "^1.3.7"
tenacity = "^8.2.3"
tqdm = "^4.66.1"


[tool.poetry.group.test]
Expand Down
9 changes: 5 additions & 4 deletions src/llm_stack/build_dataset/arxiv.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from datetime import datetime
from typing import Optional

import feedparser
import pandas as pd
Expand Down Expand Up @@ -100,7 +101,7 @@ def search(
return response

@staticmethod
def _construct_query(query: str, fields: list[str] = None) -> str:
def _construct_query(query: str, fields: Optional[list] = None) -> str:
"""Construct query string for arXiv API."""
if fields is None:
fields = ["all"]
Expand Down Expand Up @@ -132,12 +133,12 @@ def _get_entries(feed: feedparser.FeedParserDict) -> list:
raise ValueError("No entries found in feed.") from e

@staticmethod
def _parse_entry(entry: feedparser.util.FeedParserDict) -> dict[str, str]:
def _parse_entry(entry: feedparser.util.FeedParserDict) -> dict:
"""Parse entry from arXiv API feed."""
return {
"arxiv_url": entry["id"],
"title": entry["title"],
"summary": entry["summary"],
"title": entry["title"].replace("\n", " "),
"abstract": entry["summary"].replace("\n", " "),
"published": datetime.strftime(parser.parse(entry["published"]), "%Y-%m-%d"),
"pdf_url": [item["href"] for item in entry["links"] if all(w in item["href"] for w in ["arxiv", "pdf"])][
0
Expand Down
Empty file.
1 change: 1 addition & 0 deletions src/llm_stack/build_dataset/prompts/openai_system.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"role": "system", "content": "You are an excellent copywriter and an expert in machine learning."}
1 change: 1 addition & 0 deletions src/llm_stack/build_dataset/prompts/openai_user.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"role": "user", "content": "###Instructions###\nYour task is to summarize the following academic abstract in one or two sentences.\n###Context###\n{text}"}
5 changes: 5 additions & 0 deletions src/llm_stack/openai/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .openai_api import OpenAILLM
from .prompt_template import MessageTemplate


__all__ = ["OpenAILLM", "MessageTemplate"]
Loading

0 comments on commit 4e7cdd2

Please sign in to comment.