Skip to content
This repository has been archived by the owner on Oct 2, 2024. It is now read-only.

Remove the idea of specializing a Prompt to a SUT. #21

Merged
merged 1 commit into from
Dec 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions mmlu/newhelm/plugins/tests/mmlu.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,15 @@
from typing import List
from newhelm.annotation import AnnotatedInteraction
from newhelm.base_test import BasePromptResponseTest, BaseTest
from newhelm.placeholders import PromptTemplate, Result
from newhelm.placeholders import Prompt, Result


class MMLU(BasePromptResponseTest):
def make_prompt_templates(self) -> List[PromptTemplate]:
def make_prompts(self) -> List[Prompt]:
# In the real thing, this would use an ExampleImporter and Adapters
return [
PromptTemplate(
eval_instance_block="When I think of MMLU, the word that comes to mind is"
),
PromptTemplate(
eval_instance_block="But the worst part is when",
),
Prompt("When I think of MMLU, the word that comes to mind is"),
Prompt("But the worst part is when"),
]

def calculate_results(
Expand Down
4 changes: 2 additions & 2 deletions newhelm/base_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import List
from newhelm.annotation import AnnotatedInteraction

from newhelm.placeholders import PromptTemplate, Result
from newhelm.placeholders import Prompt, Result


class BaseTest(ABC):
Expand All @@ -15,7 +15,7 @@ class BasePromptResponseTest(BaseTest, ABC):
"""This is the base class for all tests that are single turn."""

@abstractmethod
def make_prompt_templates(self) -> List[PromptTemplate]:
def make_prompts(self) -> List[Prompt]:
"""Generate all data that will eventually go to the SUT."""
pass

Expand Down
10 changes: 0 additions & 10 deletions newhelm/demo_loading_tests.py

This file was deleted.

6 changes: 2 additions & 4 deletions newhelm/demo_tests_and_suts.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,11 @@
for test in all_tests:
print("\n\nStarting a new test:", test.__class__.__name__)
# Only have to make the prompt templates once, reusable across SUTs.
prompt_templates = test.make_prompt_templates()
prompts = test.make_prompts()
for sut in all_suts:
print("Running sut:", sut.__class__.__name__)
interactions = []
for template in prompt_templates:
# Splitting specialize from evaluate allows us to track the prompts created.
prompt = sut.specialize(template)
for prompt in prompts:
interaction = sut.evaluate(prompt)
print("Completed interaction:", interaction)
interactions.append(interaction)
Expand Down
83 changes: 0 additions & 83 deletions newhelm/placeholders.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,89 +8,6 @@ class Prompt:
"""What actually goes to the SUT."""

text: str
truncated: bool = False


@dataclass(frozen=True, kw_only=True)
class PromptTemplate:
    """All the pieces necessary for a SUT to construct a Prompt in its own form."""

    # Optional task-level instructions; empty string means "no instructions block".
    instructions_block: str = ""
    """Instructions for the task."""

    # Zero or more in-context (few-shot) example blocks shown before the eval instance.
    train_instance_blocks: List[str] = field(default_factory=list)
    """Train instance blocks for the prompt."""

    # The instance actually being evaluated. Required; kw_only=True is what
    # allows this non-defaulted field to follow the defaulted ones above.
    eval_instance_block: str
    """Evaluation instance."""


@dataclass(frozen=True)
class WindowServiceConfig:
    """Configuration for a window service's context-length limit."""

    # Maximum number of tokens a prompt may occupy in the SUT's context window.
    max_sequence_length: int


class BaseTokenizer(ABC):
    """Interface for converting text to and from a token sequence."""

    @abstractmethod
    def to_tokens(self, text: str) -> List[str]:
        """Split `text` into its list of tokens."""
        pass

    @abstractmethod
    def from_tokens(self, tokens: List[str]) -> str:
        """Reassemble `tokens` back into a single string."""
        pass


class PlaceholderTokenizer(BaseTokenizer):
    """Stand-in tokenizer: each individual character counts as one token."""

    def to_tokens(self, text: str) -> List[str]:
        # Placeholder tokenization - one token per character.
        return [character for character in text]

    def from_tokens(self, tokens: List[str]):
        # Placeholder detokenization - stitch the characters back together
        # with no separator, undoing to_tokens exactly.
        joined = "".join(tokens)
        return joined


class LocalWindowService:
    """This is roughly copied over from HELM's local_window_service with simplifications."""

    def __init__(self, tokenizer: BaseTokenizer, config: WindowServiceConfig):
        # The tokenizer defines how text maps to tokens; the config caps the token count.
        self.tokenizer = tokenizer
        self.config = config

    def fits_within_context_window(self, text: str) -> bool:
        """Return True if `text` tokenizes to at most max_sequence_length tokens."""
        return len(self.tokenizer.to_tokens(text)) <= self.config.max_sequence_length

    def truncate_from_right(self, text: str) -> str:
        """Drop tokens from the end of `text` until it fits the context window."""
        tokens = self.tokenizer.to_tokens(text)
        if len(tokens) <= self.config.max_sequence_length:
            return text
        # Keep only the first max_sequence_length tokens, then detokenize.
        return self.tokenizer.from_tokens(tokens[: self.config.max_sequence_length])

    def truncate_training_then_from_right(
        self,
        prompt_template: PromptTemplate,
        template_to_string: Callable[[PromptTemplate], str],
    ) -> str:
        """Copied over from HELM's InContextLearningAdapter._make_prompt_fit.

        One big difference is passing in template_to_string, which lets the individual SUT
        decide how to convert the PromptTemplate to a string.

        Strategy: repeatedly drop the last train-instance block until the rendered
        prompt fits; if it still does not fit with no train blocks left, hard-truncate
        the rendered text from the right.
        """
        while len(prompt_template.train_instance_blocks) > 0:
            prompt_text = template_to_string(prompt_template)
            if self.fits_within_context_window(text=prompt_text):
                return prompt_text

            # Remove the last training example
            without_last_training_block = prompt_template.train_instance_blocks[:-1]
            # PromptTemplate is frozen, so build a modified copy via dataclasses.replace.
            prompt_template = replace(
                prompt_template, train_instance_blocks=without_last_training_block
            )

        # If removing the in-context example is still not enough, we simply truncate the prompt.
        # Following the default truncation strategy used by HuggingFace, we truncate the text from the right.
        prompt_text = template_to_string(prompt_template)
        return self.truncate_from_right(prompt_text)


@dataclass(frozen=True)
Expand Down
5 changes: 2 additions & 3 deletions newhelm/plugins/runners/simple_benchmark_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,9 @@ def _run_prompt_response_test(
self, test: BasePromptResponseTest, sut: PromptResponseSUT
) -> TestJournal:
"""Demonstration for how to run a single Test on a single SUT, all calls serial."""
templates = test.make_prompt_templates()
prompts = test.make_prompts()
interactions = []
for template in templates:
prompt = sut.specialize(template)
for prompt in prompts:
interactions.append(sut.evaluate(prompt))
# Here is where an annotator would go
annotated = [AnnotatedInteraction(interaction) for interaction in interactions]
Expand Down
39 changes: 1 addition & 38 deletions newhelm/plugins/suts/gpt_2_sut.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,11 @@
from typing import List
from newhelm.placeholders import (
LocalWindowService,
Prompt,
PromptTemplate,
PlaceholderTokenizer,
WindowServiceConfig,
)
from newhelm.placeholders import Prompt
from newhelm.sut import Interaction, PromptResponseSUT, Turn


def template_to_string(prompt_template: PromptTemplate):
    """Render a PromptTemplate as its blocks joined by newlines.

    The idea is each SUT can have its own definition for this, but we can
    have libraries for the common way to do it. An empty instructions block
    is omitted entirely rather than contributing a blank line.
    """
    blocks: List[str] = []
    if prompt_template.instructions_block != "":
        blocks.append(prompt_template.instructions_block)
    blocks.extend(prompt_template.train_instance_blocks)
    blocks.append(prompt_template.eval_instance_block)
    return "\n".join(blocks)


class GPT2(PromptResponseSUT):
"""The SUT should have all the details currently spread across model_deployment and model_metadata."""

def __init__(self):
self.window_service = LocalWindowService(
PlaceholderTokenizer(),
WindowServiceConfig(
max_sequence_length=1024,
),
)

def specialize(self, prompt_template: PromptTemplate) -> Prompt:
"""The SUT is responsible for making the PromptTemplate work."""
prompt_text = template_to_string(prompt_template)
if self.window_service.fits_within_context_window(prompt_text):
return Prompt(prompt_text, truncated=False)
prompt_text = self.window_service.truncate_training_then_from_right(
prompt_template, template_to_string
)
return Prompt(text=prompt_text, truncated=True)

def evaluate(self, prompt: Prompt) -> Interaction:
# Pure placeholder.
number_of_words = len(prompt.text.split())
Expand Down
15 changes: 7 additions & 8 deletions newhelm/plugins/tests/bbq.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
from typing import List
from newhelm.annotation import AnnotatedInteraction
from newhelm.base_test import BasePromptResponseTest
from newhelm.placeholders import PromptTemplate, Result
from newhelm.placeholders import Prompt, Result


class BBQ(BasePromptResponseTest):
def make_prompt_templates(self) -> List[PromptTemplate]:
def make_prompts(self) -> List[Prompt]:
# In the real thing, this would use an ExampleImporter and Adapters
return [
PromptTemplate(
instructions_block="The following are multiple choice questions (with answers).",
train_instance_blocks=["Passage: Is this the BBQ Test? Answer: Yes"],
eval_instance_block="Passage: is this the real BBQ? Answer:",
)
lines = [
"The following are multiple choice questions (with answers).",
"Passage: Is this the BBQ Test? Answer: Yes",
"Passage: is this the real BBQ? Answer:",
]
return [Prompt("\n".join(lines))]

def calculate_results(
self, interactions: List[AnnotatedInteraction]
Expand Down
6 changes: 1 addition & 5 deletions newhelm/sut.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from dataclasses import dataclass
from typing import List

from newhelm.placeholders import Prompt, PromptTemplate
from newhelm.placeholders import Prompt


@dataclass(frozen=True)
Expand Down Expand Up @@ -30,10 +30,6 @@ class SUT(ABC):
class PromptResponseSUT(SUT, ABC):
"""The base class for any SUT that is designed for handling a single-turn."""

@abstractmethod
def specialize(self, prompt_template: PromptTemplate) -> Prompt:
pass

@abstractmethod
def evaluate(self, prompt: Prompt) -> Interaction:
pass
64 changes: 1 addition & 63 deletions tests/plugins/suts/test_gpt_2_sut.py
Original file line number Diff line number Diff line change
@@ -1,70 +1,8 @@
from newhelm.placeholders import Prompt, PromptTemplate, WindowServiceConfig
from newhelm.placeholders import Prompt
from newhelm.plugins.suts.gpt_2_sut import GPT2
from newhelm.sut import Interaction, Turn


def test_specialize():
    """A template that fits the window is rendered verbatim, untruncated."""
    sut = GPT2()
    template = PromptTemplate(
        instructions_block="Instructions",
        train_instance_blocks=["The sky is: blue", "The grass is: green"],
        eval_instance_block="A polar bear is: ",
    )
    result = sut.specialize(template)
    expected_text = "\n".join(
        [
            "Instructions",
            "The sky is: blue",
            "The grass is: green",
            "A polar bear is: ",
        ]
    )
    assert result == Prompt(expected_text)


def test_specialize_truncates_training():
    """With a 50-token window, one training block must be dropped."""
    sut = GPT2()
    # Monkey patch to make the window smaller.
    sut.window_service.config = WindowServiceConfig(max_sequence_length=50)
    template = PromptTemplate(
        instructions_block="Instructions",
        train_instance_blocks=["The sky is: blue", "The grass is: green"],
        eval_instance_block="A polar bear is: ",
    )
    result = sut.specialize(template)
    # The second training example no longer fits, so it is removed.
    expected_text = "\n".join(
        ["Instructions", "The sky is: blue", "A polar bear is: "]
    )
    assert result == Prompt(text=expected_text, truncated=True)


def test_specialize_truncates_everything():
    """With a 25-token window, even the eval text gets cut from the right."""
    sut = GPT2()
    # Monkey patch to make the window smaller.
    sut.window_service.config = WindowServiceConfig(max_sequence_length=25)
    template = PromptTemplate(
        instructions_block="Instructions",
        train_instance_blocks=["The sky is: blue", "The grass is: green"],
        eval_instance_block="A polar bear is: ",
    )
    result = sut.specialize(template)
    # Both training examples are dropped, and part of the eval example too:
    # "Instructions\n" (13 chars) + "A polar bear" (12 chars) = 25 tokens.
    assert result == Prompt(text="Instructions\nA polar bear", truncated=True)


def test_evaluate():
sut = GPT2()
prompt = Prompt("One two three")
Expand Down