Skip to content
This repository has been archived by the owner on Oct 2, 2024. It is now read-only.

Remove the idea of specializing a Prompt to a SUT. #21

Merged
merged 1 commit into from
Dec 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions mmlu/newhelm/plugins/tests/mmlu.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,15 @@
from typing import List
from newhelm.annotation import AnnotatedInteraction
from newhelm.base_test import BasePromptResponseTest, BaseTest
from newhelm.placeholders import PromptTemplate, Result
from newhelm.placeholders import Prompt, Result


class MMLU(BasePromptResponseTest):
def make_prompt_templates(self) -> List[PromptTemplate]:
def make_prompts(self) -> List[Prompt]:
# In the real thing, this would use an ExampleImporter and Adapters
return [
PromptTemplate(
eval_instance_block="When I think of MMLU, the word that comes to mind is"
),
PromptTemplate(
eval_instance_block="But the worst part is when",
),
Prompt("When I think of MMLU, the word that comes to mind is"),
Prompt("But the worst part is when"),
]

def calculate_results(
Expand Down
4 changes: 2 additions & 2 deletions newhelm/base_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import List
from newhelm.annotation import AnnotatedInteraction

from newhelm.placeholders import PromptTemplate, Result
from newhelm.placeholders import Prompt, Result


class BaseTest(ABC):
Expand All @@ -15,7 +15,7 @@ class BasePromptResponseTest(BaseTest, ABC):
"""This is the base class for all tests that are single turn."""

@abstractmethod
def make_prompt_templates(self) -> List[PromptTemplate]:
def make_prompts(self) -> List[Prompt]:
"""Generate all data that will eventually go to the SUT."""
pass

Expand Down
10 changes: 0 additions & 10 deletions newhelm/demo_loading_tests.py

This file was deleted.

6 changes: 2 additions & 4 deletions newhelm/demo_tests_and_suts.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,11 @@
for test in all_tests:
print("\n\nStarting a new test:", test.__class__.__name__)
# Only have to make the prompt templates once, reusable across SUTs.
prompt_templates = test.make_prompt_templates()
prompts = test.make_prompts()
for sut in all_suts:
print("Running sut:", sut.__class__.__name__)
interactions = []
for template in prompt_templates:
# Splitting specialize from evaluate allows us to track the prompts created.
prompt = sut.specialize(template)
for prompt in prompts:
interaction = sut.evaluate(prompt)
print("Completed interaction:", interaction)
interactions.append(interaction)
Expand Down
83 changes: 0 additions & 83 deletions newhelm/placeholders.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,89 +8,6 @@ class Prompt:
"""What actually goes to the SUT."""

text: str
truncated: bool = False


@dataclass(frozen=True, kw_only=True)
class PromptTemplate:
    """All the pieces necessary for a SUT to construct a Prompt in its own form."""

    # Optional task-level instructions; empty string means "no instructions block".
    instructions_block: str = ""
    """Instructions for the task."""

    # Zero or more in-context (few-shot) example blocks shown before the eval instance.
    train_instance_blocks: List[str] = field(default_factory=list)
    """Train instance blocks for the prompt."""

    # The instance actually being evaluated. Required; kw_only=True is what
    # allows this non-defaulted field to follow the defaulted ones above.
    eval_instance_block: str
    """Evaluation instance."""


@dataclass(frozen=True)
class WindowServiceConfig:
    """Configuration for a window service's context-length limit."""

    # Maximum number of tokens a prompt may occupy in the SUT's context window.
    max_sequence_length: int


class BaseTokenizer(ABC):
    """Interface for converting text to and from a token sequence."""

    @abstractmethod
    def to_tokens(self, text: str) -> List[str]:
        """Split `text` into its list of tokens."""
        pass

    @abstractmethod
    def from_tokens(self, tokens: List[str]) -> str:
        """Reassemble `tokens` back into a single string."""
        pass


class PlaceholderTokenizer(BaseTokenizer):
    """Stand-in tokenizer: each individual character counts as one token."""

    def to_tokens(self, text: str) -> List[str]:
        # Placeholder tokenization - one token per character.
        return [character for character in text]

    def from_tokens(self, tokens: List[str]):
        # Placeholder detokenization - stitch the characters back together
        # with no separator, undoing to_tokens exactly.
        joined = "".join(tokens)
        return joined


class LocalWindowService:
    """This is roughly copied over from HELM's local_window_service with simplifications."""

    def __init__(self, tokenizer: BaseTokenizer, config: WindowServiceConfig):
        # The tokenizer defines how text maps to tokens; the config caps the token count.
        self.tokenizer = tokenizer
        self.config = config

    def fits_within_context_window(self, text: str) -> bool:
        """Return True if `text` tokenizes to at most max_sequence_length tokens."""
        return len(self.tokenizer.to_tokens(text)) <= self.config.max_sequence_length

    def truncate_from_right(self, text: str) -> str:
        """Drop tokens from the end of `text` until it fits the context window."""
        tokens = self.tokenizer.to_tokens(text)
        if len(tokens) <= self.config.max_sequence_length:
            return text
        # Keep only the first max_sequence_length tokens, then detokenize.
        return self.tokenizer.from_tokens(tokens[: self.config.max_sequence_length])

    def truncate_training_then_from_right(
        self,
        prompt_template: PromptTemplate,
        template_to_string: Callable[[PromptTemplate], str],
    ) -> str:
        """Copied over from HELM's InContextLearningAdapter._make_prompt_fit.

        One big difference is passing in template_to_string, which lets the individual SUT
        decide how to convert the PromptTemplate to a string.

        Strategy: repeatedly drop the last train-instance block until the rendered
        prompt fits; if it still does not fit with no train blocks left, hard-truncate
        the rendered text from the right.
        """
        while len(prompt_template.train_instance_blocks) > 0:
            prompt_text = template_to_string(prompt_template)
            if self.fits_within_context_window(text=prompt_text):
                return prompt_text

            # Remove the last training example
            without_last_training_block = prompt_template.train_instance_blocks[:-1]
            # PromptTemplate is frozen, so build a modified copy via dataclasses.replace.
            prompt_template = replace(
                prompt_template, train_instance_blocks=without_last_training_block
            )

        # If removing the in-context example is still not enough, we simply truncate the prompt.
        # Following the default truncation strategy used by HuggingFace, we truncate the text from the right.
        prompt_text = template_to_string(prompt_template)
        return self.truncate_from_right(prompt_text)


@dataclass(frozen=True)
Expand Down
5 changes: 2 additions & 3 deletions newhelm/plugins/runners/simple_benchmark_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,9 @@ def _run_prompt_response_test(
self, test: BasePromptResponseTest, sut: PromptResponseSUT
) -> TestJournal:
"""Demonstration for how to run a single Test on a single SUT, all calls serial."""
templates = test.make_prompt_templates()
prompts = test.make_prompts()
interactions = []
for template in templates:
prompt = sut.specialize(template)
for prompt in prompts:
interactions.append(sut.evaluate(prompt))
# Here is where an annotator would go
annotated = [AnnotatedInteraction(interaction) for interaction in interactions]
Expand Down
39 changes: 1 addition & 38 deletions newhelm/plugins/suts/gpt_2_sut.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,11 @@
from typing import List
from newhelm.placeholders import (
LocalWindowService,
Prompt,
PromptTemplate,
PlaceholderTokenizer,
WindowServiceConfig,
)
from newhelm.placeholders import Prompt
from newhelm.sut import Interaction, PromptResponseSUT, Turn


def template_to_string(prompt_template: PromptTemplate):
    """Render a PromptTemplate as its blocks joined by newlines.

    The idea is each SUT can have its own definition for this, but we can
    have libraries for the common way to do it. An empty instructions block
    is omitted entirely rather than contributing a blank line.
    """
    blocks: List[str] = []
    if prompt_template.instructions_block != "":
        blocks.append(prompt_template.instructions_block)
    blocks.extend(prompt_template.train_instance_blocks)
    blocks.append(prompt_template.eval_instance_block)
    return "\n".join(blocks)


class GPT2(PromptResponseSUT):
"""The SUT should have all the details currently spread across model_deployment and model_metadata."""

def __init__(self):
self.window_service = LocalWindowService(
PlaceholderTokenizer(),
WindowServiceConfig(
max_sequence_length=1024,
),
)

def specialize(self, prompt_template: PromptTemplate) -> Prompt:
"""The SUT is responsible for making the PromptTemplate work."""
prompt_text = template_to_string(prompt_template)
if self.window_service.fits_within_context_window(prompt_text):
return Prompt(prompt_text, truncated=False)
prompt_text = self.window_service.truncate_training_then_from_right(
prompt_template, template_to_string
)
return Prompt(text=prompt_text, truncated=True)

def evaluate(self, prompt: Prompt) -> Interaction:
# Pure placeholder.
number_of_words = len(prompt.text.split())
Expand Down
15 changes: 7 additions & 8 deletions newhelm/plugins/tests/bbq.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
from typing import List
from newhelm.annotation import AnnotatedInteraction
from newhelm.base_test import BasePromptResponseTest
from newhelm.placeholders import PromptTemplate, Result
from newhelm.placeholders import Prompt, Result


class BBQ(BasePromptResponseTest):
def make_prompt_templates(self) -> List[PromptTemplate]:
def make_prompts(self) -> List[Prompt]:
# In the real thing, this would use an ExampleImporter and Adapters
return [
PromptTemplate(
instructions_block="The following are multiple choice questions (with answers).",
train_instance_blocks=["Passage: Is this the BBQ Test? Answer: Yes"],
eval_instance_block="Passage: is this the real BBQ? Answer:",
)
lines = [
"The following are multiple choice questions (with answers).",
"Passage: Is this the BBQ Test? Answer: Yes",
"Passage: is this the real BBQ? Answer:",
]
return [Prompt("\n".join(lines))]

def calculate_results(
self, interactions: List[AnnotatedInteraction]
Expand Down
6 changes: 1 addition & 5 deletions newhelm/sut.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from dataclasses import dataclass
from typing import List

from newhelm.placeholders import Prompt, PromptTemplate
from newhelm.placeholders import Prompt


@dataclass(frozen=True)
Expand Down Expand Up @@ -30,10 +30,6 @@ class SUT(ABC):
class PromptResponseSUT(SUT, ABC):
"""The base class for any SUT that is designed for handling a single-turn."""

@abstractmethod
def specialize(self, prompt_template: PromptTemplate) -> Prompt:
pass

@abstractmethod
def evaluate(self, prompt: Prompt) -> Interaction:
pass
64 changes: 1 addition & 63 deletions tests/plugins/suts/test_gpt_2_sut.py
Original file line number Diff line number Diff line change
@@ -1,70 +1,8 @@
from newhelm.placeholders import Prompt, PromptTemplate, WindowServiceConfig
from newhelm.placeholders import Prompt
from newhelm.plugins.suts.gpt_2_sut import GPT2
from newhelm.sut import Interaction, Turn


def test_specialize():
    """A template that fits the window is rendered verbatim, untruncated."""
    sut = GPT2()
    template = PromptTemplate(
        instructions_block="Instructions",
        train_instance_blocks=["The sky is: blue", "The grass is: green"],
        eval_instance_block="A polar bear is: ",
    )
    result = sut.specialize(template)
    expected_text = "\n".join(
        [
            "Instructions",
            "The sky is: blue",
            "The grass is: green",
            "A polar bear is: ",
        ]
    )
    assert result == Prompt(expected_text)


def test_specialize_truncates_training():
    """With a 50-token window, one training block must be dropped."""
    sut = GPT2()
    # Monkey patch to make the window smaller.
    sut.window_service.config = WindowServiceConfig(max_sequence_length=50)
    template = PromptTemplate(
        instructions_block="Instructions",
        train_instance_blocks=["The sky is: blue", "The grass is: green"],
        eval_instance_block="A polar bear is: ",
    )
    result = sut.specialize(template)
    # The second training example no longer fits, so it is removed.
    expected_text = "\n".join(
        ["Instructions", "The sky is: blue", "A polar bear is: "]
    )
    assert result == Prompt(text=expected_text, truncated=True)


def test_specialize_truncates_everything():
    """With a 25-token window, even the eval text gets cut from the right."""
    sut = GPT2()
    # Monkey patch to make the window smaller.
    sut.window_service.config = WindowServiceConfig(max_sequence_length=25)
    template = PromptTemplate(
        instructions_block="Instructions",
        train_instance_blocks=["The sky is: blue", "The grass is: green"],
        eval_instance_block="A polar bear is: ",
    )
    result = sut.specialize(template)
    # Both training examples are dropped, and part of the eval example too:
    # "Instructions\n" (13 chars) + "A polar bear" (12 chars) = 25 tokens.
    assert result == Prompt(text="Instructions\nA polar bear", truncated=True)


def test_evaluate():
sut = GPT2()
prompt = Prompt("One two three")
Expand Down