
Commit

Adding ruff, running pre-commit hooks, small fixes and documentation (#1303)

This doesn't contribute an Eval but slightly improves the developer
experience for contributors.
benomahony authored Sep 26, 2023
1 parent 0dc0ba4 commit dd96814
Showing 39 changed files with 669 additions and 525 deletions.
2 changes: 1 addition & 1 deletion .github/PULL_REQUEST_TEMPLATE.md
@@ -75,7 +75,7 @@ We know that you might be excited to contribute to OpenAI's mission, help improv

 - [ ] I have filled out all required fields of this form
 - [ ] I have used **Git LFS** for the Eval JSON data
-- [ ] (Ignore if not submitting code) I have run `pip install pre-commit; pre-commit install` and have verified that `mypy`, `black`, `isort`, and `autoflake` are running when I commit and push
+- [ ] (Ignore if not submitting code) I have run `pip install pre-commit; pre-commit install` and have verified that `mypy`, `black`, `isort`, `autoflake` and `ruff` are running when I commit and push

 Failure to fill out all required fields will result in the PR being closed.

5 changes: 4 additions & 1 deletion .github/workflows/parse_yaml.py
@@ -1,12 +1,15 @@
 import sys
+
 import yaml
 
+
 def get_first_key(file_path):
-    with open(file_path, 'r') as yaml_file:
+    with open(file_path, "r") as yaml_file:
         content = yaml.safe_load(yaml_file)
         first_key = next(iter(content))
     return first_key
 
+
 if __name__ == "__main__":
     yaml_file_path = sys.argv[1]
     print(get_first_key(yaml_file_path))
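For context, `get_first_key` simply returns the first top-level key of a YAML mapping. A minimal sketch of that behavior — the inline YAML document here is made up purely for illustration:

```python
import yaml

# Hypothetical YAML content, used only to illustrate get_first_key's behavior.
doc = """
registry:
  evals: []
metadata:
  owner: example
"""

content = yaml.safe_load(doc)    # -> {"registry": {...}, "metadata": {...}}
first_key = next(iter(content))  # dicts preserve insertion order, so this is "registry"
print(first_key)
```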
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
@@ -33,3 +33,12 @@ repos:
           - "--remove-unused-variables"
           - "--remove-all-unused-imports"
         exclude: "evals/__init__.py"
+
+  # This allows ruff to run and autofix the code
+  # The line length is so high because some of the evals are very long
+  # TODO: fix the evals and then reduce the line length here
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.0.277
+    hooks:
+      - id: ruff
+        args: [--fix, --exit-non-zero-on-fix, --line-length=767]
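The new hook amounts to invoking ruff with those same arguments on the staged files. A hedged sketch of that invocation through ruff's Python entry point — the target path is illustrative, and the normal route is simply `pre-commit run ruff --all-files`:

```python
import subprocess

# Roughly what the hook runs, here pointed at an illustrative directory.
result = subprocess.run(
    ["python", "-m", "ruff", "--fix", "--exit-non-zero-on-fix", "--line-length=767", "evals/"],
    capture_output=True,
    text=True,
)
# Non-zero when violations remain or (because of --exit-non-zero-on-fix) when fixes were applied.
print(result.returncode)
print(result.stdout)
```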
4 changes: 4 additions & 0 deletions README.md
@@ -63,6 +63,10 @@ Optionally, you can install the formatters for pre-committing with:
 pip install -e .[formatters]
 ```
 
+Then run `pre-commit install` to install pre-commit into your git hooks. pre-commit will now run on every commit.
+
+If you want to manually run all pre-commit hooks on a repository, run `pre-commit run --all-files`. To run individual hooks use `pre-commit run <hook_id>`.
+
 ### Running evals
 
 If you don't want to contribute new evals, but simply want to run them locally, you can install the evals package via pip:
4 changes: 2 additions & 2 deletions evals/cli/oaieval.py
@@ -198,11 +198,11 @@ def parse_extra_eval_params(
     def to_number(x: str) -> Union[int, float, str]:
         try:
             return int(x)
-        except:
+        except (ValueError, TypeError):
             pass
         try:
             return float(x)
-        except:
+        except (ValueError, TypeError):
             pass
         return x
 
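The narrowed `except` clauses keep the original try-int-then-float fall-through. A standalone restatement of the helper, only to show what it returns for a few inputs:

```python
from typing import Union


def to_number(x: str) -> Union[int, float, str]:
    # Same logic as the helper above: try int, then float, otherwise return the string.
    try:
        return int(x)
    except (ValueError, TypeError):
        pass
    try:
        return float(x)
    except (ValueError, TypeError):
        pass
    return x


print(to_number("3"))      # 3 (int)
print(to_number("3.5"))    # 3.5 (float)
print(to_number("gpt-4"))  # 'gpt-4' (left as a string)
```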
8 changes: 2 additions & 6 deletions evals/completion_fns/langchain_math.py
@@ -1,11 +1,7 @@
import importlib
from typing import Optional

from langchain import LLMMathChain, OpenAI
from openai import Completion
from evals.api import CompletionResult

from langchain import OpenAI, LLMMathChain

from evals.api import CompletionResult
from evals.prompt.base import CompletionPrompt
from evals.record import record_sampling

2 changes: 1 addition & 1 deletion evals/elsuite/basic/match_test.py
@@ -43,7 +43,7 @@ def test_eval_sample(
         ("world", ["world"], True),
     ],
 )
-def test_eval_sample(
+def test_eval_sample_2(
     completion: str,
     ideal: list[str],
     expected_match: bool,
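The rename matters because two test functions with the same name in one module shadow each other, so pytest only collects the later definition. A minimal sketch of the pitfall — the assertions are invented:

```python
# sketch_duplicate_test_names.py -- illustrative only


def test_eval_sample():
    assert 1 + 1 == 2


def test_eval_sample():  # noqa: F811 -- this redefinition shadows the first test
    assert 2 + 2 == 4


# pytest collects only the second function; the first silently never runs.
# Giving the second test a distinct name (test_eval_sample_2) restores both.
```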
5 changes: 3 additions & 2 deletions evals/elsuite/lambada.py
@@ -1,8 +1,10 @@
+from datasets import load_dataset
+
 import evals
 import evals.metrics
 from evals.api import CompletionFn
 from evals.record import RecorderBase
-from datasets import load_dataset
 
+
 class Lambada(evals.Eval):
     def __init__(
@@ -37,7 +39,6 @@ def eval_sample(self, sample, rng):
             expected=a,
         )
 
-
     def run(self, recorder: RecorderBase):
         samples = load_dataset("EleutherAI/lambada_openai", self.subset, split="test")
         self.eval_all_samples(recorder, samples)
4 changes: 2 additions & 2 deletions evals/elsuite/modelgraded/classify.py
@@ -119,9 +119,9 @@ def run(self, recorder):
         # record the scores
         scores = [m["score"] for m in all_sample_metrics if m["score"] is not None]
         if scores:
-            record_metrics[f"score"] = sum(scores) / len(scores)
+            record_metrics["score"] = sum(scores) / len(scores)
         metascores = [m["metascore"] for m in all_sample_metrics if "metascore" in m]
         if metascores:
-            record_metrics[f"metascore"] = sum(metascores) / len(metascores)
+            record_metrics["metascore"] = sum(metascores) / len(metascores)
 
         return record_metrics
17 changes: 14 additions & 3 deletions evals/elsuite/multiple_choice.py
@@ -1,18 +1,22 @@
 from typing import Optional
 from urllib.parse import parse_qs, urlparse
 
+from datasets import load_dataset
 from pydantic import BaseModel
 
 import evals
 import evals.metrics
 from evals.api import CompletionFn
 from evals.formatting import make_abc
 from evals.record import RecorderBase
-from datasets import load_dataset
+
+
 class Sample(BaseModel):
     question: str
     answers: list[str]
     label: int
+
+
 def get_dataset(url: str) -> list[Sample]:
     parsed = urlparse(url)
     if parsed.scheme == "hf":
@@ -43,6 +47,7 @@ def get_dataset(url: str) -> list[Sample]:
 
     raise ValueError(f"Unknown question dataset {url}")
 
+
 class MultipleChoice(evals.Eval):
     def __init__(
         self,
@@ -66,7 +71,14 @@ def eval_sample(self, sample, rng):
             rng=rng,
         )
 
-        prompt = self.instructions + "\nPlease answer with the letter of the correct answer." + "\n\n" + sample.question + "\n" + options
+        prompt = (
+            self.instructions
+            + "\nPlease answer with the letter of the correct answer."
+            + "\n\n"
+            + sample.question
+            + "\n"
+            + options
+        )
         result = self.completion_fn(
             prompt=prompt,
             temperature=0.0,
@@ -80,7 +92,6 @@ def eval_sample(self, sample, rng):
             expected=correct_answer,
         )
 
-
     def run(self, recorder: RecorderBase):
         samples = get_dataset(self.dataset)
         self.eval_all_samples(recorder, samples)
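The multi-line prompt construction shown above produces the same string as the old one-liner; a quick illustration with placeholder values:

```python
instructions = "Answer the following multiple-choice question."  # placeholder
question = "What color is the sky on a clear day?"               # placeholder
options = "A) Green\nB) Blue\nC) Red"                             # placeholder

prompt = (
    instructions
    + "\nPlease answer with the letter of the correct answer."
    + "\n\n"
    + question
    + "\n"
    + options
)
print(prompt)
```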
28 changes: 22 additions & 6 deletions evals/elsuite/test/match.py
@@ -2,14 +2,30 @@
 
 
 class TestMatch(Match):
-    def __init__(self,
-                 *args,
-                 **kwargs):
+    def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs, samples_jsonl="")
 
     def get_samples(self):
         return [
-            {"input": [{"role": "system", "content": "Complete the phrase as concisely as possible."}, {"role": "user", "content": "Once upon a "}], "ideal": "time"},
-            {"input": [{"role": "system", "content": "Complete the phrase as concisely as possible."}, {"role": "user", "content": "The first US president was "}], "ideal": "George Washington"},
-            {"input": [{"role": "system", "content": "Complete the phrase as concisely as possible."}, {"role": "user", "content": "OpenAI was founded in 20"}], "ideal": "15"}
+            {
+                "input": [
+                    {"role": "system", "content": "Complete the phrase as concisely as possible."},
+                    {"role": "user", "content": "Once upon a "},
+                ],
+                "ideal": "time",
+            },
+            {
+                "input": [
+                    {"role": "system", "content": "Complete the phrase as concisely as possible."},
+                    {"role": "user", "content": "The first US president was "},
+                ],
+                "ideal": "George Washington",
+            },
+            {
+                "input": [
+                    {"role": "system", "content": "Complete the phrase as concisely as possible."},
+                    {"role": "user", "content": "OpenAI was founded in 20"},
+                ],
+                "ideal": "15",
+            },
         ]
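Each sample pairs a chat-style `input` with an `ideal` completion. A small sketch of how such a record could be flattened into a prompt — the `render_chat` helper is invented for illustration and is not part of the evals API:

```python
sample = {
    "input": [
        {"role": "system", "content": "Complete the phrase as concisely as possible."},
        {"role": "user", "content": "Once upon a "},
    ],
    "ideal": "time",
}


def render_chat(messages: list[dict]) -> str:
    # Illustrative flattening of chat messages into a single prompt string.
    return "\n".join(f"{m['role']}: {m['content']}" for m in messages)


print(render_chat(sample["input"]))
print("expected:", sample["ideal"])
```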
2 changes: 1 addition & 1 deletion evals/elsuite/utils.py
@@ -22,7 +22,7 @@ def get_answer(text, answer_prompt, ignore_case=False):
 
     if idx == -1:
         return None
-    return text[idx:idx + len(answer_prompt)]
+    return text[idx : idx + len(answer_prompt)]
 
 
 def get_consensus(answers):
41 changes: 25 additions & 16 deletions evals/elsuite/utils_test.py
@@ -1,25 +1,34 @@
 from pytest import mark
 
 from evals.elsuite.utils import fuzzy_match, normalize
 
-@mark.parametrize("s, expected", [
-    ("", ""),
-    ("Hello", "hello"),
-    ("hello\nworld", "hello world"),
-])
+
+@mark.parametrize(
+    "s, expected",
+    [
+        ("", ""),
+        ("Hello", "hello"),
+        ("hello\nworld", "hello world"),
+    ],
+)
 def test_normalize(s: str, expected: str):
     assert normalize(s) == expected
 
-@mark.parametrize("s1, s2, expected", [
-    ("", "", True),
-    ("x", "", False),
-    ("Hello", "Hello", True),
-    ("hello", "othello", True),
-    ("hello", "oh tello", False),
-    ("Hello World", "foo\nhello world", True),
-    ("who's there?", "whos there", True),
-    ("who's there?", "whosthere", False),
-    ("an apple a day that the", "apple day that", True),
-])
+
+@mark.parametrize(
+    "s1, s2, expected",
+    [
+        ("", "", True),
+        ("x", "", False),
+        ("Hello", "Hello", True),
+        ("hello", "othello", True),
+        ("hello", "oh tello", False),
+        ("Hello World", "foo\nhello world", True),
+        ("who's there?", "whos there", True),
+        ("who's there?", "whosthere", False),
+        ("an apple a day that the", "apple day that", True),
+    ],
+)
 def test_fuzzy_match(s1: str, s2: str, expected: bool):
     assert fuzzy_match(s1, s2) == expected
+    assert fuzzy_match(s2, s1) == expected
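As a reminder of how the decorator works, every tuple in the reformatted list becomes its own collected test case. A self-contained sketch under that assumption — `normalize_stub` is an invented stand-in, the real `normalize` lives in `evals.elsuite.utils`:

```python
from pytest import mark


def normalize_stub(s: str) -> str:
    # Stand-in for evals.elsuite.utils.normalize: lowercase and collapse whitespace.
    return " ".join(s.lower().split())


@mark.parametrize(
    "s, expected",
    [
        ("Hello", "hello"),
        ("hello\nworld", "hello world"),
    ],
)
def test_normalize_stub(s: str, expected: str):
    assert normalize_stub(s) == expected
```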
2 changes: 1 addition & 1 deletion evals/eval.py
@@ -136,7 +136,7 @@ def eval_sample(args):
 
         with ThreadPool(threads) as pool:
             if os.environ.get("EVALS_SEQUENTIAL", "0") in {"1", "true", "yes"}:
-                logger.info(f"Running in sequential mode!")
+                logger.info("Running in sequential mode!")
                 iter = map(eval_sample, work_items)
             else:
                 logger.info(f"Running in threaded mode with {threads} threads!")
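The change sits inside the toggle between sequential and threaded execution. A stripped-down sketch of that pattern, not the exact original structure — the worker function and work items are placeholders:

```python
import os
from multiprocessing.pool import ThreadPool


def eval_sample(item: int) -> int:
    # Placeholder standing in for the real per-sample evaluation.
    return item * item


work_items = [1, 2, 3, 4]
threads = 2

if os.environ.get("EVALS_SEQUENTIAL", "0") in {"1", "true", "yes"}:
    results = list(map(eval_sample, work_items))  # sequential mode
else:
    with ThreadPool(threads) as pool:
        results = list(pool.imap(eval_sample, work_items))  # threaded mode

print(results)  # [1, 4, 9, 16]
```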
6 changes: 4 additions & 2 deletions evals/metrics.py
@@ -24,7 +24,7 @@ def get_bootstrap_accuracy_std(events: Sequence[Event], num_samples: int = 1000)
 
 
 def get_confusion_matrix(
-        matches: Sequence[Event], class_labels: Optional[Set] = None
+    matches: Sequence[Event], class_labels: Optional[Set] = None
 ) -> np.ndarray:
     labels = {match.data["expected"] for match in matches}
     if class_labels is None:
@@ -63,7 +63,9 @@ def compute_f_score(confusion_matrix: np.ndarray, idx: int = 0, beta: float = 1.
     return (1 + beta**2) * (precision * recall) / (beta**2 * precision + recall)
 
 
-def compute_averaged_f_score(confusion_matrix: np.ndarray, beta: float = 1.0, average: str = "macro") -> float:
+def compute_averaged_f_score(
+    confusion_matrix: np.ndarray, beta: float = 1.0, average: str = "macro"
+) -> float:
     assert average in ["macro"]
     f_scores = []
     for i in range(confusion_matrix.shape[0]):
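The reformatted signatures belong to the standard F-beta computation from a confusion matrix. A small self-contained sketch of that formula — the matrix values are invented, and the orientation (rows = true class, columns = predicted class) is the usual convention assumed here rather than something shown in the truncated hunk:

```python
import numpy as np

# Invented 2x2 confusion matrix: rows = true class, columns = predicted class.
confusion_matrix = np.array([[8, 2],
                             [1, 9]])


def f_score_for_class(cm: np.ndarray, idx: int = 0, beta: float = 1.0) -> float:
    # Precision and recall for the class at `idx`, combined with the same
    # F-beta expression used by compute_f_score above.
    precision = cm[idx][idx] / cm[:, idx].sum()
    recall = cm[idx][idx] / cm[idx].sum()
    return (1 + beta**2) * (precision * recall) / (beta**2 * precision + recall)


print(f_score_for_class(confusion_matrix, idx=0))  # ~0.842
print(f_score_for_class(confusion_matrix, idx=1))  # ~0.857
```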
4 changes: 2 additions & 2 deletions evals/record.py
@@ -344,7 +344,7 @@ def _flush_events_internal(self, events_to_write: Sequence[Event]):
             raise e
 
         with bf.BlobFile(self.event_file_path, "ab") as f:
-            f.write(b"".join([l.encode("utf-8") for l in lines]))
+            f.write(b"".join([line.encode("utf-8") for line in lines]))
 
         logger.info(
             f"Logged {len(lines)} rows of events to {self.event_file_path}: insert_time={t(time.time()-start)}"
@@ -543,7 +543,7 @@ def _flush_events_internal(self, events_to_write: Sequence[Event]):
             idx_l = idx_r
 
         with bf.BlobFile(self.event_file_path, "ab") as f:
-            f.write(b"".join([l.encode("utf-8") for l in lines]))
+            f.write(b"".join([line.encode("utf-8") for line in lines]))
         self._last_flush_time = time.time()
         self._flushes_done += 1
 
4 changes: 2 additions & 2 deletions evals/registry.py
@@ -144,7 +144,7 @@ def get_class(self, spec: EvalSpec) -> Any:
     def _dereference(
         self, name: str, d: RawRegistry, object: str, type: Type[T], **kwargs: dict
     ) -> Optional[T]:
-        if not name in d:
+        if name not in d:
             logger.warning(
                 (
                     f"{object} '{name}' not found. "
@@ -217,7 +217,7 @@ def get_base_evals(self) -> list[Optional[BaseEvalSpec]]:
         return base_evals
 
     def get_base_eval(self, name: str) -> Optional[BaseEvalSpec]:
-        if not name in self._evals:
+        if name not in self._evals:
             return None
 
         spec_or_alias = self._evals[name]