
Commit

Adding ruff, running pre-commit hooks, small fixes and documentation (#1303)

This doesn't contribute an Eval but slightly improves the developer
experience for contributors.
benomahony authored Sep 26, 2023
1 parent 0dc0ba4 commit dd96814
Showing 39 changed files with 669 additions and 525 deletions.
2 changes: 1 addition & 1 deletion .github/PULL_REQUEST_TEMPLATE.md
@@ -75,7 +75,7 @@ We know that you might be excited to contribute to OpenAI's mission, help improv

 - [ ] I have filled out all required fields of this form
 - [ ] I have used **Git LFS** for the Eval JSON data
-- [ ] (Ignore if not submitting code) I have run `pip install pre-commit; pre-commit install` and have verified that `mypy`, `black`, `isort`, and `autoflake` are running when I commit and push
+- [ ] (Ignore if not submitting code) I have run `pip install pre-commit; pre-commit install` and have verified that `mypy`, `black`, `isort`, `autoflake` and `ruff` are running when I commit and push

 Failure to fill out all required fields will result in the PR being closed.

5 changes: 4 additions & 1 deletion .github/workflows/parse_yaml.py
@@ -1,12 +1,15 @@
 import sys
+
 import yaml
 
+
 def get_first_key(file_path):
-    with open(file_path, 'r') as yaml_file:
+    with open(file_path, "r") as yaml_file:
         content = yaml.safe_load(yaml_file)
         first_key = next(iter(content))
     return first_key
 
+
 if __name__ == "__main__":
     yaml_file_path = sys.argv[1]
     print(get_first_key(yaml_file_path))
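For context, `get_first_key` simply returns the first top-level key of a YAML mapping. A minimal sketch of that behavior — the inline YAML document here is made up purely for illustration:

```python
import yaml

# Hypothetical YAML content, used only to illustrate get_first_key's behavior.
doc = """
registry:
  evals: []
metadata:
  owner: example
"""

content = yaml.safe_load(doc)    # -> {"registry": {...}, "metadata": {...}}
first_key = next(iter(content))  # dicts preserve insertion order, so this is "registry"
print(first_key)
```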
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
@@ -33,3 +33,12 @@ repos:
           - "--remove-unused-variables"
           - "--remove-all-unused-imports"
         exclude: "evals/__init__.py"
+
+  # This allows ruff to run and autofix the code
+  # The line length is so high because some of the evals are very long
+  # TODO: fix the evals and then reduce the line length here
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.0.277
+    hooks:
+      - id: ruff
+        args: [--fix, --exit-non-zero-on-fix, --line-length=767]
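The new hook amounts to invoking ruff with those same arguments on the staged files. A hedged sketch of that invocation through ruff's Python entry point — the target path is illustrative, and the normal route is simply `pre-commit run ruff --all-files`:

```python
import subprocess

# Roughly what the hook runs, here pointed at an illustrative directory.
result = subprocess.run(
    ["python", "-m", "ruff", "--fix", "--exit-non-zero-on-fix", "--line-length=767", "evals/"],
    capture_output=True,
    text=True,
)
# Non-zero when violations remain or (because of --exit-non-zero-on-fix) when fixes were applied.
print(result.returncode)
print(result.stdout)
```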
4 changes: 4 additions & 0 deletions README.md
@@ -63,6 +63,10 @@ Optionally, you can install the formatters for pre-committing with:
 pip install -e .[formatters]
 ```
 
+Then run `pre-commit install` to install pre-commit into your git hooks. pre-commit will now run on every commit.
+
+If you want to manually run all pre-commit hooks on a repository, run `pre-commit run --all-files`. To run individual hooks use `pre-commit run <hook_id>`.
+
 ### Running evals
 
 If you don't want to contribute new evals, but simply want to run them locally, you can install the evals package via pip:
4 changes: 2 additions & 2 deletions evals/cli/oaieval.py
@@ -198,11 +198,11 @@ def parse_extra_eval_params(
     def to_number(x: str) -> Union[int, float, str]:
         try:
             return int(x)
-        except:
+        except (ValueError, TypeError):
             pass
         try:
             return float(x)
-        except:
+        except (ValueError, TypeError):
             pass
         return x
 
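The narrowed `except` clauses keep the original try-int-then-float fall-through. A standalone restatement of the helper, only to show what it returns for a few inputs:

```python
from typing import Union


def to_number(x: str) -> Union[int, float, str]:
    # Same logic as the helper above: try int, then float, otherwise return the string.
    try:
        return int(x)
    except (ValueError, TypeError):
        pass
    try:
        return float(x)
    except (ValueError, TypeError):
        pass
    return x


print(to_number("3"))      # 3 (int)
print(to_number("3.5"))    # 3.5 (float)
print(to_number("gpt-4"))  # 'gpt-4' (left as a string)
```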
8 changes: 2 additions & 6 deletions evals/completion_fns/langchain_math.py
@@ -1,11 +1,7 @@
import importlib
from typing import Optional

from langchain import LLMMathChain, OpenAI
from openai import Completion
from evals.api import CompletionResult

from langchain import OpenAI, LLMMathChain

from evals.api import CompletionResult
from evals.prompt.base import CompletionPrompt
from evals.record import record_sampling

2 changes: 1 addition & 1 deletion evals/elsuite/basic/match_test.py
@@ -43,7 +43,7 @@ def test_eval_sample(
         ("world", ["world"], True),
     ],
 )
-def test_eval_sample(
+def test_eval_sample_2(
     completion: str,
     ideal: list[str],
     expected_match: bool,
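The rename matters because two test functions with the same name in one module shadow each other, so pytest only collects the later definition. A minimal sketch of the pitfall — the assertions are invented:

```python
# sketch_duplicate_test_names.py -- illustrative only


def test_eval_sample():
    assert 1 + 1 == 2


def test_eval_sample():  # noqa: F811 -- this redefinition shadows the first test
    assert 2 + 2 == 4


# pytest collects only the second function; the first silently never runs.
# Giving the second test a distinct name (test_eval_sample_2) restores both.
```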
5 changes: 3 additions & 2 deletions evals/elsuite/lambada.py
@@ -1,8 +1,10 @@
+from datasets import load_dataset
+
 import evals
 import evals.metrics
 from evals.api import CompletionFn
 from evals.record import RecorderBase
-from datasets import load_dataset
 
+
 class Lambada(evals.Eval):
     def __init__(
@@ -37,7 +39,6 @@ def eval_sample(self, sample, rng):
             expected=a,
         )
 
-
     def run(self, recorder: RecorderBase):
         samples = load_dataset("EleutherAI/lambada_openai", self.subset, split="test")
         self.eval_all_samples(recorder, samples)
4 changes: 2 additions & 2 deletions evals/elsuite/modelgraded/classify.py
@@ -119,9 +119,9 @@ def run(self, recorder):
         # record the scores
         scores = [m["score"] for m in all_sample_metrics if m["score"] is not None]
         if scores:
-            record_metrics[f"score"] = sum(scores) / len(scores)
+            record_metrics["score"] = sum(scores) / len(scores)
         metascores = [m["metascore"] for m in all_sample_metrics if "metascore" in m]
         if metascores:
-            record_metrics[f"metascore"] = sum(metascores) / len(metascores)
+            record_metrics["metascore"] = sum(metascores) / len(metascores)
 
         return record_metrics
17 changes: 14 additions & 3 deletions evals/elsuite/multiple_choice.py
@@ -1,18 +1,22 @@
 from typing import Optional
 from urllib.parse import parse_qs, urlparse
 
+from datasets import load_dataset
 from pydantic import BaseModel
 
 import evals
 import evals.metrics
 from evals.api import CompletionFn
 from evals.formatting import make_abc
 from evals.record import RecorderBase
-from datasets import load_dataset
+
+
 class Sample(BaseModel):
     question: str
     answers: list[str]
     label: int
+
+
 def get_dataset(url: str) -> list[Sample]:
     parsed = urlparse(url)
     if parsed.scheme == "hf":
@@ -43,6 +47,7 @@ def get_dataset(url: str) -> list[Sample]:
 
     raise ValueError(f"Unknown question dataset {url}")
 
+
 class MultipleChoice(evals.Eval):
     def __init__(
         self,
@@ -66,7 +71,14 @@ def eval_sample(self, sample, rng):
             rng=rng,
         )
 
-        prompt = self.instructions + "\nPlease answer with the letter of the correct answer." + "\n\n" + sample.question + "\n" + options
+        prompt = (
+            self.instructions
+            + "\nPlease answer with the letter of the correct answer."
+            + "\n\n"
+            + sample.question
+            + "\n"
+            + options
+        )
         result = self.completion_fn(
             prompt=prompt,
             temperature=0.0,
@@ -80,7 +92,6 @@ def eval_sample(self, sample, rng):
             expected=correct_answer,
         )
 
-
     def run(self, recorder: RecorderBase):
         samples = get_dataset(self.dataset)
         self.eval_all_samples(recorder, samples)
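The multi-line prompt construction shown above produces the same string as the old one-liner; a quick illustration with placeholder values:

```python
instructions = "Answer the following multiple-choice question."  # placeholder
question = "What color is the sky on a clear day?"               # placeholder
options = "A) Green\nB) Blue\nC) Red"                             # placeholder

prompt = (
    instructions
    + "\nPlease answer with the letter of the correct answer."
    + "\n\n"
    + question
    + "\n"
    + options
)
print(prompt)
```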
28 changes: 22 additions & 6 deletions evals/elsuite/test/match.py
@@ -2,14 +2,30 @@
 
 
 class TestMatch(Match):
-    def __init__(self,
-                 *args,
-                 **kwargs):
+    def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs, samples_jsonl="")
 
     def get_samples(self):
         return [
-            {"input": [{"role": "system", "content": "Complete the phrase as concisely as possible."}, {"role": "user", "content": "Once upon a "}], "ideal": "time"},
-            {"input": [{"role": "system", "content": "Complete the phrase as concisely as possible."}, {"role": "user", "content": "The first US president was "}], "ideal": "George Washington"},
-            {"input": [{"role": "system", "content": "Complete the phrase as concisely as possible."}, {"role": "user", "content": "OpenAI was founded in 20"}], "ideal": "15"}
+            {
+                "input": [
+                    {"role": "system", "content": "Complete the phrase as concisely as possible."},
+                    {"role": "user", "content": "Once upon a "},
+                ],
+                "ideal": "time",
+            },
+            {
+                "input": [
+                    {"role": "system", "content": "Complete the phrase as concisely as possible."},
+                    {"role": "user", "content": "The first US president was "},
+                ],
+                "ideal": "George Washington",
+            },
+            {
+                "input": [
+                    {"role": "system", "content": "Complete the phrase as concisely as possible."},
+                    {"role": "user", "content": "OpenAI was founded in 20"},
+                ],
+                "ideal": "15",
+            },
         ]
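Each sample pairs a chat-style `input` with an `ideal` completion. A small sketch of how such a record could be flattened into a prompt — the `render_chat` helper is invented for illustration and is not part of the evals API:

```python
sample = {
    "input": [
        {"role": "system", "content": "Complete the phrase as concisely as possible."},
        {"role": "user", "content": "Once upon a "},
    ],
    "ideal": "time",
}


def render_chat(messages: list[dict]) -> str:
    # Illustrative flattening of chat messages into a single prompt string.
    return "\n".join(f"{m['role']}: {m['content']}" for m in messages)


print(render_chat(sample["input"]))
print("expected:", sample["ideal"])
```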
2 changes: 1 addition & 1 deletion evals/elsuite/utils.py
@@ -22,7 +22,7 @@ def get_answer(text, answer_prompt, ignore_case=False):
 
     if idx == -1:
         return None
-    return text[idx:idx + len(answer_prompt)]
+    return text[idx : idx + len(answer_prompt)]
 
 
 def get_consensus(answers):
41 changes: 25 additions & 16 deletions evals/elsuite/utils_test.py
@@ -1,25 +1,34 @@
 from pytest import mark
 
 from evals.elsuite.utils import fuzzy_match, normalize
 
-@mark.parametrize("s, expected", [
-    ("", ""),
-    ("Hello", "hello"),
-    ("hello\nworld", "hello world"),
-])
+
+@mark.parametrize(
+    "s, expected",
+    [
+        ("", ""),
+        ("Hello", "hello"),
+        ("hello\nworld", "hello world"),
+    ],
+)
 def test_normalize(s: str, expected: str):
     assert normalize(s) == expected
 
-@mark.parametrize("s1, s2, expected", [
-    ("", "", True),
-    ("x", "", False),
-    ("Hello", "Hello", True),
-    ("hello", "othello", True),
-    ("hello", "oh tello", False),
-    ("Hello World", "foo\nhello world", True),
-    ("who's there?", "whos there", True),
-    ("who's there?", "whosthere", False),
-    ("an apple a day that the", "apple day that", True),
-])
+
+@mark.parametrize(
+    "s1, s2, expected",
+    [
+        ("", "", True),
+        ("x", "", False),
+        ("Hello", "Hello", True),
+        ("hello", "othello", True),
+        ("hello", "oh tello", False),
+        ("Hello World", "foo\nhello world", True),
+        ("who's there?", "whos there", True),
+        ("who's there?", "whosthere", False),
+        ("an apple a day that the", "apple day that", True),
+    ],
+)
 def test_fuzzy_match(s1: str, s2: str, expected: bool):
     assert fuzzy_match(s1, s2) == expected
+    assert fuzzy_match(s2, s1) == expected
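As a reminder of how the decorator works, every tuple in the reformatted list becomes its own collected test case. A self-contained sketch under that assumption — `normalize_stub` is an invented stand-in, the real `normalize` lives in `evals.elsuite.utils`:

```python
from pytest import mark


def normalize_stub(s: str) -> str:
    # Stand-in for evals.elsuite.utils.normalize: lowercase and collapse whitespace.
    return " ".join(s.lower().split())


@mark.parametrize(
    "s, expected",
    [
        ("Hello", "hello"),
        ("hello\nworld", "hello world"),
    ],
)
def test_normalize_stub(s: str, expected: str):
    assert normalize_stub(s) == expected
```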
2 changes: 1 addition & 1 deletion evals/eval.py
@@ -136,7 +136,7 @@ def eval_sample(args):
 
         with ThreadPool(threads) as pool:
             if os.environ.get("EVALS_SEQUENTIAL", "0") in {"1", "true", "yes"}:
-                logger.info(f"Running in sequential mode!")
+                logger.info("Running in sequential mode!")
                 iter = map(eval_sample, work_items)
             else:
                 logger.info(f"Running in threaded mode with {threads} threads!")
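The change sits inside the toggle between sequential and threaded execution. A stripped-down sketch of that pattern, not the exact original structure — the worker function and work items are placeholders:

```python
import os
from multiprocessing.pool import ThreadPool


def eval_sample(item: int) -> int:
    # Placeholder standing in for the real per-sample evaluation.
    return item * item


work_items = [1, 2, 3, 4]
threads = 2

if os.environ.get("EVALS_SEQUENTIAL", "0") in {"1", "true", "yes"}:
    results = list(map(eval_sample, work_items))  # sequential mode
else:
    with ThreadPool(threads) as pool:
        results = list(pool.imap(eval_sample, work_items))  # threaded mode

print(results)  # [1, 4, 9, 16]
```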
6 changes: 4 additions & 2 deletions evals/metrics.py
@@ -24,7 +24,7 @@ def get_bootstrap_accuracy_std(events: Sequence[Event], num_samples: int = 1000)
 
 
 def get_confusion_matrix(
-        matches: Sequence[Event], class_labels: Optional[Set] = None
+    matches: Sequence[Event], class_labels: Optional[Set] = None
 ) -> np.ndarray:
     labels = {match.data["expected"] for match in matches}
     if class_labels is None:
@@ -63,7 +63,9 @@ def compute_f_score(confusion_matrix: np.ndarray, idx: int = 0, beta: float = 1.
     return (1 + beta**2) * (precision * recall) / (beta**2 * precision + recall)
 
 
-def compute_averaged_f_score(confusion_matrix: np.ndarray, beta: float = 1.0, average: str = "macro") -> float:
+def compute_averaged_f_score(
+    confusion_matrix: np.ndarray, beta: float = 1.0, average: str = "macro"
+) -> float:
     assert average in ["macro"]
     f_scores = []
     for i in range(confusion_matrix.shape[0]):
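The reformatted signatures belong to the standard F-beta computation from a confusion matrix. A small self-contained sketch of that formula — the matrix values are invented, and the orientation (rows = true class, columns = predicted class) is the usual convention assumed here rather than something shown in the truncated hunk:

```python
import numpy as np

# Invented 2x2 confusion matrix: rows = true class, columns = predicted class.
confusion_matrix = np.array([[8, 2],
                             [1, 9]])


def f_score_for_class(cm: np.ndarray, idx: int = 0, beta: float = 1.0) -> float:
    # Precision and recall for the class at `idx`, combined with the same
    # F-beta expression used by compute_f_score above.
    precision = cm[idx][idx] / cm[:, idx].sum()
    recall = cm[idx][idx] / cm[idx].sum()
    return (1 + beta**2) * (precision * recall) / (beta**2 * precision + recall)


print(f_score_for_class(confusion_matrix, idx=0))  # ~0.842
print(f_score_for_class(confusion_matrix, idx=1))  # ~0.857
```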
4 changes: 2 additions & 2 deletions evals/record.py
@@ -344,7 +344,7 @@ def _flush_events_internal(self, events_to_write: Sequence[Event]):
             raise e
 
         with bf.BlobFile(self.event_file_path, "ab") as f:
-            f.write(b"".join([l.encode("utf-8") for l in lines]))
+            f.write(b"".join([line.encode("utf-8") for line in lines]))
 
         logger.info(
             f"Logged {len(lines)} rows of events to {self.event_file_path}: insert_time={t(time.time()-start)}"
@@ -543,7 +543,7 @@ def _flush_events_internal(self, events_to_write: Sequence[Event]):
             idx_l = idx_r
 
         with bf.BlobFile(self.event_file_path, "ab") as f:
-            f.write(b"".join([l.encode("utf-8") for l in lines]))
+            f.write(b"".join([line.encode("utf-8") for line in lines]))
         self._last_flush_time = time.time()
         self._flushes_done += 1
 
4 changes: 2 additions & 2 deletions evals/registry.py
@@ -144,7 +144,7 @@ def get_class(self, spec: EvalSpec) -> Any:
     def _dereference(
         self, name: str, d: RawRegistry, object: str, type: Type[T], **kwargs: dict
     ) -> Optional[T]:
-        if not name in d:
+        if name not in d:
             logger.warning(
                 (
                     f"{object} '{name}' not found. "
@@ -217,7 +217,7 @@ def get_base_evals(self) -> list[Optional[BaseEvalSpec]]:
         return base_evals
 
     def get_base_eval(self, name: str) -> Optional[BaseEvalSpec]:
-        if not name in self._evals:
+        if name not in self._evals:
             return None
 
         spec_or_alias = self._evals[name]