Commit aae522b

[evals] moved modelgraded specs to registry (openai#392)
- each Eval now keeps track of "registry"
1 parent 1a48737 commit aae522b

14 files changed, +181 -158 lines

docs/build-eval.md (+1 -1)

@@ -65,7 +65,7 @@ Congratulations, you have built your eval! Keep iterating on it until you are co
 
 We expect that the existing model-graded evals such as `fact`, `closedqa`, and `battle` will fit many use cases. However, other use cases may benefit from more customization, e.g., a different evaluation prompt. For these, there will be a bit more work involved, but generally still no coding required!
 
-1. If you can't use an existing model-graded eval, create a new YAML in `evals/registry/modelgraded` to specify the [parameters](eval-templates.md#parameters-for-model-graded-evals) of your eval. See [`humor.yaml`](../evals/registry/modelgraded/humor.yaml) for an example.
+1. If you can't use an existing model-graded eval, create a new YAML or create a new entry to an existing YAML in `evals/registry/modelgraded` to specify the [parameters](eval-templates.md#parameters-for-model-graded-evals) of your eval. See [`humor.yaml`](../evals/registry/modelgraded/humor.yaml) for an example.
    - Note that, even if you are creating a new YAML, you may find it easiest to copy an existing YAML as a starting point. For example, model-graded evals which check a model completion against a rubric can copy `closedqa.yaml` and just edit the `args`.
 2. Next, you will create your dataset and register your eval, as described above. See [`joke_fruits_labeled.jsonl`](../evals/registry/data/test_metaeval/joke_fruits_labeled.jsonl) and [`joke-fruits`](../evals/registry/evals/test-modelgraded.yaml), for example.
    - Note that it is recommended to specify `eval_type` at this step, when you register your eval, rather than step 1.
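After this change, a model-graded eval looks its spec up through the registry by name rather than by file path (see `evals/registry.py` below). A minimal sketch of what that lookup gives you, assuming the `evals` package is importable and that the spec in `humor.yaml` is registered under the top-level key `humor`, mirroring the other spec files in this commit:

```python
# Minimal sketch: fetch a registered model-graded spec by its registry name.
from evals.registry import Registry

registry = Registry()
spec = registry.get_modelgraded_spec("humor")  # registry name, not a file path

# The returned dict holds the parameters described in eval-templates.md,
# e.g. the grading prompt and the allowed grader choices.
print(spec["prompt"])
print(spec["choice_strings"])
```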

evals/cli/oaieval.py (+7 -1)

@@ -211,7 +211,13 @@ def to_number(x):
     extra_eval_params = parse_extra_eval_params(args.extra_eval_params)
 
     eval_class = registry.get_class(eval_spec)
-    eval = eval_class(model_specs=model_specs, seed=args.seed, name=eval_name, **extra_eval_params)
+    eval = eval_class(
+        model_specs=model_specs,
+        seed=args.seed,
+        name=eval_name,
+        registry=registry,
+        **extra_eval_params,
+    )
     result = eval.run(recorder)
     recorder.record_final_report(result)

evals/elsuite/modelgraded/classify.py (+4 -7)

@@ -13,12 +13,7 @@
 import evals
 import evals.record
 from evals.base import ModelSpec
-from evals.elsuite.utils import (
-    PromptFn,
-    format_necessary,
-    load_modelgraded_specs,
-    scrub_formatting_from_prompt,
-)
+from evals.elsuite.utils import PromptFn, format_necessary, scrub_formatting_from_prompt
 
 INVALID_STR = "__invalid__"
 CHOICE_KEY = "choice"

@@ -135,7 +130,7 @@ def __init__(
         )
 
         """import prompt and set attributes"""
-        modelgraded_specs = load_modelgraded_specs(modelgraded_spec_file)
+        modelgraded_specs = self.registry.get_modelgraded_spec(modelgraded_spec_file)
 
         # 'choice_strings' is a list of strings that specifies the possible choices
         self.choice_strings = modelgraded_specs.pop("choice_strings")

@@ -211,6 +206,8 @@ def __init__(
         ), "completion_sample_templates must be specified if multicomp_n > 1"
 
         # since we accept optional args, we need to check that all args are used
+        for key in ("key", "group"):
+            modelgraded_specs.pop(key, None)
         assert not modelgraded_specs, f"Unused args: {modelgraded_specs}. Typo in YAML?"
 
     def eval_sample(self, test_sample: dict, rng: Random) -> None:
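The last hunk shows the pattern `classify.py` uses to consume a spec: each recognized field is `pop()`'d off the dict, the registry's own bookkeeping keys ("key" and "group", presumably attached by the registry loader) are discarded, and anything left over is flagged as a likely typo. A standalone sketch of that pattern; the helper name and the optional fields here are illustrative, not the module's actual code:

```python
from typing import Any


def consume_spec(modelgraded_specs: dict[str, Any]) -> dict[str, Any]:
    """Illustrative helper: pop known fields, tolerate registry metadata, reject typos."""
    parsed = {
        # required field, mirroring classify.py's unconditional pop of "choice_strings"
        "choice_strings": modelgraded_specs.pop("choice_strings"),
        # optional fields are popped with a default
        "prompt": modelgraded_specs.pop("prompt", None),
        "choice_scores": modelgraded_specs.pop("choice_scores", None),
    }
    # keys added by the registry itself are ignored rather than treated as errors
    for key in ("key", "group"):
        modelgraded_specs.pop(key, None)
    # anything left over did not match a known parameter: most likely a typo in the YAML
    assert not modelgraded_specs, f"Unused args: {modelgraded_specs}. Typo in YAML?"
    return parsed


# usage: unknown fields raise, registry metadata does not
spec = {"choice_strings": ["Yes", "No"], "prompt": "...", "key": "diversity"}
print(consume_spec(spec))
```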

evals/elsuite/utils.py (-9)

@@ -1,21 +1,12 @@
 import copy
-import os
 import re
 import string
 from collections import Counter, defaultdict
 
-import yaml
-
 from evals.api import sample_freeform
 from evals.prompt.base import chat_prompt_to_text_prompt, is_chat_prompt
 
 
-def load_modelgraded_specs(spec_file: str) -> str:
-    current_dir = os.path.dirname(os.path.abspath(__file__))
-    yaml_path = os.path.join(current_dir, "../registry/modelgraded", f"{spec_file}.yaml")
-    return yaml.load(open(yaml_path, "r"), Loader=yaml.FullLoader)
-
-
 def get_answer(text, answer_prompt):
     idx = text.rfind(answer_prompt)
     if idx == -1:

evals/eval.py (+6 -3)

@@ -3,17 +3,18 @@
 """
 import abc
 import asyncio
+import concurrent.futures
 import logging
 import os
 import random
-import concurrent.futures
 from multiprocessing.pool import ThreadPool
-from typing import Any, Awaitable, Callable, Dict, List, Tuple
+from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple
 
 from tqdm import tqdm
 
 from .base import ModelSpec, ModelSpecs
-from .record import Recorder, RecorderBase
+from .record import RecorderBase
+from .registry import Registry
 
 logger = logging.getLogger(__name__)
 

@@ -53,6 +54,7 @@ def __init__(
         model_specs: ModelSpecs,
         seed: int = 20220722,
         name: str = "no_name_eval.default",
+        registry: Optional[Registry] = None,
     ):
         splits = name.split(".")
         if len(splits) < 2:

@@ -61,6 +63,7 @@ def __init__(
         self.model_specs = model_specs
         self.seed = seed
         self.name = name
+        self.registry = registry or Registry()
 
     def eval_sample(self, sample: Any, rng: random.Random):
         raise NotImplementedError()
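Because `registry` defaults to `None` and falls back to `Registry()`, existing `Eval` subclasses keep working, and any subclass that forwards unknown keyword arguments to `super().__init__` also picks up the new `registry=...` argument that `oaieval.py` now passes. A minimal sketch of such a subclass; the class and field names are hypothetical, and it assumes the base class is exported at package level as `evals.Eval`, the way the elsuite evals use it:

```python
import random
from typing import Any

import evals  # assumption: the evals package exposes the Eval base class as evals.Eval


class MinimalEval(evals.Eval):
    """Hypothetical eval, shown only to illustrate how `registry` reaches the base class."""

    def __init__(self, samples_jsonl: str = "", **kwargs: Any):
        # model_specs, seed, name, and (after this commit) registry all travel through
        # **kwargs into Eval.__init__, where a missing registry becomes Registry().
        super().__init__(**kwargs)
        self.samples_jsonl = samples_jsonl

    def eval_sample(self, sample: Any, rng: random.Random) -> None:
        # per-sample grading would go here; self.registry is now available,
        # e.g. self.registry.get_modelgraded_spec("fact")
        pass

    def run(self, recorder) -> dict:
        # aggregate-metrics logic would go here
        return {}
```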

evals/registry.py (+16)

@@ -4,6 +4,7 @@
 By convention, every eval name should start with {base_eval}.{split}.
 """
 
+import difflib
 import functools
 import logging
 import os

@@ -58,6 +59,13 @@ def get_alias():
         except TypeError as e:
             raise TypeError(f"Error while processing {object} {name}: {e}")
 
+    def get_modelgraded_spec(self, name: str) -> dict[str, Any]:
+        assert name in self._modelgraded_specs, (
+            f"Modelgraded spec {name} not found. "
+            f"Closest matches: {difflib.get_close_matches(name, self._modelgraded_specs.keys(), n=5)}"
+        )
+        return self._modelgraded_specs[name]
+
     def get_eval(self, name: str) -> EvalSpec:
         return self._dereference(name, self._evals, "eval", EvalSpec)
 

@@ -136,6 +144,10 @@ def _process_directory(self, registry, path):
             self._process_file(registry, file)
 
     def _load_registry(self, paths):
+        """Load registry from a list of paths.
+
+        Each path or yaml specifies a dictionary of name -> spec.
+        """
         registry = {}
         for path in paths:
             logging.info(f"Loading registry from {path}")

@@ -154,5 +166,9 @@ def _eval_sets(self):
     def _evals(self):
         return self._load_registry([p / "evals" for p in self._registry_paths])
 
+    @functools.cached_property
+    def _modelgraded_specs(self):
+        return self._load_registry([p / "modelgraded" for p in self._registry_paths])
+
 
 registry = Registry()
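The new accessor gives a direct lookup plus a friendlier failure mode: a missing name raises an `AssertionError` whose message lists near-matches via `difflib.get_close_matches`. A short usage sketch; the misspelled name is deliberate, for illustration:

```python
from evals.registry import Registry

registry = Registry()

# direct lookup by spec name; "fact" is defined in evals/registry/modelgraded/fact.yaml
fact_spec = registry.get_modelgraded_spec("fact")
print(sorted(fact_spec.keys()))  # includes prompt, choice_strings, input_outputs

# a misspelled name fails with suggestions rather than a bare KeyError
try:
    registry.get_modelgraded_spec("factt")
except AssertionError as err:
    print(err)  # "Modelgraded spec factt not found. Closest matches: ['fact', ...]"
```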

evals/registry/modelgraded/battle.yaml (+21 -20)

@@ -1,24 +1,25 @@
-prompt: |-
-  You are comparing two responses to the following two instructions.
+battle:
+  prompt: |-
+    You are comparing two responses to the following two instructions.
 
-  [Instruction 1]
-  {input1}
-  [Response 1]
-  {completion1}
+    [Instruction 1]
+    {input1}
+    [Response 1]
+    {completion1}
 
-  [Instruction 2]
-  {input2}
-  [Response 2]
-  {completion2}
+    [Instruction 2]
+    {input2}
+    [Response 2]
+    {completion2}
 
 
-  Is the first response better than the second? You must provide one answer based on your subjective view.
-choice_strings:
-- "Yes"
-- "No"
-choice_scores:
-  "Yes": 1.0
-  "No": 0.0
-input_outputs:
-  input1: completion1
-  input2: completion2
+    Is the first response better than the second? You must provide one answer based on your subjective view.
+  choice_strings:
+  - "Yes"
+  - "No"
+  choice_scores:
+    "Yes": 1.0
+    "No": 0.0
+  input_outputs:
+    input1: completion1
+    input2: completion2

evals/registry/modelgraded/best.yaml (+11 -10)

@@ -1,12 +1,13 @@
-prompt: |-
-  Which of the following {n} texts is best response to the following instruction?
+best:
+  prompt: |-
+    Which of the following {n} texts is best response to the following instruction?
 
-  Instruction: {input}
+    Instruction: {input}
 
-  Responses:
-  {completion}
-completion_sample_templates:
-  completion: "{i}. {output}\n"
-choice_strings: from_n
-input_outputs:
-  input: completion
+    Responses:
+    {completion}
+  completion_sample_templates:
+    completion: "{i}. {output}\n"
+  choice_strings: from_n
+  input_outputs:
+    input: completion

evals/registry/modelgraded/closedqa.yaml (+26 -25)

@@ -1,26 +1,27 @@
-prompt: |-
-  You are assessing a submitted answer on a given task based on a criterion. Here is the data:
-  [BEGIN DATA]
-  ***
-  [Task]: {input}
-  ***
-  [Submission]: {completion}
-  ***
-  [Criterion]: {criteria}
-  ***
-  [END DATA]
-  Does the submission meet the criterion? First, write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the letter again by itself on a new line.
+closedqa:
+  prompt: |-
+    You are assessing a submitted answer on a given task based on a criterion. Here is the data:
+    [BEGIN DATA]
+    ***
+    [Task]: {input}
+    ***
+    [Submission]: {completion}
+    ***
+    [Criterion]: {criteria}
+    ***
+    [END DATA]
+    Does the submission meet the criterion? First, write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the letter again by itself on a new line.
 
-  Reasoning:
-eval_type: cot_classify
-choice_scores:
-  "Y": 1.0
-  "N": 0.0
-choice_strings: 'YN'
-args:
-  criteria:
-    relevance: "relevance: Is the submission referring to a real quote from the text?"
-    conciseness: "conciseness: Is the answer concise and to the point?"
-    correct: "correctness: Is the answer correct?"
-input_outputs:
-  input: "completion"
+    Reasoning:
+  eval_type: cot_classify
+  choice_scores:
+    "Y": 1.0
+    "N": 0.0
+  choice_strings: 'YN'
+  args:
+    criteria:
+      relevance: "relevance: Is the submission referring to a real quote from the text?"
+      conciseness: "conciseness: Is the answer concise and to the point?"
+      correct: "correctness: Is the answer correct?"
+  input_outputs:
+    input: "completion"

evals/registry/modelgraded/diversity.yaml (+14 -13)

@@ -1,14 +1,15 @@
-prompt: |-
-  Are the following {n} texts diverse?
+diversity:
+  prompt: |-
+    Are the following {n} texts diverse?
 
-  {completion}
-completion_sample_templates:
-  completion: "{i}. {output}\n"
-choice_scores:
-  "Yes": 1.0
-  "No": 0.0
-choice_strings:
-- "Yes"
-- "No"
-input_outputs:
-  input: completion
+    {completion}
+  completion_sample_templates:
+    completion: "{i}. {output}\n"
+  choice_scores:
+    "Yes": 1.0
+    "No": 0.0
+  choice_strings:
+  - "Yes"
+  - "No"
+  input_outputs:
+    input: completion

evals/registry/modelgraded/fact.yaml (+22 -21)

@@ -1,22 +1,23 @@
-prompt: |-
-  You are comparing a submitted answer to an expert answer on a given question. Here is the data:
-  [BEGIN DATA]
-  ************
-  [Question]: {input}
-  ************
-  [Expert]: {ideal}
-  ************
-  [Submission]: {completion}
-  ************
-  [END DATA]
+fact:
+  prompt: |-
+    You are comparing a submitted answer to an expert answer on a given question. Here is the data:
+    [BEGIN DATA]
+    ************
+    [Question]: {input}
+    ************
+    [Expert]: {ideal}
+    ************
+    [Submission]: {completion}
+    ************
+    [END DATA]
 
-  Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation.
-  The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:
-  (A) The submitted answer is a subset of the expert answer and is fully consistent with it.
-  (B) The submitted answer is a superset of the expert answer and is fully consistent with it.
-  (C) The submitted answer contains all the same details as the expert answer.
-  (D) There is a disagreement between the submitted answer and the expert answer.
-  (E) The answers differ, but these differences don't matter from the perspective of factuality.
-choice_strings: ABCDE
-input_outputs:
-  input: completion
+    Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation.
+    The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:
+    (A) The submitted answer is a subset of the expert answer and is fully consistent with it.
+    (B) The submitted answer is a superset of the expert answer and is fully consistent with it.
+    (C) The submitted answer contains all the same details as the expert answer.
+    (D) There is a disagreement between the submitted answer and the expert answer.
+    (E) The answers differ, but these differences don't matter from the perspective of factuality.
+  choice_strings: ABCDE
+  input_outputs:
+    input: completion
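The re-indentation in these spec files is what makes the registry move work: each YAML now parses to a `{name: spec}` mapping, which is exactly what the new `_load_registry` docstring describes ("Each path or yaml specifies a dictionary of name -> spec"). A small self-contained sketch of that shape, using an abbreviated `fact` entry:

```python
import yaml  # PyYAML, already used by the registry to load these files

text = """
fact:
  prompt: |-
    You are comparing a submitted answer to an expert answer on a given question. ...
  choice_strings: ABCDE
  input_outputs:
    input: completion
"""

specs = yaml.safe_load(text)
assert list(specs) == ["fact"]                     # top-level key is the spec name
assert specs["fact"]["choice_strings"] == "ABCDE"  # spec body is a plain dict of parameters
print(specs["fact"]["input_outputs"])              # {'input': 'completion'}
```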
