
Commit

feat(eval): added ability to run evals against different providers/models
ErikBjare committed Aug 9, 2024
1 parent f4b1f40 commit ed19746
Showing 2 changed files with 130 additions and 34 deletions.
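
In short, the runner now takes a list of "provider/model" strings, builds one agent per entry, and runs every test with each agent. A minimal sketch of that flow, assuming the package is importable as `eval` and using a hypothetical test name (`"hello"`); the real entry point is `main()` in eval/main.py below:

```python
# Sketch only: iterate over "provider/model" strings and run each test per agent.
# Assumes eval.agents.GPTMe, eval.main.execute, and eval.evals.tests_map
# as introduced/used in this commit; "hello" is a hypothetical test name.
from eval.agents import GPTMe
from eval.evals import tests_map
from eval.main import execute

models = ["openai/gpt-4o-mini", "anthropic/claude-3-haiku-20240307"]

for spec in models:
    llm, model = spec.split("/")         # e.g. ("openai", "gpt-4o-mini")
    agent = GPTMe(llm=llm, model=model)  # one agent per provider/model pair
    result = execute(tests_map["hello"], agent)
    print(result["name"], result["results"])
```
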
14 changes: 11 additions & 3 deletions eval/agents.py
@@ -10,6 +10,10 @@


class Agent:
def __init__(self, llm: str, model: str):
self.llm = llm
self.model = model

@abstractmethod
def act(self, files: Files | None, prompt: str) -> Files:
"""
@@ -28,14 +32,18 @@ def act(self, files: Files | None, prompt: str):

print("\n--- Start of generation ---")
print(f"Working in {store.working_dir}")
prompt_sys = get_prompt()
prompt_sys.content += (
"\n\nIf you have trouble and dont seem to make progress, stop trying."
)
# TODO: add timeout
try:
gptme_chat(
[Message("user", prompt)],
[get_prompt()],
[prompt_sys],
f"gptme-evals-{store.id}",
llm=None,
model=None,
llm=self.llm,
model=self.model,
no_confirm=True,
interactive=False,
)
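With `llm` and `model` now stored on `Agent`, a custom agent is defined by subclassing and implementing `act`. A minimal sketch, assuming `Files` behaves like a mapping of filenames to contents (as suggested by its use in eval/main.py); `EchoAgent` is purely illustrative:

```python
# Hypothetical subclass built on the new Agent(llm, model) constructor.
from eval.agents import Agent
from eval.types import Files


class EchoAgent(Agent):
    """Toy agent that ignores the model and just writes the prompt to a file."""

    def act(self, files: Files | None, prompt: str) -> Files:
        files = dict(files or {})
        files["PROMPT.md"] = prompt  # no LLM call; placeholder behavior only
        return files


agent = EchoAgent(llm="openai", model="gpt-4o-mini")
```
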
150 changes: 119 additions & 31 deletions eval/main.py
@@ -4,12 +4,16 @@
Inspired by a document by Anton Osika and Axel Theorell.
"""

import csv
import inspect
import logging
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path

from .agents import GPTMe
from .agents import Agent, GPTMe
from .evals import tests, tests_map
from .execenv import SimpleExecutionEnv
from .types import (
@@ -21,13 +25,16 @@

logger = logging.getLogger(__name__)

project_dir = Path(__file__).parent.parent

def execute(test: ExecTest) -> ExecResult:

def execute(test: ExecTest, agent: Agent) -> ExecResult:
"""
Executes the code.
Executes the code for a specific model.
"""
print(f"Running test {test['name']} with prompt: {test['prompt']}")
agent = GPTMe()
print(
f"Running test {test['name']} with prompt: {test['prompt']} for model: {agent.model}"
)

# generate code
gen_start = time.time()
@@ -74,48 +81,129 @@ def execute(test: ExecTest) -> ExecResult:


def main():
models = [
# "openai/gpt-3.5-turbo",
# "openai/gpt-4-turbo",
# "openai/gpt-4o",
"openai/gpt-4o-mini",
# "anthropic/claude-3-5-sonnet-20240620",
"anthropic/claude-3-haiku-20240307",
]
test_name = sys.argv[1] if len(sys.argv) > 1 else None
results = []
if test_name:
print(f"=== Running test {test_name} ===")
result = execute(tests_map[test_name])
results.append(result)
else:
print("=== Running all tests ===")
for test in tests:
result = execute(test)

all_results = {}
for model in models:
print(f"\n=== Running tests for model: {model} ===")
llm, model = model.split("/")
agent = GPTMe(llm=llm, model=model)

results = []
if test_name:
print(f"=== Running test {test_name} ===")
result = execute(tests_map[test_name], agent)
results.append(result)
else:
print("=== Running all tests ===")
for test in tests:
result = execute(test, agent)
results.append(result)

print("=== Finished ===\n")
duration_total = sum(
result["timings"]["gen"] + result["timings"]["run"] + result["timings"]["eval"]
for result in results
)
print(f"Completed {len(results)} tests in {duration_total:.2f}s:")
for result in results:
checkmark = "✅" if all(case["passed"] for case in result["results"]) else "❌"
duration_result = (
all_results[model] = results

print("\n=== Finished ===\n")

for model, results in all_results.items():
print(f"\nResults for model: {model}")
duration_total = sum(
result["timings"]["gen"]
+ result["timings"]["run"]
+ result["timings"]["eval"]
for result in results
)
print(
f"- {result['name']} in {duration_result:.2f}s (gen: {result['timings']['gen']:.2f}s, run: {result['timings']['run']:.2f}s, eval: {result['timings']['eval']:.2f}s)"
)
for case in result["results"]:
checkmark = "✅" if case["passed"] else "❌"
print(f" {checkmark} {case['name']}")
print(f"Completed {len(results)} tests in {duration_total:.2f}s:")
for result in results:
checkmark = (
"✅" if all(case["passed"] for case in result["results"]) else "❌"
)
duration_result = (
result["timings"]["gen"]
+ result["timings"]["run"]
+ result["timings"]["eval"]
)
print(
f"- {result['name']} in {duration_result:.2f}s (gen: {result['timings']['gen']:.2f}s, run: {result['timings']['run']:.2f}s, eval: {result['timings']['eval']:.2f}s)"
)
for case in result["results"]:
checkmark = "✅" if case["passed"] else "❌"
print(f" {checkmark} {case['name']}")

print("\n=== Model Comparison ===")
for test in tests:
print(f"\nTest: {test['name']}")
for model, results in all_results.items():
result = next(r for r in results if r["name"] == test["name"])
passed = all(case["passed"] for case in result["results"])
checkmark = "✅" if passed else "❌"
duration = sum(result["timings"].values())
print(f"{model}: {checkmark} {duration:.2f}s")

all_success = all(
all(case["passed"] for case in result["results"]) for result in results
all(all(case["passed"] for case in result["results"]) for result in results)
for results in all_results.values()
)
if all_success:
print("\n✅ All tests passed!")
print("\n✅ All tests passed for all models!")
else:
print("\n❌ Some tests failed!")

# Write results to CSV
write_results_to_csv(all_results)

sys.exit(0 if all_success else 1)


def write_results_to_csv(all_results):
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# get current commit hash and dirty status, like: a8b2ef0-dirty
commit_hash = subprocess.run(
["git", "describe", "--always", "--dirty", "--exclude", "'*'"],
text=True,
capture_output=True,
).stdout.strip()
filename = project_dir / f"eval_results_{timestamp}.csv"

with open(filename, "w", newline="") as csvfile:
fieldnames = [
"Model",
"Test",
"Passed",
"Total Duration",
"Generation Time",
"Run Time",
"Eval Time",
"Commit Hash",
]
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

writer.writeheader()
for model, results in all_results.items():
for result in results:
passed = all(case["passed"] for case in result["results"])
writer.writerow(
{
"Model": model,
"Test": result["name"],
"Passed": "true" if passed else "false",
"Total Duration": sum(result["timings"].values()),
"Generation Time": result["timings"]["gen"],
"Run Time": result["timings"]["run"],
"Eval Time": result["timings"]["eval"],
"Commit Hash": commit_hash,
}
)

print(f"\nResults saved to {filename.resolve()}")


if __name__ == "__main__":
main()
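
The CSV written by `write_results_to_csv` has one row per (model, test) pair and lands in the project root. Since eval/main.py uses relative imports, the runner would presumably be started as a module, e.g. `python -m eval.main [test_name]` (an assumption; the entry point is not shown in this diff). A small sketch for loading and summarizing a results file, assuming the column names written above:

```python
# Sketch: summarize pass rates per model from the latest eval_results_*.csv.
# Assumes the "Model"/"Passed" columns written by write_results_to_csv.
import csv
from collections import Counter
from pathlib import Path

latest = sorted(Path(".").glob("eval_results_*.csv"))[-1]  # most recent run
passed, total = Counter(), Counter()
with open(latest, newline="") as f:
    for row in csv.DictReader(f):
        total[row["Model"]] += 1
        passed[row["Model"]] += row["Passed"] == "true"

for model in total:
    print(f"{model}: {passed[model]}/{total[model]} tests passed")
```
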
