-
Notifications
You must be signed in to change notification settings - Fork 778
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: Python Verifier implementation #1702
Changes from 8 commits
ff2251d
ea62686
456c4ed
da38ea0
8bc34f0
01a6890
b29fbb6
a64032c
f70a448
ff53d1a
7f1f871
af71fd4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= | ||
|
||
import asyncio | ||
import os | ||
import shutil | ||
import subprocess | ||
import tempfile | ||
import venv | ||
from typing import List, Optional | ||
|
||
from camel.logger import get_logger | ||
from camel.verifiers import BaseVerifier | ||
|
||
from .models import VerificationOutcome, VerificationResult, VerifierInput | ||
|
||
logger = get_logger(__name__) | ||
|
||
|
||
class PythonVerifier(BaseVerifier):
    r"""Verifies Python code by executing it in an isolated virtual
    environment.

    Features:
    - Creates a temporary virtual environment with :mod:`venv`.
    - Installs required packages before executing the provided script.
    - Executes the script and compares the output against a ground truth,
        if supplied.
    - Removes the virtual environment when the verifier is cleaned up.

    The verification process ensures that the code runs in a controlled
    environment, minimizing external dependencies and conflicts.

    Note:
        Review feedback suggested optionally letting users run code through
        the interpreters shipped in ``camel/interpreters``. That was
        deferred as a later feature request and is not implemented here.
    """

    def __init__(
        self,
        python_version: str = "python3",
        timeout: Optional[float] = 30.0,
        required_packages: Optional[List[str]] = None,
    ):
        r"""Initializes the PythonVerifier.

        Args:
            python_version (str, optional): The requested Python
                interpreter name. It is stored on the instance but is not
                used yet: the environment is built with
                :func:`venv.create`, which uses the interpreter running
                this process. (default: :obj:`"python3"`)
            timeout (Optional[float], optional): The execution timeout in
                seconds. (default: :obj:`30.0`)
            required_packages (Optional[List[str]], optional): A list of
                packages to install in the virtual environment. ``None``
                means that nothing is installed. (default: :obj:`None`)
        """
        super().__init__(timeout=timeout)
        # NOTE(review): flagged as unused in review; kept for interface
        # compatibility until interpreter selection is implemented.
        self.python_version = python_version
        self.venv_path: Optional[str] = None
        self.required_packages = required_packages or []

    def _venv_executable(self, name: str) -> str:
        r"""Returns the path of an executable inside the virtual
        environment.

        ``venv`` puts executables in ``Scripts`` (with an ``.exe`` suffix)
        on Windows and in ``bin`` on POSIX systems, so the location must
        not be hard-coded.

        Args:
            name (str): Executable name without extension, e.g. ``"pip"``.

        Returns:
            str: The platform-specific path of the executable.

        Raises:
            RuntimeError: If the virtual environment has not been created.
        """
        if self.venv_path is None:
            raise RuntimeError("Virtual environment is not set up.")
        if os.name == "nt":
            return os.path.join(self.venv_path, "Scripts", f"{name}.exe")
        return os.path.join(self.venv_path, "bin", name)

    @staticmethod
    def _normalize_newlines(text: str) -> str:
        r"""Returns ``text`` with ``\r\n`` / ``\r`` converted to ``\n`` and
        surrounding whitespace stripped, so that outputs which differ only
        in line-ending style compare equal."""
        return text.replace("\r\n", "\n").replace("\r", "\n").strip()

    async def _setup(self) -> None:
        r"""Set up a virtual environment for execution
        and install required packages.

        A failed package installation is logged but does not abort the
        setup; the script is then executed without those packages.
        """
        self.venv_path = tempfile.mkdtemp()
        venv.create(self.venv_path, with_pip=True)
        logger.info(f"Virtual environment created at {self.venv_path}")

        if not self.required_packages:
            return

        venv_pip = self._venv_executable("pip")
        try:
            subprocess.run(
                [venv_pip, "install", *self.required_packages],
                check=True,
                capture_output=True,
            )
        except subprocess.CalledProcessError as e:
            logger.error(
                "Failed to install required packages: "
                f"{e.stderr.decode().strip()}"
            )
        else:
            logger.info(
                "Installed required packages: "
                f"{', '.join(self.required_packages)}"
            )

    async def _cleanup(self) -> None:
        r"""Clean up the virtual environment."""
        if not self.venv_path:
            return
        shutil.rmtree(self.venv_path)
        logger.info(f"Virtual environment at {self.venv_path} removed")
        self.venv_path = None

    async def _verify_implementation(
        self, result: VerifierInput
    ) -> VerificationResult:
        r"""Executes the LLM-generated response in a
        Python virtual environment.

        Args:
            result (VerifierInput): Holds the script to run in
                ``llm_response`` and, optionally, the expected output in
                ``ground_truth``.

        Returns:
            VerificationResult: ``SUCCESS`` if the script exits with code 0
                and (when a ground truth is given) its output matches,
                ``FAILURE`` on an output mismatch, ``TIMEOUT`` if the
                script exceeds the timeout, and ``ERROR`` otherwise.
        """
        if not self.venv_path:
            return VerificationResult(
                status=VerificationOutcome.ERROR,
                result="",
                error_message="Virtual environment is not set up.",
            )

        script = result.llm_response.strip()
        venv_python = self._venv_executable("python")

        if not os.path.exists(venv_python):
            return VerificationResult(
                status=VerificationOutcome.ERROR,
                result="",
                error_message="Python binary not found in virtual environment",
            )

        process = None
        try:
            process = await asyncio.create_subprocess_exec(
                venv_python,
                "-c",
                script,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )

            stdout, stderr = await asyncio.wait_for(
                process.communicate(), timeout=self._timeout
            )

            output_result = stdout.decode().strip()
            error_output = stderr.decode().strip()

            if process.returncode != 0:
                return VerificationResult(
                    status=VerificationOutcome.ERROR,
                    error_message=error_output,
                    result=output_result,
                )

            # Without a ground truth, a clean exit counts as success.
            if result.ground_truth is None:
                return VerificationResult(
                    status=VerificationOutcome.SUCCESS,
                    result=output_result,
                )

            # NOTE(review): exact string matching is fragile. Line endings
            # are normalised here; semantic matching (e.g. math-verify or
            # extractors) was left as future work.
            expected = self._normalize_newlines(str(result.ground_truth))
            if self._normalize_newlines(output_result) == expected:
                return VerificationResult(
                    status=VerificationOutcome.SUCCESS,
                    result=output_result,
                )
            return VerificationResult(
                status=VerificationOutcome.FAILURE,
                error_message="Output doesn't match ground truth",
                result=output_result,
            )

        except asyncio.TimeoutError:
            # wait_for only cancels communicate(); the child keeps running
            # unless it is killed explicitly.
            if process is not None and process.returncode is None:
                try:
                    process.kill()
                except ProcessLookupError:
                    # The process exited between the check and the kill.
                    pass
                await process.wait()
            return VerificationResult(
                status=VerificationOutcome.TIMEOUT,
                result="",
                error_message="Execution timed out.",
            )

        except Exception as e:
            return VerificationResult(
                status=VerificationOutcome.ERROR,
                result="",
                error_message=f"Execution error: {e}",
            )
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= | ||
import asyncio | ||
|
||
from camel.verifiers import PythonVerifier, VerifierInput | ||
|
||
numpy_test_code = """
import numpy as np
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
result = np.dot(a, b)
print(result)
"""


async def main() -> None:
    r"""Run the NumPy snippet through a PythonVerifier twice: once with the
    correct ground truth and once with a wrong one.

    Everything runs inside a single event loop, and the verifier is always
    cleaned up, even if a verification call raises.
    """
    verifier = PythonVerifier(required_packages=["numpy"])
    await verifier.setup()
    try:
        # Since the output of the above numpy code evaluates to 32,
        # we expect the verification outcome to be a success.
        response = VerifierInput(
            llm_response=numpy_test_code, ground_truth="32"
        )
        result = await verifier.verify(response)
        print(f"Result: {result.status}")

        # Now we expect the VerificationOutcome to be a failure,
        # because the answer is wrong.
        response = VerifierInput(
            llm_response=numpy_test_code, ground_truth="40"
        )
        result = await verifier.verify(response)
        print(f"Result: {result.status}")
    finally:
        # Remove the temporary virtual environment in every case.
        await verifier.cleanup()


if __name__ == "__main__":
    asyncio.run(main())
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this variable is not used