Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Python Verifier implementation #1702

Merged
merged 12 commits into from
Mar 4, 2025
8 changes: 7 additions & 1 deletion camel/verifiers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,11 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
from .base import BaseVerifier
from .models import VerificationOutcome, VerifierInput
from .python_verifier import PythonVerifier

__all__ = ["BaseVerifier", "VerificationOutcome", "VerifierInput"]
__all__ = [
"BaseVerifier",
"VerificationOutcome",
"VerifierInput",
"PythonVerifier",
]
8 changes: 7 additions & 1 deletion camel/verifiers/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,14 @@


class VerifierInput(BaseModel):
    r"""Structured input to the verifier.

    Attributes:
        llm_response (str): The LLM response to be verified. It needs to
            be in a format that the verifier can handle.
        ground_truth (Optional[str]): The ground truth data, if available.
            (default: :obj:`None`)
    """

    llm_response: str = Field(
        # The trailing space keeps the two concatenated sentences apart.
        description="The LLM response to be verified. "
        "Needs to be in a format that the verifier can handle."
    )
    # The ground truth is optional, so the field defaults to None.
    ground_truth: Optional[str] = Field(
        None, description="The ground truth data, if available."
    )


Expand All @@ -36,6 +38,10 @@ class VerificationOutcome(Enum):
ERROR = "error"
TIMEOUT = "timeout"

def __bool__(self) -> bool:
    r"""Return ``True`` only for ``VerificationOutcome.SUCCESS``.

    Every other member evaluates as falsy, so an outcome can be used
    directly in a boolean context.
    """
    is_success = self is VerificationOutcome.SUCCESS
    return is_success


class VerificationResult(BaseModel):
r"""Structured result from a verification."""
Expand Down
179 changes: 179 additions & 0 deletions camel/verifiers/python_verifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========

import asyncio
import os
import shutil
import subprocess
import tempfile
import venv
from typing import List, Optional

from camel.logger import get_logger
from camel.verifiers import BaseVerifier

from .models import VerificationOutcome, VerificationResult, VerifierInput

logger = get_logger(__name__)


class PythonVerifier(BaseVerifier):
r"""
The PythonVerifier class verifies Python-based implementations
by executing them in an isolated virtual environment.

Features:
- Creates a virtual environment with a specified Python version.
- Installs required packages before executing the provided script.
- Executes the script and compares the output against a ground truth,
if supplied.
- Automatically cleans up the virtual environment after execution.

The verification process ensures that the code runs in a controlled
environment, minimizing external dependencies and conflicts.
"""

def __init__(
self,
python_version: str = "python3",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this variable is not used

timeout: Optional[float] = 30.0,
required_packages: Optional[List[str]] = None,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we let users run the code with the interpreters that are already integrated in CAMEL? https://github.com/camel-ai/camel/tree/master/camel/interpreters

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this would be a good feature request for later. As of now, we can just leave it like this so we can accelerate the Loong project.

):
r"""Initializes the PythonVerifier.

Args:
python_version (str, optional): The Python version to use for
the virtual environment. Defaults to "python3".
timeout (Optional[float], optional): The execution timeout in
seconds. Defaults to 30.0.
required_packages (Optional[List[str]], optional): A list of
packages to install in the virtual environment.
Defaults to an empty list.
"""
super().__init__(timeout=timeout)
self.python_version = python_version
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is not used

self.venv_path: Optional[str] = None
self.required_packages = required_packages or []

async def _setup(self) -> None:
r"""Set up a virtual environment for execution
and install required packages."""
self.venv_path = tempfile.mkdtemp()
venv.create(self.venv_path, with_pip=True)
logger.info(f"Virtual environment created at {self.venv_path}")

venv_pip = os.path.join(self.venv_path, "bin", "pip")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

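The path to the executables differs between operating systems (for example, a venv on Windows uses `Scripts` instead of `bin`), so hard-coding `bin` here will not work everywhere.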


if self.required_packages:
try:
subprocess.run(
[venv_pip, "install", *self.required_packages],
check=True,
capture_output=True,
)
logger.info(
"Installed required packages:"
f"{', '.join(self.required_packages)}"
)
except subprocess.CalledProcessError as e:
logger.error(
"Failed to install required packages: "
f"{e.stderr.decode().strip()}"
)

async def _cleanup(self) -> None:
    r"""Remove the temporary virtual environment, if one exists.

    Does nothing when no environment path is recorded. Otherwise the
    directory tree is deleted, the removal is logged, and the stored
    path is reset to ``None``.
    """
    env_dir = self.venv_path
    if not env_dir:
        # Nothing was set up (or it was already cleaned), so there is
        # nothing to delete.
        return

    shutil.rmtree(env_dir)
    logger.info(f"Virtual environment at {env_dir} removed")
    self.venv_path = None

async def _verify_implementation(
self, result: VerifierInput
) -> VerificationResult:
r"""Executes the LLM-generated response in a
Python virtual environment."""
if not self.venv_path:
return VerificationResult(
status=VerificationOutcome.ERROR,
result="",
error_message="Virtual environment is not set up.",
)

script = result.llm_response.strip()
venv_python = os.path.join(self.venv_path, "bin", "python")

if not os.path.exists(venv_python):
return VerificationResult(
status=VerificationOutcome.ERROR,
result="",
error_message="Python binary not found in virtual environment",
)

try:
process = await asyncio.create_subprocess_exec(
venv_python,
"-c",
script,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)

stdout, stderr = await asyncio.wait_for(
process.communicate(), timeout=self._timeout
)

output_result = stdout.decode().strip()
error_output = stderr.decode().strip()

if process.returncode == 0:
# If ground truth is provided, compare it with the result
if result.ground_truth is not None:
if output_result == str(result.ground_truth).strip():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this could be problematic if there are whitespace differences or line ending differences

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an area that will have to be improved in general. We will need to use math-verify or extractors for more comprehensive semantic matching.

return VerificationResult(
status=VerificationOutcome.SUCCESS,
result=output_result,
)
else:
return VerificationResult(
status=VerificationOutcome.FAILURE,
error_message="Output doesn't match ground truth",
result=output_result,
)
else:
return VerificationResult(
status=VerificationOutcome.SUCCESS,
result=output_result,
)

else:
return VerificationResult(
status=VerificationOutcome.ERROR,
error_message=error_output,
result=output_result,
)

except asyncio.TimeoutError:
return VerificationResult(
status=VerificationOutcome.TIMEOUT,
result="",
error_message="Execution timed out.",
)

except Exception as e:
return VerificationResult(
status=VerificationOutcome.ERROR,
result="",
error_message=f"Execution error: {e}",
)
43 changes: 43 additions & 0 deletions examples/verifier/python_verifier_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import asyncio

from camel.verifiers import PythonVerifier, VerifierInput

numpy_test_code = """
import numpy as np
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
result = np.dot(a, b)
print(result)
"""


async def main() -> None:
    r"""Verify the NumPy snippet against a correct and a wrong ground truth.

    The verifier is set up once, used for both checks inside a single
    event loop, and always cleaned up afterwards, even if a verification
    call raises.
    """
    verifier = PythonVerifier(required_packages=["numpy"])
    await verifier.setup()
    try:
        # Since the output of the above numpy code evaluates to 32,
        # we expect the verification outcome to be a success.
        response = VerifierInput(
            llm_response=numpy_test_code, ground_truth="32"
        )
        result = await verifier.verify(response)
        print(f"Result: {result.status}")

        # Now we expect the VerificationOutcome to be a failure,
        # because the answer is wrong.
        response = VerifierInput(
            llm_response=numpy_test_code, ground_truth="40"
        )
        result = await verifier.verify(response)
        print(f"Result: {result.status}")
    finally:
        # Release the verifier's resources no matter how the checks ended.
        await verifier.cleanup()


if __name__ == "__main__":
    asyncio.run(main())
Loading