feat: Python Verifier implementation (#1702)

Co-authored-by: Wendong <w3ndong.fan@gmail.com>
camel-ai · Mar 4, 2025 · f47897c · f47897c
1 parent 84185a5
commit f47897c
Show file tree

Hide file tree

Showing 5 changed files with 409 additions and 2 deletions.
diff --git a/camel/verifiers/__init__.py b/camel/verifiers/__init__.py
@@ -13,5 +13,11 @@
 # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
 from .base import BaseVerifier
 from .models import VerificationOutcome, VerifierInput
+from .python_verifier import PythonVerifier
 
-__all__ = ["BaseVerifier", "VerificationOutcome", "VerifierInput"]
+__all__ = [
+    "BaseVerifier",
+    "VerificationOutcome",
+    "VerifierInput",
+    "PythonVerifier",
+]
diff --git a/camel/verifiers/models.py b/camel/verifiers/models.py
@@ -19,12 +19,14 @@
 
 
 class VerifierInput(BaseModel):
+    r"""Structured input to the verifier"""
+
     llm_response: str = Field(
         description="The LLM response to be verified."
         "Needs to be in a format that the verifier can handle."
     )
     ground_truth: Optional[str] = Field(
-        description="The ground truth data, if available."
+        None, description="The ground truth data, if available."
     )
 
 
@@ -36,6 +38,10 @@ class VerificationOutcome(Enum):
     ERROR = "error"
     TIMEOUT = "timeout"
 
+    def __bool__(self):
+        r"""Only VerificationOutcome.SUCCESS is truthy; others are falsy."""
+        return self is VerificationOutcome.SUCCESS
+
 
 class VerificationResult(BaseModel):
     r"""Structured result from a verification."""

diff --git a/camel/verifiers/python_verifier.py b/camel/verifiers/python_verifier.py
@@ -0,0 +1,202 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+
+import asyncio
+import os
+import shutil
+import subprocess
+import tempfile
+import venv
+from typing import List, Optional
+
+from camel.logger import get_logger
+from camel.verifiers import BaseVerifier
+
+from .models import VerificationOutcome, VerificationResult, VerifierInput
+
+logger = get_logger(__name__)
+
+
+class PythonVerifier(BaseVerifier):
+    r"""The PythonVerifier class verifies Python-based implementations
+    by executing them in an isolated virtual environment.
+
+    Features:
+    - Creates a virtual environment with a specified Python version.
+    - Installs required packages before executing the provided script.
+    - Executes the script and compares the output against a ground truth,
+      if supplied.
+    - Automatically cleans up the virtual environment after execution.
+
+    The verification process ensures that the code runs in a controlled
+    environment, minimizing external dependencies and conflicts.
+    """
+
+    def __init__(
+        self,
+        timeout: Optional[float] = 30.0,
+        required_packages: Optional[List[str]] = None,
+    ):
+        r"""Initializes the PythonVerifier.
+
+        Args:
+            timeout (Optional[float], optional): The execution timeout in
+                seconds. (default: :obj:`30.0`)
+            required_packages (Optional[List[str]], optional): A list of
+                packages to install in the virtual environment.
+                (default: :obj:`None`)
+        """
+        # TODO: Use CAMEL's Interpreter to execute the code
+        super().__init__(timeout=timeout)
+        self.venv_path: Optional[str] = None
+        self.required_packages = required_packages or []
+
+        if os.name == 'nt':  # Windows
+            self.bin_dir = 'Scripts'
+        else:  # Unix-like systems
+            self.bin_dir = 'bin'
+
+    async def _setup(self) -> None:
+        r"""Set up a virtual environment for execution
+        and install required packages.
+        """
+        self.venv_path = tempfile.mkdtemp()
+        venv.create(self.venv_path, with_pip=True)
+        logger.info(f"Virtual environment created at {self.venv_path}")
+
+        venv_pip = os.path.join(self.venv_path, self.bin_dir, "pip")
+
+        if self.required_packages:
+            try:
+                subprocess.run(
+                    [venv_pip, "install", *self.required_packages],
+                    check=True,
+                    capture_output=True,
+                )
+                logger.info(
+                    "Installed required packages:"
+                    f"{', '.join(self.required_packages)}"
+                )
+            except subprocess.CalledProcessError as e:
+                logger.error(
+                    "Failed to install required packages: "
+                    f"{e.stderr.decode().strip()}"
+                )
+
+    async def _cleanup(self) -> None:
+        r"""Clean up the virtual environment."""
+        if self.venv_path:
+            shutil.rmtree(self.venv_path)
+            logger.info(f"Virtual environment at {self.venv_path} removed")
+            self.venv_path = None
+
+    async def _verify_implementation(
+        self, result: VerifierInput
+    ) -> VerificationResult:
+        r"""Executes the LLM-generated response in a Python virtual
+        environment.
+
+        Args:
+            result (VerifierInput): Contains the LLM-generated Python code to
+                execute and optional ground truth for comparison.
+
+        Returns:
+            VerificationResult: Contains verification status (SUCCESS/FAILURE/
+                ERROR), execution output, error messages if any, and execution
+                duration.
+
+        Raises:
+            asyncio.TimeoutError: If execution exceeds the configured timeout.
+            Exception: Any unexpected errors during execution are caught and
+                converted to an ERROR verification result.
+        """
+        if not self.venv_path:
+            return VerificationResult(
+                status=VerificationOutcome.ERROR,
+                result="",
+                error_message="Virtual environment is not set up.",
+            )
+
+        script = result.llm_response.strip()
+        venv_python = os.path.join(self.venv_path, self.bin_dir, "python")
+
+        if not os.path.exists(venv_python):
+            return VerificationResult(
+                status=VerificationOutcome.ERROR,
+                result="",
+                error_message="Python binary not found in virtual environment",
+            )
+
+        try:
+            process = await asyncio.create_subprocess_exec(
+                venv_python,
+                "-c",
+                script,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+            )
+
+            stdout, stderr = await asyncio.wait_for(
+                process.communicate(), timeout=self._timeout
+            )
+
+            output_result = stdout.decode().strip()
+            error_output = stderr.decode().strip()
+
+            if process.returncode == 0:
+                # If ground truth is provided, compare it with the result
+                if result.ground_truth is not None:
+                    # Normalize both strings by removing extra whitespace
+                    normalized_output = ' '.join(output_result.strip().split())
+                    normalized_truth = ' '.join(
+                        str(result.ground_truth).strip().split()
+                    )
+
+                    if normalized_output == normalized_truth:
+                        return VerificationResult(
+                            status=VerificationOutcome.SUCCESS,
+                            result=output_result,
+                        )
+                    else:
+                        return VerificationResult(
+                            status=VerificationOutcome.FAILURE,
+                            error_message="Output doesn't match ground truth",
+                            result=output_result,
+                        )
+                else:
+                    return VerificationResult(
+                        status=VerificationOutcome.SUCCESS,
+                        result=output_result,
+                    )
+
+            else:
+                return VerificationResult(
+                    status=VerificationOutcome.ERROR,
+                    error_message=error_output,
+                    result=output_result,
+                )
+
+        except asyncio.TimeoutError:
+            return VerificationResult(
+                status=VerificationOutcome.TIMEOUT,
+                result="",
+                error_message="Execution timed out.",
+            )
+
+        except Exception as e:
+            return VerificationResult(
+                status=VerificationOutcome.ERROR,
+                result="",
+                error_message=f"Execution error: {e}",
+            )
diff --git a/examples/verifier/python_verifier_example.py b/examples/verifier/python_verifier_example.py
@@ -0,0 +1,43 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+import asyncio
+
+from camel.verifiers import PythonVerifier, VerifierInput
+
+verifier = PythonVerifier(required_packages=["numpy"])
+asyncio.run(verifier.setup())
+
+numpy_test_code = """
+import numpy as np
+a = np.array([1, 2, 3])
+b = np.array([4, 5, 6])
+result = np.dot(a, b)
+print(result)
+"""
+
+
+# Since the output of the above numpy code evaluates to 32,
+# we expect the verification outcome to be a success.
+response = VerifierInput(llm_response=numpy_test_code, ground_truth="32")
+result = asyncio.run(verifier.verify(response))
+print(f"Result: {result.status}")
+
+response = VerifierInput(llm_response=numpy_test_code, ground_truth="40")
+result = asyncio.run(verifier.verify(response))
+
+# Now we expect the VerificationOutcome to be a failure,
+# because the answer is wrong.
+print(f"Result: {result.status}")
+
+asyncio.run(verifier.cleanup())