Commit

feat: GenAI Evaluation: Release GenAI Evaluation SDK Agent Evaluation features to vertexai.preview module.

PiperOrigin-RevId: 698077630
vertex-sdk-bot authored and copybara-github committed Nov 19, 2024
1 parent df2c650 commit 41cd5a8
Showing 18 changed files with 5,234 additions and 18 deletions.
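
In brief, this commit adds agent evaluation to the preview Gen AI Evaluation SDK: an `EvalTask` in `vertexai.preview.evaluation` can now run inference on a runnable agent and score its tool-call trajectory. A minimal usage sketch, pieced together from the tests below, might look like the following; the project ID, location, dataset values, and the `EchoAgent` stub are illustrative assumptions, and the exact payload a real runnable's `query()` returns may differ from this stub.

```python
import pandas as pd
import vertexai
from vertexai.preview import evaluation as evaluation_preview


class EchoAgent:
    """Toy stand-in for a reasoning_engines.Queryable runnable (illustrative only)."""

    def query(self, **kwargs):
        # Payload shape mirrors _MOCK_RUNNABLE_INFERENCE_RESPONSE in the tests below;
        # the contract of a real deployed agent may differ.
        return {
            "input": kwargs,
            "output": "sunny",
            "intermediate_steps": [
                [{"kwargs": {"tool": "get_weather"}, "tool_output": "sunny"}],
            ],
        }


vertexai.init(project="my-project", location="us-central1")  # illustrative values

# Agent-eval dataset: prompts plus expected tool-call trajectories. No "response"
# column is needed because responses are produced by the runnable at eval time.
dataset = pd.DataFrame(
    {
        "prompt": ["look up the weather", "book a table"],
        "reference_trajectory": [
            [{"tool_name": "get_weather"}],
            [{"tool_name": "find_restaurant"}, {"tool_name": "book_table"}],
        ],
    }
)

eval_task = evaluation_preview.eval_task.EvalTask(
    dataset=dataset,
    metrics=["trajectory_exact_match"],
)

result = eval_task.evaluate(runnable=EchoAgent())
print(result.summary_metrics)  # e.g. row_count, trajectory_exact_match/mean and /std
```

The first test below also exercises a model-based metric on the generated responses (`PointwisePreview.COHERENCE` together with a `prompt_template`), which follows the same pattern.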
186 changes: 178 additions & 8 deletions tests/unit/vertexai/test_evaluation.py
@@ -32,6 +32,12 @@
from google.cloud.aiplatform_v1.types import (
evaluation_service as gapic_evaluation_service_types,
)
from google.cloud.aiplatform_v1beta1.services import (
evaluation_service as gapic_evaluation_services_preview,
)
from google.cloud.aiplatform_v1beta1.types import (
evaluation_service as gapic_evaluation_service_types_preview,
)
from vertexai import evaluation
from vertexai import generative_models
from vertexai.evaluation import _base as eval_base
@@ -45,13 +51,19 @@
)
from vertexai.evaluation.metrics import pairwise_metric
from vertexai.evaluation.metrics import pointwise_metric
from vertexai.preview import evaluation as evaluation_preview
from vertexai.preview import reasoning_engines
import numpy as np
import pandas as pd
import pytest


EvalTask = eval_task.EvalTask
EvalTaskPreview = evaluation_preview.eval_task.EvalTask
Pointwise = metric_prompt_template_examples.MetricPromptTemplateExamples.Pointwise
PointwisePreview = (
evaluation_preview.metrics.metric_prompt_template_examples.MetricPromptTemplateExamples.Pointwise
)
Pairwise = metric_prompt_template_examples.MetricPromptTemplateExamples.Pairwise

_TEST_PROJECT = "test-project"
Expand Down Expand Up @@ -142,6 +154,15 @@
"instruction": ["test", "instruction"],
}
)
_TEST_AGENT_EVAL_DATASET_WITHOUT_RESPONSE = pd.DataFrame(
{
"prompt": ["test_input1", "test_input2"],
"reference_trajectory": [
[{"tool_name": "test_tool1"}, {"tool_name": "test_tool2"}],
[{"tool_name": "test_tool3"}, {"tool_name": "test_tool4"}],
],
},
)
_TEST_EVAL_DATASET_ALL_INCLUDED = pd.DataFrame(
{
"prompt": ["test_prompt", "text_prompt"],
Expand Down Expand Up @@ -300,6 +321,25 @@
{response}
"""

_MOCK_RUNNABLE_INFERENCE_RESPONSE = [
{
"input": "test_input",
"output": "test_output",
"intermediate_steps": [
[{"kwargs": {"tool": "test_tool1"}, "tool_output": "test_tool_output"}],
[{"kwargs": {"tool": "test_tool2"}, "tool_output": "test_tool_output"}],
],
},
{
"input": "test_input",
"output": "test_output",
"intermediate_steps": [
[{"kwargs": {"tool": "test_tool2"}, "tool_output": "test_tool_output"}],
[{"kwargs": {"tool": "test_tool3"}, "tool_output": "test_tool_output"}],
],
},
]

_MOCK_EXACT_MATCH_RESULT = (
gapic_evaluation_service_types.EvaluateInstancesResponse(
exact_match_results=gapic_evaluation_service_types.ExactMatchResults(
@@ -316,6 +356,26 @@
)
),
)
_MOCK_TRAJECTORY_EXACT_MATCH_RESULT = (
gapic_evaluation_service_types_preview.EvaluateInstancesResponse(
trajectory_exact_match_results=gapic_evaluation_service_types_preview.TrajectoryExactMatchResults(
trajectory_exact_match_metric_values=[
gapic_evaluation_service_types_preview.TrajectoryExactMatchMetricValue(
score=1.0
),
]
)
),
gapic_evaluation_service_types_preview.EvaluateInstancesResponse(
trajectory_exact_match_results=gapic_evaluation_service_types_preview.TrajectoryExactMatchResults(
trajectory_exact_match_metric_values=[
gapic_evaluation_service_types_preview.TrajectoryExactMatchMetricValue(
score=0.0
),
]
)
),
)
_MOCK_POINTWISE_RESULT = (
gapic_evaluation_service_types.EvaluateInstancesResponse(
pointwise_metric_result=gapic_evaluation_service_types.PointwiseMetricResult(
Expand Down Expand Up @@ -354,6 +414,18 @@
)
),
)
_MOCK_COHERENCE_RESULT = (
gapic_evaluation_service_types_preview.EvaluateInstancesResponse(
pointwise_metric_result=gapic_evaluation_service_types_preview.PointwiseMetricResult(
score=5, explanation="explanation"
)
),
gapic_evaluation_service_types_preview.EvaluateInstancesResponse(
pointwise_metric_result=gapic_evaluation_service_types_preview.PointwiseMetricResult(
score=4, explanation="explanation"
)
),
)
_MOCK_PAIRWISE_SUMMARIZATION_QUALITY_RESULT = (
gapic_evaluation_service_types.EvaluateInstancesResponse(
pairwise_metric_result=gapic_evaluation_service_types.PairwiseMetricResult(
Expand Down Expand Up @@ -1177,6 +1249,106 @@ def test_eval_result_experiment_run_logging(self):
)


@pytest.mark.usefixtures("google_auth_mock")
class TestAgentEvaluation:
def setup_method(self):
vertexai.init(
project=_TEST_PROJECT,
location=_TEST_LOCATION,
)

def teardown_method(self):
initializer.global_pool.shutdown(wait=True)

@pytest.mark.parametrize("api_transport", ["grpc", "rest"])
def test_runnable_response_eval_with_runnable_inference(self, api_transport):
aiplatform.init(
project=_TEST_PROJECT,
location=_TEST_LOCATION,
api_transport=api_transport,
)
mock_runnable = mock.create_autospec(reasoning_engines.Queryable, instance=True)
mock_runnable.query.return_value = _MOCK_RUNNABLE_INFERENCE_RESPONSE

test_metrics = [PointwisePreview.COHERENCE]
test_eval_task = EvalTaskPreview(
dataset=_TEST_AGENT_EVAL_DATASET_WITHOUT_RESPONSE, metrics=test_metrics
)
mock_metric_results = _MOCK_COHERENCE_RESULT
with mock.patch.object(
target=gapic_evaluation_services_preview.EvaluationServiceClient,
attribute="evaluate_instances",
side_effect=mock_metric_results,
):
test_result = test_eval_task.evaluate(
runnable=mock_runnable,
prompt_template="test prompt template",
)

assert test_result.summary_metrics["row_count"] == 2
assert test_result.summary_metrics["coherence/mean"] == 4.5
assert test_result.summary_metrics["coherence/std"] == pytest.approx(0.7, 0.1)
assert set(test_result.metrics_table.columns.values) == set(
[
"prompt",
"reference_trajectory",
"response",
"latency_in_seconds",
"failure",
"predicted_trajectory",
"coherence/score",
"coherence/explanation",
]
)
assert list(test_result.metrics_table["coherence/score"].values) == [5, 4]
assert list(test_result.metrics_table["coherence/explanation"].values) == [
"explanation",
"explanation",
]

@pytest.mark.parametrize("api_transport", ["grpc", "rest"])
def test_runnable_trajectory_eval_with_runnable_inference(self, api_transport):
aiplatform.init(
project=_TEST_PROJECT,
location=_TEST_LOCATION,
api_transport=api_transport,
)
mock_runnable = mock.create_autospec(reasoning_engines.Queryable, instance=True)
mock_runnable.query.return_value = _MOCK_RUNNABLE_INFERENCE_RESPONSE

test_metrics = ["trajectory_exact_match"]
test_eval_task = EvalTaskPreview(
dataset=_TEST_AGENT_EVAL_DATASET_WITHOUT_RESPONSE, metrics=test_metrics
)
mock_metric_results = _MOCK_TRAJECTORY_EXACT_MATCH_RESULT
with mock.patch.object(
target=gapic_evaluation_services_preview.EvaluationServiceClient,
attribute="evaluate_instances",
side_effect=mock_metric_results,
):
test_result = test_eval_task.evaluate(runnable=mock_runnable)

assert test_result.summary_metrics["row_count"] == 2
assert test_result.summary_metrics["trajectory_exact_match/mean"] == 0.5
assert test_result.summary_metrics[
"trajectory_exact_match/std"
] == pytest.approx(0.7, 0.1)
assert set(test_result.metrics_table.columns.values) == set(
[
"prompt",
"response",
"latency_in_seconds",
"failure",
"predicted_trajectory",
"reference_trajectory",
"trajectory_exact_match/score",
]
)
assert list(
test_result.metrics_table["trajectory_exact_match/score"].values
) == [1.0, 0.0]


@pytest.mark.usefixtures("google_auth_mock")
class TestEvaluationErrors:
def setup_method(self):
Expand Down Expand Up @@ -1376,11 +1548,10 @@ def test_evaluate_baseline_model_response_column_not_provided(
):
test_eval_task.evaluate()

def test_evaluate_response_column_not_provided(
self,
):
@pytest.mark.parametrize("eval_task_version", [EvalTask, EvalTaskPreview])
def test_evaluate_response_column_not_provided(self, eval_task_version):
test_eval_dataset = _TEST_EVAL_DATASET_SINGLE
test_eval_task = EvalTask(
test_eval_task = eval_task_version(
dataset=test_eval_dataset,
metrics=["exact_match"],
)
@@ -1395,11 +1566,10 @@ def test_evaluate_response_column_not_provided(self, eval_task_version):
):
test_eval_task.evaluate()

def test_evaluate_reference_column_not_provided(
self,
):
@pytest.mark.parametrize("eval_task_version", [EvalTask, EvalTaskPreview])
def test_evaluate_reference_column_not_provided(self, eval_task_version):
test_eval_dataset = pd.DataFrame({"response": ["test", "text"]})
test_eval_task = EvalTask(
test_eval_task = eval_task_version(
dataset=test_eval_dataset,
metrics=["exact_match"],
)
8 changes: 4 additions & 4 deletions vertexai/preview/evaluation/__init__.py
@@ -16,10 +16,10 @@
#
"""Vertex Gen AI Evaluation Service Module."""

from vertexai.evaluation import _base
from vertexai.evaluation import eval_task
from vertexai.evaluation import metrics
from vertexai.evaluation import prompt_template
from vertexai.preview.evaluation import _base
from vertexai.preview.evaluation import eval_task
from vertexai.preview.evaluation import metrics
from vertexai.preview.evaluation import prompt_template


EvalResult = _base.EvalResult
97 changes: 97 additions & 0 deletions vertexai/preview/evaluation/_base.py
@@ -0,0 +1,97 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Base classes for evaluation."""


import dataclasses
from typing import Dict, List, Optional, TYPE_CHECKING, Union

from google.cloud.aiplatform_v1beta1.services import (
evaluation_service as gapic_evaluation_services,
)
from vertexai.preview.evaluation.metrics import (
_base as metrics_base,
)

if TYPE_CHECKING:
import pandas as pd


@dataclasses.dataclass
class EvaluationRunConfig:
"""Evaluation Run Configurations.
Attributes:
dataset: The dataset to evaluate.
metrics: The list of metric names, or Metric instances to evaluate.
metric_column_mapping: An optional dictionary that maps metric prompt
template input variable names to the evaluation dataset column names
used during evaluation. For example, if the input_variables of the
metric prompt template are ["context", "reference"],
metric_column_mapping can be {"context": "news_context", "reference":
"ground_truth", "response": "model_1_response"} when the dataset has
the columns "news_context", "ground_truth", and "model_1_response".
client: The evaluation service client.
evaluation_service_qps: The custom QPS limit for the evaluation service.
retry_timeout: How long to keep retrying the evaluation requests, in
seconds.
"""

dataset: "pd.DataFrame"
metrics: List[Union[str, metrics_base._Metric]]
metric_column_mapping: Dict[str, str]
client: gapic_evaluation_services.EvaluationServiceClient
evaluation_service_qps: float
retry_timeout: float

def validate_dataset_column(self, column_name: str) -> None:
"""Validates that the column names in the column map are in the dataset.
Args:
column_name: The column name to validate.
Raises:
KeyError: If any of the column names are not in the dataset.
"""
if (
self.metric_column_mapping.get(column_name, column_name)
not in self.dataset.columns
):
raise KeyError(
"Required column"
f" `{self.metric_column_mapping.get(column_name, column_name)}` not"
" found in the evaluation dataset. The columns in the evaluation"
f" dataset are {list(self.dataset.columns)}."
)


@dataclasses.dataclass
class EvalResult:
"""Evaluation result.
Attributes:
summary_metrics: A dictionary of summary evaluation metrics for an
evaluation run.
metrics_table: A pandas.DataFrame table containing evaluation dataset
inputs, predictions, explanations, and metric results per row.
metadata: The metadata for the evaluation run.
"""

summary_metrics: Dict[str, float]
metrics_table: Optional["pd.DataFrame"] = None
metadata: Optional[Dict[str, str]] = None
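
To make the column-mapping contract above concrete, here is a small hedged sketch of how `validate_dataset_column` resolves names through `metric_column_mapping`. The dataset columns, the mapping, and the mocked service client are assumptions for illustration only.

```python
from unittest import mock

import pandas as pd
from google.cloud.aiplatform_v1beta1.services import evaluation_service
from vertexai.preview.evaluation import _base

dataset = pd.DataFrame({"news_context": ["..."], "ground_truth": ["..."]})

run_config = _base.EvaluationRunConfig(
    dataset=dataset,
    metrics=["exact_match"],
    metric_column_mapping={"context": "news_context", "reference": "ground_truth"},
    client=mock.create_autospec(
        evaluation_service.EvaluationServiceClient, instance=True
    ),
    evaluation_service_qps=1.0,
    retry_timeout=120.0,
)

run_config.validate_dataset_column("context")  # resolves to "news_context": passes

try:
    run_config.validate_dataset_column("response")  # unmapped and not a column
except KeyError as err:
    print(err)  # names the missing column and lists the dataset columns
```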