Commit

feat: GenAI Evaluation: Release GenAI Evaluation SDK Agent Evaluation features to vertexai.preview module.

PiperOrigin-RevId: 698077630
vertex-sdk-bot authored and copybara-github committed Nov 19, 2024
1 parent df2c650 commit 41cd5a8
Showing 18 changed files with 5,234 additions and 18 deletions.
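
In brief, this commit adds agent evaluation to the preview Gen AI Evaluation SDK: an `EvalTask` in `vertexai.preview.evaluation` can now run inference on a runnable agent and score its tool-call trajectory. A minimal usage sketch, pieced together from the tests below, might look like the following; the project ID, location, dataset values, and the `EchoAgent` stub are illustrative assumptions, and the exact payload a real runnable's `query()` returns may differ from this stub.

```python
import pandas as pd
import vertexai
from vertexai.preview import evaluation as evaluation_preview


class EchoAgent:
    """Toy stand-in for a reasoning_engines.Queryable runnable (illustrative only)."""

    def query(self, **kwargs):
        # Payload shape mirrors _MOCK_RUNNABLE_INFERENCE_RESPONSE in the tests below;
        # the contract of a real deployed agent may differ.
        return {
            "input": kwargs,
            "output": "sunny",
            "intermediate_steps": [
                [{"kwargs": {"tool": "get_weather"}, "tool_output": "sunny"}],
            ],
        }


vertexai.init(project="my-project", location="us-central1")  # illustrative values

# Agent-eval dataset: prompts plus expected tool-call trajectories. No "response"
# column is needed because responses are produced by the runnable at eval time.
dataset = pd.DataFrame(
    {
        "prompt": ["look up the weather", "book a table"],
        "reference_trajectory": [
            [{"tool_name": "get_weather"}],
            [{"tool_name": "find_restaurant"}, {"tool_name": "book_table"}],
        ],
    }
)

eval_task = evaluation_preview.eval_task.EvalTask(
    dataset=dataset,
    metrics=["trajectory_exact_match"],
)

result = eval_task.evaluate(runnable=EchoAgent())
print(result.summary_metrics)  # e.g. row_count, trajectory_exact_match/mean and /std
```

The first test below also exercises a model-based metric on the generated responses (`PointwisePreview.COHERENCE` together with a `prompt_template`), which follows the same pattern.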
186 changes: 178 additions & 8 deletions tests/unit/vertexai/test_evaluation.py
@@ -32,6 +32,12 @@
from google.cloud.aiplatform_v1.types import (
evaluation_service as gapic_evaluation_service_types,
)
from google.cloud.aiplatform_v1beta1.services import (
evaluation_service as gapic_evaluation_services_preview,
)
from google.cloud.aiplatform_v1beta1.types import (
evaluation_service as gapic_evaluation_service_types_preview,
)
from vertexai import evaluation
from vertexai import generative_models
from vertexai.evaluation import _base as eval_base
@@ -45,13 +51,19 @@
)
from vertexai.evaluation.metrics import pairwise_metric
from vertexai.evaluation.metrics import pointwise_metric
from vertexai.preview import evaluation as evaluation_preview
from vertexai.preview import reasoning_engines
import numpy as np
import pandas as pd
import pytest


EvalTask = eval_task.EvalTask
EvalTaskPreview = evaluation_preview.eval_task.EvalTask
Pointwise = metric_prompt_template_examples.MetricPromptTemplateExamples.Pointwise
PointwisePreview = (
evaluation_preview.metrics.metric_prompt_template_examples.MetricPromptTemplateExamples.Pointwise
)
Pairwise = metric_prompt_template_examples.MetricPromptTemplateExamples.Pairwise

_TEST_PROJECT = "test-project"
Expand Down Expand Up @@ -142,6 +154,15 @@
"instruction": ["test", "instruction"],
}
)
_TEST_AGENT_EVAL_DATASET_WITHOUT_RESPONSE = pd.DataFrame(
{
"prompt": ["test_input1", "test_input2"],
"reference_trajectory": [
[{"tool_name": "test_tool1"}, {"tool_name": "test_tool2"}],
[{"tool_name": "test_tool3"}, {"tool_name": "test_tool4"}],
],
},
)
_TEST_EVAL_DATASET_ALL_INCLUDED = pd.DataFrame(
{
"prompt": ["test_prompt", "text_prompt"],
Expand Down Expand Up @@ -300,6 +321,25 @@
{response}
"""

_MOCK_RUNNABLE_INFERENCE_RESPONSE = [
{
"input": "test_input",
"output": "test_output",
"intermediate_steps": [
[{"kwargs": {"tool": "test_tool1"}, "tool_output": "test_tool_output"}],
[{"kwargs": {"tool": "test_tool2"}, "tool_output": "test_tool_output"}],
],
},
{
"input": "test_input",
"output": "test_output",
"intermediate_steps": [
[{"kwargs": {"tool": "test_tool2"}, "tool_output": "test_tool_output"}],
[{"kwargs": {"tool": "test_tool3"}, "tool_output": "test_tool_output"}],
],
},
]

_MOCK_EXACT_MATCH_RESULT = (
gapic_evaluation_service_types.EvaluateInstancesResponse(
exact_match_results=gapic_evaluation_service_types.ExactMatchResults(
@@ -316,6 +356,26 @@
)
),
)
_MOCK_TRAJECTORY_EXACT_MATCH_RESULT = (
gapic_evaluation_service_types_preview.EvaluateInstancesResponse(
trajectory_exact_match_results=gapic_evaluation_service_types_preview.TrajectoryExactMatchResults(
trajectory_exact_match_metric_values=[
gapic_evaluation_service_types_preview.TrajectoryExactMatchMetricValue(
score=1.0
),
]
)
),
gapic_evaluation_service_types_preview.EvaluateInstancesResponse(
trajectory_exact_match_results=gapic_evaluation_service_types_preview.TrajectoryExactMatchResults(
trajectory_exact_match_metric_values=[
gapic_evaluation_service_types_preview.TrajectoryExactMatchMetricValue(
score=0.0
),
]
)
),
)
_MOCK_POINTWISE_RESULT = (
gapic_evaluation_service_types.EvaluateInstancesResponse(
pointwise_metric_result=gapic_evaluation_service_types.PointwiseMetricResult(
Expand Down Expand Up @@ -354,6 +414,18 @@
)
),
)
_MOCK_COHERENCE_RESULT = (
gapic_evaluation_service_types_preview.EvaluateInstancesResponse(
pointwise_metric_result=gapic_evaluation_service_types_preview.PointwiseMetricResult(
score=5, explanation="explanation"
)
),
gapic_evaluation_service_types_preview.EvaluateInstancesResponse(
pointwise_metric_result=gapic_evaluation_service_types_preview.PointwiseMetricResult(
score=4, explanation="explanation"
)
),
)
_MOCK_PAIRWISE_SUMMARIZATION_QUALITY_RESULT = (
gapic_evaluation_service_types.EvaluateInstancesResponse(
pairwise_metric_result=gapic_evaluation_service_types.PairwiseMetricResult(
Expand Down Expand Up @@ -1177,6 +1249,106 @@ def test_eval_result_experiment_run_logging(self):
)


@pytest.mark.usefixtures("google_auth_mock")
class TestAgentEvaluation:
def setup_method(self):
vertexai.init(
project=_TEST_PROJECT,
location=_TEST_LOCATION,
)

def teardown_method(self):
initializer.global_pool.shutdown(wait=True)

@pytest.mark.parametrize("api_transport", ["grpc", "rest"])
def test_runnable_response_eval_with_runnable_inference(self, api_transport):
aiplatform.init(
project=_TEST_PROJECT,
location=_TEST_LOCATION,
api_transport=api_transport,
)
mock_runnable = mock.create_autospec(reasoning_engines.Queryable, instance=True)
mock_runnable.query.return_value = _MOCK_RUNNABLE_INFERENCE_RESPONSE

test_metrics = [PointwisePreview.COHERENCE]
test_eval_task = EvalTaskPreview(
dataset=_TEST_AGENT_EVAL_DATASET_WITHOUT_RESPONSE, metrics=test_metrics
)
mock_metric_results = _MOCK_COHERENCE_RESULT
with mock.patch.object(
target=gapic_evaluation_services_preview.EvaluationServiceClient,
attribute="evaluate_instances",
side_effect=mock_metric_results,
):
test_result = test_eval_task.evaluate(
runnable=mock_runnable,
prompt_template="test prompt template",
)

assert test_result.summary_metrics["row_count"] == 2
assert test_result.summary_metrics["coherence/mean"] == 4.5
assert test_result.summary_metrics["coherence/std"] == pytest.approx(0.7, 0.1)
assert set(test_result.metrics_table.columns.values) == set(
[
"prompt",
"reference_trajectory",
"response",
"latency_in_seconds",
"failure",
"predicted_trajectory",
"coherence/score",
"coherence/explanation",
]
)
assert list(test_result.metrics_table["coherence/score"].values) == [5, 4]
assert list(test_result.metrics_table["coherence/explanation"].values) == [
"explanation",
"explanation",
]

@pytest.mark.parametrize("api_transport", ["grpc", "rest"])
def test_runnable_trajectory_eval_with_runnable_inference(self, api_transport):
aiplatform.init(
project=_TEST_PROJECT,
location=_TEST_LOCATION,
api_transport=api_transport,
)
mock_runnable = mock.create_autospec(reasoning_engines.Queryable, instance=True)
mock_runnable.query.return_value = _MOCK_RUNNABLE_INFERENCE_RESPONSE

test_metrics = ["trajectory_exact_match"]
test_eval_task = EvalTaskPreview(
dataset=_TEST_AGENT_EVAL_DATASET_WITHOUT_RESPONSE, metrics=test_metrics
)
mock_metric_results = _MOCK_TRAJECTORY_EXACT_MATCH_RESULT
with mock.patch.object(
target=gapic_evaluation_services_preview.EvaluationServiceClient,
attribute="evaluate_instances",
side_effect=mock_metric_results,
):
test_result = test_eval_task.evaluate(runnable=mock_runnable)

assert test_result.summary_metrics["row_count"] == 2
assert test_result.summary_metrics["trajectory_exact_match/mean"] == 0.5
assert test_result.summary_metrics[
"trajectory_exact_match/std"
] == pytest.approx(0.7, 0.1)
assert set(test_result.metrics_table.columns.values) == set(
[
"prompt",
"response",
"latency_in_seconds",
"failure",
"predicted_trajectory",
"reference_trajectory",
"trajectory_exact_match/score",
]
)
assert list(
test_result.metrics_table["trajectory_exact_match/score"].values
) == [1.0, 0.0]


@pytest.mark.usefixtures("google_auth_mock")
class TestEvaluationErrors:
def setup_method(self):
Expand Down Expand Up @@ -1376,11 +1548,10 @@ def test_evaluate_baseline_model_response_column_not_provided(
):
test_eval_task.evaluate()

def test_evaluate_response_column_not_provided(
self,
):
@pytest.mark.parametrize("eval_task_version", [EvalTask, EvalTaskPreview])
def test_evaluate_response_column_not_provided(self, eval_task_version):
test_eval_dataset = _TEST_EVAL_DATASET_SINGLE
test_eval_task = EvalTask(
test_eval_task = eval_task_version(
dataset=test_eval_dataset,
metrics=["exact_match"],
)
@@ -1395,11 +1566,10 @@ def test_evaluate_response_column_not_provided(self, eval_task_version):
):
test_eval_task.evaluate()

def test_evaluate_reference_column_not_provided(
self,
):
@pytest.mark.parametrize("eval_task_version", [EvalTask, EvalTaskPreview])
def test_evaluate_reference_column_not_provided(self, eval_task_version):
test_eval_dataset = pd.DataFrame({"response": ["test", "text"]})
test_eval_task = EvalTask(
test_eval_task = eval_task_version(
dataset=test_eval_dataset,
metrics=["exact_match"],
)
8 changes: 4 additions & 4 deletions vertexai/preview/evaluation/__init__.py
@@ -16,10 +16,10 @@
#
"""Vertex Gen AI Evaluation Service Module."""

from vertexai.evaluation import _base
from vertexai.evaluation import eval_task
from vertexai.evaluation import metrics
from vertexai.evaluation import prompt_template
from vertexai.preview.evaluation import _base
from vertexai.preview.evaluation import eval_task
from vertexai.preview.evaluation import metrics
from vertexai.preview.evaluation import prompt_template


EvalResult = _base.EvalResult
97 changes: 97 additions & 0 deletions vertexai/preview/evaluation/_base.py
@@ -0,0 +1,97 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Base classes for evaluation."""


import dataclasses
from typing import Dict, List, Optional, TYPE_CHECKING, Union

from google.cloud.aiplatform_v1beta1.services import (
evaluation_service as gapic_evaluation_services,
)
from vertexai.preview.evaluation.metrics import (
_base as metrics_base,
)

if TYPE_CHECKING:
import pandas as pd


@dataclasses.dataclass
class EvaluationRunConfig:
"""Evaluation Run Configurations.
Attributes:
dataset: The dataset to evaluate.
metrics: The list of metric names, or Metric instances to evaluate.
metric_column_mapping: An optional dictionary that maps metric prompt
template input variable names to the evaluation dataset column names
used during evaluation. For example, if the input_variables of the
metric prompt template are ["context", "reference"],
metric_column_mapping can be {"context": "news_context", "reference":
"ground_truth", "response": "model_1_response"} when the dataset has
the columns "news_context", "ground_truth", and "model_1_response".
client: The evaluation service client.
evaluation_service_qps: The custom QPS limit for the evaluation service.
retry_timeout: How long to keep retrying the evaluation requests, in
seconds.
"""

dataset: "pd.DataFrame"
metrics: List[Union[str, metrics_base._Metric]]
metric_column_mapping: Dict[str, str]
client: gapic_evaluation_services.EvaluationServiceClient
evaluation_service_qps: float
retry_timeout: float

def validate_dataset_column(self, column_name: str) -> None:
"""Validates that the column names in the column map are in the dataset.
Args:
column_name: The column name to validate.
Raises:
KeyError: If any of the column names are not in the dataset.
"""
if (
self.metric_column_mapping.get(column_name, column_name)
not in self.dataset.columns
):
raise KeyError(
"Required column"
f" `{self.metric_column_mapping.get(column_name, column_name)}` not"
" found in the evaluation dataset. The columns in the evaluation"
f" dataset are {list(self.dataset.columns)}."
)


@dataclasses.dataclass
class EvalResult:
"""Evaluation result.
Attributes:
summary_metrics: A dictionary of summary evaluation metrics for an
evaluation run.
metrics_table: A pandas.DataFrame table containing evaluation dataset
inputs, predictions, explanations, and metric results per row.
metadata: The metadata for the evaluation run.
"""

summary_metrics: Dict[str, float]
metrics_table: Optional["pd.DataFrame"] = None
metadata: Optional[Dict[str, str]] = None
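
To make the column-mapping contract above concrete, here is a small hedged sketch of how `validate_dataset_column` resolves names through `metric_column_mapping`. The dataset columns, the mapping, and the mocked service client are assumptions for illustration only.

```python
from unittest import mock

import pandas as pd
from google.cloud.aiplatform_v1beta1.services import evaluation_service
from vertexai.preview.evaluation import _base

dataset = pd.DataFrame({"news_context": ["..."], "ground_truth": ["..."]})

run_config = _base.EvaluationRunConfig(
    dataset=dataset,
    metrics=["exact_match"],
    metric_column_mapping={"context": "news_context", "reference": "ground_truth"},
    client=mock.create_autospec(
        evaluation_service.EvaluationServiceClient, instance=True
    ),
    evaluation_service_qps=1.0,
    retry_timeout=120.0,
)

run_config.validate_dataset_column("context")  # resolves to "news_context": passes

try:
    run_config.validate_dataset_column("response")  # unmapped and not a column
except KeyError as err:
    print(err)  # names the missing column and lists the dataset columns
```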