red-hat-data-services · tarukumar · Sep 11, 2024 · Sep 11, 2024
diff --git a/ods_ci/tests/Resources/Files/llm/serving_runtimes/vllm_servingruntime_grpc.yaml b/ods_ci/tests/Resources/Files/llm/serving_runtimes/vllm_servingruntime_grpc.yaml
@@ -11,7 +11,8 @@ spec:
         - '--model=/mnt/models'
         - '--served-model-name={{.Name}}'
         - '--distributed-executor-backend=mp'
-      image: quay.io/modh/vllm@sha256:a2593489ee20b8e5f01358a9aa984fc90618c6335f4c8e138e94ce635ffb112a
+        - '--chat-template=/app/data/template/template_chatml.jinja'
+      image: quay.io/modh/vllm@sha256:2e7f97b69d6e0aa7366ee6a841a7e709829136a143608bee859b1fe700c36d31
       name: kserve-container
       command:
         - python3

diff --git a/ods_ci/tests/Resources/Files/llm/serving_runtimes/vllm_servingruntime_http.yaml b/ods_ci/tests/Resources/Files/llm/serving_runtimes/vllm_servingruntime_http.yaml
@@ -11,7 +11,8 @@ spec:
         - '--model=/mnt/models'
         - '--served-model-name={{.Name}}'
         - '--distributed-executor-backend=mp'
-      image: quay.io/modh/vllm@sha256:a2593489ee20b8e5f01358a9aa984fc90618c6335f4c8e138e94ce635ffb112a
+        - '--chat-template=/app/data/template/template_chatml.jinja'
+      image: quay.io/modh/vllm@sha256:2e7f97b69d6e0aa7366ee6a841a7e709829136a143608bee859b1fe700c36d31
       name: kserve-container
       command:
         - python3

diff --git a/...ts/Tests/1000__model_serving/1007__model_serving_llm/1007__model_serving_llm_models.robot b/...ts/Tests/1000__model_serving/1007__model_serving_llm/1007__model_serving_llm_models.robot
@@ -217,10 +217,10 @@ Verify User Can Serve And Query A elyza/elyza-japanese-llama-2-7b-instruct Model
             ...    port_forwarding=${use_port_forwarding}
     ELSE IF    "${RUNTIME_NAME}" == "vllm-runtime" and "${KSERVE_MODE}" == "Serverless"
             Query Model Multiple Times    model_name=${model_name}      runtime=${RUNTIME_NAME}    protocol=http
-            ...    inference_type=chat-completions    n_times=1    query_idx=9
+            ...    inference_type=completions    n_times=1    query_idx=10
             ...    namespace=${test_namespace}    string_check_only=${TRUE}
             Query Model Multiple Times    model_name=${model_name}      runtime=${RUNTIME_NAME}    protocol=http
-            ...    inference_type=completions    n_times=1    query_idx=10
+            ...    inference_type=chat-completions    n_times=1    query_idx=9
             ...    namespace=${test_namespace}    string_check_only=${TRUE}
     END
     [Teardown]    Run Keywords