Skip to content

Commit af60150

Browse files
authored
Added vllm-gpu markers for Nvidia and AMD GPU testcases (#1039)
1 parent 2283713 commit af60150

14 files changed

Lines changed: 55 additions & 13 deletions

pytest.ini

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,9 @@ markers =
     kueue: Mark tests which are testing Kueue
     model_server_gpu: Mark tests which are testing model server with GPU resources
     gpu: Mark tests which require GPU resources
-    multigpu: Mark tests which require multiple GPU resources
+    vllm_nvidia_single_gpu: Mark tests which require GPU resources for VLLM NVIDIA deployment
+    vllm_nvidia_multi_gpu: Mark tests which require multiple GPU resources for VLLM NVIDIA deployment
+    vllm_amd_gpu: Mark tests which require GPU resources for VLLM AMD deployment
     multinode: Mark tests which require multiple nodes
     keda: Mark tests which are testing KEDA scaling
     llmd_cpu: Mark tests which are testing LLMD (LLM Deployment) with CPU resources

tests/model_serving/model_runtime/vllm/basic_model_deployment/test_elyza_japanese_llama_2_7b_instruct.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
 pytestmark = pytest.mark.usefixtures("skip_if_no_supported_accelerator_type", "valid_aws_config")
 
 
+@pytest.mark.vllm_nvidia_single_gpu
+@pytest.mark.vllm_amd_gpu
 @pytest.mark.parametrize(
     "model_namespace, s3_models_storage_uri, serving_runtime, vllm_inference_service",
     [
@@ -79,7 +81,8 @@ def test_elyza_raw_simple_tgis_model_inference(
 )
 
 
-@pytest.mark.multigpu
+@pytest.mark.vllm_nvidia_multi_gpu
+@pytest.mark.vllm_amd_gpu
 @pytest.mark.parametrize(
     "model_namespace, s3_models_storage_uri, serving_runtime, vllm_inference_service",
     [

tests/model_serving/model_runtime/vllm/basic_model_deployment/test_granite_2b_instruct_preview_4k_r240917a.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
 pytestmark = pytest.mark.usefixtures("skip_if_no_supported_accelerator_type", "valid_aws_config")
 
 
+@pytest.mark.vllm_nvidia_single_gpu
+@pytest.mark.vllm_amd_gpu
 @pytest.mark.parametrize(
     "model_namespace, s3_models_storage_uri, serving_runtime, vllm_inference_service",
     [
@@ -46,7 +48,8 @@ def test_deploy_model_inference(self, vllm_inference_service, vllm_pod_resource,
         assert completion_responses == response_snapshot
 
 
-@pytest.mark.multigpu
+@pytest.mark.vllm_nvidia_multi_gpu
+@pytest.mark.vllm_amd_gpu
 @pytest.mark.parametrize(
     "model_namespace, s3_models_storage_uri, serving_runtime, vllm_inference_service",
     [

tests/model_serving/model_runtime/vllm/basic_model_deployment/test_granite_7b_redhat_lab.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
 pytestmark = pytest.mark.usefixtures("skip_if_no_supported_accelerator_type", "valid_aws_config")
 
 
+@pytest.mark.vllm_nvidia_single_gpu
+@pytest.mark.vllm_amd_gpu
 @pytest.mark.parametrize(
     "model_namespace, s3_models_storage_uri, serving_runtime, vllm_inference_service",
     [
@@ -79,7 +81,8 @@ def test_granite_lab_raw_simple_tgis_model_inference(
 )
 
 
-@pytest.mark.multigpu
+@pytest.mark.vllm_nvidia_multi_gpu
+@pytest.mark.vllm_amd_gpu
 @pytest.mark.parametrize(
     "model_namespace, s3_models_storage_uri, serving_runtime, vllm_inference_service",
     [

tests/model_serving/model_runtime/vllm/basic_model_deployment/test_granite_7b_starter.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
 pytestmark = pytest.mark.usefixtures("skip_if_no_supported_accelerator_type", "valid_aws_config")
 
 
+@pytest.mark.vllm_nvidia_single_gpu
+@pytest.mark.vllm_amd_gpu
 @pytest.mark.parametrize(
     "model_namespace, s3_models_storage_uri, serving_runtime, vllm_inference_service",
     [
@@ -79,7 +81,8 @@ def test_granite_starter_raw_simple_tgis_model_inference(
 )
 
 
-@pytest.mark.multigpu
+@pytest.mark.vllm_nvidia_multi_gpu
+@pytest.mark.vllm_amd_gpu
 @pytest.mark.parametrize(
     "model_namespace, s3_models_storage_uri, serving_runtime, vllm_inference_service",
     [

tests/model_serving/model_runtime/vllm/basic_model_deployment/test_llama31_8B_instruct.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
 pytestmark = pytest.mark.usefixtures("skip_if_no_supported_accelerator_type", "valid_aws_config")
 
 
+@pytest.mark.vllm_nvidia_single_gpu
+@pytest.mark.vllm_amd_gpu
 @pytest.mark.parametrize(
     "model_namespace, s3_models_storage_uri, serving_runtime, vllm_inference_service",
     [
@@ -79,7 +81,8 @@ def test_llama31_instruct_8b_raw_simple_tgis_model_inference(
 )
 
 
-@pytest.mark.multigpu
+@pytest.mark.vllm_nvidia_multi_gpu
+@pytest.mark.vllm_amd_gpu
 @pytest.mark.parametrize(
     "model_namespace, s3_models_storage_uri, serving_runtime, vllm_inference_service",
     [

tests/model_serving/model_runtime/vllm/basic_model_deployment/test_llama3_8B_instruct.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
 pytestmark = pytest.mark.usefixtures("skip_if_no_supported_accelerator_type", "valid_aws_config")
 
 
+@pytest.mark.vllm_nvidia_single_gpu
+@pytest.mark.vllm_amd_gpu
 @pytest.mark.parametrize(
     "model_namespace, s3_models_storage_uri, serving_runtime, vllm_inference_service",
     [
@@ -79,7 +81,8 @@ def test_llama3_instruct_8b_raw_simple_tgis_model_inference(
 )
 
 
-@pytest.mark.multigpu
+@pytest.mark.vllm_nvidia_multi_gpu
+@pytest.mark.vllm_amd_gpu
 @pytest.mark.parametrize(
     "model_namespace, s3_models_storage_uri, serving_runtime, vllm_inference_service",
     [

tests/model_serving/model_runtime/vllm/basic_model_deployment/test_llama_2_13b_chat.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
 pytestmark = pytest.mark.usefixtures("skip_if_no_supported_accelerator_type", "valid_aws_config")
 
 
+@pytest.mark.vllm_nvidia_single_gpu
+@pytest.mark.vllm_amd_gpu
 @pytest.mark.parametrize(
     "model_namespace, s3_models_storage_uri, serving_runtime, vllm_inference_service",
     [
@@ -78,7 +80,8 @@ def test_llamachat_raw_simple_tgis_model_inference(
 )
 
 
-@pytest.mark.multigpu
+@pytest.mark.vllm_nvidia_multi_gpu
+@pytest.mark.vllm_amd_gpu
 @pytest.mark.parametrize(
     "model_namespace, s3_models_storage_uri, serving_runtime, vllm_inference_service",
     [

tests/model_serving/model_runtime/vllm/basic_model_deployment/test_merlinite_7b_lab.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
 pytestmark = pytest.mark.usefixtures("skip_if_no_supported_accelerator_type", "valid_aws_config")
 
 
+@pytest.mark.vllm_nvidia_single_gpu
+@pytest.mark.vllm_amd_gpu
 @pytest.mark.parametrize(
     "model_namespace, s3_models_storage_uri, serving_runtime, vllm_inference_service",
     [
@@ -79,7 +81,8 @@ def test_merlinite_lab_7b_raw_simple_tgis_model_inference(
 )
 
 
-@pytest.mark.multigpu
+@pytest.mark.vllm_nvidia_multi_gpu
+@pytest.mark.vllm_amd_gpu
 @pytest.mark.parametrize(
     "model_namespace, s3_models_storage_uri, serving_runtime, vllm_inference_service",
     [

tests/model_serving/model_runtime/vllm/multimodal/test_granite_31_2b_vision.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
 pytestmark = pytest.mark.usefixtures("skip_if_no_supported_accelerator_type", "valid_aws_config")
 
 
+@pytest.mark.vllm_nvidia_single_gpu
+@pytest.mark.vllm_amd_gpu
 @pytest.mark.parametrize(
     "model_namespace, s3_models_storage_uri, serving_runtime, vllm_inference_service",
     [
@@ -72,7 +74,8 @@ def test_multi_image_query_inference(
     validate_inference_output(model_info, chat_responses, completion_responses, response_snapshot=response_snapshot)
 
 
-@pytest.mark.multigpu
+@pytest.mark.vllm_nvidia_multi_gpu
+@pytest.mark.vllm_amd_gpu
 @pytest.mark.parametrize(
     "model_namespace, s3_models_storage_uri, serving_runtime, vllm_inference_service",
     [

0 commit comments

Comments (0)