Commit 68d5a6d

vertex-mg-bot authored and copybara-github committed

Enable dedicated endpoint for Llama Guard deployment

PiperOrigin-RevId: 700022732
1 parent ff2f20f · commit 68d5a6d

File tree: 1 file changed (+12, -6)

notebooks/community/model_garden/model_garden_llama_guard_deployment.ipynb

Lines changed: 12 additions & 6 deletions

@@ -147,8 +147,6 @@
 ")\n",
 "\n",
 "models, endpoints = {}, {}\n",
-"# Dedicated endpoint not supported yet\n",
-"use_dedicated_endpoint = False\n",
 "\n",
 "# Get the default cloud project id.\n",
 "PROJECT_ID = os.environ[\"GOOGLE_CLOUD_PROJECT\"]\n",
@@ -390,6 +388,10 @@
 " return model, endpoint\n",
 "\n",
 "\n",
+"# @markdown Set use_dedicated_endpoint to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint).\n",
+"use_dedicated_endpoint = True # @param {type:\"boolean\"}\n",
+"\n",
+"\n",
 "models[\"vllm_gpu\"], endpoints[\"vllm_gpu\"] = deploy_model_vllm(\n",
 " model_name=common_util.get_job_name_with_datetime(prefix=\"llama3-guard\"),\n",
 " model_id=model_id,\n",
@@ -520,9 +522,11 @@
 " \"@requestFormat\": \"chatCompletions\",\n",
 " },\n",
 "]\n",
-"response = endpoints[\"vllm_gpu\"].predict(instances=instances)\n",
+"response = endpoints[\"vllm_gpu\"].predict(\n",
+" instances=instances, use_dedicated_endpoint=use_dedicated_endpoint\n",
+")\n",
 "\n",
-"prediction = response.predictions[0]\n",
+"prediction = response.predictions\n",
 "print(prediction)\n",
 "print(\"Llama Guard prediction:\", prediction[\"choices\"][0][\"message\"][\"content\"])"
 ]
@@ -653,9 +657,11 @@
 " \"@requestFormat\": \"chatCompletions\",\n",
 " },\n",
 "]\n",
-"response = endpoints[\"vllm_gpu\"].predict(instances=instances)\n",
+"response = endpoints[\"vllm_gpu\"].predict(\n",
+" instances=instances, use_dedicated_endpoint=use_dedicated_endpoint\n",
+")\n",
 "\n",
-"prediction = response.predictions[0]\n",
+"prediction = response.predictions\n",
 "print(prediction)\n",
 "print(\"Llama Guard prediction:\", prediction[\"choices\"][0][\"message\"][\"content\"])"
 ]
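
For context, a minimal sketch of what the updated notebook cells boil down to at runtime, assuming a recent google-cloud-aiplatform SDK (one whose Endpoint.predict accepts use_dedicated_endpoint) and an endpoint already deployed by the notebook's deploy_model_vllm helper. The project, location, endpoint ID, and the messages payload below are hypothetical placeholders, not values from this commit:

from google.cloud import aiplatform

# Hypothetical project/region; the notebook reads PROJECT_ID from the environment.
aiplatform.init(project="my-project", location="us-central1")

# Toggle introduced by this commit (exposed as a Colab @param in the notebook).
use_dedicated_endpoint = True

# Hypothetical endpoint ID; the notebook receives the Endpoint object back
# from deploy_model_vllm instead of constructing it by ID.
endpoint = aiplatform.Endpoint("1234567890")

instances = [
    {
        "@requestFormat": "chatCompletions",
        # Hypothetical chat payload; the notebook builds its own messages.
        "messages": [{"role": "user", "content": "Is this message safe?"}],
    }
]

# With use_dedicated_endpoint=True, the SDK sends the request to the
# endpoint's dedicated DNS name rather than the shared regional endpoint.
response = endpoint.predict(
    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint
)
print(response.predictions)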
