Commit 68d5a6d

vertex-mg-bot authored and copybara-github committed

Enable dedicated endpoint for Llama Guard deployment

PiperOrigin-RevId: 700022732
1 parent ff2f20f · commit 68d5a6d

File tree: 1 file changed (+12, -6)

notebooks/community/model_garden/model_garden_llama_guard_deployment.ipynb

Lines changed: 12 additions & 6 deletions

@@ -147,8 +147,6 @@
 ")\n",
 "\n",
 "models, endpoints = {}, {}\n",
-"# Dedicated endpoint not supported yet\n",
-"use_dedicated_endpoint = False\n",
 "\n",
 "# Get the default cloud project id.\n",
 "PROJECT_ID = os.environ[\"GOOGLE_CLOUD_PROJECT\"]\n",
@@ -390,6 +388,10 @@
 " return model, endpoint\n",
 "\n",
 "\n",
+"# @markdown Set use_dedicated_endpoint to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint).\n",
+"use_dedicated_endpoint = True # @param {type:\"boolean\"}\n",
+"\n",
+"\n",
 "models[\"vllm_gpu\"], endpoints[\"vllm_gpu\"] = deploy_model_vllm(\n",
 " model_name=common_util.get_job_name_with_datetime(prefix=\"llama3-guard\"),\n",
 " model_id=model_id,\n",
@@ -520,9 +522,11 @@
 " \"@requestFormat\": \"chatCompletions\",\n",
 " },\n",
 "]\n",
-"response = endpoints[\"vllm_gpu\"].predict(instances=instances)\n",
+"response = endpoints[\"vllm_gpu\"].predict(\n",
+" instances=instances, use_dedicated_endpoint=use_dedicated_endpoint\n",
+")\n",
 "\n",
-"prediction = response.predictions[0]\n",
+"prediction = response.predictions\n",
 "print(prediction)\n",
 "print(\"Llama Guard prediction:\", prediction[\"choices\"][0][\"message\"][\"content\"])"
 ]
@@ -653,9 +657,11 @@
 " \"@requestFormat\": \"chatCompletions\",\n",
 " },\n",
 "]\n",
-"response = endpoints[\"vllm_gpu\"].predict(instances=instances)\n",
+"response = endpoints[\"vllm_gpu\"].predict(\n",
+" instances=instances, use_dedicated_endpoint=use_dedicated_endpoint\n",
+")\n",
 "\n",
-"prediction = response.predictions[0]\n",
+"prediction = response.predictions\n",
 "print(prediction)\n",
 "print(\"Llama Guard prediction:\", prediction[\"choices\"][0][\"message\"][\"content\"])"
 ]
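
For context, a minimal sketch of what the updated notebook cells boil down to at runtime, assuming a recent google-cloud-aiplatform SDK (one whose Endpoint.predict accepts use_dedicated_endpoint) and an endpoint already deployed by the notebook's deploy_model_vllm helper. The project, location, endpoint ID, and the messages payload below are hypothetical placeholders, not values from this commit:

from google.cloud import aiplatform

# Hypothetical project/region; the notebook reads PROJECT_ID from the environment.
aiplatform.init(project="my-project", location="us-central1")

# Toggle introduced by this commit (exposed as a Colab @param in the notebook).
use_dedicated_endpoint = True

# Hypothetical endpoint ID; the notebook receives the Endpoint object back
# from deploy_model_vllm instead of constructing it by ID.
endpoint = aiplatform.Endpoint("1234567890")

instances = [
    {
        "@requestFormat": "chatCompletions",
        # Hypothetical chat payload; the notebook builds its own messages.
        "messages": [{"role": "user", "content": "Is this message safe?"}],
    }
]

# With use_dedicated_endpoint=True, the SDK sends the request to the
# endpoint's dedicated DNS name rather than the shared regional endpoint.
response = endpoint.predict(
    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint
)
print(response.predictions)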
