diff --git a/notebooks/community/model_garden/model_garden_pytorch_llama3_3_deployment.ipynb b/notebooks/community/model_garden/model_garden_pytorch_llama3_3_deployment.ipynb
new file mode 100644
index 000000000..30eced007
--- /dev/null
+++ b/notebooks/community/model_garden/model_garden_pytorch_llama3_3_deployment.ipynb
@@ -0,0 +1,574 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "SgQ6t5bqZVlH"
+ },
+ "outputs": [],
+ "source": [
+ "# Copyright 2024 Google LLC\n",
+ "#\n",
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "99c1c3fc2ca5"
+ },
+ "source": [
+ "# Vertex AI Model Garden - Llama 3.3 (Deployment)\n",
+ "\n",
+    "Run in Colab Enterprise | View on GitHub"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "3de7470326a2"
+ },
+ "source": [
+ "## Overview\n",
+ "\n",
+    "This notebook demonstrates downloading, deploying, and serving the prebuilt Llama 3.3 model with [vLLM](https://github.com/vllm-project/vllm).\n",
+ "\n",
+ "\n",
+ "### Objective\n",
+ "\n",
+ "- Deploy Llama 3.3 70B Instruct with vLLM on GPU, optionally with dynamic LoRA adapters.\n",
+ "\n",
+ "### Costs\n",
+ "\n",
+ "This tutorial uses billable components of Google Cloud:\n",
+ "\n",
+ "* Vertex AI\n",
+ "* Cloud Storage\n",
+ "\n",
+ "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "264c07757582"
+ },
+ "source": [
+ "## Before you begin"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "ax7zWynUDcjk"
+ },
+ "outputs": [],
+ "source": [
+    "# @title Request quota\n",
+    "\n",
+    "# @markdown By default, the quota for H100 deployment (`Custom model serving per region`) is 0. You need to request H100 quota by following the instructions at [\"Request a higher quota\"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "YXFGIp1l-qtT"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Setup Google Cloud project\n",
+ "\n",
+ "# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n",
+ "\n",
+    "# @markdown 2. **[Optional]** [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the `BUCKET_URI` for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as where the notebook was launched. Note that a multi-region bucket (e.g. \"us\") is not considered a match for a single region (e.g. \"us-central1\") covered by the multi-region range. If not set, a unique GCS bucket will be created instead.\n",
+ "\n",
+ "BUCKET_URI = \"gs://\" # @param {type:\"string\"}\n",
+ "\n",
+    "# @markdown 3. **[Optional]** Set the region. If not set, the region will be set automatically according to the Colab Enterprise environment.\n",
+ "\n",
+ "REGION = \"\" # @param {type:\"string\"}\n",
+ "\n",
+    "# @markdown 4. If you want to run predictions with A100 80GB or H100 GPUs, we recommend using the regions listed below. **NOTE:** Make sure you have the associated quota in the selected regions. Click the links to see your current quota for each GPU type: [Nvidia A100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_a100_80gb_gpus), [Nvidia H100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus).\n",
+ "\n",
+ "# @markdown > | Machine Type | Accelerator Type | Recommended Regions |\n",
+ "# @markdown | ----------- | ----------- | ----------- |\n",
+    "# @markdown | a2-ultragpu-1g | 1 NVIDIA_A100_80GB | us-central1, us-east4, europe-west4, asia-southeast1 |\n",
+ "# @markdown | a3-highgpu-2g | 2 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |\n",
+ "# @markdown | a3-highgpu-4g | 4 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |\n",
+ "# @markdown | a3-highgpu-8g | 8 NVIDIA_H100_80GB | us-central1, us-east5, europe-west4, us-west1, asia-southeast1 |\n",
+ "\n",
+ "# Import the necessary packages\n",
+ "\n",
+ "# Upgrade Vertex AI SDK.\n",
+ "! pip3 install --upgrade --quiet 'google-cloud-aiplatform>=1.64.0'\n",
+ "! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git\n",
+ "\n",
+ "import datetime\n",
+ "import importlib\n",
+ "import os\n",
+ "import uuid\n",
+ "from typing import Tuple\n",
+ "\n",
+ "from google.cloud import aiplatform\n",
+ "\n",
+ "common_util = importlib.import_module(\n",
+ " \"vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util\"\n",
+ ")\n",
+ "\n",
+ "models, endpoints = {}, {}\n",
+ "\n",
+ "# Get the default cloud project id.\n",
+ "PROJECT_ID = os.environ[\"GOOGLE_CLOUD_PROJECT\"]\n",
+ "\n",
+ "# Get the default region for launching jobs.\n",
+ "if not REGION:\n",
+ " REGION = os.environ[\"GOOGLE_CLOUD_REGION\"]\n",
+ "\n",
+ "# Enable the Vertex AI API and Compute Engine API, if not already.\n",
+ "print(\"Enabling Vertex AI API and Compute Engine API.\")\n",
+ "! gcloud services enable aiplatform.googleapis.com compute.googleapis.com\n",
+ "\n",
+ "# Cloud Storage bucket for storing the experiment artifacts.\n",
+    "# A unique GCS bucket will be created for this notebook if you did not\n",
+    "# specify your own bucket in the BUCKET_URI field above.\n",
+ "now = datetime.datetime.now().strftime(\"%Y%m%d%H%M%S\")\n",
+ "BUCKET_NAME = \"/\".join(BUCKET_URI.split(\"/\")[:3])\n",
+ "\n",
+ "if BUCKET_URI is None or BUCKET_URI.strip() == \"\" or BUCKET_URI == \"gs://\":\n",
+ " BUCKET_URI = f\"gs://{PROJECT_ID}-tmp-{now}-{str(uuid.uuid4())[:4]}\"\n",
+ " BUCKET_NAME = \"/\".join(BUCKET_URI.split(\"/\")[:3])\n",
+ " ! gsutil mb -l {REGION} {BUCKET_URI}\n",
+ "else:\n",
+ " assert BUCKET_URI.startswith(\"gs://\"), \"BUCKET_URI must start with `gs://`.\"\n",
+ " shell_output = ! gsutil ls -Lb {BUCKET_NAME} | grep \"Location constraint:\" | sed \"s/Location constraint://\"\n",
+ " bucket_region = shell_output[0].strip().lower()\n",
+ " if bucket_region != REGION:\n",
+ " raise ValueError(\n",
+ " \"Bucket region %s is different from notebook region %s\"\n",
+ " % (bucket_region, REGION)\n",
+ " )\n",
+ "print(f\"Using this GCS Bucket: {BUCKET_URI}\")\n",
+ "\n",
+ "STAGING_BUCKET = os.path.join(BUCKET_URI, \"temporal\")\n",
+ "MODEL_BUCKET = os.path.join(BUCKET_URI, \"llama3-3\")\n",
+ "\n",
+ "\n",
+ "# Initialize Vertex AI API.\n",
+ "print(\"Initializing Vertex AI API.\")\n",
+ "aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)\n",
+ "\n",
+ "# Gets the default SERVICE_ACCOUNT.\n",
+ "shell_output = ! gcloud projects describe $PROJECT_ID\n",
+ "project_number = shell_output[-1].split(\":\")[1].strip().replace(\"'\", \"\")\n",
+ "SERVICE_ACCOUNT = f\"{project_number}-compute@developer.gserviceaccount.com\"\n",
+ "print(\"Using this default Service Account:\", SERVICE_ACCOUNT)\n",
+ "\n",
+ "\n",
+ "# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket\n",
+ "! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME\n",
+ "\n",
+ "! gcloud config set project $PROJECT_ID\n",
+ "! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role=\"roles/storage.admin\"\n",
+ "! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role=\"roles/aiplatform.user\"\n",
+ "\n",
+ "# @markdown # Access Llama 3.3 models on Vertex AI for serving\n",
+ "# @markdown The original models from Meta are converted into the Hugging Face format for serving in Vertex AI.\n",
+ "# @markdown Accept the model agreement to access the models:\n",
+ "# @markdown 1. Open the [Llama 3.3 model card](https://console.cloud.google.com/vertex-ai/publishers/meta/model-garden/llama3-3) from [Vertex AI Model Garden](https://cloud.google.com/model-garden).\n",
+ "# @markdown 2. Review and accept the agreement in the pop-up window on the model card page. If you have previously accepted the model agreement, there will not be a pop-up window on the model card page and this step is not needed.\n",
+    "# @markdown 3. After you accept the Llama 3.3 agreement, a `gs://` URI containing the Llama 3.3 model artifacts will be shared.\n",
+ "# @markdown 4. Paste the URI in the `VERTEX_AI_MODEL_GARDEN_LLAMA_3_3` field below.\n",
+ "\n",
+ "\n",
+ "VERTEX_AI_MODEL_GARDEN_LLAMA_3_3 = \"\" # @param {type:\"string\", isTemplate:true}\n",
+ "assert (\n",
+ " VERTEX_AI_MODEL_GARDEN_LLAMA_3_3\n",
+    "), \"Accept the Llama 3.3 agreement in Vertex AI Model Garden to get the GCS path of the Llama 3.3 model artifacts.\"",
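+    "\n",
+    "# @markdown Optionally, run a quick sanity check: the commented line below (a\n",
+    "# @markdown hedged example, assuming `gsutil` is available in this environment)\n",
+    "# @markdown lists the shared artifacts so you can confirm that you have access.\n",
+    "# ! gsutil ls {VERTEX_AI_MODEL_GARDEN_LLAMA_3_3}"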
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "z-XybZjtgF9M"
+ },
+ "source": [
+ "## Deploy Llama 3.3 70B Instruct with vLLM"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "E8OiHHNNE_wj"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Deploy\n",
+ "\n",
+ "# @markdown This section uploads Llama 3.3 to Model Registry and deploys it to a Vertex AI Endpoint. It takes ~30 minutes.\n",
+ "\n",
+    "# @markdown The serving efficiency of L4 GPUs is lower than that of H100 GPUs, but L4 GPUs are still a good serving option if you do not have H100 quota.\n",
+    "\n",
+    "# @markdown H100 capacity is currently limited. We recommend deploying through the deploy button on the model card. You can still try to deploy an H100 endpoint through this notebook, but the resources might not be available.\n",
+ "\n",
+ "# @markdown Set the model to deploy.\n",
+ "\n",
+ "base_model_name = \"Llama-3.3-70B-Instruct\" # @param [\"Llama-3.3-70B-Instruct\"] {isTemplate:true}\n",
+ "model_id = os.path.join(VERTEX_AI_MODEL_GARDEN_LLAMA_3_3, base_model_name)\n",
+ "ENABLE_DYNAMIC_LORA = True # @param {type:\"boolean\", isTemplate:true}\n",
+ "hf_model_id = \"meta-llama/\" + base_model_name\n",
+ "\n",
+ "accelerator_type = \"NVIDIA_H100_80GB\" # @param [\"NVIDIA_H100_80GB\", \"NVIDIA_L4\"]\n",
+ "\n",
+ "# The pre-built serving docker images.\n",
+ "VLLM_DOCKER_URI = \"us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20241001_0916_RC00\"\n",
+ "\n",
+ "# @markdown Set use_dedicated_endpoint to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint).\n",
+ "use_dedicated_endpoint = True # @param {type:\"boolean\"}\n",
+ "\n",
+ "# @markdown Find Vertex AI prediction supported accelerators and regions at https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.\n",
+ "if accelerator_type == \"NVIDIA_L4\":\n",
+ " machine_type = \"g2-standard-96\"\n",
+ " accelerator_count = 8\n",
+ " max_loras = 1\n",
+ "elif accelerator_type == \"NVIDIA_H100_80GB\":\n",
+ " machine_type = \"a3-highgpu-4g\"\n",
+ " accelerator_count = 4\n",
+ " max_loras = 1\n",
+ "else:\n",
+ " raise ValueError(\n",
+ " f\"Recommended GPU setting not found for: {accelerator_type} and {base_model_name}.\"\n",
+ " )\n",
+ "\n",
+ "common_util.check_quota(\n",
+ " project_id=PROJECT_ID,\n",
+ " region=REGION,\n",
+ " accelerator_type=accelerator_type,\n",
+ " accelerator_count=accelerator_count,\n",
+ " is_for_training=False,\n",
+ ")\n",
+ "\n",
+ "gpu_memory_utilization = 0.95\n",
+ "max_model_len = 8192 # Maximum context length.\n",
+ "\n",
+ "\n",
+ "def deploy_model_vllm(\n",
+ " model_name: str,\n",
+ " model_id: str,\n",
+ " service_account: str,\n",
+ " base_model_id: str = None,\n",
+ " machine_type: str = \"g2-standard-8\",\n",
+ " accelerator_type: str = \"NVIDIA_L4\",\n",
+ " accelerator_count: int = 1,\n",
+ " gpu_memory_utilization: float = 0.9,\n",
+ " max_model_len: int = 4096,\n",
+ " dtype: str = \"auto\",\n",
+ " enable_trust_remote_code: bool = False,\n",
+ " enforce_eager: bool = False,\n",
+ " enable_lora: bool = False,\n",
+ " enable_chunked_prefill: bool = False,\n",
+ " max_loras: int = 1,\n",
+ " max_cpu_loras: int = 8,\n",
+ " use_dedicated_endpoint: bool = False,\n",
+ " max_num_seqs: int = 256,\n",
+ " model_type: str = None,\n",
+ ") -> Tuple[aiplatform.Model, aiplatform.Endpoint]:\n",
+ " \"\"\"Deploys trained models with vLLM into Vertex AI.\"\"\"\n",
+ " endpoint = aiplatform.Endpoint.create(\n",
+ " display_name=f\"{model_name}-endpoint\",\n",
+ " dedicated_endpoint_enabled=use_dedicated_endpoint,\n",
+ " )\n",
+ "\n",
+ " if not base_model_id:\n",
+ " base_model_id = model_id\n",
+ "\n",
+ " # See https://docs.vllm.ai/en/latest/models/engine_args.html for a list of possible arguments with descriptions.\n",
+ " vllm_args = [\n",
+ " \"python\",\n",
+ " \"-m\",\n",
+ " \"vllm.entrypoints.api_server\",\n",
+ " \"--host=0.0.0.0\",\n",
+ " \"--port=8080\",\n",
+ " f\"--model={model_id}\",\n",
+ " f\"--tensor-parallel-size={accelerator_count}\",\n",
+ " \"--swap-space=16\",\n",
+ " f\"--gpu-memory-utilization={gpu_memory_utilization}\",\n",
+ " f\"--max-model-len={max_model_len}\",\n",
+ " f\"--dtype={dtype}\",\n",
+ " f\"--max-loras={max_loras}\",\n",
+ " f\"--max-cpu-loras={max_cpu_loras}\",\n",
+ " f\"--max-num-seqs={max_num_seqs}\",\n",
+ " \"--disable-log-stats\",\n",
+ " ]\n",
+ "\n",
+ " if enable_trust_remote_code:\n",
+ " vllm_args.append(\"--trust-remote-code\")\n",
+ "\n",
+ " if enforce_eager:\n",
+ " vllm_args.append(\"--enforce-eager\")\n",
+ "\n",
+ " if enable_lora:\n",
+ " vllm_args.append(\"--enable-lora\")\n",
+ "\n",
+ " if enable_chunked_prefill:\n",
+ " vllm_args.append(\"--enable-chunked-prefill\")\n",
+ "\n",
+ " if model_type:\n",
+ " vllm_args.append(f\"--model-type={model_type}\")\n",
+ "\n",
+ " env_vars = {\n",
+ " \"MODEL_ID\": base_model_id,\n",
+ " \"DEPLOY_SOURCE\": \"notebook\",\n",
+ " }\n",
+ "\n",
+ " # HF_TOKEN is not a compulsory field and may not be defined.\n",
+ " try:\n",
+ " if HF_TOKEN:\n",
+ " env_vars[\"HF_TOKEN\"] = HF_TOKEN\n",
+ " except NameError:\n",
+ " pass\n",
+ "\n",
+ " model = aiplatform.Model.upload(\n",
+ " display_name=model_name,\n",
+ " serving_container_image_uri=VLLM_DOCKER_URI,\n",
+ " serving_container_args=vllm_args,\n",
+ " serving_container_ports=[8080],\n",
+ " serving_container_predict_route=\"/generate\",\n",
+ " serving_container_health_route=\"/ping\",\n",
+ " serving_container_environment_variables=env_vars,\n",
+ " serving_container_shared_memory_size_mb=(16 * 1024), # 16 GB\n",
+ " serving_container_deployment_timeout=7200,\n",
+ " )\n",
+ " print(\n",
+ " f\"Deploying {model_name} on {machine_type} with {accelerator_count} {accelerator_type} GPU(s).\"\n",
+ " )\n",
+ " model.deploy(\n",
+ " endpoint=endpoint,\n",
+ " machine_type=machine_type,\n",
+ " accelerator_type=accelerator_type,\n",
+ " accelerator_count=accelerator_count,\n",
+ " deploy_request_timeout=1800,\n",
+ " service_account=service_account,\n",
+ " )\n",
+ " print(\"endpoint_name:\", endpoint.name)\n",
+ "\n",
+ " return model, endpoint\n",
+ "\n",
+ "\n",
+ "models[\"vllm_gpu\"], endpoints[\"vllm_gpu\"] = deploy_model_vllm(\n",
+ " model_name=common_util.get_job_name_with_datetime(prefix=\"llama3-3-serve\"),\n",
+ " model_id=model_id,\n",
+ " base_model_id=hf_model_id,\n",
+ " service_account=SERVICE_ACCOUNT,\n",
+ " machine_type=machine_type,\n",
+ " accelerator_type=accelerator_type,\n",
+ " accelerator_count=accelerator_count,\n",
+ " gpu_memory_utilization=gpu_memory_utilization,\n",
+ " max_model_len=max_model_len,\n",
+ " max_loras=max_loras,\n",
+ " enforce_eager=True,\n",
+ " enable_lora=ENABLE_DYNAMIC_LORA,\n",
+ " enable_chunked_prefill=not ENABLE_DYNAMIC_LORA,\n",
+ " use_dedicated_endpoint=use_dedicated_endpoint,\n",
+ " model_type=\"llama3.1\",\n",
+ ")\n",
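+    "\n",
+    "# If the notebook kernel restarts after deployment, you do not need to redeploy:\n",
+    "# as a sketch, you can reattach to the existing endpoint by its resource name or\n",
+    "# numeric ID (replace ENDPOINT_ID with the `endpoint_name` value printed above).\n",
+    "# endpoints[\"vllm_gpu\"] = aiplatform.Endpoint(\"ENDPOINT_ID\")\n",
+    "\n",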
+ "# @markdown Click \"Show Code\" to see more details."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "rDHsCOqvFYBi"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Raw predict\n",
+ "\n",
+ "# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://docs.vllm.ai/en/latest/dev/sampling_params.html).\n",
+ "\n",
+ "# @markdown Example:\n",
+ "\n",
+ "# @markdown ```\n",
+ "# @markdown Human: What is a car?\n",
+ "# @markdown Assistant: A car, or a motor car, is a road-connected human-transportation system used to move people or goods from one place to another.\n",
+ "# @markdown ```\n",
+ "\n",
+    "# @markdown Optionally, you can apply LoRA weights to the prediction. Set `lora_id` to either a GCS URI or a Hugging Face repo containing the LoRA weights.\n",
+ "\n",
+ "# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.\n",
+ "\n",
+ "prompt = \"What is a car?\" # @param {type: \"string\"}\n",
+    "# @markdown If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens by lowering `max_tokens`.\n",
+ "max_tokens = 50 # @param {type:\"integer\"}\n",
+ "temperature = 1.0 # @param {type:\"number\"}\n",
+ "top_p = 1.0 # @param {type:\"number\"}\n",
+ "top_k = 1 # @param {type:\"integer\"}\n",
+ "# @markdown Set `raw_response` to `True` to obtain the raw model output. Set `raw_response` to `False` to apply additional formatting in the structure of `\"Prompt:\\n{prompt.strip()}\\nOutput:\\n{output}\"`.\n",
+ "raw_response = False # @param {type:\"boolean\"}\n",
+ "lora_id = \"\" # @param {type:\"string\", isTemplate: true}\n",
+ "\n",
+    "# Override parameters for inference.\n",
+ "instance = {\n",
+ " \"prompt\": prompt,\n",
+ " \"max_tokens\": max_tokens,\n",
+ " \"temperature\": temperature,\n",
+ " \"top_p\": top_p,\n",
+ " \"top_k\": top_k,\n",
+ " \"raw_response\": raw_response,\n",
+ "}\n",
+ "if lora_id:\n",
+ " instance[\"dynamic-lora\"] = lora_id\n",
+ "instances = [instance]\n",
+ "response = endpoints[\"vllm_gpu\"].predict(\n",
+ " instances=instances, use_dedicated_endpoint=use_dedicated_endpoint\n",
+ ")\n",
+ "\n",
+ "for prediction in response.predictions:\n",
+ " print(prediction)\n",
+ "\n",
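+    "# A hedged sketch of moderating the generated text with the Cloud Natural\n",
+    "# Language API, as mentioned above. It assumes the `google-cloud-language`\n",
+    "# package is installed; uncomment to try it on the last prediction.\n",
+    "# from google.cloud import language_v2\n",
+    "# nl_client = language_v2.LanguageServiceClient()\n",
+    "# document = language_v2.Document(\n",
+    "#     content=str(prediction), type_=language_v2.Document.Type.PLAIN_TEXT\n",
+    "# )\n",
+    "# moderation = nl_client.moderate_text(document=document)\n",
+    "# print(moderation.moderation_categories)\n",
+    "\n",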
+ "# @markdown Click \"Show Code\" to see more details."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "LSG9ITWTbTb7"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Chat completion\n",
+ "\n",
+ "if use_dedicated_endpoint:\n",
+ " DEDICATED_ENDPOINT_DNS = endpoints[\"vllm_gpu\"].gca_resource.dedicated_endpoint_dns\n",
+ "ENDPOINT_RESOURCE_NAME = \"projects/{}/locations/{}/endpoints/{}\".format(\n",
+ " PROJECT_ID, REGION, endpoints[\"vllm_gpu\"].name\n",
+ ")\n",
+ "\n",
+ "# @markdown Once deployment succeeds, you can send requests to the endpoint using the OpenAI SDK.\n",
+ "\n",
+ "# @markdown First you will need to install the SDK and some auth-related dependencies.\n",
+ "\n",
+ "! pip install -qU openai google-auth requests\n",
+ "\n",
+ "# @markdown Next fill out some request parameters:\n",
+ "\n",
+ "user_message = \"How is your day going?\" # @param {type: \"string\"}\n",
+    "# @markdown If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, for example by setting `max_tokens` to 20.\n",
+ "max_tokens = 50 # @param {type: \"integer\"}\n",
+ "temperature = 1.0 # @param {type: \"number\"}\n",
+ "\n",
+ "# @markdown Now we can send a request.\n",
+ "\n",
+    "import google.auth\n",
+    "import google.auth.transport.requests\n",
+    "import openai\n",
+ "\n",
+ "creds, project = google.auth.default()\n",
+ "auth_req = google.auth.transport.requests.Request()\n",
+ "creds.refresh(auth_req)\n",
+ "\n",
+ "BASE_URL = (\n",
+ " f\"https://{REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}\"\n",
+ ")\n",
+ "try:\n",
+ " if use_dedicated_endpoint:\n",
+ " BASE_URL = f\"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}\"\n",
+ "except NameError:\n",
+ " pass\n",
+ "\n",
+ "client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)\n",
+ "\n",
+ "model_response = client.chat.completions.create(\n",
+ " model=\"\",\n",
+ " messages=[{\"role\": \"user\", \"content\": user_message}],\n",
+ " temperature=temperature,\n",
+ " max_tokens=max_tokens,\n",
+ ")\n",
+ "print(model_response)\n",
+ "\n",
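+    "# @markdown Optionally, you can stream the response token by token. The commented\n",
+    "# @markdown sketch below assumes the OpenAI-compatible endpoint supports streaming.\n",
+    "# stream = client.chat.completions.create(\n",
+    "#     model=\"\",\n",
+    "#     messages=[{\"role\": \"user\", \"content\": user_message}],\n",
+    "#     temperature=temperature,\n",
+    "#     max_tokens=max_tokens,\n",
+    "#     stream=True,\n",
+    "# )\n",
+    "# for chunk in stream:\n",
+    "#     print(chunk.choices[0].delta.content or \"\", end=\"\")\n",
+    "\n",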
+ "# @markdown Click \"Show Code\" to see more details."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "JETd33jIDcjm"
+ },
+ "source": [
+ "## Clean up resources"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "911406c1561e"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Delete the models and endpoints\n",
+ "\n",
+    "# @markdown Delete the experiment models and endpoints to release the resources\n",
+    "# @markdown and avoid unnecessary ongoing charges.\n",
+ "\n",
+ "# Undeploy model and delete endpoint.\n",
+ "for endpoint in endpoints.values():\n",
+ " endpoint.delete(force=True)\n",
+ "\n",
+ "# Delete models.\n",
+ "for model in models.values():\n",
+ " model.delete()\n",
+ "\n",
+ "delete_bucket = False # @param {type:\"boolean\"}\n",
+ "if delete_bucket:\n",
+ " ! gsutil -m rm -r $BUCKET_NAME"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "name": "model_garden_pytorch_llama3_3_deployment.ipynb",
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}