diff --git a/notebooks/community/model_garden/model_garden_tfvision_image_classification.ipynb b/notebooks/community/model_garden/model_garden_tfvision_image_classification.ipynb
index 3a3e1a96f..1330d7cfe 100644
--- a/notebooks/community/model_garden/model_garden_tfvision_image_classification.ipynb
+++ b/notebooks/community/model_garden/model_garden_tfvision_image_classification.ipynb
@@ -4,11 +4,12 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
+ "cellView": "form",
"id": "ur8xi4C7S06n"
},
"outputs": [],
"source": [
- "# Copyright 2023 Google LLC\n",
+ "# Copyright 2024 Google LLC\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
@@ -31,19 +32,18 @@
"source": [
"# Vertex AI Model Garden TFVision With Image Classification\n",
"\n",
- "
\n",
- " \n",
+ "\n",
+ " \n",
" \n",
"  Run in Colab Enterprise\n",
" \n",
" | \n",
- " \n",
+ " | \n",
" \n",
- "  \n",
- " View on GitHub\n",
+ "  View on GitHub\n",
" \n",
" | \n",
- " "
+ " |
"
]
},
{
@@ -77,11 +77,7 @@
"* Vertex AI\n",
"* Cloud Storage\n",
"\n",
- "Learn about [Vertex AI\n",
- "pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage\n",
- "pricing](https://cloud.google.com/storage/pricing), and use the [Pricing\n",
- "Calculator](https://cloud.google.com/products/calculator/)\n",
- "to generate a cost estimate based on your projected usage."
+ "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.\n"
]
},
{
@@ -106,56 +102,68 @@
"\n",
"# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n",
"\n",
- "# @markdown 2. [Optional] [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as where the notebook was launched. Note that a multi-region bucket (eg. \"us\") is not considered a match for a single region covered by the multi-region range (eg. \"us-central1\"). If not set, a unique GCS bucket will be created instead.\n",
+ "# @markdown 2. **[Optional]** [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as where the notebook was launched. Note that a multi-region bucket (eg. \"us\") is not considered a match for a single region covered by the multi-region range (eg. \"us-central1\"). If not set, a unique GCS bucket will be created instead.\n",
+ "\n",
+ "BUCKET_URI = \"gs://\" # @param {type:\"string\"}\n",
+ "\n",
+ "# @markdown 3. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.\n",
+ "\n",
+ "REGION = \"\" # @param {type:\"string\"}\n",
+ "\n",
+ "# @markdown 4. If you want to run predictions with A100 80GB or H100 GPUs, we recommend using the regions listed below. **NOTE:** Make sure you have associated quota in selected regions. Click the links to see your current quota for each GPU type: [Nvidia A100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_a100_80gb_gpus), [Nvidia H100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus).\n",
+ "\n",
+ "# @markdown > | Machine Type | Accelerator Type | Recommended Regions |\n",
+ "# @markdown | ----------- | ----------- | ----------- |\n",
+ "# @markdown | a2-ultragpu-1g | 1 NVIDIA_A100_80GB | us-central1, us-east4, europe-west4, asia-southeast1, us-east4 |\n",
+ "# @markdown | a3-highgpu-8g | 8 NVIDIA_H100_80GB | us-central1, us-west1, europe-west4, asia-southeast1 |\n",
+ "\n",
+ "! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git\n",
"\n",
"import base64\n",
+ "import datetime\n",
+ "import importlib\n",
+ "import io\n",
"import json\n",
"import os\n",
- "import sys\n",
- "from datetime import datetime\n",
- "from io import BytesIO\n",
- "from typing import Dict, List, Union\n",
- "\n",
- "import matplotlib.pyplot as plt\n",
- "import numpy\n",
- "import tensorflow as tf\n",
+ "import subprocess\n",
+ "import uuid\n",
+ "from typing import Any, Dict, List, Union\n",
+ "\n",
"import yaml\n",
"from google.cloud import aiplatform\n",
"from google.protobuf import json_format\n",
"from google.protobuf.struct_pb2 import Value\n",
- "from PIL import Image\n",
+ "\n",
+ "common_util = importlib.import_module(\n",
+ " \"vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util\"\n",
+ ")\n",
+ "\n",
+ "models, endpoints = {}, {}\n",
+ "\n",
"\n",
"# Get the default cloud project id.\n",
"PROJECT_ID = os.environ[\"GOOGLE_CLOUD_PROJECT\"]\n",
"\n",
"# Get the default region for launching jobs.\n",
- "REGION = os.environ[\"GOOGLE_CLOUD_REGION\"]\n",
- "\n",
- "# Only regions prefixed by \"us\", \"asia\", or \"europe\" are supported.\n",
- "REGION_PREFIX = REGION.split(\"-\")[0]\n",
- "assert REGION_PREFIX in (\n",
- " \"us\",\n",
- " \"europe\",\n",
- " \"asia\",\n",
- "), f'{REGION} is not supported. It must be prefixed by \"us\", \"asia\", or \"europe\".'\n",
+ "if not REGION:\n",
+ " REGION = os.environ[\"GOOGLE_CLOUD_REGION\"]\n",
"\n",
"# Enable the Vertex AI API and Compute Engine API, if not already.\n",
+ "print(\"Enabling Vertex AI API and Compute Engine API.\")\n",
"! gcloud services enable aiplatform.googleapis.com compute.googleapis.com\n",
"\n",
"# Cloud Storage bucket for storing the experiment artifacts.\n",
"# A unique GCS bucket will be created for the purpose of this notebook. If you\n",
- "# prefer using your own GCS bucket, please change the value yourself below.\n",
- "now = datetime.now().strftime(\"%Y%m%d%H%M%S\")\n",
- "BUCKET_URI = \"gs://\" # @param {type: \"string\"}\n",
+ "# prefer using your own GCS bucket, change the value yourself below.\n",
+ "now = datetime.datetime.now().strftime(\"%Y%m%d%H%M%S\")\n",
+ "BUCKET_NAME = \"/\".join(BUCKET_URI.split(\"/\")[:3])\n",
"\n",
- "# Create a unique GCS bucket for this notebook, if not specified by the user.\n",
"if BUCKET_URI is None or BUCKET_URI.strip() == \"\" or BUCKET_URI == \"gs://\":\n",
- " BUCKET_URI = f\"gs://{PROJECT_ID}-tmp-{now}\"\n",
- " BUCKET_NAME = BUCKET_URI\n",
+ " BUCKET_URI = f\"gs://{PROJECT_ID}-tmp-{now}-{str(uuid.uuid4())[:4]}\"\n",
+ " BUCKET_NAME = \"/\".join(BUCKET_URI.split(\"/\")[:3])\n",
" ! gsutil mb -l {REGION} {BUCKET_URI}\n",
"else:\n",
" assert BUCKET_URI.startswith(\"gs://\"), \"BUCKET_URI must start with `gs://`.\"\n",
- " BUCKET_NAME = \"/\".join(BUCKET_URI.split(\"/\")[:3])\n",
" shell_output = ! gsutil ls -Lb {BUCKET_NAME} | grep \"Location constraint:\" | sed \"s/Location constraint://\"\n",
" bucket_region = shell_output[0].strip().lower()\n",
" if bucket_region != REGION:\n",
@@ -163,30 +171,40 @@
" \"Bucket region %s is different from notebook region %s\"\n",
" % (bucket_region, REGION)\n",
" )\n",
- "\n",
"print(f\"Using this GCS Bucket: {BUCKET_URI}\")\n",
"\n",
- "# Set up the default SERVICE_ACCOUNT.\n",
- "SERVICE_ACCOUNT = None\n",
+ "STAGING_BUCKET = os.path.join(BUCKET_URI, \"temporal\")\n",
+ "MODEL_BUCKET = os.path.join(BUCKET_URI, \"tfvision_image_classification\")\n",
+ "\n",
+ "\n",
+ "# Initialize Vertex AI API.\n",
+ "print(\"Initializing Vertex AI API.\")\n",
+ "aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)\n",
+ "\n",
+ "# Gets the default SERVICE_ACCOUNT.\n",
"shell_output = ! gcloud projects describe $PROJECT_ID\n",
"project_number = shell_output[-1].split(\":\")[1].strip().replace(\"'\", \"\")\n",
"SERVICE_ACCOUNT = f\"{project_number}-compute@developer.gserviceaccount.com\"\n",
- "\n",
"print(\"Using this default Service Account:\", SERVICE_ACCOUNT)\n",
"\n",
+ "\n",
"# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket\n",
"! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME\n",
"\n",
- "if \"google.colab\" in sys.modules:\n",
- " from google.colab import auth\n",
- "\n",
- " auth.authenticate_user(project_id=PROJECT_ID)\n",
+ "! gcloud config set project $PROJECT_ID\n",
+ "! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role=\"roles/storage.admin\"\n",
+ "! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role=\"roles/aiplatform.user\"\n",
"\n",
- "STAGING_BUCKET = os.path.join(BUCKET_URI, \"temporal\")\n",
- "CHECKPOINT_BUCKET = os.path.join(BUCKET_URI, \"ckpt\")\n",
"CONFIG_DIR = os.path.join(BUCKET_URI, \"config\")\n",
+ "CHECKPOINT_BUCKET = os.path.join(BUCKET_URI, \"ckpt\")\n",
"\n",
- "aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_NAME)\n",
+ "# Only regions prefixed by \"us\", \"asia\", or \"europe\" are supported.\n",
+ "REGION_PREFIX = REGION.split(\"-\")[0]\n",
+ "assert REGION_PREFIX in (\n",
+ " \"us\",\n",
+ " \"europe\",\n",
+ " \"asia\",\n",
+ "), f'{REGION} is not supported. It must be prefixed by \"us\", \"asia\", or \"europe\".'\n",
"\n",
"\n",
"def upload_config_to_gcs(url):\n",
@@ -210,152 +228,8 @@
"# Define constants.\n",
"OBJECTIVE = \"icn\"\n",
"\n",
- "# Data converter constants.\n",
- "DATA_CONVERTER_JOB_PREFIX = \"data_converter\"\n",
- "DATA_CONVERTER_CONTAINER = f\"{REGION_PREFIX}-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/data-converter\"\n",
- "DATA_CONVERTER_MACHINE_TYPE = \"n1-highmem-8\"\n",
- "\n",
- "\n",
- "# Training constants.\n",
- "TRAINING_JOB_PREFIX = \"train\"\n",
- "TRAIN_CONTAINER_URI = f\"{REGION_PREFIX}-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/tfvision-oss\"\n",
- "TRAIN_MACHINE_TYPE = \"n1-highmem-16\"\n",
- "TRAIN_ACCELERATOR_TYPE = \"NVIDIA_TESLA_P100\"\n",
- "TRAIN_NUM_GPU = 1\n",
- "\n",
"# Evaluation constants.\n",
- "EVALUATION_METRIC = \"accuracy\"\n",
- "\n",
- "# Export constants.\n",
- "EXPORT_JOB_PREFIX = \"export\"\n",
- "EXPORT_CONTAINER_URI = f\"{REGION_PREFIX}-docker.pkg.dev/vertex-ai-restricted/vertex-vision-model-garden-dockers/tfvision-model-export\"\n",
- "EXPORT_MACHINE_TYPE = \"n1-highmem-8\"\n",
- "\n",
- "# Prediction constants.\n",
- "# You can deploy models with\n",
- "# pre-build-dockers: https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers.\n",
- "# and optimized tensorflow runtime dockers: https://cloud.google.com/vertex-ai/docs/predictions/optimized-tensorflow-runtime.\n",
- "# The example in this notebook uses optimized tensorflow runtime dockers.\n",
- "# You can adjust accelerator types and machine types to get faster predictions.\n",
- "PREDICTION_CONTAINER_URI = f\"{REGION_PREFIX}-docker.pkg.dev/vertex-ai-restricted/prediction/tf_opt-gpu.2-11:latest\"\n",
- "SERVING_CONTAINER_ARGS = [\"--allow_precompilation\", \"--allow_compression\"]\n",
- "PREDICTION_ACCELERATOR_TYPE = \"NVIDIA_TESLA_T4\"\n",
- "PREDICTION_MACHINE_TYPE = \"n1-standard-4\"\n",
- "UPLOAD_JOB_PREFIX = \"upload\"\n",
- "DEPLOY_JOB_PREFIX = \"deploy\"\n",
- "\n",
- "\n",
- "# Define common functions.\n",
- "def get_job_name_with_datetime(prefix: str):\n",
- " return prefix + datetime.now().strftime(\"_%Y%m%d_%H%M%S\")\n",
- "\n",
- "\n",
- "def predict_custom_trained_model(\n",
- " project: str,\n",
- " endpoint_id: str,\n",
- " instances: Union[Dict, List[Dict]],\n",
- " location: str = \"us-central1\",\n",
- "):\n",
- " # The AI Platform services require regional API endpoints.\n",
- " client_options = {\"api_endpoint\": f\"{location}-aiplatform.googleapis.com\"}\n",
- " # Initialize client that will be used to create and send requests.\n",
- " # This client only needs to be created once, and can be reused for multiple requests.\n",
- " client = aiplatform.gapic.PredictionServiceClient(client_options=client_options)\n",
- " parameters_dict = {}\n",
- " parameters = json_format.ParseDict(parameters_dict, Value())\n",
- " endpoint = client.endpoint_path(\n",
- " project=project, location=location, endpoint=endpoint_id\n",
- " )\n",
- " response = client.predict(\n",
- " endpoint=endpoint, instances=instances, parameters=parameters\n",
- " )\n",
- " return response.predictions, response.deployed_model_id\n",
- "\n",
- "\n",
- "def load_img(path):\n",
- " img = tf.io.read_file(path)\n",
- " img = tf.image.decode_jpeg(img, channels=3)\n",
- " return Image.fromarray(numpy.uint8(img)).convert(\"RGB\")\n",
- "\n",
- "\n",
- "def display_image(image):\n",
- " _ = plt.figure(figsize=(20, 15))\n",
- " plt.grid(False)\n",
- " plt.imshow(image)\n",
- "\n",
- "\n",
- "def get_prediction_instances(test_filepath, new_width=-1):\n",
- " if new_width <= 0:\n",
- " with tf.io.gfile.GFile(test_filepath, \"rb\") as input_file:\n",
- " encoded_string = base64.b64encode(input_file.read()).decode(\"utf-8\")\n",
- " else:\n",
- " img = load_img(test_filepath)\n",
- " width, height = img.size\n",
- " print(\"original input image size: \", width, \" , \", height)\n",
- " new_height = int(height * new_width / width)\n",
- " new_img = img.resize((new_width, new_height))\n",
- " print(\"resized input image size: \", new_width, \" , \", new_height)\n",
- " buffered = BytesIO()\n",
- " new_img.save(buffered, format=\"JPEG\")\n",
- " encoded_string = base64.b64encode(buffered.getvalue()).decode(\"utf-8\")\n",
- "\n",
- " instances = [\n",
- " {\n",
- " \"encoded_image\": {\"b64\": encoded_string},\n",
- " }\n",
- " ]\n",
- " return instances\n",
- "\n",
- "\n",
- "def get_label_map(label_map_yaml_filepath):\n",
- " with tf.io.gfile.GFile(label_map_yaml_filepath, \"rb\") as input_file:\n",
- " label_map = yaml.safe_load(input_file.read())\n",
- " return label_map\n",
- "\n",
- "\n",
- "def get_best_trial(model_dir, max_trial_count, evaluation_metric):\n",
- " best_trial_dir = \"\"\n",
- " best_trial_evaluation_results = {}\n",
- " best_performance = -1\n",
- "\n",
- " for i in range(max_trial_count):\n",
- " current_trial = i + 1\n",
- " current_trial_dir = os.path.join(model_dir, \"trial_\" + str(current_trial))\n",
- " current_trial_best_ckpt_dir = os.path.join(current_trial_dir, \"best_ckpt\")\n",
- " current_trial_best_ckpt_evaluation_filepath = os.path.join(\n",
- " current_trial_best_ckpt_dir, \"info.json\"\n",
- " )\n",
- " with tf.io.gfile.GFile(current_trial_best_ckpt_evaluation_filepath, \"rb\") as f:\n",
- " eval_metric_results = json.load(f)\n",
- " current_performance = eval_metric_results[evaluation_metric]\n",
- " if current_performance > best_performance:\n",
- " best_performance = current_performance\n",
- " best_trial_dir = current_trial_dir\n",
- " best_trial_evaluation_results = eval_metric_results\n",
- " return best_trial_dir, best_trial_evaluation_results\n",
- "\n",
- "\n",
- "def upload_checkpoint_to_gcs(checkpoint_url):\n",
- " filename = os.path.basename(checkpoint_url)\n",
- " checkpoint_name = filename.replace(\".tar.gz\", \"\")\n",
- " print(\"Download checkpoint from\", checkpoint_url, \"and store to\", CHECKPOINT_BUCKET)\n",
- " ! wget $checkpoint_url -O $filename\n",
- " ! mkdir -p $checkpoint_name\n",
- " ! tar -xvzf $filename -C $checkpoint_name\n",
- "\n",
- " # Search for relative path to the checkpoint.\n",
- " checkpoint_path = None\n",
- " for root, dirs, files in os.walk(checkpoint_name):\n",
- " for file in files:\n",
- " if file.endswith(\".index\"):\n",
- " checkpoint_path = os.path.join(root, os.path.splitext(file)[0])\n",
- " checkpoint_path = os.path.relpath(checkpoint_path, checkpoint_name)\n",
- " break\n",
- "\n",
- " ! gsutil cp -r $checkpoint_name $CHECKPOINT_BUCKET/\n",
- " checkpoint_uri = os.path.join(CHECKPOINT_BUCKET, checkpoint_name, checkpoint_path)\n",
- " print(\"Checkpoint uploaded to\", checkpoint_uri)\n",
- " return checkpoint_uri"
+ "EVALUATION_METRIC = \"accuracy\""
]
},
{
@@ -364,11 +238,11 @@
"id": "b2356e904526"
},
"source": [
- "## Train new models\n",
+ "## Training\n",
"\n",
"This section trains model with the following steps:\n",
- "1. Convert input data to training formats.\n",
- "2. Create hyperparameter tuning jobs to train new models.\n",
+ "1. Prepare data by converting the input data into training format.\n",
+ "2. Run hyperparameter tuning jobs to train new models.\n",
"3. Find and export best models."
]
},
@@ -397,7 +271,12 @@
"\n",
"from google.cloud.aiplatform import hyperparameter_tuning as hpt\n",
"\n",
- "data_converter_job_name = get_job_name_with_datetime(\n",
+ "# Data converter constants.\n",
+ "DATA_CONVERTER_JOB_PREFIX = \"data_converter\"\n",
+ "DATA_CONVERTER_CONTAINER = f\"{REGION_PREFIX}-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/data-converter\"\n",
+ "DATA_CONVERTER_MACHINE_TYPE = \"n1-highmem-8\"\n",
+ "\n",
+ "data_converter_job_name = common_util.get_job_name_with_datetime(\n",
" DATA_CONVERTER_JOB_PREFIX + \"_\" + OBJECTIVE\n",
")\n",
"\n",
@@ -470,12 +349,21 @@
"# input_train_data_path = ''\n",
"# input_validation_data_path = ''\n",
"\n",
+ "# Training constants.\n",
+ "TRAINING_JOB_PREFIX = \"train\"\n",
+ "TRAIN_CONTAINER_URI = f\"{REGION_PREFIX}-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/tfvision-oss\"\n",
+ "TRAIN_MACHINE_TYPE = \"g2-standard-4\"\n",
+ "TRAIN_ACCELERATOR_TYPE = \"NVIDIA_L4\"\n",
+ "TRAIN_NUM_GPU = 1\n",
+ "\n",
"experiment = \"Efficientnetv2-m\" # @param [\"Efficientnetv2-m\",\"ViT-ti16\",\"ViT-s16\",\"ViT-b16\",\"ViT-l16\", \"MaxViT\"]\n",
"\n",
- "train_job_name = get_job_name_with_datetime(TRAINING_JOB_PREFIX + \"_\" + OBJECTIVE)\n",
+ "train_job_name = common_util.get_job_name_with_datetime(\n",
+ " TRAINING_JOB_PREFIX + \"_\" + OBJECTIVE\n",
+ ")\n",
"model_dir = os.path.join(BUCKET_URI, train_job_name)\n",
"\n",
- "# The arguments here are mainly for test purposes. Please update them\n",
+ "# The arguments here are mainly for test purposes. Kindly update them\n",
"# to get better performances.\n",
"common_args = {\n",
" \"input_train_data_path\": input_train_data_path,\n",
@@ -543,6 +431,30 @@
"}\n",
"experiment_container_args = experiment_container_args_dict[experiment]\n",
"\n",
+ "\n",
+ "def upload_checkpoint_to_gcs(checkpoint_url):\n",
+ " filename = os.path.basename(checkpoint_url)\n",
+ " checkpoint_name = filename.replace(\".tar.gz\", \"\")\n",
+ " print(\"Download checkpoint from\", checkpoint_url, \"and store to\", CHECKPOINT_BUCKET)\n",
+ " ! wget $checkpoint_url -O $filename\n",
+ " ! mkdir -p $checkpoint_name\n",
+ " ! tar -xvzf $filename -C $checkpoint_name\n",
+ "\n",
+ " # Search for relative path to the checkpoint.\n",
+ " checkpoint_path = None\n",
+ " for root, dirs, files in os.walk(checkpoint_name):\n",
+ " for file in files:\n",
+ " if file.endswith(\".index\"):\n",
+ " checkpoint_path = os.path.join(root, os.path.splitext(file)[0])\n",
+ " checkpoint_path = os.path.relpath(checkpoint_path, checkpoint_name)\n",
+ " break\n",
+ "\n",
+ " ! gsutil cp -r $checkpoint_name $CHECKPOINT_BUCKET/\n",
+ " checkpoint_uri = os.path.join(CHECKPOINT_BUCKET, checkpoint_name, checkpoint_path)\n",
+ " print(\"Checkpoint uploaded to\", checkpoint_uri)\n",
+ " return checkpoint_uri\n",
+ "\n",
+ "\n",
"# Copy checkpoint to GCS bucket if specified.\n",
"init_checkpoint = experiment_container_args.get(\"init_checkpoint\")\n",
"if init_checkpoint:\n",
@@ -586,6 +498,16 @@
"\n",
"print(worker_pool_specs, metric_spec, parameter_spec)\n",
"\n",
+ "# Check quota.\n",
+ "common_util.check_quota(\n",
+ " project_id=PROJECT_ID,\n",
+ " region=REGION,\n",
+ " accelerator_type=TRAIN_ACCELERATOR_TYPE,\n",
+ " accelerator_count=1,\n",
+ " is_for_training=True,\n",
+ ")\n",
+ "\n",
+ "\n",
"# Run the hyperparameter job.\n",
"train_custom_job = aiplatform.CustomJob(\n",
" display_name=train_job_name,\n",
@@ -622,12 +544,41 @@
"source": [
"# @title Export best models as TF Saved Model format\n",
"\n",
- "# @markdown This section exports best models.\n",
- "\n",
- "# @markdown The exported model can be used in the next section \"Test trained models\" for online prediction.\n",
+ "# @markdown This section exports best model.\n",
"\n",
"# Export models from TF checkpoints to TF saved model format.\n",
"# model_dir is from the section above.\n",
+ "\n",
+ "# Export constants.\n",
+ "EXPORT_JOB_PREFIX = \"export\"\n",
+ "EXPORT_CONTAINER_URI = f\"{REGION_PREFIX}-docker.pkg.dev/vertex-ai-restricted/vertex-vision-model-garden-dockers/tfvision-model-export\"\n",
+ "EXPORT_MACHINE_TYPE = \"n1-highmem-8\"\n",
+ "\n",
+ "\n",
+ "def get_best_trial(model_dir, max_trial_count, evaluation_metric):\n",
+ " best_trial_dir = \"\"\n",
+ " best_trial_evaluation_results = {}\n",
+ " best_performance = -1\n",
+ "\n",
+ " for i in range(max_trial_count):\n",
+ " current_trial = i + 1\n",
+ " current_trial_dir = os.path.join(model_dir, \"trial_\" + str(current_trial))\n",
+ " current_trial_best_ckpt_dir = os.path.join(current_trial_dir, \"best_ckpt\")\n",
+ " current_trial_best_ckpt_evaluation_filepath = os.path.join(\n",
+ " current_trial_best_ckpt_dir, \"info.json\"\n",
+ " )\n",
+ " ! gsutil cp $current_trial_best_ckpt_evaluation_filepath .\n",
+ " with open(\"info.json\", \"r\") as f:\n",
+ " eval_metric_results = json.load(f)\n",
+ " current_performance = eval_metric_results[evaluation_metric]\n",
+ " if current_performance > best_performance:\n",
+ " best_performance = current_performance\n",
+ " best_trial_dir = current_trial_dir\n",
+ " best_trial_evaluation_results = eval_metric_results\n",
+ " print(\"best_trial_dir: \", current_trial_best_ckpt_evaluation_filepath)\n",
+ " return best_trial_dir, best_trial_evaluation_results\n",
+ "\n",
+ "\n",
"best_trial_dir, best_trial_evaluation_results = get_best_trial(\n",
" model_dir, MAX_TRIAL_COUNT, EVALUATION_METRIC\n",
")\n",
@@ -655,7 +606,9 @@
" }\n",
"]\n",
"\n",
- "model_export_name = get_job_name_with_datetime(EXPORT_JOB_PREFIX + \"_\" + OBJECTIVE)\n",
+ "model_export_name = common_util.get_job_name_with_datetime(\n",
+ " EXPORT_JOB_PREFIX + \"_\" + OBJECTIVE\n",
+ ")\n",
"model_export_custom_job = aiplatform.CustomJob(\n",
" display_name=model_export_name,\n",
" project=PROJECT_ID,\n",
@@ -674,7 +627,7 @@
"id": "c68112dc90b9"
},
"source": [
- "## Test trained models"
+ "## Deployment"
]
},
{
@@ -690,15 +643,24 @@
"\n",
"# @markdown This section uploads and deploy models to model registry for online prediction. This example uses the exported best model from \"Train new models\" section.\n",
"\n",
+ "PREDICTION_CONTAINER_URI = f\"{REGION_PREFIX}-docker.pkg.dev/vertex-ai-restricted/prediction/tf_opt-gpu.2-11:latest\"\n",
+ "SERVING_CONTAINER_ARGS = [\"--allow_precompilation\", \"--allow_compression\"]\n",
+ "PREDICTION_ACCELERATOR_TYPE = \"NVIDIA_L4\"\n",
+ "PREDICTION_MACHINE_TYPE = \"g2-standard-12\"\n",
+ "UPLOAD_JOB_PREFIX = \"upload\"\n",
+ "DEPLOY_JOB_PREFIX = \"deploy\"\n",
+ "\n",
"trained_model_dir = os.path.join(model_dir, \"best_model/saved_model\")\n",
- "upload_job_name = get_job_name_with_datetime(UPLOAD_JOB_PREFIX + \"_\" + OBJECTIVE)\n",
+ "upload_job_name = common_util.get_job_name_with_datetime(\n",
+ " UPLOAD_JOB_PREFIX + \"_\" + OBJECTIVE\n",
+ ")\n",
"\n",
"serving_env = {\n",
" \"MODEL_ID\": \"tensorflow-hub-efficientnetv2\",\n",
" \"DEPLOY_SOURCE\": \"notebook\",\n",
"}\n",
"\n",
- "model = aiplatform.Model.upload(\n",
+ "models[\"model_icn\"] = aiplatform.Model.upload(\n",
" display_name=upload_job_name,\n",
" artifact_uri=trained_model_dir,\n",
" serving_container_image_uri=PREDICTION_CONTAINER_URI,\n",
@@ -706,14 +668,24 @@
" serving_container_environment_variables=serving_env,\n",
")\n",
"\n",
- "model.wait()\n",
+ "models[\"model_icn\"].wait()\n",
"\n",
"print(\"The uploaded model name is: \", upload_job_name)\n",
"\n",
- "deploy_model_name = get_job_name_with_datetime(DEPLOY_JOB_PREFIX + \"_\" + OBJECTIVE)\n",
+ "deploy_model_name = common_util.get_job_name_with_datetime(\n",
+ " DEPLOY_JOB_PREFIX + \"_\" + OBJECTIVE\n",
+ ")\n",
"print(\"The deployed job name is: \", deploy_model_name)\n",
"\n",
- "endpoint = model.deploy(\n",
+ "common_util.check_quota(\n",
+ " project_id=PROJECT_ID,\n",
+ " region=REGION,\n",
+ " accelerator_type=PREDICTION_ACCELERATOR_TYPE,\n",
+ " accelerator_count=1,\n",
+ " is_for_training=False,\n",
+ ")\n",
+ "\n",
+ "endpoints[\"endpoint_icn\"] = models[\"model_icn\"].deploy(\n",
" deployed_model_display_name=deploy_model_name,\n",
" machine_type=PREDICTION_MACHINE_TYPE,\n",
" traffic_split={\"0\": 100},\n",
@@ -723,10 +695,19 @@
" max_replica_count=1,\n",
")\n",
"\n",
- "endpoint_id = endpoint.name\n",
+ "endpoint_id = endpoints[\"endpoint_icn\"].name\n",
"print(\"endpoint id is: \", endpoint_id)"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "1ULa2VTQqWfo"
+ },
+ "source": [
+ "## Predict"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
@@ -743,14 +724,94 @@
"# @markdown `test_filepath`: gcs uri to the test image file. The uri should start with \"gs://\".\n",
"\n",
"# endpoint_id was generated in the section above (`Upload and deploy models`).\n",
- "endpoint_id = endpoint.name\n",
+ "endpoint_id = endpoints[\"endpoint_icn\"].name\n",
"\n",
"test_filepath = \"gs://cloud-samples-data/ai-platform/flowers/roses/9423755543_edb35141a3_n.jpg\" # @param {type:\"string\"} {isTemplate:true}\n",
+ "\n",
+ "\n",
+ "def get_label_map(label_map_yaml_filepath: str) -> Dict[int, str]:\n",
+ " \"\"\"Returns class id to label mapping given a filepath to the label map.\n",
+ "\n",
+ " Args:\n",
+ " label_map_yaml_filepath: A string of label map yaml file path.\n",
+ "\n",
+ " Returns:\n",
+ " A dictionary of class id to label mapping.\n",
+ " \"\"\"\n",
+ " label_map_filename = os.path.basename(label_map_yaml_filepath)\n",
+ " subprocess.check_output(\n",
+ " [\"gsutil\", \"cp\", label_map_yaml_filepath, label_map_filename],\n",
+ " stderr=subprocess.STDOUT,\n",
+ " )\n",
+ " with open(label_map_filename, \"rb\") as input_file:\n",
+ " label_map = yaml.safe_load(input_file.read())[\"label_map\"]\n",
+ " return label_map\n",
+ "\n",
+ "\n",
+ "def get_prediction_instances(test_filepath: str, new_width: int = -1) -> Any:\n",
+ " \"\"\"Generate instance from image path to pass to Vertex AI Endpoint for prediction.\n",
+ "\n",
+ " Args:\n",
+ " test_filepath: A string of test image path.\n",
+ " new_width: An integer of new image width.\n",
+ "\n",
+ " Returns:\n",
+ " A list of instances.\n",
+ " \"\"\"\n",
+ " if new_width <= 0:\n",
+ " test_file = os.path.basename(test_filepath)\n",
+ " subprocess.check_output(\n",
+ " [\"gsutil\", \"cp\", test_filepath, test_file], stderr=subprocess.STDOUT\n",
+ " )\n",
+ " with open(test_file, \"rb\") as input_file:\n",
+ " encoded_string = base64.b64encode(input_file.read()).decode(\"utf-8\")\n",
+ " else:\n",
+ " img = common_util.load_img(test_filepath)\n",
+ " width, height = img.size\n",
+ " print(\"original input image size: \", width, \" , \", height)\n",
+ " new_height = int(height * new_width / width)\n",
+ " new_img = img.resize((new_width, new_height))\n",
+ " print(\"resized input image size: \", new_width, \" , \", new_height)\n",
+ " buffered = io.BytesIO()\n",
+ " new_img.save(buffered, format=\"JPEG\")\n",
+ " encoded_string = base64.b64encode(buffered.getvalue()).decode(\"utf-8\")\n",
+ "\n",
+ " instances = [\n",
+ " {\n",
+ " \"encoded_image\": {\"b64\": encoded_string},\n",
+ " }\n",
+ " ]\n",
+ " return instances\n",
+ "\n",
+ "\n",
"# If the input image is too large, we will resize it for prediction.\n",
"instances = get_prediction_instances(test_filepath, new_width=1000)\n",
"\n",
"# The label map file was generated from the section above (`Convert input data for training`).\n",
- "label_map = get_label_map(label_map_path)[\"label_map\"]\n",
+ "label_map = get_label_map(label_map_path)\n",
+ "\n",
+ "\n",
+ "def predict_custom_trained_model(\n",
+ " project: str,\n",
+ " endpoint_id: str,\n",
+ " instances: Union[Dict, List[Dict]],\n",
+ " location: str = \"us-central1\",\n",
+ "):\n",
+ " # The AI Platform services require regional API endpoints.\n",
+ " client_options = {\"api_endpoint\": f\"{location}-aiplatform.googleapis.com\"}\n",
+ " # Initialize client that will be used to create and send requests.\n",
+ " # This client only needs to be created once, and can be reused for multiple requests.\n",
+ " client = aiplatform.gapic.PredictionServiceClient(client_options=client_options)\n",
+ " parameters_dict = {}\n",
+ " parameters = json_format.ParseDict(parameters_dict, Value())\n",
+ " endpoint = client.endpoint_path(\n",
+ " project=project, location=location, endpoint=endpoint_id\n",
+ " )\n",
+ " response = client.predict(\n",
+ " endpoint=endpoint, instances=instances, parameters=parameters\n",
+ " )\n",
+ " return response.predictions, response.deployed_model_id\n",
+ "\n",
"\n",
"predictions, _ = predict_custom_trained_model(\n",
" project=PROJECT_ID, location=REGION, endpoint_id=endpoint_id, instances=instances\n",
@@ -761,8 +822,8 @@
"max_index = probs.index(max_prob)\n",
"print(\"The test image: \", test_filepath)\n",
"print(\"max_prob: \", max_prob, \", for label: \", label_map[max_index])\n",
- "img = load_img(test_filepath)\n",
- "display_image(img)"
+ "img = common_util.load_img(test_filepath)\n",
+ "common_util.display_image(img)"
]
},
{
@@ -771,7 +832,7 @@
"id": "f72e754f2802"
},
"source": [
- "## Clean up"
+ "## Clean up resources"
]
},
{
@@ -783,20 +844,7 @@
},
"outputs": [],
"source": [
- "# @title Clean up resources\n",
- "\n",
- "# @markdown Delete the experiment models and endpoints to recycle the resources\n",
- "# @markdown and avoid unnecessary continouous charges that may incur.\n",
- "\n",
- "try:\n",
- " # Undeploy model and delete endpoint.\n",
- " endpoint.delete(force=True)\n",
- "\n",
- " # Delete model.\n",
- " model.delete()\n",
- "\n",
- "except Exception as e:\n",
- " print(e)\n",
+ "# @title Clean up training jobs, models, endpoints and buckets\n",
"\n",
"try:\n",
" # Delete custom and hpt jobs.\n",
@@ -811,7 +859,17 @@
"except Exception as e:\n",
" print(e)\n",
"\n",
- "# Delete bucket.\n",
+ "# @markdown Delete the experiment models and endpoints to recycle the resources\n",
+ "# @markdown and avoid unnecessary continuous charges that may incur.\n",
+ "\n",
+ "# Undeploy model and delete endpoint.\n",
+ "for endpoint in endpoints.values():\n",
+ " endpoint.delete(force=True)\n",
+ "\n",
+ "# Delete models.\n",
+ "for model in models.values():\n",
+ " model.delete()\n",
+ "\n",
"delete_bucket = False # @param {type:\"boolean\"}\n",
"if delete_bucket:\n",
" ! gsutil -m rm -r $BUCKET_NAME"