diff --git a/notebooks/community/model_garden/model_garden_tfvision_image_classification.ipynb b/notebooks/community/model_garden/model_garden_tfvision_image_classification.ipynb index 3a3e1a96f..1330d7cfe 100644 --- a/notebooks/community/model_garden/model_garden_tfvision_image_classification.ipynb +++ b/notebooks/community/model_garden/model_garden_tfvision_image_classification.ipynb @@ -4,11 +4,12 @@ "cell_type": "code", "execution_count": null, "metadata": { + "cellView": "form", "id": "ur8xi4C7S06n" }, "outputs": [], "source": [ - "# Copyright 2023 Google LLC\n", + "# Copyright 2024 Google LLC\n", "#\n", "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", @@ -31,19 +32,18 @@ "source": [ "# Vertex AI Model Garden TFVision With Image Classification\n", "\n", - "\n", - "
\n", + "\n", + " \n", - " \n", - "
\n", " \n", " \"Google
Run in Colab Enterprise\n", "
\n", "
\n", + " \n", " \n", - " \"GitHub
\n", - " View on GitHub\n", + " \"GitHub
View on GitHub\n", "
\n", "
" + "
" ] }, { @@ -77,11 +77,7 @@ "* Vertex AI\n", "* Cloud Storage\n", "\n", - "Learn about [Vertex AI\n", - "pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage\n", - "pricing](https://cloud.google.com/storage/pricing), and use the [Pricing\n", - "Calculator](https://cloud.google.com/products/calculator/)\n", - "to generate a cost estimate based on your projected usage." + "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.\n" ] }, { @@ -106,56 +102,68 @@ "\n", "# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", "\n", - "# @markdown 2. [Optional] [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as where the notebook was launched. Note that a multi-region bucket (eg. \"us\") is not considered a match for a single region covered by the multi-region range (eg. \"us-central1\"). If not set, a unique GCS bucket will be created instead.\n", + "# @markdown 2. **[Optional]** [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as where the notebook was launched. Note that a multi-region bucket (eg. \"us\") is not considered a match for a single region covered by the multi-region range (eg. \"us-central1\"). If not set, a unique GCS bucket will be created instead.\n", + "\n", + "BUCKET_URI = \"gs://\" # @param {type:\"string\"}\n", + "\n", + "# @markdown 3. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.\n", + "\n", + "REGION = \"\" # @param {type:\"string\"}\n", + "\n", + "# @markdown 4. If you want to run predictions with A100 80GB or H100 GPUs, we recommend using the regions listed below. **NOTE:** Make sure you have associated quota in selected regions. Click the links to see your current quota for each GPU type: [Nvidia A100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_a100_80gb_gpus), [Nvidia H100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus).\n", + "\n", + "# @markdown > | Machine Type | Accelerator Type | Recommended Regions |\n", + "# @markdown | ----------- | ----------- | ----------- |\n", + "# @markdown | a2-ultragpu-1g | 1 NVIDIA_A100_80GB | us-central1, us-east4, europe-west4, asia-southeast1, us-east4 |\n", + "# @markdown | a3-highgpu-8g | 8 NVIDIA_H100_80GB | us-central1, us-west1, europe-west4, asia-southeast1 |\n", + "\n", + "! 
git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git\n", "\n", "import base64\n", + "import datetime\n", + "import importlib\n", + "import io\n", "import json\n", "import os\n", - "import sys\n", - "from datetime import datetime\n", - "from io import BytesIO\n", - "from typing import Dict, List, Union\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import numpy\n", - "import tensorflow as tf\n", + "import subprocess\n", + "import uuid\n", + "from typing import Any, Dict, List, Union\n", + "\n", "import yaml\n", "from google.cloud import aiplatform\n", "from google.protobuf import json_format\n", "from google.protobuf.struct_pb2 import Value\n", - "from PIL import Image\n", + "\n", + "common_util = importlib.import_module(\n", + " \"vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util\"\n", + ")\n", + "\n", + "models, endpoints = {}, {}\n", + "\n", "\n", "# Get the default cloud project id.\n", "PROJECT_ID = os.environ[\"GOOGLE_CLOUD_PROJECT\"]\n", "\n", "# Get the default region for launching jobs.\n", - "REGION = os.environ[\"GOOGLE_CLOUD_REGION\"]\n", - "\n", - "# Only regions prefixed by \"us\", \"asia\", or \"europe\" are supported.\n", - "REGION_PREFIX = REGION.split(\"-\")[0]\n", - "assert REGION_PREFIX in (\n", - " \"us\",\n", - " \"europe\",\n", - " \"asia\",\n", - "), f'{REGION} is not supported. It must be prefixed by \"us\", \"asia\", or \"europe\".'\n", + "if not REGION:\n", + " REGION = os.environ[\"GOOGLE_CLOUD_REGION\"]\n", "\n", "# Enable the Vertex AI API and Compute Engine API, if not already.\n", + "print(\"Enabling Vertex AI API and Compute Engine API.\")\n", "! gcloud services enable aiplatform.googleapis.com compute.googleapis.com\n", "\n", "# Cloud Storage bucket for storing the experiment artifacts.\n", "# A unique GCS bucket will be created for the purpose of this notebook. If you\n", - "# prefer using your own GCS bucket, please change the value yourself below.\n", - "now = datetime.now().strftime(\"%Y%m%d%H%M%S\")\n", - "BUCKET_URI = \"gs://\" # @param {type: \"string\"}\n", + "# prefer using your own GCS bucket, change the value yourself below.\n", + "now = datetime.datetime.now().strftime(\"%Y%m%d%H%M%S\")\n", + "BUCKET_NAME = \"/\".join(BUCKET_URI.split(\"/\")[:3])\n", "\n", - "# Create a unique GCS bucket for this notebook, if not specified by the user.\n", "if BUCKET_URI is None or BUCKET_URI.strip() == \"\" or BUCKET_URI == \"gs://\":\n", - " BUCKET_URI = f\"gs://{PROJECT_ID}-tmp-{now}\"\n", - " BUCKET_NAME = BUCKET_URI\n", + " BUCKET_URI = f\"gs://{PROJECT_ID}-tmp-{now}-{str(uuid.uuid4())[:4]}\"\n", + " BUCKET_NAME = \"/\".join(BUCKET_URI.split(\"/\")[:3])\n", " ! gsutil mb -l {REGION} {BUCKET_URI}\n", "else:\n", " assert BUCKET_URI.startswith(\"gs://\"), \"BUCKET_URI must start with `gs://`.\"\n", - " BUCKET_NAME = \"/\".join(BUCKET_URI.split(\"/\")[:3])\n", " shell_output = ! 
gsutil ls -Lb {BUCKET_NAME} | grep \"Location constraint:\" | sed \"s/Location constraint://\"\n", " bucket_region = shell_output[0].strip().lower()\n", " if bucket_region != REGION:\n", @@ -163,30 +171,40 @@ " \"Bucket region %s is different from notebook region %s\"\n", " % (bucket_region, REGION)\n", " )\n", - "\n", "print(f\"Using this GCS Bucket: {BUCKET_URI}\")\n", "\n", - "# Set up the default SERVICE_ACCOUNT.\n", - "SERVICE_ACCOUNT = None\n", + "STAGING_BUCKET = os.path.join(BUCKET_URI, \"temporal\")\n", + "MODEL_BUCKET = os.path.join(BUCKET_URI, \"tfvision_image_classification\")\n", + "\n", + "\n", + "# Initialize Vertex AI API.\n", + "print(\"Initializing Vertex AI API.\")\n", + "aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)\n", + "\n", + "# Gets the default SERVICE_ACCOUNT.\n", "shell_output = ! gcloud projects describe $PROJECT_ID\n", "project_number = shell_output[-1].split(\":\")[1].strip().replace(\"'\", \"\")\n", "SERVICE_ACCOUNT = f\"{project_number}-compute@developer.gserviceaccount.com\"\n", - "\n", "print(\"Using this default Service Account:\", SERVICE_ACCOUNT)\n", "\n", + "\n", "# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket\n", "! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME\n", "\n", - "if \"google.colab\" in sys.modules:\n", - " from google.colab import auth\n", - "\n", - " auth.authenticate_user(project_id=PROJECT_ID)\n", + "! gcloud config set project $PROJECT_ID\n", + "! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role=\"roles/storage.admin\"\n", + "! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role=\"roles/aiplatform.user\"\n", "\n", - "STAGING_BUCKET = os.path.join(BUCKET_URI, \"temporal\")\n", - "CHECKPOINT_BUCKET = os.path.join(BUCKET_URI, \"ckpt\")\n", "CONFIG_DIR = os.path.join(BUCKET_URI, \"config\")\n", + "CHECKPOINT_BUCKET = os.path.join(BUCKET_URI, \"ckpt\")\n", "\n", - "aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_NAME)\n", + "# Only regions prefixed by \"us\", \"asia\", or \"europe\" are supported.\n", + "REGION_PREFIX = REGION.split(\"-\")[0]\n", + "assert REGION_PREFIX in (\n", + " \"us\",\n", + " \"europe\",\n", + " \"asia\",\n", + "), f'{REGION} is not supported. 
It must be prefixed by \"us\", \"asia\", or \"europe\".'\n", "\n", "\n", "def upload_config_to_gcs(url):\n", @@ -210,152 +228,8 @@ "# Define constants.\n", "OBJECTIVE = \"icn\"\n", "\n", - "# Data converter constants.\n", - "DATA_CONVERTER_JOB_PREFIX = \"data_converter\"\n", - "DATA_CONVERTER_CONTAINER = f\"{REGION_PREFIX}-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/data-converter\"\n", - "DATA_CONVERTER_MACHINE_TYPE = \"n1-highmem-8\"\n", - "\n", - "\n", - "# Training constants.\n", - "TRAINING_JOB_PREFIX = \"train\"\n", - "TRAIN_CONTAINER_URI = f\"{REGION_PREFIX}-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/tfvision-oss\"\n", - "TRAIN_MACHINE_TYPE = \"n1-highmem-16\"\n", - "TRAIN_ACCELERATOR_TYPE = \"NVIDIA_TESLA_P100\"\n", - "TRAIN_NUM_GPU = 1\n", - "\n", "# Evaluation constants.\n", - "EVALUATION_METRIC = \"accuracy\"\n", - "\n", - "# Export constants.\n", - "EXPORT_JOB_PREFIX = \"export\"\n", - "EXPORT_CONTAINER_URI = f\"{REGION_PREFIX}-docker.pkg.dev/vertex-ai-restricted/vertex-vision-model-garden-dockers/tfvision-model-export\"\n", - "EXPORT_MACHINE_TYPE = \"n1-highmem-8\"\n", - "\n", - "# Prediction constants.\n", - "# You can deploy models with\n", - "# pre-build-dockers: https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers.\n", - "# and optimized tensorflow runtime dockers: https://cloud.google.com/vertex-ai/docs/predictions/optimized-tensorflow-runtime.\n", - "# The example in this notebook uses optimized tensorflow runtime dockers.\n", - "# You can adjust accelerator types and machine types to get faster predictions.\n", - "PREDICTION_CONTAINER_URI = f\"{REGION_PREFIX}-docker.pkg.dev/vertex-ai-restricted/prediction/tf_opt-gpu.2-11:latest\"\n", - "SERVING_CONTAINER_ARGS = [\"--allow_precompilation\", \"--allow_compression\"]\n", - "PREDICTION_ACCELERATOR_TYPE = \"NVIDIA_TESLA_T4\"\n", - "PREDICTION_MACHINE_TYPE = \"n1-standard-4\"\n", - "UPLOAD_JOB_PREFIX = \"upload\"\n", - "DEPLOY_JOB_PREFIX = \"deploy\"\n", - "\n", - "\n", - "# Define common functions.\n", - "def get_job_name_with_datetime(prefix: str):\n", - " return prefix + datetime.now().strftime(\"_%Y%m%d_%H%M%S\")\n", - "\n", - "\n", - "def predict_custom_trained_model(\n", - " project: str,\n", - " endpoint_id: str,\n", - " instances: Union[Dict, List[Dict]],\n", - " location: str = \"us-central1\",\n", - "):\n", - " # The AI Platform services require regional API endpoints.\n", - " client_options = {\"api_endpoint\": f\"{location}-aiplatform.googleapis.com\"}\n", - " # Initialize client that will be used to create and send requests.\n", - " # This client only needs to be created once, and can be reused for multiple requests.\n", - " client = aiplatform.gapic.PredictionServiceClient(client_options=client_options)\n", - " parameters_dict = {}\n", - " parameters = json_format.ParseDict(parameters_dict, Value())\n", - " endpoint = client.endpoint_path(\n", - " project=project, location=location, endpoint=endpoint_id\n", - " )\n", - " response = client.predict(\n", - " endpoint=endpoint, instances=instances, parameters=parameters\n", - " )\n", - " return response.predictions, response.deployed_model_id\n", - "\n", - "\n", - "def load_img(path):\n", - " img = tf.io.read_file(path)\n", - " img = tf.image.decode_jpeg(img, channels=3)\n", - " return Image.fromarray(numpy.uint8(img)).convert(\"RGB\")\n", - "\n", - "\n", - "def display_image(image):\n", - " _ = plt.figure(figsize=(20, 15))\n", - " plt.grid(False)\n", - " plt.imshow(image)\n", - "\n", - "\n", - "def 
get_prediction_instances(test_filepath, new_width=-1):\n", - " if new_width <= 0:\n", - " with tf.io.gfile.GFile(test_filepath, \"rb\") as input_file:\n", - " encoded_string = base64.b64encode(input_file.read()).decode(\"utf-8\")\n", - " else:\n", - " img = load_img(test_filepath)\n", - " width, height = img.size\n", - " print(\"original input image size: \", width, \" , \", height)\n", - " new_height = int(height * new_width / width)\n", - " new_img = img.resize((new_width, new_height))\n", - " print(\"resized input image size: \", new_width, \" , \", new_height)\n", - " buffered = BytesIO()\n", - " new_img.save(buffered, format=\"JPEG\")\n", - " encoded_string = base64.b64encode(buffered.getvalue()).decode(\"utf-8\")\n", - "\n", - " instances = [\n", - " {\n", - " \"encoded_image\": {\"b64\": encoded_string},\n", - " }\n", - " ]\n", - " return instances\n", - "\n", - "\n", - "def get_label_map(label_map_yaml_filepath):\n", - " with tf.io.gfile.GFile(label_map_yaml_filepath, \"rb\") as input_file:\n", - " label_map = yaml.safe_load(input_file.read())\n", - " return label_map\n", - "\n", - "\n", - "def get_best_trial(model_dir, max_trial_count, evaluation_metric):\n", - " best_trial_dir = \"\"\n", - " best_trial_evaluation_results = {}\n", - " best_performance = -1\n", - "\n", - " for i in range(max_trial_count):\n", - " current_trial = i + 1\n", - " current_trial_dir = os.path.join(model_dir, \"trial_\" + str(current_trial))\n", - " current_trial_best_ckpt_dir = os.path.join(current_trial_dir, \"best_ckpt\")\n", - " current_trial_best_ckpt_evaluation_filepath = os.path.join(\n", - " current_trial_best_ckpt_dir, \"info.json\"\n", - " )\n", - " with tf.io.gfile.GFile(current_trial_best_ckpt_evaluation_filepath, \"rb\") as f:\n", - " eval_metric_results = json.load(f)\n", - " current_performance = eval_metric_results[evaluation_metric]\n", - " if current_performance > best_performance:\n", - " best_performance = current_performance\n", - " best_trial_dir = current_trial_dir\n", - " best_trial_evaluation_results = eval_metric_results\n", - " return best_trial_dir, best_trial_evaluation_results\n", - "\n", - "\n", - "def upload_checkpoint_to_gcs(checkpoint_url):\n", - " filename = os.path.basename(checkpoint_url)\n", - " checkpoint_name = filename.replace(\".tar.gz\", \"\")\n", - " print(\"Download checkpoint from\", checkpoint_url, \"and store to\", CHECKPOINT_BUCKET)\n", - " ! wget $checkpoint_url -O $filename\n", - " ! mkdir -p $checkpoint_name\n", - " ! tar -xvzf $filename -C $checkpoint_name\n", - "\n", - " # Search for relative path to the checkpoint.\n", - " checkpoint_path = None\n", - " for root, dirs, files in os.walk(checkpoint_name):\n", - " for file in files:\n", - " if file.endswith(\".index\"):\n", - " checkpoint_path = os.path.join(root, os.path.splitext(file)[0])\n", - " checkpoint_path = os.path.relpath(checkpoint_path, checkpoint_name)\n", - " break\n", - "\n", - " ! gsutil cp -r $checkpoint_name $CHECKPOINT_BUCKET/\n", - " checkpoint_uri = os.path.join(CHECKPOINT_BUCKET, checkpoint_name, checkpoint_path)\n", - " print(\"Checkpoint uploaded to\", checkpoint_uri)\n", - " return checkpoint_uri" + "EVALUATION_METRIC = \"accuracy\"" ] }, { @@ -364,11 +238,11 @@ "id": "b2356e904526" }, "source": [ - "## Train new models\n", + "## Training\n", "\n", "This section trains model with the following steps:\n", - "1. Convert input data to training formats.\n", - "2. Create hyperparameter tuning jobs to train new models.\n", + "1. 
Prepare data by converting the input data into the training format.\n", + "2. Run hyperparameter tuning jobs to train new models.\n", "3. Find and export best models." ] }, @@ -397,7 +271,12 @@ "\n", "from google.cloud.aiplatform import hyperparameter_tuning as hpt\n", "\n", - "data_converter_job_name = get_job_name_with_datetime(\n", + "# Data converter constants.\n", + "DATA_CONVERTER_JOB_PREFIX = \"data_converter\"\n", + "DATA_CONVERTER_CONTAINER = f\"{REGION_PREFIX}-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/data-converter\"\n", + "DATA_CONVERTER_MACHINE_TYPE = \"n1-highmem-8\"\n", + "\n", + "data_converter_job_name = common_util.get_job_name_with_datetime(\n", " DATA_CONVERTER_JOB_PREFIX + \"_\" + OBJECTIVE\n", ")\n", "\n", @@ -470,12 +349,21 @@ "# input_train_data_path = ''\n", "# input_validation_data_path = ''\n", "\n", + "# Training constants.\n", + "TRAINING_JOB_PREFIX = \"train\"\n", + "TRAIN_CONTAINER_URI = f\"{REGION_PREFIX}-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/tfvision-oss\"\n", + "TRAIN_MACHINE_TYPE = \"g2-standard-4\"\n", + "TRAIN_ACCELERATOR_TYPE = \"NVIDIA_L4\"\n", + "TRAIN_NUM_GPU = 1\n", + "\n", "experiment = \"Efficientnetv2-m\" # @param [\"Efficientnetv2-m\",\"ViT-ti16\",\"ViT-s16\",\"ViT-b16\",\"ViT-l16\", \"MaxViT\"]\n", "\n", - "train_job_name = get_job_name_with_datetime(TRAINING_JOB_PREFIX + \"_\" + OBJECTIVE)\n", + "train_job_name = common_util.get_job_name_with_datetime(\n", + " TRAINING_JOB_PREFIX + \"_\" + OBJECTIVE\n", + ")\n", "model_dir = os.path.join(BUCKET_URI, train_job_name)\n", "\n", "# The arguments here are mainly for test purposes. Please update them\n", "# to get better performances.\n", "common_args = {\n", " \"input_train_data_path\": input_train_data_path,\n", @@ -543,6 +431,30 @@ "}\n", "experiment_container_args = experiment_container_args_dict[experiment]\n", "\n", + "\n", + "def upload_checkpoint_to_gcs(checkpoint_url):\n", + " filename = os.path.basename(checkpoint_url)\n", + " checkpoint_name = filename.replace(\".tar.gz\", \"\")\n", + " print(\"Download checkpoint from\", checkpoint_url, \"and store to\", CHECKPOINT_BUCKET)\n", + " ! wget $checkpoint_url -O $filename\n", + " ! mkdir -p $checkpoint_name\n", + " ! tar -xvzf $filename -C $checkpoint_name\n", + "\n", + " # Search for relative path to the checkpoint.\n", + " checkpoint_path = None\n", + " for root, dirs, files in os.walk(checkpoint_name):\n", + " for file in files:\n", + " if file.endswith(\".index\"):\n", + " checkpoint_path = os.path.join(root, os.path.splitext(file)[0])\n", + " checkpoint_path = os.path.relpath(checkpoint_path, checkpoint_name)\n", + " break\n", + "\n", + " ! 
gsutil cp -r $checkpoint_name $CHECKPOINT_BUCKET/\n", + " checkpoint_uri = os.path.join(CHECKPOINT_BUCKET, checkpoint_name, checkpoint_path)\n", + " print(\"Checkpoint uploaded to\", checkpoint_uri)\n", + " return checkpoint_uri\n", + "\n", + "\n", "# Copy checkpoint to GCS bucket if specified.\n", "init_checkpoint = experiment_container_args.get(\"init_checkpoint\")\n", "if init_checkpoint:\n", @@ -586,6 +498,16 @@ "\n", "print(worker_pool_specs, metric_spec, parameter_spec)\n", "\n", + "# Check quota.\n", + "common_util.check_quota(\n", + " project_id=PROJECT_ID,\n", + " region=REGION,\n", + " accelerator_type=TRAIN_ACCELERATOR_TYPE,\n", + " accelerator_count=1,\n", + " is_for_training=True,\n", + ")\n", + "\n", + "\n", "# Run the hyperparameter job.\n", "train_custom_job = aiplatform.CustomJob(\n", " display_name=train_job_name,\n", @@ -622,12 +544,41 @@ "source": [ "# @title Export best models as TF Saved Model format\n", "\n", - "# @markdown This section exports best models.\n", - "\n", - "# @markdown The exported model can be used in the next section \"Test trained models\" for online prediction.\n", + "# @markdown This section exports the best model.\n", "\n", "# Export models from TF checkpoints to TF saved model format.\n", "# model_dir is from the section above.\n", + "\n", + "# Export constants.\n", + "EXPORT_JOB_PREFIX = \"export\"\n", + "EXPORT_CONTAINER_URI = f\"{REGION_PREFIX}-docker.pkg.dev/vertex-ai-restricted/vertex-vision-model-garden-dockers/tfvision-model-export\"\n", + "EXPORT_MACHINE_TYPE = \"n1-highmem-8\"\n", + "\n", + "\n", + "def get_best_trial(model_dir, max_trial_count, evaluation_metric):\n", + " best_trial_dir = \"\"\n", + " best_trial_evaluation_results = {}\n", + " best_performance = -1\n", + "\n", + " for i in range(max_trial_count):\n", + " current_trial = i + 1\n", + " current_trial_dir = os.path.join(model_dir, \"trial_\" + str(current_trial))\n", + " current_trial_best_ckpt_dir = os.path.join(current_trial_dir, \"best_ckpt\")\n", + " current_trial_best_ckpt_evaluation_filepath = os.path.join(\n", + " current_trial_best_ckpt_dir, \"info.json\"\n", + " )\n", + " ! gsutil cp $current_trial_best_ckpt_evaluation_filepath .\n", + " with open(\"info.json\", \"r\") as f:\n", + " eval_metric_results = json.load(f)\n", + " current_performance = eval_metric_results[evaluation_metric]\n", + " if current_performance > best_performance:\n", + " best_performance = current_performance\n", + " best_trial_dir = current_trial_dir\n", + " best_trial_evaluation_results = eval_metric_results\n", + " print(\"best_trial_dir: \", current_trial_best_ckpt_evaluation_filepath)\n", + " return best_trial_dir, best_trial_evaluation_results\n", + "\n", + "\n", "best_trial_dir, best_trial_evaluation_results = get_best_trial(\n", " model_dir, MAX_TRIAL_COUNT, EVALUATION_METRIC\n", ")\n", @@ -655,7 +606,9 @@ " }\n", "]\n", "\n", - "model_export_name = get_job_name_with_datetime(EXPORT_JOB_PREFIX + \"_\" + OBJECTIVE)\n", + "model_export_name = common_util.get_job_name_with_datetime(\n", + " EXPORT_JOB_PREFIX + \"_\" + OBJECTIVE\n", + ")\n", "model_export_custom_job = aiplatform.CustomJob(\n", " display_name=model_export_name,\n", " project=PROJECT_ID,\n", @@ -674,7 +627,7 @@ "id": "c68112dc90b9" }, "source": [ - "## Test trained models" + "## Deployment" ] }, { @@ -690,15 +643,24 @@ "\n", "# @markdown This section uploads and deploy models to model registry for online prediction. 
This example uses the exported best model from \"Train new models\" section.\n", "\n", + "PREDICTION_CONTAINER_URI = f\"{REGION_PREFIX}-docker.pkg.dev/vertex-ai-restricted/prediction/tf_opt-gpu.2-11:latest\"\n", + "SERVING_CONTAINER_ARGS = [\"--allow_precompilation\", \"--allow_compression\"]\n", + "PREDICTION_ACCELERATOR_TYPE = \"NVIDIA_L4\"\n", + "PREDICTION_MACHINE_TYPE = \"g2-standard-12\"\n", + "UPLOAD_JOB_PREFIX = \"upload\"\n", + "DEPLOY_JOB_PREFIX = \"deploy\"\n", + "\n", "trained_model_dir = os.path.join(model_dir, \"best_model/saved_model\")\n", - "upload_job_name = get_job_name_with_datetime(UPLOAD_JOB_PREFIX + \"_\" + OBJECTIVE)\n", + "upload_job_name = common_util.get_job_name_with_datetime(\n", + " UPLOAD_JOB_PREFIX + \"_\" + OBJECTIVE\n", + ")\n", "\n", "serving_env = {\n", " \"MODEL_ID\": \"tensorflow-hub-efficientnetv2\",\n", " \"DEPLOY_SOURCE\": \"notebook\",\n", "}\n", "\n", - "model = aiplatform.Model.upload(\n", + "models[\"model_icn\"] = aiplatform.Model.upload(\n", " display_name=upload_job_name,\n", " artifact_uri=trained_model_dir,\n", " serving_container_image_uri=PREDICTION_CONTAINER_URI,\n", @@ -706,14 +668,24 @@ " serving_container_environment_variables=serving_env,\n", ")\n", "\n", - "model.wait()\n", + "models[\"model_icn\"].wait()\n", "\n", "print(\"The uploaded model name is: \", upload_job_name)\n", "\n", - "deploy_model_name = get_job_name_with_datetime(DEPLOY_JOB_PREFIX + \"_\" + OBJECTIVE)\n", + "deploy_model_name = common_util.get_job_name_with_datetime(\n", + " DEPLOY_JOB_PREFIX + \"_\" + OBJECTIVE\n", + ")\n", "print(\"The deployed job name is: \", deploy_model_name)\n", "\n", - "endpoint = model.deploy(\n", + "common_util.check_quota(\n", + " project_id=PROJECT_ID,\n", + " region=REGION,\n", + " accelerator_type=PREDICTION_ACCELERATOR_TYPE,\n", + " accelerator_count=1,\n", + " is_for_training=False,\n", + ")\n", + "\n", + "endpoints[\"endpoint_icn\"] = models[\"model_icn\"].deploy(\n", " deployed_model_display_name=deploy_model_name,\n", " machine_type=PREDICTION_MACHINE_TYPE,\n", " traffic_split={\"0\": 100},\n", @@ -723,10 +695,19 @@ " max_replica_count=1,\n", ")\n", "\n", - "endpoint_id = endpoint.name\n", + "endpoint_id = endpoints[\"endpoint_icn\"].name\n", "print(\"endpoint id is: \", endpoint_id)" ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "1ULa2VTQqWfo" + }, + "source": [ + "## Predict" + ] + }, { "cell_type": "code", "execution_count": null, @@ -743,14 +724,94 @@ "# @markdown `test_filepath`: gcs uri to the test image file. 
The uri should start with \"gs://\".\n", "\n", "# endpoint_id was generated in the section above (`Upload and deploy models`).\n", - "endpoint_id = endpoint.name\n", + "endpoint_id = endpoints[\"endpoint_icn\"].name\n", "\n", "test_filepath = \"gs://cloud-samples-data/ai-platform/flowers/roses/9423755543_edb35141a3_n.jpg\" # @param {type:\"string\"} {isTemplate:true}\n", + "\n", + "\n", + "def get_label_map(label_map_yaml_filepath: str) -> Dict[int, str]:\n", + " \"\"\"Returns class id to label mapping given a filepath to the label map.\n", + "\n", + " Args:\n", + " label_map_yaml_filepath: A string of label map yaml file path.\n", + "\n", + " Returns:\n", + " A dictionary of class id to label mapping.\n", + " \"\"\"\n", + " label_map_filename = os.path.basename(label_map_yaml_filepath)\n", + " subprocess.check_output(\n", + " [\"gsutil\", \"cp\", label_map_yaml_filepath, label_map_filename],\n", + " stderr=subprocess.STDOUT,\n", + " )\n", + " with open(label_map_filename, \"rb\") as input_file:\n", + " label_map = yaml.safe_load(input_file.read())[\"label_map\"]\n", + " return label_map\n", + "\n", + "\n", + "def get_prediction_instances(test_filepath: str, new_width: int = -1) -> Any:\n", + " \"\"\"Generate instance from image path to pass to Vertex AI Endpoint for prediction.\n", + "\n", + " Args:\n", + " test_filepath: A string of test image path.\n", + " new_width: An integer of new image width.\n", + "\n", + " Returns:\n", + " A list of instances.\n", + " \"\"\"\n", + " if new_width <= 0:\n", + " test_file = os.path.basename(test_filepath)\n", + " subprocess.check_output(\n", + " [\"gsutil\", \"cp\", test_filepath, test_file], stderr=subprocess.STDOUT\n", + " )\n", + " with open(test_file, \"rb\") as input_file:\n", + " encoded_string = base64.b64encode(input_file.read()).decode(\"utf-8\")\n", + " else:\n", + " img = common_util.load_img(test_filepath)\n", + " width, height = img.size\n", + " print(\"original input image size: \", width, \" , \", height)\n", + " new_height = int(height * new_width / width)\n", + " new_img = img.resize((new_width, new_height))\n", + " print(\"resized input image size: \", new_width, \" , \", new_height)\n", + " buffered = io.BytesIO()\n", + " new_img.save(buffered, format=\"JPEG\")\n", + " encoded_string = base64.b64encode(buffered.getvalue()).decode(\"utf-8\")\n", + "\n", + " instances = [\n", + " {\n", + " \"encoded_image\": {\"b64\": encoded_string},\n", + " }\n", + " ]\n", + " return instances\n", + "\n", + "\n", "# If the input image is too large, we will resize it for prediction.\n", "instances = get_prediction_instances(test_filepath, new_width=1000)\n", "\n", "# The label map file was generated from the section above (`Convert input data for training`).\n", - "label_map = get_label_map(label_map_path)[\"label_map\"]\n", + "label_map = get_label_map(label_map_path)\n", + "\n", + "\n", + "def predict_custom_trained_model(\n", + " project: str,\n", + " endpoint_id: str,\n", + " instances: Union[Dict, List[Dict]],\n", + " location: str = \"us-central1\",\n", + "):\n", + " # The AI Platform services require regional API endpoints.\n", + " client_options = {\"api_endpoint\": f\"{location}-aiplatform.googleapis.com\"}\n", + " # Initialize client that will be used to create and send requests.\n", + " # This client only needs to be created once, and can be reused for multiple requests.\n", + " client = aiplatform.gapic.PredictionServiceClient(client_options=client_options)\n", + " parameters_dict = {}\n", + " parameters = 
json_format.ParseDict(parameters_dict, Value())\n", + " endpoint = client.endpoint_path(\n", + " project=project, location=location, endpoint=endpoint_id\n", + " )\n", + " response = client.predict(\n", + " endpoint=endpoint, instances=instances, parameters=parameters\n", + " )\n", + " return response.predictions, response.deployed_model_id\n", + "\n", "\n", "predictions, _ = predict_custom_trained_model(\n", " project=PROJECT_ID, location=REGION, endpoint_id=endpoint_id, instances=instances\n", @@ -761,8 +822,8 @@ "max_index = probs.index(max_prob)\n", "print(\"The test image: \", test_filepath)\n", "print(\"max_prob: \", max_prob, \", for label: \", label_map[max_index])\n", - "img = load_img(test_filepath)\n", - "display_image(img)" + "img = common_util.load_img(test_filepath)\n", + "common_util.display_image(img)" ] }, { @@ -771,7 +832,7 @@ "id": "f72e754f2802" }, "source": [ - "## Clean up" + "## Clean up resources" ] }, { @@ -783,20 +844,7 @@ }, "outputs": [], "source": [ - "# @title Clean up resources\n", - "\n", - "# @markdown Delete the experiment models and endpoints to recycle the resources\n", - "# @markdown and avoid unnecessary continouous charges that may incur.\n", - "\n", - "try:\n", - " # Undeploy model and delete endpoint.\n", - " endpoint.delete(force=True)\n", - "\n", - " # Delete model.\n", - " model.delete()\n", - "\n", - "except Exception as e:\n", - " print(e)\n", + "# @title Clean up training jobs, models, endpoints and buckets\n", "\n", "try:\n", " # Delete custom and hpt jobs.\n", @@ -811,7 +859,17 @@ "except Exception as e:\n", " print(e)\n", "\n", - "# Delete bucket.\n", + "# @markdown Delete the experiment models and endpoints to recycle the resources\n", + "# @markdown and avoid unnecessary continuous charges that may incur.\n", + "\n", + "# Undeploy model and delete endpoint.\n", + "for endpoint in endpoints.values():\n", + " endpoint.delete(force=True)\n", + "\n", + "# Delete models.\n", + "for model in models.values():\n", + " model.delete()\n", + "\n", "delete_bucket = False # @param {type:\"boolean\"}\n", "if delete_bucket:\n", " ! gsutil -m rm -r $BUCKET_NAME"
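
For readers following the prediction cells in this patch, the same request can also be issued through the high-level `aiplatform.Endpoint` wrapper instead of the low-level gapic `PredictionServiceClient`. The sketch below is illustrative only; the project, region, endpoint ID, label map, and image path are placeholders for the values produced by the notebook's earlier cells (`PROJECT_ID`, `REGION`, `endpoints["endpoint_icn"].name`, `get_label_map(label_map_path)`).

```python
# Minimal sketch: query the deployed image-classification endpoint with the
# high-level Vertex AI SDK. All constants below are placeholders for values
# produced by the notebook cells above.
import base64

from google.cloud import aiplatform

PROJECT_ID = "my-project"             # placeholder: your GCP project id
REGION = "us-central1"                # placeholder: deployment region
ENDPOINT_ID = "1234567890"            # placeholder: endpoints["endpoint_icn"].name
LABEL_MAP = {0: "daisy", 1: "roses"}  # placeholder: output of get_label_map(...)

aiplatform.init(project=PROJECT_ID, location=REGION)
endpoint = aiplatform.Endpoint(ENDPOINT_ID)

# Encode a local JPEG the same way the notebook's get_prediction_instances does.
with open("test_image.jpg", "rb") as f:  # placeholder local test image
    encoded_image = base64.b64encode(f.read()).decode("utf-8")

response = endpoint.predict(instances=[{"encoded_image": {"b64": encoded_image}}])

probs = list(response.predictions[0])
best_index = max(range(len(probs)), key=probs.__getitem__)
print("Predicted label:", LABEL_MAP[best_index], "probability:", probs[best_index])
```

The instance payload matches the `{"encoded_image": {"b64": ...}}` format used by the notebook's own prediction helper, so either client can be used against the same deployed model.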