diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 2ddfe8ccb932..afcaefe2e963 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,11 +37,11 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["tensorflow"] # By default we build both training and inference containers. Set true/false values to determine which to build. -build_training = true +build_training = false build_inference = true # Set do_build to "false" to skip builds and test the latest image built by this PR @@ -155,7 +155,7 @@ dlc-pr-tensorflow-2-habana-training = "" # Standard Framework Inference dlc-pr-pytorch-inference = "" -dlc-pr-tensorflow-2-inference = "" +dlc-pr-tensorflow-2-inference = "tensorflow/inference/buildspec-2-20-sm.yml" dlc-pr-autogluon-inference = "" # Graviton Inference diff --git a/tensorflow/inference/RUNBOOK.md b/tensorflow/inference/RUNBOOK.md new file mode 100644 index 000000000000..dbd302bd4682 --- /dev/null +++ b/tensorflow/inference/RUNBOOK.md @@ -0,0 +1,383 @@ +# TensorFlow Inference Image Build Runbook + +This runbook documents the process for creating new TensorFlow Inference Docker images for AWS Deep Learning Containers. It captures lessons learned and common issues encountered during builds. + +## Table of Contents +1. [Using This Runbook with AI Assistance](#using-this-runbook-with-ai-assistance) +2. [Prerequisites](#prerequisites) +3. [File Structure](#file-structure) +4. [Step-by-Step Build Process](#step-by-step-build-process) +5. [Common Issues and Solutions](#common-issues-and-solutions) +6. [Testing](#testing) +7. [Checklist](#checklist) + +--- + +## Using This Runbook with AI Assistance + +This runbook is designed to be used with AI coding assistants (like Cline). Simply reference this runbook in your prompt, and the AI will follow the documented process. + +### Quick Start Prompt + +For a new TensorFlow inference image, use this prompt format: + +``` +Create TensorFlow X.Y inference images following the runbook at `tensorflow/inference/RUNBOOK.md` +``` + +### Recommended Prompt Format + +For more control, include these details: + +``` +Create TensorFlow X.Y inference SageMaker images (CPU and GPU with CUDA Z.Z) +following `tensorflow/inference/RUNBOOK.md`. +Base it on the TF 2.20 images. +``` + +### Example Prompts + +**Basic:** +> "Create TensorFlow 2.21 inference images following the runbook at `tensorflow/inference/RUNBOOK.md`" + +**With CUDA version:** +> "Create TensorFlow 2.21 inference images with CUDA 12.6 following `tensorflow/inference/RUNBOOK.md`" + +**With Python version:** +> "Create TensorFlow 2.21 inference images with Python 3.13 following `tensorflow/inference/RUNBOOK.md`" + +**Full specification:** +> "Create TensorFlow 2.21 inference SageMaker images (CPU and GPU with CUDA 12.6, Python 3.13) following `tensorflow/inference/RUNBOOK.md`. Base it on the TF 2.20 images we created." + +### What the AI Will Do + +When you provide a prompt referencing this runbook, the AI will: + +1. **Read this runbook** to understand the process +2. **Check availability** of TF Serving, TF Serving API, and license files +3. 
**Create all necessary files**: + - Buildspec YAML (with proper version quoting) + - CPU Dockerfile + - GPU Dockerfile +4. **Apply CVE fixes** proactively (wheel, setuptools, nvjpeg) +5. **Update test allowlists** if TF Serving version differs from TF version +6. **Format Python files** with black +7. **Run through the checklist** before completion + +### Tips for Efficient Prompts + +- Always include the target TensorFlow version (e.g., "2.21") +- Specify CUDA version for GPU images if different from default +- Mention if you want to base it on an existing version +- Include any special requirements upfront + +--- + +## Prerequisites + +Before starting a new TensorFlow inference image build: + +1. **Check TensorFlow Serving availability**: Visit [TensorFlow Serving releases](https://github.com/tensorflow/serving/releases) to verify if TF Serving for your target version exists + - If TF Serving X.Y.Z doesn't exist, use the latest available version (usually X.Y-1.Z) + +2. **Check TensorFlow Serving API availability**: Verify on PyPI + - CPU: `tensorflow-serving-api==X.Y.Z` + - GPU: `tensorflow-serving-api-gpu==X.Y.Z` + +3. **Check license file availability**: Verify `s3://aws-dlc-licenses/tensorflow-X.Y/license.txt` exists + - If not, use the previous version's license file + +4. **Identify CUDA/cuDNN versions**: For GPU images, check TensorFlow's tested configurations + +--- + +## File Structure + +For a new TensorFlow inference version (e.g., 2.20), create: + +``` +tensorflow/inference/ +├── buildspec-2-20-sm.yml # SageMaker buildspec +├── buildspec-2-20-ec2.yml # EC2 buildspec (if needed) +├── docker/ +│ └── 2.20/ +│ └── py3/ +│ ├── Dockerfile.cpu # CPU Dockerfile +│ └── cu125/ # CUDA version directory +│ └── Dockerfile.gpu # GPU Dockerfile +└── sagemaker/ # SageMaker serving code (copied to image) +``` + +--- + +## Step-by-Step Build Process + +### Step 1: Create Buildspec File + +Create `tensorflow/inference/buildspec-X-Y-sm.yml`: + +```yaml +account_id: &ACCOUNT_ID <+ACCOUNT_ID+> +region: ®ION <+REGION+> +framework: &FRAMEWORK tensorflow +version: &VERSION "X.Y.0" # IMPORTANT: Quote version numbers like "2.20" to prevent YAML parsing issues +short_version: &SHORT_VERSION "X.Y" # IMPORTANT: Must be quoted! 
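+# Why the quoting matters: unquoted, YAML reads 2.20 as the float 2.2, and the
+# docker_file !join below would then resolve to docker/2.2/py3/... instead of
+# docker/2.20/py3/... (see Common Issues #1)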
+ +repository_info: + inference_repository: &INFERENCE_REPOSITORY + image_type: &INFERENCE_IMAGE_TYPE inference + root: !join [ *FRAMEWORK, "/", *INFERENCE_IMAGE_TYPE ] + repository_name: &REPOSITORY_NAME !join [pr, "-", *FRAMEWORK, "-", *INFERENCE_IMAGE_TYPE] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + +context: + inference_context: &INFERENCE_CONTEXT + sagemaker: + source: docker/X.Y/py3/sagemaker + target: sagemaker + bash_telemetry: # Required for telemetry tests + source: ../../miscellaneous_scripts/bash_telemetry.sh + target: bash_telemetry.sh + +images: + BuildTFInferenceCPUPy3DockerImage: + <<: *INFERENCE_REPOSITORY + build: &TENSORFLOW_CPU_INFERENCE_PY3 false + image_size_baseline: 4000 + device_type: &DEVICE_TYPE cpu + python_version: &DOCKER_PYTHON_VERSION py312 + tag_python_version: &TAG_PYTHON_VERSION py312 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] + docker_file: !join [ docker/, *SHORT_VERSION, /py3/Dockerfile.cpu ] + target: sagemaker + context: + <<: *INFERENCE_CONTEXT + + # Add GPU image configuration similarly +``` + +**Key Points:** +- Always quote version numbers like `"2.20"` to prevent YAML float parsing +- Include `bash_telemetry.sh` in context for telemetry tests +- Set `build: false` initially for testing + +### Step 2: Create CPU Dockerfile + +Create `tensorflow/inference/docker/X.Y/py3/Dockerfile.cpu`: + +```dockerfile +# Note: Using TF Serving X.Y-1.0 as TF Serving X.Y.0 is not yet released (if applicable) +FROM tensorflow/serving:X.Y.0-devel as build_image +# OR if TF Serving doesn't exist: +FROM tensorflow/serving:X.Y-1.0-devel as build_image + +FROM ubuntu:22.04 AS base_image +# ... base configuration ... + +FROM base_image AS ec2 +# ... EC2 stage ... + +# Key sections to include: + +# 1. Python installation from source +RUN wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \ + && tar -xvf Python-${PYTHON_VERSION}.tgz \ + && cd Python-${PYTHON_VERSION} \ + && ./configure && make && make install \ + && rm -rf ../Python-${PYTHON_VERSION}* + +# 2. Pip/setuptools upgrade with CVE fixes +RUN ${PIP} --no-cache-dir install --upgrade \ + pip \ + "setuptools>=75.8.2" \ + "wheel>=0.46.2" \ + && rm -rf /usr/local/lib/python*/site-packages/setuptools/_vendor/wheel* + +# 3. TF Serving API installation (use matching TF Serving version) +RUN ${PIP} install --no-dependencies --no-cache-dir \ + tensorflow-serving-api=="X.Y.0" + +# 4. Copy TF model server binary +COPY --from=build_image /usr/local/bin/tensorflow_model_server /usr/bin/tensorflow_model_server + +# 5. Entrypoint script (IMPORTANT: use >> for appending) +RUN echo '#!/bin/bash \n\n' > /usr/bin/tf_serving_entrypoint.sh \ + && echo 'bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true' >> /usr/bin/tf_serving_entrypoint.sh \ + && echo '/usr/bin/tensorflow_model_server --port=8500 --rest_api_port=8501 --model_name=${MODEL_NAME} --model_base_path=${MODEL_BASE_PATH}/${MODEL_NAME} "$@"' >> /usr/bin/tf_serving_entrypoint.sh \ + && chmod +x /usr/bin/tf_serving_entrypoint.sh + +# 6. License file (use previous version if current not available) +RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-X.Y/license.txt -o /license.txt +# OR if not available: +RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-X.Y-1/license.txt -o /license.txt + +# SageMaker stage +FROM ec2 AS sagemaker +# ... 
SageMaker-specific configuration ...
+```
+
+### Step 3: Create GPU Dockerfile
+
+Create `tensorflow/inference/docker/X.Y/py3/cu125/Dockerfile.gpu`:
+
+Key differences from the CPU Dockerfile:
+- Base image: `nvidia/cuda:12.5.0-base-ubuntu22.04`
+- Build image: `tensorflow/serving:X.Y.0-devel-gpu`
+- Install CUDA toolkit and cuDNN packages
+- Use `tensorflow-serving-api-gpu` instead of `tensorflow-serving-api`
+- Add CUDA compat scripts for SageMaker
+
+**cuDNN Package Naming:**
+```dockerfile
+# For CUDA 12.x with cuDNN 9.x:
+libcudnn9-cuda-12=${CUDNN_VERSION}
+libcudnn9-dev-cuda-12=${CUDNN_VERSION}
+```
+
+### Step 4: Update Test Allowlists (if needed)
+
+If the TF Serving version differs from the TF version, update `test/dlc_tests/sanity/test_pre_release.py`:
+
+```python
+# In test_tf_serving_version_cpu and _test_framework_and_cuda_version:
+# TF Serving X.Y is not yet released, so TF X.Y images use TF Serving X.Y-1
+expected_serving_version = tag_framework_version
+if Version(tag_framework_version) >= Version("X.Y.0") and Version(tag_framework_version) < Version("X.Y+1.0"):
+    expected_serving_version = "X.Y-1"
+
+# In test_license_file:
+# TF X.Y license is not yet available in S3, use the TF X.Y-1 license
+if framework == "tensorflow" and short_version == "X.Y":
+    short_version = "X.Y-1"
+```
+
+### Step 5: Configure Build
+
+Update `dlc_developer_config.toml`:
+
+```toml
+[build]
+build_frameworks = ["tensorflow"]
+build_training = false
+build_inference = true
+
+[buildspec_override]
+dlc-pr-tensorflow-2-inference = "tensorflow/inference/buildspec-X-Y-sm.yml"
+```
+
+---
+
+## Common Issues and Solutions
+
+### 1. YAML Parsing Error
+**Cause:** Version number `2.20` is parsed as the float `2.2`
+**Solution:** Quote version numbers: `"2.20"`
+
+### 2. Build Error: `COPY failed: file not found`
+**Cause:** Missing context files in the buildspec
+**Solution:** Add the required files to the buildspec context section (e.g., `bash_telemetry.sh`)
+
+### 3. TF Serving Image Not Found
+**Cause:** TF Serving version not yet released
+**Solution:** Use the most recent earlier TF Serving version that is compatible
+
+### 4. cuDNN Package Not Found
+**Cause:** Package naming changed between CUDA versions
+**Solution:** Use the correct naming: `libcudnn9-cuda-12` instead of `libcudnn9-cuda-12-5`
+
+### 5. Exec Format Error
+**Cause:** Shell script missing its shebang because `>` overwrote the file instead of `>>` appending to it
+**Solution:** Use `>>` for all lines after the first:
+```dockerfile
+# Only the first echo uses > (create); every later echo must use >> (append)
+RUN echo '#!/bin/bash \n\n' > /script.sh \
+    && echo 'line2' >> /script.sh \
+    && echo 'line3' >> /script.sh
+```
+
+### 6. License File Not Found (404)
+**Cause:** License file for the new version not yet uploaded to S3
+**Solution:** Use the previous version's license file temporarily
+
+### 7. CVE in Vendored wheel (setuptools)
+**Cause:** setuptools bundles an old wheel version
+**Solution:** Remove the vendored wheel after install:
+```dockerfile
+RUN rm -rf /usr/local/lib/python*/site-packages/setuptools/_vendor/wheel*
+```
+
+### 8. nvjpeg CVE
+**Cause:** Vulnerable nvjpeg version in the base CUDA image
+**Solution:** Patch nvjpeg with the latest available version (see Dockerfile.gpu for the full patch, which also removes the old libraries first):
+```dockerfile
+RUN wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
+    && tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
+    && cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/targets/x86_64-linux/lib/ \
+    && cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/targets/x86_64-linux/include/
+```
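+To spot-check fixes 5 and 8 in a built image (`tf-inf-gpu-test` is a placeholder tag), something like:
+
+```bash
+# Issue 5: the entrypoint's first line must be the shebang
+docker run --rm tf-inf-gpu-test head -1 /usr/bin/tf_serving_entrypoint.sh
+# Issue 8: only the patched nvjpeg libraries should remain under the CUDA lib dir
+docker run --rm tf-inf-gpu-test ls /usr/local/cuda/targets/x86_64-linux/lib/ | grep nvjpeg
+```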
+### 9. Black Formatting Failures
+**Cause:** Python files not formatted with the project's black config
+**Solution:** Format with `black --verbose -l 100 <path>` or `pip install black==24.8.0 && black <path>`
+
+---
+
+## Testing
+
+### Local Testing
+
+1. Build the image locally (note: the Dockerfile COPYs files that the buildspec normally stages into the build context, such as `bash_telemetry.sh` and `sagemaker/`, so stage those at the context root first):
+```bash
+docker build -t tf-inf-test -f tensorflow/inference/docker/X.Y/py3/Dockerfile.cpu --target sagemaker .
+```
+
+2. Run basic checks:
+```bash
+docker run -it tf-inf-test tensorflow_model_server --version
+docker run -it tf-inf-test python -c "import tensorflow_serving"
+```
+
+### CI/CD Testing
+
+Tests that run automatically include:
+- `test_tf_serving_version_cpu`: Verifies the TF Serving version
+- `test_tf_serving_api_version`: Verifies the TF Serving API version
+- `test_license_file`: Verifies the license file matches S3
+- `test_ecr_enhanced_scan`: Security vulnerability scanning
+- `test_ec2_tensorflow_inference_telemetry`: Telemetry functionality
+
+---
+
+## Checklist
+
+Before submitting the PR:
+
+- [ ] Version numbers quoted in YAML (`"2.20"`, not `2.20`)
+- [ ] `bash_telemetry.sh` added to the buildspec context
+- [ ] TF Serving version exists (or a compatible earlier version is used)
+- [ ] TF Serving API version matches the TF Serving version
+- [ ] cuDNN package names correct for the CUDA version
+- [ ] Entrypoint script uses `>>` for appending (not `>`)
+- [ ] License URL points to an existing file in S3
+- [ ] CVE fixes applied (wheel >= 0.46.2, setuptools >= 75.8.2, vendored wheel removed)
+- [ ] nvjpeg patched if needed
+- [ ] Test allowlists updated for any TF Serving version mismatch
+- [ ] Python files formatted with `black --verbose -l 100`
+- [ ] Build tested locally
+
+---
+
+## Future Automation Opportunities
+
+1. **Version Availability Check Script**: Auto-check TF Serving, TF Serving API, and license file availability (see the sketch after this list)
+2. **Template Generation**: Auto-generate Dockerfiles from templates with version substitution
+3. **CVE Auto-patching**: Script to automatically apply known CVE fixes
+4. **Buildspec Generator**: Generate buildspec from a template with version parameters
+5. **Test Updater**: Auto-update test allowlists when using a different TF Serving version
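+A minimal sketch of the first idea, assuming `requests` is available and that the
+endpoints mirror the Prerequisites section (GitHub releases, PyPI, and the public
+license bucket). The release-tag and URL formats are assumptions to verify:
+
+```python
+import sys
+
+import requests
+
+
+def check_tf_artifacts(version: str) -> dict:
+    """Check availability of TF Serving, the TF Serving APIs, and the DLC license file."""
+    results = {}
+    # TF Serving release tag on GitHub (tags assumed to match the TF version, e.g. "2.20.0")
+    resp = requests.get(f"https://api.github.com/repos/tensorflow/serving/releases/tags/{version}")
+    results["tf-serving-release"] = resp.status_code == 200
+    # TF Serving API packages on PyPI (CPU and GPU variants)
+    for pkg in ("tensorflow-serving-api", "tensorflow-serving-api-gpu"):
+        resp = requests.get(f"https://pypi.org/pypi/{pkg}/{version}/json")
+        results[pkg] = resp.status_code == 200
+    # License file in the public S3 bucket, keyed by short version (e.g. "2.20")
+    short_version = ".".join(version.split(".")[:2])
+    resp = requests.head(
+        f"https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-{short_version}/license.txt"
+    )
+    results["license-file"] = resp.status_code == 200
+    return results
+
+
+if __name__ == "__main__":
+    version = sys.argv[1] if len(sys.argv) > 1 else "2.20.0"
+    for artifact, available in check_tf_artifacts(version).items():
+        print(f"{artifact}: {'available' if available else 'MISSING - fall back to previous version'}")
+```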
+---
+
+## References
+
+- [TensorFlow Serving Releases](https://github.com/tensorflow/serving/releases)
+- [TensorFlow Serving API on PyPI](https://pypi.org/project/tensorflow-serving-api/)
+- [NVIDIA CUDA Toolkit Documentation](https://docs.nvidia.com/cuda/)
+- [AWS DLC License Bucket](https://aws-dlc-licenses.s3.amazonaws.com/)
diff --git a/tensorflow/inference/buildspec-2-20-sm.yml b/tensorflow/inference/buildspec-2-20-sm.yml
new file mode 100644
index 000000000000..92ac899a2e2c
--- /dev/null
+++ b/tensorflow/inference/buildspec-2-20-sm.yml
@@ -0,0 +1,77 @@
+account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
+prod_account_id: &PROD_ACCOUNT_ID 763104351884
+region: &REGION <set-$REGION-in-environment>
+framework: &FRAMEWORK tensorflow
+version: &VERSION 2.20.0
+short_version: &SHORT_VERSION "2.20"
+arch_type: x86
+# autopatch_build: "True"
+
+repository_info:
+  inference_repository: &INFERENCE_REPOSITORY
+    image_type: &INFERENCE_IMAGE_TYPE inference
+    root: !join [ *FRAMEWORK, "/", *INFERENCE_IMAGE_TYPE ]
+    repository_name: &REPOSITORY_NAME !join [pr, "-", *FRAMEWORK, "-", *INFERENCE_IMAGE_TYPE]
+    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
+    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *INFERENCE_IMAGE_TYPE ]
+    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/,
+                                                    *RELEASE_REPOSITORY_NAME ]
+
+context:
+  inference_context: &INFERENCE_CONTEXT
+    start_cuda_compat:
+      source: docker/build_artifacts/start_cuda_compat.sh
+      target: start_cuda_compat.sh
+    dockerd_entrypoint:
+      source: docker/build_artifacts/dockerd_entrypoint.sh
+      target: dockerd_entrypoint.sh
+    sagemaker_package_name:
+      source: docker/build_artifacts/sagemaker
+      target: sagemaker
+    init:
+      source: docker/build_artifacts/__init__.py
+      target: __init__.py
+    dockerd-entrypoint:
+      source: docker/build_artifacts/dockerd-entrypoint.py
+      target: dockerd-entrypoint.py
+    deep_learning_container:
+      source: ../../src/deep_learning_container.py
+      target: deep_learning_container.py
+    bash_telemetry:
+      source: ../../miscellaneous_scripts/bash_telemetry.sh
+      target: bash_telemetry.sh
+
+images:
+  BuildSageMakerTensorflowCPUInferencePy3DockerImage:
+    <<: *INFERENCE_REPOSITORY
+    build: &TENSORFLOW_CPU_INFERENCE_PY3 false
+    image_size_baseline: &IMAGE_SIZE_BASELINE 4899
+    framework_version: &FRAMEWORK_VERSION 2.20.0
+    device_type: &DEVICE_TYPE cpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py312
+    os_version: &OS_VERSION ubuntu22.04
+    tag: !join [ *FRAMEWORK_VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+    latest_release_tag: !join [ *FRAMEWORK_VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
+    target: sagemaker
+    enable_test_promotion: true
+    context:
+      <<: *INFERENCE_CONTEXT
+  BuildSageMakerTensorflowGPUInferencePy3DockerImage:
+    <<: *INFERENCE_REPOSITORY
+    build: &TENSORFLOW_GPU_INFERENCE_PY3 false
+    image_size_baseline: &IMAGE_SIZE_BASELINE 13100
+    framework_version: &FRAMEWORK_VERSION 2.20.0
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py312
+    cuda_version: &CUDA_VERSION cu125
+    os_version: &OS_VERSION ubuntu22.04
+    tag: !join [ *FRAMEWORK_VERSION, "-", *DEVICE_TYPE, "-", 
*TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + latest_release_tag: !join [ *FRAMEWORK_VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] + target: sagemaker + enable_test_promotion: true + context: + <<: *INFERENCE_CONTEXT diff --git a/tensorflow/inference/buildspec.yml b/tensorflow/inference/buildspec.yml index 7a2dfe62289b..81669fb15808 100644 --- a/tensorflow/inference/buildspec.yml +++ b/tensorflow/inference/buildspec.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-2-19-sm.yml +buildspec_pointer: buildspec-2-20-sm.yml diff --git a/tensorflow/inference/docker/2.20/py3/Dockerfile.cpu b/tensorflow/inference/docker/2.20/py3/Dockerfile.cpu new file mode 100644 index 000000000000..0019f4fe602f --- /dev/null +++ b/tensorflow/inference/docker/2.20/py3/Dockerfile.cpu @@ -0,0 +1,231 @@ +######################################################## +# _____ ____ ____ ___ +# | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___ +# | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \ +# | |__| |___ / __/ | || | | | | | (_| | (_| | __/ +# |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___| +# |___/ +# ____ _ +# | _ \ ___ ___(_)_ __ ___ +# | |_) / _ \/ __| | '_ \ / _ \ +# | _ < __/ (__| | |_) | __/ +# |_| \_\___|\___|_| .__/ \___| +# |_| +######################################################## + +# Note: Using TF Serving 2.19.0 as TF Serving 2.20.0 is not yet released +# TF Serving 2.19 is compatible with TF 2.20 models (SavedModel format is backward compatible) +FROM tensorflow/serving:2.19.0-devel as build_image + +FROM ubuntu:22.04 AS base_image + +ENV DEBIAN_FRONTEND=noninteractive \ + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +FROM base_image AS ec2 + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON=python3.12 +ARG PYTHON_PIP=python3-pip +ARG PIP=pip3 +ARG PYTHON_VERSION=3.12.10 +ARG TFS_SHORT_VERSION=2.20 + + +# ENV variable to be passed to SageMaker stage +ENV PIP=${PIP} +ENV PYTHON=${PYTHON} +ENV PYTHON_VERSION=${PYTHON_VERSION} + +# See http://bugs.python.org/issue19846 +ENV LANG=C.UTF-8 +# Python won't try to write .pyc or .pyo files on the import of source modules +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV LD_LIBRARY_PATH='/usr/local/lib:$LD_LIBRARY_PATH' +ENV MODEL_BASE_PATH=/models +# The only required piece is the model name in order to differentiate endpoints +ENV MODEL_NAME=model +ENV DEBIAN_FRONTEND=noninteractive + +# First install basic tools needed for Python compilation +RUN apt-get update \ + && apt-get -y install --no-install-recommends \ + ca-certificates \ + curl \ + wget \ + gnupg2 \ + build-essential \ + zlib1g-dev \ + libssl-dev \ + libbz2-dev \ + liblzma-dev \ + libffi-dev \ + libreadline-dev \ + libncursesw5-dev \ + libsqlite3-dev \ + libgdbm-dev \ + tk-dev \ + libc6-dev \ + openssl \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install python3.12 +RUN wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \ + && tar -xvf Python-${PYTHON_VERSION}.tgz \ + && cd Python-${PYTHON_VERSION} \ + && ./configure && make && make install \ + && rm -rf ../Python-${PYTHON_VERSION}* + +# Install remaining packages +RUN apt-get update \ + && apt-get -y install 
--no-install-recommends \ + emacs \ + git \ + unzip \ + vim \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Upgrade pip, setuptools, and wheel to fix CVE-2026-24049 +RUN ${PIP} --no-cache-dir install --upgrade \ + pip \ + "setuptools>=75.8.2" \ + "wheel>=0.46.2" \ + && rm -rf /usr/local/lib/python*/site-packages/setuptools/_vendor/wheel* \ + && rm -rf /usr/local/lib/python*/site-packages/setuptools/_distutils_hack/__pycache__ + +RUN ${PIP} install --no-cache-dir \ + "awscli<2" \ + boto3 \ + cython \ + gevent \ + "requests>=2.32.0" \ + "urllib3>=2.5.0" \ + grpcio \ + "protobuf>=4.25.0,<6.0.0" \ + packaging \ +# using --no-dependencies to avoid installing tensorflow binary +# Note: Using TF Serving API 2.19.0 as 2.20.0 is not yet released + && ${PIP} install --no-dependencies --no-cache-dir \ + tensorflow-serving-api=="2.19.0" + +# Some TF tools expect a "python" binary +RUN ln -s $(which ${PYTHON}) /usr/local/bin/python \ + && ln -s $(which ${PIP}) /usr/bin/pip + + +# Install TF Serving pkg +COPY --from=build_image /usr/local/bin/tensorflow_model_server /usr/bin/tensorflow_model_server + +# Expose ports +# gRPC and REST +EXPOSE 8500 8501 + +# Set where models should be stored in the container +RUN mkdir -p ${MODEL_BASE_PATH} + +ADD https://raw.githubusercontent.com/aws/deep-learning-containers/master/src/deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/deep_learning_container.py + +COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh + +RUN chmod +x /usr/local/bin/bash_telemetry.sh + +RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc + +# Create a script that runs the model server so we can use environment variables +# while also passing in arguments from the docker command line +RUN echo '#!/bin/bash \n\n' > /usr/bin/tf_serving_entrypoint.sh \ + && echo 'bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true' >> /usr/bin/tf_serving_entrypoint.sh \ + && echo '/usr/bin/tensorflow_model_server --port=8500 --rest_api_port=8501 --model_name=${MODEL_NAME} --model_base_path=${MODEL_BASE_PATH}/${MODEL_NAME} "$@"' >> /usr/bin/tf_serving_entrypoint.sh \ + && chmod +x /usr/bin/tf_serving_entrypoint.sh + +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf ${HOME_DIR}/oss_compliance* + +# Use TF 2.19 license file since TF 2.20 license is not yet available +RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.19/license.txt -o /license.txt + +RUN rm -rf /tmp/* + +CMD ["/usr/bin/tf_serving_entrypoint.sh"] + +################################################################# +# ____ __ __ _ +# / ___| __ _ __ _ ___| \/ | __ _| | _____ _ __ +# \___ \ / _` |/ _` |/ _ \ |\/| |/ _` | |/ / _ \ '__| +# ___) | (_| | (_| | __/ | | | (_| | < __/ | +# |____/ \__,_|\__, |\___|_| |_|\__,_|_|\_\___|_| +# |___/ +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ 
|_| +################################################################# + +FROM ec2 AS sagemaker + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +# Specify accept-bind-to-port LABEL for inference pipelines to use SAGEMAKER_BIND_TO_PORT +# https://docs.aws.amazon.com/sagemaker/latest/dg/inference-pipeline-real-time.html +LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true +LABEL com.amazonaws.sagemaker.capabilities.multi-models=true + +ARG TFS_SHORT_VERSION=2.20 +ENV SAGEMAKER_TFS_VERSION="${TFS_SHORT_VERSION}" +ENV PATH="$PATH:/sagemaker" + +# nginx + njs +RUN curl -s http://nginx.org/keys/nginx_signing.key | apt-key add - \ + && echo 'deb http://nginx.org/packages/ubuntu/ jammy nginx' >> /etc/apt/sources.list \ + && apt-get update \ + && apt-get -y install --no-install-recommends \ + nginx \ + nginx-module-njs \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# the Pins are for the TFS SageMaker Toolkit +RUN ${PIP} install --no-cache-dir \ + falcon==3.1.0 \ + "gunicorn>=22.0.0" + +COPY ./sagemaker /sagemaker + +# Expose ports +# gRPC and REST +EXPOSE 8500 8501 + +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf ${HOME_DIR}/oss_compliance* + +RUN rm -rf /tmp/* + +CMD ["/usr/bin/tf_serving_entrypoint.sh"] diff --git a/tensorflow/inference/docker/2.20/py3/cu125/Dockerfile.gpu b/tensorflow/inference/docker/2.20/py3/cu125/Dockerfile.gpu new file mode 100644 index 000000000000..15d52cc6f118 --- /dev/null +++ b/tensorflow/inference/docker/2.20/py3/cu125/Dockerfile.gpu @@ -0,0 +1,305 @@ +######################################################## +# _____ ____ ____ ___ +# | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___ +# | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \ +# | |__| |___ / __/ | || | | | | | (_| | (_| | __/ +# |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___| +# |___/ +# ____ _ +# | _ \ ___ ___(_)_ __ ___ +# | |_) / _ \/ __| | '_ \ / _ \ +# | _ < __/ (__| | |_) | __/ +# |_| \_\___|\___|_| .__/ \___| +# |_| +######################################################## + +# Note: Using TF Serving 2.19.0 as TF Serving 2.20.0 is not yet released +# TF Serving 2.19 is compatible with TF 2.20 models (SavedModel format is backward compatible) +FROM tensorflow/serving:2.19.0-devel-gpu as build_image + +FROM nvidia/cuda:12.5.0-base-ubuntu22.04 AS base_image + +ENV DEBIAN_FRONTEND=noninteractive \ + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +FROM base_image AS ec2 + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON=python3.12 +ARG PYTHON_PIP=python3-pip +ARG PIP=pip3 +ARG PYTHON_VERSION=3.12.10 +ARG TFS_SHORT_VERSION=2.20 +ARG CUDA_DASH=12-5 + +# TF Serving 2.19.0 git commit +ARG TF_SERVING_VERSION_GIT_COMMIT=c491ba6 + +# ENV variable to be passed to SageMaker stage +ENV PIP=${PIP} +ENV PYTHON=${PYTHON} +ENV PYTHON_VERSION=${PYTHON_VERSION} + + +ENV CUDA_DASH=12-5 +ENV NCCL_VERSION=2.21.5-1+cuda12.5 +ENV CUDNN_VERSION=9.3.0.75-1 + +# See 
http://bugs.python.org/issue19846 +ENV LANG=C.UTF-8 +ENV PYTHONDONTWRITEBYTECODE=1 +# Python won't try to write .pyc or .pyo files on the import of source modules +ENV PYTHONUNBUFFERED=1 +ENV MODEL_BASE_PATH=/models +# The only required piece is the model name in order to differentiate endpoints +ENV MODEL_NAME=model +# Fix for the interactive mode during an install in step 21 +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl + +RUN curl -sSL --retry 5 https://raw.githubusercontent.com/tensorflow/serving/${TF_SERVING_VERSION_GIT_COMMIT}/tensorflow_serving/tools/docker/setup.sources.sh | sh + + +# First install basic tools needed for Python compilation +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + wget \ + gnupg2 \ + build-essential \ + zlib1g-dev \ + libssl-dev \ + libbz2-dev \ + liblzma-dev \ + libffi-dev \ + libreadline-dev \ + libncursesw5-dev \ + libsqlite3-dev \ + libgdbm-dev \ + tk-dev \ + libc6-dev \ + openssl \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb && \ + dpkg -i cuda-keyring_1.0-1_all.deb && \ + rm cuda-keyring_1.0-1_all.deb && \ + apt-get update + +# Install python +RUN wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \ + && tar -xvf Python-${PYTHON_VERSION}.tgz \ + && cd Python-${PYTHON_VERSION} \ + && ./configure && make && make install -j \ + && rm -rf ../Python-${PYTHON_VERSION}* + +RUN apt-get update \ + && apt-get install -y --no-install-recommends --allow-unauthenticated --allow-downgrades \ + cuda-command-line-tools-${CUDA_DASH} \ + cuda-cupti-${CUDA_DASH} \ + cuda-libraries-${CUDA_DASH} \ + cuda-nvprune-${CUDA_DASH} \ + cuda-nvrtc-${CUDA_DASH} \ + cuda-nvrtc-dev-${CUDA_DASH} \ + cuda-cudart-dev-${CUDA_DASH} \ + cuda-nvcc-${CUDA_DASH} \ + libcufft-${CUDA_DASH} \ + libcufft-dev-${CUDA_DASH} \ + libcurand-${CUDA_DASH} \ + libcurand-dev-${CUDA_DASH} \ + libcusolver-${CUDA_DASH} \ + libcusolver-dev-${CUDA_DASH} \ + libcusparse-dev-${CUDA_DASH} \ + #cuda-cublas-dev not available with 10-1, install libcublas instead + libcublas-${CUDA_DASH} \ + libcublas-dev-${CUDA_DASH} \ + libcudnn9-cuda-12=${CUDNN_VERSION} \ + libcudnn9-dev-cuda-12=${CUDNN_VERSION} \ + libnccl2=${NCCL_VERSION} \ + libnccl-dev=${NCCL_VERSION} \ + libgomp1 \ + emacs \ + git \ + unzip \ + vim \ + libfreetype6-dev \ + pkg-config \ + software-properties-common \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + + +# Upgrade pip, setuptools, and wheel to fix CVE-2026-24049 +RUN ${PIP} --no-cache-dir install --upgrade \ + pip \ + "setuptools>=75.8.2" \ + "wheel>=0.46.2" \ + && rm -rf /usr/local/lib/python*/site-packages/setuptools/_vendor/wheel* \ + && rm -rf /usr/local/lib/python*/site-packages/setuptools/_distutils_hack/__pycache__ + +# Upgrade libsasl2-2 for fixing cyrus-sasl2 related CVE +RUN apt-get install -y --only-upgrade libsasl2-2 + +# Some TF tools expect a "python" binary +RUN ln -s $(which ${PYTHON}) /usr/local/bin/python \ + && ln -s $(which ${PIP}) /usr/bin/pip + +RUN apt-get update \ + && apt-get -y install --no-install-recommends \ + curl \ + gnupg2 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# patch nvjpeg - use latest available version for CUDA 12.x +RUN mkdir -p /tmp/nvjpeg \ +&& cd /tmp/nvjpeg \ +&& wget 
https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \ +&& tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \ +&& rm -rf /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \ +&& rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \ +&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/targets/x86_64-linux/lib/ \ +&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/targets/x86_64-linux/include/ \ +&& rm -rf /tmp/nvjpeg \ +# patch cuobjdump and nvdisasm +&& rm -rf /usr/local/cuda/bin/cuobjdump* \ +&& rm -rf /usr/local/cuda/bin/nvdisasm* + +RUN ${PIP} install -U --no-cache-dir \ + "awscli<2" \ + boto3 \ + cython \ + gevent \ + "requests>=2.32.0" \ + "urllib3>=2.5.0" \ + grpcio \ + "protobuf>=4.25.0,<6.0.0" \ + packaging \ +# using --no-dependencies to avoid installing tensorflow binary +# Note: Using TF Serving API 2.19.0 as 2.20.0 is not yet released + && ${PIP} install --no-dependencies --no-cache-dir \ + tensorflow-serving-api-gpu=="2.19.0" + + +# Install TF Serving GPU pkg +COPY --from=build_image /usr/local/bin/tensorflow_model_server /usr/bin/tensorflow_model_server + +# Expose gRPC and REST port +EXPOSE 8500 8501 + +# Set where models should be stored in the container +RUN mkdir -p ${MODEL_BASE_PATH} + +ADD https://raw.githubusercontent.com/aws/deep-learning-containers/master/src/deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/deep_learning_container.py + +COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh + +RUN chmod +x /usr/local/bin/bash_telemetry.sh + +RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc + +# Create a script that runs the model server so we can use environment variables +# while also passing in arguments from the docker command line +RUN echo '#!/bin/bash \n\n' > /usr/bin/tf_serving_entrypoint.sh \ + && echo 'bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true' >> /usr/bin/tf_serving_entrypoint.sh \ + && echo '/usr/bin/tensorflow_model_server --port=8500 --rest_api_port=8501 --model_name=${MODEL_NAME} --model_base_path=${MODEL_BASE_PATH}/${MODEL_NAME} "$@"' >> /usr/bin/tf_serving_entrypoint.sh \ + && chmod +x /usr/bin/tf_serving_entrypoint.sh + +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf ${HOME_DIR}/oss_compliance* + +# Use TF 2.19 license file since TF 2.20 license is not yet available +RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.19/license.txt -o /license.txt + +RUN rm -rf /tmp/* + +CMD ["/usr/bin/tf_serving_entrypoint.sh"] + +################################################################# +# ____ __ __ _ +# / ___| __ _ __ _ ___| \/ | __ _| | _____ _ __ +# \___ \ / _` |/ _` |/ _ \ |\/| |/ _` | |/ / _ \ '__| +# ___) | (_| | (_| | __/ | | | (_| | < __/ | +# |____/ \__,_|\__, |\___|_| |_|\__,_|_|\_\___|_| +# |___/ +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| 
| __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM ec2 AS sagemaker + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +# Specify accept-bind-to-port LABEL for inference pipelines to use SAGEMAKER_BIND_TO_PORT +# https://docs.aws.amazon.com/sagemaker/latest/dg/inference-pipeline-real-time.html +LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true +LABEL com.amazonaws.sagemaker.capabilities.multi-models=true +LABEL com.amazonaws.sagemaker.inference.cuda.verified_versions=12.5 + +ARG TFS_SHORT_VERSION=2.20 +ENV SAGEMAKER_TFS_VERSION="${TFS_SHORT_VERSION}" +ENV PATH="$PATH:/sagemaker" + +# nginx + njs +RUN curl -s http://nginx.org/keys/nginx_signing.key | apt-key add - \ + && echo 'deb http://nginx.org/packages/ubuntu/ jammy nginx' >> /etc/apt/sources.list \ + && apt-get update \ + && apt-get -y install --no-install-recommends \ + nginx \ + nginx-module-njs \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# the Pins are for the TFS SageMaker Toolkit +RUN ${PIP} install -U --no-cache-dir \ + falcon==3.1.0 \ + "gunicorn>=22.0.0" + +COPY ./sagemaker /sagemaker + +COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh +COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh +RUN chmod +x /usr/local/bin/start_cuda_compat.sh +RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh + +# Expose gRPC and REST port +EXPOSE 8500 8501 + +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf ${HOME_DIR}/oss_compliance* + +RUN rm -rf /tmp/* + +ENTRYPOINT ["bash", "-m", "/usr/local/bin/dockerd_entrypoint.sh"] +CMD ["/usr/bin/tf_serving_entrypoint.sh"] diff --git a/test/dlc_tests/sanity/test_pre_release.py b/test/dlc_tests/sanity/test_pre_release.py index b2959cfcdca7..10e4cb638f14 100644 --- a/test/dlc_tests/sanity/test_pre_release.py +++ b/test/dlc_tests/sanity/test_pre_release.py @@ -256,6 +256,14 @@ def test_tf_serving_version_cpu(tensorflow_inference): "Skipping this test for TF 2.6.3 inference as the v2.6.3 version is already on production" ) + # TF Serving 2.20 is not yet released, so TF 2.20 images use TF Serving 2.19 + # Map TF 2.20.x to expected TF Serving 2.19.x + expected_serving_version = tag_framework_version + if Version(tag_framework_version) >= Version("2.20.0") and Version( + tag_framework_version + ) < Version("2.21.0"): + expected_serving_version = "2.19" + ctx = Context() container_name = get_container_name("tf-serving-version", image) start_container(container_name, image, ctx) @@ -263,8 +271,8 @@ def test_tf_serving_version_cpu(tensorflow_inference): container_name, ctx, "tensorflow_model_server --version", executable="bash" ) assert re.match( - rf"TensorFlow ModelServer: {tag_framework_version}(\D+)?", output.stdout - ), f"Cannot find model server version {tag_framework_version} in {output.stdout}" + rf"TensorFlow ModelServer: {expected_serving_version}(\D+)?", output.stdout + ), f"Cannot find model server version {expected_serving_version} in {output.stdout}" 
stop_and_remove_container(container_name, ctx) @@ -295,6 +303,13 @@ def test_tf_serving_api_version(tensorflow_inference): _, tag_framework_version = get_framework_and_version_from_tag(image) + # TF Serving 2.20 is not yet released, so TF 2.20 images use TF Serving API 2.19 + expected_serving_api_version = tag_framework_version + if Version(tag_framework_version) >= Version("2.20.0") and Version( + tag_framework_version + ) < Version("2.21.0"): + expected_serving_api_version = "2.19.0" + ctx = Context() container_name = get_container_name("tf-serving-api-version", image) start_container(container_name, image, ctx) @@ -302,8 +317,8 @@ def test_tf_serving_api_version(tensorflow_inference): output = run_cmd_on_container(container_name, ctx, cmd, executable="bash") str_version_from_output = ((str(output.stdout).split(" "))[1]).strip() assert ( - tag_framework_version == str_version_from_output - ), f"Tensorflow serving API version is {str_version_from_output} while the Tensorflow version is {tag_framework_version}. Both don't match!" + expected_serving_api_version == str_version_from_output + ), f"Tensorflow serving API version is {str_version_from_output} while the expected version is {expected_serving_api_version}. Both don't match!" except Exception as e: LOGGER.error(f"Unable to execute command on container. Error: {e}") raise @@ -786,7 +801,8 @@ def test_cuda_paths(gpu): if image_spec["device_type"] == "gpu" and image_spec["tag"] == image_tag: image_tag_in_buildspec = True dockerfile_spec_abs_path = os.path.join( - os.path.dirname(framework_version_path), image_spec["docker_file"].lstrip("docker/") + os.path.dirname(framework_version_path), + image_spec["docker_file"].lstrip("docker/"), ) break try: @@ -899,11 +915,18 @@ def _test_framework_and_cuda_version(gpu, ec2_connection): ), image_repo_name, ): + # TF Serving 2.20 is not yet released, so TF 2.20 images use TF Serving 2.19 + expected_serving_version = tag_framework_version + if Version(tag_framework_version) >= Version("2.20.0") and Version( + tag_framework_version + ) < Version("2.21.0"): + expected_serving_version = "2.19" + cmd = f"tensorflow_model_server --version" output = ec2.execute_ec2_training_test(ec2_connection, image, cmd, executable="bash").stdout assert re.match( - rf"TensorFlow ModelServer: {tag_framework_version}(\D+)?", output - ), f"Cannot find model server version {tag_framework_version} in {output}" + rf"TensorFlow ModelServer: {expected_serving_version}(\D+)?", output + ), f"Cannot find model server version {expected_serving_version} in {output}" else: # Framework name may include huggingface if any( @@ -1094,6 +1117,10 @@ def test_license_file(image): else: raise Exception(f"Invalid huggingface framework detected: {framework}") + # TF 2.20 license is not yet available in S3, use TF 2.19 license + if framework == "tensorflow" and short_version == "2.20": + short_version = "2.19" + LICENSE_FILE_BUCKET = "aws-dlc-licenses" local_repo_path = get_repository_local_path() container_filename = "CONTAINER_LICENSE_FILE"