Commit 1245ddc

Merge branch 'master' into 2.7-tr
2 parents: cec9d38 + 7374fd7

12 files changed: +285 -75 lines changed

base/x86_64/gpu/cu129/ubuntu22.04/Dockerfile

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ RUN mv /usr/local/cuda/compat /usr/local \
     libffi-dev \
     libbz2-dev \
     liblzma-dev \
+    libsqlite3-dev \
     && apt-get autoremove -y \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
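
The added libsqlite3-dev package provides the SQLite headers that CPython needs at compile time to build its standard-library `_sqlite3` extension; without them, a from-source Python build silently skips the module. A minimal, illustrative smoke test to run inside the finished image (assuming Python in this image is built from source, as the surrounding Dockerfile suggests):

# Illustrative smoke test: confirms the interpreter was compiled with SQLite support.
import sqlite3

conn = sqlite3.connect(":memory:")  # in-memory database, nothing written to disk
conn.execute("CREATE TABLE smoke (id INTEGER PRIMARY KEY, note TEXT)")
conn.execute("INSERT INTO smoke (note) VALUES (?)", ("built with libsqlite3-dev",))
print("sqlite", sqlite3.sqlite_version, "-", conn.execute("SELECT note FROM smoke").fetchone()[0])
conn.close()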

release_images_general.yml

Lines changed: 15 additions & 1 deletion
@@ -15,6 +15,20 @@ release_images:
       force_release: False
       public_registry: True
   2:
+    framework: "base"
+    version: "12.8.1"
+    arch_type: "x86"
+    customer_type: "ec2"
+    general:
+      device_types: [ "gpu" ]
+      python_versions: [ "py312" ]
+      os_version: "ubuntu22.04"
+      cuda_version: "cu128"
+      example: False
+      disable_sm_tag: False
+      force_release: False
+      public_registry: True
+  3:
     framework: "base"
     version: "12.9.1"
     arch_type: "x86"
@@ -28,7 +42,7 @@ release_images:
       disable_sm_tag: False
       force_release: False
       public_registry: True
-  3:
+  4:
     framework: "vllm"
     version: "0.10.0"
     arch_type: "x86"
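
For context, the numbered entries under release_images are plain YAML mappings, so the new cu128 block can be inspected with any YAML loader. A rough sketch of listing the release targets follows; this is not the repository's actual release tooling, and the file name and key layout are taken only from the diff above:

# Rough sketch, not the repo's release tooling: enumerate release targets.
import yaml  # PyYAML assumed available

with open("release_images_general.yml") as f:
    release_images = yaml.safe_load(f)["release_images"]

for index in sorted(release_images):
    entry = release_images[index]
    general = entry.get("general", {})
    print(
        f"{index}: {entry['framework']} {entry['version']} "
        f"cuda={general.get('cuda_version')} py={general.get('python_versions')}"
    )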

src/image.py

Lines changed: 186 additions & 50 deletions
@@ -20,7 +20,7 @@
 
 import constants
 import logging
-import json
+import subprocess
 
 LOGGER = logging.getLogger(__name__)
 LOGGER.setLevel(logging.INFO)
@@ -173,10 +173,14 @@ def build(self):
         # Conduct some preprocessing before building the image
         self.update_pre_build_configuration()
 
-        # Start building the image
-        with open(self.context.context_path, "rb") as context_file:
-            self.docker_build(fileobj=context_file, custom_context=True)
-        self.context.remove()
+        # Start building the image with Buildx
+        build_start_time = datetime.now()
+        self.docker_build(context_path=self.context.context_path, custom_context=True)
+        build_end_time = datetime.now()
+        duration_seconds = (build_end_time - build_start_time).total_seconds()
+        LOGGER.info(f"Build duration: {duration_seconds:.2f} seconds")
+
+        self.context.remove()
 
         if self.build_status != constants.SUCCESS:
             LOGGER.info(f"Exiting with image build status {self.build_status} without image check.")
@@ -193,64 +197,196 @@ def build(self):
         # This return is necessary. Otherwise FORMATTER fails while displaying the status.
         return self.build_status
 
-    def docker_build(self, fileobj=None, custom_context=False):
+    def docker_build(self, context_path, custom_context=False):
         """
-        Uses low level Docker API Client to actually start the process of building the image.
+        Uses Docker Buildx for vLLM images, falls back to legacy Docker API for others
 
-        :param fileobj: FileObject, a readable file-like object pointing to the context tarfile.
-        :param custom_context: bool
-        :return: int, Build Status
+        :param context_path: str, Path to build context
+        :param custom_context: bool, Whether to use custom context from stdin (default: False)
+        :return: int, Build status
         """
-        response = [f"Starting the Build Process for {self.repository}:{self.tag}"]
-        LOGGER.info(f"Starting the Build Process for {self.repository}:{self.tag}")
+        if self._is_vllm_image():
+            LOGGER.info(f"Using Buildx for vLLM image: {self.repository}:{self.tag}")
+            return self._buildx_build(context_path, custom_context)
+        else:
+            LOGGER.info(f"Using legacy Docker API for non-vLLM image: {self.repository}:{self.tag}")
+            return self._legacy_docker_build(context_path, custom_context)
 
-        line_counter = 0
-        line_interval = 50
-        for line in self.client.build(
-            fileobj=fileobj,
-            path=self.dockerfile,
-            custom_context=custom_context,
-            rm=True,
-            decode=True,
-            tag=self.ecr_url,
-            buildargs=self.build_args,
-            labels=self.labels,
-            target=self.target,
-        ):
-            # print the log line during build for every line_interval lines for debugging
-            if line_counter % line_interval == 0:
-                LOGGER.debug(line)
-            line_counter += 1
+    def _is_vllm_image(self):
+        """
+        Determine if current image is a vLLM image
 
-            if line.get("error") is not None:
-                response.append(line["error"])
-                self.log.append(response)
-                self.build_status = constants.FAIL
-                self.summary["status"] = constants.STATUS_MESSAGE[self.build_status]
-                self.summary["end_time"] = datetime.now()
+        :return: bool, True if this is a vLLM image
+        """
+        return (
+            self.info.get("framework") == "vllm"
+            or "vllm" in self.repository.lower()
+            or "vllm" in str(self.info.get("name", "")).lower()
+        )
 
-                LOGGER.info(f"Docker Build Logs: \n {self.get_tail_logs_in_pretty_format(100)}")
-                LOGGER.error("ERROR during Docker BUILD")
-                LOGGER.error(
-                    f"Error message received for {self.dockerfile} while docker build: {line}"
-                )
+    def _buildx_build(self, context_path, custom_context=False):
+        """
+        Uses Docker Buildx CLI for building with real-time streaming and advanced caching.
 
-                return self.build_status
 
-            if line.get("stream") is not None:
-                response.append(line["stream"])
-            elif line.get("status") is not None:
-                response.append(line["status"])
+        Automatically finds and uses the latest available image as a cache source from ECR
+        to speed up builds through layer reuse.
+
+        :param context_path: str, Path to build context
+        :param custom_context: bool, Whether to use custom context from stdin (default: False)
+        :return: int, Build status
+        """
+
+        response = [f"Starting Buildx Process for {self.repository}:{self.tag}"]
+        LOGGER.info(f"Starting Buildx Process for {self.repository}:{self.tag}")
+
+        cmd = [
+            "docker",
+            "buildx",
+            "build",
+            "-t",
+            self.ecr_url,
+            "--progress=plain",  # Real-time log streaming
+        ]
+
+        for k, v in self.build_args.items():
+            cmd.extend(["--build-arg", f"{k}={v}"])
+
+        for k, v in self.labels.items():
+            cmd.extend(["--label", f"{k}={v}"])
+
+        if self.target:
+            cmd.extend(["--target", self.target])
+
+        # Always use inline cache-to for maximum caching
+        cmd.extend(["--cache-to", "type=inline"])
+
+        # Use shortest tag from additional_tags as a suitable cache source
+        latest_tag = min(self.additional_tags, key=len)
+
+        if latest_tag:
+            latest_image_uri = f"{self.repository}:{latest_tag}"
+            LOGGER.info(f"Using cache from registry: {latest_image_uri}")
+            cmd.extend(["--cache-from", f"type=registry,ref={latest_image_uri}"])
+        else:
+            LOGGER.info("No suitable cache source found. Proceeding without registry cache")
+
+        if custom_context:
+            cmd.append("-")
+        else:
+            cmd.append(context_path)
+
+        context_tarball = open(context_path, "rb") if custom_context else None
+
+        try:
+            process = subprocess.Popen(
+                cmd,
+                stdin=context_tarball,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                universal_newlines=True,
+                bufsize=1,
+            )
+
+            # Stream output in real-time
+            for line in iter(process.stdout.readline, ""):
+                line = line.rstrip()
+                if line:
+                    response.append(line)
+                    LOGGER.info(line)
+
+            process.wait()
+
+            if process.returncode == 0:
+                self.build_status = constants.SUCCESS
+                LOGGER.info(f"Completed Buildx for {self.repository}:{self.tag}")
             else:
-                response.append(str(line))
+                self.build_status = constants.FAIL
+                LOGGER.error(f"Buildx failed for {self.repository}:{self.tag}")
+
+        except Exception as e:
+            response.append(f"Buildx error: {str(e)}")
+            self.build_status = constants.FAIL
+            LOGGER.error(f"Buildx exception: {str(e)}")
+        finally:
+            if context_tarball:
+                context_tarball.close()
 
         self.log.append(response)
+        return self.build_status
 
-        LOGGER.info(f"DOCKER BUILD LOGS: \n{self.get_tail_logs_in_pretty_format()}")
-        LOGGER.info(f"Completed Build for {self.repository}:{self.tag}")
+    def _legacy_docker_build(self, context_path, custom_context=False):
+        """
+        Uses legacy Docker API Client to build the image (for non-vLLM images).
 
-        self.build_status = constants.SUCCESS
-        return self.build_status
+        :param context_path: str, Path to build context
+        :param custom_context: bool, Whether to use custom context from stdin (default: False)
+        :return: int, Build Status
+        """
+        response = [f"Starting Legacy Docker Build Process for {self.repository}:{self.tag}"]
+        LOGGER.info(f"Starting Legacy Docker Build Process for {self.repository}:{self.tag}")
+
+        # Open context tarball for legacy API
+        fileobj = open(context_path, "rb") if custom_context else None
+
+        line_counter = 0
+        line_interval = 50
+
+        try:
+            for line in self.client.build(
+                fileobj=fileobj,
+                path=self.dockerfile if not custom_context else None,
+                custom_context=custom_context,
+                rm=True,
+                decode=True,
+                tag=self.ecr_url,
+                buildargs=self.build_args,
+                labels=self.labels,
+                target=self.target,
+            ):
+                # print the log line during build for every line_interval lines
+                if line_counter % line_interval == 0:
+                    LOGGER.info(line)
+                line_counter += 1
+
+                if line.get("error") is not None:
+                    response.append(line["error"])
+                    self.log.append(response)
+                    self.build_status = constants.FAIL
+                    self.summary["status"] = constants.STATUS_MESSAGE[self.build_status]
+                    self.summary["end_time"] = datetime.now()
+
+                    LOGGER.info(f"Docker Build Logs: \n {self.get_tail_logs_in_pretty_format(100)}")
+                    LOGGER.error("ERROR during Docker BUILD")
+                    LOGGER.error(
+                        f"Error message received for {self.dockerfile} while docker build: {line}"
+                    )
+
+                    return self.build_status
+
+                if line.get("stream") is not None:
+                    response.append(line["stream"])
+                elif line.get("status") is not None:
+                    response.append(line["status"])
+                else:
+                    response.append(str(line))
+
+            self.log.append(response)
+
+            LOGGER.info(f"DOCKER BUILD LOGS: \n{self.get_tail_logs_in_pretty_format()}")
+            LOGGER.info(f"Completed Legacy Build for {self.repository}:{self.tag}")
+
+            self.build_status = constants.SUCCESS
+            return self.build_status
+
+        except Exception as e:
+            response.append(f"Legacy Docker build error: {str(e)}")
+            self.build_status = constants.FAIL
+            LOGGER.error(f"Legacy Docker build exception: {str(e)}")
+            return self.build_status
+        finally:
+            if fileobj:
+                fileobj.close()
 
     def image_size_check(self):
         """

tensorflow/training/buildspec-2-18-sm.yml

Lines changed: 2 additions & 2 deletions
@@ -42,7 +42,7 @@ images:
     tag_python_version: &TAG_PYTHON_VERSION py310
     os_version: &OS_VERSION ubuntu22.04
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+    # latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
     # skip_build: "False"
     target: sagemaker
@@ -59,7 +59,7 @@ images:
     cuda_version: &CUDA_VERSION cu125
     os_version: &OS_VERSION ubuntu22.04
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+    # latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
    # skip_build: "False"
    target: sagemaker
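
The !join tags in these buildspecs are a custom YAML constructor that concatenates the listed scalars (anchors like *VERSION included) into a single tag string. A minimal sketch of how such a constructor is typically registered with PyYAML, not necessarily the repository's exact implementation:

# Sketch of a `!join` constructor; the real build tooling may differ.
import yaml

def join_constructor(loader, node):
    # Resolve the sequence (aliases included) and concatenate the parts.
    return "".join(str(item) for item in loader.construct_sequence(node))

yaml.SafeLoader.add_constructor("!join", join_constructor)

doc = yaml.safe_load(
    'version: &VERSION 2.18.0\n'
    'device_type: &DEVICE_TYPE cpu\n'
    'tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-sagemaker" ]\n'
)
print(doc["tag"])  # -> 2.18.0-cpu-sagemaker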

tensorflow/training/docker/2.18/py3/Dockerfile.cpu

Lines changed: 5 additions & 7 deletions
@@ -270,20 +270,21 @@ RUN $PYTHON -m pip install --no-cache-dir -U \
     opencv-python \
     plotly \
     seaborn \
-    shap
+    shap \
+    numpy
 
 RUN $PYTHON -m pip install --no-cache-dir -U \
     "sagemaker<3"
 
 RUN $PYTHON -m pip install --no-cache-dir -U \
     sagemaker-experiments==0.1.45
 
-RUN $PYTHON -m pip install --no-cache-dir -U \
-    sagemaker-tensorflow-training
-
 RUN $PYTHON -m pip install --no-cache-dir -U \
     sagemaker-training
 
+RUN $PYTHON -m pip install --no-cache-dir -U \
+    sagemaker-tensorflow-training==20.4.1
+
 RUN $PYTHON -m pip install --no-cache-dir -U \
     sagemaker-studio-analytics-extension==0.1.4
 
@@ -294,9 +295,6 @@ RUN $PYTHON -m pip install --no-cache-dir -U \
     sparkmagic==0.21.0 \
     smclarify
 
-#pin numpy version because of sagemaker-tensorflow-training dependency
-RUN $PYTHON -m pip install --no-cache-dir numpy==1.26.4
-
 # Remove python kernel installed by sparkmagic
 RUN /usr/local/bin/jupyter-kernelspec remove -f python3
 
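
With sagemaker-tensorflow-training now pinned to 20.4.1 and numpy installed through the main pip layer rather than a separate pinned step, a quick and purely illustrative way to confirm what actually landed in the built image (the package list below is an example, not exhaustive):

# Illustrative check inside the built image.
from importlib.metadata import version, PackageNotFoundError

for pkg in ("sagemaker-tensorflow-training", "sagemaker-training", "numpy", "protobuf"):
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed")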

tensorflow/training/docker/2.18/py3/Dockerfile.sagemaker.cpu.core_packages.json

Lines changed: 3 additions & 3 deletions
@@ -6,7 +6,7 @@
     "version_specifier": "<3.0"
   },
   "protobuf": {
-    "version_specifier": ">=5.29.5"
+    "version_specifier": ">=3.20.3"
   },
   "pyyaml": {
     "version_specifier": ">=6.0,<6.1"
@@ -15,10 +15,10 @@
     "version_specifier": ">=2.207.1,<3"
   },
   "sagemaker-tensorflow-training": {
-    "version_specifier": ">=20.4.1,<21"
+    "version_specifier": "==20.4.1"
   },
   "sagemaker-training": {
-    "version_specifier": ">=5"
+    "version_specifier": ">=4.8.3"
   },
   "sagemaker-studio-analytics-extension": {
     "version_specifier": "<1"
