Skip to content

Commit 870cd4b

Browse files
Merge branch 'master' into vllm-ec2
2 parents 68c247d + ada99ed commit 870cd4b

28 files changed

+241
-174
lines changed

eks_infrastructure/build_param.json

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@
44
"MAINLINE"
55
],
66
"eks_clusters": [
7-
"dlc-pytorch",
8-
"dlc-tensorflow",
97
"dlc-vllm"
108
],
119
"eks_version": "1.32",

huggingface/pytorch/training/docker/2.1/py3/sdk2.20.0/Dockerfile.neuronx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ RUN apt-get update \
5858
libgstreamer1.0-0 \
5959
libgstreamer-plugins-base1.0-0 \
6060
libsoup2.4-1 \
61+
libsqlite3-0 \
6162
&& apt-get upgrade -y apparmor \
6263
&& apt-get clean \
6364
&& rm -rf /var/lib/apt/lists/*

huggingface/pytorch/training/docker/2.1/py3/sdk2.20.0/Dockerfile.neuronx.os_scan_allowlist.json

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1907,5 +1907,36 @@
19071907
"title":"CVE-2024-7348 - postgresql-12",
19081908
"reason_to_ignore":"N/A"
19091909
}
1910-
]
1911-
}
1910+
],
1911+
"torch": [
1912+
{
1913+
"description": "In PyTorch <=2.4.1, the RemoteModule has Deserialization RCE. NOTE: this is disputed by multiple parties because this is intended behavior in PyTorch distributed computing.",
1914+
"vulnerability_id": "CVE-2024-48063",
1915+
"name": "CVE-2024-48063",
1916+
"package_name": "torch",
1917+
"package_details": {
1918+
"file_path": "/opt/conda/lib/python3.11/site-packages/torch-2.4.0+cu124.dist-info/METADATA",
1919+
"name": "torch",
1920+
"package_manager": "PYTHON",
1921+
"version": "2.4.0+cu124",
1922+
"release": null
1923+
},
1924+
"remediation": {
1925+
"recommendation": {
1926+
"text": "None Provided"
1927+
}
1928+
},
1929+
"cvss_v3_score": 9.8,
1930+
"cvss_v30_score": 0.0,
1931+
"cvss_v31_score": 9.8,
1932+
"cvss_v2_score": 0.0,
1933+
"cvss_v3_severity": "CRITICAL",
1934+
"source_url": "https://nvd.nist.gov/vuln/detail/CVE-2024-48063",
1935+
"source": "NVD",
1936+
"severity": "CRITICAL",
1937+
"status": "ACTIVE",
1938+
"title": "CVE-2024-48063 - torch",
1939+
"reason_to_ignore": "this container is specifically pytorch 2.4.x so we cant upgrade to later minor versions"
1940+
}
1941+
]
1942+
}

huggingface/pytorch/training/docker/2.1/py3/sdk2.20.0/Dockerfile.neuronx.py_scan_allowlist.json

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,17 @@
1818
"72394": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in versions of the MLflow platform running version 1.27.0 or newer, enabling a maliciously crafted Recipe to execute arbitrary code on an end user’s system when run.', reason_to_ignore='N/A', spec='>=1.27.0'",
1919
"73889": "[pkg: werkzeug] Required by sagemaker. advisory='Affected versions of Werkzeug are potentially vulnerable to resource exhaustion when parsing file data in forms.', reason_to_ignore='N/A', spec='>=3.0.6'",
2020
"73969": "[pkg: werkzeug] Required by sagemaker. advisory='Affected versions of Werkzeug are vulnerable to Path Traversal (CWE-22) on Windows systems running Python versions below 3.11.', reason_to_ignore='N/A', spec='>=3.0.6'",
21-
"72809": "[pkg: gunicorn] A vulnerability in Gunicorn allowed the TolerateDangerousFraming setting to process conflicting headers (Transfer-Encoding and Content-Length) and dangerous characters in HTTP header fields.', reason_to_ignore='N/A', spec='>=23.0.0'"
21+
"72809": "[pkg: gunicorn] A vulnerability in Gunicorn allowed the TolerateDangerousFraming setting to process conflicting headers (Transfer-Encoding and Content-Length) and dangerous characters in HTTP header fields.', reason_to_ignore='N/A', spec='>=23.0.0'",
22+
"77680": "Requests is an HTTP library. Due to a URL parsing issue, Requests releases prior to 2.32.4 may leak .netrc credentials to third parties for specific maliciously-crafted URLs. Users should upgrade to version 2.32.4 to receive a fix. For older versions of Requests, use of the .netrc file can be disabled with `trust_env=False` on one's Requests Session.",
23+
"77740": "Affected versions of this package are vulnerable to a potential Denial of Service (DoS) attack due to unbounded recursion when parsing untrusted Protocol Buffers data. The pure-Python implementation fails to enforce recursion depth limits when processing recursive groups, recursive messages, or a series of SGROUP tags, leading to stack overflow conditions that can crash the application by exceeding Python's recursion limit.",
24+
"78558": "Affected versions of this package are potentially vulnerable to Regular Expression Denial of Service (ReDoS) due to catastrophic backtracking in the V1 engine when processing patterns that combine full‑casefolding with the quantifier. The engine's AnyAll node fails to prevent nested quantifier backtracking, leading to infinite loops and CPU exhaustion.",
25+
"77680": "Requests is an HTTP library. Due to a URL parsing issue, Requests releases prior to 2.32.4 may leak .netrc credentials to third parties for specific maliciously-crafted URLs. Users should upgrade to version 2.32.4 to receive a fix. For older versions of Requests, use of the .netrc file can be disabled with trust_env=False on one's Requests Session.",
26+
"78153": "A Regular Expression Denial of Service (ReDoS) vulnerability was discovered in the Hugging Face Transformers library, specifically within the DonutProcessor class's token2json() method. This vulnerability affects versions 4.51.3 and earlier, and is fixed in version 4.52.1. The issue arises from the regex pattern <s_(.*?)> which can be exploited to cause excessive CPU consumption through crafted input strings due to catastrophic backtracking. This vulnerability can lead to service disruption, resource exhaustion, and potential API service vulnerabilities, impacting document processing tasks using the Donut model.",
27+
"77986": "Hugging Face Transformers versions up to 4.49.0 are affected by an improper input validation vulnerability in the image_utils.py file. The vulnerability arises from insecure URL validation using the startswith() method, which can be bypassed through URL username injection. This allows attackers to craft URLs that appear to be from YouTube but resolve to malicious domains, potentially leading to phishing attacks, malware distribution, or data exfiltration. The issue is fixed in version 4.52.1.",
28+
"77985": "A Regular Expression Denial of Service (ReDoS) vulnerability was discovered in the Hugging Face Transformers library, specifically in the get_configuration_file() function within the transformers.configuration_utils module. The affected version is 4.49.0, and the issue is resolved in version 4.51.0. The vulnerability arises from the use of a regular expression pattern config.(.*).json that can be exploited to cause excessive CPU consumption through crafted input strings, leading to catastrophic backtracking. This can result in model serving disruption, resource exhaustion, and increased latency in applications using the library.",
29+
"77988": "A Regular Expression Denial of Service (ReDoS) vulnerability was discovered in the Hugging Face Transformers library, specifically in the get_imports() function within dynamic_module_utils.py. This vulnerability affects versions 4.49.0 and is fixed in version 4.51.0. The issue arises from a regular expression pattern s*trys*:.*?except.*?: used to filter out try/except blocks from Python code, which can be exploited to cause excessive CPU consumption through crafted input strings due to catastrophic backtracking. This vulnerability can lead to remote code loading disruption, resource exhaustion in model serving, supply chain attack vectors, and development pipeline disruption.",
30+
"77149": "A Regular Expression Denial of Service (ReDoS) vulnerability was identified in the huggingface/transformers library, specifically in the file tokenization_gpt_neox_japanese.py of the GPT-NeoX-Japanese model. The vulnerability occurs in the SubWordJapaneseTokenizer class, where regular expressions process specially crafted inputs. The issue stems from a regex exhibiting exponential complexity under certain conditions, leading to excessive backtracking. This can result in high CPU usage and potential application downtime, effectively creating a Denial of Service (DoS) scenario. The affected version is v4.48.1 (latest).",
31+
"77714": "A vulnerability in the preprocess_string() function of the transformers.testing_utils module in huggingface/transformers version v4.48.3 allows for a Regular Expression Denial of Service (ReDoS) attack. The regular expression used to process code blocks in docstrings contains nested quantifiers, leading to exponential backtracking when processing input with a large number of newline characters. An attacker can exploit this by providing a specially crafted payload, causing high CPU usage and potential application downtime, effectively resulting in a Denial of Service (DoS) scenario.",
32+
"77744": "urllib3 is a user-friendly HTTP client library for Python. Prior to 2.5.0, it is possible to disable redirects for all requests by instantiating a PoolManager and specifying retries in a way that disable redirects. By default, requests and botocore users are not affected. An application attempting to mitigate SSRF or open redirect vulnerabilities by disabling redirects at the PoolManager level will remain vulnerable. This issue has been patched in version 2.5.0.",
33+
"77745": "Urllib3 is a user-friendly HTTP client library for Python. Starting in version 2.2.0 and before 2.5.0, urllib3 does not control redirects in browsers and Node.js. urllib3 supports being used in a Pyodide runtime, utilizing the JavaScript Fetch API or falling back on XMLHttpRequest. This means Python libraries can be used to make HTTP requests from a browser or Node.js. Additionally, urllib3 provides a mechanism to control redirects, but the retries and redirect parameters are ignored with Pyodide; the runtime itself determines redirect behaviour. This issue has been patched in version 2.5.0."
2234
}

huggingface/pytorch/training/docker/2.5/py3/cu124/Dockerfile.gpu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ ENV HF_HUB_ENABLE_HF_TRANSFER="1"
6363

6464
RUN apt-get update \
6565
# TODO: Remove upgrade statements once packages are updated in base image
66-
&& apt-get -y upgrade --only-upgrade systemd openssl cryptsetup libkrb5-3 linux-libc-dev \
66+
&& apt-get -y upgrade --only-upgrade systemd openssl cryptsetup libkrb5-3 linux-libc-dev libsqlite3-0 \
6767
&& apt-get install -y git git-lfs wget tar \
6868
&& wget https://go.dev/dl/go1.24.2.linux-amd64.tar.gz \
6969
&& rm -rf /usr/local/go \

huggingface/pytorch/training/docker/2.6/py3/cu126/Dockerfile.gpu

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ LABEL dlc_major_version="2"
1212

1313
# version args
1414
ARG TRANSFORMERS_VERSION=4.51.3
15-
ARG DATASETS_VERSION=3.5.0
15+
ARG DATASETS_VERSION=2.18.0
1616
ARG HUGGINGFACE_HUB_VERSION=0.30.0
1717
ARG DIFFUSERS_VERSION=0.33.1
1818
ARG EVALUATE_VERSION=0.4.3
@@ -22,6 +22,8 @@ ARG PEFT_VERSION=0.14.0
2222
ARG FLASH_ATTN_VERSION=2.7.4.post1
2323
ARG NINJA_VERSION=1.11.1.3
2424
ARG PYTHON=python3
25+
ARG MULTIPROCESS_VERSION=0.70.16
26+
ARG DILL_VERSION=0.3.8
2527

2628
# TODO: Remove when the base image is updated
2729
RUN pip install --upgrade pip \
@@ -47,8 +49,10 @@ RUN pip install --no-cache-dir \
4749
ninja==${NINJA_VERSION} \
4850
trl==${TRL_VERSION} \
4951
peft==${PEFT_VERSION} \
50-
flash-attn==${FLASH_ATTN_VERSION}
51-
52+
flash-attn==${FLASH_ATTN_VERSION} \
53+
dill==${DILL_VERSION} \
54+
multiprocess==${MULTIPROCESS_VERSION} \
55+
"pathos<0.3.3"
5256

5357
# hf_transfer will be a built-in feature, remove the env variable then
5458
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"78279": "Starlette is a lightweight ASGI (Asynchronous Server Gateway Interface) framework/toolkit, designed for building async web services in Python. In versions 0.47.1 and below, when parsing a multi-part form with large files (greater than the default max spool size) starlette will block the main thread to roll the file over to disk. This blocks the event thread which means the application can't accept new connections. The UploadFile code has a minor bug where instead of just checking for self._in_memory, the logic should also check if the additional bytes will cause a rollover. The vulnerability is fixed in version 0.47.2."
3+
}

pytorch/training/docker/2.7/py3/Dockerfile.sagemaker.cpu.core_packages.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,5 +64,8 @@
6464
},
6565
"opencv-python": {
6666
"version_specifier": "==4.11.0.86"
67+
},
68+
"sagemaker-pytorch-training": {
69+
"version_specifier": ">=2.9.0,<3"
6770
}
6871
}

pytorch/training/docker/2.7/py3/cu128/Dockerfile.sagemaker.gpu.core_packages.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,5 +71,8 @@
7171
},
7272
"awscli": {
7373
"version_specifier": "<2"
74+
},
75+
"sagemaker-pytorch-training": {
76+
"version_specifier": ">=2.9.0,<3"
7477
}
7578
}

tensorflow/training/buildspec-2-18-ec2.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ framework: &FRAMEWORK tensorflow
55
version: &VERSION 2.18.0
66
short_version: &SHORT_VERSION "2.18"
77
arch_type: x86
8-
# autopatch_build: "True"
8+
autopatch_build: "True"
99

1010
repository_info:
1111
training_repository: &TRAINING_REPOSITORY

0 commit comments

Comments
 (0)