Skip to content

Commit 338e095

Browse files
authored
fix: fix v2 integ tests (#5822)
* fix: fix v2 integ tests
* fix: regression issue caused by #5803
* fix: InsufficientInstanceCapacity for ml.g5.12xlarge, try ml.g5.48xlarge
* fix: try ml.g5.24xlarge
* fix: test_jumpstart_estimator ml.p3.2xlarge deprecated use ml.g4dn.xlarge instead
* fix: handle hub teardown error in jumpstart estimator integ test
* fix: traceback catch
1 parent 0058173 commit 338e095

16 files changed

Lines changed: 68 additions & 80 deletions

File tree

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ scratch*.py
1818
examples/tensorflow/distributed_mnist/data
1919
*.iml
2020
doc/_build
21+
docs/_build
22+
docs/api/generated
2123
doc/_static
2224
doc/_templates
2325
**/.DS_Store

src/sagemaker/modules/local_core/local_container.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -204,12 +204,18 @@ def train(
204204
# Print our Job Complete line
205205
logger.info("Local training job completed, output artifacts saved to %s", artifacts)
206206

207-
shutil.rmtree(os.path.join(self.container_root, "input"))
208-
shutil.rmtree(os.path.join(self.container_root, "shared"))
207+
for dir_name in ["input", "shared"]:
208+
dir_path = os.path.join(self.container_root, dir_name)
209+
if os.path.exists(dir_path):
210+
shutil.rmtree(dir_path, ignore_errors=True)
209211
for host in self.hosts:
210-
shutil.rmtree(os.path.join(self.container_root, host))
212+
host_path = os.path.join(self.container_root, host)
213+
if os.path.exists(host_path):
214+
shutil.rmtree(host_path, ignore_errors=True)
211215
for folder in self._temporary_folders:
212-
shutil.rmtree(os.path.join(self.container_root, folder))
216+
folder_path = os.path.join(self.container_root, folder)
217+
if os.path.exists(folder_path):
218+
shutil.rmtree(folder_path, ignore_errors=True)
213219
return artifacts
214220

215221
def retrieve_artifacts(

src/sagemaker/utils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,11 @@ def download_folder(bucket_name, prefix, target, sagemaker_session):
419419

420420
# Spot check: enforce ownership only when downloading from the session's default
421421
# bucket. Cross-account buckets are left untouched.
422-
expected_owner = sagemaker_session._get_account_id_if_default_bucket(bucket_name)
422+
expected_owner = (
423+
sagemaker_session._get_account_id_if_default_bucket(bucket_name)
424+
if hasattr(sagemaker_session, "_get_account_id_if_default_bucket")
425+
else None
426+
)
423427
extra_args = None
424428
if expected_owner:
425429
extra_args = {"ExpectedBucketOwner": expected_owner}

tests/conftest.py

Lines changed: 4 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -641,42 +641,17 @@ def cpu_instance_type(sagemaker_session, request):
641641

642642
@pytest.fixture(scope="session")
643643
def gpu_instance_type(sagemaker_session, request):
644-
region = sagemaker_session.boto_session.region_name
645-
if region in NO_P3_REGIONS:
646-
return "ml.p2.xlarge"
647-
else:
648-
return "ml.p3.2xlarge"
644+
return "ml.g4dn.xlarge"
649645

650646

651647
@pytest.fixture()
652648
def gpu_pytorch_instance_type(sagemaker_session, request):
653-
fw_version = None
654-
for pytorch_version_fixture in [
655-
"pytorch_inference_version",
656-
"huggingface_training_pytorch_latest_version",
657-
"huggingface_inference_pytorch_latest_version",
658-
]:
659-
if pytorch_version_fixture in request.fixturenames:
660-
fw_version = request.getfixturevalue(pytorch_version_fixture)
661-
if fw_version is None:
662-
fw_version = request.param
663-
region = sagemaker_session.boto_session.region_name
664-
if region in NO_P3_REGIONS:
665-
if Version(fw_version) >= Version("1.13"):
666-
return PYTORCH_RENEWED_GPU
667-
else:
668-
return "ml.p2.xlarge"
669-
else:
670-
return "ml.p3.2xlarge"
649+
return "ml.g4dn.xlarge"
671650

672651

673652
@pytest.fixture(scope="session")
674653
def gpu_instance_type_list(sagemaker_session, request):
675-
region = sagemaker_session.boto_session.region_name
676-
if region in NO_P3_REGIONS:
677-
return ["ml.p2.xlarge"]
678-
else:
679-
return ["ml.p3.2xlarge", "ml.p2.xlarge"]
654+
return ["ml.g4dn.xlarge"]
680655

681656

682657
@pytest.fixture(scope="session")
@@ -717,16 +692,7 @@ def pytest_generate_tests(metafunc):
717692
cpu_instance_type = "ml.m5.xlarge" if region in NO_M4_REGIONS else "ml.m4.xlarge"
718693

719694
params = [cpu_instance_type]
720-
if not (
721-
region in tests.integ.HOSTING_NO_P3_REGIONS
722-
or region in tests.integ.TRAINING_NO_P3_REGIONS
723-
):
724-
params.append("ml.p3.2xlarge")
725-
elif not (
726-
region in tests.integ.HOSTING_NO_P2_REGIONS
727-
or region in tests.integ.TRAINING_NO_P2_REGIONS
728-
):
729-
params.append("ml.p2.xlarge")
695+
params.append("ml.g4dn.xlarge")
730696

731697
metafunc.parametrize("instance_type", params, scope="session")
732698

Binary file not shown.

tests/integ/sagemaker/jumpstart/conftest.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -186,13 +186,19 @@ def _cleanup_old_hubs(sagemaker_session):
186186

187187
def _delete_hubs(sagemaker_session, hub_name):
188188
# list and delete all hub contents first
189-
list_hub_content_response = sagemaker_session.list_hub_contents(
190-
hub_name=hub_name, hub_content_type=HubContentType.MODEL_REFERENCE.value
191-
)
192-
for model in list_hub_content_response["HubContentSummaries"]:
193-
_delete_hub_contents(sagemaker_session, hub_name, model)
189+
try:
190+
list_hub_content_response = sagemaker_session.list_hub_contents(
191+
hub_name=hub_name, hub_content_type=HubContentType.MODEL_REFERENCE.value
192+
)
193+
for model in list_hub_content_response["HubContentSummaries"]:
194+
_delete_hub_contents(sagemaker_session, hub_name, model)
194195

195-
sagemaker_session.delete_hub(hub_name)
196+
sagemaker_session.delete_hub(hub_name)
197+
except Exception as e:
198+
if "ResourceNotFound" in str(e):
199+
print(f"Hub {hub_name} does not exist, skipping deletion.")
200+
else:
201+
raise
196202

197203

198204
@with_exponential_backoff()

tests/integ/sagemaker/jumpstart/estimator/test_jumpstart_estimator.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,10 @@ def test_jumpstart_estimator(setup):
6060
sagemaker_session=get_sm_session(),
6161
tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
6262
max_run=259200, # avoid exceeding resource limits
63+
instance_type="ml.g4dn.xlarge",
6364
)
6465

65-
# uses ml.p3.2xlarge instance
66+
# uses ml.g4dn.xlarge instance
6667
estimator.fit(
6768
{
6869
"training": f"s3://{get_jumpstart_content_bucket(JUMPSTART_DEFAULT_REGION_NAME)}/"
@@ -78,11 +79,12 @@ def test_jumpstart_estimator(setup):
7879
sagemaker_session=get_sm_session(),
7980
)
8081

81-
# uses ml.p3.2xlarge instance
82+
# uses ml.g4dn.xlarge instance
8283
predictor = estimator.deploy(
8384
tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
8485
role=get_sm_session().get_caller_identity_arn(),
8586
sagemaker_session=get_sm_session(),
87+
instance_type="ml.g4dn.xlarge",
8688
)
8789

8890
response = predictor.predict(["hello", "world"])

tests/integ/sagemaker/jumpstart/model/test_jumpstart_model.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,10 @@ def test_prepacked_jumpstart_model(setup):
9595
sagemaker_session=get_sm_session(),
9696
)
9797

98-
# uses ml.p3.2xlarge instance
98+
# uses ml.g4dn.xlarge instance
9999
predictor = model.deploy(
100100
tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
101+
instance_type="ml.g4dn.xlarge",
101102
)
102103

103104
response = predictor.predict("hello world!")
@@ -120,7 +121,7 @@ def test_model_package_arn_jumpstart_model(setup):
120121
sagemaker_session=get_sm_session(),
121122
)
122123

123-
# uses ml.g5.2xlarge instance
124+
# uses ml.g4dn.2xlarge instance
124125
predictor = model.deploy(
125126
tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
126127
)
@@ -175,7 +176,7 @@ def test_jumpstart_gated_model(setup):
175176
sagemaker_session=get_sm_session(),
176177
)
177178

178-
# uses ml.g5.2xlarge instance
179+
# uses ml.g4dn.2xlarge instance
179180
predictor = model.deploy(
180181
tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
181182
accept_eula=True,
@@ -206,6 +207,7 @@ def test_jumpstart_gated_model_inference_component_enabled(setup):
206207
tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
207208
accept_eula=True,
208209
endpoint_type=EndpointType.INFERENCE_COMPONENT_BASED,
210+
instance_type="ml.g5.24xlarge",
209211
)
210212

211213
predictor = retrieve_default(
@@ -262,7 +264,7 @@ def test_jumpstart_model_register(setup):
262264

263265
# uses instance
264266
predictor = model_package.deploy(
265-
instance_type="ml.p3.2xlarge",
267+
instance_type="ml.g4dn.xlarge",
266268
initial_instance_count=1,
267269
)
268270

@@ -379,7 +381,7 @@ def test_jumpstart_model_with_deployment_configs(setup):
379381

380382
model.set_deployment_config(
381383
configs[0]["ConfigName"],
382-
"ml.g5.2xlarge",
384+
"ml.g4dn.2xlarge",
383385
)
384386
assert model.config_name == configs[0]["ConfigName"]
385387

tests/integ/sagemaker/jumpstart/private_hub/estimator/test_jumpstart_private_hub_estimator.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ def test_jumpstart_hub_estimator(setup, add_model_references):
6969
model_id=model_id,
7070
hub_name=os.environ[ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME],
7171
tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
72+
instance_type="ml.g4dn.xlarge",
7273
)
7374

7475
estimator.fit(
@@ -85,9 +86,10 @@ def test_jumpstart_hub_estimator(setup, add_model_references):
8586
model_version=model_version,
8687
)
8788

88-
# uses ml.p3.2xlarge instance
89+
# uses ml.g4dn.xlarge instance
8990
predictor = estimator.deploy(
9091
tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
92+
instance_type="ml.g4dn.xlarge",
9193
)
9294

9395
response = predictor.predict(["hello", "world"])
@@ -107,6 +109,7 @@ def test_jumpstart_hub_estimator_with_session(setup, add_model_references):
107109
sagemaker_session=sagemaker_session,
108110
tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
109111
hub_name=os.environ[ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME],
112+
instance_type="ml.g4dn.xlarge",
110113
)
111114

112115
estimator.fit(
@@ -124,11 +127,12 @@ def test_jumpstart_hub_estimator_with_session(setup, add_model_references):
124127
sagemaker_session=get_sm_session(),
125128
)
126129

127-
# uses ml.p3.2xlarge instance
130+
# uses ml.g4dn.xlarge instance
128131
predictor = estimator.deploy(
129132
tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
130133
role=get_sm_session().get_caller_identity_arn(),
131134
sagemaker_session=get_sm_session(),
135+
instance_type="ml.g4dn.xlarge",
132136
)
133137

134138
response = predictor.predict(["hello", "world"])
@@ -144,6 +148,7 @@ def test_jumpstart_hub_gated_estimator_with_eula(setup, add_model_references):
144148
model_id=model_id,
145149
hub_name=os.environ[ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME],
146150
tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
151+
instance_type="ml.g5.2xlarge",
147152
)
148153

149154
estimator.fit(
@@ -158,6 +163,7 @@ def test_jumpstart_hub_gated_estimator_with_eula(setup, add_model_references):
158163
tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
159164
role=get_sm_session().get_caller_identity_arn(),
160165
sagemaker_session=get_sm_session(),
166+
instance_type="ml.g5.2xlarge",
161167
)
162168

163169
payload = {
@@ -178,6 +184,7 @@ def test_jumpstart_hub_gated_estimator_without_eula(setup, add_model_references)
178184
model_id=model_id,
179185
hub_name=os.environ[ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME],
180186
tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
187+
instance_type="ml.g5.2xlarge",
181188
)
182189
with pytest.raises(Exception):
183190
estimator.fit(

0 commit comments

Comments
 (0)