From 190156bab7886fcf9fd3670927038295afc49c5a Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Fri, 16 May 2025 13:20:17 +0200 Subject: [PATCH 1/2] rename --cuda-sanity-check-error-on-fail to --cuda-sanity-check-error-on-failed-checks + improve help text for --cuda-sanity-check-* configuration options --- easybuild/framework/easyblock.py | 2 +- easybuild/tools/config.py | 2 +- easybuild/tools/options.py | 29 +++++++++++++++-------------- easybuild/tools/systemtools.py | 2 ++ 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 72fe18d3d8..95c3bfb402 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3360,7 +3360,7 @@ def sanity_check_cuda(self, cuda_dirs=None): fail_msgs = [] cfg_ccs = build_option('cuda_compute_capabilities') or self.cfg.get('cuda_compute_capabilities', None) - ignore_failures = not build_option('cuda_sanity_check_error_on_fail') + ignore_failures = not build_option('cuda_sanity_check_error_on_failed_checks') strict_cc_check = build_option('cuda_sanity_check_strict') accept_ptx_as_devcode = build_option('cuda_sanity_check_accept_ptx_as_devcode') accept_missing_ptx = build_option('cuda_sanity_check_accept_missing_ptx') diff --git a/easybuild/tools/config.py b/easybuild/tools/config.py index b9ddaff7ba..609295b94f 100644 --- a/easybuild/tools/config.py +++ b/easybuild/tools/config.py @@ -299,7 +299,7 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX): 'container_build_image', 'cuda_sanity_check_accept_ptx_as_devcode', 'cuda_sanity_check_accept_missing_ptx', - 'cuda_sanity_check_error_on_fail', + 'cuda_sanity_check_error_on_failed_checks', 'cuda_sanity_check_strict', 'debug', 'debug_lmod', diff --git a/easybuild/tools/options.py b/easybuild/tools/options.py index 6be7ed3029..62443a4303 100644 --- a/easybuild/tools/options.py +++ b/easybuild/tools/options.py @@ -406,30 +406,31 @@ def override_options(self): 
"--cuda-sanity-check-accept-ptx-as-devcode, " "or made more stringent using --cuda-sanity-check-strict.", 'strlist', 'extend', None), - 'cuda-sanity-check-accept-missing-ptx': ("CUDA sanity check also passes if PTX code for the highest " + 'cuda-sanity-check-accept-missing-ptx': ("Relax CUDA sanity check to accept that PTX code for the highest " "requested CUDA compute capability is not present (but will " "print a warning)", None, 'store_true', False), - 'cuda-sanity-check-accept-ptx-as-devcode': ("CUDA sanity check also passes if requested device code is " - "not present, as long as PTX code is present that can be " - "JIT-compiled for each target in --cuda-compute-capabilities " - "E.g. if --cuda-compute-capabilities=8.0 and a binary is " - "found in the installation that does not have device code for " - "8.0, but it does have PTX code for 7.0, the sanity check " - "will pass if, and only if, this option is True. " + 'cuda-sanity-check-accept-ptx-as-devcode': ("Relax CUDA sanity check to accept that requested device code " + "is not present, as long as PTX code is present that can be " + "JIT-compiled for each target in --cuda-compute-capabilities. " + "For example, if --cuda-compute-capabilities=8.0 and a binary " + "is found in the installation that does not have device code " + "for 8.0, but it does have PTX code for 7.0, the sanity check " + "will pass if, and only if, this option is enabled. " "Note that JIT-compiling means the binary will work on the " "requested architecture, but is it not necessarily as well " "optimized as when actual device code is present for the " "requested architecture ", None, 'store_true', False), - 'cuda-sanity-check-error-on-fail': ("If True, failures in the CUDA sanity check will produce an error. 
" - "If False, the CUDA sanity check will be performed, and failures will " - "be reported, but they will not result in an error", - None, 'store_true', False), + 'cuda-sanity-check-error-on-failed-checks': ("If enabled, failures in the CUDA sanity check will produce " + "an error. If disabled, the CUDA sanity check will be " + "performed and failures will be reported through warnings, " + "but they will not result in an error", + None, 'store_true', False), 'cuda-sanity-check-strict': ("Perform strict CUDA sanity check. Without this option, the CUDA sanity " "check will fail if the CUDA binaries don't contain code for (at least) " - "all compute capabilities defined in --cude-compute-capabilities, but will " - "accept if code for additional compute capabilities is present. " + "all compute capabilities defined in --cuda-compute-capabilities, " + "but will accept if code for additional compute capabilities is present. " "With this setting, the sanity check will also fail if code is present for " "more compute capabilities than defined in --cuda-compute-capabilities.", None, 'store_true', False), diff --git a/easybuild/tools/systemtools.py b/easybuild/tools/systemtools.py index f471b7dd86..634e8fdb34 100644 --- a/easybuild/tools/systemtools.py +++ b/easybuild/tools/systemtools.py @@ -27,10 +27,12 @@ Authors: +* Kenneth Hoste (Ghent University) * Jens Timmerman (Ghent University) * Ward Poelmans (Ghent University) * Jasper Grimm (UoY) * Jan Andre Reuter (Forschungszentrum Juelich GmbH) +* Caspar van Leeuwen (SURF) """ import csv import ctypes From 22858ec74b416db024685f20cf9891eee051b71c Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Fri, 16 May 2025 14:58:33 +0200 Subject: [PATCH 2/2] also rename to --cuda-sanity-check-error-on-failed-checks in comments, trace/log messages, and tests --- easybuild/framework/easyblock.py | 24 ++++++++++++------------ test/framework/toy_build.py | 30 +++++++++++++++--------------- 2 files changed, 27 insertions(+), 27 deletions(-) 
diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 95c3bfb402..e9cabc9724 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3446,13 +3446,13 @@ def format_file_list(files_list): missing_devcodes = list(set(cfg_ccs) - set(found_dev_code_ccs)) # There are two reasons for ignoring failures: - # - We are running with --disable-cuda-sanity-check-error-on-fail + # - We are running with --disable-cuda-sanity-check-error-on-failed-checks # - The specific {path} is on the cuda_sanity_ignore_files in the easyconfig # In case we run with both, we'll just report that we're running with - # --disable-cuda-sanity-check-error-on-fail + # --disable-cuda-sanity-check-error-on-failed-checks if ignore_failures: ignore_msg = f"Failure for {path} will be ignored since we are not running with " - ignore_msg += "--cuda-sanity-check-error-on-fail" + ignore_msg += "--cuda-sanity-check-error-on-failed-checks" else: ignore_msg = f"This failure will be ignored as '{path}' is listed in " ignore_msg += "'cuda_sanity_ignore_files'." 
@@ -3473,8 +3473,8 @@ def format_file_list(files_list): if strict_cc_check: # cuda-sanity-check-strict, so no additional compute capabilities allowed if path in ignore_file_list or ignore_failures: - # No error, because either path is on the cuda_sanity_ignore_files list in the - # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-fail + # No error, either path is in cuda_sanity_ignore_files list in easyconfig, + # or we are running with --disable-cuda-sanity-check-error-on-failed-checks files_additional_devcode_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg else: @@ -3513,7 +3513,7 @@ def format_file_list(files_list): if path in ignore_file_list or ignore_failures: # No error, because either path is on the cuda_sanity_ignore_files list in # the easyconfig, or we are running with - # --disable-cuda-sanity-check-error-on-fail + # --disable-cuda-sanity-check-error-on-failed-checks files_missing_devcode_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg else: @@ -3524,8 +3524,8 @@ def format_file_list(files_list): # This is considered a failure files_missing_devcode.append(os.path.relpath(path, self.installdir)) if path in ignore_file_list or ignore_failures: - # No error, because either path is on the cuda_sanity_ignore_files list in the - # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-fail + # No error, either path is in cuda_sanity_ignore_files list in easyconfig, + # or we are running with --disable-cuda-sanity-check-error-on-failed-checks files_missing_devcode_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg else: @@ -3547,7 +3547,7 @@ def format_file_list(files_list): fail_msg += "(PTX architectures supported in that file: %s). 
" if path in ignore_file_list or ignore_failures: # No error, because either path is on the cuda_sanity_ignore_files list in the - # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-fail + # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-failed-checks files_missing_ptx_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg self.log.warning(fail_msg, highest_cc[0], path, found_ptx_ccs) @@ -3578,7 +3578,7 @@ def trace_and_log(msg): elif ignore_failures: msg = f"Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_devcode)}" trace_and_log(msg) - trace_and_log("(not running with --cuda-sanity-check-error-on-fail, so not considered failures)") + trace_and_log("(not running with --cuda-sanity-check-error-on-failed-checks, so not considered failures)") else: msg = f"Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_devcode)}" msg += f" (ignored: {len(files_missing_devcode_ignored)}, " @@ -3595,7 +3595,7 @@ def trace_and_log(msg): msg = "Number of files with device code for more CUDA Compute Capabilities than requested: " msg += f"{len(files_additional_devcode)}" trace_and_log(msg) - trace_and_log("(not running with --cuda-sanity-check-error-on-fail, so not considered failures)") + trace_and_log("(not running with --cuda-sanity-check-error-on-failed-checks, so not considered failures)") elif strict_cc_check: msg = "Number of files with device code for more CUDA Compute Capabilities than requested: " msg += f"{len(files_additional_devcode)} (ignored: {len(files_additional_devcode_ignored)}, " @@ -3612,7 +3612,7 @@ def trace_and_log(msg): msg = "Number of files missing PTX code for the highest configured CUDA Compute Capability: " msg += f"{len(files_missing_ptx)}" trace_and_log(msg) - trace_and_log("(not running with --cuda-sanity-check-error-on-fail, so not considered failures)") + trace_and_log("(not running with 
--cuda-sanity-check-error-on-failed-checks, so not considered failures)") elif accept_missing_ptx: msg = "Number of files missing PTX code for the highest configured CUDA Compute Capability: " msg += f"{len(files_missing_ptx)}" diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index 484c1a2ff2..bfa8b4de4a 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3270,7 +3270,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, # Test case 1a: test with default options, --cuda-compute-capabilities=8.0 and a binary that contains # 8.0 device code - # This should succeed (since the default for --cuda-sanity-check-error-on-fail is False) + # This should succeed (since the default for --cuda-sanity-check-error-on-failed-checks is False) # as to not break backwards compatibility write_file(cuobjdump_file, cuobjdump_txt_shebang), write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) @@ -3287,7 +3287,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, # Note that the difference with 1a is the presense of additional device code, PTX code foor the right # architecture, but missing device code for the requested architecture # It should not matter for the result, but triggers slightly different code paths in easyblock.py - # This should succeed (since the default for --cuda-sanity-check-error-on-fail is False) + # This should succeed (since the default for --cuda-sanity-check-error-on-failed-checks is False) # as to not break backwards compatibility write_file(cuobjdump_file, cuobjdump_txt_shebang), write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) @@ -3305,9 +3305,9 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) assert_cuda_report(missing_cc=1, additional_cc=1, missing_ptx=0, log=outtxt, stdout=stdout) - # Test case 2: same as Test case 1, but add 
--cuda-sanity-check-error-on-fail + # Test case 2: same as Test case 1, but add --cuda-sanity-check-error-on-failed-checks # This is expected to fail since there is missing device code for CC80 - args = ['--cuda-compute-capabilities=8.0', '--cuda-sanity-check-error-on-fail'] + args = ['--cuda-compute-capabilities=8.0', '--cuda-sanity-check-error-on-failed-checks'] # We expect this to fail, so first check error, then run again to check output error_pattern = r"Files missing CUDA device code: 1." with self.mocked_stdout_stderr(): @@ -3325,7 +3325,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, # This is expected to succeed, since now the PTX code for CC80 will be accepted as # device code. Note that also PTX code for the highest requested compute architecture (also CC80) # is present, so also this part of the sanity check passes - args = ['--cuda-compute-capabilities=8.0', '--cuda-sanity-check-error-on-fail', + args = ['--cuda-compute-capabilities=8.0', '--cuda-sanity-check-error-on-failed-checks', '--cuda-sanity-check-accept-ptx-as-devcode'] # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): @@ -3340,7 +3340,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, # Test case 4: same as Test case 2, but run with --cuda-compute-capabilities=9.0 # This is expected to fail: device code is present, but PTX code for the highest CC (9.0) is missing - args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail'] + args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-failed-checks'] # We expect this to fail, so first check error, then run again to check output error_pattern = r"Files missing CUDA PTX code: 1" with self.mocked_stdout_stderr(): @@ -3354,7 +3354,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, # Test case 5: same as Test case 4, but add --cuda-sanity-check-accept-missing-ptx # This is 
expected to succeed: device code is present, PTX code is missing, but that's accepted - args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail', + args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-failed-checks', '--cuda-sanity-check-accept-missing-ptx'] # We expect this to pass, so no need to check errors warning_pattern = r"Configured highest compute capability was '9\.0', " @@ -3373,7 +3373,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, # Test case 6: same as Test case 5, but add --cuda-sanity-check-strict # This is expected to fail: device code is present, PTX code is missing (but accepted due to option) # but additional device code is present, which is not allowed by --cuda-sanity-check-strict - args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail', + args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-failed-checks', '--cuda-sanity-check-accept-missing-ptx', '--cuda-sanity-check-strict'] # We expect this to fail, so first check error, then run again to check output error_pattern = r"Files with additional CUDA device code: 1" @@ -3392,7 +3392,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, toy_whitelist_ec = os.path.join(self.test_prefix, 'toy-0.0-cuda-whitelist.eb') write_file(toy_whitelist_ec, read_file(toy_ec) + '\ncuda_sanity_ignore_files = ["bin/toy"]') - args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail', + args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-failed-checks', '--cuda-sanity-check-accept-missing-ptx', '--cuda-sanity-check-strict'] # We expect this to succeed, so check output for expected patterns with self.mocked_stdout_stderr(): @@ -3402,7 +3402,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, self.assertTrue(device_additional_70_code_regex.search(outtxt), msg) 
assert_cuda_report(missing_cc=0, additional_cc=1, missing_ptx=1, log=outtxt, stdout=stdout) - # Test case 8: try with --cuda-sanity-check-error-on-fail --cuda-compute-capabilities=9.0,9.0a + # Test case 8: try with --cuda-sanity-check-error-on-failed-checks --cuda-compute-capabilities=9.0,9.0a # and --cuda-sanity-check-strict # on a binary that contains 9.0 and 9.0a device code, and 9.0a ptx code. This tests the correct # ordering (i.e. 9.0a > 9.0). It should pass, since device code is present for both CCs and PTX @@ -3413,7 +3413,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, write_file(cuobjdump_file, cuobjdump_txt_sm90a, append=True) write_file(cuobjdump_file, cuobjdump_txt_sm90a_ptx, append=True) adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable - args = ['--cuda-compute-capabilities=9.0,9.0a', '--cuda-sanity-check-error-on-fail', + args = ['--cuda-compute-capabilities=9.0,9.0a', '--cuda-sanity-check-error-on-failed-checks', '--cuda-sanity-check-strict'] # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): @@ -3431,7 +3431,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, # Test case 9: same as 8, but no --cuda-compute-capabilities are defined # We expect this to lead to a skip of the CUDA sanity check, and a success for the overall sanity check - args = ['--cuda-sanity-check-error-on-fail', '--cuda-sanity-check-strict'] + args = ['--cuda-sanity-check-error-on-failed-checks', '--cuda-sanity-check-strict'] # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) @@ -3446,7 +3446,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, self.assertTrue(expected_result.search(outtxt), msg) # Test case 10: running with default options and a binary that does not contain 
ANY CUDA device code - # This is expected to succeed, since the default is --disable-cuda-sanity-check-error-on-fail + # This is expected to succeed, since the default is --disable-cuda-sanity-check-error-on-failed-checks write_file(cuobjdump_file, cuobjdump_txt_shebang) write_file(cuobjdump_file, cuobjdump_txt_no_cuda, append=True) adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable @@ -3466,9 +3466,9 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, self.assertTrue(expected_result.search(outtxt), msg) assert_cuda_report(missing_cc=0, additional_cc=0, missing_ptx=0, log=outtxt, stdout=stdout, num_checked=0) - # Test case 11: same as Test case 10, but add --cuda-sanity-check-error-on-fail + # Test case 11: same as Test case 10, but add --cuda-sanity-check-error-on-failed-checks # This should pass: if it's not a CUDA binary, it shouldn't fail the CUDA sanity check - args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail'] + args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-failed-checks'] # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True)