Skip to content
Draft
Show file tree
Hide file tree
Changes from 40 commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
2ff3e81
Enable ocltst execution on Rock
agunashe Feb 24, 2026
cae277b
add ocltst to the test matrix
agunashe Feb 24, 2026
eb1a603
install ocl-icd-libopencl1, enable windows exe of ocltst, typo for li…
agunashe Feb 25, 2026
c54f377
install ocl-icd-opencl-dev and fix exe path on windows
agunashe Feb 25, 2026
647db32
fix windows ocltst path
agunashe Feb 26, 2026
34d2e08
use container image no_rocm_image_ubuntu24_04_ocl_rt
agunashe Feb 27, 2026
3baa94f
include lib path and ocltst path to LD_LIBRARY_PATH
agunashe Mar 3, 2026
1adb9cd
loggin to list contents of lib
agunashe Mar 3, 2026
472a365
add opencl/lib, llvm/lib, rocm_sysdeps/lib to LD_LIBRARY_PATH
agunashe Mar 4, 2026
909661c
fix libamdocl64.so path
agunashe Mar 4, 2026
d569966
set env OCL_ICD_VENDORS to local icd file, copy stage/etc/* file as p…
agunashe Mar 6, 2026
2729746
use users/agunashe/ocl_icd_vendors for rocm-systems for testing
agunashe Mar 6, 2026
cefb9a5
fix OCL_ICD_VENDORS path to add / at the end
agunashe Mar 7, 2026
6c418ef
fix precommit failures
agunashe Mar 7, 2026
b03d94f
fix more precommit failures
agunashe Mar 7, 2026
e23f78e
adding strace and lld to ocltst for debugging
agunashe Mar 7, 2026
07d20c7
use ocl_rt container with strace
agunashe Mar 9, 2026
17305e0
use absolute paths for execution instead of relative paths
agunashe Mar 10, 2026
4bf2085
add AMD_LOG_LEVEL to ocltst
agunashe Mar 10, 2026
a196243
add --cap-add=SYS_NICE to docker
agunashe Mar 10, 2026
b19ab12
rebase rocm-systems to pick comgr versioning commit
agunashe Mar 13, 2026
5d8b7e1
Merge remote-tracking branch 'origin/main' into users/agunashe/run_oc…
agunashe Mar 13, 2026
606754e
fix code review comments
agunashe Mar 13, 2026
aacd766
fix pre commit issue
agunashe Mar 13, 2026
3040331
Merge branch 'main' into users/agunashe/run_ocltst_rock
agunashe Mar 13, 2026
73738fb
removing --ocltst artifact arg since its causing failure
agunashe Mar 14, 2026
c9ca10e
Merge remote-tracking branch 'origin/main' into users/agunashe/run_oc…
agunashe Mar 14, 2026
c87a036
Merge branch 'users/agunashe/run_ocltst_rock' of github.com:ROCm/TheR…
agunashe Mar 14, 2026
58033ca
Merge branch 'main' into users/agunashe/run_ocltst_rock
agunashe Mar 16, 2026
bfe2dc1
Merge remote-tracking branch 'origin/main' into users/agunashe/run_oc…
agunashe Mar 18, 2026
119c18a
use develop rocm-systems
agunashe Mar 18, 2026
8a299aa
windows: copy the required dlls to local ocltst path
agunashe Mar 18, 2026
7df37de
Merge remote-tracking branch 'origin/main' into users/agunashe/run_oc…
agunashe Mar 20, 2026
0935826
use develop rocm-systems 18eefed
agunashe Mar 20, 2026
9d48b9c
Merge remote-tracking branch 'origin/main' into users/agunashe/run_oc…
agunashe Mar 23, 2026
e3125af
Merge remote-tracking branch 'origin/main' into users/agunashe/run_oc…
agunashe Mar 25, 2026
fcd1a9f
change timeout to 20 mins and remove -J opt to ocltst
agunashe Mar 25, 2026
e36bedf
increase timeout from 20 to 30 mins
agunashe Mar 25, 2026
4bd3017
fix precommit errors
agunashe Mar 25, 2026
acdafb8
more precommit fixes
agunashe Mar 25, 2026
6c369e4
Merge remote-tracking branch 'origin/main' into users/agunashe/run_oc…
agunashe Mar 27, 2026
d8f56a2
fixing review comments.
agunashe Mar 27, 2026
1f09167
fix pre-commit errors
agunashe Mar 27, 2026
ebcafd8
Merge remote-tracking branch 'origin/main' into users/agunashe/run_oc…
agunashe Mar 29, 2026
2026025
ocltst on windows typos
agunashe Mar 29, 2026
299b076
new line after execute_tests
agunashe Mar 29, 2026
aef601b
clean up more conditional statements
agunashe Mar 29, 2026
3913d0c
Merge remote-tracking branch 'origin/main' into users/agunashe/run_oc…
agunashe Apr 1, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions build_tools/github_actions/fetch_test_configurations.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,20 @@ def _get_script_path(script_name: str) -> str:
"windows": 4,
},
},
# ocltest
"ocltst": {
"job_name": "ocltst",
"fetch_artifact_args": "--tests",
Comment thread
agunashe marked this conversation as resolved.
Outdated
"timeout_minutes": 30,
"test_script": f"python {_get_script_path('test_ocltst.py')}",
"platform": ["linux", "windows"],
Comment thread
agunashe marked this conversation as resolved.
"total_shards_dict": {
"linux": 1,
"windows": 1,
},
"container_image": "ghcr.io/rocm/no_rocm_image_ubuntu24_04_ocl_rt@sha256:f059191fdd5cf3bf8c5a47cb017a7be6e9bded0d52be594feb96f724f6a838eb",
"container_options": "--cap-add=SYS_PTRACE --cap-add=SYS_NICE",
},
# BLAS tests
"rocblas": {
"job_name": "rocblas",
Expand Down
96 changes: 96 additions & 0 deletions build_tools/github_actions/test_executable_scripts/test_ocltst.py
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@geomin12 can you review the test script?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

30 mins is long. any options for parallelization? having a GPU machine run 30 mins for quick/sanity checks is really long and costly

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i have added 16 threads for execution. Lets see if the execution time comes down

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

okay, 16 mins is better, but let's open an ocl test ticket to introduce filtering

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

16 mins is okay now, but let's introduce an internal ticket for ocl test filtering

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#4192 - can create a internal ticket for this as well

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we add that GH issue to the code here, so we don't forget

Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import logging
import os
import shlex
import subprocess
from pathlib import Path
import sys
import platform
import shutil

logging.basicConfig(level=logging.INFO)
THEROCK_BIN_DIR_STR = os.getenv("THEROCK_BIN_DIR")
if THEROCK_BIN_DIR_STR is None:
logging.info(
"++ Error: env(THEROCK_BIN_DIR) is not set. Please set it before executing tests."
)
sys.exit(1)
THEROCK_BIN_DIR = Path(THEROCK_BIN_DIR_STR).resolve()
THEROCK_DIR = Path(THEROCK_BIN_DIR).parent
env = os.environ.copy()
is_windows = platform.system() == "Windows"


# copies the dlls to local ocltst path.
# to overwrite the registry entries
def copy_dlls_exe_path(ocltst_path):
if platform.system() == "Windows":
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: use is_windows

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would prefer to fix it in the clean up. We need this PR merged ASAP for ocl coverage.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a 1 line change, in the code this PR is directly introducing, immediately prior in the code. Fix it please.

@geomin12 @amd-shiraz please don't approve PRs that so obviously are ignoring reviewer feedback.

# hip and comgr dlls need to be copied to the same folder as exectuable
dlls_pattern = ["amdocl*.dll", "amd_comgr*.dll", "OpenCL.dll"]
dlls_to_copy = []
for pattern in dlls_pattern:
dlls_to_copy.extend(THEROCK_BIN_DIR.glob(pattern))
for dll in dlls_to_copy:
try:
shutil.copy(dll, ocltst_path)
logging.info(f"++ Copied: {dll} to {ocltst_path}")
except Exception as e:
logging.info(f"++ Error copying {dll}: {e}")


# returns ocltst path
def setup_env(env):
ROCM_PATH = Path(THEROCK_DIR)
env["ROCM_PATH"] = str(ROCM_PATH)
Comment on lines +42 to +43
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This intermediate ROCM_PATH variable is doing nothing. THEROCK_DIR is already of type Path

if not is_windows:
ROCK_LIB_PATH = Path(THEROCK_DIR) / "lib"
OCL_LIB = Path(ROCK_LIB_PATH) / "opencl"
LLVM_LIB = Path(ROCK_LIB_PATH) / "llvm" / "lib"
ROCM_SYSDEPS_LIB = Path(ROCK_LIB_PATH) / "rocm_sysdeps" / "lib"
OCL_ICD_VENDORS = Path(THEROCK_DIR) / "etc" / "OpenCL" / "vendors"
OCLTST_PATH = Path(THEROCK_DIR) / "share" / "opencl" / "ocltst"
Comment on lines +45 to +50
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These variables are all already of type Path, no need to wrap in Path()

LD_LIBRARY_PATH = os.getenv("LD_LIBRARY_PATH")
if LD_LIBRARY_PATH is not None:
LD_LIBRARY_PATH = Path(LD_LIBRARY_PATH)
env["LD_LIBRARY_PATH"] = (
f"{ROCK_LIB_PATH}:{OCL_LIB}:{LLVM_LIB}:{ROCM_SYSDEPS_LIB}:{LD_LIBRARY_PATH}:{OCLTST_PATH}"
)
env["OCL_ICD_VENDORS"] = f"{OCL_ICD_VENDORS}/"
else:

OCLTST_PATH = Path(THEROCK_DIR) / "tests" / "ocltst"
copy_dlls_exe_path(OCLTST_PATH)
OCL_DLL_FILE = Path(OCLTST_PATH) / "amdocl64.dll"
OCL_ICD_DLL = Path(THEROCK_BIN_DIR) / "OpenCL.dll"
env["OCL_ICD_FILENAMES"] = str(OCL_DLL_FILE)
return OCLTST_PATH
Comment on lines +23 to +68
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe we can (and should) remove nearly all of this setup code from this test script if we fix https://github.com/ROCm/rocm-systems/blob/be8076461de7f44e21a0f273d94f5872c89c1e17/projects/clr/opencl/tests/ocltst/CMakeLists.txt#L7-L11 to install ocltst into a standard install location like bin/ instead of a new location like tests/.

See for example what hipblaslt does:

Or a more recent / simpler case like dnn-providers/hipblaslt-provider: https://github.com/ROCm/rocm-libraries/blob/94453c6cb85db167e96e3f463a12d808734674b5/dnn-providers/hipblaslt-provider/cmake/Tests.cmake#L180-L192

The guiding principles are this:

  • Tests should be runnable from a build directory (e.g. via ctest)
  • Component development should use the same testing workflow as CI systems
  • Scripts for CI like these in build_tools/github_actions/test_executable_scripts/ should be thin wrappers around test runners / executables that possibly add filtering and sharding options applicable to CI. They should NOT be structurally modifying the artifact directory in any way. We granted a narrow exception for that to hip-tests in Enable hip-tests on TheRock #2001 but if this is becoming a pattern we need to fix it and not dig the whole deeper

See more detailed analysis of the issue and recommendations to fixing it here: https://github.com/ScottTodd/claude-rocm-workspace/blob/main/reviews/pr_TheRock_3590.md#1-install-to-bin-to-eliminate-dll-copy-and-ld_library_path-workarounds

I ran local experiments with the artifacts produced by this PR here: https://github.com/ScottTodd/claude-rocm-workspace/blob/main/reviews/pr_TheRock_3590.md#local-experiment-install-to-bin-validation

  • Moving files to bin/, tests passed if I ran without setting OCL_ICD_FILENAMES (so using my system OpenCL ICD version 3652.0)
  • Moving files to bin/, tests failed with OCL_ICD_FILENAMES (using OpenCL ICD version 3628.0...?) I'm not sure why, given that tests passed on CI

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for your comment. I agree we should not have different installation paths for Linux vs windows.
However, changing the installation paths in clr could impact compute build.
So changes will be required in rocm-systems, compute could take a couple of weeks to get this merged again.
this coverage is important piece to validate Opencl and move away from legacy build systems.
Also earlier the linux paths were a requirement from packaging team to install the artifacts in a certain folder path.
We need similar requirements now with Rock which covers both linux and windows
I want to come back and clean up the paths/artifacts through a ticket/issue instead of blocking this PR.
Delaying this further risks test coverage gap in 7.13 too.

Looking at your experiments-
exp 1 - invalid, uses system32 opencl and comgr dlls for loading. Registry key takes precedence. Which is incorrect, need the dlls from the rock build to be loaded
exp 2 - partially valid, setting OCL_ICD_FILENAMES=$(pwd)/amdocl64.dll picks one dll from local folder but does not find all the dlls. So it fails
exp 3 - invalid again. same as exp1. use dlls from system32 instead of local build

To pick the right dlls to be loaded at runtime . Use Listdlls app to see the dlls loaded at runtime.
opt1 - We need to update registry key to point the amdocl64
or
opt2 - copy the required dlls to local folder of exe.

opt1 did not work, because we do not have any facility in Rock to update the registry key. Updating registry key requires admin permissions so could not set it through the test_ocltst.py.
choosing opt2 - With the interest of time, chose 2. this way local folder always gets precedence over system32 dlls

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

However, changing the installation paths in clr could impact compute build.

Impacts on internal builds are not generally a blocking concern for open source code, builds, or releases. The costs for living downstream are paid by downstream projects, not upstream.

I see the points about timelines, but so far I'm seeing more tech debt being accumulated than paid down (I tried to run hip-tests the other day and was completely unable to do so, facing similar issues as on here... these "scripts that only work on CI" need to be fixed)

Also earlier the linux paths were a requirement from packaging team to install the artifacts in a certain folder path.
We need similar requirements now with Rock which covers both linux and window

As far as I can tell, only rocgdb (also added recently) writes to a top level tests/ folder. All other ROCm subprojects use other paths. So this is already in "needs a requirement from packaging" territory. Using the already established layout seems like it would be the path of least resistance here to me, especially given that the tests/ path does not work (we're not going to ask users who download our packages to copy DLLs to run tests... they should be able to run the tests as easily as developers working on the projects and CI/CD systems can).

exp 2 - partially valid, setting OCL_ICD_FILENAMES=$(pwd)/amdocl64.dll picks one dll from local folder but does not find all the dlls. So it fails

Which am I missing? I have everything that this PR tries to copy and I don't see any missing DLLs when I run the various exes and dlls through https://github.com/lucasg/dependencies

λ ls D:\scratch\claude\artifacts\3590-win\bin
AMD.ROCM.Comgr.MANIFEST  clinfo.exe*   hipcc.exe*      oclperf.dll*     oclruntime.exclude
amd_comgr0702.dll*       cltrace.dll*  hipconfig.bat   oclperf.exclude  ocltst.exe*
amdocl64.dll*

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually looking at your exp setup
exp 1 - since you copy the test items to bin. runtime dlls picked are valid but system32 Opencl.dll is loaded
exp 2 - assuming items are copied from bin to test . setting OCL_ICD_FILENAMES forces to pick local OpenCL.dll. not sure what is causing this failure. Cant repro it locally.
exp 3 - running from tests folder without copying dlls to local folder picks the system32 dlls. So this is invalid.

Copy link
Copy Markdown
Contributor Author

@agunashe agunashe Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Impacts on internal builds are not generally a blocking concern for open source code, builds, or releases. The costs for living downstream are paid by downstream projects, not upstream.

I agree with this if its a simple bug fix, upstream gets priority. But currently we are still holding the legacy/downstream systems because of the test coverage gap in Rock. Once we have enabled the tests on Rock, we can look at fixing the issues, atleast then we have a gate check that verifies our change. Right now the opencl changes are not tested in Rock, this causes issues to be caught later in the QA cycle.

I see the points about timelines, but so far I'm seeing more tech debt being accumulated than paid down (I tried to run hip-tests the other day and was completely unable to do so, facing similar issues as on here... these "scripts that only work on CI" need to be fixed)

Please file an issue assign to our team, we will work on it soon. It should work outside CI as well.

As far as I can tell, only rocgdb (also added recently) writes to a top level tests/ folder. All other ROCm subprojects use other paths. So this is already in "needs a requirement from packaging" territory. Using the already established layout seems like it would be the path of least resistance here to me, especially given that the tests/ path does not work (we're not going to ask users who download our packages to copy DLLs to run tests... they should be able to run the tests as easily as developers working on the projects and CI/CD systems can).

ocltst is a test binary. I am not sure if its right thing to copy this to bin folder. sorry need a proper requirement and design.

also exp2 should work, I am not able to repro it locally.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can provide another LIMITED, TEMPORARY exception for these sorts of DLL copying workarounds in tests as part of adding ocltst coverage to TheRock but this needs to be fixed urgently as it impacts core developer workflows and shipping package structures.

We're still recovering from issues where framework (JAX) automated tests, QA processes, and packages were completely misaligned to the point where we were silently falling back to testing on CPU on CI, QA had their own undocumented test procedures, and the end result were user-facing packages that were fundamentally broken.

This has all the same smells to it. We ship these test files as part of releases. Users/developers/etc. can and will try to run the test files in those release archives. They will not work as-is. Users/developers/etc. will not think to find a script like this or copy DLLs / set environment variables themselves.

Tracking follow-ups in #4192.



def execute_tests(env):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do these tests have a smoke test filter? something like this: https://github.com/ROCm/TheRock/blob/main/docs/development/test_filtering.md

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as of now, there is no smoke tests for this test.
this is already a shortened test. Maybe we can look at adding smoke tests in the future, needs more work identifying and trimming the tests

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes let's open tix for filtering, as we probably want to do expanded testing later

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#4192 - added to this issue

OCLTST_PATH = setup_env(env)
if not is_windows:
Comment thread
agunashe marked this conversation as resolved.
Outdated
cmd = [
"./ocltst",
"-m",
Comment thread
agunashe marked this conversation as resolved.
Outdated
"liboclruntime.so",
"-A",
Comment thread
agunashe marked this conversation as resolved.
Outdated
"oclruntime.exclude",
]
shell_var = False
logging.info(f"++ Setting LD_LIBRARY_PATH={env['LD_LIBRARY_PATH']}")
logging.info(f"++ Setting OCL_ICD_VENDORS={env['OCL_ICD_VENDORS']}")
else:
cmd = [
"ocltst.exe",
"-m",
Comment thread
agunashe marked this conversation as resolved.
Outdated
"oclruntime.dll",
"-A",
"oclruntime.exclude",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

any options for parallelization? can we add a comment on what ./ocltst does? it seems to be a custom script so it's unclear to us what is under the hood. is it ctest/gtest/pytest?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

its a custom test with c/c++ calls no test framework used. Added 16 threads , think they run parallely. Need to check

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seems to have reduced the execution time to around 17 mins.

]
shell_var = True
logging.info(f"++ Setting OCL_ICD_FILENAMES={env['OCL_ICD_FILENAMES']}")
logging.info(f"++ Exec [{OCLTST_PATH}]$ {shlex.join(cmd)}")
subprocess.run(cmd, cwd=OCLTST_PATH, check=True, env=env, shell=shell_var)


if __name__ == "__main__":
execute_tests(env)
8 changes: 5 additions & 3 deletions core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,7 @@ if(THEROCK_ENABLE_OCL_RUNTIME)

set(OCL_CLR_CMAKE_ARGS)
set(OCL_CLR_RUNTIME_DEPS)

if(WIN32)
# Windows CLR options
set(_compute_pal_dir "${THEROCK_ROCM_SYSTEMS_SOURCE_DIR}/shared/amdgpu-windows-interop")
Expand All @@ -394,10 +395,11 @@ if(THEROCK_ENABLE_OCL_RUNTIME)
rocprofiler-register
ROCR-Runtime
)
list(APPEND OCL_CLR_CMAKE_ARGS
"-DBUILD_TESTS=${THEROCK_BUILD_TESTING}"
)
endif()
# ocltst build flag
list(APPEND OCL_CLR_CMAKE_ARGS
"-DBUILD_TESTS=${THEROCK_BUILD_TESTING}"
)

therock_cmake_subproject_declare(ocl-clr
USE_DIST_AMDGPU_TARGETS
Expand Down
Loading