diff --git a/gpu/README.md b/gpu/README.md
index f7df29c6..f03896ff 100644
--- a/gpu/README.md
+++ b/gpu/README.md
@@ -10,13 +10,13 @@ versions.
 There are three types of hermetic toolkits configurations:
 
 1) Recommended: [Repository rules use redistributions loaded from NVIDIA repositories](#supported-hermetic-cuda-cudnn-nvshmem-versions).
-   
+
    For full CUDA toolkit hermeticity, use CUDA User Mode Driver libraries loaded from NVIDIA repositories
    by setting `--@cuda_driver//:include_cuda_umd_libs=true` (see [instructions](#configure-hermetic-cuda-user-mode-driver)).
-   
+
 
 2) [Repository rules use redistributions loaded from custom remote locations or
-local files](#2-custom-cudacudnnnvshmem-archives-and-nccl-wheels).
+   local files](#2-custom-cudacudnnnvshmem-archives-and-nccl-wheels).
 
    This option is recommended for testing custom/unreleases redistributions, or
    redistributions previously loaded locally.
@@ -141,12 +141,12 @@ is specified in [third_party/gpus/cuda/hermetic/cuda_redist_versions.bzl](https:
    build:cuda --repo_env TF_NEED_CUDA=1
    build:cuda --@rules_ml_toolchain//common:enable_cuda
    ```
-   
+
    To use Clang compiler for CUDA targets, set
    `--@local_config_cuda//:cuda_compiler=clang`, for NVCC compiler set
-  `--@local_config_cuda//:cuda_compiler=nvcc` and `TF_NVCC_CLANG` environment
+   `--@local_config_cuda//:cuda_compiler=nvcc` and `TF_NVCC_CLANG` environment
    variable.
-   
+
    ```
    build:build_cuda_with_clang --@local_config_cuda//:cuda_compiler=clang
 
@@ -222,12 +222,12 @@ UMD version should be compatible with KMD and CUDA Runtime versions.
 
 
 - Supported Kernel Mode Driver and User Mode Driver version combinations:
- 
+
   Driver versions combination | Is supported
-  -------- | --------
+    -------- | --------
   KMD > UMD | -
   KMD <= UMD | +
- 
+
 - UMD and CUDA Runtime versions compatibility is described in
   [NVIDIA documentation](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#id6).
 
@@ -258,11 +258,11 @@ UMD version should be compatible with KMD and CUDA Runtime versions.
    ```
 
 2. To select specific version of hermetic NCCL, set the
-   `HERMETIC_NCCL_VERFSION` environment variable. Use only supported versions.
+   `HERMETIC_NCCL_VERSION` environment variable. Use only supported versions.
    You may set the environment
    variables directly in your shell or in `.bazelrc` file as shown below:
    ```
-   build:cuda --repo_env=HERMETIC_NCCL_VERFSION="2.27.7"
+   build:cuda --repo_env=HERMETIC_NCCL_VERSION="2.27.7"
    ```   
 
 3. To select specific version of hermetic NVSHMEM, set the
@@ -329,19 +329,23 @@ The JSON files contain paths to individual redistributions for different OS
 architectures.
 
 1. Create `cuda_redist.json` and/or `cudnn_redist.json` and/or
-`nvshmem_redist.json` files.
+   `nvshmem_redist.json` files.
 
    `cuda_redist.json` show follow the format below:
 
    ```
    {
       "cuda_cccl": {
-         "linux-x86_64": {
-            "relative_path": "cuda_cccl-linux-x86_64-12.4.99-archive.tar.xz",
-         },
-         "linux-sbsa": {
-            "relative_path": "cuda_cccl-linux-sbsa-12.4.99-archive.tar.xz",
-         }
+          "linux-x86_64": {
+              "full_path": "https://github.com/NVIDIA/cccl/archive/0d328e06c9fc78a216ec70df4917f7230a9c77e3.tar.gz",
+              "sha256": "c45dddfcebfc2d719e0c4cc6a874a4b50a751b90daba139699d3fc11708cf0ef",
+              "strip_prefix": "cccl-0d328e06c9fc78a216ec70df4917f7230a9c77e3",
+          },
+          "linux-sbsa": {
+              "full_path": "https://github.com/NVIDIA/cccl/archive/0d328e06c9fc78a216ec70df4917f7230a9c77e3.tar.gz",
+              "sha256": "c45dddfcebfc2d719e0c4cc6a874a4b50a751b90daba139699d3fc11708cf0ef",
+              "strip_prefix": "cccl-0d328e06c9fc78a216ec70df4917f7230a9c77e3",
+          },
       },
    }
    ```
@@ -384,8 +388,10 @@ architectures.
    }
    ```
 
-   The `relative_path` field can be replaced with `full_path` for the full URLs
-   and absolute local paths starting with `file:///`.
+   Note that `sha256` and `strip_prefix` are optional.
+
+   `full_path` should be used for full URLs and absolute local paths
+   starting with `file:///`.
 
 2. In the downstream project dependent on `rules_ml_toolchain`, update the
    hermetic cuda JSON repository call in `WORKSPACE` file. Both web links and
@@ -449,12 +455,16 @@ dependencies in Google ML projects.
    ```
    _CUSTOM_CUDA_REDISTRIBUTIONS = {
       "cuda_cccl": {
-         "linux-x86_64": {
-            "relative_path": "cuda_cccl-linux-x86_64-12.4.99-archive.tar.xz",
-         },
-         "linux-sbsa": {
-            "relative_path": "cuda_cccl-linux-sbsa-12.4.99-archive.tar.xz",
-         }
+          "linux-x86_64": {
+              "full_path": "https://github.com/NVIDIA/cccl/archive/0d328e06c9fc78a216ec70df4917f7230a9c77e3.tar.gz",
+              "sha256": "c45dddfcebfc2d719e0c4cc6a874a4b50a751b90daba139699d3fc11708cf0ef",
+              "strip_prefix": "cccl-0d328e06c9fc78a216ec70df4917f7230a9c77e3",
+          },
+          "linux-sbsa": {
+              "full_path": "https://github.com/NVIDIA/cccl/archive/0d328e06c9fc78a216ec70df4917f7230a9c77e3.tar.gz",
+              "sha256": "c45dddfcebfc2d719e0c4cc6a874a4b50a751b90daba139699d3fc11708cf0ef",
+              "strip_prefix": "cccl-0d328e06c9fc78a216ec70df4917f7230a9c77e3",
+          },
       },
    }
    ```
@@ -497,14 +507,27 @@ dependencies in Google ML projects.
    }
    ```
 
-   The `relative_path` field can be replaced with `full_path` for the full URLs
-   and absolute local paths starting with `file:///`.
+   Note that `sha256` and `strip_prefix` are optional.
+
+   `full_path` should be used for full URLs and absolute local paths
+   starting with `file:///`.
 
 2. In the same `WORKSPACE` file, pass the created dictionaries to the repository
-   rule. If the dictionaries contain relative paths to distributions, the path
+   rule.
+
+   If the dictionaries contain relative paths to distributions, the path
    prefix should be updated in `cuda_redist_init_repositories()`,
    `cudnn_redist_init_repository()` and `nvshmem_redist_init_repository()`
    calls.
+
+   There is an option to customize BUILD templates when the custom
+   redistributions have a different folder structure than the default ones.
+   Note that `source_dirs` is mandatory; it is used for the scenarios described
+   [here](https://github.com/google-ml-infra/rules_ml_toolchain/blob/main/gpu/README.md#3-local-toolkit-installations-used-as-sources-for-hermetic-repositories).
+
+   If the templates for the scenarios above are different, you need to provide
+   them in `version_to_template` under the `local` key.
+
    ```
    register_toolchains("@rules_ml_toolchain//cc:linux_x86_64_linux_x86_64_cuda")
    register_toolchains("@rules_ml_toolchain//cc:linux_aarch64_linux_aarch64_cuda")
@@ -520,9 +543,30 @@ dependencies in Google ML projects.
       "cuda_redist_init_repositories",
       "cudnn_redist_init_repository",
    )
+
+   _CCCL_BUILD_TEMPLATES = {
+        "cuda_cccl": {
+            "repo_name": "cuda_cccl",
+            "version_to_template": {
+                "13": "@rules_ml_toolchain//third_party/gpus/cuda/hermetic:cuda_cccl_github.BUILD.tpl",
+                "12": "@rules_ml_toolchain//third_party/gpus/cuda/hermetic:cuda_cccl_github.BUILD.tpl",
+                "11": "@rules_ml_toolchain//third_party/gpus/cuda/hermetic:cuda_cccl_github.BUILD.tpl",
+            },
+            "local": {
+                "source_dirs": ["include", "lib"],
+                "version_to_template": {
+                    "13": "@rules_ml_toolchain//third_party/gpus/cuda/hermetic:cuda_cccl.BUILD.tpl",
+                    "12": "@rules_ml_toolchain//third_party/gpus/cuda/hermetic:cuda_cccl.BUILD.tpl",
+                    "11": "@rules_ml_toolchain//third_party/gpus/cuda/hermetic:cuda_cccl.BUILD.tpl",
+                },
+            },
+        },
+   }
+
    cuda_redist_init_repositories(
       cuda_redistributions = _CUSTOM_CUDA_REDISTRIBUTIONS,
       cuda_redist_path_prefix = "file:///home/usr/Downloads/dists/",
+      redist_versions_to_build_templates = _CCCL_BUILD_TEMPLATES,
    )
    cudnn_redist_init_repository(
       cudnn_redistributions = _CUSTOM_CUDNN_REDISTRIBUTIONS,
@@ -590,13 +634,17 @@ _CUDNN_JSON_DICT = {
 
 _CUDA_DIST_DICT = {
    "cuda_cccl": {
-      "linux-x86_64": {
-            "relative_path": "cuda_cccl-linux-x86_64-12.4.99-archive.tar.xz",
-      },
-      "linux-sbsa": {
-            "relative_path": "cuda_cccl-linux-sbsa-12.4.99-archive.tar.xz",
-      },
-   },
+        "linux-x86_64": {
+            "full_path": "https://github.com/NVIDIA/cccl/archive/0d328e06c9fc78a216ec70df4917f7230a9c77e3.tar.gz",
+            "sha256": "c45dddfcebfc2d719e0c4cc6a874a4b50a751b90daba139699d3fc11708cf0ef",
+            "strip_prefix": "cccl-0d328e06c9fc78a216ec70df4917f7230a9c77e3",
+        },
+        "linux-sbsa": {
+            "full_path": "https://github.com/NVIDIA/cccl/archive/0d328e06c9fc78a216ec70df4917f7230a9c77e3.tar.gz",
+            "sha256": "c45dddfcebfc2d719e0c4cc6a874a4b50a751b90daba139699d3fc11708cf0ef",
+            "strip_prefix": "cccl-0d328e06c9fc78a216ec70df4917f7230a9c77e3",
+        },
+   },
    "libcusolver": {
       "linux-x86_64": {
             "full_path": "file:///usr/Downloads/dists/libcusolver-linux-x86_64-11.6.0.99-archive.tar.xz",
@@ -607,6 +655,25 @@ _CUDA_DIST_DICT = {
    },
 }
 
+_CCCL_BUILD_TEMPLATES = {
+    "cuda_cccl": {
+        "repo_name": "cuda_cccl",
+        "version_to_template": {
+            "13": "@rules_ml_toolchain//third_party/gpus/cuda/hermetic:cuda_cccl_github.BUILD.tpl",
+            "12": "@rules_ml_toolchain//third_party/gpus/cuda/hermetic:cuda_cccl_github.BUILD.tpl",
+            "11": "@rules_ml_toolchain//third_party/gpus/cuda/hermetic:cuda_cccl_github.BUILD.tpl",
+        },
+        "local": {
+            "source_dirs": ["include", "lib"],
+            "version_to_template": {
+                "13": "@rules_ml_toolchain//third_party/gpus/cuda/hermetic:cuda_cccl.BUILD.tpl",
+                "12": "@rules_ml_toolchain//third_party/gpus/cuda/hermetic:cuda_cccl.BUILD.tpl",
+                "11": "@rules_ml_toolchain//third_party/gpus/cuda/hermetic:cuda_cccl.BUILD.tpl",
+            },
+        },
+    },
+}
+
 _CUDNN_DIST_DICT = {
    "cudnn": {
       "linux-x86_64": {
@@ -655,9 +722,14 @@ load(
    "cuda_redist_init_repositories",
    "cudnn_redist_init_repository",
 )
-cudnn_redist_init_repositories(
+load(
+    "@rules_ml_toolchain//third_party/gpus/cuda/hermetic:cuda_redist_versions.bzl",
+    "REDIST_VERSIONS_TO_BUILD_TEMPLATES",
+)
+cuda_redist_init_repositories(
    cuda_redistributions = CUDA_REDISTRIBUTIONS | _CUDA_DIST_DICT,
    cuda_redist_path_prefix = "file:///usr/Downloads/dists/",
+   redist_versions_to_build_templates = REDIST_VERSIONS_TO_BUILD_TEMPLATES | _CCCL_BUILD_TEMPLATES,
 )
 cudnn_redist_init_repository(
    cudnn_redistributions = CUDNN_REDISTRIBUTIONS | _CUDNN_DIST_DICT,
@@ -748,5 +820,4 @@ The structure of the folders inside NVSHMEM dir should be the following:
     include/
     lib/
     bin/
-```
-
+```
\ No newline at end of file
diff --git a/gpu/cuda/cuda_redist_init_repositories.bzl b/gpu/cuda/cuda_redist_init_repositories.bzl
index 94ae2681..63fe5d7c 100644
--- a/gpu/cuda/cuda_redist_init_repositories.bzl
+++ b/gpu/cuda/cuda_redist_init_repositories.bzl
@@ -20,7 +20,6 @@ load(
     "cuda_redist_init_repositories_wrapper",
     "cudnn_redist_init_repository_wrapper",
 )
-
 load(
     "//third_party/gpus/cuda/hermetic:cuda_redist_versions.bzl",
     "CUDA_REDIST_PATH_PREFIX",
@@ -39,14 +38,17 @@ def cudnn_redist_init_repository(
         cudnn_redistributions,
         cudnn_redist_path_prefix,
         mirrored_tar_cudnn_redist_path_prefix,
-        redist_versions_to_build_templates)
+        redist_versions_to_build_templates,
+    )
 
 def cuda_redist_init_repositories(
         cuda_redistributions,
         cuda_redist_path_prefix = CUDA_REDIST_PATH_PREFIX,
         mirrored_tar_cuda_redist_path_prefix = MIRRORED_TAR_CUDA_REDIST_PATH_PREFIX,
         redist_versions_to_build_templates = REDIST_VERSIONS_TO_BUILD_TEMPLATES):
-    cuda_redist_init_repositories_wrapper(cuda_redistributions,
+    cuda_redist_init_repositories_wrapper(
+        cuda_redistributions,
         cuda_redist_path_prefix,
         mirrored_tar_cuda_redist_path_prefix,
-        redist_versions_to_build_templates)
\ No newline at end of file
+        redist_versions_to_build_templates,
+    )
diff --git a/gpu/nccl/nccl_redist_init_repository.bzl b/gpu/nccl/nccl_redist_init_repository.bzl
index 3c5a9bef..c04e7b0d 100644
--- a/gpu/nccl/nccl_redist_init_repository.bzl
+++ b/gpu/nccl/nccl_redist_init_repository.bzl
@@ -28,4 +28,7 @@ load(
 def nccl_redist_init_repository(
         cuda_nccl_wheels = CUDA_NCCL_WHEELS,
         redist_versions_to_build_templates = REDIST_VERSIONS_TO_BUILD_TEMPLATES):
-    nccl_redist_init_repository_wrapper(cuda_nccl_wheels, redist_versions_to_build_templates)
\ No newline at end of file
+    nccl_redist_init_repository_wrapper(
+        cuda_nccl_wheels,
+        redist_versions_to_build_templates,
+    )
diff --git a/third_party/extensions/cuda_redist_init.bzl b/third_party/extensions/cuda_redist_init.bzl
index 2754fc31..c7d2e529 100644
--- a/third_party/extensions/cuda_redist_init.bzl
+++ b/third_party/extensions/cuda_redist_init.bzl
@@ -33,6 +33,7 @@ def _cuda_redist_init_ext_impl(mctx):
         cudnn_redistributions = CUDNN_REDISTRIBUTIONS,
     )
 
+# TODO(ybaturina): add missing features from workspace mode
 cuda_redist_init_ext = module_extension(
     implementation = _cuda_redist_init_ext_impl,
 )
diff --git a/third_party/gpus/cuda/hermetic/cuda_cccl_github.BUILD.tpl b/third_party/gpus/cuda/hermetic/cuda_cccl_github.BUILD.tpl
new file mode 100644
index 00000000..b311d6a5
--- /dev/null
+++ b/third_party/gpus/cuda/hermetic/cuda_cccl_github.BUILD.tpl
@@ -0,0 +1,82 @@
+licenses(["restricted"])  # NVIDIA proprietary license
+
+filegroup(
+    name = "header_list",
+    srcs = [":thrust_header_list",":nv_header_list", ":cuda_header_list", ":cub_header_list"],
+    visibility = ["@local_config_cuda//cuda:__pkg__"],
+)
+
+cc_library(
+    name = "headers",
+    deps = [":thrust_headers",":nv_headers", ":cuda_headers", ":cub_headers"],
+    visibility = ["@local_config_cuda//cuda:__pkg__"],
+)
+
+filegroup(
+    name = "thrust_header_list",
+    srcs = glob([
+        %{comment}"thrust/thrust/**",
+    ]),
+    visibility = ["@local_config_cuda//cuda:__pkg__"],
+)
+
+cc_library(
+    name = "thrust_headers",
+    hdrs = [":thrust_header_list"],
+    include_prefix = "third_party/gpus/cuda/include",
+    includes = ["thrust"],
+    strip_include_prefix = "thrust",
+    visibility = ["@local_config_cuda//cuda:__pkg__"],
+)
+
+filegroup(
+    name = "cuda_header_list",
+    srcs = glob([
+        %{comment}"libcudacxx/include/cuda/**",
+    ]),
+    visibility = ["@local_config_cuda//cuda:__pkg__"],
+)
+
+cc_library(
+    name = "cuda_headers",
+    hdrs = [":cuda_header_list"],
+    include_prefix = "third_party/gpus/cuda/include",
+    includes = ["libcudacxx/include"],
+    strip_include_prefix = "libcudacxx/include",
+    visibility = ["@local_config_cuda//cuda:__pkg__"],
+)
+
+filegroup(
+    name = "nv_header_list",
+    srcs = glob([
+        %{comment}"libcudacxx/include/nv/**",
+    ]),
+    visibility = ["@local_config_cuda//cuda:__pkg__"],
+)
+
+cc_library(
+    name = "nv_headers",
+    hdrs = ["nv_header_list"],
+    include_prefix = "third_party/gpus/cuda/include",
+    includes = ["libcudacxx/include/nv"],
+    strip_include_prefix = "libcudacxx/include",
+    visibility = ["@local_config_cuda//cuda:__pkg__"],
+)
+
+filegroup(
+    name = "cub_header_list",
+    srcs = glob([
+        %{comment}"cub/cub/**",
+    ]),
+    visibility = ["@local_config_cuda//cuda:__pkg__"],
+)
+
+cc_library(
+    name = "cub_headers",
+    hdrs = [":cub_header_list"],
+    include_prefix = "third_party/gpus/cuda/include",
+    includes = ["cub"],
+    strip_include_prefix = "cub",
+    visibility = ["@local_config_cuda//cuda:__pkg__"],
+)
+
diff --git a/third_party/gpus/cuda/hermetic/cuda_redist_init_repositories.bzl b/third_party/gpus/cuda/hermetic/cuda_redist_init_repositories.bzl
index 13d8bdc6..d0e35c83 100644
--- a/third_party/gpus/cuda/hermetic/cuda_redist_init_repositories.bzl
+++ b/third_party/gpus/cuda/hermetic/cuda_redist_init_repositories.bzl
@@ -16,6 +16,7 @@
 
 load(
     "//third_party/gpus:nvidia_common_rules.bzl",
+    "get_local_templates",
     "get_redistribution_urls",
     "get_version_and_template_lists",
     "redist_init_repository",
@@ -49,10 +50,13 @@ def cudnn_redist_init_repository(
     versions, templates = get_version_and_template_lists(
         repo_data["version_to_template"],
     )
+    local_templates = get_local_templates(repo_data["local"], templates)
+    local_source_dirs = repo_data["local"]["source_dirs"]
     redist_init_repository(
         name = repo_data["repo_name"],
         versions = versions,
         build_templates = templates,
+        local_build_templates = local_templates,
         url_dict = url_dict,
         redist_path_prefix = cudnn_redist_path_prefix,
         mirrored_tar_redist_path_prefix = mirrored_tar_cudnn_redist_path_prefix,
@@ -60,7 +64,7 @@ def cudnn_redist_init_repository(
         local_path_env_var = "LOCAL_CUDNN_PATH",
         use_tar_file_env_var = "USE_CUDA_TAR_ARCHIVE_FILES",
         target_arch_env_var = "CUDA_REDIST_TARGET_PLATFORM",
-        local_source_dirs = ["include", "lib"],
+        local_source_dirs = local_source_dirs,
     )
 
 def cuda_redist_init_repositories(
@@ -86,10 +90,13 @@ def cuda_redist_init_repositories(
         versions, templates = get_version_and_template_lists(
             repo_data["version_to_template"],
         )
+        local_templates = get_local_templates(repo_data["local"], templates)
+        local_source_dirs = repo_data["local"]["source_dirs"]
         redist_init_repository(
             name = repo_data["repo_name"],
             versions = versions,
             build_templates = templates,
+            local_build_templates = local_templates,
             url_dict = url_dict,
             redist_path_prefix = cuda_redist_path_prefix,
             mirrored_tar_redist_path_prefix = mirrored_tar_cuda_redist_path_prefix,
@@ -97,7 +104,7 @@ def cuda_redist_init_repositories(
             local_path_env_var = "LOCAL_CUDA_PATH",
             use_tar_file_env_var = "USE_CUDA_TAR_ARCHIVE_FILES",
             target_arch_env_var = "CUDA_REDIST_TARGET_PLATFORM",
-            local_source_dirs = ["include", "lib", "bin", "nvvm"],
+            local_source_dirs = local_source_dirs,
             repository_symlinks = {
                 Label("@cuda_cudart//:include/cuda.h"): "include/cuda.h",
                 Label("@cuda_nvdisasm//:bin/nvdisasm"): "bin/nvdisasm",
@@ -115,7 +122,8 @@ def cudnn_redist_init_repository_wrapper(
         cudnn_redistributions,
         cudnn_redist_path_prefix,
         mirrored_tar_cudnn_redist_path_prefix,
-        redist_versions_to_build_templates)
+        redist_versions_to_build_templates,
+    )
 
 # TODO(yuriit): Remove after moving to //gpu/cuda package
 def cuda_redist_init_repositories_wrapper(
@@ -123,7 +131,9 @@ def cuda_redist_init_repositories_wrapper(
         cuda_redist_path_prefix = CUDA_REDIST_PATH_PREFIX,
         mirrored_tar_cuda_redist_path_prefix = MIRRORED_TAR_CUDA_REDIST_PATH_PREFIX,
         redist_versions_to_build_templates = REDIST_VERSIONS_TO_BUILD_TEMPLATES):
-    cuda_redist_init_repositories(cuda_redistributions,
+    cuda_redist_init_repositories(
+        cuda_redistributions,
         cuda_redist_path_prefix,
         mirrored_tar_cuda_redist_path_prefix,
-        redist_versions_to_build_templates)
+        redist_versions_to_build_templates,
+    )
diff --git a/third_party/gpus/cuda/hermetic/cuda_redist_versions.bzl b/third_party/gpus/cuda/hermetic/cuda_redist_versions.bzl
index 2a1c1886..c015fff8 100644
--- a/third_party/gpus/cuda/hermetic/cuda_redist_versions.bzl
+++ b/third_party/gpus/cuda/hermetic/cuda_redist_versions.bzl
@@ -531,12 +531,18 @@ REDIST_VERSIONS_TO_BUILD_TEMPLATES = {
             "530": "//third_party/gpus/cuda/hermetic:cuda_driver.BUILD.tpl",
             "520": "//third_party/gpus/cuda/hermetic:cuda_driver.BUILD.tpl",
         },
+        "local": {
+            "source_dirs": ["bin", "lib"],
+        },
     },
     "cuda_nccl": {
         "repo_name": "cuda_nccl",
         "version_to_template": {
             "2": "//third_party/nccl/hermetic:cuda_nccl.BUILD.tpl",
         },
+        "local": {
+            "source_dirs": ["include", "lib"],
+        },
     },
     "cudnn": {
         "repo_name": "cuda_cudnn",
@@ -545,6 +551,9 @@ REDIST_VERSIONS_TO_BUILD_TEMPLATES = {
             "9": "//third_party/gpus/cuda/hermetic:cuda_cudnn.BUILD.tpl",
             "8": "//third_party/gpus/cuda/hermetic:cuda_cudnn8.BUILD.tpl",
         },
+        "local": {
+            "source_dirs": ["include", "lib"],
+        },
     },
     "libcublas": {
         "repo_name": "cuda_cublas",
@@ -553,6 +562,9 @@ REDIST_VERSIONS_TO_BUILD_TEMPLATES = {
             "12": "//third_party/gpus/cuda/hermetic:cuda_cublas.BUILD.tpl",
             "11": "//third_party/gpus/cuda/hermetic:cuda_cublas.BUILD.tpl",
         },
+        "local": {
+            "source_dirs": ["include", "lib"],
+        },
     },
     "cuda_cudart": {
         "repo_name": "cuda_cudart",
@@ -561,6 +573,9 @@ REDIST_VERSIONS_TO_BUILD_TEMPLATES = {
             "12": "//third_party/gpus/cuda/hermetic:cuda_cudart.BUILD.tpl",
             "11": "//third_party/gpus/cuda/hermetic:cuda_cudart.BUILD.tpl",
         },
+        "local": {
+            "source_dirs": ["include", "lib"],
+        },
     },
     "libcufft": {
         "repo_name": "cuda_cufft",
@@ -570,6 +585,9 @@ REDIST_VERSIONS_TO_BUILD_TEMPLATES = {
             "11": "//third_party/gpus/cuda/hermetic:cuda_cufft.BUILD.tpl",
             "10": "//third_party/gpus/cuda/hermetic:cuda_cufft.BUILD.tpl",
         },
+        "local": {
+            "source_dirs": ["include", "lib"],
+        },
     },
     "cuda_cupti": {
         "repo_name": "cuda_cupti",
@@ -578,12 +596,18 @@ REDIST_VERSIONS_TO_BUILD_TEMPLATES = {
             "12": "//third_party/gpus/cuda/hermetic:cuda_cupti.BUILD.tpl",
             "11": "//third_party/gpus/cuda/hermetic:cuda_cupti.BUILD.tpl",
         },
+        "local": {
+            "source_dirs": ["include", "lib"],
+        },
     },
     "libcurand": {
         "repo_name": "cuda_curand",
         "version_to_template": {
             "10": "//third_party/gpus/cuda/hermetic:cuda_curand.BUILD.tpl",
         },
+        "local": {
+            "source_dirs": ["include", "lib"],
+        },
     },
     "libcusolver": {
         "repo_name": "cuda_cusolver",
@@ -591,6 +615,9 @@ REDIST_VERSIONS_TO_BUILD_TEMPLATES = {
             "12": "//third_party/gpus/cuda/hermetic:cuda_cusolver.BUILD.tpl",
             "11": "//third_party/gpus/cuda/hermetic:cuda_cusolver.BUILD.tpl",
         },
+        "local": {
+            "source_dirs": ["include", "lib"],
+        },
     },
     "libcusparse": {
         "repo_name": "cuda_cusparse",
@@ -599,6 +626,9 @@ REDIST_VERSIONS_TO_BUILD_TEMPLATES = {
             "12": "//third_party/gpus/cuda/hermetic:cuda_cusparse.BUILD.tpl",
             "11": "//third_party/gpus/cuda/hermetic:cuda_cusparse.BUILD.tpl",
         },
+        "local": {
+            "source_dirs": ["include", "lib"],
+        },
     },
     "libnvjitlink": {
         "repo_name": "cuda_nvjitlink",
@@ -606,6 +636,9 @@ REDIST_VERSIONS_TO_BUILD_TEMPLATES = {
             "13": "//third_party/gpus/cuda/hermetic:cuda_nvjitlink.BUILD.tpl",
             "12": "//third_party/gpus/cuda/hermetic:cuda_nvjitlink.BUILD.tpl",
         },
+        "local": {
+            "source_dirs": ["include", "lib"],
+        },
     },
     "cuda_nvrtc": {
         "repo_name": "cuda_nvrtc",
@@ -614,6 +647,9 @@ REDIST_VERSIONS_TO_BUILD_TEMPLATES = {
             "12": "//third_party/gpus/cuda/hermetic:cuda_nvrtc.BUILD.tpl",
             "11": "//third_party/gpus/cuda/hermetic:cuda_nvrtc.BUILD.tpl",
         },
+        "local": {
+            "source_dirs": ["include", "lib"],
+        },
     },
     "cuda_cccl": {
         "repo_name": "cuda_cccl",
@@ -622,12 +658,18 @@ REDIST_VERSIONS_TO_BUILD_TEMPLATES = {
             "12": "//third_party/gpus/cuda/hermetic:cuda_cccl.BUILD.tpl",
             "11": "//third_party/gpus/cuda/hermetic:cuda_cccl.BUILD.tpl",
         },
+        "local": {
+            "source_dirs": ["include", "lib"],
+        },
     },
     "cuda_crt": {
         "repo_name": "cuda_crt",
         "version_to_template": {
             "13": "//third_party/gpus/cuda/hermetic:cuda_crt.BUILD.tpl",
         },
+        "local": {
+            "source_dirs": ["include"],
+        },
     },
     "cuda_nvcc": {
         "repo_name": "cuda_nvcc",
@@ -636,12 +678,18 @@ REDIST_VERSIONS_TO_BUILD_TEMPLATES = {
             "12": "//third_party/gpus/cuda/hermetic:cuda_nvcc.BUILD.tpl",
             "11": "//third_party/gpus/cuda/hermetic:cuda_nvcc.BUILD.tpl",
         },
+        "local": {
+            "source_dirs": ["include", "bin", "nvvm"],
+        },
     },
     "libnvvm": {
         "repo_name": "cuda_nvvm",
         "version_to_template": {
             "13": "//third_party/gpus/cuda/hermetic:cuda_nvvm.BUILD",
         },
+        "local": {
+            "source_dirs": ["nvvm"],
+        },
     },
     "cuda_nvdisasm": {
         "repo_name": "cuda_nvdisasm",
@@ -649,6 +697,9 @@ REDIST_VERSIONS_TO_BUILD_TEMPLATES = {
             "13": "//third_party/gpus/cuda/hermetic:cuda_nvdisasm.BUILD",
             "12": "//third_party/gpus/cuda/hermetic:cuda_nvdisasm.BUILD",
         },
+        "local": {
+            "source_dirs": ["bin"],
+        },
     },
     "cuda_nvml_dev": {
         "repo_name": "cuda_nvml",
@@ -657,6 +708,9 @@ REDIST_VERSIONS_TO_BUILD_TEMPLATES = {
             "12": "//third_party/gpus/cuda/hermetic:cuda_nvml.BUILD.tpl",
             "11": "//third_party/gpus/cuda/hermetic:cuda_nvml.BUILD.tpl",
         },
+        "local": {
+            "source_dirs": ["include", "lib", "nvml"],
+        },
     },
     "cuda_nvprune": {
         "repo_name": "cuda_nvprune",
@@ -665,6 +719,9 @@ REDIST_VERSIONS_TO_BUILD_TEMPLATES = {
             "12": "//third_party/gpus/cuda/hermetic:cuda_nvprune.BUILD",
             "11": "//third_party/gpus/cuda/hermetic:cuda_nvprune.BUILD",
         },
+        "local": {
+            "source_dirs": ["bin"],
+        },
     },
     "cuda_profiler_api": {
         "repo_name": "cuda_profiler_api",
@@ -673,6 +730,9 @@ REDIST_VERSIONS_TO_BUILD_TEMPLATES = {
             "12": "//third_party/gpus/cuda/hermetic:cuda_profiler.BUILD.tpl",
             "11": "//third_party/gpus/cuda/hermetic:cuda_profiler.BUILD.tpl",
         },
+        "local": {
+            "source_dirs": ["include"],
+        },
     },
     "cuda_nvtx": {
         "repo_name": "cuda_nvtx",
@@ -681,6 +741,9 @@ REDIST_VERSIONS_TO_BUILD_TEMPLATES = {
             "12": "//third_party/gpus/cuda/hermetic:cuda_nvtx.BUILD.tpl",
             "11": "//third_party/gpus/cuda/hermetic:cuda_nvtx.BUILD.tpl",
         },
+        "local": {
+            "source_dirs": ["include", "lib"],
+        },
     },
 }
 
@@ -690,5 +753,8 @@ NVSHMEM_REDIST_VERSIONS_TO_BUILD_TEMPLATES = {
         "version_to_template": {
             "3": "//third_party/nvshmem/hermetic:nvidia_nvshmem.BUILD.tpl",
         },
+        "local": {
+            "source_dirs": ["include", "lib", "bin"],
+        },
     },
 }
diff --git a/third_party/gpus/nvidia_common_rules.bzl b/third_party/gpus/nvidia_common_rules.bzl
index e6ddc4bd..0d25e366 100644
--- a/third_party/gpus/nvidia_common_rules.bzl
+++ b/third_party/gpus/nvidia_common_rules.bzl
@@ -64,6 +64,7 @@ def _get_orig_repo_name(repository_ctx):
     if hasattr(repository_ctx, "original_name") and repository_ctx.original_name:
         # For bazel 8 and above.
         return repository_ctx.original_name
+
     # With Bzlmod, the repo name will be something like `_main~cuda_redist_init_ext~cuda_nvml`,
     # we need to extract the original repo name.
     return repository_ctx.name.split("~")[-1]
@@ -144,35 +145,30 @@ def get_lib_name_to_version_dict(repository_ctx):
                 lib_name_to_version_dict[minor_version_key] = lib_version
     return lib_name_to_version_dict
 
-def create_dummy_build_file(repository_ctx, use_comment_symbols = True):
-    repository_ctx.template(
-        "BUILD",
-        repository_ctx.attr.build_templates[0],
-        {
-            "%{multiline_comment}": "'''" if use_comment_symbols else "",
-            "%{comment}": "#" if use_comment_symbols else "",
-        },
-    )
-
-def create_cuda_nvcc_build_file(repository_ctx, use_comment_symbols = True):
-    cuda_version = (get_env_var(repository_ctx, "HERMETIC_CUDA_VERSION") or
-                    get_env_var(repository_ctx, "TF_CUDA_VERSION"))
+def create_dummy_build_file(repository_ctx, cuda_version, is_local_redist, use_comment_symbols = True):
+    if is_local_redist:
+        build_template = repository_ctx.attr.local_build_templates[0]
+    else:
+        build_template = repository_ctx.attr.build_templates[0]
     repository_ctx.template(
         "BUILD",
-        repository_ctx.attr.build_templates[0],
+        build_template,
         {
             "%{multiline_comment}": "'''" if use_comment_symbols else "",
             "%{comment}": "#" if use_comment_symbols else "",
-            "%{version_of_cuda}": cuda_version,
+            "%{version_of_cuda}": cuda_version or "",
         },
     )
 
-def _get_build_template(repository_ctx, major_lib_version):
+def _get_build_template(repository_ctx, major_lib_version, is_local_redist):
     template = None
     for i in range(0, len(repository_ctx.attr.versions)):
         for dist_version in repository_ctx.attr.versions[i].split(","):
             if dist_version == major_lib_version:
-                template = repository_ctx.attr.build_templates[i]
+                if is_local_redist:
+                    template = repository_ctx.attr.local_build_templates[i]
+                else:
+                    template = repository_ctx.attr.build_templates[i]
                 break
     if not template:
         fail("No build template found for {} version {}".format(
@@ -193,30 +189,31 @@ def get_major_library_version(repository_ctx, lib_name_to_version_dict):
 
 def create_build_file(
         repository_ctx,
+        cuda_version,
         lib_name_to_version_dict,
-        major_lib_version):
+        major_lib_version,
+        is_local_redist):
     # buildifier: disable=function-docstring-args
     """Creates a BUILD file for the repository."""
     if len(major_lib_version) == 0:
-        build_template_content = repository_ctx.read(
-            repository_ctx.attr.build_templates[0],
-        )
-
-        if _get_orig_repo_name(repository_ctx) == "cuda_nvcc":
-            create_cuda_nvcc_build_file(
-                repository_ctx,
-                use_comment_symbols = True if "_version}" in build_template_content else False,
-            )
+        if is_local_redist:
+            build_template = repository_ctx.attr.local_build_templates[0]
         else:
-            create_dummy_build_file(
-                repository_ctx,
-                use_comment_symbols = True if "_version}" in build_template_content else False,
-            )
+            build_template = repository_ctx.attr.build_templates[0]
+        build_template_content = repository_ctx.read(build_template)
+
+        create_dummy_build_file(
+            repository_ctx,
+            cuda_version,
+            is_local_redist = is_local_redist,
+            use_comment_symbols = True if "_version}" in build_template_content else False,
+        )
 
         return
     build_template = _get_build_template(
         repository_ctx,
         major_lib_version.split(".")[0],
+        is_local_redist,
     )
     repository_ctx.template(
         "BUILD",
@@ -298,8 +295,10 @@ def use_local_redist_path(repository_ctx, local_redist_path, dirs):
     )
     create_build_file(
         repository_ctx,
+        get_cuda_version(repository_ctx),
         lib_name_to_version_dict,
         major_version,
+        is_local_redist = True,
     )
     _create_libcuda_symlinks(
         repository_ctx,
@@ -314,7 +313,7 @@ def _download_redistribution(
         mirrored_tar_path_prefix):
     # buildifier: disable=function-docstring-args
     """Downloads and extracts NVIDIA redistribution."""
-    (url, sha256) = repository_ctx.attr.url_dict[arch_key]
+    (url, sha256, custom_strip_prefix) = repository_ctx.attr.url_dict[arch_key]
 
     # If url is not relative, then appending prefix is not needed.
     if not (url.startswith("http") or url.startswith("file:///")):
@@ -332,8 +331,8 @@ def _download_redistribution(
         output = file_name,
         sha256 = sha256,
     )
-    if repository_ctx.attr.override_strip_prefix:
-        strip_prefix = repository_ctx.attr.override_strip_prefix
+    if custom_strip_prefix:
+        strip_prefix = custom_strip_prefix
     else:
         strip_prefix = archive_name
     if url.endswith(".tar.xz") or url.endswith(".tar"):
@@ -383,7 +382,7 @@ def _use_downloaded_redistribution(repository_ctx):
 
     if not redist_version:
         # If no toolkit version is found, comment out cc_import targets.
-        create_dummy_build_file(repository_ctx)
+        create_dummy_build_file(repository_ctx, cuda_version, is_local_redist = False)
         create_version_file(repository_ctx, major_version)
         return
 
@@ -391,7 +390,7 @@ def _use_downloaded_redistribution(repository_ctx):
         print("{} is not found in redistributions list.".format(
             _get_orig_repo_name(repository_ctx),
         ))  # buildifier: disable=print
-        create_dummy_build_file(repository_ctx)
+        create_dummy_build_file(repository_ctx, cuda_version, is_local_redist = False)
         create_version_file(repository_ctx, major_version)
         return
 
@@ -427,8 +426,10 @@ def _use_downloaded_redistribution(repository_ctx):
     )
     create_build_file(
         repository_ctx,
+        cuda_version,
         lib_name_to_version_dict,
         major_version,
+        is_local_redist = False,
     )
     _create_libcuda_symlinks(
         repository_ctx,
@@ -450,7 +451,7 @@ _redist_repo = repository_rule(
         "url_dict": attr.string_list_dict(mandatory = True),
         "versions": attr.string_list(mandatory = True),
         "build_templates": attr.label_list(mandatory = True),
-        "override_strip_prefix": attr.string(),
+        "local_build_templates": attr.label_list(mandatory = True),
         "redist_path_prefix": attr.string(),
         "mirrored_tar_redist_path_prefix": attr.string(mandatory = False),
         "redist_version_env_vars": attr.string_list(mandatory = True),
@@ -475,6 +476,7 @@ def redist_init_repository(
         name,
         versions,
         build_templates,
+        local_build_templates,
         url_dict,
         redist_path_prefix,
         mirrored_tar_redist_path_prefix,
@@ -483,7 +485,6 @@ def redist_init_repository(
         use_tar_file_env_var,
         target_arch_env_var,
         local_source_dirs,
-        override_strip_prefix = "",
         repository_symlinks = {}):
     # buildifier: disable=function-docstring-args
     """Initializes repository for individual NVIDIA redistribution."""
@@ -492,7 +493,7 @@ def redist_init_repository(
         url_dict = url_dict,
         versions = versions,
         build_templates = build_templates,
-        override_strip_prefix = override_strip_prefix,
+        local_build_templates = local_build_templates,
         redist_path_prefix = redist_path_prefix,
         mirrored_tar_redist_path_prefix = mirrored_tar_redist_path_prefix,
         redist_version_env_vars = redist_version_env_vars,
@@ -518,6 +519,7 @@ def get_redistribution_urls(dist_info):
             url_dict[_REDIST_ARCH_DICT[arch]] = [
                 dist_info[arch_key]["relative_path"],
                 dist_info[arch_key].get("sha256", ""),
+                dist_info[arch_key].get("strip_prefix", ""),
             ]
             continue
 
@@ -525,6 +527,7 @@ def get_redistribution_urls(dist_info):
             url_dict[_REDIST_ARCH_DICT[arch]] = [
                 dist_info[arch_key]["full_path"],
                 dist_info[arch_key].get("sha256", ""),
+                dist_info[arch_key].get("strip_prefix", ""),
             ]
             continue
 
@@ -536,7 +539,7 @@ def get_redistribution_urls(dist_info):
             url_dict["{cuda_version}_{arch}".format(
                 cuda_version = cuda_version,
                 arch = _REDIST_ARCH_DICT[arch],
-            )] = [data[path_key], data.get("sha256", "")]
+            )] = [data[path_key], data.get("sha256", ""), data.get("strip_prefix", "")]
     return url_dict
 
 def get_version_and_template_lists(version_to_template):
@@ -556,6 +559,15 @@ def get_version_and_template_lists(version_to_template):
         template_list.append(Label(template))
     return (version_list, template_list)
 
+def get_local_templates(local_repo_data, templates):
+    """Returns the template list from local_repo_data["version_to_template"], else `templates`."""
+    # No local override configured: reuse the caller-supplied templates.
+    if "version_to_template" not in local_repo_data:
+        return templates
+    version_to_template = local_repo_data["version_to_template"]
+    _, override_templates = get_version_and_template_lists(version_to_template)
+    return override_templates
+
 def _get_json_file_content(
         repository_ctx,
         url_to_sha256,
diff --git a/third_party/nccl/hermetic/nccl_redist_init_repository.bzl b/third_party/nccl/hermetic/nccl_redist_init_repository.bzl
index 956e3978..226534ea 100644
--- a/third_party/nccl/hermetic/nccl_redist_init_repository.bzl
+++ b/third_party/nccl/hermetic/nccl_redist_init_repository.bzl
@@ -25,6 +25,7 @@ load(
     "get_cuda_version",
     "get_env_var",
     "get_lib_name_to_version_dict",
+    "get_local_templates",
     "get_major_library_version",
     "get_version_and_template_lists",
     "use_local_redist_path",
@@ -43,7 +44,7 @@ def _use_downloaded_nccl_wheel(repository_ctx):
     major_version = ""
     if not cuda_version:
         # If no CUDA version is found, comment out cc_import targets.
-        create_dummy_build_file(repository_ctx)
+        create_dummy_build_file(repository_ctx, cuda_version, is_local_redist = False)
         create_version_file(repository_ctx, major_version)
         return
 
@@ -108,8 +109,10 @@ def _use_downloaded_nccl_wheel(repository_ctx):
     )
     create_build_file(
         repository_ctx,
+        cuda_version,
         lib_name_to_version_dict,
         major_version,
+        is_local_redist = False,
     )
 
     create_version_file(repository_ctx, major_version)
@@ -117,7 +120,7 @@ def _use_downloaded_nccl_wheel(repository_ctx):
 def _cuda_nccl_repo_impl(repository_ctx):
     local_nccl_path = get_env_var(repository_ctx, "LOCAL_NCCL_PATH")
     if local_nccl_path:
-        use_local_redist_path(repository_ctx, local_nccl_path, ["include", "lib"])
+        use_local_redist_path(repository_ctx, local_nccl_path, repository_ctx.attr.local_source_dirs)
     else:
         _use_downloaded_nccl_wheel(repository_ctx)
 
@@ -128,7 +131,9 @@ cuda_nccl_repo = repository_rule(
         "url_dict": attr.string_dict(mandatory = True),
         "versions": attr.string_list(mandatory = True),
         "build_templates": attr.label_list(mandatory = True),
+        "local_build_templates": attr.label_list(mandatory = True),
         "strip_prefix": attr.string(),
+        "local_source_dirs": attr.string_list(mandatory = True),
     },
 )
 
@@ -149,17 +154,25 @@ def nccl_redist_init_repository(
     versions, templates = get_version_and_template_lists(
         repo_data["version_to_template"],
     )
+    local_templates = get_local_templates(repo_data["local"], templates)
+    local_source_dirs = repo_data["local"]["source_dirs"]
     cuda_nccl_repo(
         name = repo_data["repo_name"],
         sha256_dict = nccl_artifacts_dict["sha256_dict"],
         url_dict = nccl_artifacts_dict["url_dict"],
         versions = versions,
         build_templates = templates,
+        local_build_templates = local_templates,
         strip_prefix = "nvidia/nccl",
+        local_source_dirs = local_source_dirs,
     )
 
 # TODO(yuriit): Remove after moving to //gpu/nccl package
 def nccl_redist_init_repository_wrapper(
         cuda_nccl_wheels,
         redist_versions_to_build_templates):
-    nccl_redist_init_repository(cuda_nccl_wheels, redist_versions_to_build_templates)
+    """Forwards both arguments positionally to nccl_redist_init_repository."""
+    nccl_redist_init_repository(
+        cuda_nccl_wheels,
+        redist_versions_to_build_templates,
+    )
diff --git a/third_party/nvshmem/hermetic/nvshmem_redist_init_repository.bzl b/third_party/nvshmem/hermetic/nvshmem_redist_init_repository.bzl
index f8cfc5ac..def4ce6b 100644
--- a/third_party/nvshmem/hermetic/nvshmem_redist_init_repository.bzl
+++ b/third_party/nvshmem/hermetic/nvshmem_redist_init_repository.bzl
@@ -16,6 +16,7 @@
 
 load(
     "//third_party/gpus:nvidia_common_rules.bzl",
+    "get_local_templates",
     "get_redistribution_urls",
     "get_version_and_template_lists",
     "redist_init_repository",
@@ -42,9 +43,12 @@ def nvshmem_redist_init_repository(
     versions, templates = get_version_and_template_lists(
         repo_data["version_to_template"],
     )
+    local_templates = get_local_templates(repo_data["local"], templates)
+    local_source_dirs = repo_data["local"]["source_dirs"]
     redist_init_repository(
         name = repo_data["repo_name"],
         versions = versions,
+        local_build_templates = local_templates,
         build_templates = templates,
         url_dict = url_dict,
         redist_path_prefix = nvshmem_redist_path_prefix,
@@ -53,17 +57,19 @@ def nvshmem_redist_init_repository(
         local_path_env_var = "LOCAL_NVSHMEM_PATH",
         use_tar_file_env_var = "USE_NVSHMEM_TAR_ARCHIVE_FILES",
         target_arch_env_var = "NVSHMEM_REDIST_TARGET_PLATFORM",
-        local_source_dirs = ["include", "lib", "bin"],
+        local_source_dirs = local_source_dirs,
     )
 
 # TODO(yuriit): Remove this function after moving to //gpu/nvshmem
 def nvshmem_redist_init_repository_wrapper(
-    nvshmem_redistributions,
-    nvshmem_redist_path_prefix,
-    mirrored_tar_nvshmem_redist_path_prefix,
-    redist_versions_to_build_templates):
+        nvshmem_redistributions,
+        nvshmem_redist_path_prefix,
+        mirrored_tar_nvshmem_redist_path_prefix,
+        redist_versions_to_build_templates):
+    """Forwards all four arguments positionally to nvshmem_redist_init_repository."""
     nvshmem_redist_init_repository(
         nvshmem_redistributions,
         nvshmem_redist_path_prefix,
         mirrored_tar_nvshmem_redist_path_prefix,
-        redist_versions_to_build_templates)
\ No newline at end of file
+        redist_versions_to_build_templates,
+    )