From feb3d90326d2c5e80a10709ad583b7524600503b Mon Sep 17 00:00:00 2001 From: "james.balamuta@gmail.com" Date: Sun, 28 Dec 2025 00:26:37 -0600 Subject: [PATCH 01/31] Minimal Windows CI Build requirements --- .github/workflows/R-CMD-check.yaml | 24 +++++++ configure.win | 106 +++++++++++++++++++++++++++++ src/.gitignore | 2 + src/Makevars.win | 11 --- src/Makevars.win.in | 12 ++++ 5 files changed, 144 insertions(+), 11 deletions(-) create mode 100755 configure.win delete mode 100644 src/Makevars.win create mode 100644 src/Makevars.win.in diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 2ff3f6f..d447b68 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -20,6 +20,7 @@ jobs: config: - {os: macos-latest, r: 'release'} - {os: ubuntu-latest, r: 'release'} + - {os: windows-latest, r: 'release'} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} @@ -44,6 +45,29 @@ jobs: if: runner.os == 'Linux' run: sudo apt-get update && sudo apt-get install -y opencl-headers ocl-icd-opencl-dev libclblast-dev + - name: Install OpenCL headers and CLBlast (Windows) + if: runner.os == 'Windows' + run: | + # Download OpenCL headers from Khronos + Invoke-WebRequest -Uri "https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/heads/main.zip" -OutFile opencl-headers.zip + Expand-Archive -Path opencl-headers.zip -DestinationPath $env:USERPROFILE + + # Download CLBlast pre-built binaries + Invoke-WebRequest -Uri "https://github.com/CNugteren/CLBlast/releases/download/1.6.3/CLBlast-1.6.3-windows-x64.7z" -OutFile clblast.7z + 7z x clblast.7z -o"$env:USERPROFILE" + + # Set environment variables for R package build (use forward slashes for gcc) + $openclPath = "$env:USERPROFILE/OpenCL-Headers-main" -replace '\\', '/' + $clblastPath = "$env:USERPROFILE/CLBlast-1.6.3-windows-x64" -replace '\\', '/' + + echo "OPENCL_CPPFLAGS=-I$openclPath" | Out-File -FilePath $env:GITHUB_ENV -Append + echo 
"CLBLAST_CPPFLAGS=-I$clblastPath/include" | Out-File -FilePath $env:GITHUB_ENV -Append + echo "CLBLAST_LIBS=-L$clblastPath/lib -lclblast" | Out-File -FilePath $env:GITHUB_ENV -Append + + # Add CLBlast DLL to PATH for runtime + echo "$clblastPath/lib" | Out-File -FilePath $env:GITHUB_PATH -Append + shell: pwsh + - uses: r-lib/actions/setup-r-dependencies@v2 with: extra-packages: any::rcmdcheck diff --git a/configure.win b/configure.win new file mode 100755 index 0000000..4a5b807 --- /dev/null +++ b/configure.win @@ -0,0 +1,106 @@ +#!/bin/sh +## +## RcppBandicoot configure.win +## +## Windows-specific configuration script +## Detects OpenCL and CLBlast from environment variables +## +## Copyright (C) 2023-2025 James Balamuta +## Licensed under GPL-2 or later +## + +echo "Configuring RcppBandicoot for Windows..." + +## Get R_HOME +: ${R_HOME=$(R RHOME)} +if test -z "${R_HOME}"; then + echo "ERROR: Could not determine R_HOME" + exit 1 +fi + +## Default values +BANDICOOT_CXXFLAGS="" +BANDICOOT_LIBS="" +OPENMP_CXXFLAGS='$(SHLIB_OPENMP_CXXFLAGS)' +OPENCL_TARGET_VERSION=300 + +## Kernel source directory (matches configure.ac logic) +BANDICOOT_KERNELS_DIR=$("${R_HOME}/bin/Rscript" -e 'cat(paste(head(.libPaths(),1), "RcppBandicoot", "include", "bandicoot_bits", "ks", "", sep="/"))') + +## Always disable CUDA on Windows (requires manual setup) +BANDICOOT_CXXFLAGS="${BANDICOOT_CXXFLAGS} -DCOOT_DONT_USE_CUDA" + +## Check for OpenCL headers via environment variable +if [ -n "${OPENCL_CPPFLAGS}" ]; then + echo " OpenCL headers: found (via OPENCL_CPPFLAGS)" + BANDICOOT_CXXFLAGS="${BANDICOOT_CXXFLAGS} -DCOOT_USE_OPENCL ${OPENCL_CPPFLAGS}" + HAVE_OPENCL=1 +else + echo " OpenCL headers: not found" + echo " Set OPENCL_CPPFLAGS to specify OpenCL header location" + HAVE_OPENCL=0 +fi + +## Check for OpenCL library +if [ -n "${OPENCL_LIBS}" ]; then + echo " OpenCL library: found (via OPENCL_LIBS)" + BANDICOOT_LIBS="${BANDICOOT_LIBS} ${OPENCL_LIBS}" +fi + +## Check for CLBlast via 
environment variable +if [ -n "${CLBLAST_CPPFLAGS}" ]; then + echo " CLBlast: found (via CLBLAST_CPPFLAGS)" + BANDICOOT_CXXFLAGS="${BANDICOOT_CXXFLAGS} -DCOOT_USE_CLBLAST ${CLBLAST_CPPFLAGS}" + HAVE_CLBLAST=1 +else + echo " CLBlast: not found, disabling" + BANDICOOT_CXXFLAGS="${BANDICOOT_CXXFLAGS} -DCOOT_DONT_USE_CLBLAST" + HAVE_CLBLAST=0 +fi + +## Check for CLBlast library +if [ -n "${CLBLAST_LIBS}" ]; then + BANDICOOT_LIBS="${CLBLAST_LIBS} ${BANDICOOT_LIBS}" +fi + +## Always disable clBLAS (CLBlast is preferred) +BANDICOOT_CXXFLAGS="${BANDICOOT_CXXFLAGS} -DCOOT_DONT_USE_CLBLAS" + +## Add common flags +BANDICOOT_CXXFLAGS="${OPENMP_CXXFLAGS} -Wno-missing-braces ${BANDICOOT_CXXFLAGS}" + +## Add R's LAPACK/BLAS +BANDICOOT_LIBS="${BANDICOOT_LIBS} \$(LAPACK_LIBS) \$(BLAS_LIBS) \$(FLIBS)" + +## Check if we have at least OpenCL +if [ "${HAVE_OPENCL}" = "0" ]; then + echo "" + echo "WARNING: No GPU backend detected!" + echo "" + echo "RcppBandicoot requires OpenCL headers to compile on Windows." 
+ echo "Set the following environment variables before building:" + echo "" + echo " OPENCL_CPPFLAGS - Path to OpenCL headers (e.g., -IC:/OpenCL/include)" + echo " OPENCL_LIBS - OpenCL library flags (e.g., -LC:/OpenCL/lib -lOpenCL)" + echo " CLBLAST_CPPFLAGS - Path to CLBlast headers (optional)" + echo " CLBLAST_LIBS - CLBlast library flags (optional)" + echo "" +fi + +echo "" +echo "Configuration Summary:" +echo " OpenCL: ${HAVE_OPENCL}" +echo " CLBlast: ${HAVE_CLBLAST}" +echo " CUDA: disabled (Windows)" +echo " Kernels: ${BANDICOOT_KERNELS_DIR}" +echo "" + +## Generate Makevars.win from template +sed -e "s|@BANDICOOT_CXXFLAGS@|${BANDICOOT_CXXFLAGS}|g" \ + -e "s|@BANDICOOT_LIBS@|${BANDICOOT_LIBS}|g" \ + -e "s|@OPENMP_CXXFLAGS@|${OPENMP_CXXFLAGS}|g" \ + -e "s|@OPENCL_TARGET_VERSION@|${OPENCL_TARGET_VERSION}|g" \ + -e "s|@BANDICOOT_KERNELS_DIR@|${BANDICOOT_KERNELS_DIR}|g" \ + src/Makevars.win.in > src/Makevars.win + +echo "Generated src/Makevars.win" diff --git a/src/.gitignore b/src/.gitignore index 22034c4..5948b32 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -1,3 +1,5 @@ *.o *.so *.dll +Makevars +Makevars.win diff --git a/src/Makevars.win b/src/Makevars.win deleted file mode 100644 index 667e3f0..0000000 --- a/src/Makevars.win +++ /dev/null @@ -1,11 +0,0 @@ -## RcppBandicoot Makevars.win -## -## Windows-specific Makevars -## GPU backend configuration must be done manually on Windows - -PKG_CPPFLAGS = -I../inst/include - -## Warning: GPU backends may not be properly configured -## Users may need to manually specify OpenCL or CUDA paths -PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -Wno-missing-braces -PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) diff --git a/src/Makevars.win.in b/src/Makevars.win.in new file mode 100644 index 0000000..45027a8 --- /dev/null +++ b/src/Makevars.win.in @@ -0,0 +1,12 @@ +## RcppBandicoot Makevars.win.in +## +## This file is processed by configure.win to generate Makevars.win +## It includes GPU backend 
configuration (OpenCL, CLBlast) for Windows + +PKG_CPPFLAGS = -I../inst/include -DCOOT_TARGET_OPENCL_VERSION=@OPENCL_TARGET_VERSION@ -DCOOT_KERNEL_SOURCE_DIR='"@BANDICOOT_KERNELS_DIR@"' + +## Compiler flags from configure.win +PKG_CXXFLAGS = @BANDICOOT_CXXFLAGS@ + +## Linker flags from configure.win +PKG_LIBS = @OPENMP_CXXFLAGS@ @BANDICOOT_LIBS@ From 6cb30605ef27f85ea59e900346c6bf74621be46e Mon Sep 17 00:00:00 2001 From: "james.balamuta@gmail.com" Date: Sun, 28 Dec 2025 00:43:10 -0600 Subject: [PATCH 02/31] Are we shipping configure.win? --- configure.win | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/configure.win b/configure.win index 4a5b807..8144f01 100755 --- a/configure.win +++ b/configure.win @@ -96,11 +96,24 @@ echo " Kernels: ${BANDICOOT_KERNELS_DIR}" echo "" ## Generate Makevars.win from template -sed -e "s|@BANDICOOT_CXXFLAGS@|${BANDICOOT_CXXFLAGS}|g" \ - -e "s|@BANDICOOT_LIBS@|${BANDICOOT_LIBS}|g" \ - -e "s|@OPENMP_CXXFLAGS@|${OPENMP_CXXFLAGS}|g" \ - -e "s|@OPENCL_TARGET_VERSION@|${OPENCL_TARGET_VERSION}|g" \ - -e "s|@BANDICOOT_KERNELS_DIR@|${BANDICOOT_KERNELS_DIR}|g" \ - src/Makevars.win.in > src/Makevars.win - -echo "Generated src/Makevars.win" +if [ -f "src/Makevars.win.in" ]; then + sed -e "s|@BANDICOOT_CXXFLAGS@|${BANDICOOT_CXXFLAGS}|g" \ + -e "s|@BANDICOOT_LIBS@|${BANDICOOT_LIBS}|g" \ + -e "s|@OPENMP_CXXFLAGS@|${OPENMP_CXXFLAGS}|g" \ + -e "s|@OPENCL_TARGET_VERSION@|${OPENCL_TARGET_VERSION}|g" \ + -e "s|@BANDICOOT_KERNELS_DIR@|${BANDICOOT_KERNELS_DIR}|g" \ + src/Makevars.win.in > src/Makevars.win + echo "Generated src/Makevars.win" +else + echo "ERROR: src/Makevars.win.in not found!" + exit 1 +fi + +## Verify the generated file +if [ -f "src/Makevars.win" ]; then + echo "Contents of src/Makevars.win:" + cat src/Makevars.win +else + echo "ERROR: Failed to generate src/Makevars.win!" 
+ exit 1 +fi From 178642866281fc1dde42329564b8f72e7663ecae Mon Sep 17 00:00:00 2001 From: "james.balamuta@gmail.com" Date: Sun, 28 Dec 2025 00:43:29 -0600 Subject: [PATCH 03/31] Downloaded windows? --- .github/workflows/R-CMD-check.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index d447b68..d4ffa5a 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -56,6 +56,11 @@ jobs: Invoke-WebRequest -Uri "https://github.com/CNugteren/CLBlast/releases/download/1.6.3/CLBlast-1.6.3-windows-x64.7z" -OutFile clblast.7z 7z x clblast.7z -o"$env:USERPROFILE" + # Verify downloads + Write-Host "OpenCL headers directory:" + Get-ChildItem "$env:USERPROFILE/OpenCL-Headers-main" -ErrorAction SilentlyContinue | Select-Object -First 10 + Write-Host "CLBlast directory:" + Get-ChildItem "$env:USERPROFILE/CLBlast-1.6.3-windows-x64" -ErrorAction SilentlyContinue # Set environment variables for R package build (use forward slashes for gcc) $openclPath = "$env:USERPROFILE/OpenCL-Headers-main" -replace '\\', '/' $clblastPath = "$env:USERPROFILE/CLBlast-1.6.3-windows-x64" -replace '\\', '/' From 3f136a4582ad5b7e47f50c59c6a69614d756569b Mon Sep 17 00:00:00 2001 From: "james.balamuta@gmail.com" Date: Sun, 28 Dec 2025 00:53:18 -0600 Subject: [PATCH 04/31] Mirror suggested deployment by using vcpkg and custom tooling on CLBlast (though, chocolatey looks better every second...) 
--- .github/workflows/R-CMD-check.yaml | 51 ++++++++++++++++-------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index d4ffa5a..74ba440 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -45,33 +45,38 @@ jobs: if: runner.os == 'Linux' run: sudo apt-get update && sudo apt-get install -y opencl-headers ocl-icd-opencl-dev libclblast-dev - - name: Install OpenCL headers and CLBlast (Windows) + - name: Install OpenCL and CLBlast (Windows) if: runner.os == 'Windows' + shell: bash run: | - # Download OpenCL headers from Khronos - Invoke-WebRequest -Uri "https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/heads/main.zip" -OutFile opencl-headers.zip - Expand-Archive -Path opencl-headers.zip -DestinationPath $env:USERPROFILE + # Install OpenCL ICD Loader via vcpkg + vcpkg install opencl:x64-windows # Download CLBlast pre-built binaries - Invoke-WebRequest -Uri "https://github.com/CNugteren/CLBlast/releases/download/1.6.3/CLBlast-1.6.3-windows-x64.7z" -OutFile clblast.7z - 7z x clblast.7z -o"$env:USERPROFILE" - - # Verify downloads - Write-Host "OpenCL headers directory:" - Get-ChildItem "$env:USERPROFILE/OpenCL-Headers-main" -ErrorAction SilentlyContinue | Select-Object -First 10 - Write-Host "CLBlast directory:" - Get-ChildItem "$env:USERPROFILE/CLBlast-1.6.3-windows-x64" -ErrorAction SilentlyContinue - # Set environment variables for R package build (use forward slashes for gcc) - $openclPath = "$env:USERPROFILE/OpenCL-Headers-main" -replace '\\', '/' - $clblastPath = "$env:USERPROFILE/CLBlast-1.6.3-windows-x64" -replace '\\', '/' - - echo "OPENCL_CPPFLAGS=-I$openclPath" | Out-File -FilePath $env:GITHUB_ENV -Append - echo "CLBLAST_CPPFLAGS=-I$clblastPath/include" | Out-File -FilePath $env:GITHUB_ENV -Append - echo "CLBLAST_LIBS=-L$clblastPath/lib -lclblast" | Out-File -FilePath $env:GITHUB_ENV -Append - - # Add CLBlast DLL to 
PATH for runtime - echo "$clblastPath/lib" | Out-File -FilePath $env:GITHUB_PATH -Append - shell: pwsh + curl -L -o clblast.7z "https://github.com/CNugteren/CLBlast/releases/download/1.6.3/CLBlast-1.6.3-windows-x64.7z" + 7z x clblast.7z -o"$USERPROFILE" + + # Get vcpkg paths (use forward slashes for gcc) + VCPKG_ROOT=$(cygpath -m "$VCPKG_INSTALLATION_ROOT") + CLBLAST_PATH=$(cygpath -m "$USERPROFILE/CLBlast-1.6.3-windows-x64") + + echo "OpenCL installed via vcpkg at: $VCPKG_ROOT" + echo "CLBlast at: $CLBLAST_PATH" + + # List installed files + ls -la "$VCPKG_INSTALLATION_ROOT/installed/x64-windows/include/" || true + ls -la "$VCPKG_INSTALLATION_ROOT/installed/x64-windows/lib/" || true + ls -la "$USERPROFILE/CLBlast-1.6.3-windows-x64/" || true + + # Set environment variables for R package build + echo "OPENCL_CPPFLAGS=-I$VCPKG_ROOT/installed/x64-windows/include" >> $GITHUB_ENV + echo "OPENCL_LIBS=-L$VCPKG_ROOT/installed/x64-windows/lib -lOpenCL" >> $GITHUB_ENV + echo "CLBLAST_CPPFLAGS=-I$CLBLAST_PATH/include" >> $GITHUB_ENV + echo "CLBLAST_LIBS=-L$CLBLAST_PATH/lib -lclblast" >> $GITHUB_ENV + + # Add DLLs to PATH for runtime + echo "$VCPKG_ROOT/installed/x64-windows/bin" >> $GITHUB_PATH + echo "$CLBLAST_PATH/lib" >> $GITHUB_PATH - uses: r-lib/actions/setup-r-dependencies@v2 with: From e843c34f562f334c3d54643f736026743d2439b3 Mon Sep 17 00:00:00 2001 From: "james.balamuta@gmail.com" Date: Sun, 28 Dec 2025 00:53:47 -0600 Subject: [PATCH 05/31] Clarify linking --- configure.win | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/configure.win b/configure.win index 8144f01..63c11bb 100755 --- a/configure.win +++ b/configure.win @@ -43,8 +43,11 @@ fi ## Check for OpenCL library if [ -n "${OPENCL_LIBS}" ]; then - echo " OpenCL library: found (via OPENCL_LIBS)" + echo " OpenCL library: found (via OPENCL_LIBS): ${OPENCL_LIBS}" BANDICOOT_LIBS="${BANDICOOT_LIBS} ${OPENCL_LIBS}" +else + echo " OpenCL library: not found (OPENCL_LIBS not set)" + echo " WARNING: 
Linking will fail without OpenCL library!" fi ## Check for CLBlast via environment variable @@ -60,6 +63,7 @@ fi ## Check for CLBlast library if [ -n "${CLBLAST_LIBS}" ]; then + echo " CLBlast library: ${CLBLAST_LIBS}" BANDICOOT_LIBS="${CLBLAST_LIBS} ${BANDICOOT_LIBS}" fi From 36d0799041433e04b8d2f18b5cb93f0e8fbdd203 Mon Sep 17 00:00:00 2001 From: "james.balamuta@gmail.com" Date: Sun, 28 Dec 2025 01:26:02 -0600 Subject: [PATCH 06/31] Use Chocolatey to grab the Intel CPU emulation. --- .github/workflows/R-CMD-check.yaml | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 74ba440..7eb09d0 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -60,13 +60,8 @@ jobs: VCPKG_ROOT=$(cygpath -m "$VCPKG_INSTALLATION_ROOT") CLBLAST_PATH=$(cygpath -m "$USERPROFILE/CLBlast-1.6.3-windows-x64") - echo "OpenCL installed via vcpkg at: $VCPKG_ROOT" - echo "CLBlast at: $CLBLAST_PATH" - - # List installed files - ls -la "$VCPKG_INSTALLATION_ROOT/installed/x64-windows/include/" || true - ls -la "$VCPKG_INSTALLATION_ROOT/installed/x64-windows/lib/" || true - ls -la "$USERPROFILE/CLBlast-1.6.3-windows-x64/" || true + echo "vcpkg root: $VCPKG_ROOT" + echo "CLBlast path: $CLBLAST_PATH" # Set environment variables for R package build echo "OPENCL_CPPFLAGS=-I$VCPKG_ROOT/installed/x64-windows/include" >> $GITHUB_ENV @@ -74,9 +69,13 @@ jobs: echo "CLBLAST_CPPFLAGS=-I$CLBLAST_PATH/include" >> $GITHUB_ENV echo "CLBLAST_LIBS=-L$CLBLAST_PATH/lib -lclblast" >> $GITHUB_ENV - # Add DLLs to PATH for runtime - echo "$VCPKG_ROOT/installed/x64-windows/bin" >> $GITHUB_PATH - echo "$CLBLAST_PATH/lib" >> $GITHUB_PATH + # Add CLBlast DLL to PATH for runtime + CLBLAST_PATH_WIN=$(cygpath -w "$USERPROFILE/CLBlast-1.6.3-windows-x64/lib") + echo "$CLBLAST_PATH_WIN" >> $GITHUB_PATH + + - name: Install Intel OpenCL CPU Runtime (Windows) + if: runner.os == 'Windows' + 
run: choco install opencl-intel-cpu-runtime -y - uses: r-lib/actions/setup-r-dependencies@v2 with: From 98c6b1ab82e5d45e8b878a988426b59a516e46b7 Mon Sep 17 00:00:00 2001 From: "james.balamuta@gmail.com" Date: Sun, 28 Dec 2025 01:39:18 -0600 Subject: [PATCH 07/31] Add a _lot_ more debug statements. Move OpenCL Emulation to front. Try to force add to Windows Path --- .github/workflows/R-CMD-check.yaml | 39 ++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 7eb09d0..1039536 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -45,6 +45,10 @@ jobs: if: runner.os == 'Linux' run: sudo apt-get update && sudo apt-get install -y opencl-headers ocl-icd-opencl-dev libclblast-dev + - name: Install Intel OpenCL CPU Runtime (Windows) + if: runner.os == 'Windows' + run: choco install opencl-intel-cpu-runtime -y + - name: Install OpenCL and CLBlast (Windows) if: runner.os == 'Windows' shell: bash @@ -69,13 +73,38 @@ jobs: echo "CLBLAST_CPPFLAGS=-I$CLBLAST_PATH/include" >> $GITHUB_ENV echo "CLBLAST_LIBS=-L$CLBLAST_PATH/lib -lclblast" >> $GITHUB_ENV - # Add CLBlast DLL to PATH for runtime - CLBLAST_PATH_WIN=$(cygpath -w "$USERPROFILE/CLBlast-1.6.3-windows-x64/lib") - echo "$CLBLAST_PATH_WIN" >> $GITHUB_PATH + # Add DLLs to PATH for runtime loading + # vcpkg OpenCL ICD loader + VCPKG_BIN=$(cygpath -w "$VCPKG_INSTALLATION_ROOT/installed/x64-windows/bin") + echo "$VCPKG_BIN" >> $GITHUB_PATH + # CLBlast DLL + CLBLAST_BIN=$(cygpath -w "$USERPROFILE/CLBlast-1.6.3-windows-x64/lib") + echo "$CLBLAST_BIN" >> $GITHUB_PATH - - name: Install Intel OpenCL CPU Runtime (Windows) + # Debug: show DLL locations + echo "=== DLL locations ===" + ls -la "$VCPKG_INSTALLATION_ROOT/installed/x64-windows/bin/" || true + ls -la "$USERPROFILE/CLBlast-1.6.3-windows-x64/lib/" || true + + - name: Verify OpenCL installation (Windows) if: runner.os == 
'Windows' - run: choco install opencl-intel-cpu-runtime -y + shell: pwsh + run: | + Write-Host "=== Checking OpenCL ICD registry ===" + Get-ChildItem "HKLM:\SOFTWARE\Khronos\OpenCL\Vendors" -ErrorAction SilentlyContinue | ForEach-Object { + $name = $_.PSChildName + Write-Host " ICD: $name" + } + Write-Host "" + Write-Host "=== Checking for OpenCL.dll in PATH ===" + $opencl = Get-Command OpenCL.dll -ErrorAction SilentlyContinue + if ($opencl) { Write-Host " Found: $($opencl.Source)" } + else { Write-Host " OpenCL.dll not found in PATH" } + Write-Host "" + Write-Host "=== Checking for clblast.dll in PATH ===" + $clblast = Get-Command clblast.dll -ErrorAction SilentlyContinue + if ($clblast) { Write-Host " Found: $($clblast.Source)" } + else { Write-Host " clblast.dll not found in PATH" } - uses: r-lib/actions/setup-r-dependencies@v2 with: From de1c81f15cdf007fdbdd92897fc8ac2d80b4f6c7 Mon Sep 17 00:00:00 2001 From: James J Balamuta Date: Mon, 5 Jan 2026 12:04:23 -0600 Subject: [PATCH 08/31] Update R-CMD-check.yaml --- .github/workflows/R-CMD-check.yaml | 96 ++++++++++++------------------ 1 file changed, 39 insertions(+), 57 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 1039536..0466867 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -39,72 +39,54 @@ jobs: - name: Install clblast via Homebrew (macOS) if: runner.os == 'macOS' - run: brew install clblast clblas + run: brew install clblast clblas clinfo - name: Install clblast via apt (Ubuntu) if: runner.os == 'Linux' - run: sudo apt-get update && sudo apt-get install -y opencl-headers ocl-icd-opencl-dev libclblast-dev - - - name: Install Intel OpenCL CPU Runtime (Windows) + run: | + sudo apt-get update + sudo apt-get install -y \ + opencl-headers \ + ocl-icd-opencl-dev \ + libclblast-dev \ + clinfo + + - name: Cache Intel OpenCL Runtime if: runner.os == 'Windows' - run: choco install opencl-intel-cpu-runtime -y - - - 
name: Install OpenCL and CLBlast (Windows) + id: cache-opencl-win + uses: actions/cache@v5 + with: + # Default install location is x86 ... + path: C:\Program Files (x86)\Common Files\Intel\Shared Libraries + key: intel-opencl-runtime-2025.3.1 + + - name: Install OpenCL SDK (Windows) if: runner.os == 'Windows' - shell: bash + shell: pwsh run: | - # Install OpenCL ICD Loader via vcpkg vcpkg install opencl:x64-windows - - # Download CLBlast pre-built binaries - curl -L -o clblast.7z "https://github.com/CNugteren/CLBlast/releases/download/1.6.3/CLBlast-1.6.3-windows-x64.7z" - 7z x clblast.7z -o"$USERPROFILE" - - # Get vcpkg paths (use forward slashes for gcc) - VCPKG_ROOT=$(cygpath -m "$VCPKG_INSTALLATION_ROOT") - CLBLAST_PATH=$(cygpath -m "$USERPROFILE/CLBlast-1.6.3-windows-x64") - - echo "vcpkg root: $VCPKG_ROOT" - echo "CLBlast path: $CLBLAST_PATH" - - # Set environment variables for R package build - echo "OPENCL_CPPFLAGS=-I$VCPKG_ROOT/installed/x64-windows/include" >> $GITHUB_ENV - echo "OPENCL_LIBS=-L$VCPKG_ROOT/installed/x64-windows/lib -lOpenCL" >> $GITHUB_ENV - echo "CLBLAST_CPPFLAGS=-I$CLBLAST_PATH/include" >> $GITHUB_ENV - echo "CLBLAST_LIBS=-L$CLBLAST_PATH/lib -lclblast" >> $GITHUB_ENV - - # Add DLLs to PATH for runtime loading - # vcpkg OpenCL ICD loader - VCPKG_BIN=$(cygpath -w "$VCPKG_INSTALLATION_ROOT/installed/x64-windows/bin") - echo "$VCPKG_BIN" >> $GITHUB_PATH - # CLBlast DLL - CLBLAST_BIN=$(cygpath -w "$USERPROFILE/CLBlast-1.6.3-windows-x64/lib") - echo "$CLBLAST_BIN" >> $GITHUB_PATH - - # Debug: show DLL locations - echo "=== DLL locations ===" - ls -la "$VCPKG_INSTALLATION_ROOT/installed/x64-windows/bin/" || true - ls -la "$USERPROFILE/CLBlast-1.6.3-windows-x64/lib/" || true - - - name: Verify OpenCL installation (Windows) - if: runner.os == 'Windows' + $vcpkgPath = "$env:VCPKG_INSTALLATION_ROOT\installed\x64-windows" + echo "OCL=$vcpkgPath" >> $env:GITHUB_ENV + echo "OPENCL_INC=$vcpkgPath\include" >> $env:GITHUB_ENV + echo 
"OPENCL_LIB64=$vcpkgPath\lib" >> $env:GITHUB_ENV + + - name: Install Intel CPU Runtime (Windows) + if: runner.os == 'Windows' && steps.cache-opencl-win.outputs.cache-hit != 'true' shell: pwsh run: | - Write-Host "=== Checking OpenCL ICD registry ===" - Get-ChildItem "HKLM:\SOFTWARE\Khronos\OpenCL\Vendors" -ErrorAction SilentlyContinue | ForEach-Object { - $name = $_.PSChildName - Write-Host " ICD: $name" - } - Write-Host "" - Write-Host "=== Checking for OpenCL.dll in PATH ===" - $opencl = Get-Command OpenCL.dll -ErrorAction SilentlyContinue - if ($opencl) { Write-Host " Found: $($opencl.Source)" } - else { Write-Host " OpenCL.dll not found in PATH" } - Write-Host "" - Write-Host "=== Checking for clblast.dll in PATH ===" - $clblast = Get-Command clblast.dll -ErrorAction SilentlyContinue - if ($clblast) { Write-Host " Found: $($clblast.Source)" } - else { Write-Host " clblast.dll not found in PATH" } + $url = "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/ad824c04-01c8-4ae5-b5e8-164a04f67609/w_opencl_runtime_p_2025.3.1.762.exe" + Invoke-WebRequest -Uri $url -OutFile "$env:TEMP\ocl.exe" + Start-Process -FilePath "$env:TEMP\ocl.exe" -ArgumentList '-s','-a','--silent','--eula','accept' -Wait -NoNewWindow + + - name: Verify OpenCL installation + shell: bash + run: | + if command -v clinfo &> /dev/null; then + clinfo -l + else + echo "clinfo not available, skipping verification" + fi + - uses: r-lib/actions/setup-r-dependencies@v2 with: From 8967f8808a411f0ce0e9b2bae72a23a068f20acd Mon Sep 17 00:00:00 2001 From: James J Balamuta Date: Mon, 5 Jan 2026 12:14:26 -0600 Subject: [PATCH 09/31] Simplify OpenCL installation in R-CMD-check workflow Refactor OpenCL installation steps for Windows and remove verification step. 
--- .github/workflows/R-CMD-check.yaml | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 0466867..8831111 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -48,8 +48,7 @@ jobs: sudo apt-get install -y \ opencl-headers \ ocl-icd-opencl-dev \ - libclblast-dev \ - clinfo + libclblast-dev - name: Cache Intel OpenCL Runtime if: runner.os == 'Windows' @@ -75,18 +74,9 @@ jobs: shell: pwsh run: | $url = "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/ad824c04-01c8-4ae5-b5e8-164a04f67609/w_opencl_runtime_p_2025.3.1.762.exe" - Invoke-WebRequest -Uri $url -OutFile "$env:TEMP\ocl.exe" - Start-Process -FilePath "$env:TEMP\ocl.exe" -ArgumentList '-s','-a','--silent','--eula','accept' -Wait -NoNewWindow - - - name: Verify OpenCL installation - shell: bash - run: | - if command -v clinfo &> /dev/null; then - clinfo -l - else - echo "clinfo not available, skipping verification" - fi - + $outFile = "$env:TEMP\ocl.exe" + (New-Object System.Net.WebClient).DownloadFile($url, $outFile) + Start-Process -FilePath $outFile -ArgumentList '-s','-a','--silent','--eula','accept' -Wait -NoNewWindow - uses: r-lib/actions/setup-r-dependencies@v2 with: From 89fae3a11023f696bc88cb1d6651c2385a8c040d Mon Sep 17 00:00:00 2001 From: James J Balamuta Date: Mon, 5 Jan 2026 12:25:05 -0600 Subject: [PATCH 10/31] Simplify installation commands in R-CMD-check.yaml --- .github/workflows/R-CMD-check.yaml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 8831111..f2b1d2e 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -43,12 +43,7 @@ jobs: - name: Install clblast via apt (Ubuntu) if: runner.os == 'Linux' - run: | - sudo apt-get update - sudo apt-get install -y \ - opencl-headers \ - ocl-icd-opencl-dev 
\ - libclblast-dev + run: sudo apt-get update && sudo apt-get install -y opencl-headers ocl-icd-opencl-dev libclblast-dev - name: Cache Intel OpenCL Runtime if: runner.os == 'Windows' @@ -73,7 +68,7 @@ jobs: if: runner.os == 'Windows' && steps.cache-opencl-win.outputs.cache-hit != 'true' shell: pwsh run: | - $url = "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/ad824c04-01c8-4ae5-b5e8-164a04f67609/w_opencl_runtime_p_2025.3.1.762.exe" + $url = "https://github.com/coatless-actions/opencl-cpu-intel/releases/download/0.0.1/w_opencl_runtime_p_2025.3.1.762.exe" $outFile = "$env:TEMP\ocl.exe" (New-Object System.Net.WebClient).DownloadFile($url, $outFile) Start-Process -FilePath $outFile -ArgumentList '-s','-a','--silent','--eula','accept' -Wait -NoNewWindow From fca36f24aa06e17ab8dfc062819098823413e7f3 Mon Sep 17 00:00:00 2001 From: James J Balamuta Date: Mon, 5 Jan 2026 12:37:18 -0600 Subject: [PATCH 11/31] Modify R-CMD-check.yaml for Windows OpenCL setup Updated the R-CMD-check workflow to include Intel OpenCL installation steps for Windows and removed macOS and Ubuntu configurations. 
--- .github/workflows/R-CMD-check.yaml | 40 ++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index f2b1d2e..b2dfcec 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -18,13 +18,14 @@ jobs: fail-fast: false matrix: config: - - {os: macos-latest, r: 'release'} - - {os: ubuntu-latest, r: 'release'} + #- {os: macos-latest, r: 'release'} + #- {os: ubuntu-latest, r: 'release'} - {os: windows-latest, r: 'release'} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} R_KEEP_PKG_SOURCE: yes + INTEL_OPENCL_URL: "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b6dccdb7-b503-41ea-bd4b-a78e9c2d8dd6/w_opencl_runtime_p_2025.1.0.972.exe" steps: - uses: actions/checkout@v4 @@ -64,14 +65,37 @@ jobs: echo "OPENCL_INC=$vcpkgPath\include" >> $env:GITHUB_ENV echo "OPENCL_LIB64=$vcpkgPath\lib" >> $env:GITHUB_ENV - - name: Install Intel CPU Runtime (Windows) - if: runner.os == 'Windows' && steps.cache-opencl-win.outputs.cache-hit != 'true' + - name: Install Intel CPU Runtime for OpenCL (Windows) + if: runner.os == 'Windows' + shell: pwsh + run: | + # Download + curl -o opencl-installer.exe "${{ env.INTEL_OPENCL_URL }}" + + # Extract MSI from the self-extracting exe + $proc = Start-Process "./opencl-installer.exe" "-s -x -f extracted" -NoNewWindow -PassThru + $proc.WaitForExit() + + # Install via msiexec + $msi = Get-ChildItem ./extracted/*.msi | Select-Object -First 1 + $proc = Start-Process "msiexec" "/i `"$msi`" /qn /l*! 
install.log" -NoNewWindow -PassThru + $proc.WaitForExit() + + if ($proc.ExitCode -ne 0) { + Get-Content install.log + exit $proc.ExitCode + } + + Remove-Item -Recurse -Force extracted, opencl-installer.exe + + - name: Register Intel OpenCL ICD (Windows) + if: runner.os == 'Windows' shell: pwsh run: | - $url = "https://github.com/coatless-actions/opencl-cpu-intel/releases/download/0.0.1/w_opencl_runtime_p_2025.3.1.762.exe" - $outFile = "$env:TEMP\ocl.exe" - (New-Object System.Net.WebClient).DownloadFile($url, $outFile) - Start-Process -FilePath $outFile -ArgumentList '-s','-a','--silent','--eula','accept' -Wait -NoNewWindow + $dllPath = "C:\Program Files (x86)\Common Files\Intel\Shared Libraries\bin\OpenCL.dll" + REG ADD "HKLM\SOFTWARE\Khronos\OpenCL\Vendors" /v $dllPath /t REG_DWORD /d 0 /f + REG ADD "HKLM\SOFTWARE\Wow6432Node\Khronos\OpenCL\Vendors" /v $dllPath /t REG_DWORD /d 0 /f + Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Common Files\Intel\Shared Libraries\bin\" - uses: r-lib/actions/setup-r-dependencies@v2 with: From 5ea7992b7e19d4e597f4a8e036b35ca21fb85824 Mon Sep 17 00:00:00 2001 From: James J Balamuta Date: Mon, 5 Jan 2026 12:46:42 -0600 Subject: [PATCH 12/31] Modify OpenCL paths and add debug step Updated OpenCL environment variables and added debugging step. 
--- .github/workflows/R-CMD-check.yaml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index b2dfcec..0b04da1 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -62,8 +62,8 @@ jobs: vcpkg install opencl:x64-windows $vcpkgPath = "$env:VCPKG_INSTALLATION_ROOT\installed\x64-windows" echo "OCL=$vcpkgPath" >> $env:GITHUB_ENV - echo "OPENCL_INC=$vcpkgPath\include" >> $env:GITHUB_ENV - echo "OPENCL_LIB64=$vcpkgPath\lib" >> $env:GITHUB_ENV + echo "OPENCL_CPPFLAGS=-I$vcpkgPath/include" >> $env:GITHUB_ENV + echo "OPENCL_LIBS=-L$vcpkgPath/lib -lOpenCL" >> $env:GITHUB_ENV - name: Install Intel CPU Runtime for OpenCL (Windows) if: runner.os == 'Windows' @@ -97,6 +97,14 @@ jobs: REG ADD "HKLM\SOFTWARE\Wow6432Node\Khronos\OpenCL\Vendors" /v $dllPath /t REG_DWORD /d 0 /f Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Common Files\Intel\Shared Libraries\bin\" + - name: Debug OpenCL paths + if: runner.os == 'Windows' + shell: pwsh + run: | + echo "OPENCL_CPPFLAGS: $env:OPENCL_CPPFLAGS" + echo "OPENCL_LIBS: $env:OPENCL_LIBS" + ls "$env:VCPKG_INSTALLATION_ROOT\installed\x64-windows\include\CL" + - uses: r-lib/actions/setup-r-dependencies@v2 with: extra-packages: any::rcmdcheck From 3226f6bc63d55cb0a4c37c2d478736c0ee0aaf3b Mon Sep 17 00:00:00 2001 From: James J Balamuta Date: Mon, 5 Jan 2026 12:54:18 -0600 Subject: [PATCH 13/31] Fix vcpkg path to use forward slashes Update path format for vcpkg installation on Windows. 
--- .github/workflows/R-CMD-check.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 0b04da1..e3e81ae 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -60,8 +60,9 @@ jobs: shell: pwsh run: | vcpkg install opencl:x64-windows - $vcpkgPath = "$env:VCPKG_INSTALLATION_ROOT\installed\x64-windows" - echo "OCL=$vcpkgPath" >> $env:GITHUB_ENV + # Convert to forward slashes to avoid escaping issues in configure.win + $vcpkgPath = "$env:VCPKG_INSTALLATION_ROOT/installed/x64-windows" -replace '\\','/' + echo "OPENCL_CPPFLAGS=-I$vcpkgPath/include" >> $env:GITHUB_ENV echo "OPENCL_LIBS=-L$vcpkgPath/lib -lOpenCL" >> $env:GITHUB_ENV From 2b6db800582077f6a73f3846894479fedf13e21f Mon Sep 17 00:00:00 2001 From: James J Balamuta Date: Mon, 5 Jan 2026 13:02:38 -0600 Subject: [PATCH 14/31] Register CLBlast locations in workflow Added environment variable registrations for CLBlast. 
--- .github/workflows/R-CMD-check.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index e3e81ae..a40d9f1 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -62,9 +62,14 @@ jobs: vcpkg install opencl:x64-windows # Convert to forward slashes to avoid escaping issues in configure.win $vcpkgPath = "$env:VCPKG_INSTALLATION_ROOT/installed/x64-windows" -replace '\\','/' - + + # Register OpenCL location echo "OPENCL_CPPFLAGS=-I$vcpkgPath/include" >> $env:GITHUB_ENV echo "OPENCL_LIBS=-L$vcpkgPath/lib -lOpenCL" >> $env:GITHUB_ENV + + # Register CLBlast location + echo "CLBLAST_CPPFLAGS=-I$vcpkgPath/include" >> $env:GITHUB_ENV + echo "CLBLAST_LIBS=-L$vcpkgPath/lib -lclblast" >> $env:GITHUB_ENV - name: Install Intel CPU Runtime for OpenCL (Windows) if: runner.os == 'Windows' From b934a044806bf877bc61c3dd31651bd69e213ba9 Mon Sep 17 00:00:00 2001 From: James J Balamuta Date: Mon, 5 Jan 2026 13:08:46 -0600 Subject: [PATCH 15/31] Install clblast alongside OpenCL SDK on Windows --- .github/workflows/R-CMD-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index a40d9f1..cedea20 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -59,7 +59,7 @@ jobs: if: runner.os == 'Windows' shell: pwsh run: | - vcpkg install opencl:x64-windows + vcpkg install opencl:x64-windows clblast:x64-windows # Convert to forward slashes to avoid escaping issues in configure.win $vcpkgPath = "$env:VCPKG_INSTALLATION_ROOT/installed/x64-windows" -replace '\\','/' From bba0eceb341fb2d6392a5d75087b043bebef8c40 Mon Sep 17 00:00:00 2001 From: "james.balamuta@gmail.com" Date: Mon, 5 Jan 2026 13:25:44 -0600 Subject: [PATCH 16/31] missing clblast.dll and OpenCL.dll ? 
--- .github/workflows/R-CMD-check.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index cedea20..980245d 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -70,6 +70,9 @@ jobs: # Register CLBlast location echo "CLBLAST_CPPFLAGS=-I$vcpkgPath/include" >> $env:GITHUB_ENV echo "CLBLAST_LIBS=-L$vcpkgPath/lib -lclblast" >> $env:GITHUB_ENV + + # Add vcpkg bin to PATH so DLLs can be found at runtime + echo "$env:VCPKG_INSTALLATION_ROOT\installed\x64-windows\bin" >> $env:GITHUB_PATH - name: Install Intel CPU Runtime for OpenCL (Windows) if: runner.os == 'Windows' From cfc9252e40606fa9895b6025d420de1fbd31a386 Mon Sep 17 00:00:00 2001 From: "james.balamuta@gmail.com" Date: Mon, 5 Jan 2026 13:36:39 -0600 Subject: [PATCH 17/31] generate R/flags.R from R/flags.R.in with all required variable substitutions --- configure.win | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/configure.win b/configure.win index 63c11bb..baaeb5b 100755 --- a/configure.win +++ b/configure.win @@ -69,6 +69,7 @@ fi ## Always disable clBLAS (CLBlast is preferred) BANDICOOT_CXXFLAGS="${BANDICOOT_CXXFLAGS} -DCOOT_DONT_USE_CLBLAS" +HAVE_CLBLAS=0 ## Add common flags BANDICOOT_CXXFLAGS="${OPENMP_CXXFLAGS} -Wno-missing-braces ${BANDICOOT_CXXFLAGS}" @@ -121,3 +122,63 @@ else echo "ERROR: Failed to generate src/Makevars.win!" 
exit 1 fi + +## Generate R/flags.R from template +## Set variables for flags.R.in substitution +HAVE_CUDA=0 +HAVE_OPENMP=0 +DEFAULT_BACKEND="CL_BACKEND" +OPENCL_CXXFLAGS_FOR_R="" +OPENCL_LIBS_FOR_R="" +CUDA_CXXFLAGS="" +CUDA_LIBS="" +CLBLAST_CXXFLAGS_FOR_R="" +CLBLAST_LIBS_FOR_R="" +CLBLAS_CXXFLAGS="" +CLBLAS_LIBS="" +LAPACK_BLAS_LIBS="\$(LAPACK_LIBS) \$(BLAS_LIBS) \$(FLIBS)" +CLBLAST_PREFIX="" +CLBLAS_PREFIX="" +CUDA_HOME="" +SDKPATH="" + +## Set OpenCL flags for R if detected +if [ "${HAVE_OPENCL}" = "1" ]; then + OPENCL_CXXFLAGS_FOR_R="-DCOOT_USE_OPENCL ${OPENCL_CPPFLAGS}" + OPENCL_LIBS_FOR_R="${OPENCL_LIBS}" +fi + +## Set CLBlast flags for R if detected +if [ "${HAVE_CLBLAST}" = "1" ]; then + CLBLAST_CXXFLAGS_FOR_R="-DCOOT_USE_CLBLAST ${CLBLAST_CPPFLAGS}" + CLBLAST_LIBS_FOR_R="${CLBLAST_LIBS}" +fi + +if [ -f "R/flags.R.in" ]; then + sed -e "s|@BANDICOOT_CXXFLAGS@|${BANDICOOT_CXXFLAGS}|g" \ + -e "s|@BANDICOOT_LIBS@|${BANDICOOT_LIBS}|g" \ + -e "s|@OPENCL_CXXFLAGS@|${OPENCL_CXXFLAGS_FOR_R}|g" \ + -e "s|@OPENCL_LIBS@|${OPENCL_LIBS_FOR_R}|g" \ + -e "s|@CUDA_CXXFLAGS@|${CUDA_CXXFLAGS}|g" \ + -e "s|@CUDA_LIBS@|${CUDA_LIBS}|g" \ + -e "s|@CLBLAST_CXXFLAGS@|${CLBLAST_CXXFLAGS_FOR_R}|g" \ + -e "s|@CLBLAST_LIBS@|${CLBLAST_LIBS_FOR_R}|g" \ + -e "s|@CLBLAS_CXXFLAGS@|${CLBLAS_CXXFLAGS}|g" \ + -e "s|@CLBLAS_LIBS@|${CLBLAS_LIBS}|g" \ + -e "s|@LAPACK_BLAS_LIBS@|${LAPACK_BLAS_LIBS}|g" \ + -e "s|@HAVE_OPENCL@|${HAVE_OPENCL}|g" \ + -e "s|@HAVE_CUDA@|${HAVE_CUDA}|g" \ + -e "s|@HAVE_CLBLAST@|${HAVE_CLBLAST}|g" \ + -e "s|@HAVE_CLBLAS@|${HAVE_CLBLAS}|g" \ + -e "s|@HAVE_OPENMP@|${HAVE_OPENMP}|g" \ + -e "s|@DEFAULT_BACKEND@|${DEFAULT_BACKEND}|g" \ + -e "s|@CLBLAST_PREFIX@|${CLBLAST_PREFIX}|g" \ + -e "s|@CLBLAS_PREFIX@|${CLBLAS_PREFIX}|g" \ + -e "s|@CUDA_HOME@|${CUDA_HOME}|g" \ + -e "s|@SDKPATH@|${SDKPATH}|g" \ + R/flags.R.in > R/flags.R + echo "Generated R/flags.R" +else + echo "ERROR: R/flags.R.in not found!" 
+ exit 1 +fi From 823a2c117855fcf121fb451b4446bcfb0f4228b5 Mon Sep 17 00:00:00 2001 From: "james.balamuta@gmail.com" Date: Mon, 5 Jan 2026 13:49:39 -0600 Subject: [PATCH 18/31] Remove non-portable flag. --- configure.win | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.win b/configure.win index baaeb5b..f9a9600 100755 --- a/configure.win +++ b/configure.win @@ -72,7 +72,7 @@ BANDICOOT_CXXFLAGS="${BANDICOOT_CXXFLAGS} -DCOOT_DONT_USE_CLBLAS" HAVE_CLBLAS=0 ## Add common flags -BANDICOOT_CXXFLAGS="${OPENMP_CXXFLAGS} -Wno-missing-braces ${BANDICOOT_CXXFLAGS}" +BANDICOOT_CXXFLAGS="${OPENMP_CXXFLAGS} ${BANDICOOT_CXXFLAGS}" ## Add R's LAPACK/BLAS BANDICOOT_LIBS="${BANDICOOT_LIBS} \$(LAPACK_LIBS) \$(BLAS_LIBS) \$(FLIBS)" From a5231d57964ee40f319dd6cf3776ba43ab67a608 Mon Sep 17 00:00:00 2001 From: "james.balamuta@gmail.com" Date: Mon, 5 Jan 2026 13:50:10 -0600 Subject: [PATCH 19/31] Standardize line endings to avoid complaints from the Windows CI runs... --- .Rbuildignore | 2 ++ .gitattributes | 6 ++++++ 2 files changed, 8 insertions(+) create mode 100644 .gitattributes diff --git a/.Rbuildignore b/.Rbuildignore index 1d20737..f4d2368 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -6,3 +6,5 @@ ^autom4te\.cache$ ^config\.log$ ^config\.status$ +^install\.log$ +^\.gitattributes$ diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..c6bc410 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,6 @@ +# Ensure shell scripts and configure files always use LF line endings +configure text eol=lf +configure.ac text eol=lf +configure.win text eol=lf +cleanup text eol=lf +*.sh text eol=lf From 567be6d102d028c0440e5a080f67aabf4937fed0 Mon Sep 17 00:00:00 2001 From: James J Balamuta Date: Tue, 6 Jan 2026 01:08:05 -0600 Subject: [PATCH 20/31] Update configure.win Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- configure.win | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.win 
b/configure.win index f9a9600..8c8f4c0 100755 --- a/configure.win +++ b/configure.win @@ -64,7 +64,7 @@ fi ## Check for CLBlast library if [ -n "${CLBLAST_LIBS}" ]; then echo " CLBlast library: ${CLBLAST_LIBS}" - BANDICOOT_LIBS="${CLBLAST_LIBS} ${BANDICOOT_LIBS}" + BANDICOOT_LIBS="${BANDICOOT_LIBS} ${CLBLAST_LIBS}" fi ## Always disable clBLAS (CLBlast is preferred) From f6af274b9025f3394e20ca0f878ec505631df145 Mon Sep 17 00:00:00 2001 From: James J Balamuta Date: Tue, 6 Jan 2026 01:08:13 -0600 Subject: [PATCH 21/31] Update configure.win Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- configure.win | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.win b/configure.win index 8c8f4c0..63aa438 100755 --- a/configure.win +++ b/configure.win @@ -126,7 +126,7 @@ fi ## Generate R/flags.R from template ## Set variables for flags.R.in substitution HAVE_CUDA=0 -HAVE_OPENMP=0 +HAVE_OPENMP=1 DEFAULT_BACKEND="CL_BACKEND" OPENCL_CXXFLAGS_FOR_R="" OPENCL_LIBS_FOR_R="" From e001f022bc4a5c61f8eae0f8194fc74884b1884b Mon Sep 17 00:00:00 2001 From: James J Balamuta Date: Tue, 6 Jan 2026 01:08:48 -0600 Subject: [PATCH 22/31] Update .Rbuildignore Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .Rbuildignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.Rbuildignore b/.Rbuildignore index f4d2368..a981b71 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -6,5 +6,4 @@ ^autom4te\.cache$ ^config\.log$ ^config\.status$ -^install\.log$ ^\.gitattributes$ From 6bc4b9694adf0ecae5026c97cade97754fa0696a Mon Sep 17 00:00:00 2001 From: James J Balamuta Date: Tue, 6 Jan 2026 01:09:29 -0600 Subject: [PATCH 23/31] Update src/Makevars.win.in Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/Makevars.win.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makevars.win.in b/src/Makevars.win.in index 45027a8..ccdda30 100644 --- a/src/Makevars.win.in +++ b/src/Makevars.win.in @@ -3,7 
+3,7 @@ ## This file is processed by configure.win to generate Makevars.win ## It includes GPU backend configuration (OpenCL, CLBlast) for Windows -PKG_CPPFLAGS = -I../inst/include -DCOOT_TARGET_OPENCL_VERSION=@OPENCL_TARGET_VERSION@ -DCOOT_KERNEL_SOURCE_DIR='"@BANDICOOT_KERNELS_DIR@"' +PKG_CPPFLAGS = -I../inst/include -DCOOT_TARGET_OPENCL_VERSION=@OPENCL_TARGET_VERSION@ -DCOOT_KERNEL_SOURCE_DIR=\"@BANDICOOT_KERNELS_DIR@\" ## Compiler flags from configure.win PKG_CXXFLAGS = @BANDICOOT_CXXFLAGS@ From d394482475153ae355d678e93383b6f86b40444a Mon Sep 17 00:00:00 2001 From: "james.balamuta@gmail.com" Date: Thu, 23 Apr 2026 23:42:01 -0500 Subject: [PATCH 24/31] Tighten Windows CI caching and OpenCL gating Makes the Intel OpenCL runtime install skip on cache hit, pins the cache key to the installer URL version, drops the debug step, and requires both OPENCL_CPPFLAGS and OPENCL_LIBS before enabling COOT_USE_OPENCL. --- .github/workflows/R-CMD-check.yaml | 16 +++++----------- configure.win | 22 +++++++++------------- 2 files changed, 14 insertions(+), 24 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 980245d..fc3f68d 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -51,9 +51,11 @@ jobs: id: cache-opencl-win uses: actions/cache@v5 with: - # Default install location is x86 ... + # Default install location is x86 ... path: C:\Program Files (x86)\Common Files\Intel\Shared Libraries - key: intel-opencl-runtime-2025.3.1 + # Keep this version pinned to the INTEL_OPENCL_URL above so a URL + # bump invalidates the cache. 
+ key: intel-opencl-runtime-2025.1.0.972 - name: Install OpenCL SDK (Windows) if: runner.os == 'Windows' @@ -75,7 +77,7 @@ jobs: echo "$env:VCPKG_INSTALLATION_ROOT\installed\x64-windows\bin" >> $env:GITHUB_PATH - name: Install Intel CPU Runtime for OpenCL (Windows) - if: runner.os == 'Windows' + if: runner.os == 'Windows' && steps.cache-opencl-win.outputs.cache-hit != 'true' shell: pwsh run: | # Download @@ -106,14 +108,6 @@ jobs: REG ADD "HKLM\SOFTWARE\Wow6432Node\Khronos\OpenCL\Vendors" /v $dllPath /t REG_DWORD /d 0 /f Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Common Files\Intel\Shared Libraries\bin\" - - name: Debug OpenCL paths - if: runner.os == 'Windows' - shell: pwsh - run: | - echo "OPENCL_CPPFLAGS: $env:OPENCL_CPPFLAGS" - echo "OPENCL_LIBS: $env:OPENCL_LIBS" - ls "$env:VCPKG_INSTALLATION_ROOT\installed\x64-windows\include\CL" - - uses: r-lib/actions/setup-r-dependencies@v2 with: extra-packages: any::rcmdcheck diff --git a/configure.win b/configure.win index 63aa438..353e9b8 100755 --- a/configure.win +++ b/configure.win @@ -30,26 +30,22 @@ BANDICOOT_KERNELS_DIR=$("${R_HOME}/bin/Rscript" -e 'cat(paste(head(.libPaths(),1 ## Always disable CUDA on Windows (requires manual setup) BANDICOOT_CXXFLAGS="${BANDICOOT_CXXFLAGS} -DCOOT_DONT_USE_CUDA" -## Check for OpenCL headers via environment variable -if [ -n "${OPENCL_CPPFLAGS}" ]; then +## OpenCL requires both the headers (OPENCL_CPPFLAGS) and the link flags +## (OPENCL_LIBS). Enabling the backend with only one set would compile but +## fail to link, so gate COOT_USE_OPENCL on having both. 
+if [ -n "${OPENCL_CPPFLAGS}" ] && [ -n "${OPENCL_LIBS}" ]; then echo " OpenCL headers: found (via OPENCL_CPPFLAGS)" + echo " OpenCL library: found (via OPENCL_LIBS): ${OPENCL_LIBS}" BANDICOOT_CXXFLAGS="${BANDICOOT_CXXFLAGS} -DCOOT_USE_OPENCL ${OPENCL_CPPFLAGS}" + BANDICOOT_LIBS="${BANDICOOT_LIBS} ${OPENCL_LIBS}" HAVE_OPENCL=1 else - echo " OpenCL headers: not found" - echo " Set OPENCL_CPPFLAGS to specify OpenCL header location" + echo " OpenCL: not configured (need both OPENCL_CPPFLAGS and OPENCL_LIBS)" + [ -z "${OPENCL_CPPFLAGS}" ] && echo " missing OPENCL_CPPFLAGS" + [ -z "${OPENCL_LIBS}" ] && echo " missing OPENCL_LIBS" HAVE_OPENCL=0 fi -## Check for OpenCL library -if [ -n "${OPENCL_LIBS}" ]; then - echo " OpenCL library: found (via OPENCL_LIBS): ${OPENCL_LIBS}" - BANDICOOT_LIBS="${BANDICOOT_LIBS} ${OPENCL_LIBS}" -else - echo " OpenCL library: not found (OPENCL_LIBS not set)" - echo " WARNING: Linking will fail without OpenCL library!" -fi - ## Check for CLBlast via environment variable if [ -n "${CLBLAST_CPPFLAGS}" ]; then echo " CLBlast: found (via CLBLAST_CPPFLAGS)" From 3f641bce2cae67557071100a00a70f599df49b5d Mon Sep 17 00:00:00 2001 From: "james.balamuta@gmail.com" Date: Thu, 23 Apr 2026 23:44:09 -0500 Subject: [PATCH 25/31] Derive Intel OpenCL cache key from the runtime URL A URL bump now auto-invalidates the cache instead of silently reusing a stale install because we forgot to edit the key. --- .github/workflows/R-CMD-check.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index fc3f68d..782fad7 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -53,9 +53,9 @@ jobs: with: # Default install location is x86 ... path: C:\Program Files (x86)\Common Files\Intel\Shared Libraries - # Keep this version pinned to the INTEL_OPENCL_URL above so a URL - # bump invalidates the cache. 
- key: intel-opencl-runtime-2025.1.0.972 + # Key is derived from INTEL_OPENCL_URL, so bumping the runtime + # URL above automatically invalidates the cache. + key: intel-opencl-runtime-${{ env.INTEL_OPENCL_URL }} - name: Install OpenCL SDK (Windows) if: runner.os == 'Windows' From f89f07dfa6c8a1dbc001ea70a0e15fc828c5d461 Mon Sep 17 00:00:00 2001 From: "james.balamuta@gmail.com" Date: Fri, 24 Apr 2026 00:02:57 -0500 Subject: [PATCH 26/31] Fix stale bandicoot kernels and package-dir artifacts The upstream-update workflow renamed kernels/ into a pre-existing ks/ during the 4.0.0 bump, so main has been shipping 3.1.0 kernel sources under ks/{cuda,opencl}/ alongside 4.0.0 headers, with the real 4.0.0 kernels nested at ks/kernels/{cuda,opencl}/ (tripping CRAN's 100-byte path limit). Cleans out the stale dirs, promotes the 4.0.0 kernels to their intended location, and wipes bandicoot_bits/ before each re-copy so this cannot recur. Also redirects the Windows MSI install log to RUNNER_TEMP and adds it to .Rbuildignore so it no longer appears as a top-level non-standard file. 
--- .Rbuildignore | 1 + .github/workflows/R-CMD-check.yaml | 9 +- .github/workflows/upstream-update.yml | 6 + .../ks/{kernels => }/cuda/defs/c_defs.cu | 0 .../ks/cuda/defs/cuda_prelims.cu | 5 + .../ks/{kernels => }/cuda/defs/z_defs.cu | 0 .../bandicoot_bits/ks/cuda/oneway/fill.cu | 31 -- .../ks/cuda/oneway/fill_sve1.cu | 28 -- .../ks/cuda/oneway/fill_sve2.cu | 35 -- .../ks/cuda/threeway/equ_array_atan2.cu | 40 --- .../ks/cuda/threeway/equ_array_div_array.cu | 40 --- .../cuda/threeway/equ_array_div_array_cube.cu | 43 --- .../ks/cuda/threeway/equ_array_hypot.cu | 40 --- .../ks/cuda/threeway/equ_array_max_array.cu | 40 --- .../ks/cuda/threeway/equ_array_min_array.cu | 40 --- .../ks/cuda/threeway/equ_array_minus_array.cu | 40 --- .../threeway/equ_array_minus_array_cube.cu | 43 --- .../ks/cuda/threeway/equ_array_mul_array.cu | 40 --- .../cuda/threeway/equ_array_mul_array_cube.cu | 43 --- .../ks/cuda/threeway/equ_array_plus_array.cu | 40 --- .../threeway/equ_array_plus_array_cube.cu | 43 --- .../ks/cuda/twoway/convert_type.cu | 35 -- .../ks/cuda/twoway/convert_type_cube.cu | 42 --- .../ks/cuda/twoway/equ_array_abs.cu | 43 --- .../ks/cuda/twoway/equ_array_acos_post.cu | 44 --- .../ks/cuda/twoway/equ_array_acos_pre.cu | 44 --- .../ks/cuda/twoway/equ_array_acosh_post.cu | 44 --- .../ks/cuda/twoway/equ_array_acosh_pre.cu | 44 --- .../ks/cuda/twoway/equ_array_asin_post.cu | 44 --- .../ks/cuda/twoway/equ_array_asin_pre.cu | 44 --- .../ks/cuda/twoway/equ_array_asinh_post.cu | 44 --- .../ks/cuda/twoway/equ_array_asinh_pre.cu | 44 --- .../ks/cuda/twoway/equ_array_atan_post.cu | 44 --- .../ks/cuda/twoway/equ_array_atan_pre.cu | 44 --- .../ks/cuda/twoway/equ_array_atanh_post.cu | 44 --- .../ks/cuda/twoway/equ_array_atanh_pre.cu | 44 --- .../ks/cuda/twoway/equ_array_ceil_post.cu | 51 --- .../ks/cuda/twoway/equ_array_ceil_pre.cu | 51 --- .../ks/cuda/twoway/equ_array_cos_post.cu | 44 --- .../ks/cuda/twoway/equ_array_cos_pre.cu | 44 --- .../ks/cuda/twoway/equ_array_cosh_post.cu | 44 
--- .../ks/cuda/twoway/equ_array_cosh_pre.cu | 44 --- .../cuda/twoway/equ_array_div_scalar_post.cu | 40 --- .../twoway/equ_array_div_scalar_post_sve1.cu | 31 -- .../twoway/equ_array_div_scalar_post_sve2.cu | 42 --- .../cuda/twoway/equ_array_div_scalar_pre.cu | 53 --- .../twoway/equ_array_div_scalar_pre_sve1.cu | 44 --- .../twoway/equ_array_div_scalar_pre_sve2.cu | 55 --- .../ks/cuda/twoway/equ_array_erf_post.cu | 44 --- .../ks/cuda/twoway/equ_array_erf_pre.cu | 44 --- .../ks/cuda/twoway/equ_array_erfc_post.cu | 44 --- .../ks/cuda/twoway/equ_array_erfc_pre.cu | 44 --- .../ks/cuda/twoway/equ_array_exp10_post.cu | 43 --- .../ks/cuda/twoway/equ_array_exp10_pre.cu | 43 --- .../ks/cuda/twoway/equ_array_exp2_post.cu | 43 --- .../ks/cuda/twoway/equ_array_exp2_pre.cu | 43 --- .../ks/cuda/twoway/equ_array_exp_post.cu | 43 --- .../ks/cuda/twoway/equ_array_exp_pre.cu | 43 --- .../ks/cuda/twoway/equ_array_floor_post.cu | 51 --- .../ks/cuda/twoway/equ_array_floor_pre.cu | 51 --- .../ks/cuda/twoway/equ_array_lgamma_post.cu | 44 --- .../ks/cuda/twoway/equ_array_lgamma_pre.cu | 44 --- .../ks/cuda/twoway/equ_array_log10_post.cu | 43 --- .../ks/cuda/twoway/equ_array_log10_pre.cu | 43 --- .../ks/cuda/twoway/equ_array_log2_post.cu | 43 --- .../ks/cuda/twoway/equ_array_log2_pre.cu | 43 --- .../ks/cuda/twoway/equ_array_log_post.cu | 43 --- .../ks/cuda/twoway/equ_array_log_pre.cu | 43 --- .../cuda/twoway/equ_array_max_array_cube.cu | 43 --- .../cuda/twoway/equ_array_min_array_cube.cu | 43 --- .../twoway/equ_array_minus_scalar_post.cu | 40 --- .../equ_array_minus_scalar_post_sve1.cu | 31 -- .../equ_array_minus_scalar_post_sve2.cu | 42 --- .../twoway/equ_array_minus_scalar_pre_post.cu | 42 --- .../equ_array_minus_scalar_pre_post_sve1.cu | 33 -- .../equ_array_minus_scalar_pre_post_sve2.cu | 42 --- .../twoway/equ_array_minus_scalar_pre_pre.cu | 42 --- .../equ_array_minus_scalar_pre_pre_sve1.cu | 33 -- .../equ_array_minus_scalar_pre_pre_sve2.cu | 42 --- 
.../ks/cuda/twoway/equ_array_mod_scalar.cu | 42 --- .../ks/cuda/twoway/equ_array_mul_scalar.cu | 40 --- .../cuda/twoway/equ_array_mul_scalar_sve1.cu | 31 -- .../cuda/twoway/equ_array_mul_scalar_sve2.cu | 42 --- .../ks/cuda/twoway/equ_array_neg_post.cu | 43 --- .../ks/cuda/twoway/equ_array_neg_pre.cu | 43 --- .../ks/cuda/twoway/equ_array_plus_scalar.cu | 40 --- .../cuda/twoway/equ_array_plus_scalar_sve1.cu | 31 -- .../cuda/twoway/equ_array_plus_scalar_sve2.cu | 42 --- .../ks/cuda/twoway/equ_array_pow_post.cu | 44 --- .../ks/cuda/twoway/equ_array_pow_pre.cu | 44 --- .../ks/cuda/twoway/equ_array_round_post.cu | 51 --- .../ks/cuda/twoway/equ_array_round_pre.cu | 51 --- .../ks/cuda/twoway/equ_array_sign_post.cu | 55 --- .../ks/cuda/twoway/equ_array_sign_pre.cu | 55 --- .../ks/cuda/twoway/equ_array_sin_post.cu | 44 --- .../ks/cuda/twoway/equ_array_sin_pre.cu | 44 --- .../ks/cuda/twoway/equ_array_sinc_post.cu | 55 --- .../ks/cuda/twoway/equ_array_sinc_pre.cu | 55 --- .../ks/cuda/twoway/equ_array_sinh_post.cu | 44 --- .../ks/cuda/twoway/equ_array_sinh_pre.cu | 44 --- .../ks/cuda/twoway/equ_array_sqrt_post.cu | 43 --- .../ks/cuda/twoway/equ_array_sqrt_pre.cu | 43 --- .../ks/cuda/twoway/equ_array_square_post.cu | 43 --- .../ks/cuda/twoway/equ_array_square_pre.cu | 44 --- .../ks/cuda/twoway/equ_array_tan_post.cu | 44 --- .../ks/cuda/twoway/equ_array_tan_pre.cu | 44 --- .../ks/cuda/twoway/equ_array_tanh_post.cu | 44 --- .../ks/cuda/twoway/equ_array_tanh_pre.cu | 44 --- .../cuda/twoway/equ_array_trunc_exp_post.cu | 68 ---- .../ks/cuda/twoway/equ_array_trunc_exp_pre.cu | 68 ---- .../cuda/twoway/equ_array_trunc_log_post.cu | 76 ---- .../ks/cuda/twoway/equ_array_trunc_log_pre.cu | 76 ---- .../ks/cuda/twoway/equ_array_trunc_post.cu | 51 --- .../ks/cuda/twoway/equ_array_trunc_pre.cu | 51 --- .../ks/cuda/twoway/extract_sve1.cu | 31 -- .../ks/cuda/twoway/extract_sve2.cu | 39 -- .../ks/cuda/twoway/rel_and_array.cu | 29 -- .../ks/cuda/twoway/rel_eq_array.cu | 29 -- 
.../ks/cuda/twoway/rel_eq_scalar.cu | 28 -- .../ks/cuda/twoway/rel_gt_array.cu | 29 -- .../ks/cuda/twoway/rel_gt_scalar.cu | 28 -- .../ks/cuda/twoway/rel_gteq_array.cu | 29 -- .../ks/cuda/twoway/rel_gteq_scalar.cu | 28 -- .../ks/cuda/twoway/rel_lt_array.cu | 29 -- .../ks/cuda/twoway/rel_lt_scalar.cu | 28 -- .../ks/cuda/twoway/rel_lteq_array.cu | 29 -- .../ks/cuda/twoway/rel_lteq_scalar.cu | 28 -- .../ks/cuda/twoway/rel_neq_array.cu | 29 -- .../ks/cuda/twoway/rel_neq_scalar.cu | 28 -- .../ks/cuda/twoway/rel_or_array.cu | 29 -- .../bandicoot_bits/ks/cuda/twoway/replace.cu | 50 --- .../ks/kernels/cuda/defs/cuda_prelims.cu | 42 --- .../ks/kernels/cuda/defs/d_defs.cu | 41 --- .../ks/kernels/cuda/defs/f_defs.cu | 41 --- .../ks/kernels/cuda/defs/h_defs.cu | 66 ---- .../ks/kernels/cuda/defs/s16_defs.cu | 41 --- .../ks/kernels/cuda/defs/s32_defs.cu | 41 --- .../ks/kernels/cuda/defs/s64_defs.cu | 45 --- .../ks/kernels/cuda/defs/s8_defs.cu | 45 --- .../ks/kernels/cuda/defs/u16_defs.cu | 41 --- .../ks/kernels/cuda/defs/u32_defs.cu | 41 --- .../ks/kernels/cuda/defs/u64_defs.cu | 45 --- .../ks/kernels/cuda/defs/u8_defs.cu | 45 --- .../kernels/cuda/deps/accu_subgroup_reduce.cu | 25 -- .../cuda/deps/and_subgroup_reduce_u32.cu | 27 -- .../kernels/cuda/deps/max_subgroup_reduce.cu | 25 -- .../kernels/cuda/deps/min_subgroup_reduce.cu | 25 -- .../cuda/deps/or_subgroup_reduce_u32.cu | 27 -- .../kernels/cuda/deps/prod_subgroup_reduce.cu | 25 -- .../ks/kernels/cuda/deps/var_philox.cu | 54 --- .../ks/kernels/cuda/oneway/accu.cu | 60 ---- .../ks/kernels/cuda/oneway/accu_simple.cu | 32 -- .../ks/kernels/cuda/oneway/accu_small.cu | 52 --- .../ks/kernels/cuda/oneway/approx_equal.cu | 149 -------- .../kernels/cuda/oneway/approx_equal_cube.cu | 161 --------- .../cuda/oneway/approx_equal_cube_small.cu | 152 -------- .../kernels/cuda/oneway/approx_equal_small.cu | 140 -------- .../ks/kernels/cuda/oneway/count_nonzeros.cu | 103 ------ .../ks/kernels/cuda/oneway/find.cu | 60 ---- 
.../ks/kernels/cuda/oneway/find_first.cu | 66 ---- .../ks/kernels/cuda/oneway/find_last.cu | 87 ----- .../ks/kernels/cuda/oneway/index_max.cu | 147 -------- .../kernels/cuda/oneway/index_max_colwise.cu | 41 --- .../kernels/cuda/oneway/index_max_cube_col.cu | 41 --- .../kernels/cuda/oneway/index_max_rowwise.cu | 40 --- .../ks/kernels/cuda/oneway/index_max_small.cu | 94 ----- .../ks/kernels/cuda/oneway/index_min.cu | 147 -------- .../kernels/cuda/oneway/index_min_colwise.cu | 41 --- .../kernels/cuda/oneway/index_min_cube_col.cu | 41 --- .../kernels/cuda/oneway/index_min_rowwise.cu | 40 --- .../ks/kernels/cuda/oneway/index_min_small.cu | 94 ----- .../cuda/oneway/inplace_philox_randn.cu | 27 -- .../ks/kernels/cuda/oneway/inplace_set_eye.cu | 28 -- .../cuda/oneway/inplace_xorwow32_randi.cu | 28 -- .../cuda/oneway/inplace_xorwow32_randu.cu | 25 -- .../cuda/oneway/inplace_xorwow64_randi.cu | 28 -- .../cuda/oneway/inplace_xorwow64_randu.cu | 25 -- .../ks/kernels/cuda/oneway/linspace.cu | 28 -- .../ks/kernels/cuda/oneway/logspace.cu | 28 -- .../ks/kernels/cuda/oneway/ltri_set_zero.cu | 28 -- .../ks/kernels/cuda/oneway/max.cu | 72 ---- .../ks/kernels/cuda/oneway/max_abs.cu | 72 ---- .../ks/kernels/cuda/oneway/max_abs_small.cu | 64 ---- .../ks/kernels/cuda/oneway/max_small.cu | 64 ---- .../ks/kernels/cuda/oneway/min.cu | 72 ---- .../ks/kernels/cuda/oneway/min_small.cu | 64 ---- .../ks/kernels/cuda/oneway/mul_colwise.cu | 38 -- .../kernels/cuda/oneway/mul_colwise_trans.cu | 38 -- .../ks/kernels/cuda/oneway/mul_rowwise.cu | 38 -- .../kernels/cuda/oneway/mul_rowwise_trans.cu | 38 -- .../ks/kernels/cuda/oneway/prod.cu | 60 ---- .../ks/kernels/cuda/oneway/prod_small.cu | 52 --- .../ks/kernels/cuda/oneway/radix_sort_asc.cu | 281 --------------- .../cuda/oneway/radix_sort_colwise_asc.cu | 127 ------- .../cuda/oneway/radix_sort_colwise_desc.cu | 128 ------- .../ks/kernels/cuda/oneway/radix_sort_desc.cu | 281 --------------- .../cuda/oneway/radix_sort_index_asc.cu | 321 
----------------- .../cuda/oneway/radix_sort_index_desc.cu | 321 ----------------- .../radix_sort_index_multi_wg_shuffle.cu | 27 -- .../oneway/radix_sort_multi_wg_bit_count.cu | 24 -- .../oneway/radix_sort_multi_wg_shuffle.cu | 25 -- .../cuda/oneway/radix_sort_rowwise_asc.cu | 131 ------- .../cuda/oneway/radix_sort_rowwise_desc.cu | 133 ------- .../ks/kernels/cuda/oneway/regspace_desc.cu | 28 -- .../ks/kernels/cuda/oneway/reorder_cols.cu | 36 -- .../ks/kernels/cuda/oneway/rotate_180.cu | 37 -- .../oneway/shifted_prefix_sum_add_offset.cu | 53 --- .../cuda/oneway/shifted_prefix_sum_small.cu | 81 ----- .../oneway/shifted_prefix_sum_subgroups.cu | 102 ------ .../ks/kernels/cuda/oneway/shuffle.cu | 90 ----- .../ks/kernels/cuda/oneway/shuffle_large.cu | 93 ----- .../oneway/stable_radix_sort_index_asc.cu | 262 -------------- .../oneway/stable_radix_sort_index_desc.cu | 256 -------------- .../ks/kernels/cuda/oneway/submat_var.cu | 80 ----- .../kernels/cuda/oneway/submat_var_small.cu | 72 ---- .../ks/kernels/cuda/oneway/symmatl_inplace.cu | 32 -- .../ks/kernels/cuda/oneway/symmatu_inplace.cu | 32 -- .../ks/kernels/cuda/oneway/trace.cu | 33 -- .../ks/kernels/cuda/oneway/var.cu | 64 ---- .../ks/kernels/cuda/oneway/var_colwise.cu | 41 --- .../ks/kernels/cuda/oneway/var_rowwise.cu | 40 --- .../ks/kernels/cuda/oneway/var_small.cu | 56 --- .../cuda/oneway_integral/and_reduce.cu | 79 ----- .../cuda/oneway_integral/and_reduce_small.cu | 54 --- .../kernels/cuda/oneway_integral/ipiv_det.cu | 70 ---- .../cuda/oneway_integral/ipiv_det_small.cu | 58 --- .../kernels/cuda/oneway_integral/or_reduce.cu | 78 ---- .../cuda/oneway_integral/or_reduce_small.cu | 54 --- .../ks/kernels/cuda/oneway_real/diag_prod.cu | 71 ---- .../cuda/oneway_real/diag_prod_small.cu | 59 ---- .../ks/kernels/cuda/oneway_real/extract_cx.cu | 38 -- .../kernels/cuda/oneway_real/lu_extract_l.cu | 44 --- .../kernels/cuda/oneway_real/lu_extract_p.cu | 28 -- .../cuda/oneway_real/lu_extract_pivoted_l.cu | 50 --- 
.../kernels/cuda/oneway_real/rel_any_inf.cu | 69 ---- .../cuda/oneway_real/rel_any_inf_small.cu | 63 ---- .../kernels/cuda/oneway_real/rel_any_nan.cu | 69 ---- .../cuda/oneway_real/rel_any_nan_small.cu | 63 ---- .../cuda/oneway_real/rel_any_nonfinite.cu | 69 ---- .../oneway_real/rel_any_nonfinite_small.cu | 63 ---- .../kernels/cuda/oneway_real/rel_isfinite.cu | 27 -- .../ks/kernels/cuda/oneway_real/rel_isnan.cu | 27 -- .../cuda/oneway_real/rel_isnonfinite.cu | 27 -- .../ks/kernels/cuda/oneway_real/vec_norm_1.cu | 67 ---- .../cuda/oneway_real/vec_norm_1_small.cu | 55 --- .../ks/kernels/cuda/oneway_real/vec_norm_2.cu | 63 ---- .../cuda/oneway_real/vec_norm_2_robust.cu | 64 ---- .../oneway_real/vec_norm_2_robust_small.cu | 56 --- .../cuda/oneway_real/vec_norm_2_small.cu | 55 --- .../ks/kernels/cuda/oneway_real/vec_norm_k.cu | 65 ---- .../cuda/oneway_real/vec_norm_k_small.cu | 56 --- .../kernels/cuda/oneway_real/vec_norm_min.cu | 80 ----- .../cuda/oneway_real/vec_norm_min_small.cu | 68 ---- .../kernels/cuda/twoway/broadcast_div_post.cu | 42 --- .../kernels/cuda/twoway/broadcast_div_pre.cu | 42 --- .../cuda/twoway/broadcast_minus_post.cu | 42 --- .../cuda/twoway/broadcast_minus_pre.cu | 42 --- .../ks/kernels/cuda/twoway/broadcast_plus.cu | 42 --- .../ks/kernels/cuda/twoway/broadcast_schur.cu | 42 --- .../ks/kernels/cuda/twoway/broadcast_set.cu | 41 --- .../cuda/twoway/broadcast_subset_div_post.cu | 51 --- .../cuda/twoway/broadcast_subset_div_pre.cu | 51 --- .../twoway/broadcast_subset_minus_post.cu | 51 --- .../cuda/twoway/broadcast_subset_minus_pre.cu | 51 --- .../cuda/twoway/broadcast_subset_plus.cu | 51 --- .../cuda/twoway/broadcast_subset_schur.cu | 51 --- .../cuda/twoway/broadcast_subset_set.cu | 46 --- .../ks/kernels/cuda/twoway/clamp.cu | 36 -- .../ks/kernels/cuda/twoway/cross.cu | 36 -- .../ks/kernels/cuda/twoway/dot.cu | 85 ----- .../ks/kernels/cuda/twoway/dot_small.cu | 62 ---- .../ks/kernels/cuda/twoway/htrans.cu | 34 -- 
.../cuda/twoway/inplace_sve1_div_array.cu | 28 -- .../cuda/twoway/inplace_sve1_div_sve1.cu | 29 -- .../cuda/twoway/inplace_sve1_eq_array.cu | 28 -- .../cuda/twoway/inplace_sve1_eq_sve1.cu | 29 -- .../cuda/twoway/inplace_sve1_minus_array.cu | 28 -- .../cuda/twoway/inplace_sve1_minus_sve1.cu | 29 -- .../cuda/twoway/inplace_sve1_mul_array.cu | 28 -- .../cuda/twoway/inplace_sve1_mul_sve1.cu | 29 -- .../cuda/twoway/inplace_sve1_plus_array.cu | 28 -- .../cuda/twoway/inplace_sve1_plus_sve1.cu | 29 -- .../cuda/twoway/inplace_sve2_div_array.cu | 36 -- .../cuda/twoway/inplace_sve2_div_sve2.cu | 40 --- .../cuda/twoway/inplace_sve2_eq_array.cu | 36 -- .../cuda/twoway/inplace_sve2_eq_sve2.cu | 40 --- .../cuda/twoway/inplace_sve2_minus_array.cu | 36 -- .../cuda/twoway/inplace_sve2_minus_sve2.cu | 40 --- .../cuda/twoway/inplace_sve2_mul_array.cu | 36 -- .../cuda/twoway/inplace_sve2_mul_sve2.cu | 40 --- .../cuda/twoway/inplace_sve2_plus_array.cu | 36 -- .../cuda/twoway/inplace_sve2_plus_sve2.cu | 40 --- .../cuda/twoway/max_colwise_conv_post.cu | 36 -- .../cuda/twoway/max_colwise_conv_pre.cu | 36 -- .../cuda/twoway/max_cube_col_conv_post.cu | 36 -- .../cuda/twoway/max_cube_col_conv_pre.cu | 36 -- .../cuda/twoway/max_rowwise_conv_post.cu | 35 -- .../cuda/twoway/max_rowwise_conv_pre.cu | 35 -- .../cuda/twoway/mean_colwise_conv_post.cu | 36 -- .../cuda/twoway/mean_colwise_conv_pre.cu | 36 -- .../cuda/twoway/mean_rowwise_conv_post.cu | 35 -- .../cuda/twoway/mean_rowwise_conv_pre.cu | 35 -- .../cuda/twoway/min_colwise_conv_post.cu | 36 -- .../cuda/twoway/min_colwise_conv_pre.cu | 36 -- .../cuda/twoway/min_cube_col_conv_post.cu | 36 -- .../cuda/twoway/min_cube_col_conv_pre.cu | 36 -- .../cuda/twoway/min_rowwise_conv_post.cu | 35 -- .../cuda/twoway/min_rowwise_conv_pre.cu | 35 -- .../ks/kernels/cuda/twoway/rel_all_neq.cu | 68 ---- .../cuda/twoway/rel_all_neq_colwise.cu | 36 -- .../cuda/twoway/rel_all_neq_rowwise.cu | 35 -- .../kernels/cuda/twoway/rel_all_neq_small.cu | 59 ---- 
.../ks/kernels/cuda/twoway/rel_any_neq.cu | 71 ---- .../cuda/twoway/rel_any_neq_colwise.cu | 38 -- .../cuda/twoway/rel_any_neq_rowwise.cu | 37 -- .../kernels/cuda/twoway/rel_any_neq_small.cu | 63 ---- .../ks/kernels/cuda/twoway/strans.cu | 33 -- .../cuda/twoway/sum_colwise_conv_post.cu | 36 -- .../cuda/twoway/sum_colwise_conv_pre.cu | 36 -- .../cuda/twoway/sum_rowwise_conv_post.cu | 35 -- .../cuda/twoway/sum_rowwise_conv_pre.cu | 35 -- .../ks/kernels/cuda/twoway/symmatl.cu | 33 -- .../ks/kernels/cuda/twoway/symmatu.cu | 33 -- .../zeroway/shuffle_large_compute_locs.cu | 63 ---- .../ks/kernels/opencl/defs/d_defs.cl | 30 -- .../ks/kernels/opencl/defs/f_defs.cl | 30 -- .../ks/kernels/opencl/defs/h_defs.cl | 30 -- .../ks/kernels/opencl/defs/opencl_prelims.cl | 37 -- .../ks/kernels/opencl/defs/s16_defs.cl | 30 -- .../ks/kernels/opencl/defs/s32_defs.cl | 30 -- .../ks/kernels/opencl/defs/s64_defs.cl | 30 -- .../ks/kernels/opencl/defs/s8_defs.cl | 30 -- .../ks/kernels/opencl/defs/u16_defs.cl | 30 -- .../ks/kernels/opencl/defs/u32_defs.cl | 30 -- .../ks/kernels/opencl/defs/u64_defs.cl | 30 -- .../ks/kernels/opencl/defs/u8_defs.cl | 30 -- .../opencl/deps/accu_subgroup_reduce.cl | 115 ------ .../opencl/deps/and_subgroup_reduce_u32.cl | 114 ------ .../opencl/deps/max_subgroup_reduce.cl | 114 ------ .../opencl/deps/min_subgroup_reduce.cl | 113 ------ .../opencl/deps/or_subgroup_reduce_u32.cl | 114 ------ .../opencl/deps/prod_subgroup_reduce.cl | 115 ------ .../ks/kernels/opencl/deps/var_philox.cl | 56 --- .../opencl/magma_real/lansy_inf_lower.cl | 307 ---------------- .../opencl/magma_real/lansy_inf_upper.cl | 313 ---------------- .../opencl/magma_real/lansy_max_lower.cl | 85 ----- .../opencl/magma_real/lansy_max_upper.cl | 86 ----- .../kernels/opencl/magma_real/lascl_full.cl | 75 ---- .../kernels/opencl/magma_real/lascl_lower.cl | 77 ---- .../kernels/opencl/magma_real/lascl_upper.cl | 75 ---- .../opencl/magma_real/laset_band_lower.cl | 91 ----- 
.../opencl/magma_real/laset_band_upper.cl | 92 ----- .../kernels/opencl/magma_real/laset_full.cl | 100 ------ .../kernels/opencl/magma_real/laset_lower.cl | 99 ------ .../kernels/opencl/magma_real/laset_upper.cl | 99 ------ .../ks/kernels/opencl/magma_real/laswp.cl | 82 ----- .../transpose_inplace_even_magma.cl | 124 ------- .../magma_real/transpose_inplace_odd_magma.cl | 120 ------- .../opencl/magma_real/transpose_magma.cl | 138 -------- .../ks/kernels/opencl/oneway/accu.cl | 59 ---- .../ks/kernels/opencl/oneway/accu_simple.cl | 34 -- .../ks/kernels/opencl/oneway/accu_small.cl | 54 --- .../ks/kernels/opencl/oneway/approx_equal.cl | 147 -------- .../opencl/oneway/approx_equal_cube.cl | 159 --------- .../opencl/oneway/approx_equal_cube_small.cl | 154 -------- .../opencl/oneway/approx_equal_small.cl | 142 -------- .../kernels/opencl/oneway/count_nonzeros.cl | 103 ------ .../ks/kernels/opencl/oneway/find.cl | 62 ---- .../ks/kernels/opencl/oneway/find_first.cl | 68 ---- .../ks/kernels/opencl/oneway/find_last.cl | 89 ----- .../ks/kernels/opencl/oneway/index_max.cl | 333 ------------------ .../opencl/oneway/index_max_colwise.cl | 46 --- .../opencl/oneway/index_max_cube_col.cl | 45 --- .../opencl/oneway/index_max_rowwise.cl | 44 --- .../kernels/opencl/oneway/index_max_small.cl | 98 ------ .../ks/kernels/opencl/oneway/index_min.cl | 333 ------------------ .../opencl/oneway/index_min_colwise.cl | 46 --- .../opencl/oneway/index_min_cube_col.cl | 45 --- .../opencl/oneway/index_min_rowwise.cl | 44 --- .../kernels/opencl/oneway/index_min_small.cl | 98 ------ .../opencl/oneway/inplace_philox_randn.cl | 223 ------------ .../kernels/opencl/oneway/inplace_set_eye.cl | 29 -- .../opencl/oneway/inplace_xorwow32_randi.cl | 82 ----- .../opencl/oneway/inplace_xorwow32_randu.cl | 75 ---- .../opencl/oneway/inplace_xorwow64_randi.cl | 82 ----- .../opencl/oneway/inplace_xorwow64_randu.cl | 75 ---- .../ks/kernels/opencl/oneway/linspace.cl | 30 -- .../ks/kernels/opencl/oneway/logspace.cl | 30 
-- .../ks/kernels/opencl/oneway/ltri_set_zero.cl | 29 -- .../ks/kernels/opencl/oneway/max.cl | 72 ---- .../ks/kernels/opencl/oneway/max_abs.cl | 72 ---- .../ks/kernels/opencl/oneway/max_abs_small.cl | 67 ---- .../ks/kernels/opencl/oneway/max_small.cl | 67 ---- .../ks/kernels/opencl/oneway/min.cl | 72 ---- .../ks/kernels/opencl/oneway/min_small.cl | 67 ---- .../ks/kernels/opencl/oneway/mul_colwise.cl | 45 --- .../opencl/oneway/mul_colwise_trans.cl | 45 --- .../ks/kernels/opencl/oneway/mul_rowwise.cl | 45 --- .../opencl/oneway/mul_rowwise_trans.cl | 45 --- .../ks/kernels/opencl/oneway/prod.cl | 59 ---- .../ks/kernels/opencl/oneway/prod_small.cl | 54 --- .../kernels/opencl/oneway/radix_sort_asc.cl | 280 --------------- .../opencl/oneway/radix_sort_colwise_asc.cl | 128 ------- .../opencl/oneway/radix_sort_colwise_desc.cl | 129 ------- .../kernels/opencl/oneway/radix_sort_desc.cl | 279 --------------- .../opencl/oneway/radix_sort_index_asc.cl | 321 ----------------- .../opencl/oneway/radix_sort_index_desc.cl | 320 ----------------- .../radix_sort_index_multi_wg_shuffle.cl | 160 --------- .../oneway/radix_sort_multi_wg_bit_count.cl | 108 ------ .../oneway/radix_sort_multi_wg_shuffle.cl | 131 ------- .../opencl/oneway/radix_sort_rowwise_asc.cl | 132 ------- .../opencl/oneway/radix_sort_rowwise_desc.cl | 133 ------- .../ks/kernels/opencl/oneway/regspace_desc.cl | 30 -- .../ks/kernels/opencl/oneway/reorder_cols.cl | 41 --- .../ks/kernels/opencl/oneway/rotate_180.cl | 39 -- .../oneway/shifted_prefix_sum_add_offset.cl | 55 --- .../opencl/oneway/shifted_prefix_sum_small.cl | 82 ----- .../oneway/shifted_prefix_sum_subgroups.cl | 102 ------ .../ks/kernels/opencl/oneway/shuffle.cl | 87 ----- .../ks/kernels/opencl/oneway/shuffle_large.cl | 90 ----- .../oneway/stable_radix_sort_index_asc.cl | 260 -------------- .../oneway/stable_radix_sort_index_desc.cl | 260 -------------- .../ks/kernels/opencl/oneway/submat_var.cl | 79 ----- .../kernels/opencl/oneway/submat_var_small.cl | 74 ---- 
.../kernels/opencl/oneway/symmatl_inplace.cl | 33 -- .../kernels/opencl/oneway/symmatu_inplace.cl | 33 -- .../ks/kernels/opencl/oneway/trace.cl | 37 -- .../ks/kernels/opencl/oneway/var.cl | 63 ---- .../ks/kernels/opencl/oneway/var_colwise.cl | 44 --- .../ks/kernels/opencl/oneway/var_rowwise.cl | 43 --- .../ks/kernels/opencl/oneway/var_small.cl | 58 --- .../opencl/oneway_integral/and_reduce.cl | 166 --------- .../oneway_integral/and_reduce_small.cl | 57 --- .../opencl/oneway_integral/ipiv_det.cl | 25 -- .../opencl/oneway_integral/ipiv_det_small.cl | 25 -- .../opencl/oneway_integral/or_reduce.cl | 166 --------- .../opencl/oneway_integral/or_reduce_small.cl | 57 --- .../kernels/opencl/oneway_real/diag_prod.cl | 78 ---- .../opencl/oneway_real/diag_prod_small.cl | 61 ---- .../kernels/opencl/oneway_real/extract_cx.cl | 40 --- .../opencl/oneway_real/lu_extract_l.cl | 48 --- .../opencl/oneway_real/lu_extract_p.cl | 29 -- .../oneway_real/lu_extract_pivoted_l.cl | 53 --- .../kernels/opencl/oneway_real/rel_any_inf.cl | 68 ---- .../opencl/oneway_real/rel_any_inf_small.cl | 63 ---- .../kernels/opencl/oneway_real/rel_any_nan.cl | 68 ---- .../opencl/oneway_real/rel_any_nan_small.cl | 63 ---- .../opencl/oneway_real/rel_any_nonfinite.cl | 68 ---- .../oneway_real/rel_any_nonfinite_small.cl | 63 ---- .../opencl/oneway_real/rel_isfinite.cl | 29 -- .../kernels/opencl/oneway_real/rel_isnan.cl | 29 -- .../opencl/oneway_real/rel_isnonfinite.cl | 29 -- .../kernels/opencl/oneway_real/vec_norm_1.cl | 71 ---- .../opencl/oneway_real/vec_norm_1_small.cl | 54 --- .../kernels/opencl/oneway_real/vec_norm_2.cl | 63 ---- .../opencl/oneway_real/vec_norm_2_robust.cl | 65 ---- .../oneway_real/vec_norm_2_robust_small.cl | 58 --- .../opencl/oneway_real/vec_norm_2_small.cl | 56 --- .../kernels/opencl/oneway_real/vec_norm_k.cl | 65 ---- .../opencl/oneway_real/vec_norm_k_small.cl | 58 --- .../opencl/oneway_real/vec_norm_min.cl | 83 ----- .../opencl/oneway_real/vec_norm_min_small.cl | 68 ---- 
.../opencl/twoway/broadcast_div_post.cl | 45 --- .../opencl/twoway/broadcast_div_pre.cl | 45 --- .../opencl/twoway/broadcast_minus_post.cl | 45 --- .../opencl/twoway/broadcast_minus_pre.cl | 45 --- .../kernels/opencl/twoway/broadcast_plus.cl | 45 --- .../kernels/opencl/twoway/broadcast_schur.cl | 45 --- .../ks/kernels/opencl/twoway/broadcast_set.cl | 44 --- .../twoway/broadcast_subset_div_post.cl | 55 --- .../opencl/twoway/broadcast_subset_div_pre.cl | 55 --- .../twoway/broadcast_subset_minus_post.cl | 55 --- .../twoway/broadcast_subset_minus_pre.cl | 55 --- .../opencl/twoway/broadcast_subset_plus.cl | 55 --- .../opencl/twoway/broadcast_subset_schur.cl | 55 --- .../opencl/twoway/broadcast_subset_set.cl | 50 --- .../ks/kernels/opencl/twoway/clamp.cl | 39 -- .../ks/kernels/opencl/twoway/cross.cl | 39 -- .../ks/kernels/opencl/twoway/dot.cl | 166 --------- .../ks/kernels/opencl/twoway/dot_small.cl | 56 --- .../ks/kernels/opencl/twoway/htrans.cl | 35 -- .../opencl/twoway/inplace_sve1_div_array.cl | 31 -- .../opencl/twoway/inplace_sve1_div_sve1.cl | 33 -- .../opencl/twoway/inplace_sve1_eq_array.cl | 31 -- .../opencl/twoway/inplace_sve1_eq_sve1.cl | 33 -- .../opencl/twoway/inplace_sve1_minus_array.cl | 31 -- .../opencl/twoway/inplace_sve1_minus_sve1.cl | 33 -- .../opencl/twoway/inplace_sve1_mul_array.cl | 31 -- .../opencl/twoway/inplace_sve1_mul_sve1.cl | 33 -- .../opencl/twoway/inplace_sve1_plus_array.cl | 31 -- .../opencl/twoway/inplace_sve1_plus_sve1.cl | 33 -- .../opencl/twoway/inplace_sve2_div_array.cl | 42 --- .../opencl/twoway/inplace_sve2_div_sve2.cl | 49 --- .../opencl/twoway/inplace_sve2_eq_array.cl | 42 --- .../opencl/twoway/inplace_sve2_eq_sve2.cl | 49 --- .../opencl/twoway/inplace_sve2_minus_array.cl | 42 --- .../opencl/twoway/inplace_sve2_minus_sve2.cl | 49 --- .../opencl/twoway/inplace_sve2_mul_array.cl | 42 --- .../opencl/twoway/inplace_sve2_mul_sve2.cl | 49 --- .../opencl/twoway/inplace_sve2_plus_array.cl | 42 --- 
.../opencl/twoway/inplace_sve2_plus_sve2.cl | 49 --- .../opencl/twoway/max_colwise_conv_post.cl | 40 --- .../opencl/twoway/max_colwise_conv_pre.cl | 40 --- .../opencl/twoway/max_cube_col_conv_post.cl | 37 -- .../opencl/twoway/max_cube_col_conv_pre.cl | 40 --- .../opencl/twoway/max_rowwise_conv_post.cl | 36 -- .../opencl/twoway/max_rowwise_conv_pre.cl | 39 -- .../opencl/twoway/mean_colwise_conv_post.cl | 40 --- .../opencl/twoway/mean_colwise_conv_pre.cl | 40 --- .../opencl/twoway/mean_rowwise_conv_post.cl | 36 -- .../opencl/twoway/mean_rowwise_conv_pre.cl | 39 -- .../opencl/twoway/min_colwise_conv_post.cl | 40 --- .../opencl/twoway/min_colwise_conv_pre.cl | 40 --- .../opencl/twoway/min_cube_col_conv_post.cl | 37 -- .../opencl/twoway/min_cube_col_conv_pre.cl | 40 --- .../opencl/twoway/min_rowwise_conv_post.cl | 36 -- .../opencl/twoway/min_rowwise_conv_pre.cl | 39 -- .../ks/kernels/opencl/twoway/rel_all_neq.cl | 65 ---- .../opencl/twoway/rel_all_neq_colwise.cl | 40 --- .../opencl/twoway/rel_all_neq_rowwise.cl | 36 -- .../opencl/twoway/rel_all_neq_small.cl | 60 ---- .../ks/kernels/opencl/twoway/rel_any_neq.cl | 68 ---- .../opencl/twoway/rel_any_neq_colwise.cl | 42 --- .../opencl/twoway/rel_any_neq_rowwise.cl | 38 -- .../opencl/twoway/rel_any_neq_small.cl | 63 ---- .../ks/kernels/opencl/twoway/repmat.cl | 43 --- .../ks/kernels/opencl/twoway/strans.cl | 35 -- .../opencl/twoway/sum_colwise_conv_post.cl | 40 --- .../opencl/twoway/sum_colwise_conv_pre.cl | 40 --- .../opencl/twoway/sum_rowwise_conv_post.cl | 36 -- .../opencl/twoway/sum_rowwise_conv_pre.cl | 39 -- .../ks/kernels/opencl/twoway/symmatl.cl | 35 -- .../ks/kernels/opencl/twoway/symmatu.cl | 35 -- .../zeroway/shuffle_large_compute_locs.cl | 61 ---- .../ks/{kernels => }/opencl/defs/c_defs.cl | 0 .../bandicoot_bits/ks/opencl/defs/d_defs.cl | 8 +- .../bandicoot_bits/ks/opencl/defs/f_defs.cl | 8 +- .../bandicoot_bits/ks/opencl/defs/h_defs.cl | 5 + .../bandicoot_bits/ks/opencl/defs/s16_defs.cl | 5 + 
.../bandicoot_bits/ks/opencl/defs/s32_defs.cl | 5 + .../bandicoot_bits/ks/opencl/defs/s64_defs.cl | 5 + .../bandicoot_bits/ks/opencl/defs/s8_defs.cl | 5 + .../bandicoot_bits/ks/opencl/defs/u16_defs.cl | 5 + .../bandicoot_bits/ks/opencl/defs/u32_defs.cl | 5 + .../bandicoot_bits/ks/opencl/defs/u64_defs.cl | 5 + .../bandicoot_bits/ks/opencl/defs/u8_defs.cl | 5 + .../ks/{kernels => }/opencl/defs/z_defs.cl | 0 .../bandicoot_bits/ks/opencl/oneway/fill.cl | 33 -- .../ks/opencl/oneway/fill_sve1.cl | 32 -- .../ks/opencl/oneway/fill_sve2.cl | 41 --- .../ks/opencl/threeway/equ_array_atan2.cl | 42 --- .../ks/opencl/threeway/equ_array_div_array.cl | 42 --- .../threeway/equ_array_div_array_cube.cl | 45 --- .../ks/opencl/threeway/equ_array_hypot.cl | 42 --- .../ks/opencl/threeway/equ_array_max_array.cl | 42 --- .../ks/opencl/threeway/equ_array_min_array.cl | 42 --- .../opencl/threeway/equ_array_minus_array.cl | 42 --- .../threeway/equ_array_minus_array_cube.cl | 45 --- .../ks/opencl/threeway/equ_array_mul_array.cl | 42 --- .../threeway/equ_array_mul_array_cube.cl | 45 --- .../opencl/threeway/equ_array_plus_array.cl | 42 --- .../threeway/equ_array_plus_array_cube.cl | 45 --- .../ks/opencl/twoway/convert_type.cl | 37 -- .../ks/opencl/twoway/convert_type_cube.cl | 50 --- .../ks/opencl/twoway/equ_array_abs.cl | 45 --- .../ks/opencl/twoway/equ_array_acos_post.cl | 46 --- .../ks/opencl/twoway/equ_array_acos_pre.cl | 46 --- .../ks/opencl/twoway/equ_array_acosh_post.cl | 46 --- .../ks/opencl/twoway/equ_array_acosh_pre.cl | 46 --- .../ks/opencl/twoway/equ_array_asin_post.cl | 46 --- .../ks/opencl/twoway/equ_array_asin_pre.cl | 46 --- .../ks/opencl/twoway/equ_array_asinh_post.cl | 46 --- .../ks/opencl/twoway/equ_array_asinh_pre.cl | 46 --- .../ks/opencl/twoway/equ_array_atan_post.cl | 46 --- .../ks/opencl/twoway/equ_array_atan_pre.cl | 46 --- .../ks/opencl/twoway/equ_array_atanh_post.cl | 46 --- .../ks/opencl/twoway/equ_array_atanh_pre.cl | 46 --- 
.../ks/opencl/twoway/equ_array_ceil_post.cl | 53 --- .../ks/opencl/twoway/equ_array_ceil_pre.cl | 53 --- .../ks/opencl/twoway/equ_array_cos_post.cl | 46 --- .../ks/opencl/twoway/equ_array_cos_pre.cl | 46 --- .../ks/opencl/twoway/equ_array_cosh_post.cl | 46 --- .../ks/opencl/twoway/equ_array_cosh_pre.cl | 46 --- .../twoway/equ_array_div_scalar_post.cl | 43 --- .../twoway/equ_array_div_scalar_post_sve1.cl | 36 -- .../twoway/equ_array_div_scalar_post_sve2.cl | 52 --- .../opencl/twoway/equ_array_div_scalar_pre.cl | 56 --- .../twoway/equ_array_div_scalar_pre_sve1.cl | 49 --- .../twoway/equ_array_div_scalar_pre_sve2.cl | 65 ---- .../ks/opencl/twoway/equ_array_erf_post.cl | 46 --- .../ks/opencl/twoway/equ_array_erf_pre.cl | 46 --- .../ks/opencl/twoway/equ_array_erfc_post.cl | 46 --- .../ks/opencl/twoway/equ_array_erfc_pre.cl | 46 --- .../ks/opencl/twoway/equ_array_exp10_post.cl | 45 --- .../ks/opencl/twoway/equ_array_exp10_pre.cl | 45 --- .../ks/opencl/twoway/equ_array_exp2_post.cl | 45 --- .../ks/opencl/twoway/equ_array_exp2_pre.cl | 45 --- .../ks/opencl/twoway/equ_array_exp_post.cl | 46 --- .../ks/opencl/twoway/equ_array_exp_pre.cl | 46 --- .../ks/opencl/twoway/equ_array_floor_post.cl | 53 --- .../ks/opencl/twoway/equ_array_floor_pre.cl | 53 --- .../ks/opencl/twoway/equ_array_lgamma_post.cl | 46 --- .../ks/opencl/twoway/equ_array_lgamma_pre.cl | 46 --- .../ks/opencl/twoway/equ_array_log10_post.cl | 45 --- .../ks/opencl/twoway/equ_array_log10_pre.cl | 45 --- .../ks/opencl/twoway/equ_array_log2_post.cl | 45 --- .../ks/opencl/twoway/equ_array_log2_pre.cl | 45 --- .../ks/opencl/twoway/equ_array_log_post.cl | 46 --- .../ks/opencl/twoway/equ_array_log_pre.cl | 46 --- .../opencl/twoway/equ_array_max_array_cube.cl | 45 --- .../opencl/twoway/equ_array_min_array_cube.cl | 45 --- .../twoway/equ_array_minus_scalar_post.cl | 43 --- .../equ_array_minus_scalar_post_sve1.cl | 36 -- .../equ_array_minus_scalar_post_sve2.cl | 52 --- .../twoway/equ_array_minus_scalar_pre_post.cl | 45 --- 
.../equ_array_minus_scalar_pre_post_sve1.cl | 38 -- .../equ_array_minus_scalar_pre_post_sve2.cl | 52 --- .../twoway/equ_array_minus_scalar_pre_pre.cl | 45 --- .../equ_array_minus_scalar_pre_pre_sve1.cl | 38 -- .../equ_array_minus_scalar_pre_pre_sve2.cl | 52 --- .../ks/opencl/twoway/equ_array_mod_scalar.cl | 39 -- .../ks/opencl/twoway/equ_array_mul_scalar.cl | 43 --- .../twoway/equ_array_mul_scalar_sve1.cl | 36 -- .../twoway/equ_array_mul_scalar_sve2.cl | 52 --- .../ks/opencl/twoway/equ_array_neg_post.cl | 46 --- .../ks/opencl/twoway/equ_array_neg_pre.cl | 46 --- .../ks/opencl/twoway/equ_array_plus_scalar.cl | 43 --- .../twoway/equ_array_plus_scalar_sve1.cl | 36 -- .../twoway/equ_array_plus_scalar_sve2.cl | 52 --- .../ks/opencl/twoway/equ_array_pow_post.cl | 45 --- .../ks/opencl/twoway/equ_array_pow_pre.cl | 45 --- .../ks/opencl/twoway/equ_array_round_post.cl | 53 --- .../ks/opencl/twoway/equ_array_round_pre.cl | 53 --- .../ks/opencl/twoway/equ_array_sign_post.cl | 57 --- .../ks/opencl/twoway/equ_array_sign_pre.cl | 57 --- .../ks/opencl/twoway/equ_array_sin_post.cl | 46 --- .../ks/opencl/twoway/equ_array_sin_pre.cl | 46 --- .../ks/opencl/twoway/equ_array_sinc_post.cl | 57 --- .../ks/opencl/twoway/equ_array_sinc_pre.cl | 57 --- .../ks/opencl/twoway/equ_array_sinh_post.cl | 46 --- .../ks/opencl/twoway/equ_array_sinh_pre.cl | 46 --- .../ks/opencl/twoway/equ_array_sqrt_post.cl | 46 --- .../ks/opencl/twoway/equ_array_sqrt_pre.cl | 46 --- .../ks/opencl/twoway/equ_array_square_post.cl | 46 --- .../ks/opencl/twoway/equ_array_square_pre.cl | 47 --- .../ks/opencl/twoway/equ_array_tan_post.cl | 46 --- .../ks/opencl/twoway/equ_array_tan_pre.cl | 46 --- .../ks/opencl/twoway/equ_array_tanh_post.cl | 46 --- .../ks/opencl/twoway/equ_array_tanh_pre.cl | 46 --- .../opencl/twoway/equ_array_trunc_exp_post.cl | 70 ---- .../opencl/twoway/equ_array_trunc_exp_pre.cl | 70 ---- .../opencl/twoway/equ_array_trunc_log_post.cl | 77 ---- .../opencl/twoway/equ_array_trunc_log_pre.cl | 78 ---- 
.../ks/opencl/twoway/equ_array_trunc_post.cl | 53 --- .../ks/opencl/twoway/equ_array_trunc_pre.cl | 53 --- .../ks/opencl/twoway/extract_sve1.cl | 34 -- .../ks/opencl/twoway/extract_sve2.cl | 46 --- .../ks/opencl/twoway/rel_and_array.cl | 33 -- .../ks/opencl/twoway/rel_eq_array.cl | 33 -- .../ks/opencl/twoway/rel_eq_scalar.cl | 31 -- .../ks/opencl/twoway/rel_gt_array.cl | 33 -- .../ks/opencl/twoway/rel_gt_scalar.cl | 31 -- .../ks/opencl/twoway/rel_gteq_array.cl | 33 -- .../ks/opencl/twoway/rel_gteq_scalar.cl | 31 -- .../ks/opencl/twoway/rel_lt_array.cl | 33 -- .../ks/opencl/twoway/rel_lt_scalar.cl | 31 -- .../ks/opencl/twoway/rel_lteq_array.cl | 33 -- .../ks/opencl/twoway/rel_lteq_scalar.cl | 31 -- .../ks/opencl/twoway/rel_neq_array.cl | 33 -- .../ks/opencl/twoway/rel_neq_scalar.cl | 31 -- .../ks/opencl/twoway/rel_or_array.cl | 33 -- .../ks/opencl/twoway/replace.cl | 52 --- 672 files changed, 75 insertions(+), 37727 deletions(-) rename inst/include/bandicoot_bits/ks/{kernels => }/cuda/defs/c_defs.cu (100%) rename inst/include/bandicoot_bits/ks/{kernels => }/cuda/defs/z_defs.cu (100%) delete mode 100644 inst/include/bandicoot_bits/ks/cuda/oneway/fill.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/oneway/fill_sve1.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/oneway/fill_sve2.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_atan2.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_div_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_div_array_cube.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_hypot.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_max_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_min_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_minus_array.cu delete mode 100644 
inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_minus_array_cube.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_mul_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_mul_array_cube.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_plus_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_plus_array_cube.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/convert_type.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/convert_type_cube.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_abs.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acos_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acos_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acosh_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acosh_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asin_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asin_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asinh_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asinh_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atan_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atan_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atanh_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atanh_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_ceil_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_ceil_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cos_post.cu delete mode 100644 
inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cos_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cosh_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cosh_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post_sve1.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post_sve2.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre_sve1.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre_sve2.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erf_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erf_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erfc_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erfc_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp10_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp10_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp2_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp2_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_floor_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_floor_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_lgamma_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_lgamma_pre.cu delete mode 
100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log10_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log10_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log2_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log2_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_max_array_cube.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_min_array_cube.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post_sve1.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post_sve2.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post_sve1.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post_sve2.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre_sve1.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre_sve2.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mod_scalar.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar_sve1.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar_sve2.cu delete mode 100644 
inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_neg_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_neg_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar_sve1.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar_sve2.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_pow_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_pow_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_round_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_round_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sign_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sign_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sin_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sin_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinc_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinc_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinh_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinh_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sqrt_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sqrt_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_square_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_square_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tan_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tan_pre.cu delete mode 100644 
inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tanh_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tanh_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_exp_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_exp_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_log_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_log_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/extract_sve1.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/extract_sve2.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/rel_and_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/rel_eq_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/rel_eq_scalar.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/rel_gt_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/rel_gt_scalar.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/rel_gteq_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/rel_gteq_scalar.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/rel_lt_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/rel_lt_scalar.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/rel_lteq_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/rel_lteq_scalar.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/rel_neq_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/rel_neq_scalar.cu delete mode 100644 inst/include/bandicoot_bits/ks/cuda/twoway/rel_or_array.cu delete mode 100644 
inst/include/bandicoot_bits/ks/cuda/twoway/replace.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/defs/cuda_prelims.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/defs/d_defs.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/defs/f_defs.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/defs/h_defs.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/defs/s16_defs.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/defs/s32_defs.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/defs/s64_defs.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/defs/s8_defs.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/defs/u16_defs.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/defs/u32_defs.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/defs/u64_defs.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/defs/u8_defs.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/deps/accu_subgroup_reduce.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/deps/and_subgroup_reduce_u32.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/deps/max_subgroup_reduce.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/deps/min_subgroup_reduce.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/deps/or_subgroup_reduce_u32.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/deps/prod_subgroup_reduce.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/deps/var_philox.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu_simple.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal.cu delete mode 100644 
inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_cube.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_cube_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/count_nonzeros.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find_first.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find_last.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_colwise.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_cube_col.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_rowwise.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_colwise.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_cube_col.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_rowwise.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_philox_randn.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_set_eye.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow32_randi.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow32_randu.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow64_randi.cu delete mode 100644 
inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow64_randu.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/linspace.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/logspace.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/ltri_set_zero.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_abs.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_abs_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/min.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/min_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_colwise.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_colwise_trans.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_rowwise.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_rowwise_trans.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/prod.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/prod_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_asc.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_colwise_asc.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_colwise_desc.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_desc.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_asc.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_desc.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_multi_wg_shuffle.cu 
delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_multi_wg_bit_count.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_multi_wg_shuffle.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_rowwise_asc.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_rowwise_desc.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/regspace_desc.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/reorder_cols.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/rotate_180.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_add_offset.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_subgroups.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shuffle.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shuffle_large.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/stable_radix_sort_index_asc.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/stable_radix_sort_index_desc.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/submat_var.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/submat_var_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/symmatl_inplace.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/symmatu_inplace.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/trace.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_colwise.cu delete mode 100644 
inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_rowwise.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/and_reduce.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/and_reduce_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/ipiv_det.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/ipiv_det_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/or_reduce.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/or_reduce_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/diag_prod.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/diag_prod_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/extract_cx.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_l.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_p.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_pivoted_l.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_inf.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_inf_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nan.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nan_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nonfinite.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nonfinite_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isfinite.cu delete mode 100644 
inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isnan.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isnonfinite.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_1.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_1_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_robust.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_robust_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_k.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_k_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_min.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_min_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_div_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_div_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_minus_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_minus_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_plus.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_schur.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_set.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_div_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_div_pre.cu delete mode 100644 
inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_minus_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_minus_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_plus.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_schur.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_set.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/clamp.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/cross.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/dot.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/dot_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/htrans.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_div_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_div_sve1.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_eq_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_eq_sve1.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_minus_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_minus_sve1.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_mul_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_mul_sve1.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_plus_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_plus_sve1.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_div_array.cu delete mode 100644 
inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_div_sve2.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_eq_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_eq_sve2.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_minus_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_minus_sve2.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_mul_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_mul_sve2.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_plus_array.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_plus_sve2.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_colwise_conv_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_colwise_conv_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_cube_col_conv_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_cube_col_conv_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_rowwise_conv_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_rowwise_conv_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_colwise_conv_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_colwise_conv_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_rowwise_conv_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_rowwise_conv_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_colwise_conv_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_colwise_conv_pre.cu delete mode 
100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_cube_col_conv_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_cube_col_conv_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_rowwise_conv_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_rowwise_conv_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_colwise.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_rowwise.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_colwise.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_rowwise.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_small.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/strans.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_colwise_conv_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_colwise_conv_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_rowwise_conv_post.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_rowwise_conv_pre.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/symmatl.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/twoway/symmatu.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/cuda/zeroway/shuffle_large_compute_locs.cu delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/defs/d_defs.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/defs/f_defs.cl delete mode 100644 
inst/include/bandicoot_bits/ks/kernels/opencl/defs/h_defs.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/defs/opencl_prelims.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/defs/s16_defs.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/defs/s32_defs.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/defs/s64_defs.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/defs/s8_defs.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/defs/u16_defs.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/defs/u32_defs.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/defs/u64_defs.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/defs/u8_defs.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/deps/accu_subgroup_reduce.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/deps/and_subgroup_reduce_u32.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/deps/max_subgroup_reduce.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/deps/min_subgroup_reduce.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/deps/or_subgroup_reduce_u32.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/deps/prod_subgroup_reduce.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/deps/var_philox.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_inf_lower.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_inf_upper.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_max_lower.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_max_upper.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_full.cl delete mode 100644 
inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_lower.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_upper.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_band_lower.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_band_upper.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_full.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_lower.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_upper.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laswp.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_inplace_even_magma.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_inplace_odd_magma.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_magma.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu_simple.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_cube.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_cube_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/count_nonzeros.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find_first.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find_last.cl delete 
mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_colwise.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_cube_col.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_rowwise.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_colwise.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_cube_col.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_rowwise.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_philox_randn.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_set_eye.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow32_randi.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow32_randu.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow64_randi.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow64_randu.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/linspace.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/logspace.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/ltri_set_zero.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_abs.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_abs_small.cl delete 
mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/min.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/min_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_colwise.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_colwise_trans.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_rowwise.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_rowwise_trans.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/prod.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/prod_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_asc.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_colwise_asc.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_colwise_desc.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_desc.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_asc.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_desc.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_multi_wg_shuffle.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_multi_wg_bit_count.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_multi_wg_shuffle.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_rowwise_asc.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_rowwise_desc.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/regspace_desc.cl delete mode 100644 
inst/include/bandicoot_bits/ks/kernels/opencl/oneway/reorder_cols.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/rotate_180.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_add_offset.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_subgroups.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shuffle.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shuffle_large.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/stable_radix_sort_index_asc.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/stable_radix_sort_index_desc.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/submat_var.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/submat_var_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/symmatl_inplace.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/symmatu_inplace.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/trace.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_colwise.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_rowwise.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/and_reduce.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/and_reduce_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/ipiv_det.cl delete mode 100644 
inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/ipiv_det_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/or_reduce.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/or_reduce_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/diag_prod.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/diag_prod_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/extract_cx.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_l.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_p.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_pivoted_l.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_inf.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_inf_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nan.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nan_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nonfinite.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nonfinite_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isfinite.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isnan.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isnonfinite.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_1.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_1_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2.cl delete mode 100644 
inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_robust.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_robust_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_k.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_k_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_min.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_min_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_div_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_div_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_minus_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_minus_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_plus.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_schur.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_set.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_div_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_div_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_minus_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_minus_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_plus.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_schur.cl delete mode 100644 
inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_set.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/clamp.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/cross.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/dot.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/dot_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/htrans.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_div_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_div_sve1.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_eq_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_eq_sve1.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_minus_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_minus_sve1.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_mul_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_mul_sve1.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_plus_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_plus_sve1.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_div_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_div_sve2.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_eq_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_eq_sve2.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_minus_array.cl delete mode 100644 
inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_minus_sve2.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_mul_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_mul_sve2.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_plus_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_plus_sve2.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_colwise_conv_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_colwise_conv_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_cube_col_conv_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_cube_col_conv_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_rowwise_conv_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_rowwise_conv_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_colwise_conv_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_colwise_conv_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_rowwise_conv_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_rowwise_conv_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_colwise_conv_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_colwise_conv_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_cube_col_conv_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_cube_col_conv_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_rowwise_conv_post.cl delete mode 100644 
inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_rowwise_conv_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_colwise.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_rowwise.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_colwise.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_rowwise.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_small.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/repmat.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/strans.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_colwise_conv_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_colwise_conv_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_rowwise_conv_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_rowwise_conv_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/symmatl.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/twoway/symmatu.cl delete mode 100644 inst/include/bandicoot_bits/ks/kernels/opencl/zeroway/shuffle_large_compute_locs.cl rename inst/include/bandicoot_bits/ks/{kernels => }/opencl/defs/c_defs.cl (100%) rename inst/include/bandicoot_bits/ks/{kernels => }/opencl/defs/z_defs.cl (100%) delete mode 100644 inst/include/bandicoot_bits/ks/opencl/oneway/fill.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/oneway/fill_sve1.cl delete mode 100644 
inst/include/bandicoot_bits/ks/opencl/oneway/fill_sve2.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_atan2.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_div_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_div_array_cube.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_hypot.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_max_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_min_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_minus_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_minus_array_cube.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_mul_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_mul_array_cube.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_plus_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_plus_array_cube.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/convert_type.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/convert_type_cube.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_abs.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acos_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acos_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acosh_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acosh_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asin_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asin_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asinh_post.cl 
delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asinh_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atan_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atan_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atanh_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atanh_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_ceil_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_ceil_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cos_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cos_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cosh_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cosh_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post_sve1.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post_sve2.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre_sve1.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre_sve2.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erf_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erf_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erfc_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erfc_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp10_post.cl delete mode 100644 
inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp10_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp2_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp2_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_floor_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_floor_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_lgamma_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_lgamma_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log10_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log10_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log2_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log2_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_max_array_cube.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_min_array_cube.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post_sve1.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post_sve2.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post_sve1.cl delete mode 100644 
inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post_sve2.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre_sve1.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre_sve2.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mod_scalar.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar_sve1.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar_sve2.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_neg_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_neg_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar_sve1.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar_sve2.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_pow_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_pow_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_round_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_round_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sign_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sign_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sin_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sin_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinc_post.cl delete mode 100644 
inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinc_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinh_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinh_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sqrt_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sqrt_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_square_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_square_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tan_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tan_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tanh_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tanh_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_exp_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_exp_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_log_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_log_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_post.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_pre.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/extract_sve1.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/extract_sve2.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/rel_and_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/rel_eq_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/rel_eq_scalar.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/rel_gt_array.cl delete mode 100644 
inst/include/bandicoot_bits/ks/opencl/twoway/rel_gt_scalar.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/rel_gteq_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/rel_gteq_scalar.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/rel_lt_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/rel_lt_scalar.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/rel_lteq_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/rel_lteq_scalar.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/rel_neq_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/rel_neq_scalar.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/rel_or_array.cl delete mode 100644 inst/include/bandicoot_bits/ks/opencl/twoway/replace.cl diff --git a/.Rbuildignore b/.Rbuildignore index a981b71..b18f2ed 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -7,3 +7,4 @@ ^config\.log$ ^config\.status$ ^\.gitattributes$ +^install\.log$ diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 782fad7..15538b6 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -89,11 +89,14 @@ jobs: # Install via msiexec $msi = Get-ChildItem ./extracted/*.msi | Select-Object -First 1 - $proc = Start-Process "msiexec" "/i `"$msi`" /qn /l*! install.log" -NoNewWindow -PassThru + # Write the MSI log outside the repo so it doesn't end up in the + # package tarball (R CMD check flags it as a non-standard top-level file). + $logPath = "$env:RUNNER_TEMP\intel-opencl-install.log" + $proc = Start-Process "msiexec" "/i `"$msi`" /qn /l*! 
`"$logPath`"" -NoNewWindow -PassThru $proc.WaitForExit() - + if ($proc.ExitCode -ne 0) { - Get-Content install.log + Get-Content $logPath exit $proc.ExitCode } diff --git a/.github/workflows/upstream-update.yml b/.github/workflows/upstream-update.yml index 236b0e7..1e8e047 100644 --- a/.github/workflows/upstream-update.yml +++ b/.github/workflows/upstream-update.yml @@ -96,6 +96,12 @@ jobs: if [ -d "$BANDICOOT_DIR/include" ]; then echo "Found include directory at $BANDICOOT_DIR/include" + # Wipe existing bandicoot_bits/ so we don't mix old and new content. + # Previously the mv below nested kernels/ inside a pre-existing ks/, + # producing ks/kernels/ paths that exceeded CRAN's 100-byte limit + # and left stale kernel sources behind. + rm -rf inst/include/bandicoot_bits inst/include/bandicoot + # Create destination directory if it doesn't exist mkdir -p inst/include diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/c_defs.cu b/inst/include/bandicoot_bits/ks/cuda/defs/c_defs.cu similarity index 100% rename from inst/include/bandicoot_bits/ks/kernels/cuda/defs/c_defs.cu rename to inst/include/bandicoot_bits/ks/cuda/defs/c_defs.cu diff --git a/inst/include/bandicoot_bits/ks/cuda/defs/cuda_prelims.cu b/inst/include/bandicoot_bits/ks/cuda/defs/cuda_prelims.cu index bcab5e7..269dbc2 100644 --- a/inst/include/bandicoot_bits/ks/cuda/defs/cuda_prelims.cu +++ b/inst/include/bandicoot_bits/ks/cuda/defs/cuda_prelims.cu @@ -12,11 +12,16 @@ // limitations under the License. // ------------------------------------------------------------------------ +#include + // These statically-compiled definitions are available in any Bandicoot kernel. 
#define uchar unsigned char #define ushort unsigned short #define uint unsigned int +#define cx_float cuFloatComplex +#define cx_double cuDoubleComplex + #define COOT_FN2(ARG1, ARG2) ARG1 ## ARG2 #define COOT_FN(ARG1, ARG2) COOT_FN2(ARG1, ARG2) diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/z_defs.cu b/inst/include/bandicoot_bits/ks/cuda/defs/z_defs.cu similarity index 100% rename from inst/include/bandicoot_bits/ks/kernels/cuda/defs/z_defs.cu rename to inst/include/bandicoot_bits/ks/cuda/defs/z_defs.cu diff --git a/inst/include/bandicoot_bits/ks/cuda/oneway/fill.cu b/inst/include/bandicoot_bits/ks/cuda/oneway/fill.cu deleted file mode 100644 index 8dc5535..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/oneway/fill.cu +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,fill)(eT1* out, - const eT1 val, - const UWORD n_rows, - const UWORD n_cols, - const UWORD M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD index = col * M_n_rows + row; - - if(row < n_rows && col < n_cols) - { - out[index] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/oneway/fill_sve1.cu b/inst/include/bandicoot_bits/ks/cuda/oneway/fill_sve1.cu deleted file mode 100644 index ff9b2c5..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/oneway/fill_sve1.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,fill_sve1)(eT1* out, - const UWORD* out_locs, - const eT1 val, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - out[out_locs[i]] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/oneway/fill_sve2.cu b/inst/include/bandicoot_bits/ks/cuda/oneway/fill_sve2.cu deleted file mode 100644 index c01ebd6..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/oneway/fill_sve2.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,fill_sve2)(eT1* out, - const UWORD* in_row_locs, - const UWORD* in_col_locs, - const eT1 val, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD loc = ((in_row_locs == NULL) ? row : in_row_locs[row]) + - n_rows * ((in_col_locs == NULL) ? 
col : in_col_locs[col]); - - out[loc] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_atan2.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_atan2.cu deleted file mode 100644 index da0a001..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_atan2.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_atan2)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD A_index = row + col * src_A_M_n_rows; - const UWORD B_index = row + col * src_B_M_n_rows; - const UWORD dest_index = row + col * dest_M_n_rows; - - const fp_eT3 a_val = TO_FP_ET3(src_A[A_index]); - const fp_eT3 b_val = TO_FP_ET3(src_B[B_index]); - dest[dest_index] = TO_ET3(atan2(a_val, b_val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_div_array.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_div_array.cu deleted file mode 100644 index 
ca20bb9..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_div_array.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_div_array)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD A_index = row + col * src_A_M_n_rows; - const UWORD B_index = row + col * src_B_M_n_rows; - const UWORD dest_index = row + col * dest_M_n_rows; - - const threeway_promoted_eT a_val = TO_THREEWAY_PROMOTED_ET(src_A[A_index]); - const threeway_promoted_eT b_val = TO_THREEWAY_PROMOTED_ET(src_B[B_index]); - dest[dest_index] = TO_ET3(a_val / b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_div_array_cube.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_div_array_cube.cu deleted file mode 100644 index 09cf58b..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_div_array_cube.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2025 Ryan Curtin 
(http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_div_array_cube)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD A_index = row + col * src_A_M_n_rows + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD B_index = row + col * src_B_M_n_rows + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = TO_ET3(src_A[A_index] / src_B[B_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_hypot.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_hypot.cu deleted file mode 100644 index 7bf819f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_hypot.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2022 Ryan Curtin 
(http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_hypot)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD A_index = row + col * src_A_M_n_rows; - const UWORD B_index = row + col * src_B_M_n_rows; - const UWORD dest_index = row + col * dest_M_n_rows; - - const fp_eT3 a_val = TO_FP_ET3(src_A[A_index]); - const fp_eT3 b_val = TO_FP_ET3(src_B[B_index]); - dest[dest_index] = TO_ET3(hypot(a_val, b_val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_max_array.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_max_array.cu deleted file mode 100644 index d47ea77..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_max_array.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_max_array)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD A_index = row + col * src_A_M_n_rows; - const UWORD B_index = row + col * src_B_M_n_rows; - const UWORD dest_index = row + col * dest_M_n_rows; - - const threeway_promoted_eT a_val = TO_THREEWAY_PROMOTED_ET(src_A[A_index]); - const threeway_promoted_eT b_val = TO_THREEWAY_PROMOTED_ET(src_B[B_index]); - dest[dest_index] = TO_ET3(max(a_val, b_val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_min_array.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_min_array.cu deleted file mode 100644 index dc08dd8..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_min_array.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_min_array)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD A_index = row + col * src_A_M_n_rows; - const UWORD B_index = row + col * src_B_M_n_rows; - const UWORD dest_index = row + col * dest_M_n_rows; - - const threeway_promoted_eT a_val = TO_THREEWAY_PROMOTED_ET(src_A[A_index]); - const threeway_promoted_eT b_val = TO_THREEWAY_PROMOTED_ET(src_B[B_index]); - dest[dest_index] = TO_ET3(min(a_val, b_val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_minus_array.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_minus_array.cu deleted file mode 100644 index 5a16392..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_minus_array.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_array)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD A_index = row + col * src_A_M_n_rows; - const UWORD B_index = row + col * src_B_M_n_rows; - const UWORD dest_index = row + col * dest_M_n_rows; - - const threeway_promoted_eT a_val = TO_THREEWAY_PROMOTED_ET(src_A[A_index]); - const threeway_promoted_eT b_val = TO_THREEWAY_PROMOTED_ET(src_B[B_index]); - dest[dest_index] = TO_ET3(a_val - b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_minus_array_cube.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_minus_array_cube.cu deleted file mode 100644 index 6da345d..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_minus_array_cube.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_array_cube)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD A_index = row + col * src_A_M_n_rows + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD B_index = row + col * src_B_M_n_rows + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = TO_ET3(src_A[A_index] - src_B[B_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_mul_array.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_mul_array.cu deleted file mode 100644 index f34c181..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_mul_array.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the 
License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_mul_array)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD A_index = row + col * src_A_M_n_rows; - const UWORD B_index = row + col * src_B_M_n_rows; - const UWORD dest_index = row + col * dest_M_n_rows; - - const threeway_promoted_eT a_val = TO_THREEWAY_PROMOTED_ET(src_A[A_index]); - const threeway_promoted_eT b_val = TO_THREEWAY_PROMOTED_ET(src_B[B_index]); - dest[dest_index] = TO_ET3(a_val * b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_mul_array_cube.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_mul_array_cube.cu deleted file mode 100644 index 2caf2a4..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_mul_array_cube.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_mul_array_cube)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD A_index = row + col * src_A_M_n_rows + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD B_index = row + col * src_B_M_n_rows + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = TO_ET3(src_A[A_index] * src_B[B_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_plus_array.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_plus_array.cu deleted file mode 100644 index ed1fc02..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_plus_array.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the 
License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_plus_array)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD A_index = row + col * src_A_M_n_rows; - const UWORD B_index = row + col * src_B_M_n_rows; - const UWORD dest_index = row + col * dest_M_n_rows; - - const threeway_promoted_eT a_val = TO_THREEWAY_PROMOTED_ET(src_A[A_index]); - const threeway_promoted_eT b_val = TO_THREEWAY_PROMOTED_ET(src_B[B_index]); - dest[dest_index] = TO_ET3(a_val + b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_plus_array_cube.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_plus_array_cube.cu deleted file mode 100644 index 3709c1f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_plus_array_cube.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_plus_array_cube)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD A_index = row + col * src_A_M_n_rows + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD B_index = row + col * src_B_M_n_rows + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = TO_ET3(src_A[A_index] + src_B[B_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/convert_type.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/convert_type.cu deleted file mode 100644 index 38fa2c7..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/convert_type.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,convert_type)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD src_index = row + col * src_M_n_rows; - const UWORD dest_index = row + col * dest_M_n_rows; - - const eT1 in_val = src[src_index]; - dest[dest_index] = TO_ET2(in_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/convert_type_cube.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/convert_type_cube.cu deleted file mode 100644 index 823cec6..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/convert_type_cube.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,convert_type_cube)(eT2* dest, - const eT2* /* src_A */, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD /* src_A_M_n_rows */, - const UWORD /* src_A_M_n_cols */, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - const eT1 in_val = src[src_index]; - dest[dest_index] = TO_ET2(in_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_abs.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_abs.cu deleted file mode 100644 index 0e5c5ed..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_abs.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2021-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_abs)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(ET1_ABS(src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acos_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acos_post.cu deleted file mode 100644 index 5e243f0..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acos_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_acos_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(acos(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acos_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acos_pre.cu deleted file mode 100644 index e87b09f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acos_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_acos_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(acos(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acosh_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acosh_post.cu deleted file mode 100644 index 8dd3e67..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acosh_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_acosh_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(acosh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acosh_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acosh_pre.cu deleted file mode 100644 index fc3fe8b..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acosh_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_acosh_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(acosh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asin_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asin_post.cu deleted file mode 100644 index 552b6b5..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asin_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_asin_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(asin(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asin_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asin_pre.cu deleted file mode 100644 index e9b3d00..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asin_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_asin_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(asin(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asinh_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asinh_post.cu deleted file mode 100644 index 42f88bd..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asinh_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_asinh_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(asinh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asinh_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asinh_pre.cu deleted file mode 100644 index a50b220..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asinh_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_asinh_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(asinh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atan_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atan_post.cu deleted file mode 100644 index b6fddb9..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atan_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_atan_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(atan(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atan_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atan_pre.cu deleted file mode 100644 index 12157c2..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atan_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_atan_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(atan(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atanh_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atanh_post.cu deleted file mode 100644 index 2e9247a..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atanh_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_atanh_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(atanh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atanh_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atanh_pre.cu deleted file mode 100644 index de8247f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atanh_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_atanh_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(atanh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_ceil_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_ceil_post.cu deleted file mode 100644 index 7c54467..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_ceil_post.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_ceil_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - if (coot_is_fp(val)) - { - dest[dest_index] = TO_ET2(ceil(TO_FP_ET1(val))); - } - else - { - dest[dest_index] = TO_ET2(val); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_ceil_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_ceil_pre.cu deleted file mode 100644 index 3c7948d..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_ceil_pre.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_ceil_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = TO_ET2(src[src_index]); - if (coot_is_fp(val)) - { - dest[dest_index] = ceil(TO_FP_ET2(val)); - } - else - { - dest[dest_index] = val; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cos_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cos_post.cu deleted file mode 100644 index fe72a17..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cos_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_cos_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(cos(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cos_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cos_pre.cu deleted file mode 100644 index 8de59ee..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cos_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_cos_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(cos(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cosh_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cosh_post.cu deleted file mode 100644 index debd529..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cosh_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_cosh_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(cosh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cosh_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cosh_pre.cu deleted file mode 100644 index 38cc5f3..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cosh_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_cosh_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(cosh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post.cu deleted file mode 100644 index e155adf..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_div_scalar_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(src[src_index] / val_pre) / val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post_sve1.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post_sve1.cu deleted file mode 100644 index 1dec0d9..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post_sve1.cu +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_div_scalar_post_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] = TO_ET2(src[src_locs[i]] / val_pre) / val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post_sve2.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post_sve2.cu deleted file mode 100644 index 82931ae..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post_sve2.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_div_scalar_post_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col]); - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - - dest[dest_loc] = TO_ET2(src[src_loc] / val_pre) / val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre.cu deleted file mode 100644 index d17e94b..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre.cu +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_div_scalar_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - // if both are 0, we take it as val_pre == 0 and val_post unused - if (val_post == TO_ET2(0)) - { - dest[dest_index] = TO_ET2(val_pre / src[src_index]); - } - else if (val_pre == TO_ET1(0) && val_post != TO_ET2(0)) - { - dest[dest_index] = val_post / (TO_ET2(src[src_index])); - } - else - { - // if both are nonzero, we apply sequentially----be careful! - dest[dest_index] = val_post / TO_ET2(val_pre / src[src_index]); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre_sve1.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre_sve1.cu deleted file mode 100644 index 1d345e8..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre_sve1.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_div_scalar_pre_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - // if both are 0, we take it as val_pre == 0 and val_post unused - if (val_post == TO_ET2(0)) - { - dest[dest_locs[i]] = TO_ET2(val_pre / src[src_locs[i]]); - } - else if (val_pre == TO_ET1(0) && val_post != TO_ET2(0)) - { - dest[dest_locs[i]] = val_post / (TO_ET2(src[src_locs[i]])); - } - else - { - // if both are nonzero, we apply sequentially----be careful! - dest[dest_locs[i]] = val_post / TO_ET2(val_pre / src[src_locs[i]]); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre_sve2.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre_sve2.cu deleted file mode 100644 index ee53258..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre_sve2.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_div_scalar_pre_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col]); - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - - // if both are 0, we take it as val_pre == 0 and val_post unused - if (val_post == TO_ET2(0)) - { - dest[dest_loc] = TO_ET2(val_pre / src[src_loc]); - } - else if (val_pre == TO_ET1(0) && val_post != TO_ET2(0)) - { - dest[dest_loc] = val_post / (TO_ET2(src[src_loc])); - } - else - { - // if both are nonzero, we apply sequentially----be careful! 
- dest[dest_loc] = val_post / TO_ET2(val_pre / src[src_loc]); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erf_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erf_post.cu deleted file mode 100644 index 2439868..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erf_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_erf_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - dest[dest_index] = TO_ET2(erf(TO_FP_ET1(val))); - } - } diff --git 
a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erf_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erf_pre.cu deleted file mode 100644 index 4257781..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erf_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_erf_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = TO_ET2(src[src_index]); - dest[dest_index] = TO_ET2(erf(TO_FP_ET2(val))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erfc_post.cu 
b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erfc_post.cu deleted file mode 100644 index 302c70f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erfc_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_erfc_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - dest[dest_index] = TO_ET2(erfc(TO_FP_ET1(val))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erfc_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erfc_pre.cu deleted file mode 100644 index 44d4ce9..0000000 --- 
a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erfc_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_erfc_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = TO_ET2(src[src_index]); - dest[dest_index] = TO_ET2(erfc(TO_FP_ET2(val))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp10_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp10_post.cu deleted file mode 100644 index 8e74e87..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp10_post.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 
2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_exp10_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(TO_ET1(exp10(TO_FP_ET1(src[src_index])))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp10_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp10_pre.cu deleted file mode 100644 index ca1f614..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp10_pre.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use 
this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_exp10_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(exp10(TO_FP_ET2(src[src_index]))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp2_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp2_post.cu deleted file mode 100644 index b2c5926..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp2_post.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_exp2_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(TO_ET1(exp2(TO_FP_ET1(src[src_index])))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp2_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp2_pre.cu deleted file mode 100644 index eb98bf8..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp2_pre.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_exp2_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(exp2(TO_FP_ET2(src[src_index]))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp_post.cu deleted file mode 100644 index c34faa2..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp_post.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_exp_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(TO_ET1(exp(TO_FP_ET1(src[src_index])))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp_pre.cu deleted file mode 100644 index 9d4a38b..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp_pre.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_exp_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(exp(TO_FP_ET2(src[src_index]))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_floor_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_floor_post.cu deleted file mode 100644 index 197779d..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_floor_post.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_floor_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - if (coot_is_fp(val)) - { - dest[dest_index] = TO_ET2(floor(TO_FP_ET1(val))); - } - else - { - dest[dest_index] = TO_ET2(val); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_floor_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_floor_pre.cu deleted file mode 100644 index 2cc616d..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_floor_pre.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_floor_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = TO_ET2(src[src_index]); - if (coot_is_fp(val)) - { - dest[dest_index] = floor(TO_FP_ET2(val)); - } - else - { - dest[dest_index] = val; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_lgamma_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_lgamma_post.cu deleted file mode 100644 index 78b0896..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_lgamma_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_lgamma_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - dest[dest_index] = TO_ET2(lgamma(TO_FP_ET1(val))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_lgamma_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_lgamma_pre.cu deleted file mode 100644 index d7f9466..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_lgamma_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_lgamma_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = TO_ET2(src[src_index]); - dest[dest_index] = TO_ET2(lgamma(TO_FP_ET2(val))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log10_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log10_post.cu deleted file mode 100644 index 96cc43b..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log10_post.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_log10_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(TO_ET1(log10(TO_FP_ET1(src[src_index])))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log10_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log10_pre.cu deleted file mode 100644 index 93e0f9b..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log10_pre.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_log10_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(log10(TO_FP_ET2(src[src_index]))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log2_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log2_post.cu deleted file mode 100644 index bddb666..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log2_post.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_log2_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(TO_ET1(log2(TO_FP_ET1(src[src_index])))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log2_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log2_pre.cu deleted file mode 100644 index d374f14..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log2_pre.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_log2_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(log2(TO_FP_ET2(src[src_index]))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log_post.cu deleted file mode 100644 index cfcbbdb..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log_post.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_log_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(TO_ET1(log(TO_FP_ET1(src[src_index])))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log_pre.cu deleted file mode 100644 index e4b5ac5..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log_pre.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_log_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(log(TO_FP_ET2(src[src_index]))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_max_array_cube.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_max_array_cube.cu deleted file mode 100644 index aa03d2a..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_max_array_cube.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_max_array_cube)(eT2* dest, - const eT2* src_A, - const eT1* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD A_index = row + col * src_A_M_n_rows + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD B_index = row + col * src_B_M_n_rows + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = max(src_A[A_index], (TO_ET2(src_B[B_index]))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_min_array_cube.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_min_array_cube.cu deleted file mode 100644 index 20d5383..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_min_array_cube.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance 
with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_min_array_cube)(eT2* dest, - const eT2* src_A, - const eT1* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD A_index = row + col * src_A_M_n_rows + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD B_index = row + col * src_B_M_n_rows + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = min(src_A[A_index], (TO_ET2(src_B[B_index]))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post.cu deleted file mode 100644 index a254b3d..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use 
this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_scalar_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(src[src_index] - val_pre) - val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post_sve1.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post_sve1.cu deleted file mode 100644 index e99ed79..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post_sve1.cu +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_scalar_post_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] = TO_ET2(src[src_locs[i]] - val_pre) - val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post_sve2.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post_sve2.cu deleted file mode 100644 index 0a04d7f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post_sve2.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_scalar_post_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col]); - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - - dest[dest_loc] = TO_ET2(src[src_loc] - val_pre) - val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post.cu deleted file mode 100644 index 5c03699..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) val_post; - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(val_pre - src[src_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post_sve1.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post_sve1.cu deleted file mode 100644 index 66b3897..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post_sve1.cu +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_post_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - (void) val_post; - - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] = TO_ET2(val_pre - src[src_locs[i]]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post_sve2.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post_sve2.cu deleted file mode 100644 index 7f2f5df..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post_sve2.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_post_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col]); - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - - dest[dest_loc] = TO_ET2(val_pre - src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre.cu deleted file mode 100644 index b5ae1eb..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) val_pre; - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = val_post - (TO_ET2(src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre_sve1.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre_sve1.cu deleted file mode 100644 index 85a6cea..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre_sve1.cu +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_pre_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - (void) val_pre; - - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] = val_post - (TO_ET2(src[src_locs[i]])); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre_sve2.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre_sve2.cu deleted file mode 100644 index 8cd4772..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre_sve2.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_pre_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col]); - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - - dest[dest_loc] = val_post - (TO_ET2(src[src_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mod_scalar.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mod_scalar.cu deleted file mode 100644 index a90104d..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mod_scalar.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_mod_scalar)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - // For an integer type, the casts end up doing nothing. - uint_eT1 val = TO_UINT_ET1(src[src_index]) % TO_UINT_ET1(val_pre); - dest[dest_index] = TO_ET2((TO_UINT_ET2(val)) % TO_UINT_ET2(val_post)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar.cu deleted file mode 100644 index b868880..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_mul_scalar)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(src[src_index] * val_pre) * val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar_sve1.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar_sve1.cu deleted file mode 100644 index 15f1f43..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar_sve1.cu +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_mul_scalar_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] = TO_ET2(src[src_locs[i]] * val_pre) * val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar_sve2.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar_sve2.cu deleted file mode 100644 index 60060ff..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar_sve2.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_mul_scalar_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col]); - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - - dest[dest_loc] = TO_ET2(src[src_loc] * val_pre) * val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_neg_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_neg_post.cu deleted file mode 100644 index 95c2ad5..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_neg_post.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_neg_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(-src[src_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_neg_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_neg_pre.cu deleted file mode 100644 index edb8447..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_neg_pre.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_neg_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = -(TO_ET2(src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar.cu deleted file mode 100644 index bcac46d..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_plus_scalar)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(src[src_index] + val_pre) + val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar_sve1.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar_sve1.cu deleted file mode 100644 index 8c83f5d..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar_sve1.cu +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_plus_scalar_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] = TO_ET2(src[src_locs[i]] + val_pre) + val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar_sve2.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar_sve2.cu deleted file mode 100644 index 8c0a37d..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar_sve2.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_plus_scalar_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col]); - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - - dest[dest_loc] = TO_ET2(src[src_loc] + val_pre) + val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_pow_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_pow_post.cu deleted file mode 100644 index 05e841b..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_pow_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_pow_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(pow(val, TO_FP_ET1(val_pre))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_pow_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_pow_pre.cu deleted file mode 100644 index f33f4a3..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_pow_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_pow_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(pow(val, TO_FP_ET2(val_post))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_round_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_round_post.cu deleted file mode 100644 index b88361c..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_round_post.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_round_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - if (coot_is_fp(val)) - { - dest[dest_index] = TO_ET2(round(TO_FP_ET1(val))); - } - else - { - dest[dest_index] = TO_ET2(val); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_round_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_round_pre.cu deleted file mode 100644 index fb54509..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_round_pre.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_round_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = TO_ET2(src[src_index]); - if (coot_is_fp(val)) - { - dest[dest_index] = round(TO_FP_ET2(val)); - } - else - { - dest[dest_index] = val; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sign_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sign_post.cu deleted file mode 100644 index 438e23f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sign_post.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_sign_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - if (val > TO_ET1(0)) - { - dest[dest_index] = TO_ET2(1); - } - else if (val == TO_ET1(0)) - { - dest[dest_index] = TO_ET2(0); - } - else - { - dest[dest_index] = TO_ET2(-1); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sign_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sign_pre.cu deleted file mode 100644 index 4d43198..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sign_pre.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_sign_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = TO_ET2(src[src_index]); - if (val > TO_ET2(0)) - { - dest[dest_index] = TO_ET2(1); - } - else if (val == TO_ET2(0)) - { - dest[dest_index] = TO_ET2(0); - } - else - { - dest[dest_index] = TO_ET2(-1); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sin_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sin_post.cu deleted file mode 100644 index 8a1668f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sin_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_sin_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(sin(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sin_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sin_pre.cu deleted file mode 100644 index ae9bf58..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sin_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_sin_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(sin(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinc_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinc_post.cu deleted file mode 100644 index 1f74f05..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinc_post.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_sinc_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - // To imitate Armadillo correctly, we use double if the type is not floating point. - if (coot_is_fp(val)) - { - const fp_eT1 tmp = val * COOT_PI; - dest[dest_index] = (tmp == TO_ET1(0)) ? TO_ET2(1) : TO_ET2(sin(tmp) / tmp); - } - else - { - const double fp_val = (double) val; - const double tmp = fp_val * COOT_PI; - dest[dest_index] = (tmp == 0) ? 
TO_ET2(1) : TO_ET2(sin(tmp) / tmp); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinc_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinc_pre.cu deleted file mode 100644 index 340482c..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinc_pre.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_sinc_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = TO_ET2(src[src_index]); - // To imitate Armadillo correctly, we use double if the type is not floating point. 
- if (coot_is_fp(val)) - { - const fp_eT2 tmp = val * COOT_PI; - dest[dest_index] = (tmp == TO_ET2(0)) ? TO_ET2(1) : TO_ET2(sin(tmp) / tmp); - } - else - { - const double fp_val = (double) val; - const double tmp = fp_val * COOT_PI; - dest[dest_index] = (tmp == 0) ? TO_ET2(1) : TO_ET2(sin(tmp) / tmp); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinh_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinh_post.cu deleted file mode 100644 index 00ecefc..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinh_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_sinh_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(sinh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinh_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinh_pre.cu deleted file mode 100644 index dc343a3..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinh_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_sinh_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(sinh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sqrt_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sqrt_post.cu deleted file mode 100644 index f684434..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sqrt_post.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_sqrt_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(TO_ET1(sqrt(TO_FP_ET1(src[src_index])))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sqrt_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sqrt_pre.cu deleted file mode 100644 index a198b01..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sqrt_pre.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_sqrt_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(sqrt(TO_FP_ET2(TO_ET2(src[src_index])))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_square_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_square_post.cu deleted file mode 100644 index a555c9e..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_square_post.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_square_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(src[src_index] * src[src_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_square_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_square_pre.cu deleted file mode 100644 index d67c11f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_square_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_square_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = TO_ET2(src[src_index]); - dest[dest_index] = val * val; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tan_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tan_post.cu deleted file mode 100644 index 6b82e6f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tan_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_tan_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(tan(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tan_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tan_pre.cu deleted file mode 100644 index 8156316..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tan_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_tan_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(tan(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tanh_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tanh_post.cu deleted file mode 100644 index 281b782..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tanh_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_tanh_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(tanh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tanh_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tanh_pre.cu deleted file mode 100644 index ed162b3..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tanh_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_tanh_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(tanh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_exp_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_exp_post.cu deleted file mode 100644 index 142bf2d..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_exp_post.cu +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_trunc_exp_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - // To imitate Armadillo's behavior exactly, if the type is not floating-point, we convert to double. - const eT1 val = src[src_index]; - if (coot_is_fp(val)) - { - const fp_eT1 fp_val = TO_FP_ET1(val); - if (fp_val >= log(coot_type_max(TO_FP_ET1(0)))) - { - dest[dest_index] = TO_ET2(TO_ET1(coot_type_max(TO_FP_ET1(0)))); - } - else - { - dest[dest_index] = TO_ET2(TO_ET1(exp(fp_val))); - } - } - else - { - const double fp_val = (double) val; - if (fp_val >= log(DBL_MAX)) - { - dest[dest_index] = TO_ET2(TO_ET1(DBL_MAX)); - } - else - { - dest[dest_index] = TO_ET2(TO_ET1(exp(fp_val))); - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_exp_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_exp_pre.cu deleted file mode 100644 index 7754fed..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_exp_pre.cu +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_trunc_exp_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - // To imitate Armadillo's behavior exactly, if the type is not floating-point, we convert to double. 
- const eT2 val = TO_ET2(src[src_index]); - if (coot_is_fp(val)) - { - const fp_eT2 fp_val = TO_FP_ET2(val); - if (fp_val >= log(coot_type_max(TO_FP_ET2(0)))) - { - dest[dest_index] = TO_ET2(coot_type_max(TO_FP_ET2(0))); - } - else - { - dest[dest_index] = TO_ET2(exp(fp_val)); - } - } - else - { - const double fp_val = (double) val; - if (fp_val >= log(DBL_MAX)) - { - dest[dest_index] = TO_ET2(DBL_MAX); - } - else - { - dest[dest_index] = TO_ET2(exp(fp_val)); - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_log_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_log_post.cu deleted file mode 100644 index 92e2d76..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_log_post.cu +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_trunc_log_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - // To match Armadillo, we always use `double` as the intermediate type for any non-floating point type. - const eT1 val = src[src_index]; - if (coot_is_fp(val)) - { - const fp_eT1 fp_val = TO_FP_ET1(val); - if (fp_val <= TO_FP_ET1(0)) - { - dest[dest_index] = TO_ET2(log(coot_type_minpos(TO_FP_ET1(0)))); - } - else if (coot_isinf(fp_val)) - { - dest[dest_index] = TO_ET2(log(coot_type_max(TO_FP_ET1(0)))); - } - else - { - dest[dest_index] = TO_ET2(TO_ET1(log(fp_val))); - } - } - else - { - const double fp_val = (double) val; - if (fp_val <= TO_FP_ET1(0)) - { - dest[dest_index] = TO_ET2(log(DBL_MIN)); - } - else if (isinf(fp_val)) - { - dest[dest_index] = TO_ET2(log(DBL_MAX)); - } - else - { - dest[dest_index] = TO_ET2(TO_ET1(log(fp_val))); - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_log_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_log_pre.cu deleted file mode 100644 index 305ba01..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_log_pre.cu +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin 
(http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_trunc_log_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - // To match Armadillo, we always use `double` as the intermediate type for any non-floating point type. 
- const eT2 val = TO_ET2(src[src_index]); - if (coot_is_fp(val)) - { - const fp_eT2 fp_val = TO_FP_ET2(val); - if (fp_val <= TO_FP_ET2(0)) - { - dest[dest_index] = TO_ET2(log(coot_type_minpos(TO_FP_ET2(0)))); - } - else if (coot_isinf(fp_val)) - { - dest[dest_index] = TO_ET2(log(coot_type_max(TO_FP_ET2(0)))); - } - else - { - dest[dest_index] = TO_ET2(log(fp_val)); - } - } - else - { - const double fp_val = (double) val; - if (fp_val <= (double) 0) - { - dest[dest_index] = TO_ET2(log(DBL_MIN)); - } - else if (isinf(fp_val)) - { - dest[dest_index] = TO_ET2(log(DBL_MAX)); - } - else - { - dest[dest_index] = TO_ET2(log(fp_val)); - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_post.cu deleted file mode 100644 index 645333f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_post.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_trunc_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - if (coot_is_fp(val)) - { - dest[dest_index] = TO_ET2(trunc(TO_FP_ET1(val))); - } - else - { - dest[dest_index] = TO_ET2(val); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_pre.cu deleted file mode 100644 index 85da488..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_pre.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_trunc_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = TO_ET2(src[src_index]); - if (coot_is_fp(val)) - { - dest[dest_index] = trunc(TO_FP_ET2(val)); - } - else - { - dest[dest_index] = val; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/extract_sve1.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/extract_sve1.cu deleted file mode 100644 index 4522d16..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/extract_sve1.cu +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,extract_sve1)(eT2* out_mem, - const eT1* in_mem, - const UWORD* in_locs, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - out_mem[i] = TO_ET2(in_mem[in_locs[i]]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/extract_sve2.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/extract_sve2.cu deleted file mode 100644 index 7e40c22..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/extract_sve2.cu +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,extract_sve2)(eT2* out_mem, - const eT1* in_mem, - const UWORD* in_row_locs, - const UWORD* in_col_locs, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD out_n_rows, - const UWORD in_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD in_loc = ((in_row_locs == NULL) ? 
row : in_row_locs[row]) + - in_n_rows * ((in_col_locs == NULL) ? col : in_col_locs[col]); - - out_mem[row + out_n_rows * col] = TO_ET2(in_mem[in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_and_array.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_and_array.cu deleted file mode 100644 index 5661582..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_and_array.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_and_array)(UWORD* out, - const eT1* X, - const eT2* Y, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val1 = X[i]; - const eT2 val2 = Y[i]; - out[i] = (val1 && val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_eq_array.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_eq_array.cu deleted file mode 100644 index 0f2008a..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_eq_array.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_eq_array)(UWORD* out, - const eT1* X, - const eT2* Y, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val1 = X[i]; - const eT2 val2 = Y[i]; - out[i] = (val1 == val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_eq_scalar.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_eq_scalar.cu deleted file mode 100644 index eb20434..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_eq_scalar.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_eq_scalar)(UWORD* out, - const eT1* X, // will be casted to eT2 before comparison - const UWORD n_elem, - const eT2 val) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT2 val1 = TO_ET2(X[i]); - out[i] = (val1 == val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gt_array.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gt_array.cu deleted file mode 100644 index 079aefa..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gt_array.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_gt_array)(UWORD* out, - const eT1* X, - const eT2* Y, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val1 = X[i]; - const eT2 val2 = Y[i]; - out[i] = (val1 > val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gt_scalar.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gt_scalar.cu deleted file mode 100644 index 9f2b065..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gt_scalar.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_gt_scalar)(UWORD* out, - const eT1* X, // will be casted to eT2 before comparison - const UWORD n_elem, - const eT2 val) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT2 val1 = TO_ET2(X[i]); - out[i] = (val1 > val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gteq_array.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gteq_array.cu deleted file mode 100644 index 1b06a32..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gteq_array.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_gteq_array)(UWORD* out, - const eT1* X, - const eT2* Y, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val1 = X[i]; - const eT2 val2 = Y[i]; - out[i] = (val1 >= val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gteq_scalar.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gteq_scalar.cu deleted file mode 100644 index f1f1b5c..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gteq_scalar.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_gteq_scalar)(UWORD* out, - const eT1* X, // will be casted to eT2 before comparison - const UWORD n_elem, - const eT2 val) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT2 val1 = TO_ET2(X[i]); - out[i] = (val1 >= val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lt_array.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lt_array.cu deleted file mode 100644 index fd184a2..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lt_array.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_lt_array)(UWORD* out, - const eT1* X, - const eT2* Y, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val1 = X[i]; - const eT2 val2 = Y[i]; - out[i] = (val1 < val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lt_scalar.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lt_scalar.cu deleted file mode 100644 index e4d143f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lt_scalar.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_lt_scalar)(UWORD* out, - const eT1* X, // will be casted to eT2 before comparison - const UWORD n_elem, - const eT2 val) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT2 val1 = TO_ET2(X[i]); - out[i] = (val1 < val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lteq_array.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lteq_array.cu deleted file mode 100644 index a1858ef..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lteq_array.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_lteq_array)(UWORD* out, - const eT1* X, - const eT2* Y, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val1 = X[i]; - const eT2 val2 = Y[i]; - out[i] = (val1 <= val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lteq_scalar.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lteq_scalar.cu deleted file mode 100644 index f713d26..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lteq_scalar.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_lteq_scalar)(UWORD* out, - const eT1* X, // will be casted to eT2 before comparison - const UWORD n_elem, - const eT2 val) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT2 val1 = TO_ET2(X[i]); - out[i] = (val1 <= val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_neq_array.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_neq_array.cu deleted file mode 100644 index 6c80404..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_neq_array.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_neq_array)(UWORD* out, - const eT1* X, - const eT2* Y, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val1 = X[i]; - const eT2 val2 = Y[i]; - out[i] = (val1 != val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_neq_scalar.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_neq_scalar.cu deleted file mode 100644 index 49193cb..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_neq_scalar.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_neq_scalar)(UWORD* out, - const eT1* X, // will be casted to eT2 before comparison - const UWORD n_elem, - const eT2 val) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT2 val1 = TO_ET2(X[i]); - out[i] = (val1 != val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_or_array.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_or_array.cu deleted file mode 100644 index 9be1efd..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_or_array.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_or_array)(UWORD* out, - const eT1* X, - const eT2* Y, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val1 = X[i]; - const eT2 val2 = Y[i]; - out[i] = (val1 || val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/replace.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/replace.cu deleted file mode 100644 index deb438e..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/replace.cu +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,replace)(eT2* dest, - eT1* src, - const eT1 val_find, - const eT1 val_replace, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - if (coot_isnan(val_find)) - { - // We are searching for a NaN so the check is a little different. - dest[dest_index] = TO_ET2((coot_isnan(val)) ? val_replace : val); - } - else - { - // No special handling needed. - dest[dest_index] = TO_ET2((val == val_find) ? val_replace : val); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/cuda_prelims.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/cuda_prelims.cu deleted file mode 100644 index 269dbc2..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/cuda_prelims.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -#include - -// These statically-compiled definitions are available in any Bandicoot kernel. -#define uchar unsigned char -#define ushort unsigned short -#define uint unsigned int - -#define cx_float cuFloatComplex -#define cx_double cuDoubleComplex - -#define COOT_FN2(ARG1, ARG2) ARG1 ## ARG2 -#define COOT_FN(ARG1, ARG2) COOT_FN2(ARG1, ARG2) - -#define UWORD size_t - -// For older CUDA toolkit versions, we must manually make FP16 limit macros -// available. -#if CUDA_VERSION < 12020 - #define CUDART_INF_FP16 __ushort_as_half((unsigned short)0x7C00U) - #define CUDART_NAN_FP16 __ushort_as_half((unsigned short)0x7FFFU) - #define CUDART_MIN_DENORM_FP16 __ushort_as_half((unsigned short)0x0001U) - #define CUDART_MAX_NORMAL_FP16 __ushort_as_half((unsigned short)0x7BFFU) - #define CUDART_NEG_ZERO_FP16 __ushort_as_half((unsigned short)0x8000U) - #define CUDART_ZERO_FP16 __ushort_as_half((unsigned short)0x0000U) - #define CUDART_ONE_FP16 __ushort_as_half((unsigned short)0x3C00U) -#endif - -extern __shared__ char aux_shared_mem[]; // this may be used in some kernels diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/d_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/d_defs.cu deleted file mode 100644 index 5b4776e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/d_defs.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Utility functions for double elements. -__device__ inline bool coot_is_fp(const double) { return true; } -__device__ inline bool coot_is_signed(const double) { return true; } -__device__ inline double coot_type_min(const double) { return -DBL_MAX; } -__device__ inline double coot_type_minpos(const double) { return DBL_MIN; } -__device__ inline double coot_type_max(const double) { return DBL_MAX; } -__device__ inline bool coot_isnan(const double x) { return isnan(x); } -__device__ inline bool coot_isinf(const double x) { return isinf(x); } -__device__ inline bool coot_isfinite(const double x) { return isfinite(x); } - -// Conversion functions for double elements. 
-__device__ inline double coot_to_double(const uchar& x) { return (double) x; } -__device__ inline double coot_to_double(const char& x) { return (double) x; } -__device__ inline double coot_to_double(const ushort& x) { return (double) x; } -__device__ inline double coot_to_double(const short& x) { return (double) x; } -__device__ inline double coot_to_double(const uint& x) { return (double) x; } -__device__ inline double coot_to_double(const int& x) { return (double) x; } -__device__ inline double coot_to_double(const size_t& x) { return (double) x; } -__device__ inline double coot_to_double(const long& x) { return (double) x; } -#if defined(COOT_HAVE_FP16) -__device__ inline double coot_to_double(const __half& x) { return (double) __half2float(x); } -#endif -__device__ inline double coot_to_double(const float& x) { return (double) x; } -__device__ inline double coot_to_double(const double& x) { return (double) x; } - -// Utility mathematical functions. -__device__ inline double coot_absdiff(const double x, const double y) { return fabs(x - y); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/f_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/f_defs.cu deleted file mode 100644 index 73526e7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/f_defs.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// Utility functions for float elements. -__device__ inline bool coot_is_fp(const float) { return true; } -__device__ inline bool coot_is_signed(const float) { return true; } -__device__ inline float coot_type_min(const float) { return -FLT_MAX; } -__device__ inline float coot_type_minpos(const float) { return FLT_MIN; } -__device__ inline float coot_type_max(const float) { return FLT_MAX; } -__device__ inline bool coot_isnan(const float x) { return isnan(x); } -__device__ inline bool coot_isinf(const float x) { return isinf(x); } -__device__ inline bool coot_isfinite(const float x) { return isfinite(x); } - -// Conversion functions for float elements. -__device__ inline float coot_to_float(const uchar& x) { return (float) x; } -__device__ inline float coot_to_float(const char& x) { return (float) x; } -__device__ inline float coot_to_float(const ushort& x) { return (float) x; } -__device__ inline float coot_to_float(const short& x) { return (float) x; } -__device__ inline float coot_to_float(const uint& x) { return (float) x; } -__device__ inline float coot_to_float(const int& x) { return (float) x; } -__device__ inline float coot_to_float(const size_t& x) { return (float) x; } -__device__ inline float coot_to_float(const long& x) { return (float) x; } -#if defined(COOT_HAVE_FP16) -__device__ inline float coot_to_float(const __half& x) { return __half2float(x); } -#endif -__device__ inline float coot_to_float(const float& x) { return (float) x; } -__device__ inline float coot_to_float(const double& x) { return (float) x; } - -// Utility mathematical functions. 
-__device__ inline float coot_absdiff(const float x, const float y) { return fabs(x - y); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/h_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/h_defs.cu deleted file mode 100644 index e3b1ae9..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/h_defs.cu +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Utility functions for fp16 elements. -__device__ inline bool coot_is_fp(const __half) { return true; } -__device__ inline bool coot_is_signed(const __half) { return true; } -__device__ inline __half coot_type_min(const __half) { return -HALF_MAX; } -__device__ inline __half coot_type_minpos(const __half) { return HALF_MIN; } -__device__ inline __half coot_type_max(const __half) { return HALF_MAX; } -__device__ inline bool coot_isnan(const __half x) { return __hisnan(x); } -__device__ inline bool coot_isinf(const __half x) { return __hisinf(x); } -__device__ inline bool coot_isfinite(const __half x) { return !__hisnan(x) && !__hisinf(x); } - -// Conversion functions for fp16 elements. 
-#if CUDA_VERSION < 12020 -__device__ inline __half coot_to___half(const uchar& x) { return (__half) ((ushort) x); } -__device__ inline __half coot_to___half(const char& x) { return (__half) ((short) x); } -#else -__device__ inline __half coot_to___half(const uchar& x) { return (__half) x; } -__device__ inline __half coot_to___half(const char& x) { return (__half) x; } -#endif -__device__ inline __half coot_to___half(const ushort& x) { return (__half) x; } -__device__ inline __half coot_to___half(const short& x) { return (__half) x; } -__device__ inline __half coot_to___half(const uint& x) { return (__half) x; } -__device__ inline __half coot_to___half(const int& x) { return (__half) x; } -#if CUDA_VERSION < 12020 -__device__ inline __half coot_to___half(const size_t& x) { return (__half) ((unsigned long long) x); } -__device__ inline __half coot_to___half(const long& x) { return (__half) ((long long) x); } -#else -__device__ inline __half coot_to___half(const size_t& x) { return (__half) x; } -__device__ inline __half coot_to___half(const long& x) { return (__half) x; } -#endif -__device__ inline __half coot_to___half(const __half& x) { return (__half) x; } -__device__ inline __half coot_to___half(const float& x) { return __float2half(x); } -__device__ inline __half coot_to___half(const double& x) { return __float2half((double) x); } - -// CUDA FP16 support does not include some arithmetic operators that we need for volatile elements so we add them ourselves... 
-#if CUDA_VERSION < 12040 -__device__ inline volatile __half& operator+=(volatile __half& a, const volatile __half& b) { a = __hadd((__half) a, (__half) b); return a; } -__device__ inline volatile __half& operator-=(volatile __half& a, const volatile __half& b) { a = __hsub((__half) a, (__half) b); return a; } -__device__ inline volatile __half& operator*=(volatile __half& a, const volatile __half& b) { a = __hmul((__half) a, (__half) b); return a; } -__device__ inline volatile __half& operator/=(volatile __half& a, const volatile __half& b) { a = __hdiv((__half) a, (__half) b); return a; } -#else -__device__ inline volatile __half& operator+=(volatile __half& a, const volatile __half& b) { a = __hadd(a, b); return a; } -__device__ inline volatile __half& operator-=(volatile __half& a, const volatile __half& b) { a = __hsub(a, b); return a; } -__device__ inline volatile __half& operator*=(volatile __half& a, const volatile __half& b) { a = __hmul(a, b); return a; } -__device__ inline volatile __half& operator/=(volatile __half& a, const volatile __half& b) { a = __hdiv(a, b); return a; } -#endif -__device__ inline __half abs(const __half a) { return __habs(a); } -__device__ inline __half pow(const __half a, const __half b) { return hexp2(b * hlog2(a)); } -__device__ inline __half min(const __half a, const __half b) { return __hmin_nan(a, b); } -__device__ inline __half max(const __half a, const __half b) { return __hmax_nan(a, b); } - -// Utility mathematical functions. 
-__device__ inline __half coot_absdiff(const __half x, const __half y) { return fabs(x - y); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s16_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s16_defs.cu deleted file mode 100644 index 3d541f5..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s16_defs.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Utility functions for u16 elements. -__device__ inline bool coot_is_fp(const short) { return false; } -__device__ inline bool coot_is_signed(const short) { return true; } -__device__ inline short coot_type_min(const short) { return COOT_S16_MIN; } -__device__ inline short coot_type_minpos(const short) { return 1; } -__device__ inline short coot_type_max(const short) { return COOT_S16_MAX; } -__device__ inline bool coot_isnan(const short) { return false; } -__device__ inline bool coot_isinf(const short) { return false; } -__device__ inline bool coot_isfinite(const short) { return true; } - -// Conversion functions for u16 elements. 
-__device__ inline short coot_to_short(const uchar& x) { return (short) x; } -__device__ inline short coot_to_short(const char& x) { return (short) x; } -__device__ inline short coot_to_short(const ushort& x) { return (short) x; } -__device__ inline short coot_to_short(const short& x) { return (short) x; } -__device__ inline short coot_to_short(const uint& x) { return (short) x; } -__device__ inline short coot_to_short(const int& x) { return (short) x; } -__device__ inline short coot_to_short(const size_t& x) { return (short) x; } -__device__ inline short coot_to_short(const long& x) { return (short) x; } -#if defined(COOT_HAVE_FP16) -__device__ inline short coot_to_short(const __half& x) { return (short) x; } -#endif -__device__ inline short coot_to_short(const float& x) { return (short) x; } -__device__ inline short coot_to_short(const double& x) { return (short) x; } - -// Utility mathematical functions. -__device__ inline short coot_absdiff(const short x, const short y) { return abs(x - y); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s32_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s32_defs.cu deleted file mode 100644 index d5a2f1e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s32_defs.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// Utility functions for s32 elements. -__device__ inline bool coot_is_fp(const int) { return false; } -__device__ inline bool coot_is_signed(const int) { return true; } -__device__ inline int coot_type_min(const int) { return COOT_S32_MIN; } -__device__ inline int coot_type_minpos(const int) { return 1; } -__device__ inline int coot_type_max(const int) { return COOT_S32_MAX; } -__device__ inline bool coot_isnan(const int) { return false; } -__device__ inline bool coot_isinf(const int) { return false; } -__device__ inline bool coot_isfinite(const int) { return true; } - -// Conversion functions for s32 elements. -__device__ inline int coot_to_int(const uchar& x) { return (int) x; } -__device__ inline int coot_to_int(const char& x) { return (int) x; } -__device__ inline int coot_to_int(const ushort& x) { return (int) x; } -__device__ inline int coot_to_int(const short& x) { return (int) x; } -__device__ inline int coot_to_int(const uint& x) { return (int) x; } -__device__ inline int coot_to_int(const int& x) { return (int) x; } -__device__ inline int coot_to_int(const size_t& x) { return (int) x; } -__device__ inline int coot_to_int(const long& x) { return (int) x; } -#if defined(COOT_HAVE_FP16) -__device__ inline int coot_to_int(const __half& x) { return (int) x; } -#endif -__device__ inline int coot_to_int(const float& x) { return (int) x; } -__device__ inline int coot_to_int(const double& x) { return (int) x; } - -// Utility mathematical functions. 
-__device__ inline int coot_absdiff(const int x, const int y) { return abs(x - y); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s64_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s64_defs.cu deleted file mode 100644 index 1ceebc7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s64_defs.cu +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Utility functions for s64 elements. -__device__ inline bool coot_is_fp(const long) { return false; } -__device__ inline bool coot_is_signed(const long) { return true; } -__device__ inline long coot_type_min(const long) { return COOT_S64_MIN; } -__device__ inline long coot_type_minpos(const long) { return 1; } -__device__ inline long coot_type_max(const long) { return COOT_S64_MAX; } -__device__ inline bool coot_isnan(const long) { return false; } -__device__ inline bool coot_isinf(const long) { return false; } -__device__ inline bool coot_isfinite(const long) { return true; } - -// Conversion functions for s64 elements. 
-__device__ inline long coot_to_long(const uchar& x) { return (long) x; } -__device__ inline long coot_to_long(const char& x) { return (long) x; } -__device__ inline long coot_to_long(const ushort& x) { return (long) x; } -__device__ inline long coot_to_long(const short& x) { return (long) x; } -__device__ inline long coot_to_long(const uint& x) { return (long) x; } -__device__ inline long coot_to_long(const int& x) { return (long) x; } -__device__ inline long coot_to_long(const size_t& x) { return (long) x; } -__device__ inline long coot_to_long(const long& x) { return (long) x; } -#if defined(COOT_HAVE_FP16) -#if CUDA_VERSION < 12020 -__device__ inline long coot_to_long(const __half& x) { return (long) ((long long) x); } -#else -__device__ inline long coot_to_long(const __half& x) { return (long) x; } -#endif -#endif -__device__ inline long coot_to_long(const float& x) { return (long) x; } -__device__ inline long coot_to_long(const double& x) { return (long) x; } - -// Utility mathematical functions. -__device__ inline long coot_absdiff(const long x, const long y) { return abs(x - y); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s8_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s8_defs.cu deleted file mode 100644 index 3130839..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s8_defs.cu +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Utility functions for s8 elements. -__device__ inline bool coot_is_fp(const char) { return false; } -__device__ inline bool coot_is_signed(const char) { return true; } -__device__ inline bool coot_type_min(const char) { return COOT_S8_MIN; } -__device__ inline bool coot_type_minpos(const char) { return 1; } -__device__ inline bool coot_type_max(const char) { return COOT_S8_MAX; } -__device__ inline bool coot_isnan(const char) { return false; } -__device__ inline bool coot_isinf(const char) { return false; } -__device__ inline bool coot_isfinite(const char) { return true; } - -// Conversion functions for s8 elements. -__device__ inline char coot_to_char(const uchar& x) { return (char) x; } -__device__ inline char coot_to_char(const char& x) { return (char) x; } -__device__ inline char coot_to_char(const ushort& x) { return (char) x; } -__device__ inline char coot_to_char(const short& x) { return (char) x; } -__device__ inline char coot_to_char(const uint& x) { return (char) x; } -__device__ inline char coot_to_char(const int& x) { return (char) x; } -__device__ inline char coot_to_char(const size_t& x) { return (char) x; } -__device__ inline char coot_to_char(const long& x) { return (char) x; } -#if defined(COOT_HAVE_FP16) -#if CUDA_VERSION < 12020 -__device__ inline char coot_to_char(const __half& x) { return (char) ((short) x); } -#else -__device__ inline char coot_to_char(const __half& x) { return (char) x; } -#endif -#endif -__device__ inline char coot_to_char(const float& x) { return (char) x; } -__device__ inline char coot_to_char(const double& x) { return (char) x; } - -// Utility mathematical functions. 
-__device__ inline char coot_absdiff(const char x, const char y) { return abs(x - y); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u16_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u16_defs.cu deleted file mode 100644 index 66d909f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u16_defs.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Utility functions for u16 elements. -__device__ inline bool coot_is_fp(const ushort) { return false; } -__device__ inline bool coot_is_signed(const ushort) { return false; } -__device__ inline ushort coot_type_min(const ushort) { return 0; } -__device__ inline ushort coot_type_minpos(const ushort) { return 1; } -__device__ inline ushort coot_type_max(const ushort) { return COOT_U16_MAX; } -__device__ inline bool coot_isnan(const ushort) { return false; } -__device__ inline bool coot_isinf(const ushort) { return false; } -__device__ inline bool coot_isfinite(const ushort) { return true; } - -// Conversion functions for u16 elements. 
-__device__ inline ushort coot_to_ushort(const uchar& x) { return (ushort) x; } -__device__ inline ushort coot_to_ushort(const char& x) { return (ushort) x; } -__device__ inline ushort coot_to_ushort(const ushort& x) { return (ushort) x; } -__device__ inline ushort coot_to_ushort(const short& x) { return (ushort) x; } -__device__ inline ushort coot_to_ushort(const uint& x) { return (ushort) x; } -__device__ inline ushort coot_to_ushort(const int& x) { return (ushort) x; } -__device__ inline ushort coot_to_ushort(const size_t& x) { return (ushort) x; } -__device__ inline ushort coot_to_ushort(const long& x) { return (ushort) x; } -#if defined(COOT_HAVE_FP16) -__device__ inline ushort coot_to_ushort(const __half& x) { return (ushort) x; } -#endif -__device__ inline ushort coot_to_ushort(const float& x) { return (ushort) x; } -__device__ inline ushort coot_to_ushort(const double& x) { return (ushort) x; } - -// Utility mathematical functions. -__device__ inline ushort coot_absdiff(const ushort x, const ushort y) { return (x > y) ? (x - y) : (y - x); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u32_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u32_defs.cu deleted file mode 100644 index c67d75c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u32_defs.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// Utility functions for u32 elements. -__device__ inline bool coot_is_fp(const uint) { return false; } -__device__ inline bool coot_is_signed(const uint) { return false; } -__device__ inline uint coot_type_min(const uint) { return 0; } -__device__ inline uint coot_type_minpos(const uint) { return 1; } -__device__ inline uint coot_type_max(const uint) { return COOT_U32_MAX; } -__device__ inline bool coot_isnan(const uint) { return false; } -__device__ inline bool coot_isinf(const uint) { return false; } -__device__ inline bool coot_isfinite(const uint) { return true; } - -// Conversion functions for u32 elements. -__device__ inline uint coot_to_uint(const uchar& x) { return (uint) x; } -__device__ inline uint coot_to_uint(const char& x) { return (uint) x; } -__device__ inline uint coot_to_uint(const ushort& x) { return (uint) x; } -__device__ inline uint coot_to_uint(const short& x) { return (uint) x; } -__device__ inline uint coot_to_uint(const uint& x) { return (uint) x; } -__device__ inline uint coot_to_uint(const int& x) { return (uint) x; } -__device__ inline uint coot_to_uint(const size_t& x) { return (uint) x; } -__device__ inline uint coot_to_uint(const long& x) { return (uint) x; } -#if defined(COOT_HAVE_FP16) -__device__ inline uint coot_to_uint(const __half& x) { return (uint) x; } -#endif -__device__ inline uint coot_to_uint(const float& x) { return (uint) x; } -__device__ inline uint coot_to_uint(const double& x) { return (uint) x; } - -// Utility mathematical functions. -__device__ inline uint coot_absdiff(const uint x, const uint y) { return (x > y) ? 
(x - y) : (y - x); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u64_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u64_defs.cu deleted file mode 100644 index 942e795..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u64_defs.cu +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Utility functions for u64 elements. -__device__ inline bool coot_is_fp(const size_t) { return false; } -__device__ inline bool coot_is_signed(const size_t) { return false; } -__device__ inline size_t coot_type_min(const size_t) { return 0; } -__device__ inline size_t coot_type_minpos(const size_t) { return 1; } -__device__ inline size_t coot_type_max(const size_t) { return COOT_U64_MAX; } -__device__ inline bool coot_isnan(const size_t) { return false; } -__device__ inline bool coot_isinf(const size_t) { return false; } -__device__ inline bool coot_isfinite(const size_t) { return true; } - -// Conversion functions for u64 elements. 
-__device__ inline size_t coot_to_size_t(const uchar& x) { return (size_t) x; } -__device__ inline size_t coot_to_size_t(const char& x) { return (size_t) x; } -__device__ inline size_t coot_to_size_t(const ushort& x) { return (size_t) x; } -__device__ inline size_t coot_to_size_t(const short& x) { return (size_t) x; } -__device__ inline size_t coot_to_size_t(const uint& x) { return (size_t) x; } -__device__ inline size_t coot_to_size_t(const int& x) { return (size_t) x; } -__device__ inline size_t coot_to_size_t(const size_t& x) { return (size_t) x; } -__device__ inline size_t coot_to_size_t(const long& x) { return (size_t) x; } -#if defined(COOT_HAVE_FP16) -#if CUDA_VERSION < 12020 -__device__ inline size_t coot_to_size_t(const __half& x) { return (size_t) ((unsigned long long) x); } -#else -__device__ inline size_t coot_to_size_t(const __half& x) { return (size_t) x; } -#endif -#endif -__device__ inline size_t coot_to_size_t(const float& x) { return (size_t) x; } -__device__ inline size_t coot_to_size_t(const double& x) { return (size_t) x; } - -// Utility mathematical functions. -__device__ inline size_t coot_absdiff(const size_t x, const size_t y) { return (x > y) ? (x - y) : (y - x); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u8_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u8_defs.cu deleted file mode 100644 index e2a2e0d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u8_defs.cu +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Utility functions for u8 elements. -__device__ inline bool coot_is_fp(const uchar) { return false; } -__device__ inline bool coot_is_signed(const uchar) { return false; } -__device__ inline uchar coot_type_min(const uchar) { return 0; } -__device__ inline uchar coot_type_minpos(const uchar) { return 1; } -__device__ inline uchar coot_type_max(const uchar) { return COOT_U8_MAX; } -__device__ inline bool coot_isnan(const uchar) { return false; } -__device__ inline bool coot_isinf(const uchar) { return false; } -__device__ inline bool coot_isfinite(const uchar) { return true; } - -// Conversion functions for u8 elements. -__device__ inline uchar coot_to_uchar(const uchar& x) { return (uchar) x; } -__device__ inline uchar coot_to_uchar(const char& x) { return (uchar) x; } -__device__ inline uchar coot_to_uchar(const ushort& x) { return (uchar) x; } -__device__ inline uchar coot_to_uchar(const short& x) { return (uchar) x; } -__device__ inline uchar coot_to_uchar(const uint& x) { return (uchar) x; } -__device__ inline uchar coot_to_uchar(const int& x) { return (uchar) x; } -__device__ inline uchar coot_to_uchar(const size_t& x) { return (uchar) x; } -__device__ inline uchar coot_to_uchar(const long& x) { return (uchar) x; } -#if defined(COOT_HAVE_FP16) -#if CUDA_VERSION < 12020 -__device__ inline uchar coot_to_uchar(const __half& x) { return (uchar) ((ushort) x); } -#else -__device__ inline uchar coot_to_uchar(const __half& x) { return (uchar) x; } -#endif -#endif -__device__ inline uchar coot_to_uchar(const float& x) { return (uchar) x; } -__device__ inline uchar coot_to_uchar(const double& x) { return (uchar) x; } - -// Utility mathematical functions. -__device__ inline uchar coot_absdiff(const uchar x, const uchar y) { return (x > y) ? 
(x - y) : (y - x); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/accu_subgroup_reduce.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/deps/accu_subgroup_reduce.cu deleted file mode 100644 index 9169a5e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/accu_subgroup_reduce.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__device__ -void -COOT_FN(PREFIX,accu_subgroup_reduce)(volatile eT1* data, int tid) - { - data[tid] += data[tid + 32]; - data[tid] += data[tid + 16]; - data[tid] += data[tid + 8]; - data[tid] += data[tid + 4]; - data[tid] += data[tid + 2]; - data[tid] += data[tid + 1]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/and_subgroup_reduce_u32.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/deps/and_subgroup_reduce_u32.cu deleted file mode 100644 index c1b6c84..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/and_subgroup_reduce_u32.cu +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__device__ -void -and_subgroup_reduce_u32(volatile uint* data, int tid) - { - data[tid] &= data[tid + 32]; - data[tid] &= data[tid + 16]; - data[tid] &= data[tid + 8]; - data[tid] &= data[tid + 4]; - data[tid] &= data[tid + 2]; - data[tid] &= data[tid + 1]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/max_subgroup_reduce.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/deps/max_subgroup_reduce.cu deleted file mode 100644 index 96241b2..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/max_subgroup_reduce.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__device__ -void -COOT_FN(PREFIX,max_subgroup_reduce)(volatile eT1* data, int tid) - { - data[tid] = max(data[tid], data[tid + 32]); - data[tid] = max(data[tid], data[tid + 16]); - data[tid] = max(data[tid], data[tid + 8]); - data[tid] = max(data[tid], data[tid + 4]); - data[tid] = max(data[tid], data[tid + 2]); - data[tid] = max(data[tid], data[tid + 1]); - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/min_subgroup_reduce.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/deps/min_subgroup_reduce.cu deleted file mode 100644 index 93caecb..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/min_subgroup_reduce.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2021 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__device__ -void -COOT_FN(PREFIX,min_subgroup_reduce)(volatile eT1* data, int tid) - { - data[tid] = min(data[tid], data[tid + 32]); - data[tid] = min(data[tid], data[tid + 16]); - data[tid] = min(data[tid], data[tid + 8]); - data[tid] = min(data[tid], data[tid + 4]); - data[tid] = min(data[tid], data[tid + 2]); - data[tid] = min(data[tid], data[tid + 1]); - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/or_subgroup_reduce_u32.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/deps/or_subgroup_reduce_u32.cu deleted file mode 100644 index de00860..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/or_subgroup_reduce_u32.cu +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__device__ -void -or_subgroup_reduce_u32(volatile uint* data, int tid) - { - data[tid] |= data[tid + 32]; - data[tid] |= data[tid + 16]; - data[tid] |= data[tid + 8]; - data[tid] |= data[tid + 4]; - data[tid] |= data[tid + 2]; - data[tid] |= data[tid + 1]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/prod_subgroup_reduce.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/deps/prod_subgroup_reduce.cu deleted file mode 100644 index 65a1d6b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/prod_subgroup_reduce.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__device__ -void -COOT_FN(PREFIX,prod_subgroup_reduce)(volatile eT1* data, int tid) - { - data[tid] *= data[tid + 32]; - data[tid] *= data[tid + 16]; - data[tid] *= data[tid + 8]; - data[tid] *= data[tid + 4]; - data[tid] *= data[tid + 2]; - data[tid] *= data[tid + 1]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/var_philox.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/deps/var_philox.cu deleted file mode 100644 index 1135b5b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/var_philox.cu +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// Implementations of the variable philox algorithm to generate random numbers. -// Adapted from Mitchell, Stokes, Frank, and Holmes (2022), Listing 1. - - - -inline __device__ UWORD var_philox(const UWORD val, const UWORD* keys, const unsigned char bits) - { - // via Salmon, Moraes, Dror, and Shaw (2011): "Parallel random numbers: as easy as 1, 2, 3". - static const UWORD M0 = 0xD2B74407B1CE6E93; - - // The right side is allowed to have the extra bits. 
- const unsigned char right_side_bits = (bits + 1) / 2; - const unsigned char left_side_bits = bits / 2; - const uint left_mask = (((uint) 1) << left_side_bits) - 1; - const uint right_mask = (((uint) 1) << right_side_bits) - 1; - - uint state0 = (uint) (val >> right_side_bits); - uint state1 = (uint) (val & right_mask); - - // 24 rounds is what is needed to pass all the RNG tests (see section 5 of the paper). - uint hi, lo; - for (unsigned char i = 0; i < 24; ++i) - { - - // 64-bit integer multiplication, split the results into two uints - UWORD hilo = M0 * state0; - hi = (hilo >> 32); - lo = (uint) hilo; - - lo = (lo << (right_side_bits - left_side_bits)) | (state1 >> left_side_bits); - - state0 = ((hi ^ keys[i]) ^ state1) & left_mask; - state1 = lo & right_mask; - } - - // Combine the sides for the result. - return UWORD((state0 << right_side_bits) | UWORD(state1)); - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu.cu deleted file mode 100644 index ca8413c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu.cu +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,accu)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - aux_mem[tid] += in_mem[i] + in_mem[i + blockDim.x]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] += in_mem[i]; - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,accu_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu_simple.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu_simple.cu deleted file mode 100644 index a666628..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu_simple.cu +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,accu_simple)(eT1* out, - const eT1* A, - const UWORD A_len) - { - const UWORD id = blockIdx.x * blockDim.x + threadIdx.x; - if(id == 0) - { - eT1 acc = TO_ET1(0); // runtime unrolling is not supported by CUDA - for(UWORD i = 0; i < A_len; ++i) - { - acc += A[i]; - } - - out[0] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu_small.cu deleted file mode 100644 index f196060..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu_small.cu +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,accu_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - aux_mem[tid] += in_mem[i] + in_mem[i + blockDim.x]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] += in_mem[i]; - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal.cu deleted file mode 100644 index 01b1cf6..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal.cu +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,approx_equal)(uint* out_mem, - const eT1* A_mem, - const UWORD A_M_n_rows, - const eT1* B_mem, - const UWORD B_M_n_rows, - const UWORD n_rows, - const UWORD n_elem, - const UWORD mode, - const eT1 abs_tol, - const eT1 rel_tol) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 1; - - while (i + blockDim.x < n_elem) - { - // A bit painful... - const UWORD row1 = i % n_rows; - const UWORD col1 = i / n_rows; - const UWORD row2 = (i + blockDim.x) % n_rows; - const UWORD col2 = (i + blockDim.x) / n_rows; - - const UWORD A_loc1 = row1 + col1 * A_M_n_rows; - const UWORD A_loc2 = row2 + col2 * A_M_n_rows; - const UWORD B_loc1 = row1 + col1 * B_M_n_rows; - const UWORD B_loc2 = row2 + col2 * B_M_n_rows; - - const eT1 A_val1 = A_mem[A_loc1]; - const eT1 B_val1 = B_mem[B_loc1]; - const eT1 A_val2 = A_mem[A_loc2]; - const eT1 B_val2 = B_mem[B_loc2]; - - if (coot_isnan(A_val1) || coot_isnan(B_val1) || coot_isnan(A_val2) || coot_isnan(B_val2)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff1 = coot_absdiff(A_val1, B_val1); - const eT1 absdiff2 = coot_absdiff(A_val2, B_val2); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff1 <= abs_tol); - aux_mem[tid] &= (absdiff2 <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val1 = max(ET1_ABS(A_val1), ET1_ABS(B_val1)); - const eT1 max_val2 = max(ET1_ABS(A_val2), ET1_ABS(B_val2)); - - if (max_val1 >= TO_ET1(1)) - { - aux_mem[tid] &= (absdiff1 <= rel_tol * max_val1); - aux_mem[tid] &= (absdiff2 <= rel_tol * max_val2); - } - else - { - aux_mem[tid] &= (absdiff1 / max_val1 <= rel_tol); - aux_mem[tid] &= (absdiff2 / max_val2 <= rel_tol); - } - } - - i += grid_size; - } - if (i < n_elem) - { - const UWORD row = i % n_rows; - const UWORD col = i / n_rows; - - const UWORD A_loc = row + col * A_M_n_rows; - const UWORD B_loc = row + col * B_M_n_rows; - - const eT1 A_val = A_mem[A_loc]; - const eT1 B_val = B_mem[B_loc]; - - if (coot_isnan(A_val) || coot_isnan(B_val)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff = coot_absdiff(A_val, B_val); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val = max(ET1_ABS(A_val), ET1_ABS(B_val)); - - if (max_val >= TO_ET1(1)) - { - aux_mem[tid] &= (absdiff <= rel_tol * max_val); - } - else - { - aux_mem[tid] &= (absdiff / max_val <= rel_tol); - } - } - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - and_subgroup_reduce_u32(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_cube.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_cube.cu deleted file mode 100644 index e01db1e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_cube.cu +++ /dev/null @@ -1,161 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,approx_equal_cube)(uint* out_mem, - const eT1* A_mem, - const UWORD A_M_n_rows, - const UWORD A_M_n_cols, - const eT1* B_mem, - const UWORD B_M_n_rows, - const UWORD B_M_n_cols, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_elem, - const UWORD mode, - const eT1 abs_tol, - const eT1 rel_tol) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - const UWORD n_elem_slice = n_rows * n_cols; - - aux_mem[tid] = 1; - - while (i + blockDim.x < n_elem) - { - // A bit painful... TODO: implement a more efficient non-modulo approach - const UWORD elem1 = i % n_elem_slice; - const UWORD slice1 = i / n_elem_slice; - const UWORD row1 = elem1 % n_rows; - const UWORD col1 = elem1 / n_rows; - - const UWORD elem2 = (i + blockDim.x) % n_elem_slice; - const UWORD slice2 = (i + blockDim.x) / n_elem_slice; - const UWORD row2 = elem2 % n_rows; - const UWORD col2 = elem2 / n_rows; - - const UWORD A_loc1 = row1 + col1 * A_M_n_rows + slice1 * A_M_n_rows * A_M_n_cols; - const UWORD A_loc2 = row2 + col2 * A_M_n_rows + slice2 * A_M_n_rows * A_M_n_cols; - const UWORD B_loc1 = row1 + col1 * B_M_n_rows + slice1 * B_M_n_rows * B_M_n_cols; - const UWORD B_loc2 = row2 + col2 * B_M_n_rows + slice2 * B_M_n_rows * B_M_n_cols; - - const eT1 A_val1 = A_mem[A_loc1]; - const eT1 B_val1 = B_mem[B_loc1]; - const eT1 A_val2 = A_mem[A_loc2]; - const eT1 B_val2 = B_mem[B_loc2]; - - if (coot_isnan(A_val1) || coot_isnan(B_val1) || coot_isnan(A_val2) || coot_isnan(B_val2)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff1 = coot_absdiff(A_val1, B_val1); - const eT1 absdiff2 = coot_absdiff(A_val2, B_val2); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff1 <= abs_tol); - aux_mem[tid] &= (absdiff2 <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val1 = max(ET1_ABS(A_val1), ET1_ABS(B_val1)); - const eT1 max_val2 = max(ET1_ABS(A_val2), ET1_ABS(B_val2)); - - if (max_val1 >= TO_ET1(1)) - { - aux_mem[tid] &= (absdiff1 <= rel_tol * max_val1); - aux_mem[tid] &= (absdiff2 <= rel_tol * max_val2); - } - else - { - aux_mem[tid] &= (absdiff1 / max_val1 <= rel_tol); - aux_mem[tid] &= (absdiff2 / max_val2 <= rel_tol); - } - } - - i += grid_size; - } - if (i < n_elem) - { - const UWORD elem = i % n_elem_slice; - const UWORD slice = i / n_elem_slice; - const UWORD row = elem % n_rows; - const UWORD col = elem / n_rows; - - const UWORD A_loc = row + col * A_M_n_rows + slice * A_M_n_rows * A_M_n_cols; - const UWORD B_loc = row + col * B_M_n_rows + slice * B_M_n_rows * B_M_n_cols; - - const eT1 A_val = A_mem[A_loc]; - const eT1 B_val = B_mem[B_loc]; - - if (coot_isnan(A_val) || coot_isnan(B_val)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff = coot_absdiff(A_val, B_val); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val = max(ET1_ABS(A_val), ET1_ABS(B_val)); - - if (max_val >= TO_ET1(1)) - { - aux_mem[tid] &= (absdiff <= rel_tol * max_val); - } - else - { - aux_mem[tid] &= (absdiff / max_val <= rel_tol); - } - } - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - and_subgroup_reduce_u32(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_cube_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_cube_small.cu deleted file mode 100644 index 72d3dde..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_cube_small.cu +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,approx_equal_cube_small)(uint* out_mem, - const eT1* A_mem, - const UWORD A_M_n_rows, - const UWORD A_M_n_cols, - const eT1* B_mem, - const UWORD B_M_n_rows, - const UWORD B_M_n_cols, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_elem, - const UWORD mode, - const eT1 abs_tol, - const eT1 rel_tol) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - const UWORD n_elem_slice = n_rows * n_cols; - - aux_mem[tid] = 1; - - while (i + blockDim.x < n_elem) - { - // A bit painful... TODO: implement a more efficient non-modulo approach - const UWORD elem1 = i % n_elem_slice; - const UWORD slice1 = i / n_elem_slice; - const UWORD row1 = elem1 % n_rows; - const UWORD col1 = elem1 / n_rows; - - const UWORD elem2 = (i + blockDim.x) % n_elem_slice; - const UWORD slice2 = (i + blockDim.x) / n_elem_slice; - const UWORD row2 = elem2 % n_rows; - const UWORD col2 = elem2 / n_rows; - - const UWORD A_loc1 = row1 + col1 * A_M_n_rows + slice1 * A_M_n_rows * A_M_n_cols; - const UWORD A_loc2 = row2 + col2 * A_M_n_rows + slice2 * A_M_n_rows * A_M_n_cols; - const UWORD B_loc1 = row1 + col1 * B_M_n_rows + slice1 * B_M_n_rows * B_M_n_cols; - const UWORD B_loc2 = row2 + col2 * B_M_n_rows + slice2 * B_M_n_rows * B_M_n_cols; - - const eT1 A_val1 = A_mem[A_loc1]; - const eT1 B_val1 = B_mem[B_loc1]; - const eT1 A_val2 = A_mem[A_loc2]; - const eT1 B_val2 = B_mem[B_loc2]; - - if (coot_isnan(A_val1) || coot_isnan(B_val1) || coot_isnan(A_val2) || coot_isnan(B_val2)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff1 = coot_absdiff(A_val1, B_val1); - const eT1 absdiff2 = coot_absdiff(A_val2, B_val2); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff1 <= abs_tol); - aux_mem[tid] &= (absdiff2 <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val1 = max(ET1_ABS(A_val1), ET1_ABS(B_val1)); - const eT1 max_val2 = max(ET1_ABS(A_val2), ET1_ABS(B_val2)); - - if (max_val1 >= TO_ET1(1)) - { - aux_mem[tid] &= (absdiff1 <= rel_tol * max_val1); - aux_mem[tid] &= (absdiff2 <= rel_tol * max_val2); - } - else - { - aux_mem[tid] &= (absdiff1 / max_val1 <= rel_tol); - aux_mem[tid] &= (absdiff2 / max_val2 <= rel_tol); - } - } - - i += grid_size; - } - if (i < n_elem) - { - const UWORD elem = i % n_elem_slice; - const UWORD slice = i / n_elem_slice; - const UWORD row = elem % n_rows; - const UWORD col = elem / n_rows; - - const UWORD A_loc = row + col * A_M_n_rows + slice * A_M_n_rows * A_M_n_cols; - const UWORD B_loc = row + col * B_M_n_rows + slice * B_M_n_rows * B_M_n_cols; - - const eT1 A_val = A_mem[A_loc]; - const eT1 B_val = B_mem[B_loc]; - - if (coot_isnan(A_val) || coot_isnan(B_val)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff = coot_absdiff(A_val, B_val); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val = max(ET1_ABS(A_val), ET1_ABS(B_val)); - - if (max_val >= TO_ET1(1)) - { - aux_mem[tid] &= (absdiff <= rel_tol * max_val); - } - else - { - aux_mem[tid] &= (absdiff / max_val <= rel_tol); - } - } - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_small.cu deleted file mode 100644 index e16e98b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_small.cu +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,approx_equal_small)(uint* out_mem, - const eT1* A_mem, - const UWORD A_M_n_rows, - const eT1* B_mem, - const UWORD B_M_n_rows, - const UWORD n_rows, - const UWORD n_elem, - const UWORD mode, - const eT1 abs_tol, - const eT1 rel_tol) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 1; - - while (i + blockDim.x < n_elem) - { - // A bit painful... - const UWORD row1 = i % n_rows; - const UWORD col1 = i / n_rows; - const UWORD row2 = (i + blockDim.x) % n_rows; - const UWORD col2 = (i + blockDim.x) / n_rows; - - const UWORD A_loc1 = row1 + col1 * A_M_n_rows; - const UWORD A_loc2 = row2 + col2 * A_M_n_rows; - const UWORD B_loc1 = row1 + col1 * B_M_n_rows; - const UWORD B_loc2 = row2 + col2 * B_M_n_rows; - - const eT1 A_val1 = A_mem[A_loc1]; - const eT1 B_val1 = B_mem[B_loc1]; - const eT1 A_val2 = A_mem[A_loc2]; - const eT1 B_val2 = B_mem[B_loc2]; - - if (coot_isnan(A_val1) || coot_isnan(B_val1) || coot_isnan(A_val2) || coot_isnan(B_val2)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff1 = coot_absdiff(A_val1, B_val1); - const eT1 absdiff2 = coot_absdiff(A_val2, B_val2); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff1 <= abs_tol); - aux_mem[tid] &= (absdiff2 <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val1 = max(ET1_ABS(A_val1), ET1_ABS(B_val1)); - const eT1 max_val2 = max(ET1_ABS(A_val2), ET1_ABS(B_val2)); - - if (max_val1 >= TO_ET1(1)) - { - aux_mem[tid] &= (absdiff1 <= rel_tol * max_val1); - aux_mem[tid] &= (absdiff2 <= rel_tol * max_val2); - } - else - { - aux_mem[tid] &= (absdiff1 / max_val1 <= rel_tol); - aux_mem[tid] &= (absdiff2 / max_val2 <= rel_tol); - } - } - - i += grid_size; - } - if (i < n_elem) - { - const UWORD row = i % n_rows; - const UWORD col = i / n_rows; - - const UWORD A_loc = row + col * A_M_n_rows; - const UWORD B_loc = row + col * B_M_n_rows; - - const eT1 A_val = A_mem[A_loc]; - const eT1 B_val = B_mem[B_loc]; - - if (coot_isnan(A_val) || coot_isnan(B_val)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff = coot_absdiff(A_val, B_val); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val = max(ET1_ABS(A_val), ET1_ABS(B_val)); - - if (max_val >= TO_ET1(1)) - { - aux_mem[tid] &= (absdiff <= rel_tol * max_val); - } - else - { - aux_mem[tid] &= (absdiff / max_val <= rel_tol); - } - } - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/count_nonzeros.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/count_nonzeros.cu deleted file mode 100644 index 76a44bc..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/count_nonzeros.cu +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,count_nonzeros)(const eT1* A, - UWORD* thread_counts, - const UWORD n_elem) - { - // We want to pass over the memory in A and count the number of nonzero elements. - // This will give us a count for each individual thread; we then want to prefix-sum this. - // This kernel is meant to be used as the first part of find(). 
- - UWORD* aux_mem = (UWORD*) aux_shared_mem; // should have size equal to num_threads + 1 - - const UWORD tid = threadIdx.x; - - const UWORD num_threads = blockDim.x; - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD local_count = 0; - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - if (A[i] != TO_ET1(0)) - { - ++local_count; - } - if (A[i + 1] != TO_ET1(0)) - { - ++local_count; - } - - i += 2; - } - if (i < end_elem) - { - if (A[i] != TO_ET1(0)) - { - ++local_count; - } - } - - // Aggregate the counts for all threads. - aux_mem[tid] = local_count; - __syncthreads(); - - // Up-sweep total sum into final element. - UWORD offset = 1; - - for (UWORD s = num_threads / 2; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - // Set the last element correctly. - thread_counts[num_threads] = aux_mem[num_threads - 1]; - aux_mem[num_threads - 1] = 0; - } - __syncthreads(); - - // Down-sweep to build prefix sum. 
- for (UWORD s = 1; s <= num_threads / 2; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - thread_counts[tid] = aux_mem[tid]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find.cu deleted file mode 100644 index e9fc1fa..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find.cu +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,find)(const eT1* A, - const UWORD* thread_counts, - UWORD* out, - const UWORD n_elem) - { - // Our goal is to fill `out` with the indices of nonzero values. - // Since the kernel is multithreaded, each thread will handle a different (contiguous) part of `A`. - // We expect that we already have the starting position for each thread in `thread_counts`. - // (It should have been filled with the `count_nonzeros` kernel.) 
- - const UWORD tid = threadIdx.x; - - const UWORD num_threads = blockDim.x; - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD out_index = thread_counts[tid]; - - UWORD i = start_elem; - - while (i + 1 < end_elem) - { - if (A[i] != TO_ET1(0)) - { - out[out_index++] = i; - } - if (A[i + 1] != TO_ET1(0)) - { - out[out_index++] = (i + 1); - } - - i += 2; - } - if (i < end_elem) - { - if (A[i] != TO_ET1(0)) - { - out[out_index++] = i; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find_first.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find_first.cu deleted file mode 100644 index 750610c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find_first.cu +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,find_first)(const eT1* A, - const UWORD* thread_counts, - UWORD* out, - const UWORD k, - const UWORD n_elem) - { - // Our goal is to fill `out` with the first `k` indices of nonzero values. - // It is assumed that `k != 0`; if `k` is `0`, use the `find` kernel instead. 
- // Since the kernel is multithreaded, each thread will handle a different (contiguous) part of `A`. - // We expect that we already have the starting position for each thread in `thread_counts`. - // (It should have been filled with the `count_nonzeros` kernel.) - - const UWORD tid = threadIdx.x; - - const UWORD num_threads = blockDim.x; - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD out_index = thread_counts[tid]; - - UWORD i = start_elem; - - // We only want to find the first k points. - if (out_index < k) - { - while (i + 1 < end_elem) - { - if (A[i] != TO_ET1(0) && out_index < k) - { - out[out_index++] = i; - } - if (A[i + 1] != TO_ET1(0) && out_index < k) - { - out[out_index++] = (i + 1); - } - - i += 2; - } - if (i < end_elem) - { - if (A[i] != TO_ET1(0) && out_index < k) - { - out[out_index++] = i; - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find_last.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find_last.cu deleted file mode 100644 index e19e0fc..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find_last.cu +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,find_last)(const eT1* A, - const UWORD* thread_counts, - UWORD* out, - const UWORD m, - const UWORD n_elem) - { - // Our goal is to fill `out` with the last `k` indices of nonzero values. - // (Note that to match Armadillo's behavior, we want the last `k` indices in ascending order.) - // Instead of accepting `k` as a parameter, we instead accept `m = nnz - k`. - // This gives us the first index we should be putting an output value in. - // It is also assumed that `k != 0`; if `k` is `0`, use the `find` kernel instead. - - // Since the kernel is multithreaded, each thread will handle a different (contiguous) part of `A`. - // We expect that we already have the starting position for each thread in `thread_counts`. - // (It should have been filled with the `count_nonzeros` kernel.) - - const UWORD tid = threadIdx.x; - - const UWORD num_threads = blockDim.x; - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD out_index = thread_counts[tid]; - UWORD last_out_index = thread_counts[tid + 1]; - - UWORD i = start_elem; - - // We only want to find points with index `m` or higher. 
- if (last_out_index >= m) - { - while (i + 1 < end_elem) - { - if (A[i] != TO_ET1(0)) - { - if (out_index >= m) - { - out[out_index - m] = i; - } - - ++out_index; - } - if (A[i + 1] != TO_ET1(0)) - { - if (out_index >= m) - { - out[out_index - m] = (i + 1); - } - - ++out_index; - } - - i += 2; - } - - if (i < end_elem) - { - if (A[i] != TO_ET1(0)) - { - if (out_index >= m) - { - out[out_index - m] = i; - } - - ++out_index; - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max.cu deleted file mode 100644 index 7d7cf60..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max.cu +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__device__ -void -COOT_FN(PREFIX,index_max_subgroup_reduce)(volatile eT1* data, volatile UWORD* uword_data, int tid) - { - if ((const eT1) data[tid + 32] > (const eT1) data[tid]) - { - data[tid] = data[tid + 32]; - uword_data[tid] = uword_data[tid + 32]; - } - - if ((const eT1) data[tid + 16] > (const eT1) data[tid]) - { - data[tid] = data[tid + 16]; - uword_data[tid] = uword_data[tid + 16]; - } - - if ((const eT1) data[tid + 8] > (const eT1) data[tid]) - { - data[tid] = data[tid + 8]; - uword_data[tid] = uword_data[tid + 8]; - } - - if ((const eT1) data[tid + 4] > (const eT1) data[tid]) - { - data[tid] = data[tid + 4]; - uword_data[tid] = uword_data[tid + 4]; - } - - if ((const eT1) data[tid + 2] > (const eT1) data[tid]) - { - data[tid] = data[tid + 2]; - uword_data[tid] = uword_data[tid + 2]; - } - - if ((const eT1) data[tid + 1] > (const eT1) data[tid]) - { - data[tid] = data[tid + 1]; - uword_data[tid] = uword_data[tid + 1]; - } - } - - - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,index_max)(const eT1* in_mem, - const UWORD* in_uword_mem, - const UWORD use_uword_mem, - const UWORD n_elem, - eT1* out_mem, - UWORD* out_uword_mem, - const UWORD uword_aux_mem_start) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - UWORD* aux_uword_mem = (UWORD*) (aux_shared_mem + uword_aux_mem_start); - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = coot_type_min(TO_ET1(0)); - aux_uword_mem[tid] = SIZE_MAX; - - if (i < n_elem) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? 
in_uword_mem[i] : i); - } - if (i + blockDim.x < n_elem) - { - if (in_mem[i + blockDim.x] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[i + blockDim.x]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i + blockDim.x] : (i + blockDim.x)); - } - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - if (in_mem[i] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i] : i); - } - - if (in_mem[i + blockDim.x] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[i + blockDim.x]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i + blockDim.x] : (i + blockDim.x)); - } - - i += grid_size; - } - if (i < n_elem) - { - if (in_mem[i] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i] : i); - } - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - if (aux_mem[tid + s] > aux_mem[tid]) - { - aux_mem[tid] = aux_mem[tid + s]; - aux_uword_mem[tid] = aux_uword_mem[tid + s]; - } - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,index_max_subgroup_reduce)(aux_mem, aux_uword_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - out_uword_mem[blockIdx.x] = aux_uword_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_colwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_colwise.cu deleted file mode 100644 index 5c71089..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_colwise.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,index_max_colwise)(UWORD* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[col * src_M_n_rows]); - eT1 best_val = colptr[0]; - UWORD best_index = 0; - for (UWORD i = 1; i < n_rows; ++i) - { - if (colptr[i] > best_val) - { - best_val = colptr[i]; - best_index = i; - } - } - - dest[col * dest_mem_incr] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_cube_col.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_cube_col.cu deleted file mode 100644 index 68f4188..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_cube_col.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,index_max_cube_col)(UWORD* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD slice = blockIdx.y * blockDim.y + threadIdx.y; - - if(row < n_rows && slice < n_slices) - { - eT1 best_val = src[row + slice * n_rows * n_cols]; - UWORD best_index = 0; - for (UWORD i = 1; i < n_cols; ++i) - { - if (src[(i * n_rows) + row + slice * n_rows * n_cols] > best_val) - { - best_val = src[(i * n_rows) + row + slice * n_rows * n_cols]; - best_index = i; - } - } - - dest[row + slice * n_rows] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_rowwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_rowwise.cu deleted file mode 100644 index 3ae5cc2..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_rowwise.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,index_max_rowwise)(UWORD* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT1 best_val = src[row]; - UWORD best_index = 0; - for (UWORD i = 1; i < n_cols; ++i) - { - if (src[(i * src_M_n_rows) + row] > best_val) - { - best_val = src[(i * src_M_n_rows) + row]; - best_index = i; - } - } - - dest[row * dest_mem_incr] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_small.cu deleted file mode 100644 index 2c1ca36..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_small.cu +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,index_max_small)(const eT1* in_mem, - const UWORD* in_uword_mem, - const UWORD use_uword_mem, - const UWORD n_elem, - eT1* out_mem, - UWORD* out_uword_mem, - const UWORD uword_aux_mem_start) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - UWORD* aux_uword_mem = (UWORD*) (aux_shared_mem + uword_aux_mem_start); - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = coot_type_max(TO_ET1(0)); - aux_uword_mem[tid] = SIZE_MAX; - - if (i < n_elem) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i] : i); - } - if (i + blockDim.x < n_elem) - { - if (in_mem[i + blockDim.x] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[i + blockDim.x]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i + blockDim.x] : (i + blockDim.x)); - } - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - if (in_mem[i] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i] : i); - } - - if (in_mem[i + blockDim.x] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[i + blockDim.x]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i + blockDim.x] : (i + blockDim.x)); - } - - i += grid_size; - } - if (i < n_elem) - { - if (in_mem[i] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? 
in_uword_mem[i] : i); - } - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - if (aux_mem[tid + s] > aux_mem[tid]) - { - aux_mem[tid] = aux_mem[tid + s]; - aux_uword_mem[tid] = aux_uword_mem[tid + s]; - } - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - out_uword_mem[blockIdx.x] = aux_uword_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min.cu deleted file mode 100644 index ae377c2..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min.cu +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__device__ -void -COOT_FN(PREFIX,index_min_subgroup_reduce)(volatile eT1* data, volatile UWORD* uword_data, int tid) - { - if ((const eT1) data[tid + 32] < (const eT1) data[tid]) - { - data[tid] = data[tid + 32]; - uword_data[tid] = uword_data[tid + 32]; - } - - if ((const eT1) data[tid + 16] < (const eT1) data[tid]) - { - data[tid] = data[tid + 16]; - uword_data[tid] = uword_data[tid + 16]; - } - - if ((const eT1) data[tid + 8] < (const eT1) data[tid]) - { - data[tid] = data[tid + 8]; - uword_data[tid] = uword_data[tid + 8]; - } - - if ((const eT1) data[tid + 4] < (const eT1) data[tid]) - { - data[tid] = data[tid + 4]; - uword_data[tid] = uword_data[tid + 4]; - } - - if ((const eT1) data[tid + 2] < (const eT1) data[tid]) - { - data[tid] = data[tid + 2]; - uword_data[tid] = uword_data[tid + 2]; - } - - if ((const eT1) data[tid + 1] < (const eT1) data[tid]) - { - data[tid] = data[tid + 1]; - uword_data[tid] = uword_data[tid + 1]; - } - } - - - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,index_min)(const eT1* in_mem, - const UWORD* in_uword_mem, - const UWORD use_uword_mem, - const UWORD n_elem, - eT1* out_mem, - UWORD* out_uword_mem, - const UWORD uword_aux_mem_start) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - UWORD* aux_uword_mem = (UWORD*) (aux_shared_mem + uword_aux_mem_start); - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = coot_type_max(TO_ET1(0)); - aux_uword_mem[tid] = SIZE_MAX; - - if (i < n_elem) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? 
in_uword_mem[i] : i); - } - if (i + blockDim.x < n_elem) - { - if (in_mem[i + blockDim.x] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[i + blockDim.x]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i + blockDim.x] : (i + blockDim.x)); - } - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - if (in_mem[i] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i] : i); - } - - if (in_mem[i + blockDim.x] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[i + blockDim.x]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i + blockDim.x] : (i + blockDim.x)); - } - - i += grid_size; - } - if (i < n_elem) - { - if (in_mem[i] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i] : i); - } - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - if (aux_mem[tid + s] < aux_mem[tid]) - { - aux_mem[tid] = aux_mem[tid + s]; - aux_uword_mem[tid] = aux_uword_mem[tid + s]; - } - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,index_min_subgroup_reduce)(aux_mem, aux_uword_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - out_uword_mem[blockIdx.x] = aux_uword_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_colwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_colwise.cu deleted file mode 100644 index 262e1b1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_colwise.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,index_min_colwise)(UWORD* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[col * src_M_n_rows]); - eT1 best_val = colptr[0]; - UWORD best_index = 0; - for (UWORD i = 1; i < n_rows; ++i) - { - if (colptr[i] < best_val) - { - best_val = colptr[i]; - best_index = i; - } - } - - dest[col * dest_mem_incr] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_cube_col.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_cube_col.cu deleted file mode 100644 index 84eb56a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_cube_col.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,index_min_cube_col)(UWORD* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD slice = blockIdx.y * blockDim.y + threadIdx.y; - - if(row < n_rows && slice < n_slices) - { - eT1 best_val = src[row + slice * n_rows * n_cols]; - UWORD best_index = 0; - for (UWORD i = 1; i < n_cols; ++i) - { - if (src[(i * n_rows) + row + slice * n_rows * n_cols] < best_val) - { - best_val = src[(i * n_rows) + row + slice * n_rows * n_cols]; - best_index = i; - } - } - - dest[row + slice * n_rows] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_rowwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_rowwise.cu deleted file mode 100644 index b25c568..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_rowwise.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,index_min_rowwise)(UWORD* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT1 best_val = src[row]; - UWORD best_index = 0; - for (UWORD i = 1; i < n_cols; ++i) - { - if (src[(i * src_M_n_rows) + row] < best_val) - { - best_val = src[(i * src_M_n_rows) + row]; - best_index = i; - } - } - - dest[row * dest_mem_incr] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_small.cu deleted file mode 100644 index d1d0561..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_small.cu +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,index_min_small)(const eT1* in_mem, - const UWORD* in_uword_mem, - const UWORD use_uword_mem, - const UWORD n_elem, - eT1* out_mem, - UWORD* out_uword_mem, - const UWORD uword_aux_mem_start) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - UWORD* aux_uword_mem = (UWORD*) (aux_shared_mem + uword_aux_mem_start); - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = coot_type_max(TO_ET1(0)); - aux_uword_mem[tid] = SIZE_MAX; - - if (i < n_elem) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i] : i); - } - if (i + blockDim.x < n_elem) - { - if (in_mem[i + blockDim.x] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[i + blockDim.x]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i + blockDim.x] : (i + blockDim.x)); - } - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - if (in_mem[i] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i] : i); - } - - if (in_mem[i + blockDim.x] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[i + blockDim.x]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i + blockDim.x] : (i + blockDim.x)); - } - - i += grid_size; - } - if (i < n_elem) - { - if (in_mem[i] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? 
in_uword_mem[i] : i); - } - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - if (aux_mem[tid + s] < aux_mem[tid]) - { - aux_mem[tid] = aux_mem[tid + s]; - aux_uword_mem[tid] = aux_uword_mem[tid + s]; - } - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - out_uword_mem[blockIdx.x] = aux_uword_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_philox_randn.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_philox_randn.cu deleted file mode 100644 index f27bba0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_philox_randn.cu +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This kernel is a placeholder---we don't use it with the CUDA backend. -// It does nothing. We use cuRand instead. - -__global__ -void -COOT_FN(PREFIX,inplace_philox_randn)(eT1* mem, - unsigned int* philox_state, - const UWORD n_elem, - const fp_eT1 mu, - const fp_eT1 sd) - { - // Do nothing! 
- } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_set_eye.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_set_eye.cu deleted file mode 100644 index 92d3c80..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_set_eye.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_set_eye)(eT1* out, - const UWORD n_rows, - const UWORD n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - if( (row < n_rows) && (col < n_cols) ) - { - const UWORD offset = row + col * n_rows; - out[offset] = (row == col) ? TO_ET1(1) : TO_ET1(0); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow32_randi.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow32_randi.cu deleted file mode 100644 index 5cee0a8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow32_randi.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This kernel is a placeholder---we don't use it with the CUDA backend. -// It does nothing. We use cuRand instead. - -__global__ -void -COOT_FN(PREFIX,inplace_xorwow32_randi)(eT1* mem, - uint* xorwow_state, - const UWORD n_elem, - const eT1 lo, - const uint_eT1 range, - const bool needs_modulo) - { - // Do nothing! - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow32_randu.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow32_randu.cu deleted file mode 100644 index 45c251f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow32_randu.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This kernel is a placeholder---we don't use it with the CUDA backend. -// It does nothing. We use cuRand instead. 
- -__global__ -void -COOT_FN(PREFIX,inplace_xorwow32_randu)(eT1* mem, - uint* xorwow_state, - const UWORD n_elem) - { - // Do nothing! - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow64_randi.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow64_randi.cu deleted file mode 100644 index 817ebff..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow64_randi.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This kernel is a placeholder---we don't use it with the CUDA backend. -// It does nothing. We use cuRand instead. - -__global__ -void -COOT_FN(PREFIX,inplace_xorwow64_randi)(eT1* mem, - ulong* xorwow_state, - const UWORD n_elem, - const eT1 lo, - const uint_eT1 range, - const bool needs_modulo) - { - // Do nothing! 
- } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow64_randu.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow64_randu.cu deleted file mode 100644 index d3b7318..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow64_randu.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This kernel is a placeholder---we don't use it with the CUDA backend. -// It does nothing. We use cuRand instead. - -__global__ -void -COOT_FN(PREFIX,inplace_xorwow64_randu)(eT1* mem, - ulong* xorwow_state, - const UWORD n_elem) - { - // Do nothing! - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/linspace.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/linspace.cu deleted file mode 100644 index 8d14282..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/linspace.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2021 Marcus Edel (http://www.kurg.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ -__global__ -void -COOT_FN(PREFIX,linspace)(eT1* out_mem, - const UWORD mem_incr, - const eT1 start, - const eT1 end, - const eT1 step, - const UWORD num) - { - UWORD idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num) - { - out_mem[idx * mem_incr] = TO_ET1(start + step * (TO_ET1(idx))); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/logspace.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/logspace.cu deleted file mode 100644 index c0314a6..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/logspace.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ -__global__ -void -COOT_FN(PREFIX,logspace)(eT1* out_mem, - const UWORD mem_incr, - const eT1 start, - const eT1 end, - const eT1 step, - const UWORD num) - { - UWORD idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num) - { - out_mem[idx * mem_incr] = TO_ET1(pow(TO_FP_ET1(10), TO_FP_ET1(start + step * (TO_ET1(idx))))); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/ltri_set_zero.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/ltri_set_zero.cu deleted file mode 100644 index a0e56d7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/ltri_set_zero.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,ltri_set_zero)(eT1* out, - const UWORD n_rows, - const UWORD n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD index = col * n_rows + row; - if ( (row < n_rows) && (col < n_cols) && (row > col) ) - { - out[index] = TO_ET1(0); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max.cu deleted file mode 100644 index c3190bd..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max.cu +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,max)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. 
- aux_mem[tid] = coot_type_min(TO_ET1(0)); - - if (i < n_elem) - { - aux_mem[tid] = in_mem[i]; - } - if (i + blockDim.x < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[i + blockDim.x]); - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[i]); - aux_mem[tid] = max(aux_mem[tid], in_mem[i + blockDim.x]); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[i]); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = max(aux_mem[tid], aux_mem[tid + s]); - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,max_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_abs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_abs.cu deleted file mode 100644 index ce87fbd..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_abs.cu +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,max_abs)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = coot_type_min(TO_ET1(0)); - - if (i < n_elem) - { - aux_mem[tid] = ET1_ABS(in_mem[i]); - } - if (i + blockDim.x < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], ET1_ABS(in_mem[i + blockDim.x])); - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], ET1_ABS(in_mem[i])); - aux_mem[tid] = max(aux_mem[tid], ET1_ABS(in_mem[i + blockDim.x])); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], ET1_ABS(in_mem[i])); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = max(aux_mem[tid], aux_mem[tid + s]); - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,max_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_abs_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_abs_small.cu deleted file mode 100644 index 9ce7022..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_abs_small.cu +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,max_abs_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = coot_type_min(TO_ET1(0)); - - if (i < n_elem) - { - aux_mem[tid] = ET1_ABS(in_mem[i]); - } - if (i + blockDim.x < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], ET1_ABS(in_mem[i + blockDim.x])); - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], ET1_ABS(in_mem[i])); - aux_mem[tid] = max(aux_mem[tid], ET1_ABS(in_mem[i + blockDim.x])); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], ET1_ABS(in_mem[i])); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = max(aux_mem[tid], aux_mem[tid + s]); - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_small.cu deleted file mode 100644 index 63b8f75..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_small.cu +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, 
Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,max_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = coot_type_min(TO_ET1(0)); - - if (i < n_elem) - { - aux_mem[tid] = in_mem[i]; - } - if (i + blockDim.x < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[i + blockDim.x]); - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[i]); - aux_mem[tid] = max(aux_mem[tid], in_mem[i + blockDim.x]); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[i]); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = max(aux_mem[tid], aux_mem[tid + s]); - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/min.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/min.cu deleted file mode 100644 index 63877c8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/min.cu +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2021 Ryan Curtin (http://www.ratml.org/) -// -// 
Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,min)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. 
- aux_mem[tid] = coot_type_max(TO_ET1(0)); - - if (i < n_elem) - { - aux_mem[tid] = in_mem[i]; - } - if (i + blockDim.x < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[i + blockDim.x]); - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[i]); - aux_mem[tid] = min(aux_mem[tid], in_mem[i + blockDim.x]); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[i]); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = min(aux_mem[tid], aux_mem[tid + s]); - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,min_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/min_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/min_small.cu deleted file mode 100644 index 4d44a67..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/min_small.cu +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright 2021 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,min_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = coot_type_max(TO_ET1(0)); - - if (i < n_elem) - { - aux_mem[tid] = in_mem[i]; - } - if (i + blockDim.x < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[i + blockDim.x]); - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[i]); - aux_mem[tid] = min(aux_mem[tid], in_mem[i + blockDim.x]); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[i]); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = min(aux_mem[tid], aux_mem[tid + s]); - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_colwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_colwise.cu deleted file mode 100644 index a6e3259..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_colwise.cu +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// multiply each column in `in` by the corresponding value in `A` -__global__ -void -COOT_FN(PREFIX,mul_colwise)(eT1* out, - const eT1* A, // expected to have length n_cols - const UWORD A_incr, - const eT1* in, - const eT1 alpha, // scalar to multiply - const UWORD n_rows, // size of `out` and `in` - const UWORD n_cols, - const UWORD in_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const UWORD in_offset = col * in_M_n_rows; - const UWORD out_offset = col * n_rows; - const eT1 val = alpha * A[col * A_incr]; - for (UWORD i = 0; i < n_rows; ++i) - { - out[i + out_offset] = val * in[i + in_offset]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_colwise_trans.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_colwise_trans.cu deleted file mode 100644 index 3448d85..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_colwise_trans.cu +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// multiply each column in `trans(in)` by the corresponding value in `A` -__global__ -void -COOT_FN(PREFIX,mul_colwise_trans)(eT1* out, - const eT1* A, // expected to have length n_cols - const UWORD A_incr, - const eT1* in, - const eT1 alpha, // scalar to multiply - const UWORD n_rows, // size of `out` - const UWORD n_cols, - const UWORD in_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1 val = alpha * A[col]; - for (UWORD i = 0; i < n_rows; ++i) - { - const UWORD in_offset = col + i * in_M_n_rows; - const UWORD out_offset = col * n_rows + i; - out[out_offset] = val * in[in_offset]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_rowwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_rowwise.cu deleted file mode 100644 index 7a85015..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_rowwise.cu +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// multiply each row in `in` by the corresponding value in `A` -__global__ -void -COOT_FN(PREFIX,mul_rowwise)(eT1* out, - const eT1* A, // expected to have length n_rows - const UWORD A_incr, - const eT1* in, - const eT1 alpha, // scalar to multiply - const UWORD n_rows, // size of `out` and `in` - const UWORD n_cols, - const UWORD in_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - const eT1 val = alpha * A[row * A_incr]; - for (UWORD i = 0; i < n_cols; ++i) - { - const UWORD out_offset = i * n_rows + row; - const UWORD in_offset = i * in_M_n_rows + row; - out[out_offset] = val * in[in_offset]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_rowwise_trans.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_rowwise_trans.cu deleted file mode 100644 index cc910ec..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_rowwise_trans.cu +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// multiply each row in `trans(in)` by the corresponding value in `A` -__global__ -void -COOT_FN(PREFIX,mul_rowwise_trans)(eT1* out, - const eT1* A, // expected to have length n_rows - const UWORD A_incr, - const eT1* in, - const eT1 alpha, // scalar to multiply - const UWORD n_rows, // size of `out` - const UWORD n_cols, - const UWORD in_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - const eT1 val = alpha * A[row * A_incr]; - for (UWORD i = 0; i < n_cols; ++i) - { - const UWORD in_offset = i + row * in_M_n_rows; - const UWORD out_offset = i * n_rows + row; - out[out_offset] = val * in[in_offset]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/prod.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/prod.cu deleted file mode 100644 index 1e5ef8d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/prod.cu +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,prod)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 1; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - aux_mem[tid] *= in_mem[i] * in_mem[i + blockDim.x]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] *= in_mem[i]; - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] *= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,prod_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/prod_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/prod_small.cu deleted file mode 100644 index 8c52cdf..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/prod_small.cu +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,prod_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 1; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - aux_mem[tid] *= in_mem[i] * in_mem[i + blockDim.x]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] *= in_mem[i]; - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] *= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_asc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_asc.cu deleted file mode 100644 index e170f7e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_asc.cu +++ /dev/null @@ -1,281 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,radix_sort_asc)(eT1* A, - eT1* tmp_mem, - const UWORD n_elem) - { - UWORD* aux_mem = (UWORD*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - - const UWORD num_threads = blockDim.x; - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD local_counts[2]; - - eT1* unsorted_memptr = A; - eT1* sorted_memptr = tmp_mem; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = coot_is_signed(TO_ET1(0)) ? 8 * sizeof(eT1) - 1 : 8 * sizeof(eT1); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. - uint_eT1* memptr = reinterpret_cast(unsorted_memptr); - - local_counts[0] = 0; // holds the count of elements with bit value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> b]; - ++local_counts[(memptr[i + 1] & mask) >> b]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> b]; - } - - // Step 2: aggregate the counts for all threads. - aux_mem[tid ] = local_counts[0]; - aux_mem[tid + num_threads] = local_counts[1]; - __syncthreads(); - - // At the end of this, `tid` should be assigned two sections of memory to put its 0 and 1 points in. 
- // aux_mem[tid] should hold the first place to put a 0 point - // aux_mem[tid + num_threads] should hold the first place to put a 1 point - // More specifically: - // aux_mem[tid] := sum_{i = 0}^{tid - 1} aux_mem[i] - // aux_mem[tid + num_threads] := sum_{i = 0}^{tid + num_threads - 1} aux_mem[i] - // which means that this is just a prefix-sum operation on the full length of aux_mem. - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - __syncthreads(); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Step 3: move points into the correct place. - local_counts[0] = aux_mem[tid ]; // contains the first place we should put a 0 point - local_counts[1] = aux_mem[tid + num_threads]; // contains the first place we should put a 1 point - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b)]++; - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index] = val; - } - - // Now swap pointers. 
- eT1* tmp = unsorted_memptr; - unsorted_memptr = sorted_memptr; - sorted_memptr = tmp; - - __syncthreads(); - } - - // If the type is integral, we're now done---we don't have to handle a sign bit differently. - if (!coot_is_signed(TO_ET1(0))) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. - uint_eT1* memptr = reinterpret_cast(unsorted_memptr); - local_counts[0] = 0; - local_counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> last_bit]; - ++local_counts[(memptr[i + 1] & mask) >> last_bit]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> last_bit]; - } - - // local_counts[0] now holds the number of positive points; local_counts[1] holds the number of negative points - // swap these and perform a prefix sum, as with the rest of the bits - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - __syncthreads(); - - // Up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - __syncthreads(); - - // Down-sweep to build prefix sum. 
- for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Step 3: move points into the correct place. - // This is different for integral and floating point types. - if (coot_is_fp(TO_ET1(0))) - { - // Floating point implementation: - // For negative values, we have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. - local_counts[0] = aux_mem[tid + num_threads]; // contains the first place we should put a 0 point (we will move upwards) - local_counts[1] = aux_mem[num_threads] - aux_mem[tid]; // contains the first place we should put a 1 point (we will move downwards) - local_counts[1] = (local_counts[1] == 0) ? 0 : local_counts[1] - 1; // avoid underflow - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]; - const int offset1 = (bit_val1 == 1) ? -1 : 1; - local_counts[bit_val1] += offset1; // decrements for negative values, increments for positive values - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]; - const int offset2 = (bit_val2 == 1) ? 
-1 : 1; - local_counts[bit_val2] += offset2; // decrements for negative values, increments for positive values - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]; - const int offset = (bit_val == 1) ? -1 : 1; - local_counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_memptr[out_index] = val; - } - } - else - { - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the negative values ahead of the positive values. - local_counts[0] = aux_mem[tid + num_threads]; - local_counts[1] = aux_mem[tid]; - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]++; - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]++; - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]++; - sorted_memptr[out_index] = val; - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. 
- } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_colwise_asc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_colwise_asc.cu deleted file mode 100644 index b42eb8d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_colwise_asc.cu +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,radix_sort_colwise_asc)(eT1* A, - eT1* tmp_mem, - const UWORD A_n_rows, - const UWORD A_n_cols, - const UWORD A_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < A_n_cols) - { - eT1* unsorted_colptr = &A[col * A_M_n_rows]; - eT1* sorted_colptr = &tmp_mem[col * A_n_rows]; - - UWORD counts[2]; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = coot_is_signed(TO_ET1(0)) ? (8 * sizeof(eT1) - 1) : (8 * sizeof(eT1)); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Since we are sorting bitwise, we should treat the data as unsigned integers to make bitwise operations easy. 
- uint_eT1* colptr = reinterpret_cast(unsorted_colptr); - - counts[0] = 0; // holds the count of points with bit value 0 - counts[1] = 0; // holds the count of points with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - for (UWORD i = 0; i < A_n_rows; ++i) - { - ++counts[((colptr[i] & mask) >> b)]; - } - - counts[1] = counts[0]; // now holds the offset to put the next value at - counts[0] = 0; - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD out_index = counts[((colptr[i] & mask) >> b)]++; - sorted_colptr[out_index] = val; - } - - // swap pointers (unsorted is now sorted) - eT1* tmp = unsorted_colptr; - unsorted_colptr = sorted_colptr; - sorted_colptr = tmp; - } - - // If the type is unsigned, we're now done---we don't have to handle a sign bit differently. - if (!coot_is_signed(TO_ET1(0))) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. - uint_eT1* colptr = reinterpret_cast(unsorted_colptr); - counts[0] = 0; - counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - for (UWORD i = 0; i < A_n_rows; ++i) - { - ++counts[((colptr[i] & mask) >> last_bit)]; - } - // counts[0] now holds the number of positive points; counts[1] holds the number of negative points - - // This is different for integral and floating point types. - if (coot_is_fp(TO_ET1(0))) - { - // Floating point implementation: - // For negative values, we have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. 
- counts[0] = counts[1]; // now holds the offset to put the next positive value at - counts[1] = counts[0] - 1; // now holds the offset to put the next negative value at (we move backwards) - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD bit_val = ((colptr[i] & mask) >> last_bit); - const UWORD out_index = counts[bit_val]; - const int offset = (bit_val == 1) ? -1 : 1; - counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_colptr[out_index] = val; - } - } - else - { - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the negative values ahead of the positive values. - counts[0] = counts[1]; // now holds the offset to put the next positive value at - counts[1] = 0; // now holds the offset to put the next negative value at - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD bit_val = ((colptr[i] & mask) >> last_bit); - const UWORD out_index = counts[bit_val]++; - sorted_colptr[out_index] = val; - } - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_colwise_desc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_colwise_desc.cu deleted file mode 100644 index 6bd3959..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_colwise_desc.cu +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,radix_sort_colwise_desc)(eT1* A, - eT1* tmp_mem, - const UWORD A_n_rows, - const UWORD A_n_cols, - const UWORD A_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < A_n_cols) - { - eT1* unsorted_colptr = &A[col * A_M_n_rows]; - eT1* sorted_colptr = &tmp_mem[col * A_n_rows]; - - UWORD counts[2]; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = coot_is_signed(TO_ET1(0)) ? (8 * sizeof(eT1) - 1) : (8 * sizeof(eT1)); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Since we are sorting bitwise, we should treat the data as unsigned integers to make bitwise operations easy. - uint_eT1* colptr = reinterpret_cast(unsorted_colptr); - - counts[0] = 0; // holds the count of points with bit value 0 - counts[1] = 0; // holds the count of points with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - for (UWORD i = 0; i < A_n_rows; ++i) - { - ++counts[((colptr[i] & mask) >> b)]; - } - - // Since we are sorting in descending order, 1-valued points come first. 
- counts[0] = counts[1]; - counts[1] = 0; - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD out_index = counts[((colptr[i] & mask) >> b)]++; - sorted_colptr[out_index] = val; - } - - // swap pointers (unsorted is now sorted) - eT1* tmp = unsorted_colptr; - unsorted_colptr = sorted_colptr; - sorted_colptr = tmp; - } - - // If the type is unsigned, we're now done---we don't have to handle a sign bit differently. - if (!coot_is_signed(TO_ET1(0))) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. - uint_eT1* colptr = reinterpret_cast(unsorted_colptr); - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - // This is different for integral and floating point types. - if (coot_is_fp(TO_ET1(0))) - { - // Floating point implementation: - // For negative values, we have things sorted in reverse order, so we need to reverse that in our final swap pass. - counts[0] = 0; - counts[1] = A_n_rows - 1; // points to the last element - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD bit_val = ((colptr[i] & mask) >> last_bit); - const UWORD out_index = counts[bit_val]; - const int offset = (bit_val == 1) ? 
-1 : 1; - counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_colptr[out_index] = val; - } - } - else - { - counts[0] = 0; - counts[1] = 0; - - for (UWORD i = 0; i < A_n_rows; ++i) - { - ++counts[((colptr[i] & mask) >> last_bit)]; - } - // counts[0] now holds the number of positive points; counts[1] holds the number of negative points - - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the positive values ahead of the negative values. - counts[1] = counts[0]; // now holds the offset to put the next negative value at - counts[0] = 0; // now holds the offset to put the next positive value at - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD bit_val = ((colptr[i] & mask) >> last_bit); - const UWORD out_index = counts[bit_val]++; - sorted_colptr[out_index] = val; - } - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_desc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_desc.cu deleted file mode 100644 index bb1ec97..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_desc.cu +++ /dev/null @@ -1,281 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,radix_sort_desc)(eT1* A, - eT1* tmp_mem, - const UWORD n_elem) - { - UWORD* aux_mem = (UWORD*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - - const UWORD num_threads = blockDim.x; - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD local_counts[2]; - - eT1* unsorted_memptr = A; - eT1* sorted_memptr = tmp_mem; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = coot_is_signed(TO_ET1(0)) ? 8 * sizeof(eT1) - 1 : 8 * sizeof(eT1); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. - uint_eT1* memptr = reinterpret_cast(unsorted_memptr); - - local_counts[0] = 0; // holds the count of elements with bit value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> b]; - ++local_counts[(memptr[i + 1] & mask) >> b]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> b]; - } - - // Step 2: aggregate the counts for all threads. - // Since we want the largest values to come first, the first entry will be the count of 1s. - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - __syncthreads(); - - // At the end of this, `tid` should be assigned two sections of memory to put its 0 and 1 points in. 
- // aux_mem[tid] should hold the first place to put a 1 point - // aux_mem[tid + num_threads] should hold the first place to put a 0 point - // More specifically: - // aux_mem[tid] := sum_{i = 0}^{tid - 1} aux_mem[i] - // aux_mem[tid + num_threads] := sum_{i = 0}^{tid + num_threads - 1} aux_mem[i] - // which means that this is just a prefix-sum operation on the full length of aux_mem. - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - __syncthreads(); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Step 3: move points into the correct place. - local_counts[0] = aux_mem[tid + num_threads]; // contains the first place we should put a 0 point - local_counts[1] = aux_mem[tid ]; // contains the first place we should put a 1 point - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b)]++; - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index] = val; - } - - // Now swap pointers. 
- eT1* tmp = unsorted_memptr; - unsorted_memptr = sorted_memptr; - sorted_memptr = tmp; - - __syncthreads(); - } - - // If the type is integral, we're now done---we don't have to handle a sign bit differently. - if (!coot_is_signed(TO_ET1(0))) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 0-bit values before the 1-bit values (since we are sorting in descending order). - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. - uint_eT1* memptr = reinterpret_cast(unsorted_memptr); - local_counts[0] = 0; - local_counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> last_bit]; - ++local_counts[(memptr[i + 1] & mask) >> last_bit]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> last_bit]; - } - - // local_counts[0] now holds the number of positive points; local_counts[1] holds the number of negative points - // perform a prefix sum, as with the rest of the bits; positive points will come first - aux_mem[tid ] = local_counts[0]; - aux_mem[tid + num_threads] = local_counts[1]; - __syncthreads(); - - // Up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - __syncthreads(); - - // Down-sweep to build prefix sum. 
- for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Step 3: move points into the correct place. - // This is different for integral and floating point types. - if (coot_is_fp(TO_ET1(0))) - { - // Floating point implementation: - // For negative values, we already have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. - local_counts[0] = aux_mem[tid]; // contains the first place we should put a 0 point (we will move upwards) - local_counts[1] = n_elem - 1 - (aux_mem[num_threads + tid] - aux_mem[num_threads]); // contains the first place we should put a 1 point (we will move downwards) - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]; - const int offset1 = (bit_val1 == 1) ? -1 : 1; - local_counts[bit_val1] += offset1; // decrements for negative values, increments for positive values - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]; - const int offset2 = (bit_val2 == 1) ? -1 : 1; - local_counts[bit_val2] += offset2; // decrements for negative values, increments for positive values - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]; - const int offset = (bit_val == 1) ? 
-1 : 1; - local_counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_memptr[out_index] = val; - } - } - else - { - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the positive values ahead of the negative values. - local_counts[0] = aux_mem[tid]; - local_counts[1] = aux_mem[tid + num_threads]; - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]++; - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]++; - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]++; - sorted_memptr[out_index] = val; - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_asc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_asc.cu deleted file mode 100644 index e93347f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_asc.cu +++ /dev/null @@ -1,321 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,radix_sort_index_asc)(eT1* A, - UWORD* A_index, - eT1* tmp_mem, - UWORD* tmp_mem_index, - const UWORD n_elem) - { - UWORD* aux_mem = (UWORD*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - - const UWORD num_threads = blockDim.x; - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - // Fill A_index with [0, 1, ..., n_elem - 1]. - UWORD i = start_elem; - while (i + 1 < end_elem) - { - A_index[i] = i; - A_index[i + 1] = i + 1; - i += 2; - } - if (i < end_elem) - { - A_index[i] = i; - } - - __syncthreads(); - - UWORD local_counts[2]; - - eT1* unsorted_memptr = A; - UWORD* unsorted_index_memptr = A_index; - eT1* sorted_memptr = tmp_mem; - UWORD* sorted_index_memptr = tmp_mem_index; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = coot_is_signed(TO_ET1(0)) ? 8 * sizeof(eT1) - 1 : 8 * sizeof(eT1); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. 
- uint_eT1* memptr = reinterpret_cast(unsorted_memptr); - - local_counts[0] = 0; // holds the count of elements with bit value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> b]; - ++local_counts[(memptr[i + 1] & mask) >> b]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> b]; - } - - // Step 2: aggregate the counts for all threads. - aux_mem[tid ] = local_counts[0]; - aux_mem[tid + num_threads] = local_counts[1]; - __syncthreads(); - - // At the end of this, `tid` should be assigned two sections of memory to put its 0 and 1 points in. - // aux_mem[tid] should hold the first place to put a 0 point - // aux_mem[tid + num_threads] should hold the first place to put a 1 point - // More specifically: - // aux_mem[tid] := sum_{i = 0}^{tid - 1} aux_mem[i] - // aux_mem[tid + num_threads] := sum_{i = 0}^{tid + num_threads - 1} aux_mem[i] - // which means that this is just a prefix-sum operation on the full length of aux_mem. - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - __syncthreads(); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Step 3: move points into the correct place. 
- local_counts[0] = aux_mem[tid ]; // contains the first place we should put a 0 point - local_counts[1] = aux_mem[tid + num_threads]; // contains the first place we should put a 1 point - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b)]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - - // Now swap pointers. - eT1* tmp = unsorted_memptr; - unsorted_memptr = sorted_memptr; - sorted_memptr = tmp; - UWORD* tmp_index = unsorted_index_memptr; - unsorted_index_memptr = sorted_index_memptr; - sorted_index_memptr = tmp_index; - - __syncthreads(); - } - - // If the type is integral, we're now done---we don't have to handle a sign bit differently. - if (!coot_is_signed(TO_ET1(0))) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. 
- uint_eT1* memptr = reinterpret_cast(unsorted_memptr); - local_counts[0] = 0; - local_counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> last_bit]; - ++local_counts[(memptr[i + 1] & mask) >> last_bit]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> last_bit]; - } - - // local_counts[0] now holds the number of positive points; local_counts[1] holds the number of negative points - // swap these and perform a prefix sum, as with the rest of the bits - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - __syncthreads(); - - // Up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - __syncthreads(); - - // Down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Step 3: move points into the correct place. - // This is different for integral and floating point types. - if (coot_is_fp(TO_ET1(0))) - { - // Floating point implementation: - // For negative values, we have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. 
- local_counts[0] = aux_mem[tid + num_threads]; // contains the first place we should put a 0 point (we will move upwards) - local_counts[1] = aux_mem[num_threads] - aux_mem[tid]; // contains the first place we should put a 1 point (we will move downwards) - local_counts[1] = (local_counts[1] == 0) ? 0 : local_counts[1] - 1; // avoid underflow - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]; - const int offset1 = (bit_val1 == 1) ? -1 : 1; - local_counts[bit_val1] += offset1; // decrements for negative values, increments for positive values - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]; - const int offset2 = (bit_val2 == 1) ? -1 : 1; - local_counts[bit_val2] += offset2; // decrements for negative values, increments for positive values - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]; - const int offset = (bit_val == 1) ? -1 : 1; - local_counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - } - else - { - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the negative values ahead of the positive values. 
- local_counts[0] = aux_mem[tid + num_threads]; - local_counts[1] = aux_mem[tid]; - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_desc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_desc.cu deleted file mode 100644 index 90d475f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_desc.cu +++ /dev/null @@ -1,321 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,radix_sort_index_desc)(eT1* A, - UWORD* A_index, - eT1* tmp_mem, - UWORD* tmp_mem_index, - const UWORD n_elem) - { - UWORD* aux_mem = (UWORD*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - - const UWORD num_threads = blockDim.x; - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - // Fill A_index with [0, 1, ..., n_elem - 1]. - UWORD i = start_elem; - while (i + 1 < end_elem) - { - A_index[i] = i; - A_index[i + 1] = i + 1; - i += 2; - } - if (i < end_elem) - { - A_index[i] = i; - } - - __syncthreads(); - - UWORD local_counts[2]; - - eT1* unsorted_memptr = A; - UWORD* unsorted_index_memptr = A_index; - eT1* sorted_memptr = tmp_mem; - UWORD* sorted_index_memptr = tmp_mem_index; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = coot_is_signed(TO_ET1(0)) ? 8 * sizeof(eT1) - 1 : 8 * sizeof(eT1); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. 
- uint_eT1* memptr = reinterpret_cast(unsorted_memptr); - - local_counts[0] = 0; // holds the count of elements with bit value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> b]; - ++local_counts[(memptr[i + 1] & mask) >> b]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> b]; - } - - // Step 2: aggregate the counts for all threads. - // Since we want the largest values to come first, the first entry will be the count of 1s. - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - __syncthreads(); - - // At the end of this, `tid` should be assigned two sections of memory to put its 0 and 1 points in. - // aux_mem[tid] should hold the first place to put a 1 point - // aux_mem[tid + num_threads] should hold the first place to put a 0 point - // More specifically: - // aux_mem[tid] := sum_{i = 0}^{tid - 1} aux_mem[i] - // aux_mem[tid + num_threads] := sum_{i = 0}^{tid + num_threads - 1} aux_mem[i] - // which means that this is just a prefix-sum operation on the full length of aux_mem. - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - __syncthreads(); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Step 3: move points into the correct place. 
- local_counts[0] = aux_mem[tid + num_threads]; // contains the first place we should put a 0 point - local_counts[1] = aux_mem[tid ]; // contains the first place we should put a 1 point - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b)]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - - // Now swap pointers. - eT1* tmp = unsorted_memptr; - UWORD* tmp_index = unsorted_index_memptr; - unsorted_memptr = sorted_memptr; - unsorted_index_memptr = sorted_index_memptr; - sorted_memptr = tmp; - sorted_index_memptr = tmp_index; - - __syncthreads(); - } - - // If the type is integral, we're now done---we don't have to handle a sign bit differently. - if (!coot_is_signed(TO_ET1(0))) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 0-bit values before the 1-bit values (since we are sorting in descending order). - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. 
- uint_eT1* memptr = reinterpret_cast(unsorted_memptr); - local_counts[0] = 0; - local_counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> last_bit]; - ++local_counts[(memptr[i + 1] & mask) >> last_bit]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> last_bit]; - } - - // local_counts[0] now holds the number of positive points; local_counts[1] holds the number of negative points - // perform a prefix sum, as with the rest of the bits; positive points will come first - aux_mem[tid ] = local_counts[0]; - aux_mem[tid + num_threads] = local_counts[1]; - __syncthreads(); - - // Up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - __syncthreads(); - - // Down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Step 3: move points into the correct place. - // This is different for integral and floating point types. - if (coot_is_fp(TO_ET1(0))) - { - // Floating point implementation: - // For negative values, we already have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. 
- local_counts[0] = aux_mem[tid]; // contains the first place we should put a 0 point (we will move upwards) - local_counts[1] = n_elem - 1 - (aux_mem[num_threads + tid] - aux_mem[num_threads]); // contains the first place we should put a 1 point (we will move downwards) - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const eT1 index1 = unsorted_index_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]; - const int offset1 = (bit_val1 == 1) ? -1 : 1; - local_counts[bit_val1] += offset1; // decrements for negative values, increments for positive values - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const eT1 index2 = unsorted_index_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]; - const int offset2 = (bit_val2 == 1) ? -1 : 1; - local_counts[bit_val2] += offset2; // decrements for negative values, increments for positive values - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const eT1 index = unsorted_index_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]; - const int offset = (bit_val == 1) ? -1 : 1; - local_counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - } - else - { - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the positive values ahead of the negative values. 
- local_counts[0] = aux_mem[tid]; - local_counts[1] = aux_mem[tid + num_threads]; - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const eT1 index1 = unsorted_index_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const eT1 index2 = unsorted_index_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_multi_wg_shuffle.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_multi_wg_shuffle.cu deleted file mode 100644 index 147012f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_multi_wg_shuffle.cu +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,radix_sort_index_multi_wg_shuffle)(eT1* A, - UWORD* A_index, - eT1* out, - UWORD* out_index, - UWORD* counts, - const UWORD n_elem, - const UWORD sort_type, - const UWORD start_bit) - { - // This kernel is a placeholder and is not used by the CUDA backend. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_multi_wg_bit_count.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_multi_wg_bit_count.cu deleted file mode 100644 index 8602e1d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_multi_wg_bit_count.cu +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,radix_sort_multi_wg_bit_count)(eT1* A, - UWORD* counts, - const UWORD n_elem, - const UWORD sort_type, - const UWORD start_bit) - { - // This kernel is a placeholder and is not used by the CUDA backend. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_multi_wg_shuffle.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_multi_wg_shuffle.cu deleted file mode 100644 index 16de88b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_multi_wg_shuffle.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,radix_sort_multi_wg_shuffle)(eT1* A, - eT1* out, - UWORD* counts, - const UWORD n_elem, - const UWORD sort_type, - const UWORD start_bit) - { - // This kernel is a placeholder and is not used by the CUDA backend. 
- } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_rowwise_asc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_rowwise_asc.cu deleted file mode 100644 index 99995a3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_rowwise_asc.cu +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,radix_sort_rowwise_asc)(eT1* A, - eT1* tmp_mem, - const UWORD A_n_rows, - const UWORD A_n_cols, - const UWORD A_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < A_n_rows) - { - eT1* unsorted_rowptr = &A[row]; - eT1* sorted_rowptr = &tmp_mem[row]; - - UWORD unsorted_n_rows = A_M_n_rows; - UWORD sorted_n_rows = A_n_rows; - - UWORD counts[2]; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = coot_is_signed(TO_ET1(0)) ? (8 * sizeof(eT1) - 1) : (8 * sizeof(eT1)); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Since we are sorting bitwise, we should treat the data as unsigned integers to make bitwise operations easy. 
- uint_eT1* rowptr = reinterpret_cast(unsorted_rowptr); - - counts[0] = 0; // holds the count of points with bit value 0 - counts[1] = 0; // holds the count of points with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - for (UWORD i = 0; i < A_n_cols; ++i) - { - ++counts[(rowptr[i * unsorted_n_rows] & mask) >> b]; - } - - counts[1] = counts[0]; // now holds the offset to put the next value at - counts[0] = 0; - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD out_index = (counts[((rowptr[in_index] & mask) >> b)]++) * sorted_n_rows; - sorted_rowptr[out_index] = val; - } - - // swap pointers (unsorted is now sorted) - eT1* tmp = unsorted_rowptr; - unsorted_rowptr = sorted_rowptr; - sorted_rowptr = tmp; - - UWORD tmp2 = unsorted_n_rows; - unsorted_n_rows = sorted_n_rows; - sorted_n_rows = tmp2; - } - - // If the type is unsigned, we're now done---we don't have to handle a sign bit differently. - if (!coot_is_signed(TO_ET1(0))) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. 
- uint_eT1* rowptr = reinterpret_cast(unsorted_rowptr); - counts[0] = 0; - counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - for (UWORD i = 0; i < A_n_cols; ++i) - { - ++counts[(rowptr[i * unsorted_n_rows] & mask) >> last_bit]; - } - // counts[0] now holds the number of positive points; counts[1] holds the number of negative points - - if (coot_is_fp(TO_ET1(0))) - { - counts[0] = counts[1]; // now holds the offset to put the next positive value at - counts[1] = counts[0] - 1; // now holds the offset to put the next negative value at (we move backwards) - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD bit_val = ((rowptr[in_index] & mask) >> last_bit); - const UWORD out_index = counts[bit_val] * sorted_n_rows; - const int offset = (bit_val == 1) ? -1 : 1; - counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_rowptr[out_index] = val; - } - } - else - { - counts[0] = counts[1]; // now holds the offset to put the next positive value at - counts[1] = 0; // now holds the offset to put the next negative value at - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD bit_val = ((rowptr[in_index] & mask) >> last_bit); - const UWORD out_index = (counts[bit_val]++) * sorted_n_rows; - sorted_rowptr[out_index] = val; - } - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. 
- } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_rowwise_desc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_rowwise_desc.cu deleted file mode 100644 index 077738a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_rowwise_desc.cu +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,radix_sort_rowwise_desc)(eT1* A, - eT1* tmp_mem, - const UWORD A_n_rows, - const UWORD A_n_cols, - const UWORD A_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < A_n_rows) - { - eT1* unsorted_rowptr = &A[row]; - eT1* sorted_rowptr = &tmp_mem[row]; - - UWORD unsorted_n_rows = A_M_n_rows; - UWORD sorted_n_rows = A_n_rows; - - UWORD counts[2]; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = coot_is_signed(TO_ET1(0)) ? (8 * sizeof(eT1) - 1) : (8 * sizeof(eT1)); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Since we are sorting bitwise, we should treat the data as unsigned integers to make bitwise operations easy. 
- uint_eT1* rowptr = reinterpret_cast(unsorted_rowptr); - - counts[0] = 0; // holds the count of points with bit value 0 - counts[1] = 0; // holds the count of points with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - for (UWORD i = 0; i < A_n_cols; ++i) - { - ++counts[(rowptr[i * unsorted_n_rows] & mask) >> b]; - } - - // Since we are sorting in descending order, 1-valued points come first. - counts[0] = counts[1]; // now holds the offset to put the next value at - counts[1] = 0; - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD out_index = (counts[((rowptr[in_index] & mask) >> b)]++) * sorted_n_rows; - sorted_rowptr[out_index] = val; - } - - // swap pointers (unsorted is now sorted) - eT1* tmp = unsorted_rowptr; - unsorted_rowptr = sorted_rowptr; - sorted_rowptr = tmp; - - UWORD tmp2 = unsorted_n_rows; - unsorted_n_rows = sorted_n_rows; - sorted_n_rows = tmp2; - } - - // If the type is unsigned, we're now done---we don't have to handle a sign bit differently. - if (!coot_is_signed(TO_ET1(0))) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. 
- uint_eT1* rowptr = reinterpret_cast(unsorted_rowptr); - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - if (coot_is_fp(TO_ET1(0))) - { - counts[0] = 0; // now holds the offset to put the next positive value at - counts[1] = A_n_cols - 1; // now holds the offset to put the next negative value at (we move backwards) - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD bit_val = ((rowptr[in_index] & mask) >> last_bit); - const UWORD out_index = counts[bit_val] * sorted_n_rows; - const int offset = (bit_val == 1) ? -1 : 1; - counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_rowptr[out_index] = val; - } - } - else - { - counts[0] = 0; - counts[1] = 0; - - for (UWORD i = 0; i < A_n_cols; ++i) - { - ++counts[(rowptr[i * unsorted_n_rows] & mask) >> last_bit]; - } - // counts[0] now holds the number of positive points; counts[1] holds the number of negative points - - counts[1] = counts[0]; // now holds the offset to put the next negative value at - counts[0] = 0; // now holds the offset to put the next positive value at - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD bit_val = ((rowptr[in_index] & mask) >> last_bit); - const UWORD out_index = (counts[bit_val]++) * sorted_n_rows; - sorted_rowptr[out_index] = val; - } - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. 
- } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/regspace_desc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/regspace_desc.cu deleted file mode 100644 index 39956f9..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/regspace_desc.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ -__global__ -void -COOT_FN(PREFIX,regspace_desc)(eT1* out_mem, - const UWORD mem_incr, - const eT1 start, - const eT1 end, - const eT1 delta, - const UWORD num) - { - UWORD idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num) - { - out_mem[idx * mem_incr] = start - delta * TO_ET1(idx); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/reorder_cols.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/reorder_cols.cu deleted file mode 100644 index 83ac2a8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/reorder_cols.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,reorder_cols)(eT1* out_mem, - const eT1* in_mem, - const UWORD n_rows, - const UWORD* ordering, - const UWORD out_n_cols) - { - const UWORD out_col = blockIdx.x * blockDim.x + threadIdx.x; - if (out_col < out_n_cols) - { - const UWORD in_col = ordering[out_col]; - - eT1* out_colptr = out_mem + (out_col * n_rows); - const eT1* in_colptr = in_mem + (in_col * n_rows); - - for (UWORD i = 0; i < n_rows; ++i) - { - out_colptr[i] = in_colptr[i]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/rotate_180.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/rotate_180.cu deleted file mode 100644 index e1eca97..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/rotate_180.cu +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rotate_180)(eT1* out, - const eT1* in, - const UWORD n_rows, - const UWORD n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if(row < n_rows && col < n_cols) - { - const UWORD in_index = col * n_rows + row; - // out(i, j) = in(n_rows - i - 1, n_cols - j - 1) - // or - // out(n_rows - i - 1, n_cols - j - 1) = in(i, j) - const UWORD out_row = n_rows - row - 1; - const UWORD out_col = n_cols - col - 1; - const UWORD out_index = out_col * n_rows + out_row; - - out[out_index] = in[in_index]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_add_offset.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_add_offset.cu deleted file mode 100644 index b3883bb..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_add_offset.cu +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// This kernel adds block-specific offsets to blocks of local memory. -// Specifically, block i, which has t threads, adds offsets[i] to the range -// mem[i * (2 * t)] to mem[(i + 1) * (2 * t) - 1] (inclusive). 
-__global__ -void -COOT_FN(PREFIX,shifted_prefix_sum_add_offset)(eT1* mem, - const eT1* offsets, - const UWORD n_elem) - { - const UWORD local_tid = threadIdx.x; - const UWORD local_size = blockDim.x; // will be the same across all workgroups (by calling convention), and must be a power of 2 - const UWORD group_id = blockIdx.x; - - // This workgroup is responsible for mem[group_id * (2 * local_size)] to mem[(group_id + 1) * (2 * local_size) - 1]. - const UWORD group_offset = group_id * (2 * local_size); - const UWORD local_offset = 2 * local_tid; - const UWORD mem_offset = group_offset + local_offset; - - const eT1 offset = offsets[group_id]; - - const eT1 in_val1 = (mem_offset < n_elem) ? mem[mem_offset ] : TO_ET1(0); - const eT1 in_val2 = (mem_offset + 1 < n_elem) ? mem[mem_offset + 1] : TO_ET1(0); - - const eT1 out_val1 = in_val1 + offset; - const eT1 out_val2 = in_val2 + offset; - - // Copy results back to memory. - if (mem_offset + 1 < n_elem) - { - mem[mem_offset ] = out_val1; - mem[mem_offset + 1] = out_val2; - } - else if (mem_offset < n_elem) - { - mem[mem_offset ] = out_val1; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_small.cu deleted file mode 100644 index f3a090f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_small.cu +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// This kernel performs shifted prefix-sum on `mem` assuming that (2 * blockDim.x) <= n_elem. -// It's okay if n_elem is not a power of 2. -__global__ -void -COOT_FN(PREFIX,shifted_prefix_sum_small)(eT1* mem, - const UWORD n_elem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD local_tid = threadIdx.x; - const UWORD local_size = blockDim.x; // will be the same across all workgroups (by calling convention), and must be a power of 2 - const UWORD group_id = blockIdx.x; - - // Copy relevant memory to auxiliary memory. - const UWORD group_offset = group_id * (2 * local_size); - const UWORD mem_offset = group_offset + 2 * local_tid; - - aux_mem[mem_offset ] = (mem_offset < n_elem) ? mem[mem_offset ] : TO_ET1(0); - aux_mem[mem_offset + 1] = (mem_offset + 1 < n_elem) ? mem[mem_offset + 1] : TO_ET1(0); - - UWORD offset = 1; - for (UWORD s = local_size; s > 0; s >>= 1) - { - __syncthreads(); - if (local_tid < s) - { - const UWORD ai = group_offset + offset * (2 * local_tid + 1) - 1; - const UWORD bi = group_offset + offset * (2 * local_tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - } - - // Prepare for down-sweep by setting the last element to 0. - if (local_tid == 0) - { - aux_mem[2 * local_size - 1] = 0; - } - __syncthreads(); - - for (UWORD s = 1; s <= local_size; s *= 2) - { - offset >>= 1; - if (local_tid < s) - { - const UWORD ai = group_offset + offset * (2 * local_tid + 1) - 1; - const UWORD bi = group_offset + offset * (2 * local_tid + 2) - 1; - eT1 tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Copy results back to memory. 
- if (mem_offset + 1 < n_elem) - { - mem[mem_offset ] = aux_mem[mem_offset ]; - mem[mem_offset + 1] = aux_mem[mem_offset + 1]; - } - else if (mem_offset < n_elem) - { - mem[mem_offset ] = aux_mem[mem_offset ]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_subgroups.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_subgroups.cu deleted file mode 100644 index 1a7fed4..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_subgroups.cu +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// This kernel performs the shifted prefix-sum on each individual block. -// This is the same as just running a regular prefix-sum kernel, except that -// `out_mem[i]` will store the total sum of elements in block `i`. -// After running this, to finish prefix-sum on the entire memory, offsets for -// each workgroup need to be added. 
-__global__ -void -COOT_FN(PREFIX,shifted_prefix_sum_subgroups)(eT1* mem, - eT1* out_mem, - const UWORD n_elem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD local_tid = threadIdx.x; - const UWORD local_size = blockDim.x; // will be the same across all workgroups (by calling convention), and must be a power of 2 - const UWORD group_id = blockIdx.x; - - // Copy relevant memory to auxiliary memory. - // This workgroup is responsible for mem[group_id * (2 * local_size)] to mem[(group_id + 1) * (2 * local_size) - 1]. - const UWORD group_offset = group_id * (2 * local_size); - const UWORD local_offset = 2 * local_tid; - const UWORD mem_offset = group_offset + local_offset; - - aux_mem[local_offset ] = (mem_offset < n_elem) ? mem[mem_offset ] : TO_ET1(0); - aux_mem[local_offset + 1] = (mem_offset + 1 < n_elem) ? mem[mem_offset + 1] : TO_ET1(0); - - UWORD offset = 1; - for (UWORD s = local_size; s > 0; s >>= 1) - { - if (local_tid < s) - { - const UWORD ai = offset * (local_offset + 1) - 1; - const UWORD bi = offset * (local_offset + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (mem_offset + 1 < n_elem) - { - mem[mem_offset ] = aux_mem[local_offset ]; - mem[mem_offset + 1] = aux_mem[local_offset + 1]; - } - else if (mem_offset < n_elem) - { - mem[mem_offset ] = aux_mem[local_offset ]; - } - - if (local_tid == 0) - { - // Write the sum of the subarray to the output memory. - out_mem[group_id] = aux_mem[2 * local_size - 1]; - // Prepare for the downsweep. - aux_mem[2 * local_size - 1] = 0; - } - __syncthreads(); - - offset = local_size; - for (UWORD s = 1; s <= local_size; s *= 2) - { - if (local_tid < s) - { - const UWORD ai = offset * (local_offset + 1) - 1; - const UWORD bi = offset * (local_offset + 2) - 1; - eT1 tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - offset >>= 1; - __syncthreads(); - } - - // Copy results back to memory. 
- // The results here are the prefix-summed results for each individual - // workgroup. - if (mem_offset + 1 < n_elem) - { - mem[mem_offset ] = aux_mem[local_offset ]; - mem[mem_offset + 1] = aux_mem[local_offset + 1]; - } - else if (mem_offset < n_elem) - { - mem[mem_offset ] = aux_mem[local_offset ]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shuffle.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shuffle.cu deleted file mode 100644 index f116a93..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shuffle.cu +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright 2021 Marcus Edel (http://www.kurg.org/) -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,shuffle)(eT1* out, - const UWORD out_incr, /* how many eT1s to advance to get to the start of the next element to shuffle */ - const UWORD out_elem_stride, /* how many eT1s between each eT1 in each element */ - const eT1* in, - const UWORD in_incr, - const UWORD in_elem_stride, - const UWORD n_elem, - const UWORD elems_per_elem, /* how many eT1s in each element to shuffle */ - const UWORD n_elem_pow2, - const UWORD* philox_key, - const UWORD num_bits) - { - UWORD* aux_mem = (UWORD*) aux_shared_mem; - - const UWORD tid = threadIdx.x + blockIdx.x * blockDim.x; - - // Get our bijective shuffle location. 
- const UWORD in_loc = var_philox(tid, philox_key, num_bits); - - // Fill aux_mem with the indicator of whether we are out of bounds. - // Then, we'll prefix-sum it. This will tell us where to put our result. - aux_mem[tid] = (in_loc < n_elem); - __syncthreads(); - - // Now, prefix-sum the auxiliary memory. - // This allows us to do the shuffle-compaction step. - UWORD offset = 1; - for (UWORD s = n_elem_pow2 / 2; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[n_elem_pow2 - 1] = 0; - } - __syncthreads(); - - for (UWORD s = 1; s <= n_elem_pow2 / 2; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // With the prefix sum complete, we shuffle our result into position aux_mem[tid], but only if we are a thread with a "valid" output. - if (in_loc < n_elem) - { - const UWORD in_addr_offset = in_loc * in_incr; - const UWORD out_addr_offset = aux_mem[tid] * out_incr; - - for (UWORD i = 0; i < elems_per_elem; ++i) - { - out[out_addr_offset + (i * out_elem_stride)] = in[in_addr_offset + (i * in_elem_stride)]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shuffle_large.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shuffle_large.cu deleted file mode 100644 index 3a9cd30..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shuffle_large.cu +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright 2021 Marcus Edel (http://www.kurg.org/) -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,shuffle_large)(eT1* out, - const UWORD out_incr, /* how many eT1s to advance to get to the start of the next element to shuffle */ - const UWORD out_elem_stride, /* how many eT1s between each eT1 in each element */ - const eT1* in, - const UWORD in_incr, - const UWORD in_elem_stride, - const UWORD* block_offsets, - const UWORD n_elem, - const UWORD elems_per_elem, /* how many eT1s in each element to shuffle */ - const UWORD n_elem_pow2, - const UWORD* philox_key, - const UWORD num_bits) - { - UWORD* aux_mem = (UWORD*) aux_shared_mem; - - const UWORD tid = threadIdx.x + blockIdx.x * blockDim.x; - const UWORD local_tid = threadIdx.x; - const UWORD local_size = blockDim.x; - - // Recompute our bijective shuffle location. - const UWORD in_loc = var_philox(tid, philox_key, num_bits); - - // We actually have to perform the up-sweep a second time, since we did not save the memory the first time. - aux_mem[local_tid] = (in_loc < n_elem); - __syncthreads(); - - // Now, prefix-sum the auxiliary memory for this block. - // This allows us to do the shuffle-compaction step. 
- UWORD offset = 1; - for (UWORD s = local_size / 2; s > 0; s >>= 1) - { - if (local_tid < s) - { - const UWORD ai = offset * (2 * local_tid + 1) - 1; - const UWORD bi = offset * (2 * local_tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (local_tid == 0) - { - aux_mem[local_size - 1] = 0; - } - __syncthreads(); - - for (UWORD s = 1; s <= local_size / 2; s *= 2) - { - offset >>= 1; - if (local_tid < s) - { - const UWORD ai = offset * (2 * local_tid + 1) - 1; - const UWORD bi = offset * (2 * local_tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // With the prefix sum complete, we shuffle our result into position aux_mem[tid], but only if we are a thread with a "valid" output. - if (in_loc < n_elem) - { - const UWORD in_addr_offset = in_loc * in_incr; - const UWORD out_addr_offset = (aux_mem[local_tid] + block_offsets[blockIdx.x]) * out_incr; - - for (UWORD i = 0; i < elems_per_elem; ++i) - { - out[out_addr_offset + (i * out_elem_stride)] = in[in_addr_offset + (i * in_elem_stride)]; - } - } - } - diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/stable_radix_sort_index_asc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/stable_radix_sort_index_asc.cu deleted file mode 100644 index d5a8e16..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/stable_radix_sort_index_asc.cu +++ /dev/null @@ -1,262 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,stable_radix_sort_index_asc)(eT1* A, - UWORD* A_index, - eT1* tmp_mem, - UWORD* tmp_mem_index, - const UWORD n_elem) - { - // The stable sort differs from the rest of our radix sorts in that we must avoid ever "reversing" point orders. - // We do this by adapting the regular radix sort to also consider the highest bit (the sign bit for signed types). - // This alleviates the need to ever unpack points in a reverse order, and so the sort is stable. - - UWORD* aux_mem = (UWORD*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - - const UWORD num_threads = blockDim.x; - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - // Fill tmp_mem_index with [0, 1, ..., n_elem - 1]. - UWORD i = start_elem; - while (i + 1 < end_elem) - { - tmp_mem_index[i] = i; - tmp_mem_index[i + 1] = i + 1; - i += 2; - } - if (i < end_elem) - { - tmp_mem_index[i] = i; - } - - __syncthreads(); - - // This is 4 instead of two because we need to account for the sign bit. - UWORD local_counts[4]; - - // We are doing an odd number of iterations, so set things up such that A_index will be holding the final results. - eT1* unsorted_memptr = A; - UWORD* unsorted_index_memptr = tmp_mem_index; - eT1* sorted_memptr = tmp_mem; - UWORD* sorted_index_memptr = A_index; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 sign_mask = (((uint_eT1) 1) << last_bit); - - for (UWORD b = 0; b < 8 * sizeof(eT1) - 1; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. 
- uint_eT1* memptr = reinterpret_cast(unsorted_memptr); - - local_counts[0] = 0; // holds the count of elements with sign 0 and bit value 0 - local_counts[1] = 0; // holds the count of elements with sign 0 and bit value 1 - local_counts[2] = 0; // holds the count of elements with sign 1 and bit value 0 - local_counts[3] = 0; // holds the count of elements with sign 1 and bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[((memptr[i ] & mask) >> b) + ((memptr[i ] & sign_mask) >> (last_bit - 1))]; - ++local_counts[((memptr[i + 1] & mask) >> b) + ((memptr[i + 1] & sign_mask) >> (last_bit - 1))]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]; - } - - // Step 2: aggregate the counts for all threads. - // There are a couple cases here to get things in an ascending order: - // * Floating point number: [11, 10, 00, 01] - // * Unsigned integer: [00, 01, 10, 11] - // * Signed integer: [10, 11, 00, 01] - // Note that the notation "11" indicates, e.g., a point whose sign is 1 and bit value in bit b is 1. - // For unsigned integers, we treat the top bit as a "sign" bit even though it's not---but we choose an ordering that's still correct. 
- - if (!coot_is_signed(TO_ET1(0))) - { - // Unsigned integer (00, 01, 10, 11) - aux_mem[tid ] = local_counts[0]; - aux_mem[tid + num_threads] = local_counts[1]; - aux_mem[tid + 2 * num_threads] = local_counts[2]; - aux_mem[tid + 3 * num_threads] = local_counts[3]; - } - else if (coot_is_fp(TO_ET1(0))) - { - // Floating point (11, 10, 00, 01) - aux_mem[tid ] = local_counts[3]; - aux_mem[tid + num_threads] = local_counts[2]; - aux_mem[tid + 2 * num_threads] = local_counts[0]; - aux_mem[tid + 3 * num_threads] = local_counts[1]; - } - else - { - // Signed integer (10, 11, 00, 01) - aux_mem[tid ] = local_counts[2]; - aux_mem[tid + num_threads] = local_counts[3]; - aux_mem[tid + 2 * num_threads] = local_counts[0]; - aux_mem[tid + 3 * num_threads] = local_counts[1]; - } - __syncthreads(); - - // Now, we must assign four sections of memory for `tid` to put its points in. - // We do this by a prefix-sum operation across all threads. - // At the end of this operation (at the beginning of Step 3): - // - // local_counts[0] indicates the first place to put a sign-0 bit-value-0 point - // local_counts[1] indicates the first place to put a sign-0 bit-value-1 point - // local_counts[2] indicates the first place to put a sign-1 bit-value-0 point - // local_counts[3] indicates the first place to put a sign-1 bit-value-1 point - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - - // Since we have auxiliary memory size of 4x the number of threads, we need to add an extra iteration where each thread handles two values. 
- const UWORD ai1 = offset * (2 * tid + 1) - 1; - const UWORD bi1 = offset * (2 * tid + 2) - 1; - aux_mem[bi1] += aux_mem[ai1]; - const UWORD ai2 = offset * (2 * (tid + num_threads) + 1) - 1; - const UWORD bi2 = offset * (2 * (tid + num_threads) + 2) - 1; - aux_mem[bi2] += aux_mem[ai2]; - offset *= 2; - __syncthreads(); - - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[4 * num_threads - 1] = 0; - } - __syncthreads(); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Since we have auxiliary memory size of 4x the number of threads, we need to add an extra iteration where each thread handles two values. - offset >>= 1; - const UWORD ai3 = offset * (2 * tid + 1) - 1; - const UWORD bi3 = offset * (2 * tid + 2) - 1; - UWORD tmp3 = aux_mem[ai3]; - aux_mem[ai3] = aux_mem[bi3]; - aux_mem[bi3] += tmp3; - - const UWORD ai4 = offset * (2 * (tid + num_threads) + 1) - 1; - const UWORD bi4 = offset * (2 * (tid + num_threads) + 2) - 1; - UWORD tmp4 = aux_mem[ai4]; - aux_mem[ai4] = aux_mem[bi4]; - aux_mem[bi4] += tmp4; - __syncthreads(); - - // Step 3: move points into the correct place. 
- // There are a couple cases here to get things in an ascending order: - // * Floating point number: [11, 10, 00, 01] - // * Unsigned integer: [00, 01, 10, 11] - // * Signed integer: [10, 11, 00, 01] - - if (!coot_is_signed(TO_ET1(0))) - { - // Unsigned integer (00, 01, 10, 11) - local_counts[0] = aux_mem[tid ]; - local_counts[1] = aux_mem[tid + num_threads]; - local_counts[2] = aux_mem[tid + 2 * num_threads]; - local_counts[3] = aux_mem[tid + 3 * num_threads]; - } - else if (coot_is_fp(TO_ET1(0))) - { - // Floating point (11, 10, 00, 01) - local_counts[0] = aux_mem[tid + 2 * num_threads]; - local_counts[1] = aux_mem[tid + 3 * num_threads]; - local_counts[2] = aux_mem[tid + num_threads]; - local_counts[3] = aux_mem[tid ]; - } - else - { - // Signed integer (10, 11, 00, 01) - local_counts[0] = aux_mem[tid + 2 * num_threads]; - local_counts[1] = aux_mem[tid + 3 * num_threads]; - local_counts[2] = aux_mem[tid ]; - local_counts[3] = aux_mem[tid + num_threads]; - } - - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b) + ((memptr[i + 1] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - - // Now swap pointers. 
- eT1* tmp = unsorted_memptr; - unsorted_memptr = sorted_memptr; - sorted_memptr = tmp; - UWORD* tmp_index = unsorted_index_memptr; - unsorted_index_memptr = sorted_index_memptr; - sorted_index_memptr = tmp_index; - - __syncthreads(); - } - - // Since we did an odd number of iterations, the result is stored in A_index. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/stable_radix_sort_index_desc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/stable_radix_sort_index_desc.cu deleted file mode 100644 index fa7a134..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/stable_radix_sort_index_desc.cu +++ /dev/null @@ -1,256 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,stable_radix_sort_index_desc)(eT1* A, - UWORD* A_index, - eT1* tmp_mem, - UWORD* tmp_mem_index, - const UWORD n_elem) - { - // The stable sort differs from the rest of our radix sorts in that we must avoid ever "reversing" point orders. - // We do this by adapting the regular radix sort to also consider the highest bit (the sign bit for signed types). - // This alleviates the need to ever unpack points in a reverse order, and so the sort is stable. 
- - UWORD* aux_mem = (UWORD*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - - const UWORD num_threads = blockDim.x; - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - // Fill tmp_mem_index with [0, 1, ..., n_elem - 1]. - UWORD i = start_elem; - while (i + 1 < end_elem) - { - tmp_mem_index[i] = i; - tmp_mem_index[i + 1] = i + 1; - i += 2; - } - if (i < end_elem) - { - tmp_mem_index[i] = i; - } - - __syncthreads(); - - UWORD local_counts[4]; - - // We are doing an odd number of iterations, so set things up such that A_index will be holding the final results. - eT1* unsorted_memptr = A; - UWORD* unsorted_index_memptr = tmp_mem_index; - eT1* sorted_memptr = tmp_mem; - UWORD* sorted_index_memptr = A_index; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 sign_mask = (((uint_eT1) 1) << last_bit); - - for (UWORD b = 0; b < 8 * sizeof(eT1) - 1; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. 
- uint_eT1* memptr = reinterpret_cast(unsorted_memptr); - - local_counts[0] = 0; // holds the count of elements with bit value 0 and sign value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 and sign value 0 - local_counts[2] = 0; // holds the count of elements with bit value 0 and sign value 1 - local_counts[3] = 0; // holds the count of elements with bit value 1 and sign value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[((memptr[i ] & mask) >> b) + ((memptr[i ] & sign_mask) >> (last_bit - 1))]; - ++local_counts[((memptr[i + 1] & mask) >> b) + ((memptr[i + 1] & sign_mask) >> (last_bit - 1))]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]; - } - - // Step 2: aggregate the counts for all threads. - // There are a couple cases here to get things in a descending order: - // * Floating point number: [01, 00, 10, 11] - // * Unsigned integer: [11, 10, 01, 00] - // * Signed integer: [01, 00, 11, 10] - // Note that the notation "11" indicates, e.g., a point whose sign is 1 and bit value in bit b is 1. - // For unsigned integers, we treat the top bit as a "sign" bit even though it's not---but we choose an ordering that's still correct. 
- - if (!coot_is_signed(TO_ET1(0))) - { - // Unsigned integer (11, 10, 01, 00) - aux_mem[tid ] = local_counts[3]; - aux_mem[tid + num_threads] = local_counts[2]; - aux_mem[tid + 2 * num_threads] = local_counts[1]; - aux_mem[tid + 3 * num_threads] = local_counts[0]; - } - else if (coot_is_fp(TO_ET1(0))) - { - // Floating point (01, 00, 10, 11) - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - aux_mem[tid + 2 * num_threads] = local_counts[2]; - aux_mem[tid + 3 * num_threads] = local_counts[3]; - } - else - { - // Signed integer (01, 00, 11, 10) - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - aux_mem[tid + 2 * num_threads] = local_counts[3]; - aux_mem[tid + 3 * num_threads] = local_counts[2]; - } - __syncthreads(); - - // Now, we must assign four sections of memory for `tid` to put its points in. - // We do this by a prefix-sum operation across all threads. - // At the end of this operation (at the beginning of Step 3): - // - // local_counts[0] indicates the first place to put a sign-0 bit-value-0 point - // local_counts[1] indicates the first place to put a sign-0 bit-value-1 point - // local_counts[2] indicates the first place to put a sign-1 bit-value-0 point - // local_counts[3] indicates the first place to put a sign-1 bit-value-1 point - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - - // Since we have auxiliary memory size of 4x the number of threads, we need to add an extra iteration where each thread handles two values. 
- const UWORD ai1 = offset * (2 * tid + 1) - 1; - const UWORD bi1 = offset * (2 * tid + 2) - 1; - aux_mem[bi1] += aux_mem[ai1]; - const UWORD ai2 = offset * (2 * (tid + num_threads) + 1) - 1; - const UWORD bi2 = offset * (2 * (tid + num_threads) + 2) - 1; - aux_mem[bi2] += aux_mem[ai2]; - offset *= 2; - __syncthreads(); - - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[4 * num_threads - 1] = 0; - } - __syncthreads(); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Since we have auxiliary memory size of 4x the number of threads, we need to add an extra iteration where each thread handles two values. - offset >>= 1; - const UWORD ai3 = offset * (2 * tid + 1) - 1; - const UWORD bi3 = offset * (2 * tid + 2) - 1; - UWORD tmp3 = aux_mem[ai3]; - aux_mem[ai3] = aux_mem[bi3]; - aux_mem[bi3] += tmp3; - - const UWORD ai4 = offset * (2 * (tid + num_threads) + 1) - 1; - const UWORD bi4 = offset * (2 * (tid + num_threads) + 2) - 1; - UWORD tmp4 = aux_mem[ai4]; - aux_mem[ai4] = aux_mem[bi4]; - aux_mem[bi4] += tmp4; - __syncthreads(); - - // Step 3: move points into the correct place. 
- if (!coot_is_signed(TO_ET1(0))) - { - // Unsigned integer (11, 10, 01, 00) - local_counts[0] = aux_mem[tid + 3 * num_threads]; - local_counts[1] = aux_mem[tid + 2 * num_threads]; - local_counts[2] = aux_mem[tid + num_threads]; - local_counts[3] = aux_mem[tid ]; - } - else if (coot_is_fp(TO_ET1(0))) - { - // Floating point (01, 00, 10, 11) - local_counts[0] = aux_mem[tid + num_threads]; - local_counts[1] = aux_mem[tid ]; - local_counts[2] = aux_mem[tid + 2 * num_threads]; - local_counts[3] = aux_mem[tid + 3 * num_threads]; - } - else - { - // Signed integer (01, 00, 11, 10) - local_counts[0] = aux_mem[tid + num_threads]; - local_counts[1] = aux_mem[tid ]; - local_counts[2] = aux_mem[tid + 3 * num_threads]; - local_counts[3] = aux_mem[tid + 2 * num_threads]; - } - - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b) + ((memptr[i + 1] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - - // Now swap pointers. 
- eT1* tmp = unsorted_memptr; - UWORD* tmp_index = unsorted_index_memptr; - unsorted_memptr = sorted_memptr; - unsorted_index_memptr = sorted_index_memptr; - sorted_memptr = tmp; - sorted_index_memptr = tmp_index; - - __syncthreads(); - } - - // Since we did an odd number of iterations, the result is stored in A_index. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/submat_var.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/submat_var.cu deleted file mode 100644 index 7414178..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/submat_var.cu +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,submat_var)(const eT1* in_mem, - const UWORD n_elem, // number of elements in subview - eT1* out_mem, - const eT1 mean_val, - const UWORD in_n_rows, - const UWORD start_row, - const UWORD start_col, - const UWORD sub_n_rows, - const UWORD sub_n_cols) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const UWORD col1 = (i ) / sub_n_rows; - const UWORD col2 = (i + blockDim.x) / sub_n_rows; - const UWORD row1 = (i ) % sub_n_rows; - const UWORD row2 = (i + blockDim.x) % sub_n_rows; - const UWORD index1 = (col1 + start_col) * in_n_rows + (row1 + start_row); - const UWORD index2 = (col2 + start_col) * in_n_rows + (row2 + start_row); - - const eT1 val1 = (in_mem[index1] - mean_val); - const eT1 val2 = (in_mem[index2] - mean_val); - aux_mem[tid] += (val1 * val1) + (val2 * val2); - i += grid_size; - } - if (i < n_elem) - { - const UWORD col = i / sub_n_rows; - const UWORD row = i % sub_n_rows; - const UWORD index = (col + start_col) * in_n_rows + (row + start_row); - - const eT1 val = (in_mem[index] - mean_val); - aux_mem[tid] += (val * val); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,accu_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/submat_var_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/submat_var_small.cu deleted file mode 100644 index 0503006..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/submat_var_small.cu +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2023 
Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,submat_var_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem, - const eT1 mean_val, - const UWORD in_n_rows, - const UWORD start_row, - const UWORD start_col, - const UWORD sub_n_rows, - const UWORD sub_n_cols) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const UWORD col1 = (i ) / sub_n_rows; - const UWORD col2 = (i + blockDim.x) / sub_n_rows; - const UWORD row1 = (i ) % sub_n_rows; - const UWORD row2 = (i + blockDim.x) % sub_n_rows; - const UWORD index1 = (col1 + start_col) * in_n_rows + (row1 + start_row); - const UWORD index2 = (col2 + start_col) * in_n_rows + (row2 + start_row); - - const eT1 val1 = (in_mem[index1] - mean_val); - const eT1 val2 = (in_mem[index2] - mean_val); - aux_mem[tid] += (val1 * val1) + (val2 * val2); - i += grid_size; - } - if (i < n_elem) - { - const UWORD col = i / sub_n_rows; - const UWORD row = i % sub_n_rows; - const UWORD index = (col + start_col) * in_n_rows + (row + start_row); - - const eT1 val = (in_mem[index] - mean_val); - aux_mem[tid] += (val * val); - } - - for (UWORD s = 
blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/symmatl_inplace.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/symmatl_inplace.cu deleted file mode 100644 index f4907eb..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/symmatl_inplace.cu +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,symmatl_inplace)(eT1* out, - const UWORD size) // matrix is expected to be square - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < size && col < size && row > col) - { - const eT1 val = out[row + size * col]; - - // only need to copy to the upper triangle for the in-place version - out[col + size * row] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/symmatu_inplace.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/symmatu_inplace.cu deleted file mode 100644 index e16d8ca..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/symmatu_inplace.cu +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,symmatu_inplace)(eT1* out, - const UWORD size) // matrix is expected to be square - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < size && col < size && col > row) - { - const eT1 val = out[row + size * col]; - - // only need to copy to the lower triangle for the in-place version - out[col + size * row] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/trace.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/trace.cu deleted file mode 100644 index 8dc31a7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/trace.cu +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,trace)(eT1* out, - const eT1* A, - const UWORD n_rows, - const UWORD N) - { - const UWORD id = blockIdx.x * blockDim.x + threadIdx.x; - if(id == 0) - { - eT1 acc = TO_ET1(0); - // runtime unrolling is not supported by CUDA - for(UWORD i=0; i 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,accu_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_colwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_colwise.cu deleted file mode 100644 index aad8c98..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_colwise.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,var_colwise)(eT1* dest, - const eT1* src, - const eT1* src_means, - const UWORD n_rows, - const UWORD n_cols, - const UWORD norm_correction, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows, - const UWORD src_means_mem_incr) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[col * src_M_n_rows]); - const eT1 mean_val = src_means[col * src_means_mem_incr]; - eT1 acc = TO_ET1(0); - for (UWORD i = 0; i < n_rows; ++i) - { - eT1 val = (colptr[i] - mean_val); - acc += (val * val); - } - - dest[col * dest_mem_incr] = (acc / TO_ET1(n_rows - norm_correction)); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_rowwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_rowwise.cu deleted file mode 100644 index 5c0f08a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_rowwise.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,var_rowwise)(eT1* dest, - const eT1* src, - const eT1* src_means, - const UWORD n_rows, - const UWORD n_cols, - const UWORD norm_correction, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows, - const UWORD src_means_mem_incr) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT1 acc = TO_ET1(0); - const eT1 mean_val = src_means[row]; - for (UWORD i = 0; i < n_cols; ++i) - { - const eT1 val = (src[i * src_M_n_rows + row] - mean_val); - acc += (val * val); - } - - dest[row * dest_mem_incr] = (acc / TO_ET1(n_cols - norm_correction)); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_small.cu deleted file mode 100644 index 790a779..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_small.cu +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,var_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem, - const eT1 mean_val) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 val1 = (in_mem[i] - mean_val); - const eT1 val2 = (in_mem[i + blockDim.x] - mean_val); - aux_mem[tid] += (val1 * val1) + (val2 * val2); - i += grid_size; - } - if (i < n_elem) - { - const eT1 val = (in_mem[i] - mean_val); - aux_mem[tid] += (val * val); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/and_reduce.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/and_reduce.cu deleted file mode 100644 index 3e9a140..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/and_reduce.cu +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__device__ -void -COOT_FN(PREFIX,and_subgroup_reduce)(volatile eT1* data, int tid) - { - data[tid] &= data[tid + 32]; - data[tid] &= data[tid + 16]; - data[tid] &= data[tid + 8]; - data[tid] &= data[tid + 4]; - data[tid] &= data[tid + 2]; - data[tid] &= data[tid + 1]; - } - - - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,and_reduce)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - - aux_mem[tid] = ~((eT1) 0); // all bits to 1 - - while (i + blockDim.x < n_elem) - { - aux_mem[tid] &= in_mem[i]; - aux_mem[tid] &= in_mem[i + blockDim.x]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] &= in_mem[i]; - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,and_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/and_reduce_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/and_reduce_small.cu deleted file mode 100644 index 7c5b1da..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/and_reduce_small.cu +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2021 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,and_reduce_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = ~((eT1) 0); // all bits to 1 - - while (i + blockDim.x < n_elem) - { - aux_mem[tid] &= in_mem[i]; - aux_mem[tid] &= in_mem[i + blockDim.x]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] &= in_mem[i]; - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/ipiv_det.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/ipiv_det.cu deleted file mode 100644 index fb5f7da..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/ipiv_det.cu +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Forward declaration of one-way kernel that we need. -__device__ void COOT_FN(PREFIX,prod_subgroup_reduce)(volatile eT1* data, int tid); - -// this kernel is technically incorrect if the size is not a factor of 2! -// Compute the determinant of a permutation matrix as given by getrf(). -__global__ -void -COOT_FN(PREFIX,ipiv_det)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - - aux_mem[tid] = (eT1) 1; - - while (i + blockDim.x < n_elem) - { - const eT1 val1 = ((in_mem[i] - 1) == i) ? 1 : -1; - aux_mem[tid] *= val1; - const eT1 val2 = ((in_mem[i + blockDim.x] - 1) == (i + blockDim.x)) ? 1 : -1; - aux_mem[tid] *= val2; - i += grid_size; - } - if (i < n_elem) - { - const eT1 val = ((in_mem[i] - 1) == i) ? 
1 : -1; - aux_mem[tid] *= val; - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] *= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,prod_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/ipiv_det_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/ipiv_det_small.cu deleted file mode 100644 index d9d3c1f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/ipiv_det_small.cu +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Compute the determinant of a permutation matrix as given by getrf(). -__global__ -void -COOT_FN(PREFIX,ipiv_det_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = (eT1) 1; - - while (i + blockDim.x < n_elem) - { - const eT1 val1 = ((in_mem[i] - 1) == i) ? 
1 : -1; - aux_mem[tid] *= val1; - const eT1 val2 = ((in_mem[i + blockDim.x] - 1) == (i + blockDim.x)) ? 1 : -1; - aux_mem[tid] *= val2; - i += grid_size; - } - if (i < n_elem) - { - const eT1 val = ((in_mem[i] - 1) == i) ? 1 : -1; - aux_mem[tid] *= val; - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] *= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/or_reduce.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/or_reduce.cu deleted file mode 100644 index ff63de1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/or_reduce.cu +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__device__ -void -COOT_FN(PREFIX,or_subgroup_reduce)(volatile eT1* data, int tid) - { - data[tid] |= data[tid + 32]; - data[tid] |= data[tid + 16]; - data[tid] |= data[tid + 8]; - data[tid] |= data[tid + 4]; - data[tid] |= data[tid + 2]; - data[tid] |= data[tid + 1]; - } - - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,or_reduce)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - - aux_mem[tid] = (eT1) 0; - - while (i + blockDim.x < n_elem) - { - aux_mem[tid] |= in_mem[i]; - aux_mem[tid] |= in_mem[i + blockDim.x]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] |= in_mem[i]; - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,or_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/or_reduce_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/or_reduce_small.cu deleted file mode 100644 index 9a8290c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/or_reduce_small.cu +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,or_reduce_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = (eT1) 0; - - while (i + blockDim.x < n_elem) - { - aux_mem[tid] |= in_mem[i]; - aux_mem[tid] |= in_mem[i + blockDim.x]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] |= in_mem[i]; - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/diag_prod.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/diag_prod.cu deleted file mode 100644 index dd0cac4..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/diag_prod.cu +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Forward declaration of one-way kernel that we need. 
-__device__ void COOT_FN(PREFIX,prod_subgroup_reduce)(volatile eT1* data, int tid); - -// this kernel is technically incorrect if the size is not a factor of 2! -// Compute the product of the elements on the diagonal of a matrix. -__global__ -void -COOT_FN(PREFIX,diag_prod)(const eT1* in_mem, - const UWORD n_rows, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 1; - - while (i + blockDim.x < n_rows) - { - // copy to local shared memory - const UWORD index1 = i * n_rows + i; - const eT1 v1 = in_mem[index1]; - const UWORD index2 = (i + blockDim.x) * n_rows + (i + blockDim.x); - const eT1 v2 = in_mem[index2]; - aux_mem[tid] *= v1 * v2; - i += grid_size; - } - if (i < n_rows) - { - const UWORD index = i * n_rows + i; - const eT1 v = in_mem[index]; - aux_mem[tid] *= v; - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] *= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - // Since we are just accumulating, we can use the accu_subgroup_reduce utility function. - COOT_FN(PREFIX,prod_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/diag_prod_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/diag_prod_small.cu deleted file mode 100644 index 448de93..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/diag_prod_small.cu +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// compute the product of the diagonal of a matrix -__global__ -void -COOT_FN(PREFIX,diag_prod_small)(const eT1* in_mem, - const UWORD n_rows, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 1; - - while (i + blockDim.x < n_rows) - { - // copy to local shared memory - const UWORD index1 = i * n_rows + i; - const eT1 v1 = in_mem[index1]; - const UWORD index2 = (i + blockDim.x) * n_rows + (i + blockDim.x); - const eT1 v2 = in_mem[index2]; - aux_mem[tid] *= v1 * v2; - i += grid_size; - } - if (i < n_rows) - { - const UWORD index = i * n_rows + i; - const eT1 v = in_mem[index]; - aux_mem[tid] *= v; - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] *= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/extract_cx.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/extract_cx.cu deleted file mode 100644 index 3957225..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/extract_cx.cu +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Extract real or imaginary elements from a complex matrix into a real matrix. -// This kernel is a bit of a hack until we have actual complex matrix support! -__global__ -void -COOT_FN(PREFIX,extract_cx)(const eT1* in_mem, - eT1* out_mem, - const UWORD real_or_imag, - const UWORD n_rows, - const UWORD n_cols, - const UWORD in_M_n_rows, - const UWORD out_M_n_rows) - { - // If real_or_imag is 0, we extract the real part. If 1, we extract the - // imaginary part. - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD in_index = 2 * (col * in_M_n_rows + row) + real_or_imag; - const UWORD out_index = col * out_M_n_rows + row; - - if (col < n_cols && row < n_rows) - { - out_mem[out_index] = in_mem[in_index]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_l.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_l.cu deleted file mode 100644 index cebe438..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_l.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This extracts L from U, and sets the lower diagonal of U to 0. -__global__ -void -COOT_FN(PREFIX,lu_extract_l)(eT1* L, - eT1* U, - const eT1* in, - const UWORD n_rows, - const UWORD n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - // Note that neither U nor L must be square. - // L has size n_rows x min(n_rows, n_cols). - // U has size min(n_rows, n_cols) x n_cols. - const UWORD min_rows_cols = min(n_rows, n_cols); - - const UWORD in_index = row + n_rows * col; // this is also L_out_index - const UWORD U_out_index = row + min_rows_cols * col; - - if ((row < n_rows) && (col < min_rows_cols)) - { - L[in_index] = (row > col) ? in[in_index] : ((row == col) ? 1 : 0); - } - - if ((row < min_rows_cols) && (col < n_cols)) - { - U[U_out_index] = (row > col) ? 0 : in[in_index]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_p.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_p.cu deleted file mode 100644 index 090462a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_p.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,lu_extract_p)(eT1* P, - const UWORD* ipiv2, - const UWORD n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - - if (row < n_rows) - { - const UWORD index = row + ipiv2[row] * n_rows; - P[index] = (UWORD) 1; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_pivoted_l.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_pivoted_l.cu deleted file mode 100644 index 5e50481..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_pivoted_l.cu +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This extracts L from U, and sets the lower diagonal of U to 0. 
-__global__ -void -COOT_FN(PREFIX,lu_extract_pivoted_l)(eT1* L, - eT1* U, - const eT1* in, - const UWORD n_rows, - const UWORD n_cols, - const UWORD* ipiv) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - // Note that neither U nor L must be square. - // L has size n_rows x min(n_rows, n_cols). - // U has size min(n_rows, n_cols) x n_cols. - const UWORD min_rows_cols = min(n_rows, n_cols); - - const UWORD in_index = row + n_rows * col; // this is also L_out_index - const UWORD U_out_index = row + min_rows_cols * col; - - // We are extracted a permuted version of L. - // Instead of extracting row i of U as row i of L, - // we extract row i of U as row ipiv[i] of L. - const UWORD L_out_index = ipiv[row] + n_rows * col; - - if ((row < n_rows) && (col < min_rows_cols)) - { - L[L_out_index] = (row > col) ? in[in_index] : ((row == col) ? 1 : 0); - } - - if ((row < min_rows_cols) && (col < n_cols)) - { - U[U_out_index] = (row > col) ? 0 : in[in_index]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_inf.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_inf.cu deleted file mode 100644 index c1af8f5..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_inf.cu +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,rel_any_inf)(const eT1* X, - const UWORD n_elem, - uint* out, - const eT1 val /* ignored */) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - const eT1 val1 = X[i]; - const eT1 val2 = X[i + blockDim.x]; - - aux_mem[tid] |= coot_isinf(val1); - aux_mem[tid] |= coot_isinf(val2); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[i]; - - aux_mem[tid] |= coot_isinf(val1); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - or_subgroup_reduce_u32(aux_mem, tid); - } - - if (tid == 0) - { - out[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_inf_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_inf_small.cu deleted file mode 100644 index 0c91941..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_inf_small.cu +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_any_inf_small)(const eT1* X, - const UWORD n_elem, - uint* out, - const eT1 val /* ignored */) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 val1 = X[i]; - const eT1 val2 = X[i + blockDim.x]; - - aux_mem[tid] |= coot_isinf(val1); - aux_mem[tid] |= coot_isinf(val2); - if (aux_mem[tid] == 1) - break; - - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[i]; - - aux_mem[tid] |= coot_isinf(val1); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nan.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nan.cu deleted file mode 100644 index a5e82fc..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nan.cu +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,rel_any_nan)(const eT1* X, - const UWORD n_elem, - uint* out, - const eT1 val /* ignored */) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - const eT1 val1 = X[i]; - const eT1 val2 = X[i + blockDim.x]; - - aux_mem[tid] |= coot_isnan(val1); - aux_mem[tid] |= coot_isnan(val2); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[i]; - - aux_mem[tid] |= coot_isnan(val1); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - or_subgroup_reduce_u32(aux_mem, tid); - } - - if (tid == 0) - { - out[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nan_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nan_small.cu deleted file mode 100644 index 5e39330..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nan_small.cu +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the 
Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_any_nan_small)(const eT1* X, - const UWORD n_elem, - uint* out, - const eT1 val /* ignored */) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 val1 = X[i]; - const eT1 val2 = X[i + blockDim.x]; - - aux_mem[tid] |= coot_isnan(val1); - aux_mem[tid] |= coot_isnan(val2); - if (aux_mem[tid] == 1) - break; - - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[i]; - - aux_mem[tid] |= coot_isnan(val1); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nonfinite.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nonfinite.cu deleted file mode 100644 index 93847ba..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nonfinite.cu +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use 
this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,rel_any_nonfinite)(const eT1* X, - const UWORD n_elem, - uint* out, - const eT1 val /* ignored */) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - const eT1 val1 = X[i]; - const eT1 val2 = X[i + blockDim.x]; - - aux_mem[tid] |= !coot_isfinite(val1); - aux_mem[tid] |= !coot_isfinite(val2); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[i]; - - aux_mem[tid] |= !coot_isfinite(val1); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - or_subgroup_reduce_u32(aux_mem, tid); - } - - if (tid == 0) - { - out[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nonfinite_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nonfinite_small.cu deleted file mode 100644 index b6e7af8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nonfinite_small.cu +++ /dev/null @@ -1,63 +0,0 @@ 
-// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_any_nonfinite_small)(const eT1* X, - const UWORD n_elem, - uint* out, - const eT1 val /* ignored */) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 val1 = X[i]; - const eT1 val2 = X[i + blockDim.x]; - - aux_mem[tid] |= !coot_isfinite(val1); - aux_mem[tid] |= !coot_isfinite(val2); - if (aux_mem[tid] == 1) - break; - - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[i]; - - aux_mem[tid] |= !coot_isfinite(val1); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isfinite.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isfinite.cu deleted file mode 100644 index 19d2241..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isfinite.cu +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// 
Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_isfinite)(UWORD* out, - const eT1* X, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val = (eT1) X[i]; - out[i] = coot_isfinite(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isnan.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isnan.cu deleted file mode 100644 index d7b5dde..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isnan.cu +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_isnan)(UWORD* out, - const eT1* X, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val1 = (eT1) X[i]; - out[i] = coot_isnan(val1); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isnonfinite.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isnonfinite.cu deleted file mode 100644 index f1fa746..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isnonfinite.cu +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_isnonfinite)(UWORD* out, - const eT1* X, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val = (eT1) X[i]; - out[i] = !coot_isfinite(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_1.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_1.cu deleted file mode 100644 index 59b5818..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_1.cu +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Forward declaration of one-way kernel that we need. -__device__ void COOT_FN(PREFIX,accu_subgroup_reduce)(volatile eT1* data, int tid); - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,vec_norm_1)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 v1 = abs(in_mem[i]); - const eT1 v2 = abs(in_mem[i + blockDim.x]); - aux_mem[tid] += v1 + v2; - i += grid_size; - } - if (i < n_elem) - { - const eT1 v = abs(in_mem[i]); - aux_mem[tid] += v; - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - // Since we are just accumulating, we can use the accu_subgroup_reduce utility function. - COOT_FN(PREFIX,accu_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_1_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_1_small.cu deleted file mode 100644 index 0f4c579..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_1_small.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,vec_norm_1_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 v1 = abs(in_mem[i]); - const eT1 v2 = abs(in_mem[i + blockDim.x]); - aux_mem[tid] += v1 + v2; - i += grid_size; - } - if (i < n_elem) - { - const eT1 v = abs(in_mem[i]); - aux_mem[tid] += v; - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2.cu deleted file mode 100644 index 54ef65e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2.cu +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,vec_norm_2)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 v1 = in_mem[i] * in_mem[i]; - const eT1 v2 = in_mem[i + blockDim.x] * in_mem[i + blockDim.x]; - aux_mem[tid] += v1 + v2; - i += grid_size; - } - if (i < n_elem) - { - const eT1 v = in_mem[i] * in_mem[i]; - aux_mem[tid] += v; - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - // Since we are just accumulating, we can use the accu_subgroup_reduce utility function. - COOT_FN(PREFIX,accu_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_robust.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_robust.cu deleted file mode 100644 index 7684dc8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_robust.cu +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,vec_norm_2_robust)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem, - const eT1 max_val) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 v1 = (in_mem[i] / max_val); - const eT1 v2 = (in_mem[i + blockDim.x] / max_val); - aux_mem[tid] += (v1 * v1) + (v2 * v2); - i += grid_size; - } - if (i < n_elem) - { - const eT1 v = in_mem[i] / max_val; - aux_mem[tid] += (v * v); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - // Since we are just accumulating, we can use the accu_subgroup_reduce utility function. - COOT_FN(PREFIX,accu_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_robust_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_robust_small.cu deleted file mode 100644 index 672296d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_robust_small.cu +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,vec_norm_2_robust_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem, - const eT1 max_val) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 v1 = (in_mem[i] / max_val); - const eT1 v2 = (in_mem[i + blockDim.x] / max_val); - aux_mem[tid] += (v1 * v1) + (v2 * v2); - i += grid_size; - } - if (i < n_elem) - { - const eT1 v = (in_mem[i] / max_val); - aux_mem[tid] += (v * v); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_small.cu deleted file mode 100644 index 4584680..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_small.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,vec_norm_2_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 v1 = in_mem[i] * in_mem[i]; - const eT1 v2 = in_mem[i + blockDim.x] * in_mem[i + blockDim.x]; - aux_mem[tid] += v1 + v2; - i += grid_size; - } - if (i < n_elem) - { - const eT1 v = in_mem[i] * in_mem[i]; - aux_mem[tid] += v; - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_k.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_k.cu deleted file mode 100644 index f594ce3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_k.cu +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,vec_norm_k)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem, - const UWORD k) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 v1 = pow(in_mem[i], (eT1) k); - const eT1 v2 = pow(in_mem[i + blockDim.x], (eT1) k); - aux_mem[tid] += v1 + v2; - i += grid_size; - } - if (i < n_elem) - { - const eT1 v = pow(in_mem[i], (eT1) k); - aux_mem[tid] += v; - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - // Since we are just accumulating, we can use the accu_subgroup_reduce utility function. 
- COOT_FN(PREFIX,accu_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_k_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_k_small.cu deleted file mode 100644 index fb49e4c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_k_small.cu +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,vec_norm_k_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem, - const UWORD k) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 v1 = pow(in_mem[i], (eT1) k); - const eT1 v2 = pow(in_mem[i + blockDim.x], (eT1) k); - aux_mem[tid] += v1 + v2; - i += grid_size; - } - if (i < n_elem) - { - const eT1 v = pow(in_mem[i], (eT1) k); - aux_mem[tid] += v; - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_min.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_min.cu deleted file mode 100644 index 49684a9..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_min.cu +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Forward declaration of one-way kernel that we need. 
-__device__ void COOT_FN(PREFIX,min_subgroup_reduce)(volatile eT1* data, int tid); - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,vec_norm_min)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = coot_type_max((eT1) 0); - - if (i < n_elem) - { - aux_mem[tid] = abs(in_mem[i]); - } - if (i + blockDim.x < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], abs(in_mem[i + blockDim.x])); - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 v1 = abs(in_mem[i]); - const eT1 v2 = abs(in_mem[i + blockDim.x]); - const eT1 v3 = min(v1, v2); - aux_mem[tid] = min(aux_mem[tid], v3); - i += grid_size; - } - if (i < n_elem) - { - const eT1 v = abs(in_mem[i]); - aux_mem[tid] = min(aux_mem[tid], v); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = min(aux_mem[tid], aux_mem[tid + s]); - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - // Since we are just accumulating, we can use the min_subgroup_reduce utility function. 
- COOT_FN(PREFIX,min_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_min_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_min_small.cu deleted file mode 100644 index aca3f19..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_min_small.cu +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,vec_norm_min_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. 
- aux_mem[tid] = coot_type_max((eT1) 0); - - if (i < n_elem) - { - aux_mem[tid] = abs(in_mem[i]); - } - if (i + blockDim.x < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], abs(in_mem[i + blockDim.x])); - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 v1 = abs(in_mem[i]); - const eT1 v2 = abs(in_mem[i + blockDim.x]); - const eT1 v3 = min(v1, v2); - aux_mem[tid] = min(aux_mem[tid], v3); - i += grid_size; - } - if (i < n_elem) - { - const eT1 v = abs(in_mem[i]); - aux_mem[tid] = min(aux_mem[tid], v); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = min(aux_mem[tid], aux_mem[tid + s]); - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_div_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_div_post.cu deleted file mode 100644 index 976cd25..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_div_post.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_div_post)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_loc] = out_src[out_src_loc] / (TO_ET2(in[in_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_div_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_div_pre.cu deleted file mode 100644 index b215f9f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_div_pre.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_div_pre)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_loc] = (TO_ET2(in[in_loc])) / out_src[out_src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_minus_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_minus_post.cu deleted file mode 100644 index 779de93..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_minus_post.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_minus_post)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_loc] = out_src[out_src_loc] - (TO_ET2(in[in_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_minus_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_minus_pre.cu deleted file mode 100644 index c5038bf..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_minus_pre.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_minus_pre)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_loc] = (TO_ET2(in[in_loc])) - out_src[out_src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_plus.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_plus.cu deleted file mode 100644 index bebefac..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_plus.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_plus)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_loc] = out_src[out_src_loc] + (TO_ET2(in[in_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_schur.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_schur.cu deleted file mode 100644 index 8b5070b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_schur.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_schur)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_loc] = out_src[out_src_loc] * (TO_ET2(in[in_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_set.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_set.cu deleted file mode 100644 index 624c16d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_set.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_set)(eT2* out, - const eT2* /* out_src */, // old values are unused - const eT1* in, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD /* out_src_M_n_rows */, - const UWORD in_M_n_rows) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_loc] = TO_ET2(in[in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_div_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_div_post.cu deleted file mode 100644 index a4f9206..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_div_post.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_subset_div_post)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD* indices, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? - indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_loc] = out_src[out_src_loc] / (TO_ET2(in[in_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_div_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_div_pre.cu deleted file mode 100644 index d41d0ea..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_div_pre.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_subset_div_pre)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD* indices, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? 
- indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_loc] = (TO_ET2(in[in_loc])) / out_src[out_src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_minus_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_minus_post.cu deleted file mode 100644 index a1951c2..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_minus_post.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_subset_minus_post)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD* indices, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? - indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_loc] = out_src[out_src_loc] - (TO_ET2(in[in_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_minus_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_minus_pre.cu deleted file mode 100644 index 6478147..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_minus_pre.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_subset_minus_pre)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD* indices, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? 
- indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_loc] = (TO_ET2(in[in_loc])) - out_src[out_src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_plus.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_plus.cu deleted file mode 100644 index fc8d542..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_plus.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_subset_plus)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD* indices, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? 
- out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? - indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_loc] = out_src[out_src_loc] + (TO_ET2(in[in_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_schur.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_schur.cu deleted file mode 100644 index c2e4095..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_schur.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_subset_schur)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD* indices, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? - indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_loc] = out_src[out_src_loc] * (TO_ET2(in[in_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_set.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_set.cu deleted file mode 100644 index b1d6dd0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_set.cu +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_subset_set)(eT2* out, - const eT2* /* out_src */, // unused - const eT1* in, - const UWORD* indices, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD /* out_src_M_n_rows */, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_loc] = (TO_ET2(in[in_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/clamp.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/clamp.cu deleted file mode 100644 index 428aa03..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/clamp.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2021 Marcus Edel (http://www.kurg.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ -__global__ -void -COOT_FN(PREFIX,clamp)(eT2* dest, - const eT1* src, - const eT1 min_val, - const eT1 max_val, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD src_index = row + col * src_M_n_rows; - const UWORD dest_index = row + col * dest_M_n_rows; - - const eT1 clamped_val = max(min_val, min(max_val, src[src_index])); - dest[dest_index] = TO_ET2(clamped_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/cross.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/cross.cu deleted file mode 100644 index 6cc7a96..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/cross.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,cross)(eT2* out, - const eT1* A, - const eT1* B) // A and B should have 3 elements - { - const UWORD idx = blockIdx.x * blockDim.x + threadIdx.x; - - if (idx < 3) - { - const UWORD a1_index = ((idx + 1) % 3); - const UWORD a2_index = ((idx + 2) % 3); - - const UWORD b1_index = ((idx + 2) % 3); - const UWORD b2_index = ((idx + 1) % 3); - - const eT1 val = (A[a1_index] * B[b1_index]) - (A[a2_index] * B[b2_index]); - out[idx] = TO_ET2(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/dot.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/dot.cu deleted file mode 100644 index c0f1af3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/dot.cu +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__device__ -void -COOT_FN(PREFIX,dot_subgroup_reduce)(volatile twoway_promoted_eT* data, int tid) - { - data[tid] += data[tid + 32]; - data[tid] += data[tid + 16]; - data[tid] += data[tid + 8]; - data[tid] += data[tid + 4]; - data[tid] += data[tid + 2]; - data[tid] += data[tid + 1]; - } - - - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,dot)(twoway_promoted_eT* out_mem, - const eT1* A, - const eT2* B, - const UWORD n_elem) - { - twoway_promoted_eT* aux_mem = (twoway_promoted_eT*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - const twoway_promoted_eT A_i1 = TO_TWOWAY_PROMOTED_ET(A[i]); - const twoway_promoted_eT B_i1 = TO_TWOWAY_PROMOTED_ET(B[i]); - - const twoway_promoted_eT A_i2 = TO_TWOWAY_PROMOTED_ET(A[i + blockDim.x]); - const twoway_promoted_eT B_i2 = TO_TWOWAY_PROMOTED_ET(B[i + blockDim.x]); - - aux_mem[tid] += (A_i1 * B_i1) + (A_i2 * B_i2); // copy to local shared memory - i += grid_size; - } - if (i < n_elem) - { - const twoway_promoted_eT A_i1 = TO_TWOWAY_PROMOTED_ET(A[i]); - const twoway_promoted_eT B_i1 = TO_TWOWAY_PROMOTED_ET(B[i]); - - aux_mem[tid] += (A_i1 * B_i1); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,dot_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/dot_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/dot_small.cu deleted file mode 100644 index 27735f6..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/dot_small.cu +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,dot_small)(twoway_promoted_eT* out_mem, - const eT1* A, - const eT2* B, - const UWORD n_elem) - { - twoway_promoted_eT* aux_mem = (twoway_promoted_eT*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - const twoway_promoted_eT A_i1 = TO_TWOWAY_PROMOTED_ET(A[i]); - const twoway_promoted_eT B_i1 = TO_TWOWAY_PROMOTED_ET(B[i]); - - const twoway_promoted_eT A_i2 = TO_TWOWAY_PROMOTED_ET(A[i + blockDim.x]); - const twoway_promoted_eT B_i2 = TO_TWOWAY_PROMOTED_ET(B[i + blockDim.x]); - - // copy to local shared memory - aux_mem[tid] += (A_i1 * B_i1) + (A_i2 * B_i2); - i += grid_size; - } - if (i < n_elem) - { - const twoway_promoted_eT A_i1 = TO_TWOWAY_PROMOTED_ET(A[i]); - const twoway_promoted_eT B_i1 = TO_TWOWAY_PROMOTED_ET(B[i]); - - aux_mem[tid] += (A_i1 * B_i1); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/htrans.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/htrans.cu deleted file mode 100644 index 4dc7d45..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/htrans.cu +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2022 Ryan Curtin 
(http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// TODO: once we have complex support, this will need to be amended -__global__ -void -COOT_FN(PREFIX,htrans)(eT2* out, - const eT1* in, - const UWORD in_n_rows, - const UWORD in_n_cols) - { - // For a non-inplace transpose, we can use a pretty naive approach. - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD in_offset = row + col * in_n_rows; - const UWORD out_offset = col + row * in_n_cols; - - if( (row < in_n_rows) && (col < in_n_cols) ) - { - const eT2 element = TO_ET2(in[in_offset]); - out[out_offset] = element; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_div_array.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_div_array.cu deleted file mode 100644 index 6cab7e1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_div_array.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve1_div_array)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] /= TO_ET2(src[i]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_div_sve1.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_div_sve1.cu deleted file mode 100644 index 4b5524b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_div_sve1.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve1_div_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] /= TO_ET2(src[src_locs[i]]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_eq_array.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_eq_array.cu deleted file mode 100644 index db7051c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_eq_array.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve1_eq_array)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] = TO_ET2(src[i]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_eq_sve1.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_eq_sve1.cu deleted file mode 100644 index 8e45725..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_eq_sve1.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve1_eq_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] = TO_ET2(src[src_locs[i]]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_minus_array.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_minus_array.cu deleted file mode 100644 index 4447bf4..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_minus_array.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve1_minus_array)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] -= TO_ET2(src[i]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_minus_sve1.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_minus_sve1.cu deleted file mode 100644 index aa3141b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_minus_sve1.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve1_minus_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] -= TO_ET2(src[src_locs[i]]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_mul_array.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_mul_array.cu deleted file mode 100644 index 97efc6e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_mul_array.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve1_mul_array)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] *= TO_ET2(src[i]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_mul_sve1.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_mul_sve1.cu deleted file mode 100644 index 6bb3583..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_mul_sve1.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve1_mul_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] *= TO_ET2(src[src_locs[i]]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_plus_array.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_plus_array.cu deleted file mode 100644 index f2aa01c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_plus_array.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve1_plus_array)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] += TO_ET2(src[i]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_plus_sve1.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_plus_sve1.cu deleted file mode 100644 index 6f7dd53..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_plus_sve1.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve1_plus_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] += TO_ET2(src[src_locs[i]]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_div_array.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_div_array.cu deleted file mode 100644 index 427ec81..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_div_array.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve2_div_array)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - (dest_n_rows * ((dest_col_locs == NULL) ? 
col : dest_col_locs[col])); - const UWORD src_loc = row + col * n_rows; - - dest[dest_loc] /= TO_ET2(src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_div_sve2.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_div_sve2.cu deleted file mode 100644 index ea73f4f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_div_sve2.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve2_div_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? 
col : src_col_locs[col]); - - dest[dest_loc] /= TO_ET2(src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_eq_array.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_eq_array.cu deleted file mode 100644 index c19ee2d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_eq_array.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve2_eq_array)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - (dest_n_rows * ((dest_col_locs == NULL) ? 
col : dest_col_locs[col])); - const UWORD src_loc = row + col * n_rows; - - dest[dest_loc] = TO_ET2(src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_eq_sve2.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_eq_sve2.cu deleted file mode 100644 index 293bb28..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_eq_sve2.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve2_eq_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? 
col : src_col_locs[col]); - - dest[dest_loc] = TO_ET2(src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_minus_array.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_minus_array.cu deleted file mode 100644 index a8c1655..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_minus_array.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve2_minus_array)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - (dest_n_rows * ((dest_col_locs == NULL) ? 
col : dest_col_locs[col])); - const UWORD src_loc = row + col * n_rows; - - dest[dest_loc] -= TO_ET2(src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_minus_sve2.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_minus_sve2.cu deleted file mode 100644 index 94988f1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_minus_sve2.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve2_minus_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? 
col : src_col_locs[col]); - - dest[dest_loc] -= TO_ET2(src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_mul_array.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_mul_array.cu deleted file mode 100644 index fa47196..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_mul_array.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve2_mul_array)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - (dest_n_rows * ((dest_col_locs == NULL) ? 
col : dest_col_locs[col])); - const UWORD src_loc = row + col * n_rows; - - dest[dest_loc] *= TO_ET2(src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_mul_sve2.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_mul_sve2.cu deleted file mode 100644 index 52eca01..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_mul_sve2.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve2_mul_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? 
col : src_col_locs[col]); - - dest[dest_loc] *= TO_ET2(src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_plus_array.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_plus_array.cu deleted file mode 100644 index c7830d2..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_plus_array.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve2_plus_array)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - (dest_n_rows * ((dest_col_locs == NULL) ? 
col : dest_col_locs[col])); - const UWORD src_loc = row + col * n_rows; - - dest[dest_loc] += TO_ET2(src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_plus_sve2.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_plus_sve2.cu deleted file mode 100644 index 22c23f8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_plus_sve2.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve2_plus_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? 
col : src_col_locs[col]); - - dest[dest_loc] += TO_ET2(src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_colwise_conv_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_colwise_conv_post.cu deleted file mode 100644 index 0348de6..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_colwise_conv_post.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,max_colwise_conv_post)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[col * src_M_n_rows]); - eT1 acc = colptr[0]; - for (UWORD i = 1; i < n_rows; ++i) - { - acc = max(acc, colptr[i]); - } - - dest[col * dest_mem_incr] = TO_ET2(acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_colwise_conv_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_colwise_conv_pre.cu deleted file mode 100644 index fa94bc3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_colwise_conv_pre.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,max_colwise_conv_pre)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[col * src_M_n_rows]); - eT2 acc = TO_ET2(colptr[0]); - for (UWORD i = 1; i < n_rows; ++i) - { - acc = max(acc, TO_ET2(colptr[i])); - } - - dest[col * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_cube_col_conv_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_cube_col_conv_post.cu deleted file mode 100644 index f6dd6f3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_cube_col_conv_post.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,max_cube_col_conv_post)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD slice = blockIdx.y * blockDim.y + threadIdx.y; - - if(row < n_rows && slice < n_slices) - { - eT1 acc = src[row + slice * n_rows * n_cols]; - for (UWORD i = 1; i < n_cols; ++i) - { - acc = max(acc, src[(i * n_rows) + row + slice * n_rows * n_cols]); - } - - dest[row + slice * n_rows] = TO_ET2(acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_cube_col_conv_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_cube_col_conv_pre.cu deleted file mode 100644 index 7b2f2ce..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_cube_col_conv_pre.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,max_cube_col_conv_pre)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD slice = blockIdx.y * blockDim.y + threadIdx.y; - - if(row < n_rows && slice < n_slices) - { - eT2 acc = TO_ET2(src[row + slice * n_rows * n_cols]); - for (UWORD i = 1; i < n_cols; ++i) - { - acc = max(acc, TO_ET2(src[(i * n_rows) + row + slice * n_rows * n_cols])); - } - - dest[row + slice * n_rows] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_rowwise_conv_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_rowwise_conv_post.cu deleted file mode 100644 index b456f62..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_rowwise_conv_post.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,max_rowwise_conv_post)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT1 acc = src[row]; - for (UWORD i = 1; i < n_cols; ++i) - { - acc = max(acc, src[(i * src_M_n_rows) + row]); - } - - dest[row * dest_mem_incr] = TO_ET2(acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_rowwise_conv_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_rowwise_conv_pre.cu deleted file mode 100644 index 28b6c20..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_rowwise_conv_pre.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,max_rowwise_conv_pre)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT2 acc = TO_ET2(src[row]); - for (UWORD i = 1; i < n_cols; ++i) - { - acc = max(acc, TO_ET2(src[(i * src_M_n_rows) + row])); - } - - dest[row * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_colwise_conv_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_colwise_conv_post.cu deleted file mode 100644 index e12c7a5..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_colwise_conv_post.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,mean_colwise_conv_post)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[ col * src_M_n_rows ]); - eT1 acc = TO_ET1(0); - for (UWORD i = 0; i < n_rows; ++i) - { - acc += colptr[i]; - } - - dest[col * dest_mem_incr] = TO_ET2(acc / TO_ET1(n_rows)); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_colwise_conv_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_colwise_conv_pre.cu deleted file mode 100644 index f0c17ee..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_colwise_conv_pre.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,mean_colwise_conv_pre)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[ col * src_M_n_rows ]); - eT2 acc = TO_ET2(0); - for (UWORD i = 0; i < n_rows; ++i) - { - acc += TO_ET2(colptr[i]); - } - - dest[col * dest_mem_incr] = (acc / TO_ET2(n_rows)); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_rowwise_conv_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_rowwise_conv_post.cu deleted file mode 100644 index 21b41d8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_rowwise_conv_post.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,mean_rowwise_conv_post)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT1 acc = TO_ET1(0); - for (UWORD i = 0; i < n_cols; ++i) - { - acc += src[i * src_M_n_rows + row]; - } - - dest[row * dest_mem_incr] = TO_ET2(acc / TO_ET1(n_cols)); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_rowwise_conv_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_rowwise_conv_pre.cu deleted file mode 100644 index c3edf00..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_rowwise_conv_pre.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,mean_rowwise_conv_pre)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT2 acc = TO_ET2(0); - for (UWORD i = 0; i < n_cols; ++i) - { - acc += TO_ET2(src[i * src_M_n_rows + row]); - } - - dest[row * dest_mem_incr] = (acc / TO_ET2(n_cols)); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_colwise_conv_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_colwise_conv_post.cu deleted file mode 100644 index 026597c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_colwise_conv_post.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,min_colwise_conv_post)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[col * src_M_n_rows]); - eT1 acc = colptr[0]; - for (UWORD i = 1; i < n_rows; ++i) - { - acc = min(acc, colptr[i]); - } - - dest[col * dest_mem_incr] = TO_ET2(acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_colwise_conv_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_colwise_conv_pre.cu deleted file mode 100644 index a5f7086..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_colwise_conv_pre.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,min_colwise_conv_pre)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[col * src_M_n_rows]); - eT2 acc = TO_ET2(colptr[0]); - for (UWORD i = 1; i < n_rows; ++i) - { - acc = min(acc, TO_ET2(colptr[i])); - } - - dest[col * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_cube_col_conv_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_cube_col_conv_post.cu deleted file mode 100644 index 161a7c3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_cube_col_conv_post.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,min_cube_col_conv_post)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD slice = blockIdx.y * blockDim.y + threadIdx.y; - - if(row < n_rows && slice < n_slices) - { - eT1 acc = src[row + slice * n_rows * n_cols]; - for (UWORD i = 1; i < n_cols; ++i) - { - acc = min(acc, src[(i * n_rows) + row + slice * n_rows * n_cols]); - } - - dest[row + slice * n_rows] = TO_ET2(acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_cube_col_conv_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_cube_col_conv_pre.cu deleted file mode 100644 index d62aea0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_cube_col_conv_pre.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,min_cube_col_conv_pre)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD slice = blockIdx.y * blockDim.y + threadIdx.y; - - if(row < n_rows && slice < n_slices) - { - eT2 acc = TO_ET2(src[row + slice * n_rows * n_cols]); - for (UWORD i = 1; i < n_cols; ++i) - { - acc = min(acc, TO_ET2(src[(i * n_rows) + row + slice * n_rows * n_cols])); - } - - dest[row + slice * n_rows] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_rowwise_conv_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_rowwise_conv_post.cu deleted file mode 100644 index 8e125af..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_rowwise_conv_post.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,min_rowwise_conv_post)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT1 acc = src[row]; - for (UWORD i = 1; i < n_cols; ++i) - { - acc = min(acc, src[(i * src_M_n_rows) + row]); - } - - dest[row * dest_mem_incr] = TO_ET2(acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_rowwise_conv_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_rowwise_conv_pre.cu deleted file mode 100644 index aadf89a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_rowwise_conv_pre.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,min_rowwise_conv_pre)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT2 acc = TO_ET2(src[row]); - for (UWORD i = 1; i < n_cols; ++i) - { - acc = min(acc, TO_ET2(src[(i * src_M_n_rows) + row])); - } - - dest[row * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq.cu deleted file mode 100644 index b6390cd..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq.cu +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,rel_all_neq)(const eT1* X, // will be casted to eT2 before comparison - const UWORD n_elem, - uint* out, - const eT2 val) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 1; - - while (i + blockDim.x < n_elem) - { - const eT2 val1 = TO_ET2(X[i]); - const eT2 val2 = TO_ET2(X[i + blockDim.x]); - - aux_mem[tid] &= (val1 != val); - aux_mem[tid] &= (val2 != val); - i += grid_size; - } - if (i < n_elem) - { - const eT2 val1 = TO_ET2(X[i]); - - aux_mem[tid] &= (val1 != val); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - and_subgroup_reduce_u32(aux_mem, tid); - } - - if (tid == 0) - { - out[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_colwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_colwise.cu deleted file mode 100644 index 510e92f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_colwise.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_all_neq_colwise)(UWORD* out, - const eT1* A, - const eT2 val, - const UWORD A_n_rows, - const UWORD A_n_cols) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < A_n_cols) - { - const eT1* colptr = &(A[ col*A_n_rows ]); - UWORD result = 1; - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT2 val1 = TO_ET2(colptr[i]); - result &= (val1 != val); - } - - out[col] = result; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_rowwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_rowwise.cu deleted file mode 100644 index cd04aaf..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_rowwise.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_all_neq_rowwise)(UWORD* out, - const eT1* A, - const eT2 val, - const UWORD A_n_rows, - const UWORD A_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < A_n_rows) - { - UWORD result = 1; - for (UWORD i = 0; i < A_n_cols; ++i) - { - const eT2 val1 = TO_ET2(A[i * A_n_rows + row]); - result &= (val1 != val); - } - - out[row] = result; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_small.cu deleted file mode 100644 index 396e49e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_small.cu +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_all_neq_small)(const eT1* X, // will be casted to eT2 before comparison - const UWORD n_elem, - uint* out, - const eT2 val) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 1; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT2 val1 = TO_ET2(X[i]); - const eT2 val2 = TO_ET2(X[i + blockDim.x]); - - aux_mem[tid] &= (val1 != val); - aux_mem[tid] &= (val2 != val); - i += grid_size; - } - if (i < n_elem) - { - const eT2 val1 = TO_ET2(X[i]); - - aux_mem[tid] &= (val1 != val); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq.cu deleted file mode 100644 index ebb2e8f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq.cu +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,rel_any_neq)(const eT1* X, // will be casted to eT2 before comparison - const UWORD n_elem, - uint* out, - const eT2 val) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - const eT2 val1 = TO_ET2(X[i]); - const eT2 val2 = TO_ET2(X[i + blockDim.x]); - - aux_mem[tid] |= (val1 != val); - aux_mem[tid] |= (val2 != val); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT2 val1 = TO_ET2(X[i]); - - aux_mem[tid] |= (val1 != val); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - or_subgroup_reduce_u32(aux_mem, tid); - } - - if (tid == 0) - { - out[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_colwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_colwise.cu deleted file mode 100644 index 49694df..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_colwise.cu +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_any_neq_colwise)(UWORD* out, - const eT1* A, - const eT2 val, - const UWORD A_n_rows, - const UWORD A_n_cols) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < A_n_cols) - { - const eT1* colptr = &(A[ col*A_n_rows ]); - UWORD result = 0; - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT2 val1 = TO_ET2(colptr[i]); - result |= (val1 != val); - if (result == 1) - break; - } - - out[col] = result; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_rowwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_rowwise.cu deleted file mode 100644 index 8656a3b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_rowwise.cu +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_any_neq_rowwise)(UWORD* out, - const eT1* A, - const eT2 val, - const UWORD A_n_rows, - const UWORD A_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < A_n_rows) - { - UWORD result = 0; - for (UWORD i = 0; i < A_n_cols; ++i) - { - const eT2 val1 = TO_ET2(A[i * A_n_rows + row]); - result |= (val1 != val); - if (result == 1) - break; - } - - out[row] = result; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_small.cu deleted file mode 100644 index e79e394..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_small.cu +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_any_neq_small)(const eT1* X, // will be casted to eT2 before comparison - const UWORD n_elem, - uint* out, - const eT2 val) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT2 val1 = TO_ET2(X[i]); - const eT2 val2 = TO_ET2(X[i + blockDim.x]); - - aux_mem[tid] |= (val1 != val); - aux_mem[tid] |= (val2 != val); - if (aux_mem[tid] == 1) - break; - - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT2 val1 = TO_ET2(X[i]); - - aux_mem[tid] |= (val1 != val); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/strans.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/strans.cu deleted file mode 100644 index 0e9dc7b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/strans.cu +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,strans)(eT2* out, - const eT1* in, - const UWORD in_n_rows, - const UWORD in_n_cols) - { - // For a non-inplace transpose, we can use a pretty naive approach. - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD in_offset = row + col * in_n_rows; - const UWORD out_offset = col + row * in_n_cols; - - if( (row < in_n_rows) && (col < in_n_cols) ) - { - const eT2 element = TO_ET2(in[in_offset]); - out[out_offset] = element; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_colwise_conv_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_colwise_conv_post.cu deleted file mode 100644 index d82bf19..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_colwise_conv_post.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,sum_colwise_conv_post)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[col * src_M_n_rows]); - eT1 acc = TO_ET1(0); - for (UWORD i = 0; i < n_rows; ++i) - { - acc += colptr[i]; - } - - dest[col * dest_mem_incr] = TO_ET2(acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_colwise_conv_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_colwise_conv_pre.cu deleted file mode 100644 index bc9816b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_colwise_conv_pre.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,sum_colwise_conv_pre)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[col * src_M_n_rows]); - eT2 acc = TO_ET2(0); - for (UWORD i = 0; i < n_rows; ++i) - { - acc += TO_ET2(colptr[i]); - } - - dest[col * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_rowwise_conv_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_rowwise_conv_post.cu deleted file mode 100644 index 26b257e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_rowwise_conv_post.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,sum_rowwise_conv_post)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT1 acc = TO_ET1(0); - for (UWORD i = 0; i < n_cols; ++i) - { - acc += src[(i * src_M_n_rows) + row]; - } - - dest[row * dest_mem_incr] = TO_ET2(acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_rowwise_conv_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_rowwise_conv_pre.cu deleted file mode 100644 index 49425dd..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_rowwise_conv_pre.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,sum_rowwise_conv_pre)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT2 acc = TO_ET2(0); - for (UWORD i = 0; i < n_cols; ++i) - { - acc += TO_ET2(src[(i * src_M_n_rows) + row]); - } - - dest[row * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/symmatl.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/symmatl.cu deleted file mode 100644 index b0ec3ce..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/symmatl.cu +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,symmatl)(eT2* out, - const eT1* A, - const UWORD size) // matrix is expected to be square - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < size && col < size && row >= col) - { - const eT2 val = TO_ET2(A[row + size * col]); - - out[col + size * row] = val; - out[row + size * col] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/symmatu.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/symmatu.cu deleted file mode 100644 index 34dfbd6..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/symmatu.cu +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,symmatu)(eT2* out, - const eT1* A, - const UWORD size) // matrix is expected to be square - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < size && col < size && col >= row) - { - const eT2 val = TO_ET2(A[row + size * col]); - - out[row + size * col] = val; - out[col + size * row] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/zeroway/shuffle_large_compute_locs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/zeroway/shuffle_large_compute_locs.cu deleted file mode 100644 index 9a78caa..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/zeroway/shuffle_large_compute_locs.cu +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2021 Marcus Edel (http://www.kurg.org/) -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// This performs the first part of the shuffle_vec kernel: it computes random -// locations for the output using the variable philox bijective shuffle, -// and then does the first step of the output compression (the upsweep of the -// shifted prefix sum). 
-__global__ -void -shuffle_large_compute_locs(UWORD* out_block_mem, - const UWORD n_elem, - const UWORD n_elem_pow2, - const UWORD* philox_key, - const UWORD num_bits) - { - UWORD* aux_mem = (UWORD*) aux_shared_mem; - - const UWORD tid = threadIdx.x + blockIdx.x * blockDim.x; - const UWORD local_tid = threadIdx.x; - const UWORD local_size = blockDim.x; - - // Get our bijective shuffle location. - const UWORD in_loc = var_philox(tid, philox_key, num_bits); - - // Fill aux_mem with the indicator of whether we are out of bounds. - // Then, we'll prefix-sum it. This will tell us where to put our result. - aux_mem[local_tid] = (in_loc < n_elem); - __syncthreads(); - - // Now, prefix-sum the auxiliary memory. - // This allows us to do the shuffle-compaction step. - UWORD offset = 1; - for (UWORD s = local_size / 2; s > 0; s >>= 1) - { - if (local_tid < s) - { - const UWORD ai = offset * (2 * local_tid + 1) - 1; - const UWORD bi = offset * (2 * local_tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (local_tid == 0) - { - out_block_mem[blockIdx.x] = aux_mem[local_size - 1]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/d_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/d_defs.cl deleted file mode 100644 index a4801ac..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/d_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -inline double coot_type_min_double() { return -DBL_MAX; } -inline double coot_type_minpos_double() { return DBL_MIN; } -inline double coot_type_max_double() { return DBL_MAX; } - -inline bool coot_is_fp_double() { return true; } -inline bool coot_is_signed_double() { return true; } -inline bool coot_isnan_double(const double x) { return isnan(x); } - -inline double coot_absdiff_double(const double x, const double y) { return fabs(x - y); } - -inline double coot_conj_double(const double x) { return x; } - -inline double coot_plus_double(const double a, const double b) { return a + b; } -inline double coot_minus_double(const double a, const double b) { return a - b; } -inline double coot_mul_double(const double a, const double b) { return a * b; } -inline double coot_div_double(const double a, const double b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/f_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/f_defs.cl deleted file mode 100644 index be682a1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/f_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -inline float coot_type_min_float() { return -FLT_MAX; } -inline float coot_type_minpos_float() { return FLT_MIN; } -inline float coot_type_max_float() { return FLT_MAX; } - -inline bool coot_is_fp_float() { return true; } -inline bool coot_is_signed_float() { return true; } -inline bool coot_isnan_float(const float x) { return isnan(x); } - -inline float coot_absdiff_float(const float x, const float y) { return fabs(x - y); } - -inline float coot_conj_float(const float x) { return x; } - -inline float coot_plus_float(const float a, const float b) { return a + b; } -inline float coot_minus_float(const float a, const float b) { return a - b; } -inline float coot_mul_float(const float a, const float b) { return a * b; } -inline float coot_div_float(const float a, const float b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/h_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/h_defs.cl deleted file mode 100644 index f8215c5..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/h_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -inline half coot_type_min_half() { return -HALF_MAX; } -inline half coot_type_minpos_half() { return HALF_MIN; } -inline half coot_type_max_half() { return HALF_MAX; } - -inline bool coot_is_fp_half() { return true; } -inline bool coot_is_signed_half() { return true; } -inline bool coot_isnan_half(const half x) { return isnan(x); } - -inline half coot_absdiff_half(const half x, const half y) { return fabs(x - y); } - -inline half coot_conj_half(const half x) { return x; } - -inline half coot_plus_half(const half a, const half b) { return a + b; } -inline half coot_minus_half(const half a, const half b) { return a - b; } -inline half coot_mul_half(const half a, const half b) { return a * b; } -inline half coot_div_half(const half a, const half b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/opencl_prelims.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/opencl_prelims.cl deleted file mode 100644 index 93be9b6..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/opencl_prelims.cl +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// These statically-compiled definitions are available in any Bandicoot kernel. 
-typedef float2 cx_float; -#ifdef COOT_HAVE_FP64 -typedef double2 cx_double; -#endif - -#define COOT_FN2(ARG1,ARG2) ARG1 ## ARG2 -#define COOT_FN(ARG1,ARG2) COOT_FN2(ARG1,ARG2) - -#define COOT_FN_3_2(ARG1,ARG2,ARG3) ARG1 ## ARG2 ## ARG3 -#define COOT_FN_3(ARG1,ARG2,ARG3) COOT_FN_3_2(ARG1,ARG2,ARG3) - -// Sometimes we need to approximate Armadillo functionality that uses -// double---but double may not be available. So we do our best... -#ifdef COOT_HAVE_FP64 - #define ARMA_FP_TYPE double - #define ARMA_FP_MAX DBL_MAX - #define ARMA_FP_MIN DBL_MIN -#else - #define ARMA_FP_TYPE float - #define ARMA_FP_MAX FLT_MAX - #define ARMA_FP_MIN FLT_MIN -#endif diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s16_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s16_defs.cl deleted file mode 100644 index 98898a3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s16_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -inline short coot_type_min_short() { return COOT_S16_MIN; } -inline short coot_type_minpos_short() { return 1; } -inline short coot_type_max_short() { return COOT_S16_MAX; } - -inline bool coot_is_fp_short() { return false; } -inline bool coot_is_signed_short() { return true; } -inline bool coot_isnan_short(const short x) { return false; } - -inline short coot_absdiff_short(const short x, const short y) { return abs(x - y); } - -inline short coot_conj_short(const short x) { return x; } - -inline short coot_plus_short(const short a, const short b) { return a + b; } -inline short coot_minus_short(const short a, const short b) { return a - b; } -inline short coot_mul_short(const short a, const short b) { return a * b; } -inline short coot_div_short(const short a, const short b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s32_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s32_defs.cl deleted file mode 100644 index 9403ec6..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s32_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -inline int coot_type_min_int() { return COOT_S32_MIN; } -inline int coot_type_minpos_int() { return 1; } -inline int coot_type_max_int() { return COOT_S32_MAX; } - -inline bool coot_is_fp_int() { return false; } -inline bool coot_is_signed_int() { return true; } -inline bool coot_isnan_int(const int x) { return false; } - -inline int coot_absdiff_int(const int x, const int y) { return abs(x - y); } - -inline int coot_conj_int(const int x) { return x; } - -inline int coot_plus_int(const int a, const int b) { return a + b; } -inline int coot_minus_int(const int a, const int b) { return a - b; } -inline int coot_mul_int(const int a, const int b) { return a * b; } -inline int coot_div_int(const int a, const int b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s64_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s64_defs.cl deleted file mode 100644 index 93c9c96..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s64_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -inline long coot_type_min_long() { return COOT_S64_MIN; } -inline long coot_type_minpos_long() { return 1; } -inline long coot_type_max_long() { return COOT_S64_MAX; } - -inline bool coot_is_fp_long() { return false; } -inline bool coot_is_signed_long() { return true; } -inline bool coot_isnan_long(const long x) { return false; } - -inline long coot_absdiff_long(const long x, const long y) { return abs(x - y); } - -inline long coot_conj_long(const long x) { return x; } - -inline long coot_plus_long(const long a, const long b) { return a + b; } -inline long coot_minus_long(const long a, const long b) { return a - b; } -inline long coot_mul_long(const long a, const long b) { return a * b; } -inline long coot_div_long(const long a, const long b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s8_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s8_defs.cl deleted file mode 100644 index 20fb6fb..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s8_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -inline char coot_type_min_char() { return COOT_S8_MIN; } -inline char coot_type_minpos_char() { return 1; } -inline char coot_type_max_char() { return COOT_S8_MAX; } - -inline bool coot_is_fp_char() { return false; } -inline bool coot_is_signed_char() { return true; } -inline bool coot_isnan_char(const char x) { return false; } - -inline char coot_absdiff_char(const char x, const char y) { return abs(x - y); } - -inline char coot_conj_char(const char x) { return x; } - -inline char coot_plus_char(const char a, const char b) { return a + b; } -inline char coot_minus_char(const char a, const char b) { return a - b; } -inline char coot_mul_char(const char a, const char b) { return a * b; } -inline char coot_div_char(const char a, const char b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u16_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u16_defs.cl deleted file mode 100644 index 5848c03..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u16_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -inline ushort coot_type_min_ushort() { return 0; } -inline ushort coot_type_minpos_ushort() { return 1; } -inline ushort coot_type_max_ushort() { return COOT_U16_MAX; } - -inline bool coot_is_fp_ushort() { return false; } -inline bool coot_is_signed_ushort() { return false; } -inline bool coot_isnan_ushort(const ushort x) { return false; } - -inline ushort coot_absdiff_ushort(const ushort x, const ushort y) { return (x > y) ? (x - y) : (y - x); } - -inline ushort coot_conj_ushort(const ushort x) { return x; } - -inline ushort coot_plus_ushort(const ushort a, const ushort b) { return a + b; } -inline ushort coot_minus_ushort(const ushort a, const ushort b) { return a - b; } -inline ushort coot_mul_ushort(const ushort a, const ushort b) { return a * b; } -inline ushort coot_div_ushort(const ushort a, const ushort b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u32_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u32_defs.cl deleted file mode 100644 index e427814..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u32_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -inline uint coot_type_min_uint() { return 0; } -inline uint coot_type_minpos_uint() { return 1; } -inline uint coot_type_max_uint() { return COOT_U32_MAX; } - -inline bool coot_is_fp_uint() { return false; } -inline bool coot_is_signed_uint() { return false; } -inline bool coot_isnan_uint(const uint x) { return false; } - -inline uint coot_absdiff_uint(const uint x, const uint y) { return (x > y) ? (x - y) : (y - x); } - -inline uint coot_conj_uint(const uint x) { return x; } - -inline uint coot_plus_uint(const uint a, const uint b) { return a + b; } -inline uint coot_minus_uint(const uint a, const uint b) { return a - b; } -inline uint coot_mul_uint(const uint a, const uint b) { return a * b; } -inline uint coot_div_uint(const uint a, const uint b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u64_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u64_defs.cl deleted file mode 100644 index 3e308b3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u64_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -inline ulong coot_type_min_ulong() { return 0; } -inline ulong coot_type_minpos_ulong() { return 1; } -inline ulong coot_type_max_ulong() { return COOT_U64_MAX; } - -inline bool coot_is_fp_ulong() { return false; } -inline bool coot_is_signed_ulong() { return false; } -inline bool coot_isnan_ulong(const ulong x) { return false; } - -inline ulong coot_absdiff_ulong(const ulong x, const ulong y) { return (x > y) ? (x - y) : (y - x); } - -inline ulong coot_conj_ulong(const ulong x) { return x; } - -inline ulong coot_plus_ulong(const ulong a, const ulong b) { return a + b; } -inline ulong coot_minus_ulong(const ulong a, const ulong b) { return a - b; } -inline ulong coot_mul_ulong(const ulong a, const ulong b) { return a * b; } -inline ulong coot_div_ulong(const ulong a, const ulong b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u8_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u8_defs.cl deleted file mode 100644 index fda8d45..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u8_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -inline uchar coot_type_min_uchar() { return 0; } -inline uchar coot_type_minpos_uchar() { return 1; } -inline uchar coot_type_max_uchar() { return COOT_U8_MAX; } - -inline bool coot_is_fp_uchar() { return false; } -inline bool coot_is_signed_uchar() { return false; } -inline bool coot_isnan_uchar(const uchar x) { return false; } - -inline uchar coot_absdiff_uchar(const uchar x, const uchar y) { return (x > y) ? (x - y) : (y - x); } - -inline uchar coot_conj_uchar(const uchar x) { return x; } - -inline uchar coot_plus_uchar(const uchar a, const uchar b) { return a + b; } -inline uchar coot_minus_uchar(const uchar a, const uchar b) { return a - b; } -inline uchar coot_mul_uchar(const uchar a, const uchar b) { return a * b; } -inline uchar coot_div_uchar(const uchar a, const uchar b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/accu_subgroup_reduce.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/deps/accu_subgroup_reduce.cl deleted file mode 100644 index 2063b30..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/accu_subgroup_reduce.cl +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -void -COOT_FN(PREFIX,accu_subgroup_reduce_other)(__local volatile eT1* data, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - data[tid] += data[tid + i]; - - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -COOT_FN(PREFIX,accu_subgroup_reduce_8)(__local volatile eT1* data, UWORD tid) - { - data[tid] += data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,accu_subgroup_reduce_16)(__local volatile eT1* data, UWORD tid) - { - data[tid] += data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,accu_subgroup_reduce_32)(__local volatile eT1* data, UWORD tid) - { - data[tid] += data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,accu_subgroup_reduce_64)(__local volatile eT1* data, UWORD tid) - { - data[tid] += data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 4]; - 
SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,accu_subgroup_reduce_128)(__local volatile eT1* data, UWORD tid) - { - data[tid] += data[tid + 128]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 1]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/and_subgroup_reduce_u32.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/deps/and_subgroup_reduce_u32.cl deleted file mode 100644 index 2b532e2..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/and_subgroup_reduce_u32.cl +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -void -and_subgroup_reduce_other_u32(__local volatile uint* data, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - data[tid] &= data[tid + i]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -and_subgroup_reduce_8_u32(__local volatile uint* data, UWORD tid) - { - data[tid] &= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 1]; - } - - - -void -and_subgroup_reduce_16_u32(__local volatile uint* data, UWORD tid) - { - data[tid] &= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 1]; - } - - - -void -and_subgroup_reduce_32_u32(__local volatile uint* data, UWORD tid) - { - data[tid] &= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 1]; - } - - - -void -and_subgroup_reduce_64_u32(__local volatile uint* data, UWORD tid) - { - data[tid] &= data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 2]; - 
SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 1]; - } - - - -void -and_subgroup_reduce_128_u32(__local volatile uint* data, UWORD tid) - { - data[tid] &= data[tid + 128]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 1]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/max_subgroup_reduce.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/deps/max_subgroup_reduce.cl deleted file mode 100644 index c4d855a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/max_subgroup_reduce.cl +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright 2021 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -void -COOT_FN(PREFIX,max_subgroup_reduce_other)(__local volatile eT1* data, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - data[tid] = max(data[tid], data[tid + i]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -COOT_FN(PREFIX,max_subgroup_reduce_8)(__local volatile eT1* data, UWORD tid) - { - data[tid] = max(data[tid], data[tid + 8]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 4]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 2]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 1]); - } - - - -void -COOT_FN(PREFIX,max_subgroup_reduce_16)(__local volatile eT1* data, UWORD tid) - { - data[tid] = max(data[tid], data[tid + 16]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 8]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 4]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 2]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 1]); - } - - - -void -COOT_FN(PREFIX,max_subgroup_reduce_32)(__local volatile eT1* data, UWORD tid) - { - data[tid] = max(data[tid], data[tid + 32]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 16]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 8]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 4]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 2]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 1]); - } - - - -void -COOT_FN(PREFIX,max_subgroup_reduce_64)(__local volatile eT1* data, UWORD tid) - { - data[tid] = max(data[tid], data[tid + 64]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = 
max(data[tid], data[tid + 32]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 16]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 8]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 4]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 2]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 1]); - } - - - -void -COOT_FN(PREFIX,max_subgroup_reduce_128)(__local volatile eT1* data, UWORD tid) - { - data[tid] = max(data[tid], data[tid + 128]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 64]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 32]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 16]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 8]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 4]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 2]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 1]); - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/min_subgroup_reduce.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/deps/min_subgroup_reduce.cl deleted file mode 100644 index 3cab836..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/min_subgroup_reduce.cl +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright 2021 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -void -COOT_FN(PREFIX,min_subgroup_reduce_other)(__local volatile eT1* data, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - data[tid] = min(data[tid], data[tid + i]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -COOT_FN(PREFIX,min_subgroup_reduce_8)(__local volatile eT1* data, UWORD tid) - { - data[tid] = min(data[tid], data[tid + 8]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 4]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 2]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 1]); - } - - - -void -COOT_FN(PREFIX,min_subgroup_reduce_16)(__local volatile eT1* data, UWORD tid) - { - data[tid] = min(data[tid], data[tid + 16]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 8]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 4]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 2]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 1]); - } - - - -void -COOT_FN(PREFIX,min_subgroup_reduce_32)(__local volatile eT1* data, UWORD tid) - { - data[tid] = min(data[tid], data[tid + 32]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 16]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 8]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - 
data[tid] = min(data[tid], data[tid + 4]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 2]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 1]); - } - - - -void -COOT_FN(PREFIX,min_subgroup_reduce_64)(__local volatile eT1* data, UWORD tid) - { - data[tid] = min(data[tid], data[tid + 64]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 32]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 16]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 8]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 4]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 2]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 1]); - } - - - -void COOT_FN(PREFIX,min_subgroup_reduce_128)(__local volatile eT1* data, UWORD tid) - { - data[tid] = min(data[tid], data[tid + 128]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 64]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 32]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 16]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 8]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 4]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 2]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 1]); - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/or_subgroup_reduce_u32.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/deps/or_subgroup_reduce_u32.cl deleted file mode 100644 index 98a8d5a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/or_subgroup_reduce_u32.cl +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright 2023 Ryan Curtin 
(https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -void -or_subgroup_reduce_other_u32(__local volatile uint* data, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - data[tid] |= data[tid + i]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -or_subgroup_reduce_8_u32(__local volatile uint* data, UWORD tid) - { - data[tid] |= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 1]; - } - - - -void -or_subgroup_reduce_16_u32(__local volatile uint* data, UWORD tid) - { - data[tid] |= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 1]; - } - - - -void -or_subgroup_reduce_32_u32(__local volatile uint* data, UWORD tid) - { - data[tid] |= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - 
data[tid] |= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 1]; - } - - - -void -or_subgroup_reduce_64_u32(__local volatile uint* data, UWORD tid) - { - data[tid] |= data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 1]; - } - - - -void -or_subgroup_reduce_128_u32(__local volatile uint* data, UWORD tid) - { - data[tid] |= data[tid + 128]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 1]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/prod_subgroup_reduce.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/deps/prod_subgroup_reduce.cl deleted file mode 100644 index cd8a056..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/prod_subgroup_reduce.cl +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -void -COOT_FN(PREFIX,prod_subgroup_reduce_other)(__local volatile eT1* data, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - data[tid] *= data[tid + i]; - - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -COOT_FN(PREFIX,prod_subgroup_reduce_8)(__local volatile eT1* data, UWORD tid) - { - data[tid] *= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,prod_subgroup_reduce_16)(__local volatile eT1* data, UWORD tid) - { - data[tid] *= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,prod_subgroup_reduce_32)(__local volatile eT1* data, UWORD tid) - { - data[tid] *= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 1]; - } - - - -void 
-COOT_FN(PREFIX,prod_subgroup_reduce_64)(__local volatile eT1* data, UWORD tid) - { - data[tid] *= data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,prod_subgroup_reduce_128)(__local volatile eT1* data, UWORD tid) - { - data[tid] *= data[tid + 128]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 1]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/var_philox.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/deps/var_philox.cl deleted file mode 100644 index 5135aa2..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/var_philox.cl +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// Implementations of the variable philox algorithm to generate random numbers. -// Adapted from Mitchell, Stokes, Frank, and Holmes (2022), Listing 1. - - - -inline -UWORD -var_philox(const UWORD val, const __global UWORD* keys, const unsigned char bits) - { - // via Salmon, Moraes, Dror, and Shaw (2011): "Parallel random numbers: as easy as 1, 2, 3". - const UWORD M0 = 0xD2B74407B1CE6E93; - - // The right side is allowed to have the extra bits. - const unsigned char right_side_bits = (bits + 1) / 2; - const unsigned char left_side_bits = bits / 2; - const uint left_mask = (((uint) 1) << left_side_bits) - 1; - const uint right_mask = (((uint) 1) << right_side_bits) - 1; - - uint state0 = (uint) (val >> right_side_bits); - uint state1 = (uint) (val & right_mask); - - // 24 rounds is what is needed to pass all the RNG tests (see section 5 of the paper). - uint hi, lo; - for (unsigned char i = 0; i < 24; ++i) - { - - // 64-bit integer multiplication, split the results into two uints - UWORD hilo = M0 * state0; - hi = (hilo >> 32); - lo = (uint) hilo; - - lo = (lo << (right_side_bits - left_side_bits)) | (state1 >> left_side_bits); - - state0 = ((hi ^ keys[i]) ^ state1) & left_mask; - state1 = lo & right_mask; - } - - // Combine the sides for the result. 
- return ((UWORD) (state0 << right_side_bits)) | ((UWORD) state1); - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_inf_lower.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_inf_lower.cl deleted file mode 100644 index 8bb4e4f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_inf_lower.cl +++ /dev/null @@ -1,307 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. 
-// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// Computes row sums dwork[i] = sum( abs( A(i,:) )), i=0:n-1, for || A ||_inf, -// where n is any size and A is stored lower. -// Has ceil( n / inf_bs ) blocks of (inf_bs x 4) threads each (inf_bs=32). -// z precision uses > 16 KB shared memory, so requires Fermi (arch >= 200). 
- - - -__kernel -void -COOT_FN(PREFIX,lansy_inf_lower) - ( - const UWORD n, - const __global eT1* A, - const UWORD A_offset, - const UWORD lda, - __global eT1* dwork, - const UWORD dwork_offset, - const UWORD n_full_block, - const UWORD n_mod_bs - ) - { - A += A_offset; - dwork += dwork_offset; - - UWORD tx = get_local_id(0); - UWORD ty = get_local_id(1); - - UWORD diag = get_group_id(0) * MAGMABLAS_LANSY_INF_BS; - UWORD ind = get_group_id(0) * MAGMABLAS_LANSY_INF_BS + tx; - - eT1 res = 0.; - - __local eT1 la[MAGMABLAS_LANSY_INF_BS][MAGMABLAS_LANSY_INF_BS + 1]; - - if ( get_group_id(0) < n_full_block ) - { - // ------------------------------ - // All full block rows - A += ind; - A += ty * lda; - - // ---------- - // loop over all blocks left of the diagonal block - for(UWORD i=0; i < diag; i += MAGMABLAS_LANSY_INF_BS ) - { - // 32x4 threads cooperatively load 32x32 block - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < MAGMABLAS_LANSY_INF_BS; j += 4) - { - la[tx][ty+j] = A[j*lda]; - } - A += lda * MAGMABLAS_LANSY_INF_BS; - barrier( CLK_LOCAL_MEM_FENCE ); - - // compute 4 partial sums of each row, i.e., - // for ty=0: res = sum( la[tx, 0: 7] ) - // for ty=1: res = sum( la[tx, 8:15] ) - // for ty=2: res = sum( la[tx,16:23] ) - // for ty=3: res = sum( la[tx,24:31] ) - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=ty*8; j < ty*8 + 8; j++) - { - res += ET1_ABS( la[tx][j] ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - } - - // ---------- - // load diagonal block - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < MAGMABLAS_LANSY_INF_BS; j += 4) - { - la[tx][ty+j] = A[j*lda]; - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // copy lower triangle to upper triangle, and - // make diagonal real (zero imaginary part) - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD i=ty*8; i < ty*8 + 8; i++) - { - if ( i < tx ) - { - 
la[i][tx] = la[tx][i]; - } - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // partial row sums - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=ty*8; j < ty*8 + 8; j++) - { - res += ET1_ABS( la[tx][j] ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // ---------- - // loop over all 32x32 blocks below diagonal block - A += MAGMABLAS_LANSY_INF_BS; - for(UWORD i=diag + MAGMABLAS_LANSY_INF_BS; i < n - n_mod_bs; i += MAGMABLAS_LANSY_INF_BS ) - { - // load block (transposed) - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < MAGMABLAS_LANSY_INF_BS; j += 4) - { - la[ty+j][tx] = A[j*lda]; - } - A += MAGMABLAS_LANSY_INF_BS; - barrier( CLK_LOCAL_MEM_FENCE ); - - // partial row sums - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=ty*8; j < ty*8 + 8; j++) - { - res += ET1_ABS( la[tx][j] ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - } - - // ---------- - // last partial block, which is (n_mod_bs by inf_bs) - if ( n_mod_bs > 0 ) - { - // load block (transposed), with zeros for rows outside matrix - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < MAGMABLAS_LANSY_INF_BS; j += 4) - { - if ( tx < n_mod_bs ) - { - la[ty+j][tx] = A[j*lda]; - } - else - { - la[ty+j][tx] = (eT1) 0; - } - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // partial row sums - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=ty*8; j < ty*8 + 8; j++) - { - res += ET1_ABS( la[tx][j] ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - } - - // ---------- - // 32x4 threads store partial sums into shared memory - la[tx][ty] = res; - barrier( CLK_LOCAL_MEM_FENCE ); - - // first column of 32x1 threads computes final sum of each row - if ( ty == 0 ) - { - res = res + la[tx][1] + la[tx][2] + la[tx][3]; - dwork[ind] = res; - } - } - else - { - // ------------------------------ - // Last, partial block row - // Threads past end of matrix (i.e., 
ind >= n) are redundantly assigned - // the last row (n-1). At the end, those results are ignored -- only - // results for ind < n are saved into dwork. - if ( tx < n_mod_bs ) - { - A += ind; - } - else - { - A += (get_group_id(0) * MAGMABLAS_LANSY_INF_BS + n_mod_bs - 1); // redundantly do last row - } - A += ty * lda; - - // ---------- - // loop over all blocks left of the diagonal block - // each is (n_mod_bs by inf_bs) - for(UWORD i=0; i < diag; i += MAGMABLAS_LANSY_INF_BS ) - { - // load block - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < MAGMABLAS_LANSY_INF_BS; j += 4) - { - la[tx][ty+j] = A[j*lda]; - } - A += lda * MAGMABLAS_LANSY_INF_BS; - barrier( CLK_LOCAL_MEM_FENCE ); - - // partial row sums - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < 8; j++) - { - res += ET1_ABS( la[tx][j+ty*8] ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - } - - // ---------- - // partial diagonal block - if ( ty == 0 && tx < n_mod_bs ) - { - // sum rows left of diagonal - for(UWORD j=0; j < tx; j++) - { - res += ET1_ABS( *A ); - A += lda; - } - // sum diagonal (ignoring imaginary part) - res += ET1_ABS( *A ); - A += 1; - // sum column below diagonal - for(UWORD j=tx+1; j < n_mod_bs; j++) - { - res += ET1_ABS( *A ); - A += 1; - } - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // ---------- - // 32x4 threads store partial sums into shared memory - la[tx][ty]= res; - barrier( CLK_LOCAL_MEM_FENCE ); - - // first column of 32x1 threads computes final sum of each row - // rows outside matrix are ignored - if ( ty == 0 && tx < n_mod_bs ) - { - res = res + la[tx][1] + la[tx][2] + la[tx][3]; - dwork[ind] = res; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_inf_upper.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_inf_upper.cl deleted file mode 100644 index 3ffc03c..0000000 --- 
a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_inf_upper.cl +++ /dev/null @@ -1,313 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. 
-// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// Computes row sums dwork[i] = sum( abs( A(i,:) )), i=0:n-1, for || A ||_inf, -// where n is any size and A is stored upper. -// Has ceil( n / inf_bs ) blocks of (inf_bs x 4) threads each (inf_bs=32). -// z precision uses > 16 KB shared memory, so requires Fermi (arch >= 200). -// The upper implementation is similar to lower, but processes blocks -// in the transposed order: -// lower goes from left over to diagonal, then down to bottom; -// upper goes from top down to diagonal, then over to right. -// Differences are noted with # in comments. 
- - - -__kernel -void -COOT_FN(PREFIX,lansy_inf_upper) - ( - const UWORD n, - const __global eT1* A, - const UWORD A_offset, - const UWORD lda, - __global eT1* dwork, - const UWORD dwork_offset, - const UWORD n_full_block, - const UWORD n_mod_bs - ) - { - A += A_offset; - dwork += dwork_offset; - - UWORD tx = get_local_id(0); - UWORD ty = get_local_id(1); - - UWORD diag = get_group_id(0) * MAGMABLAS_LANSY_INF_BS; - UWORD ind = get_group_id(0) * MAGMABLAS_LANSY_INF_BS + tx; - - eT1 res = 0.; - - __local eT1 la[MAGMABLAS_LANSY_INF_BS][MAGMABLAS_LANSY_INF_BS + 1]; - - if ( get_group_id(0) < n_full_block ) - { - // ------------------------------ - // All full block #columns - A += get_group_id(0) * MAGMABLAS_LANSY_INF_BS * lda + tx; //# - A += ty * lda; - - // ---------- - // loop over all blocks #above the diagonal block - for(UWORD i=0; i < diag; i += MAGMABLAS_LANSY_INF_BS ) - { - // 32x4 threads cooperatively load 32x32 block (#transposed) - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < MAGMABLAS_LANSY_INF_BS; j += 4) - { - la[ty+j][tx] = A[j*lda]; //# - } - A += MAGMABLAS_LANSY_INF_BS; //# - barrier( CLK_LOCAL_MEM_FENCE ); - - // compute 4 partial sums of each row, i.e., - // for ty=0: res = sum( la[tx, 0: 7] ) - // for ty=1: res = sum( la[tx, 8:15] ) - // for ty=2: res = sum( la[tx,16:23] ) - // for ty=3: res = sum( la[tx,24:31] ) - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=ty*8; j < ty*8 + 8; j++) - { - res += ET1_ABS( la[tx][j] ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - } - - // ---------- - // load diagonal block - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < MAGMABLAS_LANSY_INF_BS; j += 4) - { - la[tx][ty+j] = A[j*lda]; - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // copy #upper triangle to #lower triangle, and - // make diagonal real (zero imaginary part) - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - 
#endif - for(UWORD i=ty*8; i < ty*8 + 8; i++) - { - if ( i > tx ) - { //# - la[i][tx] = la[tx][i]; - } - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // partial row sums - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=ty*8; j < ty*8 + 8; j++) - { - res += ET1_ABS( la[tx][j] ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // ---------- - // loop over all 32x32 blocks #right of diagonal block - A += MAGMABLAS_LANSY_INF_BS * lda; //# - for(UWORD i=diag + MAGMABLAS_LANSY_INF_BS; i < n - n_mod_bs; i += MAGMABLAS_LANSY_INF_BS ) - { - // load block (#non-transposed) - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < MAGMABLAS_LANSY_INF_BS; j += 4) - { - la[tx][ty+j] = A[j*lda]; //# - } - A += MAGMABLAS_LANSY_INF_BS * lda; //# - barrier( CLK_LOCAL_MEM_FENCE ); - - // partial row sums - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=ty*8; j < ty*8 + 8; j++) - { - res += ET1_ABS( la[tx][j] ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - } - - // ---------- - // last partial block, which is #(inf_bs by n_mod_bs) - if ( n_mod_bs > 0 ) - { - // load block (#non-transposed), with zeros for #cols outside matrix - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < MAGMABLAS_LANSY_INF_BS; j += 4) - { - if ( ty+j < n_mod_bs ) - { //# - la[tx][ty+j] = A[j*lda]; //# - } - else - { - la[tx][ty+j] = (eT1) 0; //# - } - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // partial row sums - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=ty*8; j < ty*8 + 8; j++) - { - res += ET1_ABS( la[tx][j] ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - } - - // ---------- - // 32x4 threads store partial sums into shared memory - la[tx][ty] = res; - barrier( CLK_LOCAL_MEM_FENCE ); - - // first column of 32x1 threads computes final sum of each row - if ( ty == 0 ) - { - res = res + la[tx][1] + la[tx][2] + la[tx][3]; - dwork[ind] = 
res; - } - } - else - { - // ------------------------------ - // Last, partial block #column - // Instead of assigning threads ind >= n to the last row (n-1), as in Lower, - // Upper simply adjusts loop bounds to avoid loading columns outside the matrix. - // Again, at the end, those results are ignored -- only - // results for ind < n are saved into dwork. - A += get_group_id(0) * MAGMABLAS_LANSY_INF_BS * lda + tx; //# - A += ty * lda; - - // ---------- - // loop over all blocks #above the diagonal block - // each is #(inf_bs by n_mod_bs) - for(UWORD i=0; i < diag; i += MAGMABLAS_LANSY_INF_BS ) - { - // load block (#transposed), #ignoring columns outside matrix - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < MAGMABLAS_LANSY_INF_BS; j += 4) - { - if ( ty+j < n_mod_bs ) - { - la[ty+j][tx] = A[j*lda]; - } - } - A += MAGMABLAS_LANSY_INF_BS; //# - barrier( CLK_LOCAL_MEM_FENCE ); - - // partial row sums - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < 8; j++) - { - res += ET1_ABS( la[tx][j+ty*8] ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - } - - // ---------- - // partial diagonal block - if ( ty == 0 && tx < n_mod_bs ) - { - // #transpose pointer within diagonal block - // #i.e., from A = A(tx,ty), transpose to A = A(ty,tx). 
- A = A - tx - ty*lda + tx*lda + ty; - - // sum #column above diagonal - for(UWORD j=0; j < tx; j++) - { - res += ET1_ABS( *A ); - A += 1; //# - } - // sum diagonal (ignoring imaginary part) - res += ET1_ABS( *A ); - A += lda; //# - // sum #row right of diagonal - for(UWORD j=tx+1; j < n_mod_bs; j++) - { - res += ET1_ABS( *A ); - A += lda; //# - } - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // ---------- - // 32x4 threads store partial sums into shared memory - la[tx][ty]= res; - barrier( CLK_LOCAL_MEM_FENCE ); - - // first column of 32x1 threads computes final sum of each row - // rows outside matrix are ignored - if ( ty == 0 && tx < n_mod_bs ) - { - res = res + la[tx][1] + la[tx][2] + la[tx][3]; - dwork[ind] = res; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_max_lower.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_max_lower.cl deleted file mode 100644 index 7df958f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_max_lower.cl +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). 
-// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. 
- - - -// Computes dwork[i] = max( abs( A(i,0:i) )), i=0:n-1, for ||A||_max, where A is stored lower - - - -__kernel -void -COOT_FN(PREFIX,lansy_max_lower) - ( - const UWORD n, - const __global eT1* A, - const UWORD A_offset, - const UWORD lda, - __global eT1* dwork, - const UWORD dwork_offset - ) - { - A += A_offset; - dwork += dwork_offset; - - int ind = get_group_id(0) * MAGMABLAS_LANSY_MAX_BS + get_local_id(0); - eT1 res = 0; - - if (ind < n) - { - A += ind; - for(int j=0; j < ind; ++j) - { - res = fmax( res, ET1_ABS( *A )); - A += lda; - } - // diagonal element (ignoring imaginary part) - res = fmax( res, ET1_ABS( *A )); - dwork[ind] = res; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_max_upper.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_max_upper.cl deleted file mode 100644 index 7b34f54..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_max_upper.cl +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). 
-// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// Computes dwork[i] = max( abs( A(i,0:i) )), i=0:n-1, for ||A||_max, where A is stored upper. 
- - - -__kernel -void -COOT_FN(PREFIX,lansy_max_upper) - ( - const UWORD n, - const __global eT1* A, - const UWORD A_offset, - const UWORD lda, - __global eT1* dwork, - const UWORD dwork_offset - ) - { - A += A_offset; - dwork += dwork_offset; - - int ind = get_group_id(0) * MAGMABLAS_LANSY_MAX_BS + get_local_id(0); - eT1 res = 0; - - if (ind < n) - { - A += ind; - A += (n-1)*lda; - for(int j=n-1; j > ind; j--) - { - res = fmax( res, ET1_ABS( *A )); - A -= lda; - } - // diagonal element (ignoring imaginary part) - res = fmax( res, ET1_ABS( *A )); - dwork[ind] = res; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_full.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_full.cl deleted file mode 100644 index 85b3ff0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_full.cl +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). 
-// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// Multiply A by `mul`. 
- -__kernel -void -COOT_FN(PREFIX,lascl_full) - ( - const UWORD m, - const UWORD n, - const eT1 mul, - __global eT1* A, - const UWORD A_offset, - const UWORD lda - ) - { - UWORD ind = get_group_id(0) * MAGMABLAS_LASCL_NB + get_local_id(0); - - A += A_offset + ind; - if (ind < m) - { - for (UWORD j=0; j < n; j++ ) - { - A[j*lda] *= mul; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_lower.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_lower.cl deleted file mode 100644 index 148a649..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_lower.cl +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. 
-// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// Multiply A by `mul`. A is lower triangular. - -__kernel -void -COOT_FN(PREFIX,lascl_lower) - ( - const UWORD m, - const UWORD n, - const eT1 mul, - __global eT1* A, - const UWORD A_offset, - const UWORD lda - ) - { - UWORD ind = get_group_id(0) * MAGMABLAS_LASCL_NB + get_local_id(0); - - UWORD break_d = (ind < n) ? 
ind : n-1; - - A += A_offset + ind; - if (ind < m) - { - for (UWORD j=0; j <= break_d; j++ ) - { - A[j*lda] *= mul; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_upper.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_upper.cl deleted file mode 100644 index 3c5d514..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_upper.cl +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. 
-// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// Multiply A by `mul`. A is upper triangular. - -__kernel -void -COOT_FN(PREFIX,lascl_upper) - ( - const UWORD m, - const UWORD n, - const eT1 mul, - __global eT1* A, - const UWORD A_offset, - const UWORD lda - ) - { - UWORD ind = get_group_id(0) * MAGMABLAS_LASCL_NB + get_local_id(0); - - A += A_offset + ind; - if (ind < m) - { - for (UWORD j=n-1; j >= ind; j--) - { - A[j*lda] *= mul; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_band_lower.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_band_lower.cl deleted file mode 100644 index 4663b4f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_band_lower.cl +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. 
In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// GPU kernel for setting the k-1 sub-diagonals to OFFDIAG -// and the main diagonal to DIAG. -// Divides matrix into min( ceil(m/nb), ceil(n/nb) ) block-columns, -// with k threads in each block. -// Each thread iterates across one diagonal. -// Thread 0 does the main diagonal, thread 1 the first sub-diagonal, etc. - - - -__kernel -void -COOT_FN(PREFIX,laset_band_lower)(const UWORD m, - const UWORD n, - const eT1 offdiag, - const eT1 diag, - __global eT1* A, - const UWORD A_offset, - const UWORD lda) - { - UWORD ibx = get_group_id(0) * MAGMABLAS_LASET_BAND_NB; - UWORD ind = ibx + get_local_id(0); - - A += A_offset + ind + ibx * lda; - - eT1 value = offdiag; - if (get_local_id(0) == 0) - { - value = diag; - } - - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for (int j=0; j < MAGMABLAS_LASET_BAND_NB; j++) - { - if (ibx + j < n && ind + j < m) - { - A[j * (lda + 1)] = value; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_band_upper.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_band_upper.cl deleted file mode 100644 index 96af81c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_band_upper.cl +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the 
License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. 
In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// GPU kernel for setting the k-1 sub-diagonals to OFFDIAG -// and the main diagonal to DIAG. -// Divides matrix into min( ceil(m/nb), ceil(n/nb) ) block-columns, -// with k threads in each block. -// Each thread iterates across one diagonal. -// Thread 0 does the main diagonal, thread 1 the first sub-diagonal, etc. - - - -__kernel -void -COOT_FN(PREFIX,laset_band_upper)(const UWORD m, - const UWORD n, - const eT1 offdiag, - const eT1 diag, - __global eT1* A, - const UWORD A_offset, - const UWORD lda) - { - int k = get_local_size(0); - int ibx = get_group_id(0) * MAGMABLAS_LASET_BAND_NB; - int ind = ibx + get_local_id(0) - k + 1; - - A += A_offset + ind + ibx * lda; - - eT1 value = offdiag; - if (get_local_id(0) == k - 1) - { - value = diag; - } - - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for (int j = 0; j < MAGMABLAS_LASET_BAND_NB; j++) - { - if (ibx + j < n && ind + j >= 0 && ind + j < m) - { - A[j * (lda + 1)] = value; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_full.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_full.cl deleted file mode 100644 index 5701d59..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_full.cl +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file 
except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. 
In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks. -// Each block has BLK_X threads. -// Each thread loops across one row, updating BLK_Y entries. -// -// Code similar to lacpy, lag2s, lag2z, geadd. - -__kernel -void -COOT_FN(PREFIX,laset_full)(const UWORD m, - const UWORD n, - const eT1 offdiag, - const eT1 diag, - __global eT1* A, - const UWORD A_offset, - const UWORD lda) - { - A += A_offset; - - UWORD ind = get_group_id(0) * MAGMABLAS_BLK_X + get_local_id(0); - UWORD iby = get_group_id(1) * MAGMABLAS_BLK_Y; - /* check if full block-column && (below diag || above diag || offdiag == diag) */ - bool full = (iby + MAGMABLAS_BLK_Y <= n && (ind >= iby + MAGMABLAS_BLK_Y || ind + MAGMABLAS_BLK_X <= iby || ( offdiag == diag ))); - /* do only rows inside matrix */ - if (ind < m) - { - A += ind + iby * lda; - if (full) - { - // full block-column, off-diagonal block or offdiag == diag - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(int j=0; j < MAGMABLAS_BLK_Y; ++j) - { - A[j * lda] = offdiag; - } - } - else - { - // either partial block-column or diagonal block - for (int j=0; j < MAGMABLAS_BLK_Y && iby+j < n; ++j) - { - if (iby + j == ind) - A[j * lda] = diag; - else - A[j * lda] = offdiag; - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_lower.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_lower.cl deleted file mode 100644 
index bb5b3b2..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_lower.cl +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. 
-// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// Similar to laset_full, but updates only the diagonal and below. -// Blocks that are fully above the diagonal exit immediately. -// -// Code similar to lacpy, zlat2c, clat2z. - -__kernel -void -COOT_FN(PREFIX,laset_lower)(const UWORD m, - const UWORD n, - const eT1 offdiag, - const eT1 diag, - __global eT1* A, - const UWORD A_offset, - const UWORD lda) - { - A += A_offset; - - UWORD ind = get_group_id(0) * MAGMABLAS_BLK_X + get_local_id(0); - UWORD iby = get_group_id(1) * MAGMABLAS_BLK_Y; - /* check if full block-column && (below diag) */ - bool full = (iby + MAGMABLAS_BLK_Y <= n && (ind >= iby + MAGMABLAS_BLK_Y)); - /* do only rows inside matrix, and blocks not above diag */ - if (ind < m && ind + MAGMABLAS_BLK_X > iby) - { - A += ind + iby*lda; - if (full) - { - // full block-column, off-diagonal block - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for (int j=0; j < MAGMABLAS_BLK_Y; ++j) - { - A[j * lda] = offdiag; - } - } - else - { - // either partial block-column or diagonal block - for (int j=0; j < MAGMABLAS_BLK_Y && iby+j < n; ++j) - { - if (iby + j == ind) - A[j * lda] = diag; - else if (ind > iby + j) - A[j*lda] = offdiag; - } - 
} - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_upper.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_upper.cl deleted file mode 100644 index 02ed04d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_upper.cl +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. 
-// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// Similar to laset_full, but updates only the diagonal and above. -// Blocks that are fully below the diagonal exit immediately. -// -// Code similar to lacpy, zlat2c, clat2z. 
- -__kernel -void -COOT_FN(PREFIX,laset_upper)(const UWORD m, - const UWORD n, - const eT1 offdiag, - const eT1 diag, - __global eT1* A, - const UWORD A_offset, - const UWORD lda) - { - A += A_offset; - - UWORD ind = get_group_id(0) * MAGMABLAS_BLK_X + get_local_id(0); - UWORD iby = get_group_id(1) * MAGMABLAS_BLK_Y; - /* check if full block-column && (above diag) */ - bool full = (iby + MAGMABLAS_BLK_Y <= n && (ind + MAGMABLAS_BLK_X <= iby)); - /* do only rows inside matrix, and blocks not below diag */ - if (ind < m && ind < iby + MAGMABLAS_BLK_Y) - { - A += ind + iby*lda; - if (full) - { - // full block-column, off-diagonal block - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(int j=0; j < MAGMABLAS_BLK_Y; ++j) - { - A[j * lda] = offdiag; - } - } - else - { - // either partial block-column or diagonal block - for (int j=0; j < MAGMABLAS_BLK_Y && iby+j < n; ++j) - { - if (iby + j == ind) - A[j*lda] = diag; - else if (ind < iby + j) - A[j*lda] = offdiag; - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laswp.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laswp.cl deleted file mode 100644 index a7793b0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laswp.cl +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// Matrix A is stored row-wise in dAT. 
-// Divide matrix A into block-columns of NTHREADS columns each. -// Each GPU block processes one block-column of A. -// Each thread goes down a column of A, -// swapping rows according to pivots stored in params. -__kernel -void -COOT_FN(PREFIX,laswp)(int n, - __global eT1* dAT, - unsigned long dAT_offset, - int ldda, - magmablas_laswp_params_t params) - { - dAT += dAT_offset; - - int tid = get_local_id(0) + get_local_size(0)*get_group_id(0); - if ( tid < n ) - { - dAT += tid; - __global eT1* A1 = dAT; - - for( int i1 = 0; i1 < params.npivots; ++i1 ) - { - int i2 = params.ipiv[i1]; - __global eT1* A2 = dAT + i2*ldda; - eT1 temp = *A1; - *A1 = *A2; - *A2 = temp; - A1 += ldda; // A1 = dA + i1*ldx - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_inplace_even_magma.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_inplace_even_magma.cl deleted file mode 100644 index 3917cec..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_inplace_even_magma.cl +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). 
-// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// grid is ((n/nb) + 1) x (n/nb)/2, where n/nb is even. -// lower indicates blocks in strictly lower triangle of grid, excluding diagonal. 
-// lower blocks shift up by one to cover left side of matrix including diagonal. -// upper blocks swap block indices (x,y) and shift by grid width -// to cover right side of matrix. -// [ A00 A01 ] [ A10 . | . . ] -// [ A10 A11 ] [ A20 A21 | . . ] -// grid [ A20 A21 ] covers matrix as [ A30 A31 | A00 . ] -// [ A30 A31 ] [ A40 A41 | A01 A11 ] -// [ A40 A41 ] -// -// Each block is NB x NB threads. -// For non-diagonal block A, block B is symmetric block. -// Thread (i,j) loads A(i,j) into sA(j,i) and B(i,j) into sB(j,i), i.e., transposed, -// syncs, then saves sA(i,j) to B(i,j) and sB(i,j) to A(i,j). -// Threads outside the matrix do not touch memory. - -__kernel -void -COOT_FN(PREFIX,transpose_inplace_even_magma)(const UWORD n, - __global eT1* matrix, - const UWORD matrix_offset, - const UWORD lda) - { - matrix += matrix_offset; - - __local eT1 sA[MAGMABLAS_TRANS_INPLACE_NB][MAGMABLAS_TRANS_INPLACE_NB + 1]; - __local eT1 sB[MAGMABLAS_TRANS_INPLACE_NB][MAGMABLAS_TRANS_INPLACE_NB + 1]; - - UWORD i = get_local_id(0); - UWORD j = get_local_id(1); - - bool lower = (get_group_id(0) > get_group_id(1)); - UWORD ii = (lower ? (get_group_id(0) - 1) : (get_group_id(1) + get_num_groups(1))); - UWORD jj = (lower ? 
(get_group_id(1) ) : (get_group_id(0) + get_num_groups(1))); - - ii *= MAGMABLAS_TRANS_INPLACE_NB; - jj *= MAGMABLAS_TRANS_INPLACE_NB; - - __global eT1* A = matrix + (ii + i) + (jj + j) * lda; - if (ii == jj) - { - if (ii + i < n && jj + j < n) - { - sA[j][i] = *A; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (ii + i < n && jj + j < n) - { - *A = sA[i][j]; - } - } - else - { - __global eT1* B = matrix + (jj + i) + (ii + j) * lda; - if (ii + i < n && jj + j < n) - { - sA[j][i] = *A; - } - if (jj + i < n && ii + j < n) - { - sB[j][i] = *B; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (ii + i < n && jj + j < n) - { - *A = sB[i][j]; - } - if (jj + i < n && ii + j < n) - { - *B = sA[i][j]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_inplace_odd_magma.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_inplace_odd_magma.cl deleted file mode 100644 index 923a887..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_inplace_odd_magma.cl +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). 
-// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// grid is (n/nb) x ((n/nb)/2 + 1), where n/nb is odd. -// lower indicates blocks in lower triangle of grid, including diagonal. -// lower blocks cover left side of matrix, including diagonal. 
-// upper blocks swap block indices (x,y) and shift by grid width (or width-1) -// to cover right side of matrix. -// [ A00 A01 A02 ] [ A00 . . | . . ] -// [ A10 A11 A12 ] [ A10 A11 . | . . ] -// grid [ A20 A21 A22 ] covers matrix as [ A20 A21 A22 | . . ] -// [ A30 A31 A32 ] [ A30 A31 A32 | A01 . ] -// [ A40 A41 A42 ] [ A40 A41 A42 | A02 A12 ] -// -// See transpose_inplace_even_magma for description of threads. - -__kernel -void -COOT_FN(PREFIX,transpose_inplace_odd_magma)(const UWORD n, - __global eT1* matrix, - const UWORD matrix_offset, - const UWORD lda) - { - matrix += matrix_offset; - - __local eT1 sA[MAGMABLAS_TRANS_INPLACE_NB][MAGMABLAS_TRANS_INPLACE_NB + 1]; - __local eT1 sB[MAGMABLAS_TRANS_INPLACE_NB][MAGMABLAS_TRANS_INPLACE_NB + 1]; - - UWORD i = get_local_id(0); - UWORD j = get_local_id(1); - - bool lower = (get_group_id(0) >= get_group_id(1)); - UWORD ii = (lower ? get_group_id(0) : (get_group_id(1) + get_num_groups(1) - 1)); - UWORD jj = (lower ? get_group_id(1) : (get_group_id(0) + get_num_groups(1) )); - - ii *= MAGMABLAS_TRANS_INPLACE_NB; - jj *= MAGMABLAS_TRANS_INPLACE_NB; - - __global eT1* A = matrix + (ii + i) + (jj + j) * lda; - if (ii == jj) - { - if (ii + i < n && jj + j < n) - { - sA[j][i] = *A; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (ii + i < n && jj + j < n) - { - *A = sA[i][j]; - } - } - else - { - __global eT1* B = matrix + (jj + i) + (ii + j) * lda; - if (ii + i < n && jj + j < n) - { - sA[j][i] = *A; - } - if (jj + i < n && ii + j < n) - { - sB[j][i] = *B; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (ii + i < n && jj + j < n) - { - *A = sB[i][j]; - } - if (jj + i < n && ii + j < n) - { - *B = sA[i][j]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_magma.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_magma.cl deleted file mode 100644 index 1288197..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_magma.cl +++ /dev/null @@ -1,138 +0,0 @@ 
-// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. 
-// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// tile M-by-N matrix with ceil(M/NB) by ceil(N/NB) tiles sized NB-by-NB. -// uses NX-by-NY threads, where NB/NX, NB/NY, NX/NY evenly. -// subtile each NB-by-NB tile with (NB/NX) subtiles sized NX-by-NB -// for each subtile -// load NX-by-NB subtile transposed from A into sA, as (NB/NY) blocks sized NX-by-NY -// save NB-by-NX subtile from sA into AT, as (NB/NX)*(NX/NY) blocks sized NX-by-NY -// A += NX -// AT += NX*ldat - -__kernel -void -COOT_FN(PREFIX,transpose_magma)(const UWORD m, - const UWORD n, - __global const eT1* A, - const UWORD A_offset, - const UWORD lda, - __global eT1* AT, - const UWORD AT_offset, - const UWORD ldat) - { - A += A_offset; - AT += AT_offset; - - __local eT1 sA[MAGMABLAS_TRANS_NB][MAGMABLAS_TRANS_NX+1]; - - UWORD tx = get_local_id(0); - UWORD ty = get_local_id(1); - UWORD ibx = get_group_id(0) * MAGMABLAS_TRANS_NB; - UWORD iby = get_group_id(1) * MAGMABLAS_TRANS_NB; - UWORD i, j; - - A += ibx + tx + (iby + ty) * lda; - AT += iby + tx + (ibx + ty) * ldat; - - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for (int tile = 0; tile < MAGMABLAS_TRANS_NB / MAGMABLAS_TRANS_NX; ++tile) - { - // load NX-by-NB subtile 
transposed from A into sA - i = ibx + tx + tile * MAGMABLAS_TRANS_NX; - j = iby + ty; - if (i < m) - { - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for (int j2=0; j2 < MAGMABLAS_TRANS_NB; j2 += MAGMABLAS_TRANS_NY) - { - if (j + j2 < n) - { - sA[ty + j2][tx] = A[j2*lda]; - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // save NB-by-NX subtile from sA into AT - i = iby + tx; - j = ibx + ty + tile * MAGMABLAS_TRANS_NX; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for (int i2 = 0; i2 < MAGMABLAS_TRANS_NB; i2 += MAGMABLAS_TRANS_NX) - { - if (i + i2 < n) - { - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for (int j2 = 0; j2 < MAGMABLAS_TRANS_NX; j2 += MAGMABLAS_TRANS_NY) - { - if (j + j2 < m) - { - AT[i2 + j2 * ldat] = sA[tx + i2][ty + j2]; - } - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // move to next subtile - A += MAGMABLAS_TRANS_NX; - AT += MAGMABLAS_TRANS_NX * ldat; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu.cl deleted file mode 100644 index e1ecc4e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu.cl +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,accu)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] += in_mem[in_mem_offset + i] + in_mem[in_mem_offset + i + get_local_size(0)]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] += in_mem[in_mem_offset + i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,accu_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu_simple.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu_simple.cl deleted file mode 100644 index 57b00b0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu_simple.cl +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,accu_simple)(__global eT1* out, - __global const eT1* A, - const UWORD A_len) - { - const UWORD id = get_global_id(0); - if(id == 0) - { - eT1 acc = (eT1)(0); - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < A_len; ++i) - { - acc += A[i]; - } - out[0] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu_small.cl deleted file mode 100644 index 52a1e77..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu_small.cl +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,accu_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] += in_mem[in_mem_offset + i] + in_mem[in_mem_offset + i + get_local_size(0)]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] += in_mem[in_mem_offset + i]; - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal.cl deleted file mode 100644 index fe4649a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal.cl +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,approx_equal)(__global uint* out_mem, - __global const eT1* A_mem, - const UWORD A_offset, - const UWORD A_M_n_rows, - __global const eT1* B_mem, - const UWORD B_offset, - const UWORD B_M_n_rows, - const UWORD n_rows, - const UWORD n_elem, - __local volatile uint* aux_mem, - const UWORD mode, - const eT1 abs_tol, - const eT1 rel_tol) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 1; - - while (i + get_local_size(0) < n_elem) - { - // A bit painful... - const UWORD row1 = i % n_rows; - const UWORD col1 = i / n_rows; - const UWORD row2 = (i + get_local_size(0)) % n_rows; - const UWORD col2 = (i + get_local_size(0)) / n_rows; - - const UWORD A_loc1 = A_offset + row1 + col1 * A_M_n_rows; - const UWORD A_loc2 = A_offset + row2 + col2 * A_M_n_rows; - const UWORD B_loc1 = B_offset + row1 + col1 * B_M_n_rows; - const UWORD B_loc2 = B_offset + row2 + col2 * B_M_n_rows; - - const eT1 A_val1 = A_mem[A_loc1]; - const eT1 B_val1 = B_mem[B_loc1]; - const eT1 A_val2 = A_mem[A_loc2]; - const eT1 B_val2 = B_mem[B_loc2]; - - if (COOT_FN(coot_isnan_,eT1)(A_val1) || COOT_FN(coot_isnan_,eT1)(B_val1) || COOT_FN(coot_isnan_,eT1)(A_val2) || COOT_FN(coot_isnan_,eT1)(B_val2)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff1 = COOT_FN(coot_absdiff_,eT1)(A_val1, B_val1); - const eT1 absdiff2 = COOT_FN(coot_absdiff_,eT1)(A_val2, B_val2); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff1 <= abs_tol); - aux_mem[tid] &= (absdiff2 <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val1 = max(ET1_ABS(A_val1), ET1_ABS(B_val1)); - const eT1 max_val2 = max(ET1_ABS(A_val2), ET1_ABS(B_val2)); - - if (max_val1 >= (eT1) 1) - { - aux_mem[tid] &= (absdiff1 <= rel_tol * max_val1); - aux_mem[tid] &= (absdiff2 <= rel_tol * max_val2); - } - else - { - aux_mem[tid] &= (absdiff1 / max_val1 <= rel_tol); - aux_mem[tid] &= (absdiff2 / max_val2 <= rel_tol); - } - } - - i += grid_size; - } - if (i < n_elem) - { - const UWORD row = i % n_rows; - const UWORD col = i / n_rows; - - const UWORD A_loc = A_offset + row + col * A_M_n_rows; - const UWORD B_loc = B_offset + row + col * B_M_n_rows; - - const eT1 A_val = A_mem[A_loc]; - const eT1 B_val = B_mem[B_loc]; - - if (COOT_FN(coot_isnan_,eT1)(A_val) || COOT_FN(coot_isnan_,eT1)(B_val)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff = COOT_FN(coot_absdiff_,eT1)(A_val, B_val); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val = max(ET1_ABS(A_val), ET1_ABS(B_val)); - - if (max_val >= (eT1) 1) - { - aux_mem[tid] &= (absdiff <= rel_tol * max_val); - } - else - { - aux_mem[tid] &= (absdiff / max_val <= rel_tol); - } - } - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN(COOT_FN(and_subgroup_reduce_,SUBGROUP_SIZE_NAME),_u32)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_cube.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_cube.cl deleted file mode 100644 index 5252aac..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_cube.cl +++ /dev/null @@ -1,159 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,approx_equal_cube)(__global uint* out_mem, - __global const eT1* A_mem, - const UWORD A_offset, - const UWORD A_M_n_rows, - const UWORD A_M_n_cols, - __global const eT1* B_mem, - const UWORD B_offset, - const UWORD B_M_n_rows, - const UWORD B_M_n_cols, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_elem, - __local volatile uint* aux_mem, - const UWORD mode, - const eT1 abs_tol, - const eT1 rel_tol) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - const UWORD n_elem_slice = n_rows * n_cols; - - aux_mem[tid] = 1; - - while (i + get_local_size(0) < n_elem) - { - // A bit painful... TODO: implement a more efficient non-modulo approach - const UWORD elem1 = i % n_elem_slice; - const UWORD slice1 = i / n_elem_slice; - const UWORD row1 = elem1 % n_rows; - const UWORD col1 = elem1 / n_rows; - - const UWORD elem2 = (i + get_local_size(0)) % n_elem_slice; - const UWORD slice2 = (i + get_local_size(0)) / n_elem_slice; - const UWORD row2 = elem2 % n_rows; - const UWORD col2 = elem2 / n_rows; - - const UWORD A_loc1 = A_offset + row1 + col1 * A_M_n_rows + slice1 * A_M_n_rows * A_M_n_cols; - const UWORD A_loc2 = A_offset + row2 + col2 * A_M_n_rows + slice2 * A_M_n_rows * A_M_n_cols; - const UWORD B_loc1 = B_offset + row1 + col1 * B_M_n_rows + slice1 * B_M_n_rows * B_M_n_cols; - const UWORD B_loc2 = B_offset + row2 + col2 * B_M_n_rows + slice2 * B_M_n_rows * B_M_n_cols; - - const eT1 A_val1 = A_mem[A_loc1]; - const eT1 B_val1 = B_mem[B_loc1]; - const eT1 A_val2 = A_mem[A_loc2]; - const eT1 B_val2 = B_mem[B_loc2]; - - if (COOT_FN(coot_isnan_,eT1)(A_val1) || COOT_FN(coot_isnan_,eT1)(B_val1) || COOT_FN(coot_isnan_,eT1)(A_val2) || COOT_FN(coot_isnan_,eT1)(B_val2)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff1 = COOT_FN(coot_absdiff_,eT1)(A_val1, B_val1); - const eT1 absdiff2 = COOT_FN(coot_absdiff_,eT1)(A_val2, B_val2); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff1 <= abs_tol); - aux_mem[tid] &= (absdiff2 <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val1 = max(ET1_ABS(A_val1), ET1_ABS(B_val1)); - const eT1 max_val2 = max(ET1_ABS(A_val2), ET1_ABS(B_val2)); - - if (max_val1 >= (eT1) 1) - { - aux_mem[tid] &= (absdiff1 <= rel_tol * max_val1); - aux_mem[tid] &= (absdiff2 <= rel_tol * max_val2); - } - else - { - aux_mem[tid] &= (absdiff1 / max_val1 <= rel_tol); - aux_mem[tid] &= (absdiff2 / max_val2 <= rel_tol); - } - } - - i += grid_size; - } - if (i < n_elem) - { - const UWORD elem = i % n_elem_slice; - const UWORD slice = i / n_elem_slice; - const UWORD row = elem % n_rows; - const UWORD col = elem / n_rows; - - const UWORD A_loc = A_offset + row + col * A_M_n_rows + slice * A_M_n_rows * A_M_n_cols; - const UWORD B_loc = B_offset + row + col * B_M_n_rows + slice * B_M_n_rows * B_M_n_cols; - - const eT1 A_val = A_mem[A_loc]; - const eT1 B_val = B_mem[B_loc]; - - if (COOT_FN(coot_isnan_,eT1)(A_val) || COOT_FN(coot_isnan_,eT1)(B_val)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff = COOT_FN(coot_absdiff_,eT1)(A_val, B_val); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val = max(ET1_ABS(A_val), ET1_ABS(B_val)); - - if (max_val >= (eT1) 1) - { - aux_mem[tid] &= (absdiff <= rel_tol * max_val); - } - else - { - aux_mem[tid] &= (absdiff / max_val <= rel_tol); - } - } - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN(COOT_FN(and_subgroup_reduce_,SUBGROUP_SIZE_NAME),_u32)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_cube_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_cube_small.cl deleted file mode 100644 index 5e2492a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_cube_small.cl +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,approx_equal_cube_small)(__global uint* out_mem, - __global const eT1* A_mem, - const UWORD A_offset, - const UWORD A_M_n_rows, - const UWORD A_M_n_cols, - __global const eT1* B_mem, - const UWORD B_offset, - const UWORD B_M_n_rows, - const UWORD B_M_n_cols, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_elem, - __local volatile uint* aux_mem, - const UWORD mode, - const eT1 abs_tol, - const eT1 rel_tol) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - const UWORD n_elem_slice = n_rows * n_cols; - - aux_mem[tid] = 1; - - while (i + get_local_size(0) < n_elem) - { - // A bit painful... TODO: implement a more efficient non-modulo approach - const UWORD elem1 = i % n_elem_slice; - const UWORD slice1 = i / n_elem_slice; - const UWORD row1 = elem1 % n_rows; - const UWORD col1 = elem1 / n_rows; - - const UWORD elem2 = (i + get_local_size(0)) % n_elem_slice; - const UWORD slice2 = (i + get_local_size(0)) / n_elem_slice; - const UWORD row2 = elem2 % n_rows; - const UWORD col2 = elem2 / n_rows; - - const UWORD A_loc1 = A_offset + row1 + col1 * A_M_n_rows + slice1 * A_M_n_rows * A_M_n_cols; - const UWORD A_loc2 = A_offset + row2 + col2 * A_M_n_rows + slice2 * A_M_n_rows * A_M_n_cols; - const UWORD B_loc1 = B_offset + row1 + col1 * B_M_n_rows + slice1 * B_M_n_rows * B_M_n_cols; - const UWORD B_loc2 = B_offset + row2 + col2 * B_M_n_rows + slice2 * B_M_n_rows * B_M_n_cols; - - const eT1 A_val1 = A_mem[A_loc1]; - const eT1 B_val1 = B_mem[B_loc1]; - const eT1 A_val2 = A_mem[A_loc2]; - const eT1 B_val2 = B_mem[B_loc2]; - - if (COOT_FN(coot_isnan_,eT1)(A_val1) || COOT_FN(coot_isnan_,eT1)(B_val1) || COOT_FN(coot_isnan_,eT1)(A_val2) || COOT_FN(coot_isnan_,eT1)(B_val2)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff1 = COOT_FN(coot_absdiff_,eT1)(A_val1, B_val1); - const eT1 absdiff2 = COOT_FN(coot_absdiff_,eT1)(A_val2, B_val2); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff1 <= abs_tol); - aux_mem[tid] &= (absdiff2 <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val1 = max(ET1_ABS(A_val1), ET1_ABS(B_val1)); - const eT1 max_val2 = max(ET1_ABS(A_val2), ET1_ABS(B_val2)); - - if (max_val1 >= (eT1) 1) - { - aux_mem[tid] &= (absdiff1 <= rel_tol * max_val1); - aux_mem[tid] &= (absdiff2 <= rel_tol * max_val2); - } - else - { - aux_mem[tid] &= (absdiff1 / max_val1 <= rel_tol); - aux_mem[tid] &= (absdiff2 / max_val2 <= rel_tol); - } - } - - i += grid_size; - } - if (i < n_elem) - { - const UWORD elem = i % n_elem_slice; - const UWORD slice = i / n_elem_slice; - const UWORD row = elem % n_rows; - const UWORD col = elem / n_rows; - - const UWORD A_loc = A_offset + row + col * A_M_n_rows + slice * A_M_n_rows * A_M_n_cols; - const UWORD B_loc = B_offset + row + col * B_M_n_rows + slice * B_M_n_rows * B_M_n_cols; - - const eT1 A_val = A_mem[A_loc]; - const eT1 B_val = B_mem[B_loc]; - - if (COOT_FN(coot_isnan_,eT1)(A_val) || COOT_FN(coot_isnan_,eT1)(B_val)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff = COOT_FN(coot_absdiff_,eT1)(A_val, B_val); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val = max(ET1_ABS(A_val), ET1_ABS(B_val)); - - if (max_val >= (eT1) 1) - { - aux_mem[tid] &= (absdiff <= rel_tol * max_val); - } - else - { - aux_mem[tid] &= (absdiff / max_val <= rel_tol); - } - } - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_small.cl deleted file mode 100644 index 5deec2f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_small.cl +++ /dev/null @@ -1,142 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,approx_equal_small)(__global uint* out_mem, - __global const eT1* A_mem, - const UWORD A_offset, - const UWORD A_M_n_rows, - __global const eT1* B_mem, - const UWORD B_offset, - const UWORD B_M_n_rows, - const UWORD n_rows, - const UWORD n_elem, - __local volatile uint* aux_mem, - const UWORD mode, - const eT1 abs_tol, - const eT1 rel_tol) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 1; - - while (i + get_local_size(0) < n_elem) - { - // A bit painful... - const UWORD row1 = i % n_rows; - const UWORD col1 = i / n_rows; - const UWORD row2 = (i + get_local_size(0)) % n_rows; - const UWORD col2 = (i + get_local_size(0)) / n_rows; - - const UWORD A_loc1 = A_offset + row1 + col1 * A_M_n_rows; - const UWORD A_loc2 = A_offset + row2 + col2 * A_M_n_rows; - const UWORD B_loc1 = B_offset + row1 + col1 * B_M_n_rows; - const UWORD B_loc2 = B_offset + row2 + col2 * B_M_n_rows; - - const eT1 A_val1 = A_mem[A_loc1]; - const eT1 B_val1 = B_mem[B_loc1]; - const eT1 A_val2 = A_mem[A_loc2]; - const eT1 B_val2 = B_mem[B_loc2]; - - if (COOT_FN(coot_isnan_,eT1)(A_val1) || COOT_FN(coot_isnan_,eT1)(B_val1) || COOT_FN(coot_isnan_,eT1)(A_val2) || COOT_FN(coot_isnan_,eT1)(B_val2)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff1 = COOT_FN(coot_absdiff_,eT1)(A_val1, B_val1); - const eT1 absdiff2 = COOT_FN(coot_absdiff_,eT1)(A_val2, B_val2); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff1 <= abs_tol); - aux_mem[tid] &= (absdiff2 <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val1 = max(ET1_ABS(A_val1), ET1_ABS(B_val1)); - const eT1 max_val2 = max(ET1_ABS(A_val2), ET1_ABS(B_val2)); - - if (max_val1 >= (eT1) 1) - { - aux_mem[tid] &= (absdiff1 <= rel_tol * max_val1); - aux_mem[tid] &= (absdiff2 <= rel_tol * max_val2); - } - else - { - aux_mem[tid] &= (absdiff1 / max_val1 <= rel_tol); - aux_mem[tid] &= (absdiff2 / max_val2 <= rel_tol); - } - } - - i += grid_size; - } - if (i < n_elem) - { - const UWORD row = i % n_rows; - const UWORD col = i / n_rows; - - const UWORD A_loc = A_offset + row + col * A_M_n_rows; - const UWORD B_loc = B_offset + row + col * B_M_n_rows; - - const eT1 A_val = A_mem[A_loc]; - const eT1 B_val = B_mem[B_loc]; - - if (COOT_FN(coot_isnan_,eT1)(A_val) || COOT_FN(coot_isnan_,eT1)(B_val)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff = COOT_FN(coot_absdiff_,eT1)(A_val, B_val); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val = max(ET1_ABS(A_val), ET1_ABS(B_val)); - - if (max_val >= (eT1) 1) - { - aux_mem[tid] &= (absdiff <= rel_tol * max_val); - } - else - { - aux_mem[tid] &= (absdiff / max_val <= rel_tol); - } - } - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/count_nonzeros.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/count_nonzeros.cl deleted file mode 100644 index e981dec..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/count_nonzeros.cl +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,count_nonzeros)(__global const eT1* A, - const UWORD A_offset, - __global UWORD* thread_counts, - const UWORD n_elem, - __local volatile UWORD* aux_mem) - { - // We want to pass over the memory in A and count the number of nonzero elements. 
- // This will give us a count for each individual thread; we then want to prefix-sum this. - // This kernel is meant to be used as the first part of find(). - - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD local_count = 0; - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - if (A[A_offset + i] != (eT1) 0) - { - ++local_count; - } - if (A[A_offset + i + 1] != (eT1) 0) - { - ++local_count; - } - - i += 2; - } - if (i < end_elem) - { - if (A[A_offset + i] != (eT1) 0) - { - ++local_count; - } - } - - // Aggregate the counts for all threads. - aux_mem[tid] = local_count; - barrier(CLK_LOCAL_MEM_FENCE); - - // Up-sweep total sum into final element. - UWORD offset = 1; - - for (UWORD s = num_threads / 2; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - // Set the last element correctly. - thread_counts[num_threads] = aux_mem[num_threads - 1]; - aux_mem[num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Down-sweep to build prefix sum. 
- for (UWORD s = 1; s <= num_threads / 2; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - thread_counts[tid] = aux_mem[tid]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find.cl deleted file mode 100644 index ab2f0b4..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find.cl +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,find)(__global const eT1* A, - const UWORD A_offset, - __global const UWORD* thread_counts, - __global UWORD* out, - const UWORD out_offset, - const UWORD n_elem) - { - // Our goal is to fill `out` with the indices of nonzero values. - // Since the kernel is multithreaded, each thread will handle a different (contiguous) part of `A`. - // We expect that we already have the starting position for each thread in `thread_counts`. - // (It should have been filled with the `count_nonzeros` kernel.) 
- - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD out_index = thread_counts[tid]; - - UWORD i = start_elem; - - while (i + 1 < end_elem) - { - if (A[A_offset + i] != (eT1) 0) - { - out[out_offset + out_index++] = i; - } - if (A[A_offset + i + 1] != (eT1) 0) - { - out[out_offset + out_index++] = (i + 1); - } - - i += 2; - } - if (i < end_elem) - { - if (A[A_offset + i] != (eT1) 0) - { - out[out_offset + out_index++] = i; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find_first.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find_first.cl deleted file mode 100644 index c0638d1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find_first.cl +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,find_first)(__global const eT1* A, - const UWORD A_offset, - __global const UWORD* thread_counts, - __global UWORD* out, - const UWORD out_offset, - const UWORD k, - const UWORD n_elem) - { - // Our goal is to fill `out` with the first `k` indices of nonzero values. - // It is assumed that `k != 0`; if `k` is `0`, use the `find` kernel instead. - // Since the kernel is multithreaded, each thread will handle a different (contiguous) part of `A`. - // We expect that we already have the starting position for each thread in `thread_counts`. - // (It should have been filled with the `count_nonzeros` kernel.) - - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD out_index = thread_counts[tid]; - - UWORD i = start_elem; - - // We only want to find the first k points. 
- if (out_index < k) - { - while (i + 1 < end_elem) - { - if (A[A_offset + i] != (eT1) 0 && out_index < k) - { - out[out_offset + out_index++] = i; - } - if (A[A_offset + i + 1] != (eT1) 0 && out_index < k) - { - out[out_offset + out_index++] = (i + 1); - } - - i += 2; - } - if (i < end_elem) - { - if (A[A_offset + i] != (eT1) 0 && out_index < k) - { - out[out_offset + out_index++] = i; - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find_last.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find_last.cl deleted file mode 100644 index 3dee54b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find_last.cl +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,find_last)(__global const eT1* A, - const UWORD A_offset, - __global const UWORD* thread_counts, - __global UWORD* out, - const UWORD out_offset, - const UWORD m, - const UWORD n_elem) - { - // Our goal is to fill `out` with the last `k` indices of nonzero values. - // (Note that to match Armadillo's behavior, we want the last `k` indices in ascending order.) - // Instead of accepting `k` as a parameter, we instead accept `m = nnz - k`. - // This gives us the first index we should be putting an output value in. 
- // It is also assumed that `k != 0`; if `k` is `0`, use the `find` kernel instead. - - // Since the kernel is multithreaded, each thread will handle a different (contiguous) part of `A`. - // We expect that we already have the starting position for each thread in `thread_counts`. - // (It should have been filled with the `count_nonzeros` kernel.) - - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD out_index = thread_counts[tid]; - UWORD last_out_index = thread_counts[tid + 1]; - - UWORD i = start_elem; - - // We only want to find points with index `m` or higher. - if (last_out_index >= m) - { - while (i + 1 < end_elem) - { - if (A[A_offset + i] != (eT1) 0) - { - if (out_index >= m) - { - out[out_offset + out_index - m] = i; - } - - ++out_index; - } - if (A[A_offset + i + 1] != (eT1) 0) - { - if (out_index >= m) - { - out[out_offset + out_index - m] = (i + 1); - } - - ++out_index; - } - - i += 2; - } - - if (i < end_elem) - { - if (A[A_offset + i] != (eT1) 0) - { - if (out_index >= m) - { - out[out_offset + out_index - m] = i; - } - - ++out_index; - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max.cl deleted file mode 100644 index dd60fc3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max.cl +++ /dev/null @@ -1,333 +0,0 @@ -// Copyright 2024 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -void -COOT_FN(PREFIX,index_max_subgroup_reduce_other)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - { - if (data[tid + i] > data[tid]) - { - data[tid] = data[tid + i]; - data_uword[tid] = data_uword[tid + i]; - } - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -COOT_FN(PREFIX,index_max_subgroup_reduce_8)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - if (data[tid + 8] > data[tid]) - { - data[tid] = data[tid + 8]; - data_uword[tid] = data_uword[tid + 8]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 4] > data[tid]) - { - data[tid] = data[tid + 4]; - data_uword[tid] = data_uword[tid + 4]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 2] > data[tid]) - { - data[tid] = data[tid + 2]; - data_uword[tid] = data_uword[tid + 2]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 1] > data[tid]) - { - data[tid] = data[tid + 1]; - data_uword[tid] = data_uword[tid + 1]; - } - } - - - -void -COOT_FN(PREFIX,index_max_subgroup_reduce_16)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - if (data[tid + 16] > data[tid]) - { - data[tid] = data[tid + 16]; - data_uword[tid] = data_uword[tid + 16]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 8] > data[tid]) - { - data[tid] = data[tid + 8]; - data_uword[tid] = data_uword[tid + 8]; - } - 
SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 4] > data[tid]) - { - data[tid] = data[tid + 4]; - data_uword[tid] = data_uword[tid + 4]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 2] > data[tid]) - { - data[tid] = data[tid + 2]; - data_uword[tid] = data_uword[tid + 2]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 1] > data[tid]) - { - data[tid] = data[tid + 1]; - data_uword[tid] = data_uword[tid + 1]; - } - } - - - -void -COOT_FN(PREFIX,index_max_subgroup_reduce_32)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - if (data[tid + 32] > data[tid]) - { - data[tid] = data[tid + 32]; - data_uword[tid] = data_uword[tid + 32]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 16] > data[tid]) - { - data[tid] = data[tid + 16]; - data_uword[tid] = data_uword[tid + 16]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 8] > data[tid]) - { - data[tid] = data[tid + 8]; - data_uword[tid] = data_uword[tid + 8]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 4] > data[tid]) - { - data[tid] = data[tid + 4]; - data_uword[tid] = data_uword[tid + 4]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 2] > data[tid]) - { - data[tid] = data[tid + 2]; - data_uword[tid] = data_uword[tid + 2]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 1] > data[tid]) - { - data[tid] = data[tid + 1]; - data_uword[tid] = data_uword[tid + 1]; - } - } - - - -void -COOT_FN(PREFIX,index_max_subgroup_reduce_64)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - if (data[tid + 64] > data[tid]) - { - data[tid] = data[tid + 64]; - data_uword[tid] = data_uword[tid + 64]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 32] > data[tid]) - { - data[tid] = data[tid + 32]; - data_uword[tid] = data_uword[tid + 32]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 16] > data[tid]) - { - data[tid] = data[tid + 16]; - 
data_uword[tid] = data_uword[tid + 16]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 8] > data[tid]) - { - data[tid] = data[tid + 8]; - data_uword[tid] = data_uword[tid + 8]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 4] > data[tid]) - { - data[tid] = data[tid + 4]; - data_uword[tid] = data_uword[tid + 4]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 2] > data[tid]) - { - data[tid] = data[tid + 2]; - data_uword[tid] = data_uword[tid + 2]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 1] > data[tid]) - { - data[tid] = data[tid + 1]; - data_uword[tid] = data_uword[tid + 1]; - } - } - - - -void COOT_FN(PREFIX,index_max_subgroup_reduce_128)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - if (data[tid + 128] > data[tid]) - { - data[tid] = data[tid + 128]; - data_uword[tid] = data_uword[tid + 128]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 64] > data[tid]) - { - data[tid] = data[tid + 64]; - data_uword[tid] = data_uword[tid + 64]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 32] > data[tid]) - { - data[tid] = data[tid + 32]; - data_uword[tid] = data_uword[tid + 32]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 16] > data[tid]) - { - data[tid] = data[tid + 16]; - data_uword[tid] = data_uword[tid + 16]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 8] > data[tid]) - { - data[tid] = data[tid + 8]; - data_uword[tid] = data_uword[tid + 8]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 4] > data[tid]) - { - data[tid] = data[tid + 4]; - data_uword[tid] = data_uword[tid + 4]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 2] > data[tid]) - { - data[tid] = data[tid + 2]; - data_uword[tid] = data_uword[tid + 2]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 1] > data[tid]) - { - data[tid] = data[tid + 1]; - data_uword[tid] = data_uword[tid + 1]; - } - } - - - 
-__kernel -void -COOT_FN(PREFIX,index_max)(__global const eT1* in_mem, - const UWORD in_mem_offset, - __global const UWORD* in_uword_mem, - const UWORD in_uword_mem_offset, - const UWORD use_uword_mem, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __global UWORD* out_uword_mem, - const UWORD out_uword_mem_offset, - __local volatile eT1* aux_mem, - __local volatile UWORD* aux_uword_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = COOT_FN(coot_type_max_,eT1)(); - aux_uword_mem[tid] = COOT_UWORD_MAX; - - if (i < n_elem) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i] : i); - } - if (i + get_local_size(0) < n_elem) - { - if (in_mem[in_mem_offset + i + get_local_size(0)] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i + get_local_size(0)]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i + get_local_size(0)] : (i + get_local_size(0))); - } - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - if (in_mem[in_mem_offset + i] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i] : i); - } - - if (in_mem[in_mem_offset + i + get_local_size(0)] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i + get_local_size(0)]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i + get_local_size(0)] : (i + get_local_size(0))); - } - - i += grid_size; - } - if (i < n_elem) - { - if (in_mem[in_mem_offset + i] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? 
in_uword_mem[in_uword_mem_offset + i] : i); - } - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - if (aux_mem[tid + s] > aux_mem[tid]) - { - aux_mem[tid] = aux_mem[tid + s]; - aux_uword_mem[tid] = aux_uword_mem[tid + s]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,index_max_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, aux_uword_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - out_uword_mem[out_uword_mem_offset + get_group_id(0)] = aux_uword_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_colwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_colwise.cl deleted file mode 100644 index b72ec16..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_colwise.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,index_max_colwise)(__global UWORD* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - __global const eT1* colptr = &(src[src_offset + col * src_M_n_rows]); - - eT1 best_val = colptr[0]; - UWORD best_index = 0; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 1; i < n_rows; ++i) - { - if (colptr[i] > best_val) - { - best_val = colptr[i]; - best_index = i; - } - } - dest[dest_offset + col * dest_mem_incr] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_cube_col.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_cube_col.cl deleted file mode 100644 index 57f7ac7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_cube_col.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,index_max_cube_col)(__global UWORD* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = get_global_id(0); - const UWORD slice = get_global_id(1); - - if(row < n_rows && slice < n_slices) - { - eT1 best_val = src[src_offset + row + slice * n_rows * n_cols]; - UWORD best_index = 0; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=1; i < n_cols; ++i) - { - if (src[src_offset + (i * n_rows) + row + slice * n_rows * n_cols] > best_val) - { - best_val = src[src_offset + (i * n_rows) + row + slice * n_rows * n_cols]; - best_index = i; - } - } - dest[dest_offset + row + slice * n_rows] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_rowwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_rowwise.cl deleted file mode 100644 index 87a8206..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_rowwise.cl +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,index_max_rowwise)(__global UWORD* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT1 best_val = src[src_offset + row]; - UWORD best_index = 0; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=1; i < n_cols; ++i) - { - if (src[src_offset + (i * src_M_n_rows) + row] > best_val) - { - best_val = src[src_offset + (i * src_M_n_rows) + row]; - best_index = i; - } - } - dest[dest_offset + row * dest_mem_incr] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_small.cl deleted file mode 100644 index a36ba38..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_small.cl +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright 2024 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,index_max_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - __global const UWORD* in_uword_mem, - const UWORD in_uword_mem_offset, - const UWORD use_uword_mem, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __global UWORD* out_uword_mem, - const UWORD out_uword_mem_offset, - __local volatile eT1* aux_mem, - __local volatile UWORD* aux_uword_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = COOT_FN(coot_type_max_,eT1)(); - aux_uword_mem[tid] = COOT_UWORD_MAX; - - if (i < n_elem) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i] : i); - } - if (i + get_local_size(0) < n_elem) - { - if (in_mem[in_mem_offset + i + get_local_size(0)] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i + get_local_size(0)]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i + get_local_size(0)] : (i + get_local_size(0))); - } - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - if (in_mem[in_mem_offset + i] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i] : i); - } - - if (in_mem[in_mem_offset + i + get_local_size(0)] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i + get_local_size(0)]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? 
in_uword_mem[in_uword_mem_offset + i + get_local_size(0)] : (i + get_local_size(0))); - } - - i += grid_size; - } - if (i < n_elem) - { - if (in_mem[in_mem_offset + i] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i] : i); - } - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - if (aux_mem[tid + s] > aux_mem[tid]) - { - aux_mem[tid] = aux_mem[tid + s]; - aux_uword_mem[tid] = aux_uword_mem[tid + s]; - } - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - out_uword_mem[out_uword_mem_offset + get_group_id(0)] = aux_uword_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min.cl deleted file mode 100644 index baba221..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min.cl +++ /dev/null @@ -1,333 +0,0 @@ -// Copyright 2024 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -void -COOT_FN(PREFIX,index_min_subgroup_reduce_other)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - { - if (data[tid + i] < data[tid]) - { - data[tid] = data[tid + i]; - data_uword[tid] = data_uword[tid + i]; - } - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -COOT_FN(PREFIX,index_min_subgroup_reduce_8)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - if (data[tid + 8] < data[tid]) - { - data[tid] = data[tid + 8]; - data_uword[tid] = data_uword[tid + 8]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 4] < data[tid]) - { - data[tid] = data[tid + 4]; - data_uword[tid] = data_uword[tid + 4]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 2] < data[tid]) - { - data[tid] = data[tid + 2]; - data_uword[tid] = data_uword[tid + 2]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 1] < data[tid]) - { - data[tid] = data[tid + 1]; - data_uword[tid] = data_uword[tid + 1]; - } - } - - - -void -COOT_FN(PREFIX,index_min_subgroup_reduce_16)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - if (data[tid + 16] < data[tid]) - { - data[tid] = data[tid + 16]; - data_uword[tid] = data_uword[tid + 16]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 8] < data[tid]) - { - data[tid] = data[tid + 8]; - data_uword[tid] = data_uword[tid + 8]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 4] < data[tid]) - { - data[tid] = data[tid + 4]; - data_uword[tid] = data_uword[tid + 4]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 2] < data[tid]) - { - data[tid] = data[tid + 2]; - data_uword[tid] = data_uword[tid + 2]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 1] < data[tid]) - { - data[tid] = data[tid + 1]; - data_uword[tid] = 
data_uword[tid + 1]; - } - } - - - -void -COOT_FN(PREFIX,index_min_subgroup_reduce_32)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - if (data[tid + 32] < data[tid]) - { - data[tid] = data[tid + 32]; - data_uword[tid] = data_uword[tid + 32]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 16] < data[tid]) - { - data[tid] = data[tid + 16]; - data_uword[tid] = data_uword[tid + 16]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 8] < data[tid]) - { - data[tid] = data[tid + 8]; - data_uword[tid] = data_uword[tid + 8]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 4] < data[tid]) - { - data[tid] = data[tid + 4]; - data_uword[tid] = data_uword[tid + 4]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 2] < data[tid]) - { - data[tid] = data[tid + 2]; - data_uword[tid] = data_uword[tid + 2]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 1] < data[tid]) - { - data[tid] = data[tid + 1]; - data_uword[tid] = data_uword[tid + 1]; - } - } - - - -void -COOT_FN(PREFIX,index_min_subgroup_reduce_64)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - if (data[tid + 64] < data[tid]) - { - data[tid] = data[tid + 64]; - data_uword[tid] = data_uword[tid + 64]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 32] < data[tid]) - { - data[tid] = data[tid + 32]; - data_uword[tid] = data_uword[tid + 32]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 16] < data[tid]) - { - data[tid] = data[tid + 16]; - data_uword[tid] = data_uword[tid + 16]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 8] < data[tid]) - { - data[tid] = data[tid + 8]; - data_uword[tid] = data_uword[tid + 8]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 4] < data[tid]) - { - data[tid] = data[tid + 4]; - data_uword[tid] = data_uword[tid + 4]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 2] < data[tid]) - { - 
data[tid] = data[tid + 2]; - data_uword[tid] = data_uword[tid + 2]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 1] < data[tid]) - { - data[tid] = data[tid + 1]; - data_uword[tid] = data_uword[tid + 1]; - } - } - - - -void COOT_FN(PREFIX,index_min_subgroup_reduce_128)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - if (data[tid + 128] < data[tid]) - { - data[tid] = data[tid + 128]; - data_uword[tid] = data_uword[tid + 128]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 64] < data[tid]) - { - data[tid] = data[tid + 64]; - data_uword[tid] = data_uword[tid + 64]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 32] < data[tid]) - { - data[tid] = data[tid + 32]; - data_uword[tid] = data_uword[tid + 32]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 16] < data[tid]) - { - data[tid] = data[tid + 16]; - data_uword[tid] = data_uword[tid + 16]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 8] < data[tid]) - { - data[tid] = data[tid + 8]; - data_uword[tid] = data_uword[tid + 8]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 4] < data[tid]) - { - data[tid] = data[tid + 4]; - data_uword[tid] = data_uword[tid + 4]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 2] < data[tid]) - { - data[tid] = data[tid + 2]; - data_uword[tid] = data_uword[tid + 2]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 1] < data[tid]) - { - data[tid] = data[tid + 1]; - data_uword[tid] = data_uword[tid + 1]; - } - } - - - -__kernel -void -COOT_FN(PREFIX,index_min)(__global const eT1* in_mem, - const UWORD in_mem_offset, - __global const UWORD* in_uword_mem, - const UWORD in_uword_mem_offset, - const UWORD use_uword_mem, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __global UWORD* out_uword_mem, - const UWORD out_uword_mem_offset, - __local volatile eT1* aux_mem, - __local volatile UWORD* aux_uword_mem) - { - const 
UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = COOT_FN(coot_type_min_,eT1)(); - aux_uword_mem[tid] = (UWORD) 0; - - if (i < n_elem) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i] : i); - } - if (i + get_local_size(0) < n_elem) - { - if (in_mem[in_mem_offset + i + get_local_size(0)] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i + get_local_size(0)]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i + get_local_size(0)] : (i + get_local_size(0))); - } - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - if (in_mem[in_mem_offset + i] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i] : i); - } - - if (in_mem[in_mem_offset + i + get_local_size(0)] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i + get_local_size(0)]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i + get_local_size(0)] : (i + get_local_size(0))); - } - - i += grid_size; - } - if (i < n_elem) - { - if (in_mem[in_mem_offset + i] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? 
in_uword_mem[in_uword_mem_offset + i] : i); - } - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - if (aux_mem[tid + s] < aux_mem[tid]) - { - aux_mem[tid] = aux_mem[tid + s]; - aux_uword_mem[tid] = aux_uword_mem[tid + s]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,index_min_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, aux_uword_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - out_uword_mem[out_uword_mem_offset + get_group_id(0)] = aux_uword_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_colwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_colwise.cl deleted file mode 100644 index d793178..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_colwise.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,index_min_colwise)(__global UWORD* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - __global const eT1* colptr = &(src[src_offset + col * src_M_n_rows]); - - eT1 best_val = colptr[0]; - UWORD best_index = 0; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 1; i < n_rows; ++i) - { - if (colptr[i] < best_val) - { - best_val = colptr[i]; - best_index = i; - } - } - dest[dest_offset + col * dest_mem_incr] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_cube_col.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_cube_col.cl deleted file mode 100644 index 1f93e2d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_cube_col.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,index_min_cube_col)(__global UWORD* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = get_global_id(0); - const UWORD slice = get_global_id(1); - - if(row < n_rows && slice < n_slices) - { - eT1 best_val = src[src_offset + row + slice * n_rows * n_cols]; - UWORD best_index = 0; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=1; i < n_cols; ++i) - { - if (src[src_offset + (i * n_rows) + row + slice * n_rows * n_cols] < best_val) - { - best_val = src[src_offset + (i * n_rows) + row + slice * n_rows * n_cols]; - best_index = i; - } - } - dest[dest_offset + row + slice * n_rows] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_rowwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_rowwise.cl deleted file mode 100644 index 7f0bbe0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_rowwise.cl +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,index_min_rowwise)(__global UWORD* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT1 best_val = src[src_offset + row]; - UWORD best_index = 0; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=1; i < n_cols; ++i) - { - if (src[src_offset + (i * src_M_n_rows) + row] < best_val) - { - best_val = src[src_offset + (i * src_M_n_rows) + row]; - best_index = i; - } - } - dest[dest_offset + row * dest_mem_incr] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_small.cl deleted file mode 100644 index f746c58..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_small.cl +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright 2024 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,index_min_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - __global const UWORD* in_uword_mem, - const UWORD in_uword_mem_offset, - const UWORD use_uword_mem, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __global UWORD* out_uword_mem, - const UWORD out_uword_mem_offset, - __local volatile eT1* aux_mem, - __local volatile UWORD* aux_uword_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = COOT_FN(coot_type_min_,eT1)(); - aux_uword_mem[tid] = (UWORD) 0; - - if (i < n_elem) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i] : i); - } - if (i + get_local_size(0) < n_elem) - { - if (in_mem[in_mem_offset + i + get_local_size(0)] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i + get_local_size(0)]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i + get_local_size(0)] : (i + get_local_size(0))); - } - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - if (in_mem[in_mem_offset + i] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i] : i); - } - - if (in_mem[in_mem_offset + i + get_local_size(0)] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i + get_local_size(0)]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? 
in_uword_mem[in_uword_mem_offset + i + get_local_size(0)] : (i + get_local_size(0))); - } - - i += grid_size; - } - if (i < n_elem) - { - if (in_mem[in_mem_offset + i] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i] : i); - } - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - if (aux_mem[tid + s] < aux_mem[tid]) - { - aux_mem[tid] = aux_mem[tid + s]; - aux_uword_mem[tid] = aux_uword_mem[tid + s]; - } - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - out_uword_mem[out_uword_mem_offset + get_group_id(0)] = aux_uword_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_philox_randn.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_philox_randn.cl deleted file mode 100644 index dbf60a3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_philox_randn.cl +++ /dev/null @@ -1,223 +0,0 @@ -// Copyright 2021 Marcus Edel (http://www.kurg.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -// philox_4x32_10, specific to generating u32s - - - -inline -void -philox_4x32_10_single_round(uint* counter, uint* key) - { - uint hi0 = mul_hi((uint) 0xD2511F53, counter[0]); - uint hi1 = mul_hi((uint) 0xCD9E8D57, counter[2]); - uint lo0 = 0xD2511F53 * counter[0]; - uint lo1 = 0xCD9E8D57 * counter[2]; - - counter[0] = hi1 ^ counter[1] ^ key[0]; - counter[1] = lo1; - counter[2] = hi0 ^ counter[3] ^ key[1]; - counter[3] = lo0; - } - - - -inline -void -philox_4x32_10_p_step(uint* philox_state) - { - if (++philox_state[0]) - return; - if (++philox_state[1]) - return; - if (++philox_state[2]) - return; - ++philox_state[3]; - } - - - -inline -void -philox_4x32_10_rng(uint* philox_state) - { - // 4 uint counter: philox_state[0:3] - // 2 uint key: philox_state[4:5] - - // apply P (increment state) - philox_4x32_10_p_step(philox_state); - - // apply S-box 10 times - for (UWORD i = 0; i < 9; ++i) - { - philox_4x32_10_single_round(philox_state, philox_state + 4); - philox_state[4] += 0x9E3779B9; - philox_state[5] += 0xBB67AE85; - } - philox_4x32_10_single_round(philox_state, philox_state + 4); - } - - -// -// Convenience functions to get random numbers of 32-bit or 64-bit width out of the Philox 4x32-10 generator. -// The aux memory is only used for ulongs (64-bit width). -// - -inline -void -philox_4x32_10_rng_uchar(uint* philox_state, uint* aux) - { - // This generates more than we need, but that's okay. - philox_4x32_10_rng(philox_state); - } - - - -inline -void -philox_4x32_10_rng_ushort(uint* philox_state, uint* aux) - { - // This generates more than we need, but that's okay. - philox_4x32_10_rng(philox_state); - } - - - -inline -void -philox_4x32_10_rng_uint(uint* philox_state, uint* aux) - { - philox_4x32_10_rng(philox_state); - } - - - -inline -void -philox_4x32_10_rng_ulong(uint* philox_state, uint* aux) - { - philox_4x32_10_rng(philox_state); - // Save 4x32 bits of random data. 
- aux[0] = philox_state[0]; - aux[1] = philox_state[1]; - aux[2] = philox_state[2]; - aux[3] = philox_state[3]; - // Generate the next 4x32 bits of random data. - philox_4x32_10_rng(philox_state); - } - - - -inline -uchar -philox_get_elem_uchar(uint* philox_state, uint* aux, const UWORD i) - { - return ((uchar*) philox_state)[i]; - } - - - -inline -ushort -philox_get_elem_ushort(uint* philox_state, uint* aux, const UWORD i) - { - return ((ushort*) philox_state)[i]; - } - - - -inline -uint -philox_get_elem_uint(uint* philox_state, uint* aux, const UWORD i) - { - return philox_state[i]; - } - - - -inline -ulong -philox_get_elem_ulong(uint* philox_state, uint* aux, const UWORD i) - { - if (i <= 1) - return ((ulong*) philox_state)[i]; - else - return ((ulong*) aux)[i - 2]; - } - - - -__kernel -void -COOT_FN(PREFIX,inplace_philox_randn)(__global eT1* mem, - const UWORD mem_offset, - __global uint* philox_state, - const UWORD n, - const fp_eT1 mu, - const fp_eT1 sd) - { - const UWORD tid = get_global_id(0); - const UWORD num_threads = get_global_size(0); - UWORD i = tid; - - // Copy RNG state to local memory. - uint local_philox_state[6]; - local_philox_state[0] = philox_state[6 * tid ]; - local_philox_state[1] = philox_state[6 * tid + 1]; - local_philox_state[2] = philox_state[6 * tid + 2]; - local_philox_state[3] = philox_state[6 * tid + 3]; - local_philox_state[4] = philox_state[6 * tid + 4]; - local_philox_state[5] = philox_state[6 * tid + 5]; - - // Only used if we are generating 64-bit types. - uint aux_mem[4]; - - while (i < n) - { - COOT_FN(philox_4x32_10_rng_,uint_eT1)(local_philox_state, aux_mem); - - // Perform the Box-Muller transformation to transform [0, 1] samples to N(0, 1). 
- fp_eT1 sqrt_inner = -2 * log(COOT_FN(philox_get_elem_,uint_eT1)(local_philox_state, aux_mem, 0) / (fp_eT1) COOT_FN(coot_type_max_,uint_eT1)()); - if (isnan(sqrt_inner) || isinf(sqrt_inner)) - sqrt_inner = (fp_eT1) 0; - fp_eT1 trig_inner = 2 * M_PI * (COOT_FN(philox_get_elem_,uint_eT1)(local_philox_state, aux_mem, 1) / (fp_eT1) COOT_FN(coot_type_max_,uint_eT1)()); - - mem[mem_offset + i] = (eT1) ((sqrt(sqrt_inner) * cos(trig_inner)) * sd + mu); - i += num_threads; - if (i < n) - mem[mem_offset + i] = (eT1) ((sqrt(sqrt_inner) * sin(trig_inner)) * sd + mu); - i += num_threads; - - sqrt_inner = -2 * log(COOT_FN(philox_get_elem_,uint_eT1)(local_philox_state, aux_mem, 2) / (fp_eT1) COOT_FN(coot_type_max_,uint_eT1)()); - if (isnan(sqrt_inner) || isinf(sqrt_inner)) - sqrt_inner = (fp_eT1) 0; - trig_inner = 2 * M_PI * (COOT_FN(philox_get_elem_,uint_eT1)(local_philox_state, aux_mem, 3) / (fp_eT1) COOT_FN(coot_type_max_,uint_eT1)()); - - if (i < n) - mem[mem_offset + i] = (eT1) ((sqrt(sqrt_inner) * cos(trig_inner)) * sd + mu); - i += num_threads; - if (i < n) - mem[mem_offset + i] = (eT1) ((sqrt(sqrt_inner) * sin(trig_inner)) * sd + mu); - } - - // Restore RNG state. 
- philox_state[6 * tid ] = local_philox_state[0]; - philox_state[6 * tid + 1] = local_philox_state[1]; - philox_state[6 * tid + 2] = local_philox_state[2]; - philox_state[6 * tid + 3] = local_philox_state[3]; - philox_state[6 * tid + 4] = local_philox_state[4]; - philox_state[6 * tid + 5] = local_philox_state[5]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_set_eye.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_set_eye.cl deleted file mode 100644 index 7a17dec..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_set_eye.cl +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_set_eye)(__global eT1* out, - const UWORD out_offset, - const UWORD n_rows, - const UWORD n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - if( (row < n_rows) && (col < n_cols) ) - { - const UWORD offset = row + col*n_rows + out_offset; - out[offset] = (row == col) ? 
(eT1)(1) : (eT1)(0); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow32_randi.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow32_randi.cl deleted file mode 100644 index 7516946..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow32_randi.cl +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// See algorithm "xorwow" from page 5 of "Xorshift RNGs" by George Marsaglia. -inline -uint -xorwow32_rng_uint(uint* xorwow_state) - { - // xorwow_state[0] through xorwow_state[4] represent the 5 state integers, - // and xorwow_state[5] holds the counter. - uint t = xorwow_state[4] ^ (xorwow_state[4] >> 2); - - xorwow_state[4] = xorwow_state[3]; - xorwow_state[3] = xorwow_state[2]; - xorwow_state[2] = xorwow_state[1]; - xorwow_state[1] = xorwow_state[0]; - xorwow_state[0] ^= (xorwow_state[0] << 4) ^ (t ^ (t << 1)); - - // Following Saito and Matsumoto (2012), we use a larger constant for d so that the higher bits flip more often. - // We ignore their conclusion that XORWOW has problems (it's fast!). 
- xorwow_state[5] += 268183997; - return xorwow_state[0] + xorwow_state[5]; - } - - - -__kernel -void -COOT_FN(PREFIX,inplace_xorwow32_randi)(__global eT1* mem, - const UWORD mem_offset, - __global uint* xorwow_state, - const UWORD n, - const eT1 lo, - const uint_eT1 range, - const char needs_modulo) - { - const UWORD tid = get_global_id(0); - const UWORD num_threads = get_global_size(0); - UWORD i = tid; - - // Copy RNG state to local memory. - uint local_xorwow_state[6]; - local_xorwow_state[0] = xorwow_state[6 * tid ]; - local_xorwow_state[1] = xorwow_state[6 * tid + 1]; - local_xorwow_state[2] = xorwow_state[6 * tid + 2]; - local_xorwow_state[3] = xorwow_state[6 * tid + 3]; - local_xorwow_state[4] = xorwow_state[6 * tid + 4]; - local_xorwow_state[5] = xorwow_state[6 * tid + 5]; - - while (i < n) - { - // This generates a number in [0, uint_eT1_max]. - uint_eT1 t = (uint_eT1) xorwow32_rng_uint(local_xorwow_state); - // Modulo down to the range [0, (hi - lo)], if needed. - if (needs_modulo == 1) - t %= (range + 1); - // Cast back to the correct type, and add lo to get the correct range. - mem[mem_offset + i] = ((eT1) t) + lo; - i += num_threads; - } - - // Return updated RNG state to global memory. 
- xorwow_state[6 * tid ] = local_xorwow_state[0]; - xorwow_state[6 * tid + 1] = local_xorwow_state[1]; - xorwow_state[6 * tid + 2] = local_xorwow_state[2]; - xorwow_state[6 * tid + 3] = local_xorwow_state[3]; - xorwow_state[6 * tid + 4] = local_xorwow_state[4]; - xorwow_state[6 * tid + 5] = local_xorwow_state[5]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow32_randu.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow32_randu.cl deleted file mode 100644 index 24192b2..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow32_randu.cl +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -//~ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -//~ -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// See algorithm "xorwow" from page 5 of "Xorshift RNGs" by George Marsaglia. -inline -uint -xorwow32_rng_uint(uint* xorwow_state) - { - // xorwow_state[0] through xorwow_state[4] represent the 5 state integers, - // and xorwow_state[5] holds the counter. - uint t = xorwow_state[4] ^ (xorwow_state[4] >> 2); - - xorwow_state[4] = xorwow_state[3]; - xorwow_state[3] = xorwow_state[2]; - xorwow_state[2] = xorwow_state[1]; - xorwow_state[1] = xorwow_state[0]; - xorwow_state[0] ^= (xorwow_state[0] << 4) ^ (t ^ (t << 1)); - - // Following Saito and Matsumoto (2012), we use a larger constant for d so that the higher bits flip more often. 
- // We ignore their conclusion that XORWOW has problems (it's fast!). - xorwow_state[5] += 268183997; - return xorwow_state[0] + xorwow_state[5]; - } - - - -__kernel -void -COOT_FN(PREFIX,inplace_xorwow32_randu)(__global eT1* mem, - const UWORD mem_offset, - __global uint* xorwow_state, - const UWORD n) - { - const UWORD tid = get_global_id(0); - const UWORD num_threads = get_global_size(0); - UWORD i = tid; - - // Copy RNG state to local memory. - uint local_xorwow_state[6]; - local_xorwow_state[0] = xorwow_state[6 * tid ]; - local_xorwow_state[1] = xorwow_state[6 * tid + 1]; - local_xorwow_state[2] = xorwow_state[6 * tid + 2]; - local_xorwow_state[3] = xorwow_state[6 * tid + 3]; - local_xorwow_state[4] = xorwow_state[6 * tid + 4]; - local_xorwow_state[5] = xorwow_state[6 * tid + 5]; - - while (i < n) - { - uint_eT1 t = (uint_eT1) xorwow32_rng_uint(local_xorwow_state); - // Now normalize to [0, 1] and compute the output. - mem[mem_offset + i] = (eT1) (t / (fp_eT1) COOT_FN(coot_type_max_,uint_eT1)()); - i += num_threads; - } - - // Return updated RNG state to global memory. - xorwow_state[6 * tid ] = local_xorwow_state[0]; - xorwow_state[6 * tid + 1] = local_xorwow_state[1]; - xorwow_state[6 * tid + 2] = local_xorwow_state[2]; - xorwow_state[6 * tid + 3] = local_xorwow_state[3]; - xorwow_state[6 * tid + 4] = local_xorwow_state[4]; - xorwow_state[6 * tid + 5] = local_xorwow_state[5]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow64_randi.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow64_randi.cl deleted file mode 100644 index 03d3817..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow64_randi.cl +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// See algorithm "xorwow" from page 5 of "Xorshift RNGs" by George Marsaglia. -inline -ulong -xorwow64_rng_ulong(ulong* xorwow_state) - { - // xorwow_state[0] through xorwow_state[4] represent the 5 state integers, - // and xorwow_state[5] holds the counter. - ulong t = xorwow_state[4] ^ (xorwow_state[4] >> 2); - - xorwow_state[4] = xorwow_state[3]; - xorwow_state[3] = xorwow_state[2]; - xorwow_state[2] = xorwow_state[1]; - xorwow_state[1] = xorwow_state[0]; - xorwow_state[0] ^= (xorwow_state[0] << 4) ^ (t ^ (t << 1)); - - // Following Saito and Matsumoto (2012), we use a larger constant for d so that the higher bits flip more often. - // We ignore their conclusion that XORWOW has problems (it's fast!). - xorwow_state[5] += 2274084621458550325; - return xorwow_state[0] + xorwow_state[5]; - } - - - -__kernel -void -COOT_FN(PREFIX,inplace_xorwow64_randi)(__global eT1* mem, - const UWORD mem_offset, - __global ulong* xorwow_state, - const UWORD n, - const eT1 lo, - const uint_eT1 range, - const char needs_modulo) - { - const UWORD tid = get_global_id(0); - const UWORD num_threads = get_global_size(0); - UWORD i = tid; - - // Copy RNG state to local memory. 
- ulong local_xorwow_state[6]; - local_xorwow_state[0] = xorwow_state[6 * tid ]; - local_xorwow_state[1] = xorwow_state[6 * tid + 1]; - local_xorwow_state[2] = xorwow_state[6 * tid + 2]; - local_xorwow_state[3] = xorwow_state[6 * tid + 3]; - local_xorwow_state[4] = xorwow_state[6 * tid + 4]; - local_xorwow_state[5] = xorwow_state[6 * tid + 5]; - - while (i < n) - { - // This generates a number in [0, uint_eT1_max]. - uint_eT1 t = (uint_eT1) xorwow64_rng_ulong(local_xorwow_state); - // Modulo down to the range [0, (hi - lo)], if needed. - if (needs_modulo == 1) - t %= (range + 1); - // Cast back to the correct type, and add lo to get the correct range. - mem[mem_offset + i] = ((eT1) t) + lo; - i += num_threads; - } - - // Return updated RNG state to global memory. - xorwow_state[6 * tid ] = local_xorwow_state[0]; - xorwow_state[6 * tid + 1] = local_xorwow_state[1]; - xorwow_state[6 * tid + 2] = local_xorwow_state[2]; - xorwow_state[6 * tid + 3] = local_xorwow_state[3]; - xorwow_state[6 * tid + 4] = local_xorwow_state[4]; - xorwow_state[6 * tid + 5] = local_xorwow_state[5]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow64_randu.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow64_randu.cl deleted file mode 100644 index 40450ae..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow64_randu.cl +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -//~ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -//~ -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// See algorithm "xorwow" from page 5 of "Xorshift RNGs" by George Marsaglia. -inline -ulong -xorwow64_rng_ulong(ulong* xorwow_state) - { - // xorwow_state[0] through xorwow_state[4] represent the 5 state integers, - // and xorwow_state[5] holds the counter. - ulong t = xorwow_state[4] ^ (xorwow_state[4] >> 2); - - xorwow_state[4] = xorwow_state[3]; - xorwow_state[3] = xorwow_state[2]; - xorwow_state[2] = xorwow_state[1]; - xorwow_state[1] = xorwow_state[0]; - xorwow_state[0] ^= (xorwow_state[0] << 4) ^ (t ^ (t << 1)); - - // Following Saito and Matsumoto (2012), we use a larger constant for d so that the higher bits flip more often. - // We ignore their conclusion that XORWOW has problems (it's fast!). - xorwow_state[5] += 2274084621458550325; - return xorwow_state[0] + xorwow_state[5]; - } - - - -__kernel -void -COOT_FN(PREFIX,inplace_xorwow64_randu)(__global eT1* mem, - const UWORD mem_offset, - __global ulong* xorwow_state, - const UWORD n) - { - const UWORD tid = get_global_id(0); - const UWORD num_threads = get_global_size(0); - UWORD i = tid; - - // Copy RNG state to local memory. - ulong local_xorwow_state[6]; - local_xorwow_state[0] = xorwow_state[6 * tid ]; - local_xorwow_state[1] = xorwow_state[6 * tid + 1]; - local_xorwow_state[2] = xorwow_state[6 * tid + 2]; - local_xorwow_state[3] = xorwow_state[6 * tid + 3]; - local_xorwow_state[4] = xorwow_state[6 * tid + 4]; - local_xorwow_state[5] = xorwow_state[6 * tid + 5]; - - while (i < n) - { - uint_eT1 t = (uint_eT1) xorwow64_rng_ulong(local_xorwow_state); - // Now normalize to [0, 1] and compute the output. - mem[mem_offset + i] = (eT1) (t / (fp_eT1) COOT_FN(coot_type_max_,uint_eT1)()); - i += num_threads; - } - - // Return updated RNG state to global memory. 
- xorwow_state[6 * tid ] = local_xorwow_state[0]; - xorwow_state[6 * tid + 1] = local_xorwow_state[1]; - xorwow_state[6 * tid + 2] = local_xorwow_state[2]; - xorwow_state[6 * tid + 3] = local_xorwow_state[3]; - xorwow_state[6 * tid + 4] = local_xorwow_state[4]; - xorwow_state[6 * tid + 5] = local_xorwow_state[5]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/linspace.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/linspace.cl deleted file mode 100644 index 884218e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/linspace.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,linspace)(__global eT1* out_mem, - const UWORD out_mem_offset, - const UWORD mem_incr, - const eT1 start, - const eT1 end, - const eT1 step, - const UWORD num) - { - const UWORD idx = get_global_id(0); - if (idx < num) - { - out_mem[out_mem_offset + idx * mem_incr] = (eT1) (start + step * idx); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/logspace.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/logspace.cl deleted file mode 100644 index ec65f5f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/logspace.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,logspace)(__global eT1* out_mem, - const UWORD out_mem_offset, - const UWORD mem_incr, - const eT1 start, - const eT1 end, - const eT1 step, - const UWORD num) - { - const UWORD idx = get_global_id(0); - if (idx < num) - { - out_mem[out_mem_offset + idx * mem_incr] = (eT1) pow((fp_eT1) 10, (fp_eT1) (start + step * idx)); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/ltri_set_zero.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/ltri_set_zero.cl deleted file mode 100644 index 1890e0b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/ltri_set_zero.cl +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,ltri_set_zero)(__global eT1* out, - const UWORD out_offset, - const UWORD n_rows, - const UWORD n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD index = row + n_rows * col; - if( (row < n_rows) && (col < n_cols) && (row > col) ) - { - out[index + out_offset] = (eT1)(0); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max.cl deleted file mode 100644 index e500ad7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max.cl +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2021 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,max)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. 
- aux_mem[tid] = COOT_FN(coot_type_max_,eT1)(); - - if (i < n_elem) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - } - if (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[in_mem_offset + i + get_local_size(0)]); - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[in_mem_offset + i]); - aux_mem[tid] = max(aux_mem[tid], in_mem[in_mem_offset + i + get_local_size(0)]); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[in_mem_offset + i]); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = max(aux_mem[tid], aux_mem[tid + s]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,max_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_abs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_abs.cl deleted file mode 100644 index 28a667e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_abs.cl +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2021 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,max_abs)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = COOT_FN(coot_type_max_,eT1)(); - - if (i < n_elem) - { - aux_mem[tid] = ET1_ABS(in_mem[in_mem_offset + i]); - } - if (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], (eT1) ET1_ABS(in_mem[in_mem_offset + i + get_local_size(0)])); - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], (eT1) ET1_ABS(in_mem[in_mem_offset + i])); - aux_mem[tid] = max(aux_mem[tid], (eT1) ET1_ABS(in_mem[in_mem_offset + i + get_local_size(0)])); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], (eT1) ET1_ABS(in_mem[in_mem_offset + i])); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = max(aux_mem[tid], aux_mem[tid + s]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,max_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_abs_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_abs_small.cl deleted file mode 100644 index c7a531c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_abs_small.cl +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2021 Ryan Curtin (https://www.ratml.org/) -// -// 
Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,max_abs_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. 
- aux_mem[tid] = COOT_FN(coot_type_max_,eT1)(); - - if (i < n_elem) - { - aux_mem[tid] = ET1_ABS(in_mem[in_mem_offset + i]); - } - if (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], (eT1) ET1_ABS(in_mem[in_mem_offset + i + get_local_size(0)])); - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], (eT1) ET1_ABS(in_mem[in_mem_offset + i])); - aux_mem[tid] = max(aux_mem[tid], (eT1) ET1_ABS(in_mem[in_mem_offset + i + get_local_size(0)])); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], (eT1) ET1_ABS(in_mem[in_mem_offset + i])); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] = max(aux_mem[tid], aux_mem[tid + s]); - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_small.cl deleted file mode 100644 index b4872e1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_small.cl +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2021 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,max_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = COOT_FN(coot_type_max_,eT1)(); - - if (i < n_elem) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - } - if (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[in_mem_offset + i + get_local_size(0)]); - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[in_mem_offset + i]); - aux_mem[tid] = max(aux_mem[tid], in_mem[in_mem_offset + i + get_local_size(0)]); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[in_mem_offset + i]); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] = max(aux_mem[tid], aux_mem[tid + s]); - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/min.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/min.cl deleted file mode 100644 index 156e598..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/min.cl +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2021 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,min)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = COOT_FN(coot_type_min_,eT1)(); - - if (i < n_elem) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - } - if (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[in_mem_offset + i + get_local_size(0)]); - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[in_mem_offset + i]); - aux_mem[tid] = min(aux_mem[tid], in_mem[in_mem_offset + i + get_local_size(0)]); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[in_mem_offset + i]); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = min(aux_mem[tid], aux_mem[tid + s]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,min_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git 
a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/min_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/min_small.cl deleted file mode 100644 index 91a6208..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/min_small.cl +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2021 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,min_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. 
- aux_mem[tid] = COOT_FN(coot_type_min_,eT1)(); - - if (i < n_elem) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - } - if (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[in_mem_offset + i + get_local_size(0)]); - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[in_mem_offset + i]); - aux_mem[tid] = min(aux_mem[tid], in_mem[in_mem_offset + i + get_local_size(0)]); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[in_mem_offset + i]); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] = min(aux_mem[tid], aux_mem[tid + s]); - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_colwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_colwise.cl deleted file mode 100644 index 391f7f7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_colwise.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// multiply each column in `in` by the corresponding value in `A` -__kernel -void -COOT_FN(PREFIX,mul_colwise)(__global eT1* out, - const UWORD out_offset, - __global const eT1* A, // expected to have length n_cols - const UWORD A_offset, - const UWORD A_incr, - __global const eT1* in, - const UWORD in_offset, - const eT1 alpha, - const UWORD n_rows, - const UWORD n_cols, - const UWORD in_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - const UWORD in_elem_offset = col * in_M_n_rows; - const UWORD out_elem_offset = col * n_rows; - const eT1 val = alpha * A[A_offset + col * A_incr]; - - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < n_rows; ++i) - { - out[out_offset + i + out_elem_offset] = val * in[in_offset + i + in_elem_offset]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_colwise_trans.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_colwise_trans.cl deleted file mode 100644 index 26a4713..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_colwise_trans.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// multiply each column in `trans(in)` by the corresponding value in `A` -__kernel -void -COOT_FN(PREFIX,mul_colwise_trans)(__global eT1* out, - const UWORD out_offset, - __global const eT1* A, // expected to have length n_cols - const UWORD A_offset, - const UWORD A_incr, - __global const eT1* in, - const UWORD in_offset, - const eT1 alpha, - const UWORD n_rows, - const UWORD n_cols, - const UWORD in_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - const eT1 val = alpha * A[A_offset + col * A_incr]; - - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < n_rows; ++i) - { - const UWORD in_elem_offset = col + i * in_M_n_rows; - const UWORD out_elem_offset = col * n_rows + i; - out[out_offset + out_elem_offset] = val * in[in_offset + in_elem_offset]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_rowwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_rowwise.cl deleted file mode 100644 index c3d6ba7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_rowwise.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// multiply each row in `in` by the corresponding value in `A` -__kernel -void -COOT_FN(PREFIX,mul_rowwise)(__global eT1* out, - const UWORD out_offset, - __global const eT1* A, // expected to have length n_rows - const UWORD A_offset, - const UWORD A_incr, - __global const eT1* in, - const UWORD in_offset, - const eT1 alpha, - const UWORD n_rows, - const UWORD n_cols, - const UWORD in_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - const eT1 val = alpha * A[A_offset + row * A_incr]; - - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < n_cols; ++i) - { - const UWORD in_elem_offset = i * in_M_n_rows + row; - const UWORD out_elem_offset = i * in_M_n_rows + row; - out[out_offset + out_elem_offset] = val * in[in_offset + in_elem_offset]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_rowwise_trans.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_rowwise_trans.cl deleted file mode 100644 index 83371d9..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_rowwise_trans.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// multiply each row in `trans(in)` by the corresponding value in `A` -__kernel -void -COOT_FN(PREFIX,mul_rowwise_trans)(__global eT1* out, - const UWORD out_offset, - __global const eT1* A, // expected to have length n_rows - const UWORD A_offset, - const UWORD A_incr, - __global const eT1* in, - const UWORD in_offset, - const eT1 alpha, - const UWORD n_rows, - const UWORD n_cols, - const UWORD in_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - const eT1 val = alpha * A[A_offset + row * A_incr]; - - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < n_cols; ++i) - { - const UWORD in_elem_offset = i + row * in_M_n_rows; - const UWORD out_elem_offset = i * n_rows + row; - out[out_offset + out_elem_offset] = val * in[in_offset + in_elem_offset]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/prod.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/prod.cl deleted file mode 100644 index e39bbf4..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/prod.cl +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,prod)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 1; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] *= in_mem[in_mem_offset + i] * in_mem[in_mem_offset + i + get_local_size(0)]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] *= in_mem[in_mem_offset + i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] *= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,prod_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/prod_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/prod_small.cl deleted file mode 100644 index 5a75ee8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/prod_small.cl +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,prod_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 1; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] *= in_mem[in_mem_offset + i] * in_mem[in_mem_offset + i + get_local_size(0)]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] *= in_mem[in_mem_offset + i]; - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] *= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_asc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_asc.cl deleted file mode 100644 index bd51347..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_asc.cl +++ /dev/null @@ -1,280 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_asc)(__global eT1* A, - const UWORD A_offset, - __global eT1* tmp_mem, - const UWORD n_elem, - __local volatile UWORD* aux_mem) - { - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD local_counts[2]; - - __global eT1* unsorted_memptr = A + A_offset; - __global eT1* sorted_memptr = tmp_mem; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = COOT_FN(coot_is_signed_,eT1)() ? 8 * sizeof(eT1) - 1 : 8 * sizeof(eT1); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. - __global uint_eT1* memptr = (__global uint_eT1*) unsorted_memptr; - - local_counts[0] = 0; // holds the count of elements with bit value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> b]; - ++local_counts[(memptr[i + 1] & mask) >> b]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> b]; - } - // Step 2: aggregate the counts for all threads. - aux_mem[tid ] = local_counts[0]; - aux_mem[tid + num_threads] = local_counts[1]; - barrier(CLK_LOCAL_MEM_FENCE); - - // At the end of this, `tid` should be assigned two sections of memory to put its 0 and 1 points in. 
- // aux_mem[tid] should hold the first place to put a 0 point - // aux_mem[tid + num_threads] should hold the first place to put a 1 point - // More specifically: - // aux_mem[tid] := sum_{i = 0}^{tid - 1} aux_mem[i] - // aux_mem[tid + num_threads] := sum_{i = 0}^{tid + num_threads - 1} aux_mem[i] - // which means that this is just a prefix-sum operation on the full length of aux_mem. - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Step 3: move points into the correct place. - local_counts[0] = aux_mem[tid ]; // contains the first place we should put a 0 point - local_counts[1] = aux_mem[tid + num_threads]; // contains the first place we should put a 1 point - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b)]++; - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index] = val; - } - - // Now swap pointers. 
- __global eT1* tmp = unsorted_memptr; - unsorted_memptr = sorted_memptr; - sorted_memptr = tmp; - - barrier(CLK_GLOBAL_MEM_FENCE); - } - - // If the type is integral, we're now done---we don't have to handle a sign bit differently. - if (!COOT_FN(coot_is_signed_,eT1)()) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. - __global uint_eT1* memptr = (__global uint_eT1*) unsorted_memptr; - local_counts[0] = 0; - local_counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> last_bit]; - ++local_counts[(memptr[i + 1] & mask) >> last_bit]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> last_bit]; - } - - // local_counts[0] now holds the number of positive points; local_counts[1] holds the number of negative points - // swap these and perform a prefix sum, as with the rest of the bits - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - barrier(CLK_LOCAL_MEM_FENCE); - - // Up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Down-sweep to build prefix sum. 
- for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Step 3: move points into the correct place. - // This is different for integral and floating point types. - if (COOT_FN(coot_is_fp_,eT1)()) - { - // Floating point implementation: - // For negative values, we have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. - local_counts[0] = aux_mem[tid + num_threads]; // contains the first place we should put a 0 point (we will move upwards) - local_counts[1] = aux_mem[num_threads] - aux_mem[tid]; // contains the first place we should put a 1 point (we will move downwards) - local_counts[1] = (local_counts[1] == 0) ? 0 : local_counts[1] - 1; // avoid underflow - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]; - const int offset1 = (bit_val1 == 1) ? -1 : 1; - local_counts[bit_val1] += offset1; // decrements for negative values, increments for positive values - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]; - const int offset2 = (bit_val2 == 1) ? 
-1 : 1; - local_counts[bit_val2] += offset2; // decrements for negative values, increments for positive values - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]; - const int offset = (bit_val == 1) ? -1 : 1; - local_counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_memptr[out_index] = val; - } - } - else - { - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the negative values ahead of the positive values. - local_counts[0] = aux_mem[tid + num_threads]; - local_counts[1] = aux_mem[tid]; - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]++; - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]++; - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]++; - sorted_memptr[out_index] = val; - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. 
- } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_colwise_asc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_colwise_asc.cl deleted file mode 100644 index 4dd8b55..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_colwise_asc.cl +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_colwise_asc)(__global eT1* A, - const UWORD A_offset, - __global eT1* tmp_mem, - const UWORD A_n_rows, - const UWORD A_n_cols, - const UWORD A_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < A_n_cols) - { - __global eT1* unsorted_colptr = &A[A_offset + col * A_M_n_rows]; - __global eT1* sorted_colptr = &tmp_mem[ col * A_n_rows ]; - - UWORD counts[2]; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = COOT_FN(coot_is_signed_,eT1)() ? (8 * sizeof(eT1) - 1) : (8 * sizeof(eT1)); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Since we are sorting bitwise, we should treat the data as unsigned integers to make bitwise operations easy. 
- __global uint_eT1* colptr = (__global uint_eT1*) unsorted_colptr; - - counts[0] = 0; // holds the count of points with bit value 0 - counts[1] = 0; // holds the count of points with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - for (UWORD i = 0; i < A_n_rows; ++i) - { - ++counts[((colptr[i] & mask) >> b)]; - } - - counts[1] = counts[0]; // now holds the offset to put the next value at - counts[0] = 0; - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD out_index = counts[((colptr[i] & mask) >> b)]++; - sorted_colptr[out_index] = val; - } - - // swap pointers (unsorted is now sorted) - __global eT1* tmp = unsorted_colptr; - unsorted_colptr = sorted_colptr; - sorted_colptr = tmp; - } - - // If the type is unsigned, we're now done---we don't have to handle a sign bit differently. - if (!COOT_FN(coot_is_signed_,eT1)()) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. - __global uint_eT1* colptr = (__global uint_eT1*) unsorted_colptr; - counts[0] = 0; - counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - for (UWORD i = 0; i < A_n_rows; ++i) - { - ++counts[((colptr[i] & mask) >> last_bit)]; - } - // counts[0] now holds the number of positive points; counts[1] holds the number of negative points - - // This is different for integral and floating point types. - if (COOT_FN(coot_is_fp_,eT1)()) - { - // Floating point implementation: - // For negative values, we have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. 
- counts[0] = counts[1]; // now holds the offset to put the next positive value at - counts[1] = counts[0] - 1; // now holds the offset to put the next negative value at (we move backwards) - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD bit_val = ((colptr[i] & mask) >> last_bit); - const UWORD out_index = counts[bit_val]; - const int offset = (bit_val == 1) ? -1 : 1; - counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_colptr[out_index] = val; - } - } - else - { - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the negative values ahead of the positive values. - counts[0] = counts[1]; // now holds the offset to put the next positive value at - counts[1] = 0; // now holds the offset to put the next negative value at - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD bit_val = ((colptr[i] & mask) >> last_bit); - const UWORD out_index = counts[bit_val]++; - sorted_colptr[out_index] = val; - } - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_colwise_desc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_colwise_desc.cl deleted file mode 100644 index e84408d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_colwise_desc.cl +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_colwise_desc)(__global eT1* A, - const UWORD A_offset, - __global eT1* tmp_mem, - const UWORD A_n_rows, - const UWORD A_n_cols, - const UWORD A_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < A_n_cols) - { - __global eT1* unsorted_colptr = &A[A_offset + col * A_M_n_rows]; - __global eT1* sorted_colptr = &tmp_mem[ col * A_n_rows ]; - - UWORD counts[2]; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = COOT_FN(coot_is_signed_,eT1)() ? (8 * sizeof(eT1) - 1) : (8 * sizeof(eT1)); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Since we are sorting bitwise, we should treat the data as unsigned integers to make bitwise operations easy. 
- __global uint_eT1* colptr = (__global uint_eT1*) unsorted_colptr; - - counts[0] = 0; // holds the count of points with bit value 0 - counts[1] = 0; // holds the count of points with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - for (UWORD i = 0; i < A_n_rows; ++i) - { - ++counts[((colptr[i] & mask) >> b)]; - } - - counts[0] = counts[1]; // now holds the offset to put the next value at - counts[1] = 0; - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD out_index = counts[((colptr[i] & mask) >> b)]++; - sorted_colptr[out_index] = val; - } - - // swap pointers (unsorted is now sorted) - __global eT1* tmp = unsorted_colptr; - unsorted_colptr = sorted_colptr; - sorted_colptr = tmp; - } - - // If the type is unsigned, we're now done---we don't have to handle a sign bit differently. - if (!COOT_FN(coot_is_signed_,eT1)()) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. - __global uint_eT1* colptr = (__global uint_eT1*) unsorted_colptr; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - // This is different for integral and floating point types. - if (COOT_FN(coot_is_fp_,eT1)()) - { - // Floating point implementation: - // For negative values, we have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. 
- counts[0] = 0; // now holds the offset to put the next positive value at - counts[1] = A_n_rows - 1; // now holds the offset to put the next negative value at (we move backwards) - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD bit_val = ((colptr[i] & mask) >> last_bit); - const UWORD out_index = counts[bit_val]; - const int offset = (bit_val == 1) ? -1 : 1; - counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_colptr[out_index] = val; - } - } - else - { - counts[0] = 0; - counts[1] = 0; - - for (UWORD i = 0; i < A_n_rows; ++i) - { - ++counts[((colptr[i] & mask) >> last_bit)]; - } - // counts[0] now holds the number of positive points; counts[1] holds the number of negative points - - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the negative values ahead of the positive values. - counts[1] = counts[0]; // now holds the offset to put the next negative value at - counts[0] = 0; // now holds the offset to put the next positive value at - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD bit_val = ((colptr[i] & mask) >> last_bit); - const UWORD out_index = counts[bit_val]++; - sorted_colptr[out_index] = val; - } - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_desc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_desc.cl deleted file mode 100644 index c76e7bc..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_desc.cl +++ /dev/null @@ -1,279 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_desc)(__global eT1* A, - const UWORD A_offset, - __global eT1* tmp_mem, - const UWORD n_elem, - __local volatile UWORD* aux_mem) - { - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD local_counts[2]; - - __global eT1* unsorted_memptr = A + A_offset; - __global eT1* sorted_memptr = tmp_mem; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = COOT_FN(coot_is_signed_,eT1)() ? 8 * sizeof(eT1) - 1 : 8 * sizeof(eT1); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. - __global uint_eT1* memptr = (__global uint_eT1*) unsorted_memptr; - - local_counts[0] = 0; // holds the count of elements with bit value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> b]; - ++local_counts[(memptr[i + 1] & mask) >> b]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> b]; - } - // Step 2: aggregate the counts for all threads. 
- aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - barrier(CLK_LOCAL_MEM_FENCE); - - // At the end of this, `tid` should be assigned two sections of memory to put its 0 and 1 points in. - // aux_mem[tid] should hold the first place to put a 0 point - // aux_mem[tid + num_threads] should hold the first place to put a 1 point - // More specifically: - // aux_mem[tid] := sum_{i = 0}^{tid - 1} aux_mem[i] - // aux_mem[tid + num_threads] := sum_{i = 0}^{tid + num_threads - 1} aux_mem[i] - // which means that this is just a prefix-sum operation on the full length of aux_mem. - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Step 3: move points into the correct place. 
- local_counts[0] = aux_mem[tid + num_threads]; // contains the first place we should put a 0 point - local_counts[1] = aux_mem[tid ]; // contains the first place we should put a 1 point - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b)]++; - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index] = val; - } - - // Now swap pointers. - __global eT1* tmp = unsorted_memptr; - unsorted_memptr = sorted_memptr; - sorted_memptr = tmp; - - barrier(CLK_GLOBAL_MEM_FENCE); - } - - // If the type is integral, we're now done---we don't have to handle a sign bit differently. - if (!COOT_FN(coot_is_signed_,eT1)()) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. 
- __global uint_eT1* memptr = (__global uint_eT1*) unsorted_memptr; - local_counts[0] = 0; - local_counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> last_bit]; - ++local_counts[(memptr[i + 1] & mask) >> last_bit]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> last_bit]; - } - - // local_counts[0] now holds the number of positive points; local_counts[1] holds the number of negative points - // perform a prefix sum, as with the rest of the bits - aux_mem[tid ] = local_counts[0]; - aux_mem[tid + num_threads] = local_counts[1]; - barrier(CLK_LOCAL_MEM_FENCE); - - // Up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Step 3: move points into the correct place. - // This is different for integral and floating point types. - if (COOT_FN(coot_is_fp_,eT1)()) - { - // Floating point implementation: - // For negative values, we have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. 
- local_counts[0] = aux_mem[tid]; // contains the first place we should put a 0 point (we will move upwards) - local_counts[1] = n_elem - 1 - (aux_mem[num_threads + tid] - aux_mem[num_threads]); // contains the first place we should put a 1 point (we will move downwards) - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]; - const int offset1 = (bit_val1 == 1) ? -1 : 1; - local_counts[bit_val1] += offset1; // decrements for negative values, increments for positive values - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]; - const int offset2 = (bit_val2 == 1) ? -1 : 1; - local_counts[bit_val2] += offset2; // decrements for negative values, increments for positive values - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]; - const int offset = (bit_val == 1) ? -1 : 1; - local_counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_memptr[out_index] = val; - } - } - else - { - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the negative values ahead of the positive values. 
- local_counts[0] = aux_mem[tid]; - local_counts[1] = aux_mem[tid + num_threads]; - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]++; - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]++; - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]++; - sorted_memptr[out_index] = val; - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_asc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_asc.cl deleted file mode 100644 index db77609..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_asc.cl +++ /dev/null @@ -1,321 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_index_asc)(__global eT1* A, - const UWORD A_offset, - __global UWORD* A_index, - const UWORD A_index_offset, - __global eT1* tmp_mem, - __global UWORD* tmp_mem_index, - const UWORD n_elem, - __local volatile UWORD* aux_mem) - { - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - // Fill A_index with [0, 1, ..., n_elem - 1]. - UWORD i = start_elem; - while (i + 1 < end_elem) - { - A_index[A_index_offset + i] = i; - A_index[A_index_offset + i + 1] = i + 1; - i += 2; - } - if (i < end_elem) - { - A_index[A_index_offset + i] = i; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - UWORD local_counts[2]; - - __global eT1* unsorted_memptr = A + A_offset; - __global UWORD* unsorted_index_memptr = A_index + A_index_offset; - __global eT1* sorted_memptr = tmp_mem; - __global UWORD* sorted_index_memptr = tmp_mem_index; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = COOT_FN(coot_is_signed_,eT1)() ? 8 * sizeof(eT1) - 1 : 8 * sizeof(eT1); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. 
- __global uint_eT1* memptr = (__global uint_eT1*) unsorted_memptr; - - local_counts[0] = 0; // holds the count of elements with bit value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> b]; - ++local_counts[(memptr[i + 1] & mask) >> b]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> b]; - } - // Step 2: aggregate the counts for all threads. - aux_mem[tid ] = local_counts[0]; - aux_mem[tid + num_threads] = local_counts[1]; - barrier(CLK_LOCAL_MEM_FENCE); - - // At the end of this, `tid` should be assigned two sections of memory to put its 0 and 1 points in. - // aux_mem[tid] should hold the first place to put a 0 point - // aux_mem[tid + num_threads] should hold the first place to put a 1 point - // More specifically: - // aux_mem[tid] := sum_{i = 0}^{tid - 1} aux_mem[i] - // aux_mem[tid + num_threads] := sum_{i = 0}^{tid + num_threads - 1} aux_mem[i] - // which means that this is just a prefix-sum operation on the full length of aux_mem. - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Step 3: move points into the correct place. 
- local_counts[0] = aux_mem[tid ]; // contains the first place we should put a 0 point - local_counts[1] = aux_mem[tid + num_threads]; // contains the first place we should put a 1 point - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b)]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - - // Now swap pointers. - __global eT1* tmp = unsorted_memptr; - __global UWORD* tmp_index = unsorted_index_memptr; - unsorted_memptr = sorted_memptr; - unsorted_index_memptr = sorted_index_memptr; - sorted_memptr = tmp; - sorted_index_memptr = tmp_index; - - barrier(CLK_GLOBAL_MEM_FENCE); - } - - // If the type is integral, we're now done---we don't have to handle a sign bit differently. - if (!COOT_FN(coot_is_signed_,eT1)()) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. 
- __global uint_eT1* memptr = (__global uint_eT1*) unsorted_memptr; - local_counts[0] = 0; - local_counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> last_bit]; - ++local_counts[(memptr[i + 1] & mask) >> last_bit]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> last_bit]; - } - - // local_counts[0] now holds the number of positive points; local_counts[1] holds the number of negative points - // swap these and perform a prefix sum, as with the rest of the bits - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - barrier(CLK_LOCAL_MEM_FENCE); - - // Up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Step 3: move points into the correct place. - // This is different for integral and floating point types. - if (COOT_FN(coot_is_fp_,eT1)()) - { - // Floating point implementation: - // For negative values, we have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. 
- local_counts[0] = aux_mem[tid + num_threads]; // contains the first place we should put a 0 point (we will move upwards) - local_counts[1] = aux_mem[num_threads] - aux_mem[tid]; // contains the first place we should put a 1 point (we will move downwards) - local_counts[1] = (local_counts[1] == 0) ? 0 : local_counts[1] - 1; // avoid underflow - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]; - const int offset1 = (bit_val1 == 1) ? -1 : 1; - local_counts[bit_val1] += offset1; // decrements for negative values, increments for positive values - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]; - const int offset2 = (bit_val2 == 1) ? -1 : 1; - local_counts[bit_val2] += offset2; // decrements for negative values, increments for positive values - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]; - const int offset = (bit_val == 1) ? -1 : 1; - local_counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - } - else - { - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the negative values ahead of the positive values. 
- local_counts[0] = aux_mem[tid + num_threads]; - local_counts[1] = aux_mem[tid]; - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_desc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_desc.cl deleted file mode 100644 index a67aa7e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_desc.cl +++ /dev/null @@ -1,320 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_index_desc)(__global eT1* A, - const UWORD A_offset, - __global UWORD* A_index, - const UWORD A_index_offset, - __global eT1* tmp_mem, - __global UWORD* tmp_mem_index, - const UWORD n_elem, - __local volatile UWORD* aux_mem) - { - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - // Fill A_index with [0, 1, ..., n_elem - 1]. - UWORD i = start_elem; - while (i + 1 < end_elem) - { - A_index[A_index_offset + i] = i; - A_index[A_index_offset + i + 1] = i + 1; - i += 2; - } - if (i < end_elem) - { - A_index[A_index_offset + i] = i; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - UWORD local_counts[2]; - - __global eT1* unsorted_memptr = A + A_offset; - __global UWORD* unsorted_index_memptr = A_index + A_index_offset; - __global eT1* sorted_memptr = tmp_mem; - __global UWORD* sorted_index_memptr = tmp_mem_index; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = COOT_FN(coot_is_signed_,eT1)() ? 8 * sizeof(eT1) - 1 : 8 * sizeof(eT1); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. 
- __global uint_eT1* memptr = (__global uint_eT1*) unsorted_memptr; - - local_counts[0] = 0; // holds the count of elements with bit value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> b]; - ++local_counts[(memptr[i + 1] & mask) >> b]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> b]; - } - // Step 2: aggregate the counts for all threads. - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - barrier(CLK_LOCAL_MEM_FENCE); - - // At the end of this, `tid` should be assigned two sections of memory to put its 0 and 1 points in. - // aux_mem[tid] should hold the first place to put a 0 point - // aux_mem[tid + num_threads] should hold the first place to put a 1 point - // More specifically: - // aux_mem[tid] := sum_{i = 0}^{tid - 1} aux_mem[i] - // aux_mem[tid + num_threads] := sum_{i = 0}^{tid + num_threads - 1} aux_mem[i] - // which means that this is just a prefix-sum operation on the full length of aux_mem. - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Step 3: move points into the correct place. 
- local_counts[0] = aux_mem[tid + num_threads]; // contains the first place we should put a 0 point - local_counts[1] = aux_mem[tid ]; // contains the first place we should put a 1 point - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b)]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - - // Now swap pointers. - __global eT1* tmp = unsorted_memptr; - __global UWORD* tmp_index = unsorted_index_memptr; - unsorted_memptr = sorted_memptr; - unsorted_index_memptr = sorted_index_memptr; - sorted_memptr = tmp; - sorted_index_memptr = tmp_index; - - barrier(CLK_GLOBAL_MEM_FENCE); - } - - // If the type is integral, we're now done---we don't have to handle a sign bit differently. - if (!COOT_FN(coot_is_signed_,eT1)()) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. 
- __global uint_eT1* memptr = (__global uint_eT1*) unsorted_memptr; - local_counts[0] = 0; - local_counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> last_bit]; - ++local_counts[(memptr[i + 1] & mask) >> last_bit]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> last_bit]; - } - - // local_counts[0] now holds the number of positive points; local_counts[1] holds the number of negative points - // perform a prefix sum, as with the rest of the bits - aux_mem[tid ] = local_counts[0]; - aux_mem[tid + num_threads] = local_counts[1]; - barrier(CLK_LOCAL_MEM_FENCE); - - // Up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Step 3: move points into the correct place. - // This is different for integral and floating point types. - if (COOT_FN(coot_is_fp_,eT1)()) - { - // Floating point implementation: - // For negative values, we have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. 
- local_counts[0] = aux_mem[tid]; // contains the first place we should put a 0 point (we will move upwards) - local_counts[1] = n_elem - 1 - (aux_mem[num_threads + tid] - aux_mem[num_threads]); // contains the first place we should put a 1 point (we will move downwards) - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]; - const int offset1 = (bit_val1 == 1) ? -1 : 1; - local_counts[bit_val1] += offset1; // decrements for negative values, increments for positive values - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]; - const int offset2 = (bit_val2 == 1) ? -1 : 1; - local_counts[bit_val2] += offset2; // decrements for negative values, increments for positive values - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]; - const int offset = (bit_val == 1) ? -1 : 1; - local_counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - } - else - { - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the negative values ahead of the positive values. 
- local_counts[0] = aux_mem[tid]; - local_counts[1] = aux_mem[tid + num_threads]; - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_multi_wg_shuffle.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_multi_wg_shuffle.cl deleted file mode 100644 index c841442..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_multi_wg_shuffle.cl +++ /dev/null @@ -1,160 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_index_multi_wg_shuffle)(__global eT1* A, - const UWORD A_offset, - __global UWORD* A_index, - const UWORD A_index_offset, - __global eT1* out, - const UWORD out_offset, - __global UWORD* out_index, - const UWORD out_index_offset, - __global UWORD* counts, - const UWORD counts_offset, - const UWORD n_elem, - const UWORD sort_type, - const UWORD start_bit) - { - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - const UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - __global uint_eT1* uA = (__global uint_eT1*) A; - const uint_eT1 mask = (((uint_eT1) 3) << start_bit); - - int upper_bit_shift = 1; - UWORD local_offsets[4]; - if (sort_type == 0) - { - // for an ascending sort, the offsets are ordered for bit values 00/01/10/11 - local_offsets[0] = counts[counts_offset + tid ]; // first place we should put a 00 point - local_offsets[1] = counts[counts_offset + tid + num_threads]; // first place we should put a 01 point - local_offsets[2] = counts[counts_offset + tid + 2 * num_threads]; // first place we should put a 10 point - local_offsets[3] = counts[counts_offset + tid + 3 * num_threads]; // first place we should put a 11 point - } - else if (sort_type == 1) - { - // for a descending sort, the offsets are ordered for 
bit values 11/10/01/00 - local_offsets[0] = counts[counts_offset + tid + 3 * num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[counts_offset + tid + 2 * num_threads]; // first place we should put a 01 point - local_offsets[2] = counts[counts_offset + tid + num_threads]; // first place we should put a 10 point - local_offsets[3] = counts[counts_offset + tid ]; // first place we should put a 11 point - } - else if (sort_type == 2) - { - // for the last bits of a signed integer in an ascending sort, the offsets are ordered for bit values (10/11/00/01) - local_offsets[0] = counts[counts_offset + tid + 2 * num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[counts_offset + tid + 3 * num_threads]; // first place we should put a 01 point - local_offsets[2] = counts[counts_offset + tid ]; // first place we should put a 10 point - local_offsets[3] = counts[counts_offset + tid + num_threads]; // first place we should put a 11 point - } - else if (sort_type == 3) - { - // for the last bits of a signed integer in a descending sort, the offsets are ordered for bit values (01/00/11/10) - local_offsets[0] = counts[counts_offset + tid + num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[counts_offset + tid ]; // first place we should put a 01 point - local_offsets[2] = counts[counts_offset + tid + 3 * num_threads]; // first place we should put a 10 point - local_offsets[3] = counts[counts_offset + tid + 2 * num_threads]; // first place we should put a 11 point - } - else if (sort_type == 4) - { - // for the last bits of a floating-point number in an ascending sort, the offsets are ordered for bit values (11/10/00/01) - // and, the negative values are ordered in a descending order, so we have to reverse them - local_offsets[0] = counts[counts_offset + tid + 2 * num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[counts_offset + tid + 3 * num_threads]; // first 
place we should put a 01 point - local_offsets[2] = counts[counts_offset + 2 * num_threads] - counts[counts_offset + tid + num_threads]; // first place we should put a 10 point - local_offsets[3] = counts[counts_offset + num_threads] - counts[counts_offset + tid ]; // first place we should put a 11 point - - // avoid underflow - local_offsets[2] = (local_offsets[2] == 0) ? 0 : local_offsets[2] - 1; - local_offsets[3] = (local_offsets[3] == 0) ? 0 : local_offsets[3] - 1; - - upper_bit_shift = -1; // sort negative values backwards - } - else if (sort_type == 5) - { - // for the last bits of a floating-point number in a descending sort, the offsets are ordered for bit values (01/00/10/11) - // and, the negative values are ordered in a descending order, so we have to reverse them - local_offsets[0] = counts[counts_offset + tid + num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[counts_offset + tid ]; // first place we should put a 01 point - local_offsets[2] = counts[counts_offset + 3 * num_threads] - (counts[counts_offset + tid + 2 * num_threads] - counts[counts_offset + 2 * num_threads]); // first place we should put a 10 point - local_offsets[3] = n_elem - (counts[counts_offset + tid + 3 * num_threads] - counts[counts_offset + 3 * num_threads]); // first place we should put a 11 point - - // avoid underflow - local_offsets[2] = (local_offsets[2] == 0) ? 0 : local_offsets[2] - 1; - local_offsets[3] = (local_offsets[3] == 0) ? 
0 : local_offsets[3] - 1; - - upper_bit_shift = -1; // sort negative values backwards - } - else if (sort_type == 6) - { - // for the last bits of a floating-point number in a stable ascending sort, - // the offsets are ordered for bit values (11/10/00/01) - // but, we do not need to reverse any values - local_offsets[0] = counts[tid + 2 * num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[tid + 3 * num_threads]; // first place we should put a 01 point - local_offsets[2] = counts[tid + num_threads]; // first place we should put a 10 point - local_offsets[3] = counts[tid ]; // first place we should put a 11 point - } - else if (sort_type == 7) - { - // for the last bits of a floating-point number in a stable descending sort, - // the offsets are ordered for bit values (01/00/10/11) - // but, we do not need to reverse any values - local_offsets[0] = counts[tid + num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[tid ]; // first place we should put a 01 point - local_offsets[2] = counts[tid + 2 * num_threads]; // first place we should put a 10 point - local_offsets[3] = counts[tid + 3 * num_threads]; // first place we should put a 11 point - } - - // Move all points that this thread is responsible for into the correct place. - UWORD i = start_elem; - while (i + 1 < end_elem) - { - const uint_eT1 val1 = uA[A_offset + i ]; - const uint_eT1 val2 = uA[A_offset + i + 1]; - - const UWORD loc1 = ((val1 & mask) >> start_bit); - const UWORD loc2 = ((val2 & mask) >> start_bit); - - const UWORD out_index1 = local_offsets[loc1]; - local_offsets[loc1] += ((loc1 >= 2) ? upper_bit_shift : 1); - const UWORD out_index2 = local_offsets[loc2]; - local_offsets[loc2] += ((loc2 >= 2) ? 
upper_bit_shift : 1); - - out[out_offset + out_index1] = A[A_offset + i]; - out_index[out_index_offset + out_index1] = A_index[A_index_offset + i]; - - out[out_offset + out_index2] = A[A_offset + i + 1]; - out_index[out_offset + out_index2] = A_index[A_index_offset + i + 1]; - - i += 2; - } - if (i < end_elem) - { - const uint_eT1 val = uA[A_offset + i]; - const UWORD loc = ((val & mask) >> start_bit); - const UWORD out_index1 = local_offsets[loc]; - local_offsets[loc] += ((loc >= 2) ? upper_bit_shift : 1); - out[out_offset + out_index1] = A[A_offset + i]; - out_index[out_index_offset + out_index1] = A_index[A_index_offset + i]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_multi_wg_bit_count.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_multi_wg_bit_count.cl deleted file mode 100644 index a9cf02c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_multi_wg_bit_count.cl +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_multi_wg_bit_count)(__global eT1* A, - const UWORD A_offset, - __global UWORD* counts, - const UWORD counts_offset, - const UWORD n_elem, - const UWORD sort_type, - const UWORD start_bit) - { - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - const UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - const uint_eT1 mask = (((uint_eT1) 3) << start_bit); - - __global uint_eT1* uA = (__global uint_eT1*) A; // so that we can mask elements of A bitwise - - UWORD local_counts[4] = { 0, 0, 0, 0 }; - - // Count the number of elements with each bit value (00/01/10/11) that belong - // to this thread. - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(uA[A_offset + i ] & mask) >> start_bit]; - ++local_counts[(uA[A_offset + i + 1] & mask) >> start_bit]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(uA[A_offset + i] & mask) >> start_bit]; - } - - // Save results to the right place for later processing. - if (sort_type == 0) - { - counts[counts_offset + tid ] = local_counts[0]; - counts[counts_offset + tid + num_threads] = local_counts[1]; - counts[counts_offset + tid + 2 * num_threads] = local_counts[2]; - counts[counts_offset + tid + 3 * num_threads] = local_counts[3]; - } - else if (sort_type == 1) - { - // If sort_type == 1 (descending), we want to store the results in the bit - // order 11/10/01/00, instead of the order of local_counts (00/01/10/11). 
- counts[counts_offset + tid ] = local_counts[3]; - counts[counts_offset + tid + num_threads] = local_counts[2]; - counts[counts_offset + tid + 2 * num_threads] = local_counts[1]; - counts[counts_offset + tid + 3 * num_threads] = local_counts[0]; - } - else if (sort_type == 2) - { - // If sort_type == 2 (highest two bits of a signed integer, ascending), we - // want to store the results in the bit order 10/11/00/01 - counts[counts_offset + tid ] = local_counts[2]; - counts[counts_offset + tid + num_threads] = local_counts[3]; - counts[counts_offset + tid + 2 * num_threads] = local_counts[0]; - counts[counts_offset + tid + 3 * num_threads] = local_counts[1]; - } - else if (sort_type == 3) - { - // If sort_type == 3 (highest two bits of a signed integer, descending), we - // want to store the results in the bit order 01/00/11/10 - counts[counts_offset + tid ] = local_counts[1]; - counts[counts_offset + tid + num_threads] = local_counts[0]; - counts[counts_offset + tid + 2 * num_threads] = local_counts[3]; - counts[counts_offset + tid + 3 * num_threads] = local_counts[2]; - } - else if (sort_type == 4 || sort_type == 6) - { - // If sort_type == 4 or 6 (highest two bits of floating-point number, ascending), - // we want to store the results in the bit order 11/10/00/01 - counts[counts_offset + tid ] = local_counts[3]; - counts[counts_offset + tid + num_threads] = local_counts[2]; - counts[counts_offset + tid + 2 * num_threads] = local_counts[0]; - counts[counts_offset + tid + 3 * num_threads] = local_counts[1]; - } - else if (sort_type == 5 || sort_type == 7) - { - // If sort_type == 5 or 7 (highest two bits of floating-point number, - // descending), we want to store the results in the bit order 01/00/10/11 - counts[counts_offset + tid ] = local_counts[1]; - counts[counts_offset + tid + num_threads] = local_counts[0]; - counts[counts_offset + tid + 2 * num_threads] = local_counts[2]; - counts[counts_offset + tid + 3 * num_threads] = local_counts[3]; - } - } diff --git 
a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_multi_wg_shuffle.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_multi_wg_shuffle.cl deleted file mode 100644 index a211364..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_multi_wg_shuffle.cl +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_multi_wg_shuffle)(__global eT1* A, - const UWORD A_offset, - __global eT1* out, - const UWORD out_offset, - __global UWORD* counts, - const UWORD n_elem, - const UWORD sort_type, - const UWORD start_bit) - { - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - const UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - __global uint_eT1* uA = (__global uint_eT1*) A; - const uint_eT1 mask = (((uint_eT1) 3) << start_bit); - - int upper_bit_shift = 1; - UWORD local_offsets[4]; - if (sort_type == 0) - { - // for an ascending sort, the offsets are ordered for bit values 00/01/10/11 - local_offsets[0] = counts[tid ]; // first place we should put a 00 point - local_offsets[1] = counts[tid + 
num_threads]; // first place we should put a 01 point - local_offsets[2] = counts[tid + 2 * num_threads]; // first place we should put a 10 point - local_offsets[3] = counts[tid + 3 * num_threads]; // first place we should put a 11 point - } - else if (sort_type == 1) - { - // for a descending sort, the offsets are ordered for bit values 11/10/01/00 - local_offsets[0] = counts[tid + 3 * num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[tid + 2 * num_threads]; // first place we should put a 01 point - local_offsets[2] = counts[tid + num_threads]; // first place we should put a 10 point - local_offsets[3] = counts[tid ]; // first place we should put a 11 point - } - else if (sort_type == 2) - { - // for the last bits of a signed integer in an ascending sort, the offsets are ordered for bit values (10/11/00/01) - local_offsets[0] = counts[tid + 2 * num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[tid + 3 * num_threads]; // first place we should put a 01 point - local_offsets[2] = counts[tid ]; // first place we should put a 10 point - local_offsets[3] = counts[tid + num_threads]; // first place we should put a 11 point - } - else if (sort_type == 3) - { - // for the last bits of a signed integer in a descending sort, the offsets are ordered for bit values (01/00/11/10) - local_offsets[0] = counts[tid + num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[tid ]; // first place we should put a 01 point - local_offsets[2] = counts[tid + 3 * num_threads]; // first place we should put a 10 point - local_offsets[3] = counts[tid + 2 * num_threads]; // first place we should put a 11 point - } - else if (sort_type == 4) - { - // for the last bits of a floating-point number in an ascending sort, the offsets are ordered for bit values (11/10/00/01) - // and, the negative values are ordered in a descending order, so we have to reverse them - local_offsets[0] = counts[tid + 2 * 
num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[tid + 3 * num_threads]; // first place we should put a 01 point - local_offsets[2] = counts[2 * num_threads] - counts[tid + num_threads]; // first place we should put a 10 point - local_offsets[3] = counts[ num_threads] - counts[tid ]; // first place we should put a 11 point - - // avoid underflow - local_offsets[2] = (local_offsets[2] == 0) ? 0 : local_offsets[2] - 1; - local_offsets[3] = (local_offsets[3] == 0) ? 0 : local_offsets[3] - 1; - - upper_bit_shift = -1; // sort negative values backwards - } - else if (sort_type == 5) - { - // for the last bits of a floating-point number in a descending sort, the offsets are ordered for bit values (01/00/10/11) - // and, the negative values are ordered in a descending order, so we have to reverse them - local_offsets[0] = counts[tid + num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[tid ]; // first place we should put a 01 point - local_offsets[2] = counts[3 * num_threads] - (counts[tid + 2 * num_threads] - counts[2 * num_threads]); // first place we should put a 10 point - local_offsets[3] = n_elem - (counts[tid + 3 * num_threads] - counts[3 * num_threads]); // first place we should put a 11 point - - // avoid underflow - local_offsets[2] = (local_offsets[2] == 0) ? 0 : local_offsets[2] - 1; - local_offsets[3] = (local_offsets[3] == 0) ? 0 : local_offsets[3] - 1; - - upper_bit_shift = -1; // sort negative values backwards - } - - // Move all points that this thread is responsible for into the correct place. - UWORD i = start_elem; - while (i + 1 < end_elem) - { - const uint_eT1 val1 = uA[A_offset + i ]; - const uint_eT1 val2 = uA[A_offset + i + 1]; - - const UWORD loc1 = ((val1 & mask) >> start_bit); - const UWORD loc2 = ((val2 & mask) >> start_bit); - - const UWORD out_index1 = local_offsets[loc1]; - local_offsets[loc1] += ((loc1 >= 2) ? 
upper_bit_shift : 1); - const UWORD out_index2 = local_offsets[loc2]; - local_offsets[loc2] += ((loc2 >= 2) ? upper_bit_shift : 1); - - out[out_offset + out_index1] = A[A_offset + i]; - out[out_offset + out_index2] = A[A_offset + i + 1]; - - i += 2; - } - if (i < end_elem) - { - const uint_eT1 val = uA[A_offset + i]; - const UWORD loc = ((val & mask) >> start_bit); - const UWORD out_index = local_offsets[loc]; - local_offsets[loc] += ((loc >= 2) ? upper_bit_shift : 1); - out[out_offset + out_index] = A[A_offset + i]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_rowwise_asc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_rowwise_asc.cl deleted file mode 100644 index 804aec0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_rowwise_asc.cl +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_rowwise_asc)(__global eT1* A, - const UWORD A_offset, - __global eT1* tmp_mem, - const UWORD A_n_rows, - const UWORD A_n_cols, - const UWORD A_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < A_n_rows) - { - __global eT1* unsorted_rowptr = &A[A_offset + row]; - __global eT1* sorted_rowptr = &tmp_mem[ row]; - - UWORD unsorted_n_rows = A_M_n_rows; - UWORD sorted_n_rows = A_n_rows; - - UWORD counts[2]; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = COOT_FN(coot_is_signed_,eT1)() ? (8 * sizeof(eT1) - 1) : (8 * sizeof(eT1)); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Since we are sorting bitwise, we should treat the data as unsigned integers to make bitwise operations easy. - __global uint_eT1* rowptr = (__global uint_eT1*) unsorted_rowptr; - - counts[0] = 0; // holds the count of points with bit value 0 - counts[1] = 0; // holds the count of points with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - for (UWORD i = 0; i < A_n_cols; ++i) - { - ++counts[(rowptr[i * unsorted_n_rows] & mask) >> b]; - } - - counts[1] = counts[0]; // now holds the offset to put the next value at - counts[0] = 0; - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD out_index = (counts[((rowptr[in_index] & mask) >> b)]++) * sorted_n_rows; - sorted_rowptr[out_index] = val; - } - - // swap pointers (unsorted is now sorted) - __global eT1* tmp = unsorted_rowptr; - unsorted_rowptr = sorted_rowptr; - sorted_rowptr = tmp; - - UWORD tmp2 = unsorted_n_rows; - unsorted_n_rows = sorted_n_rows; - sorted_n_rows = tmp2; - } - - // If the type is unsigned, we're now done---we don't have to handle a sign bit differently. - if (!COOT_FN(coot_is_signed_,eT1)()) - { - return; - } - - // Only signed types get here. 
- // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. - __global uint_eT1* rowptr = (__global uint_eT1*) unsorted_rowptr; - counts[0] = 0; - counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - for (UWORD i = 0; i < A_n_cols; ++i) - { - ++counts[(rowptr[i * unsorted_n_rows] & mask) >> last_bit]; - } - // counts[0] now holds the number of positive points; counts[1] holds the number of negative points - - if (COOT_FN(coot_is_fp_,eT1)()) - { - counts[0] = counts[1]; // now holds the offset to put the next positive value at - counts[1] = counts[0] - 1; // now holds the offset to put the next negative value at (we move backwards) - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD bit_val = ((rowptr[in_index] & mask) >> last_bit); - const UWORD out_index = counts[bit_val] * sorted_n_rows; - const int offset = (bit_val == 1) ? -1 : 1; - counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_rowptr[out_index] = val; - } - } - else - { - counts[0] = counts[1]; // now holds the offset to put the next positive value at - counts[1] = 0; // now holds the offset to put the next negative value at - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD bit_val = ((rowptr[in_index] & mask) >> last_bit); - const UWORD out_index = (counts[bit_val]++) * sorted_n_rows; - sorted_rowptr[out_index] = val; - } - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. 
- } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_rowwise_desc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_rowwise_desc.cl deleted file mode 100644 index 626409f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_rowwise_desc.cl +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_rowwise_desc)(__global eT1* A, - const UWORD A_offset, - __global eT1* tmp_mem, - const UWORD A_n_rows, - const UWORD A_n_cols, - const UWORD A_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < A_n_rows) - { - __global eT1* unsorted_rowptr = &A[A_offset + row]; - __global eT1* sorted_rowptr = &tmp_mem[ row]; - - UWORD unsorted_n_rows = A_M_n_rows; - UWORD sorted_n_rows = A_n_rows; - - UWORD counts[2]; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = COOT_FN(coot_is_signed_,eT1)() ? (8 * sizeof(eT1) - 1) : (8 * sizeof(eT1)); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Since we are sorting bitwise, we should treat the data as unsigned integers to make bitwise operations easy. 
- __global uint_eT1* rowptr = (__global uint_eT1*) unsorted_rowptr; - - counts[0] = 0; // holds the count of points with bit value 0 - counts[1] = 0; // holds the count of points with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - for (UWORD i = 0; i < A_n_cols; ++i) - { - ++counts[(rowptr[i * unsorted_n_rows] & mask) >> b]; - } - - counts[0] = counts[1]; // now holds the offset to put the next value at - counts[1] = 0; - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD out_index = (counts[((rowptr[in_index] & mask) >> b)]++) * sorted_n_rows; - sorted_rowptr[out_index] = val; - } - - // swap pointers (unsorted is now sorted) - __global eT1* tmp = unsorted_rowptr; - unsorted_rowptr = sorted_rowptr; - sorted_rowptr = tmp; - - UWORD tmp2 = unsorted_n_rows; - unsorted_n_rows = sorted_n_rows; - sorted_n_rows = tmp2; - } - - // If the type is unsigned, we're now done---we don't have to handle a sign bit differently. - if (!COOT_FN(coot_is_signed_,eT1)()) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. 
- __global uint_eT1* rowptr = (__global uint_eT1*) unsorted_rowptr; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - if (COOT_FN(coot_is_fp_,eT1)()) - { - counts[0] = 0; // now holds the offset to put the next positive value at - counts[1] = A_n_cols - 1; // now holds the offset to put the next negative value at (we move backwards) - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD bit_val = ((rowptr[in_index] & mask) >> last_bit); - const UWORD out_index = counts[bit_val] * sorted_n_rows; - const int offset = (bit_val == 1) ? -1 : 1; - counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_rowptr[out_index] = val; - } - } - else - { - counts[0] = 0; - counts[1] = 0; - - for (UWORD i = 0; i < A_n_cols; ++i) - { - ++counts[(rowptr[i * unsorted_n_rows] & mask) >> last_bit]; - } - // counts[0] now holds the number of positive points; counts[1] holds the number of negative points - - counts[1] = counts[0]; // now holds the offset to put the next positive value at - counts[0] = 0; // now holds the offset to put the next negative value at - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD bit_val = ((rowptr[in_index] & mask) >> last_bit); - const UWORD out_index = (counts[bit_val]++) * sorted_n_rows; - sorted_rowptr[out_index] = val; - } - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. 
- } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/regspace_desc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/regspace_desc.cl deleted file mode 100644 index b3da4e3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/regspace_desc.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,regspace_desc)(__global eT1* out_mem, - const UWORD out_mem_offset, - const UWORD mem_incr, - const eT1 start, - const eT1 end, - const eT1 step, - const UWORD num) - { - const UWORD idx = get_global_id(0); - if (idx < num) - { - out_mem[out_mem_offset + idx * mem_incr] = (eT1) (start - step * idx); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/reorder_cols.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/reorder_cols.cl deleted file mode 100644 index a375123..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/reorder_cols.cl +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,reorder_cols)(__global eT1* out_mem, - const UWORD out_mem_offset, - __global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_rows, - __global const UWORD* ordering, - const UWORD out_n_cols) - { - const UWORD out_col = get_global_id(0); - if (out_col < out_n_cols) - { - const UWORD in_col = ordering[out_col]; - - __global eT1* out_colptr = out_mem + out_mem_offset + (out_col * n_rows); - const __global eT1* in_colptr = in_mem + in_mem_offset + (in_col * n_rows); - - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for (UWORD i = 0; i < n_rows; ++i) - { - out_colptr[i] = in_colptr[i]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/rotate_180.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/rotate_180.cl deleted file mode 100644 index ca571e1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/rotate_180.cl +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rotate_180)(__global eT1* out, - const UWORD out_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD n_rows, - const UWORD n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if( (row < n_rows) && (col < n_cols) ) - { - const UWORD in_index = in_offset + col * n_rows + row; - // out(i, j) = in(n_rows - i - 1, n_cols - j - 1) - // or - // out(n_rows - i - 1, n_cols - j - 1) = in(i, j) - const UWORD out_row = n_rows - row - 1; - const UWORD out_col = n_cols - col - 1; - const UWORD out_index = out_offset + out_col * n_rows + out_row; - - out[out_index] = in[in_index]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_add_offset.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_add_offset.cl deleted file mode 100644 index 32b0b71..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_add_offset.cl +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// This kernel adds workgroup-specific offsets to blocks of local memory. 
-// Specifically, workgroup i, which has t threads, adds offsets[i] to -// the range mem[i * (2 * t)] to mem[(i + 1) * (2 * t) - 1] (inclusive). -__kernel -void -COOT_FN(PREFIX,shifted_prefix_sum_add_offset)(__global eT1* mem, - const UWORD global_mem_offset, - __global const eT1* offsets, - const UWORD n_elem, - __local volatile eT1* aux_mem) - { - const UWORD local_tid = get_local_id(0); - const UWORD local_size = get_local_size(0); // will be the same across all workgroups (by calling convention), and must be a power of 2 - const UWORD group_id = get_group_id(0); - - // This workgroup is responsible for mem[group_id * (2 * local_size)] to mem[(group_id + 1) * (2 * local_size) - 1]. - const UWORD group_offset = group_id * (2 * local_size); - const UWORD local_offset = 2 * local_tid; - const UWORD mem_offset = group_offset + local_offset; - - const eT1 offset = offsets[group_id]; - - const eT1 in_val1 = (mem_offset < n_elem) ? mem[global_mem_offset + mem_offset ] : (eT1) 0; - const eT1 in_val2 = (mem_offset + 1 < n_elem) ? mem[global_mem_offset + mem_offset + 1] : (eT1) 0; - - const eT1 out_val1 = in_val1 + offset; - const eT1 out_val2 = in_val2 + offset; - - // Copy results back to memory. - if (mem_offset + 1 < n_elem) - { - mem[global_mem_offset + mem_offset ] = out_val1; - mem[global_mem_offset + mem_offset + 1] = out_val2; - } - else if (mem_offset < n_elem) - { - mem[global_mem_offset + mem_offset ] = out_val1; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_small.cl deleted file mode 100644 index de2e5b7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_small.cl +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// This kernel performs shifted prefix-sum on `mem` assuming that (2 * local group size) <= n_elem. -// It's okay if n_elem is not a power of 2. -__kernel -void -COOT_FN(PREFIX,shifted_prefix_sum_small)(__global eT1* mem, - const UWORD global_mem_offset, - const UWORD n_elem, - __local volatile eT1* aux_mem) - { - const UWORD local_tid = get_local_id(0); - const UWORD local_size = get_local_size(0); // will be the same across all workgroups (by calling convention), and must be a power of 2 - const UWORD group_id = get_group_id(0); - - // Copy relevant memory to auxiliary memory. - // This workgroup is responsible for mem[group_id * (2 * local_size)] to mem[(group_id + 1) * (2 * local_size) - 1]. - const UWORD group_offset = group_id * (2 * local_size); - const UWORD mem_offset = group_offset + 2 * local_tid; - - aux_mem[mem_offset ] = (mem_offset < n_elem) ? mem[global_mem_offset + mem_offset ] : (eT1) 0; - aux_mem[mem_offset + 1] = (mem_offset + 1 < n_elem) ? mem[global_mem_offset + mem_offset + 1] : (eT1) 0; - - UWORD offset = 1; - for (UWORD s = local_size; s > 0; s >>= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - if (local_tid < s) - { - const UWORD ai = group_offset + offset * (2 * local_tid + 1) - 1; - const UWORD bi = group_offset + offset * (2 * local_tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - } - - // Prepare for down-sweep by setting the last element to 0. 
- if (local_tid == 0) - { - aux_mem[2 * local_size - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = 1; s <= local_size; s *= 2) - { - offset >>= 1; - if (local_tid < s) - { - const UWORD ai = group_offset + offset * (2 * local_tid + 1) - 1; - const UWORD bi = group_offset + offset * (2 * local_tid + 2) - 1; - eT1 tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Copy results back to memory. - if (mem_offset + 1 < n_elem) - { - mem[global_mem_offset + mem_offset ] = aux_mem[mem_offset ]; - mem[global_mem_offset + mem_offset + 1] = aux_mem[mem_offset + 1]; - } - else if (mem_offset < n_elem) - { - mem[global_mem_offset + mem_offset ] = aux_mem[mem_offset ]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_subgroups.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_subgroups.cl deleted file mode 100644 index 7ee6b1b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_subgroups.cl +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// This kernel performs the shifted prefix-sum on each individual workgroup. 
-// This is the same as just running a regular prefix-sum kernel, except that -// `out_mem[i]` will store the total sum of elements in workgroup `i`. -// After running this, to finish prefix-sum on the entire memory, offsets for -// each workgroup need to be added. -__kernel -void -COOT_FN(PREFIX,shifted_prefix_sum_subgroups)(__global eT1* mem, - const UWORD global_mem_offset, - __global eT1* out_mem, - const UWORD n_elem, - __local volatile eT1* aux_mem) - { - const UWORD local_tid = get_local_id(0); - const UWORD local_size = get_local_size(0); // will be the same across all workgroups (by calling convention), and must be a power of 2 - const UWORD group_id = get_group_id(0); - - // Copy relevant memory to auxiliary memory. - // This workgroup is responsible for mem[group_id * (2 * local_size)] to mem[(group_id + 1) * (2 * local_size) - 1]. - const UWORD group_offset = group_id * (2 * local_size); - const UWORD local_offset = 2 * local_tid; - const UWORD mem_offset = group_offset + local_offset; - - aux_mem[local_offset ] = (mem_offset < n_elem) ? mem[global_mem_offset + mem_offset ] : (eT1) 0; - aux_mem[local_offset + 1] = (mem_offset + 1 < n_elem) ? mem[global_mem_offset + mem_offset + 1] : (eT1) 0; - - UWORD offset = 1; - for (UWORD s = local_size; s > 0; s >>= 1) - { - if (local_tid < s) - { - const UWORD ai = offset * (local_offset + 1) - 1; - const UWORD bi = offset * (local_offset + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (mem_offset + 1 < n_elem) - { - mem[global_mem_offset + mem_offset ] = aux_mem[local_offset ]; - mem[global_mem_offset + mem_offset + 1] = aux_mem[local_offset + 1]; - } - else if (mem_offset < n_elem) - { - mem[global_mem_offset + mem_offset ] = aux_mem[local_offset ]; - } - - if (local_tid == 0) - { - // Write the sum of the subarray to the output memory. - out_mem[group_id] = aux_mem[2 * local_size - 1]; - // Prepare for the downsweep. 
- aux_mem[2 * local_size - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - offset = local_size; - for (UWORD s = 1; s <= local_size; s *= 2) - { - if (local_tid < s) - { - const UWORD ai = offset * (local_offset + 1) - 1; - const UWORD bi = offset * (local_offset + 2) - 1; - eT1 tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - offset >>= 1; - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Copy results back to memory. - // The results here are the prefix-summed results for each individual - // workgroup. - if (mem_offset + 1 < n_elem) - { - mem[global_mem_offset + mem_offset ] = aux_mem[local_offset ]; - mem[global_mem_offset + mem_offset + 1] = aux_mem[local_offset + 1]; - } - else if (mem_offset < n_elem) - { - mem[global_mem_offset + mem_offset ] = aux_mem[local_offset ]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shuffle.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shuffle.cl deleted file mode 100644 index 982298f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shuffle.cl +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,shuffle)(__global eT1* out, - const UWORD out_offset, - const UWORD out_incr, /* how many eT1s to advance to get to the start of the next element to shuffle */ - const UWORD out_elem_stride, /* how many eT1s between each eT1 in each element */ - __global const eT1* in, - const UWORD in_offset, - const UWORD in_incr, - const UWORD in_elem_stride, - const UWORD n_elem, - const UWORD elems_per_elem, - const UWORD n_elem_pow2, - __global const UWORD* philox_key, - const UWORD num_bits, - __local volatile UWORD* aux_mem) - { - const UWORD tid = get_global_id(0); - - // Get our bijective shuffle location. - const UWORD in_loc = var_philox(tid, philox_key, num_bits); - - // Fill aux_mem with the indicator of whether we are out of bounds. - // Then, we'll prefix-sum it. This will tell us where to put our result. - aux_mem[tid] = (in_loc < n_elem); - - // Now, prefix-sum the auxiliary memory. - // This allows us to do the shuffle-compaction step. - UWORD offset = 1; - for (UWORD s = n_elem_pow2 / 2; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[n_elem_pow2 - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = 1; s <= n_elem_pow2 / 2; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // With the prefix sum complete, we shuffle our result into position aux_mem[tid], but only if we are a thread with a "valid" output. 
- if (in_loc < n_elem) - { - const UWORD in_addr_offset = in_offset + in_loc * in_incr; - const UWORD out_addr_offset = out_offset + aux_mem[tid] * out_incr; - - for (UWORD i = 0; i < elems_per_elem; ++i) - { - out[out_addr_offset + (i * out_elem_stride)] = in[in_addr_offset + (i * in_elem_stride)]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shuffle_large.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shuffle_large.cl deleted file mode 100644 index 4d77236..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shuffle_large.cl +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,shuffle_large)(__global eT1* out, - const UWORD out_offset, - const UWORD out_incr, /* how many eT1s to advance to get to the start of the next element to shuffle */ - const UWORD out_elem_stride, /* how many eT1s between each eT1 in each element */ - __global const eT1* in, - const UWORD in_offset, - const UWORD in_incr, - const UWORD in_elem_stride, - __global const UWORD* block_offsets, - const UWORD n_elem, - const UWORD elems_per_elem, - const UWORD n_elem_pow2, - __global const UWORD* philox_key, - const UWORD num_bits, - __local volatile UWORD* aux_mem) - { - const UWORD tid = get_global_id(0); - const UWORD local_tid = get_local_id(0); - const UWORD local_size = get_local_size(0); - - // Recompute our bijective shuffle location. - const UWORD in_loc = var_philox(tid, philox_key, num_bits); - - // We actually have to perform the up-sweep a second time, since we did not save the memory the first time. - aux_mem[local_tid] = (in_loc < n_elem); - barrier(CLK_LOCAL_MEM_FENCE); - - // Now, prefix-sum the auxiliary memory for this block. - // This allows us to do the shuffle-compaction step. 
- UWORD offset = 1; - for (UWORD s = local_size / 2; s > 0; s >>= 1) - { - if (local_tid < s) - { - const UWORD ai = offset * (2 * local_tid + 1) - 1; - const UWORD bi = offset * (2 * local_tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (local_tid == 0) - { - aux_mem[local_size - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = 1; s <= local_size / 2; s *= 2) - { - offset >>= 1; - if (local_tid < s) - { - const UWORD ai = offset * (2 * local_tid + 1) - 1; - const UWORD bi = offset * (2 * local_tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // With the prefix sum complete, we shuffle our result into position aux_mem[tid], but only if we are a thread with a "valid" output. - if (in_loc < n_elem) - { - const UWORD in_addr_offset = in_offset + in_loc * in_incr; - const UWORD out_addr_offset = out_offset + (aux_mem[local_tid] + block_offsets[get_group_id(0)]) * out_incr; - - for (UWORD i = 0; i < elems_per_elem; ++i) - { - out[out_addr_offset + (i * out_elem_stride)] = in[in_addr_offset + (i * in_elem_stride)]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/stable_radix_sort_index_asc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/stable_radix_sort_index_asc.cl deleted file mode 100644 index 0f56d64..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/stable_radix_sort_index_asc.cl +++ /dev/null @@ -1,260 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,stable_radix_sort_index_asc)(__global eT1* A, - const UWORD A_offset, - __global UWORD* A_index, - const UWORD A_index_offset, - __global eT1* tmp_mem, - __global UWORD* tmp_mem_index, - const UWORD n_elem, - __local volatile UWORD* aux_mem) - { - // The stable sort differs from the rest of our radix sorts in that we must avoid ever "reversing" point orders. - // We do this by adapting the regular radix sort to also consider the highest bit (the sign bit for signed types). - // This alleviates the need to ever unpack points in a reverse order, and so the sort is stable. - - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - // Fill tmp_mem_index with [0, 1, ..., n_elem - 1]. 
- UWORD i = start_elem; - while (i + 1 < end_elem) - { - tmp_mem_index[i] = i; - tmp_mem_index[i + 1] = i + 1; - i += 2; - } - if (i < end_elem) - { - tmp_mem_index[i] = i; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - UWORD local_counts[4]; - - __global eT1* unsorted_memptr = A + A_offset; - __global UWORD* unsorted_index_memptr = tmp_mem_index; - __global eT1* sorted_memptr = tmp_mem; - __global UWORD* sorted_index_memptr = A_index + A_index_offset; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 sign_mask = (((uint_eT1) 1) << last_bit); - - for (UWORD b = 0; b < 8 * sizeof(eT1) - 1; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. - __global uint_eT1* memptr = (__global uint_eT1*) unsorted_memptr; - - local_counts[0] = 0; // holds the count of elements with bit value 0 and sign value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 and sign value 0 - local_counts[2] = 0; // holds the count of elements with bit value 0 and sign value 1 - local_counts[3] = 0; // holds the count of elements with bit value 1 and sign value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[((memptr[i ] & mask) >> b) + ((memptr[i ] & sign_mask) >> (last_bit - 1))]; - ++local_counts[((memptr[i + 1] & mask) >> b) + ((memptr[i + 1] & sign_mask) >> (last_bit - 1))]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]; - } - - // Step 2: aggregate the counts for all threads. - // There are a couple cases here to get things in an ascending order: - // * Floating point number: [11, 10, 00, 01] - // * Unsigned integer: [00, 01, 10, 11] - // * Signed integer: [10, 11, 00, 01] - // Note that the notation "11" indicates, e.g., a point whose sign is 1 and bit value in bit b is 1. 
- // For unsigned integers, we treat the top bit as a "sign" bit even though it's not---but we choose an ordering that's still correct. - if (!COOT_FN(coot_is_signed_,eT1)()) - { - // Unsigned integer (00, 01, 10, 11) - aux_mem[tid ] = local_counts[0]; - aux_mem[tid + num_threads] = local_counts[1]; - aux_mem[tid + 2 * num_threads] = local_counts[2]; - aux_mem[tid + 3 * num_threads] = local_counts[3]; - } - else if (COOT_FN(coot_is_fp_,eT1)()) - { - // Floating-point (11, 10, 00, 01) - aux_mem[tid ] = local_counts[3]; - aux_mem[tid + num_threads] = local_counts[2]; - aux_mem[tid + 2 * num_threads] = local_counts[0]; - aux_mem[tid + 3 * num_threads] = local_counts[1]; - } - else - { - // Signed integer (10, 11, 00, 01) - aux_mem[tid ] = local_counts[2]; - aux_mem[tid + num_threads] = local_counts[3]; - aux_mem[tid + 2 * num_threads] = local_counts[0]; - aux_mem[tid + 3 * num_threads] = local_counts[1]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // Now, we must assign four sections of memory for `tid` to put its points in. - // We do this by a prefix-sum operation across all threads. - // At the end of this operation (at the beginning of Step 3): - // - // local_counts[0] indicates the first place to put a sign-0 bit-value-0 point - // local_counts[1] indicates the first place to put a sign-0 bit-value-1 point - // local_counts[2] indicates the first place to put a sign-1 bit-value-0 point - // local_counts[3] indicates the first place to put a sign-1 bit-value-1 point - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - - // Since we have auxiliary memory size of 4x the number of threads, we need to add an extra iteration where each thread handles two values. 
- const UWORD ai1 = offset * (2 * tid + 1) - 1; - const UWORD bi1 = offset * (2 * tid + 2) - 1; - aux_mem[bi1] += aux_mem[ai1]; - const UWORD ai2 = offset * (2 * (tid + num_threads) + 1) - 1; - const UWORD bi2 = offset * (2 * (tid + num_threads) + 2) - 1; - aux_mem[bi2] += aux_mem[ai2]; - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[4 * num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Since we have auxiliary memory size of 4x the number of threads, we need to add an extra iteration where each thread handles two values. - offset >>= 1; - const UWORD ai3 = offset * (2 * tid + 1) - 1; - const UWORD bi3 = offset * (2 * tid + 2) - 1; - UWORD tmp3 = aux_mem[ai3]; - aux_mem[ai3] = aux_mem[bi3]; - aux_mem[bi3] += tmp3; - - const UWORD ai4 = offset * (2 * (tid + num_threads) + 1) - 1; - const UWORD bi4 = offset * (2 * (tid + num_threads) + 2) - 1; - UWORD tmp4 = aux_mem[ai4]; - aux_mem[ai4] = aux_mem[bi4]; - aux_mem[bi4] += tmp4; - barrier(CLK_LOCAL_MEM_FENCE); - - // Step 3: move points into the correct place. 
- // There are a couple cases here to get things in an ascending order: - // * Floating point number: [11, 10, 00, 01] - // * Unsigned integer: [00, 01, 10, 11] - // * Signed integer: [10, 11, 00, 01] - if (!COOT_FN(coot_is_signed_,eT1)()) - { - // Unsigned integer (00, 01, 10, 11) - local_counts[0] = aux_mem[tid ]; - local_counts[1] = aux_mem[tid + num_threads]; - local_counts[2] = aux_mem[tid + 2 * num_threads]; - local_counts[3] = aux_mem[tid + 3 * num_threads]; - } - else if (COOT_FN(coot_is_fp_,eT1)()) - { - // Floating-point (11, 10, 00, 01) - local_counts[0] = aux_mem[tid + 2 * num_threads]; - local_counts[1] = aux_mem[tid + 3 * num_threads]; - local_counts[2] = aux_mem[tid + num_threads]; - local_counts[3] = aux_mem[tid ]; - } - else - { - // Signed integer (10, 11, 00, 01) - local_counts[0] = aux_mem[tid + 2 * num_threads]; - local_counts[1] = aux_mem[tid + 3 * num_threads]; - local_counts[2] = aux_mem[tid ]; - local_counts[3] = aux_mem[tid + num_threads]; - } - - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b) + ((memptr[i + 1] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - - // Now swap pointers. 
- __global eT1* tmp = unsorted_memptr; - __global UWORD* tmp_index = unsorted_index_memptr; - unsorted_memptr = sorted_memptr; - unsorted_index_memptr = sorted_index_memptr; - sorted_memptr = tmp; - sorted_index_memptr = tmp_index; - - barrier(CLK_GLOBAL_MEM_FENCE); - } - - // Since we did an odd number of iterations, the result is stored in A_index. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/stable_radix_sort_index_desc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/stable_radix_sort_index_desc.cl deleted file mode 100644 index 713574c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/stable_radix_sort_index_desc.cl +++ /dev/null @@ -1,260 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,stable_radix_sort_index_desc)(__global eT1* A, - const UWORD A_offset, - __global UWORD* A_index, - const UWORD A_index_offset, - __global eT1* tmp_mem, - __global UWORD* tmp_mem_index, - const UWORD n_elem, - __local volatile UWORD* aux_mem) - { - // The stable sort differs from the rest of our radix sorts in that we must avoid ever "reversing" point orders. - // We do this by adapting the regular radix sort to also consider the highest bit (the sign bit for signed types). 
- // This alleviates the need to ever unpack points in a reverse order, and so the sort is stable. - - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - // Fill tmp_mem_index with [0, 1, ..., n_elem - 1]. - UWORD i = start_elem; - while (i + 1 < end_elem) - { - tmp_mem_index[i] = i; - tmp_mem_index[i + 1] = i + 1; - i += 2; - } - if (i < end_elem) - { - tmp_mem_index[i] = i; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - UWORD local_counts[4]; - - __global eT1* unsorted_memptr = A + A_offset; - __global UWORD* unsorted_index_memptr = tmp_mem_index; - __global eT1* sorted_memptr = tmp_mem; - __global UWORD* sorted_index_memptr = A_index + A_index_offset; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 sign_mask = (((uint_eT1) 1) << last_bit); - - for (UWORD b = 0; b < 8 * sizeof(eT1) - 1; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. 
- __global uint_eT1* memptr = (__global uint_eT1*) unsorted_memptr; - - local_counts[0] = 0; // holds the count of elements with bit value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 - local_counts[2] = 0; // holds the count of elements with bit value 0 and sign value 1 - local_counts[3] = 0; // holds the count of elements with bit value 1 and sign value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[((memptr[i ] & mask) >> b) + ((memptr[i ] & sign_mask) >> (last_bit - 1))]; - ++local_counts[((memptr[i + 1] & mask) >> b) + ((memptr[i + 1] & sign_mask) >> (last_bit - 1))]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]; - } - - // Step 2: aggregate the counts for all threads. - // There are a couple cases here to get things in a descending order: - // * Floating point number: [01, 00, 10, 11] - // * Unsigned integer: [11, 10, 01, 00] - // * Signed integer: [01, 00, 11, 10] - // Note that the notation "11" indicates, e.g., a point whose sign is 1 and bit value in bit b is 1. - // For unsigned integers, we treat the top bit as a "sign" bit even though it's not---but we choose an ordering that's still correct. 
- if (!COOT_FN(coot_is_signed_,eT1)()) - { - // Unsigned integer (11, 10, 01, 00) - aux_mem[tid ] = local_counts[3]; - aux_mem[tid + num_threads] = local_counts[2]; - aux_mem[tid + 2 * num_threads] = local_counts[1]; - aux_mem[tid + 3 * num_threads] = local_counts[0]; - } - else if (COOT_FN(coot_is_fp_,eT1)()) - { - // Floating-point (01, 00, 10, 11) - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - aux_mem[tid + 2 * num_threads] = local_counts[2]; - aux_mem[tid + 3 * num_threads] = local_counts[3]; - } - else - { - // Signed integer (01, 00, 11, 10) - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - aux_mem[tid + 2 * num_threads] = local_counts[3]; - aux_mem[tid + 3 * num_threads] = local_counts[2]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // Now, we must assign four sections of memory for `tid` to put its points in. - // We do this by a prefix-sum operation across all threads. - // At the end of this operation (at the beginning of Step 3): - // - // local_counts[0] indicates the first place to put a sign-0 bit-value-0 point - // local_counts[1] indicates the first place to put a sign-0 bit-value-1 point - // local_counts[2] indicates the first place to put a sign-1 bit-value-0 point - // local_counts[3] indicates the first place to put a sign-1 bit-value-1 point - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - - // Since we have auxiliary memory size of 4x the number of threads, we need to add an extra iteration where each thread handles two values. 
- const UWORD ai1 = offset * (2 * tid + 1) - 1; - const UWORD bi1 = offset * (2 * tid + 2) - 1; - aux_mem[bi1] += aux_mem[ai1]; - const UWORD ai2 = offset * (2 * (tid + num_threads) + 1) - 1; - const UWORD bi2 = offset * (2 * (tid + num_threads) + 2) - 1; - aux_mem[bi2] += aux_mem[ai2]; - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[4 * num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Since we have auxiliary memory size of 4x the number of threads, we need to add an extra iteration where each thread handles two values. - offset >>= 1; - const UWORD ai3 = offset * (2 * tid + 1) - 1; - const UWORD bi3 = offset * (2 * tid + 2) - 1; - UWORD tmp3 = aux_mem[ai3]; - aux_mem[ai3] = aux_mem[bi3]; - aux_mem[bi3] += tmp3; - - const UWORD ai4 = offset * (2 * (tid + num_threads) + 1) - 1; - const UWORD bi4 = offset * (2 * (tid + num_threads) + 2) - 1; - UWORD tmp4 = aux_mem[ai4]; - aux_mem[ai4] = aux_mem[bi4]; - aux_mem[bi4] += tmp4; - barrier(CLK_LOCAL_MEM_FENCE); - - // Step 3: move points into the correct place. 
- // There are a couple cases here to get things in a descending order: - // * Floating point number: [01, 00, 10, 11] - // * Unsigned integer: [11, 10, 01, 00] - // * Signed integer: [01, 00, 11, 10] - if (!COOT_FN(coot_is_signed_,eT1)()) - { - // Unsigned integer (11, 10, 01, 00) - local_counts[0] = aux_mem[tid + 3 * num_threads]; - local_counts[1] = aux_mem[tid + 2 * num_threads]; - local_counts[2] = aux_mem[tid + num_threads]; - local_counts[3] = aux_mem[tid ]; - } - else if (COOT_FN(coot_is_fp_,eT1)()) - { - // Floating-point (01, 00, 10, 11) - local_counts[0] = aux_mem[tid + num_threads]; - local_counts[1] = aux_mem[tid ]; - local_counts[2] = aux_mem[tid + 2 * num_threads]; - local_counts[3] = aux_mem[tid + 3 * num_threads]; - } - else - { - // Signed integer (01, 00, 11, 10) - local_counts[0] = aux_mem[tid + num_threads]; - local_counts[1] = aux_mem[tid ]; - local_counts[2] = aux_mem[tid + 3 * num_threads]; - local_counts[3] = aux_mem[tid + 2 * num_threads]; - } - - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b) + ((memptr[i + 1] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - - // Now swap pointers. 
- __global eT1* tmp = unsorted_memptr; - __global UWORD* tmp_index = unsorted_index_memptr; - unsorted_memptr = sorted_memptr; - unsorted_index_memptr = sorted_index_memptr; - sorted_memptr = tmp; - sorted_index_memptr = tmp_index; - - barrier(CLK_GLOBAL_MEM_FENCE); - } - - // Since we did an odd number of iterations, the result is stored in A_index. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/submat_var.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/submat_var.cl deleted file mode 100644 index 2ad0fa8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/submat_var.cl +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,submat_var)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem, - const eT1 mean_val, - const UWORD in_n_rows, - const UWORD start_row, - const UWORD start_col, - const UWORD sub_n_rows, - const UWORD sub_n_cols) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const UWORD col1 = (i ) / sub_n_rows; - const UWORD col2 = (i + get_local_size(0)) / sub_n_rows; - const UWORD row1 = (i ) % sub_n_rows; - const UWORD row2 = (i + get_local_size(0)) % sub_n_rows; - const UWORD index1 = (col1 + start_col) * in_n_rows + (row1 + start_row); - const UWORD index2 = (col2 + start_col) * in_n_rows + (row2 + start_row); - - const eT1 val1 = (in_mem[in_mem_offset + index1] - mean_val); - const eT1 val2 = (in_mem[in_mem_offset + index2] - mean_val); - aux_mem[tid] += (val1 * val1) + (val2 * val2); - i += grid_size; - } - if (i < n_elem) - { - const UWORD col = i / sub_n_rows; - const UWORD row = i % sub_n_rows; - const UWORD index = (col + start_col) * in_n_rows + (row + start_row); - - const eT1 val = (in_mem[in_mem_offset + index] - mean_val); - aux_mem[tid] += (val * val); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,accu_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/submat_var_small.cl 
b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/submat_var_small.cl deleted file mode 100644 index d450bf5..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/submat_var_small.cl +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,submat_var_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem, - const eT1 mean_val, - const UWORD in_n_rows, - const UWORD start_row, - const UWORD start_col, - const UWORD sub_n_rows, - const UWORD sub_n_cols) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const UWORD col1 = (i ) / sub_n_rows; - const UWORD col2 = (i + get_local_size(0)) / sub_n_rows; - const UWORD row1 = (i ) % sub_n_rows; - const UWORD row2 = (i + get_local_size(0)) % sub_n_rows; - const UWORD index1 = (col1 + start_col) * in_n_rows + (row1 + start_row); - const UWORD index2 = (col2 + start_col) * in_n_rows + (row2 + start_row); - - const eT1 val1 = (in_mem[in_mem_offset + index1] - mean_val); - const eT1 val2 = (in_mem[in_mem_offset + 
index2] - mean_val); - aux_mem[tid] += (val1 * val1) + (val2 * val2); - i += grid_size; - } - if (i < n_elem) - { - const UWORD col = i / sub_n_rows; - const UWORD row = i % sub_n_rows; - const UWORD index = (col + start_col) * in_n_rows + (row + start_row); - - const eT1 val = (in_mem[in_mem_offset + index] - mean_val); - aux_mem[tid] += (val * val); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/symmatl_inplace.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/symmatl_inplace.cl deleted file mode 100644 index bd49f1e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/symmatl_inplace.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,symmatl_inplace)(__global eT1* out, - const UWORD out_offset, - const UWORD size) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < size && col < size && row > col) - { - const eT1 val = out[out_offset + row + size * col]; - - // only need to copy to the upper triangle for the in-place version - out[out_offset + col + size * row] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/symmatu_inplace.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/symmatu_inplace.cl deleted file mode 100644 index 35e6566..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/symmatu_inplace.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,symmatu_inplace)(__global eT1* out, - const UWORD out_offset, - const UWORD size) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < size && col < size && col > row) - { - const eT1 val = out[out_offset + row + size * col]; - - // only need to copy to the lower triangle for the in-place version - out[out_offset + col + size * row] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/trace.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/trace.cl deleted file mode 100644 index e1dcf4a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/trace.cl +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - -__kernel -void -COOT_FN(PREFIX,trace)(__global eT1* out, - __global const eT1* A, - const UWORD A_offset, - const UWORD n_rows, - const UWORD N) - { - const UWORD id = get_global_id(0); - if(id == 0) - { - eT1 acc = (eT1)(0); - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=0; i SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,accu_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_colwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_colwise.cl deleted file mode 100644 index 3e03607..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_colwise.cl +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,var_colwise)(__global eT1* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - __global const eT1* src_means, - const UWORD src_means_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD norm_correction, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows, - const UWORD src_means_mem_incr) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - const __global eT1* colptr = &(src[src_offset + col * src_M_n_rows]); - const eT1 mean_val = src_means[src_means_offset + col * src_means_mem_incr]; - eT1 acc = (eT1) (0); - for (UWORD i = 0; i < n_rows; ++i) - { - eT1 val = (colptr[i] - mean_val); - acc += (val * val); - } - - dest[dest_offset + col * dest_mem_incr] = (acc / (eT1) (n_rows - norm_correction)); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_rowwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_rowwise.cl deleted file mode 100644 index c79b49f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_rowwise.cl +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,var_rowwise)(__global eT1* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - __global const eT1* src_means, - const UWORD src_means_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD norm_correction, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows, - const UWORD src_means_mem_incr) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT1 acc = (eT1)(0); - const eT1 mean_val = src_means[src_means_offset + row * src_means_mem_incr]; - for (UWORD i = 0; i < n_cols; ++i) - { - const eT1 val = (src[src_offset + (i * src_M_n_rows) + row] - mean_val); - acc += (val * val); - } - - dest[dest_offset + row * dest_mem_incr] = (acc / (eT1) (n_cols - norm_correction)); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_small.cl deleted file mode 100644 index 55710ee..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_small.cl +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,var_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem, - const eT1 mean_val) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 val1 = (in_mem[in_mem_offset + i] - mean_val); - const eT1 val2 = (in_mem[in_mem_offset + i + get_local_size(0)] - mean_val); - aux_mem[tid] += (val1 * val1) + (val2 * val2); - i += grid_size; - } - if (i < n_elem) - { - const eT1 val = (in_mem[in_mem_offset + i] - mean_val); - aux_mem[tid] += (val * val); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/and_reduce.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/and_reduce.cl deleted file mode 100644 index 251e9e6..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/and_reduce.cl +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -void -COOT_FN(PREFIX,and_subgroup_reduce_other)(__local volatile eT1* data, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - data[tid] &= data[tid + i]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -COOT_FN(PREFIX,and_subgroup_reduce_8)(__local volatile eT1* data, UWORD tid) - { - data[tid] &=data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &=data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &=data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &=data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,and_subgroup_reduce_16)(__local volatile eT1* data, UWORD tid) - { - data[tid] &= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,and_subgroup_reduce_32)(__local volatile eT1* data, UWORD tid) - { - data[tid] &= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,and_subgroup_reduce_64)(__local volatile eT1* data, UWORD tid) - { - data[tid] &= data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid 
+ 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 1]; - } - - - -void COOT_FN(PREFIX,and_subgroup_reduce_128)(__local volatile eT1* data, UWORD tid) - { - data[tid] &= data[tid + 128]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 1]; - } - - - -__kernel -void -COOT_FN(PREFIX,and_reduce)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. 
- aux_mem[tid] = ~((eT1) 0); - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] &= in_mem[in_mem_offset + i]; - aux_mem[tid] &= in_mem[in_mem_offset + i + get_local_size(0)]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] &= in_mem[in_mem_offset + i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,and_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/and_reduce_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/and_reduce_small.cl deleted file mode 100644 index 20df12a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/and_reduce_small.cl +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,and_reduce_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = ~((eT1) 0); - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] &= in_mem[in_mem_offset + i]; - aux_mem[tid] &= in_mem[in_mem_offset + i + get_local_size(0)]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] &= in_mem[in_mem_offset + i]; - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/ipiv_det.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/ipiv_det.cl deleted file mode 100644 index f5842ff..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/ipiv_det.cl +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,ipiv_det)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - // This kernel is not used by the OpenCL backend, so we leave it empty! - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/ipiv_det_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/ipiv_det_small.cl deleted file mode 100644 index 210b7f9..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/ipiv_det_small.cl +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,ipiv_det_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - // This kernel is not used by the OpenCL backend, so we leave it empty! 
- } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/or_reduce.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/or_reduce.cl deleted file mode 100644 index 53d3c75..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/or_reduce.cl +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -void -COOT_FN(PREFIX,or_subgroup_reduce_other)(__local volatile eT1* data, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - data[tid] |= data[tid + i]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -COOT_FN(PREFIX,or_subgroup_reduce_8)(__local volatile eT1* data, UWORD tid) - { - data[tid] |= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,or_subgroup_reduce_16)(__local volatile eT1* data, UWORD tid) - { - data[tid] |= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] 
|= data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,or_subgroup_reduce_32)(__local volatile eT1* data, UWORD tid) - { - data[tid] |= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,or_subgroup_reduce_64)(__local volatile eT1* data, UWORD tid) - { - data[tid] |= data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 1]; - } - - - -void COOT_FN(PREFIX,or_subgroup_reduce_128)(__local volatile eT1* data, UWORD tid) - { - data[tid] |= data[tid + 128]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 1]; - } - - - -__kernel -void -COOT_FN(PREFIX,or_reduce)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) 
* 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = (eT1) 0; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] |= in_mem[in_mem_offset + i]; - aux_mem[tid] |= in_mem[in_mem_offset + i + get_local_size(0)]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] |= in_mem[in_mem_offset + i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,or_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/or_reduce_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/or_reduce_small.cl deleted file mode 100644 index 041662f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/or_reduce_small.cl +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,or_reduce_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = (eT1) 0; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] |= in_mem[in_mem_offset + i]; - aux_mem[tid] |= in_mem[in_mem_offset + i + get_local_size(0)]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] |= in_mem[in_mem_offset + i]; - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/diag_prod.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/diag_prod.cl deleted file mode 100644 index 792e738..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/diag_prod.cl +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// Forward declarations we may need. -void COOT_FN(PREFIX,prod_subgroup_reduce_other)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,prod_subgroup_reduce_8)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,prod_subgroup_reduce_16)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,prod_subgroup_reduce_32)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,prod_subgroup_reduce_64)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,prod_subgroup_reduce_128)(__local volatile eT1* data, UWORD tid); - - - -// Compute the product of the elements on the diagonal of a matrix. -__kernel -void -COOT_FN(PREFIX,diag_prod)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_rows, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 1; - - while (i + get_local_size(0) < n_rows) - { - const UWORD index1 = i * n_rows + i; - const eT1 v1 = in_mem[in_mem_offset + index1]; - const UWORD index2 = (i + get_local_size(0)) * n_rows + (i + get_local_size(0)); - const eT1 v2 = in_mem[in_mem_offset + index2]; - aux_mem[tid] *= v1 * v2; - i += grid_size; - } - if (i < n_rows) - { - const UWORD index = i * n_rows + i; - const eT1 v = in_mem[in_mem_offset + index]; - aux_mem[tid] *= v; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] *= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,prod_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid 
== 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/diag_prod_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/diag_prod_small.cl deleted file mode 100644 index 4d1067c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/diag_prod_small.cl +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Compute the product of the elements on the diagonal of a matrix. 
-__kernel -void -COOT_FN(PREFIX,diag_prod_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_rows, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 1; - - while (i + get_local_size(0) < n_rows) - { - const UWORD index1 = i * n_rows + i; - const eT1 v1 = in_mem[in_mem_offset + index1]; - const UWORD index2 = (i + get_local_size(0)) * n_rows + (i + get_local_size(0)); - const eT1 v2 = in_mem[in_mem_offset + index2]; - aux_mem[tid] *= v1 * v2; - i += grid_size; - } - if (i < n_rows) - { - const UWORD index = i * n_rows + i; - const eT1 v = in_mem[in_mem_offset + index]; - aux_mem[tid] *= v; - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] *= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/extract_cx.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/extract_cx.cl deleted file mode 100644 index a343916..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/extract_cx.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Extract real or imaginary elements from a complex matrix into a real matrix. -// This kernel is a bit of a hack until we have actual complex matrix support! -__kernel -void -COOT_FN(PREFIX,extract_cx)(__global const eT1* in_mem, - const UWORD in_mem_offset, - __global eT1* out_mem, - const UWORD out_mem_offset, - const UWORD real_or_imag, - const UWORD n_rows, - const UWORD n_cols, - const UWORD in_M_n_rows, - const UWORD out_M_n_rows) - { - // If real_or_imag is 0, we extract the real part. If 1, we extract the - // imaginary part. - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD in_index = 2 * (col * in_M_n_rows + row) + real_or_imag; - const UWORD out_index = col * out_M_n_rows + row; - - if (col < n_cols && row < n_rows) - { - out_mem[out_mem_offset + out_index] = in_mem[in_mem_offset + in_index]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_l.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_l.cl deleted file mode 100644 index 69f4e85..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_l.cl +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// This extracts L and U from in, and sets the lower diagonal of U to 0. -// It's okay if U == in, if n_rows <= n_cols. -__kernel -void -COOT_FN(PREFIX,lu_extract_l)(__global eT1* L, - const UWORD L_offset, - __global eT1* U, - const UWORD U_offset, - const __global eT1* in, - const UWORD in_offset, - const UWORD n_rows, - const UWORD n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - // Note that neither U nor L must be square. - // L has size n_rows x min(n_rows, n_cols). - // U has size min(n_rows, n_cols) x n_cols. - const UWORD min_rows_cols = min(n_rows, n_cols); - - const UWORD in_index = row + n_rows * col; // this is also L_out_index - const UWORD U_out_index = row + min_rows_cols * col; - - if ((row < n_rows) && (col < min_rows_cols)) - { - L[L_offset + in_index] = (row > col) ? in[in_offset + in_index] : ((row == col) ? 1 : 0); - } - - if ((row < min_rows_cols) && (col < n_cols)) - { - U[U_offset + U_out_index] = (row > col) ? 0 : in[in_offset + in_index]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_p.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_p.cl deleted file mode 100644 index c1a4c2b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_p.cl +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,lu_extract_p)(__global eT1* P, - const UWORD P_offset, - __global const UWORD* ipiv2, - const UWORD n_rows) - { - const UWORD row = get_global_id(0); - - if (row < n_rows) - { - const UWORD index = row + ipiv2[row] * n_rows; - P[P_offset + index] = (eT1) 1; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_pivoted_l.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_pivoted_l.cl deleted file mode 100644 index c81a122..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_pivoted_l.cl +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This extracts L from U, and sets the lower diagonal of U to 0. -__kernel -void -COOT_FN(PREFIX,lu_extract_pivoted_l)(__global eT1* L, - const UWORD L_offset, - __global eT1* U, - const UWORD U_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD n_rows, - const UWORD n_cols, - __global const UWORD* ipiv) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - // Note that neither U nor L must be square. 
- // L has size n_rows x min(n_rows, n_cols). - // U has size min(n_rows, n_cols) x n_cols. - const UWORD min_rows_cols = min(n_rows, n_cols); - - const UWORD in_index = row + n_rows * col; // this is also L_out_index - const UWORD U_out_index = row + min_rows_cols * col; - - // We are extracted a permuted version of L. - // Instead of extracting row i of U as row i of L, - // we extract row i of U as row ipiv[i] of L. - const UWORD L_out_index = ipiv[row] + n_rows * col; - - if ((row < n_rows) && (col < min_rows_cols)) - { - L[L_offset + L_out_index] = (row > col) ? in[in_offset + in_index] : ((row == col) ? 1 : 0); - } - - if ((row < min_rows_cols) && (col < n_cols)) - { - U[U_offset + U_out_index] = (row > col) ? 0 : in[in_offset + in_index]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_inf.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_inf.cl deleted file mode 100644 index 8c2a9a3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_inf.cl +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_any_inf)(__global const eT1* X, - const UWORD X_offset, - const UWORD n_elem, - __global uint* out, - const UWORD out_offset, - __local volatile uint* aux_mem, - const eT1 val /* ignored */) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT1 val2 = X[X_offset + i + get_local_size(0)]; - - aux_mem[tid] |= isinf(val1); - aux_mem[tid] |= isinf(val2); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[X_offset + i]; - aux_mem[tid] |= isinf(val1); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN(COOT_FN(or_subgroup_reduce_,SUBGROUP_SIZE_NAME),_u32)(aux_mem, tid); - } - - if (tid == 0) - { - out[out_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_inf_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_inf_small.cl deleted file mode 100644 index 7baa213..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_inf_small.cl +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_any_inf_small)(__global const eT1* X, - const UWORD X_offset, - const UWORD n_elem, - __global uint* out, - const UWORD out_offset, - __local volatile uint* aux_mem, - const eT1 val /* ignored */) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT1 val2 = X[X_offset + i + get_local_size(0)]; - - aux_mem[tid] |= isinf(val1); - aux_mem[tid] |= isinf(val2); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[X_offset + i]; - aux_mem[tid] |= isinf(val1); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out[out_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nan.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nan.cl deleted file mode 100644 index 640d8ad..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nan.cl +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// 
you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_any_nan)(__global const eT1* X, - const UWORD X_offset, - const UWORD n_elem, - __global uint* out, - const UWORD out_offset, - __local volatile uint* aux_mem, - const eT1 val /* ignored */) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT1 val2 = X[X_offset + i + get_local_size(0)]; - - aux_mem[tid] |= isnan(val1); - aux_mem[tid] |= isnan(val2); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[X_offset + i]; - aux_mem[tid] |= isnan(val1); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN(COOT_FN(or_subgroup_reduce_,SUBGROUP_SIZE_NAME),_u32)(aux_mem, tid); - } - - if (tid == 0) - { - out[out_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nan_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nan_small.cl deleted file mode 100644 index 6ccadc9..0000000 --- 
a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nan_small.cl +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_any_nan_small)(__global const eT1* X, - const UWORD X_offset, - const UWORD n_elem, - __global uint* out, - const UWORD out_offset, - __local volatile uint* aux_mem, - const eT1 val /* ignored */) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT1 val2 = X[X_offset + i + get_local_size(0)]; - - aux_mem[tid] |= isnan(val1); - aux_mem[tid] |= isnan(val2); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[X_offset + i]; - aux_mem[tid] |= isnan(val1); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out[out_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nonfinite.cl 
b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nonfinite.cl deleted file mode 100644 index 4588da3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nonfinite.cl +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_any_nonfinite)(__global const eT1* X, - const UWORD X_offset, - const UWORD n_elem, - __global uint* out, - const UWORD out_offset, - __local volatile uint* aux_mem, - const eT1 val /* ignored */) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT1 val2 = X[X_offset + i + get_local_size(0)]; - - aux_mem[tid] |= isnan(val1) | isinf(val1); - aux_mem[tid] |= isnan(val2) | isinf(val2); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[X_offset + i]; - aux_mem[tid] |= isnan(val1) | isinf(val1); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < 
SUBGROUP_SIZE) - { - COOT_FN(COOT_FN(or_subgroup_reduce_,SUBGROUP_SIZE_NAME),_u32)(aux_mem, tid); - } - - if (tid == 0) - { - out[out_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nonfinite_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nonfinite_small.cl deleted file mode 100644 index 48c7b86..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nonfinite_small.cl +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_any_nonfinite_small)(__global const eT1* X, - const UWORD X_offset, - const UWORD n_elem, - __global uint* out, - const UWORD out_offset, - __local volatile uint* aux_mem, - const eT1 val /* ignored */) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT1 val2 = X[X_offset + i + get_local_size(0)]; - - aux_mem[tid] |= isnan(val1) | isinf(val1); - aux_mem[tid] |= isnan(val2) | isinf(val2); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[X_offset + i]; - aux_mem[tid] |= isnan(val1) | isinf(val1); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out[out_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isfinite.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isfinite.cl deleted file mode 100644 index 64b470b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isfinite.cl +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_isfinite)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - if (i < n_elem) - { - const eT1 val = (eT1) X[X_offset + i]; - out[out_offset + i] = isfinite(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isnan.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isnan.cl deleted file mode 100644 index 437fcb1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isnan.cl +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_isnan)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - if (i < n_elem) - { - const eT1 val1 = (eT1) X[X_offset + i]; - out[out_offset + i] = isnan(val1); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isnonfinite.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isnonfinite.cl deleted file mode 100644 index 582fbe4..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isnonfinite.cl +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_isnonfinite)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - if (i < n_elem) - { - const eT1 val = (eT1) X[X_offset + i]; - out[out_offset + i] = !isfinite(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_1.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_1.cl deleted file mode 100644 index 1177ea0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_1.cl +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// Forward declarations we may need. 
-void COOT_FN(PREFIX,accu_subgroup_reduce_other)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,accu_subgroup_reduce_8)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,accu_subgroup_reduce_16)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,accu_subgroup_reduce_32)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,accu_subgroup_reduce_64)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,accu_subgroup_reduce_128)(__local volatile eT1* data, UWORD tid); - - - -__kernel -void -COOT_FN(PREFIX,vec_norm_1)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] += ET1_ABS(in_mem[in_mem_offset + i]) + ET1_ABS(in_mem[in_mem_offset + i + get_local_size(0)]); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] += ET1_ABS(in_mem[in_mem_offset + i]); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,accu_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_1_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_1_small.cl deleted file mode 100644 index b0a175d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_1_small.cl +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed 
under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,vec_norm_1_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] += ET1_ABS(in_mem[in_mem_offset + i]) + ET1_ABS(in_mem[in_mem_offset + i + get_local_size(0)]); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] += ET1_ABS(in_mem[in_mem_offset + i]); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2.cl deleted file mode 100644 index 4587b92..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2.cl +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you 
may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,vec_norm_2)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 v1 = (in_mem[in_mem_offset + i] * in_mem[in_mem_offset + i]); - const eT1 v2 = (in_mem[in_mem_offset + i + get_local_size(0)] * in_mem[in_mem_offset + i + get_local_size(0)]); - aux_mem[tid] += v1 + v2; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] += (in_mem[in_mem_offset + i] * in_mem[in_mem_offset + i]); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,accu_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_robust.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_robust.cl deleted file mode 100644 index 20ef94e..0000000 --- 
a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_robust.cl +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,vec_norm_2_robust)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem, - const eT1 max_val) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 v1 = (in_mem[in_mem_offset + i] / max_val); - const eT1 v2 = (in_mem[in_mem_offset + i + get_local_size(0)] / max_val); - aux_mem[tid] += (v1 * v1) + (v2 * v2); - i += grid_size; - } - if (i < n_elem) - { - const eT1 v1 = (in_mem[in_mem_offset + i] / max_val); - aux_mem[tid] += (v1 * v1); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,accu_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } 
diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_robust_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_robust_small.cl deleted file mode 100644 index 11de83d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_robust_small.cl +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,vec_norm_2_robust_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem, - const eT1 max_val) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 v1 = (in_mem[in_mem_offset + i] / max_val); - const eT1 v2 = (in_mem[in_mem_offset + i + get_local_size(0)] / max_val); - aux_mem[tid] += (v1 * v1) + (v2 * v2); - i += grid_size; - } - if (i < n_elem) - { - const eT1 v1 = (in_mem[in_mem_offset + i] / max_val); - aux_mem[tid] += (v1 + v1); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] += 
aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_small.cl deleted file mode 100644 index 7ea93a0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_small.cl +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,vec_norm_2_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 v1 = (in_mem[in_mem_offset + i] * in_mem[in_mem_offset + i]); - const eT1 v2 = (in_mem[in_mem_offset + i + get_local_size(0)] * in_mem[in_mem_offset + i + get_local_size(0)]); - aux_mem[tid] += v1 + v2; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] += (in_mem[in_mem_offset + i] * in_mem[in_mem_offset + i]); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_k.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_k.cl deleted file mode 100644 index 54bd953..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_k.cl +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,vec_norm_k)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem, - const UWORD k) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 v1 = pow(in_mem[in_mem_offset + i], (eT1) k); - const eT1 v2 = pow(in_mem[in_mem_offset + i + get_local_size(0)], (eT1) k); - aux_mem[tid] += v1 + v2; - i += grid_size; - } - if (i < n_elem) - { - const eT1 v1 = pow(in_mem[in_mem_offset + i], (eT1) k); - aux_mem[tid] += v1; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,accu_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_k_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_k_small.cl deleted file mode 100644 index b7e0a3c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_k_small.cl +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,vec_norm_k_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem, - const UWORD k) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 v1 = pow(in_mem[in_mem_offset + i], (eT1) k); - const eT1 v2 = pow(in_mem[in_mem_offset + i + get_local_size(0)], (eT1) k); - aux_mem[tid] += v1 + v2; - i += grid_size; - } - if (i < n_elem) - { - const eT1 v1 = pow(in_mem[in_mem_offset + i], (eT1) k); - aux_mem[tid] += v1; - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_min.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_min.cl deleted file mode 100644 index 888fcb8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_min.cl +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file 
except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Forward declarations we may need. -void COOT_FN(PREFIX,min_subgroup_reduce_other)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,min_subgroup_reduce_8)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,min_subgroup_reduce_16)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,min_subgroup_reduce_32)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,min_subgroup_reduce_64)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,min_subgroup_reduce_128)(__local volatile eT1* data, UWORD tid); - - - -__kernel -void -COOT_FN(PREFIX,vec_norm_min)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. 
- aux_mem[tid] = COOT_FN(coot_type_min_,eT1)(); - - if (i < n_elem) - { - aux_mem[tid] = ET1_ABS(in_mem[in_mem_offset + i]); - } - if (i + get_local_size(0) < n_elem) - { - const eT1 v = ET1_ABS(in_mem[in_mem_offset + i + get_local_size(0)]); - aux_mem[tid] = min(aux_mem[tid], v); - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - const eT1 v = min(ET1_ABS(in_mem[in_mem_offset + i]), ET1_ABS(in_mem[in_mem_offset + i + get_local_size(0)])); - aux_mem[tid] = min(aux_mem[tid], v); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], ET1_ABS(in_mem[in_mem_offset + i])); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = min(aux_mem[tid], aux_mem[tid + s]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,min_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_min_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_min_small.cl deleted file mode 100644 index 1639e1f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_min_small.cl +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,vec_norm_min_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = COOT_FN(coot_type_min_,eT1)(); - - if (i < n_elem) - { - aux_mem[tid] = ET1_ABS(in_mem[in_mem_offset + i]); - } - if (i + get_local_size(0) < n_elem) - { - const eT1 v = ET1_ABS(in_mem[in_mem_offset + i + get_local_size(0)]); - aux_mem[tid] = min(aux_mem[tid], v); - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - const eT1 v = min(ET1_ABS(in_mem[in_mem_offset + i]), ET1_ABS(in_mem[in_mem_offset + i + get_local_size(0)])); - aux_mem[tid] = min(aux_mem[tid], v); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], ET1_ABS(in_mem[in_mem_offset + i])); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] = min(aux_mem[tid], aux_mem[tid + s]); - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_div_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_div_post.cl deleted file mode 100644 index 7c371bd..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_div_post.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache 
License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_div_post)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_offset + out_loc] = out_src[out_src_offset + out_src_loc] / ((eT2) in[in_offset + in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_div_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_div_pre.cl deleted file mode 100644 index 718d1ef..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_div_pre.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the 
"License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_div_pre)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_offset + out_loc] = ((eT2) in[in_offset + in_loc]) / out_src[out_src_offset + out_src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_minus_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_minus_post.cl deleted file mode 100644 index 243d5ea..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_minus_post.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may 
not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_minus_post)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_offset + out_loc] = out_src[out_src_offset + out_src_loc] - ((eT2) in[in_offset + in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_minus_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_minus_pre.cl deleted file mode 100644 index 50fd41e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_minus_pre.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file 
except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_minus_pre)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_offset + out_loc] = ((eT2) in[in_offset + in_loc]) - out_src[out_src_offset + out_src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_plus.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_plus.cl deleted file mode 100644 index dcb6e45..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_plus.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_plus)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_offset + out_loc] = out_src[out_src_offset + out_src_loc] + ((eT2) in[in_offset + in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_schur.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_schur.cl deleted file mode 100644 index 1fd0953..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_schur.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_schur)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_offset + out_loc] = out_src[out_src_offset + out_src_loc] * ((eT2) in[in_offset + in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_set.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_set.cl deleted file mode 100644 index 18189a1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_set.cl +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_set)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_offset + out_loc] = ((eT2) in[in_offset + in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_div_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_div_post.cl deleted file mode 100644 index 054f937..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_div_post.cl +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_subset_div_post)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - __global const UWORD* indices, - const UWORD indices_offset, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? 
- indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_offset + out_loc] = out_src[out_src_offset + out_src_loc] / ((eT2) in[in_offset + in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_div_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_div_pre.cl deleted file mode 100644 index 9b782e9..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_div_pre.cl +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_subset_div_pre)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - __global const UWORD* indices, - const UWORD indices_offset, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? - indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_offset + out_loc] = ((eT2) in[in_offset + in_loc]) / out_src[out_src_offset + out_src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_minus_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_minus_post.cl deleted file mode 100644 index 436ac08..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_minus_post.cl +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_subset_minus_post)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - __global const UWORD* indices, - const UWORD indices_offset, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? 
- indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_offset + out_loc] = out_src[out_src_offset + out_src_loc] - ((eT2) in[in_offset + in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_minus_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_minus_pre.cl deleted file mode 100644 index 58a03a7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_minus_pre.cl +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_subset_minus_pre)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - __global const UWORD* indices, - const UWORD indices_offset, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? - indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_offset + out_loc] = ((eT2) in[in_offset + in_loc]) - out_src[out_src_offset + out_src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_plus.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_plus.cl deleted file mode 100644 index 624293b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_plus.cl +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_subset_plus)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - __global const UWORD* indices, - const UWORD indices_offset, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? 
- indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_offset + out_loc] = out_src[out_src_offset + out_src_loc] + ((eT2) in[in_offset + in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_schur.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_schur.cl deleted file mode 100644 index 701cff1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_schur.cl +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_subset_schur)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - __global const UWORD* indices, - const UWORD indices_offset, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? - indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_offset + out_loc] = out_src[out_src_offset + out_src_loc] * ((eT2) in[in_offset + in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_set.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_set.cl deleted file mode 100644 index fa3ea4f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_set.cl +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_subset_set)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - __global const UWORD* indices, - const UWORD indices_offset, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = (mode == 0) ? 
- indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_offset + out_loc] = ((eT2) in[in_offset + in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/clamp.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/clamp.cl deleted file mode 100644 index 32c075e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/clamp.cl +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,clamp)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 min_val, - const eT1 max_val, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD src_index = src_offset + row + col * src_M_n_rows; - const UWORD dest_index = dest_offset + row + col * dest_M_n_rows; - - const eT1 clamped_val = max(min_val, min(max_val, src[src_index])); - dest[dest_index] = (eT2) clamped_val; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/cross.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/cross.cl deleted file mode 100644 index e9db1d0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/cross.cl +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,cross)(__global eT2* out, - const UWORD out_offset, - __global const eT1* A, - const UWORD A_offset, - __global const eT1* B, - const UWORD B_offset) // A and B should have 3 elements - { - const UWORD idx = get_global_id(0); - - if (idx < 3) - { - const UWORD a1_index = ((idx + 1) % 3) + A_offset; - const UWORD a2_index = ((idx + 2) % 3) + A_offset; - - const UWORD b1_index = ((idx + 2) % 3) + B_offset; - const UWORD b2_index = ((idx + 1) % 3) + B_offset; - - const eT1 val = (A[a1_index] * B[b1_index]) - (A[a2_index] * B[b2_index]); - out[idx + out_offset] = (eT2) val; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/dot.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/dot.cl deleted file mode 100644 index 3d43e2b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/dot.cl +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -void -COOT_FN(PREFIX,dot_subgroup_reduce_other)(__local volatile twoway_promoted_eT* data, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - data[tid] += data[tid + i]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -COOT_FN(PREFIX,dot_subgroup_reduce_8)(__local volatile twoway_promoted_eT* data, UWORD tid) - { - data[tid] += data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,dot_subgroup_reduce_16)(__local volatile twoway_promoted_eT* data, UWORD tid) - { - data[tid] += data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,dot_subgroup_reduce_32)(__local volatile twoway_promoted_eT* data, UWORD tid) - { - data[tid] += data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,dot_subgroup_reduce_64)(__local volatile twoway_promoted_eT* data, UWORD tid) - { - data[tid] += data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 8]; - 
SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,dot_subgroup_reduce_128)(__local volatile twoway_promoted_eT* data, UWORD tid) - { - data[tid] += data[tid + 128]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 1]; - } - - - -__kernel -void -COOT_FN(PREFIX,dot)(__global twoway_promoted_eT* out_mem, - __global const eT1* A, - const UWORD A_offset, - __global const eT2* B, - const UWORD B_offset, - const UWORD n_elem, - __local volatile twoway_promoted_eT* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] += (((twoway_promoted_eT) A[A_offset + i]) * ((twoway_promoted_eT) B[B_offset + i])) + - (((twoway_promoted_eT) A[A_offset + i + get_local_size(0)]) * ((twoway_promoted_eT) B[B_offset + i + get_local_size(0)])); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] += (((twoway_promoted_eT) A[A_offset + i]) * ((twoway_promoted_eT) B[B_offset + i])); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - 
COOT_FN_3(PREFIX,dot_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/dot_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/dot_small.cl deleted file mode 100644 index 551ce4f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/dot_small.cl +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,dot_small)(__global twoway_promoted_eT* out_mem, - __global const eT1* A, - const UWORD A_offset, - __global const eT2* B, - const UWORD B_offset, - const UWORD n_elem, - __local volatile twoway_promoted_eT* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] += (((twoway_promoted_eT) A[A_offset + i]) * ((twoway_promoted_eT) B[B_offset + i])) + - (((twoway_promoted_eT) A[A_offset + i + get_local_size(0)]) * ((twoway_promoted_eT) B[B_offset + i + get_local_size(0)])); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] += (((twoway_promoted_eT) A[A_offset + i]) * ((twoway_promoted_eT) B[B_offset + i])); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/htrans.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/htrans.cl deleted file mode 100644 index d5c5403..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/htrans.cl +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,htrans)(__global eT2* out, - const UWORD out_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD in_n_rows, - const UWORD in_n_cols) - { - // For a non-inplace transpose, we can use a pretty naive approach. - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD in_total_offset = in_offset + row + col * in_n_rows; - const UWORD out_total_offset = out_offset + col + row * in_n_cols; - - if( (row < in_n_rows) && (col < in_n_cols) ) - { - const eT2 element = (eT2) in[in_total_offset]; - out[out_total_offset] = COOT_FN(coot_conj_,eT2)(element); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_div_array.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_div_array.cl deleted file mode 100644 index cd8c236..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_div_array.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve1_div_array)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] /= src[i + src_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_div_sve1.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_div_sve1.cl deleted file mode 100644 index 1f9f216..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_div_sve1.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve1_div_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] /= src[src_locs[i + src_locs_offset] + src_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_eq_array.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_eq_array.cl deleted file mode 100644 index 5e08dd9..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_eq_array.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve1_eq_array)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] = src[i + src_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_eq_sve1.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_eq_sve1.cl deleted file mode 100644 index 7878625..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_eq_sve1.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve1_eq_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] = src[src_locs[i + src_locs_offset] + src_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_minus_array.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_minus_array.cl deleted file mode 100644 index ea1ecc8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_minus_array.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve1_minus_array)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] -= src[i + src_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_minus_sve1.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_minus_sve1.cl deleted file mode 100644 index c54f258..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_minus_sve1.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve1_minus_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] -= src[src_locs[i + src_locs_offset] + src_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_mul_array.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_mul_array.cl deleted file mode 100644 index ea6440f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_mul_array.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve1_mul_array)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] *= src[i + src_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_mul_sve1.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_mul_sve1.cl deleted file mode 100644 index eeeff6a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_mul_sve1.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve1_mul_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] *= src[src_locs[i + src_locs_offset] + src_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_plus_array.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_plus_array.cl deleted file mode 100644 index b750e57..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_plus_array.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve1_plus_array)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] += src[i + src_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_plus_sve1.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_plus_sve1.cl deleted file mode 100644 index 882139d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_plus_sve1.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve1_plus_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] += src[src_locs[i + src_locs_offset] + src_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_div_array.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_div_array.cl deleted file mode 100644 index 131acf0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_div_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve2_div_array)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - const UWORD src_loc = row + col * n_rows + src_offset; - - dest[dest_loc] /= (eT2) src[src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_div_sve2.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_div_sve2.cl deleted file mode 100644 index 061e82f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_div_sve2.cl +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve2_div_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col + src_col_locs_offset]); - - dest[dest_loc] /= (eT2) src[src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_eq_array.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_eq_array.cl deleted file mode 100644 index 9924db0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_eq_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve2_eq_array)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - const UWORD src_loc = row + col * n_rows + src_offset; - - dest[dest_loc] = (eT2) src[src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_eq_sve2.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_eq_sve2.cl deleted file mode 100644 index 65961b8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_eq_sve2.cl +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve2_eq_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? 
col : src_col_locs[col + src_col_locs_offset]); - - dest[dest_loc] = (eT2) src[src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_minus_array.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_minus_array.cl deleted file mode 100644 index 774ddf7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_minus_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve2_minus_array)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? 
col : dest_col_locs[col + dest_col_locs_offset]); - - const UWORD src_loc = row + col * n_rows + src_offset; - - dest[dest_loc] -= (eT2) src[src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_minus_sve2.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_minus_sve2.cl deleted file mode 100644 index 721674e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_minus_sve2.cl +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve2_minus_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? 
row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col + src_col_locs_offset]); - - dest[dest_loc] -= (eT2) src[src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_mul_array.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_mul_array.cl deleted file mode 100644 index c7c3108..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_mul_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve2_mul_array)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? 
row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - const UWORD src_loc = row + col * n_rows + src_offset; - - dest[dest_loc] *= (eT2) src[src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_mul_sve2.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_mul_sve2.cl deleted file mode 100644 index 294b915..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_mul_sve2.cl +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve2_mul_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col + src_col_locs_offset]); - - dest[dest_loc] *= (eT2) src[src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_plus_array.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_plus_array.cl deleted file mode 100644 index 4def038..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_plus_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve2_plus_array)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - const UWORD src_loc = row + col * n_rows + src_offset; - - dest[dest_loc] += (eT2) src[src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_plus_sve2.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_plus_sve2.cl deleted file mode 100644 index d373a22..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_plus_sve2.cl +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve2_plus_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? 
col : src_col_locs[col + src_col_locs_offset]); - - dest[dest_loc] += (eT2) src[src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_colwise_conv_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_colwise_conv_post.cl deleted file mode 100644 index 4688349..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_colwise_conv_post.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,max_colwise_conv_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - __global const eT1* colptr = &(src[src_offset + col * src_M_n_rows]); - eT1 acc = (eT1) colptr[0]; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 1; i < n_rows; ++i) - { - acc = max(acc, colptr[i]); - } - dest[dest_offset + col * dest_mem_incr] = (eT2) (acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_colwise_conv_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_colwise_conv_pre.cl deleted file mode 100644 index d0c9c67..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_colwise_conv_pre.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,max_colwise_conv_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - __global const eT1* colptr = &(src[src_offset + col * src_M_n_rows]); - eT2 acc = (eT2) colptr[0]; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 1; i < n_rows; ++i) - { - acc = max(acc, (eT2) (colptr[i])); - } - dest[dest_offset + col * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_cube_col_conv_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_cube_col_conv_post.cl deleted file mode 100644 index 0064a8c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_cube_col_conv_post.cl +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,max_cube_col_conv_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = get_global_id(0); - const UWORD slice = get_global_id(1); - - if(row < n_rows && slice < n_slices) - { - eT1 acc = (eT1) src[src_offset + row + slice * n_rows * n_cols]; - for(UWORD i = 1; i < n_cols; ++i) - { - acc = max(acc, src[src_offset + (i * n_rows) + row + slice * n_rows * n_cols]); - } - dest[dest_offset + row + slice * n_rows] = (eT2) (acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_cube_col_conv_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_cube_col_conv_pre.cl deleted file mode 100644 index 46ff643..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_cube_col_conv_pre.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,max_cube_col_conv_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = get_global_id(0); - const UWORD slice = get_global_id(1); - - if(row < n_rows && slice < n_slices) - { - eT2 acc = (eT2) src[src_offset + row + slice * n_rows * n_cols]; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=1; i < n_cols; ++i) - { - acc = max(acc, (eT2) (src[src_offset + (i * n_rows) + row + slice * n_rows * n_cols])); - } - dest[dest_offset + row + slice * n_rows] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_rowwise_conv_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_rowwise_conv_post.cl deleted file mode 100644 index 9771bb4..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_rowwise_conv_post.cl +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,max_rowwise_conv_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT1 acc = (eT1) src[src_offset + row]; - for(UWORD i = 1; i < n_cols; ++i) - { - acc = max(acc, src[src_offset + (i * n_rows) + row]); - } - dest[dest_offset + row * dest_mem_incr] = (eT2) (acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_rowwise_conv_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_rowwise_conv_pre.cl deleted file mode 100644 index f16c906..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_rowwise_conv_pre.cl +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,max_rowwise_conv_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT2 acc = (eT2) src[src_offset + row]; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=1; i < n_cols; ++i) - { - acc = max(acc, (eT2) (src[src_offset + (i * src_M_n_rows) + row])); - } - dest[dest_offset + row * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_colwise_conv_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_colwise_conv_post.cl deleted file mode 100644 index 878ff93..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_colwise_conv_post.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,mean_colwise_conv_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - __global const eT1* colptr = &(src[src_offset + col * src_M_n_rows]); - eT1 acc = (eT1) (0); - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < n_rows; ++i) - { - acc += colptr[i]; - } - dest[dest_offset + col * dest_mem_incr] = (eT2) (acc / (eT1) n_rows); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_colwise_conv_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_colwise_conv_pre.cl deleted file mode 100644 index a820b6d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_colwise_conv_pre.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,mean_colwise_conv_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - __global const eT1* colptr = &(src[col * src_M_n_rows + src_offset]); - eT2 acc = (eT2) (0); - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < n_rows; ++i) - { - acc += (eT2) (colptr[i]); - } - dest[dest_offset + col * dest_mem_incr] = (acc / (eT2) n_rows); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_rowwise_conv_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_rowwise_conv_post.cl deleted file mode 100644 index eb63132..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_rowwise_conv_post.cl +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,mean_rowwise_conv_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT1 acc = (eT1) (0); - for(UWORD i = 0; i < n_cols; ++i) - { - acc += src[src_offset + (i * src_M_n_rows) + row]; - } - dest[dest_offset + row * dest_mem_incr] = (eT2) (acc / (eT1) n_cols); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_rowwise_conv_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_rowwise_conv_pre.cl deleted file mode 100644 index a158168..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_rowwise_conv_pre.cl +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,mean_rowwise_conv_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT2 acc = (eT2) (0); - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=0; i < n_cols; ++i) - { - acc += (eT2) (src[src_offset + (i * src_M_n_rows) + row]); - } - dest[dest_offset + row * dest_mem_incr] = (acc / (eT2) n_cols); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_colwise_conv_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_colwise_conv_post.cl deleted file mode 100644 index 67cd08f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_colwise_conv_post.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,min_colwise_conv_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - __global const eT1* colptr = &(src[src_offset + col * src_M_n_rows]); - eT1 acc = (eT1) colptr[0]; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 1; i < n_rows; ++i) - { - acc = min(acc, colptr[i]); - } - dest[dest_offset + col * dest_mem_incr] = (eT2) (acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_colwise_conv_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_colwise_conv_pre.cl deleted file mode 100644 index b1b4873..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_colwise_conv_pre.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,min_colwise_conv_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - __global const eT1* colptr = &(src[src_offset + col * src_M_n_rows]); - eT2 acc = (eT2) colptr[0]; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 1; i < n_rows; ++i) - { - acc = min(acc, (eT2) (colptr[i])); - } - dest[dest_offset + col * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_cube_col_conv_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_cube_col_conv_post.cl deleted file mode 100644 index 1cb69c5..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_cube_col_conv_post.cl +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,min_cube_col_conv_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = get_global_id(0); - const UWORD slice = get_global_id(1); - - if(row < n_rows && slice < n_slices) - { - eT1 acc = (eT1) src[src_offset + row + slice * n_rows * n_cols]; - for(UWORD i = 1; i < n_cols; ++i) - { - acc = min(acc, src[src_offset + (i * n_rows) + row + slice * n_rows * n_cols]); - } - dest[dest_offset + row + slice * n_rows] = (eT2) (acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_cube_col_conv_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_cube_col_conv_pre.cl deleted file mode 100644 index 4d9cfc1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_cube_col_conv_pre.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,min_cube_col_conv_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = get_global_id(0); - const UWORD slice = get_global_id(1); - - if(row < n_rows && slice < n_slices) - { - eT2 acc = (eT2) src[src_offset + row + slice * n_rows * n_cols]; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=1; i < n_cols; ++i) - { - acc = min(acc, (eT2) (src[src_offset + (i * n_rows) + row + slice * n_rows * n_cols])); - } - dest[dest_offset + row + slice * n_rows] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_rowwise_conv_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_rowwise_conv_post.cl deleted file mode 100644 index 5a325aa..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_rowwise_conv_post.cl +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,min_rowwise_conv_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT1 acc = (eT1) src[src_offset + row]; - for(UWORD i = 1; i < n_cols; ++i) - { - acc = min(acc, src[src_offset + (i * src_M_n_rows) + row]); - } - dest[dest_offset + row * dest_mem_incr] = (eT2) (acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_rowwise_conv_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_rowwise_conv_pre.cl deleted file mode 100644 index 01f9055..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_rowwise_conv_pre.cl +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,min_rowwise_conv_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT2 acc = (eT2) src[src_offset + row]; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=1; i < n_cols; ++i) - { - acc = min(acc, (eT2) (src[src_offset + (i * src_M_n_rows) + row])); - } - dest[dest_offset + row * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq.cl deleted file mode 100644 index 39e09ce..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq.cl +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_all_neq)(__global const eT1* X, // will be casted to eT2 before comparison - const UWORD X_offset, - const UWORD n_elem, - __global uint* out, - const UWORD out_offset, - __local volatile uint* aux_mem, - const eT2 val) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 1; - - while (i + get_local_size(0) < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - const eT2 val2 = (eT2) X[X_offset + i + get_local_size(0)]; - - aux_mem[tid] &= (val1 != val); - aux_mem[tid] &= (val2 != val); - i += grid_size; - } - if (i < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - aux_mem[tid] &= (val1 != val); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN(COOT_FN(and_subgroup_reduce_,SUBGROUP_SIZE_NAME),_u32)(aux_mem, tid); - } - - if (tid == 0) - { - out[out_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_colwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_colwise.cl deleted file mode 100644 index fc5a016..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_colwise.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_all_neq_colwise)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* A, - const UWORD A_offset, - const eT2 val, - const UWORD A_n_rows, - const UWORD A_n_cols) - { - const UWORD col = get_global_id(0); - if(col < A_n_cols) - { - __global const eT1* colptr = &(A[ col*A_n_rows + A_offset ]); - UWORD result = 1; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < A_n_rows; ++i) - { - const eT2 val1 = (eT2) colptr[i]; - result &= (val1 != val); - } - out[col + out_offset] = result; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_rowwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_rowwise.cl deleted file mode 100644 index b83fec4..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_rowwise.cl +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_all_neq_rowwise)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* A, - const UWORD A_offset, - const eT2 val, - const UWORD A_n_rows, - const UWORD A_n_cols) - { - const UWORD row = get_global_id(0); - if(row < A_n_rows) - { - UWORD result = 1; - for(UWORD i = 0; i < A_n_cols; ++i) - { - const eT2 val1 = (eT2) A[i * A_n_rows + row + A_offset]; - result &= (val1 != val); - } - out[row + out_offset] = result; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_small.cl deleted file mode 100644 index 4158239..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_small.cl +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_all_neq_small)(__global const eT1* X, - const UWORD X_offset, - const UWORD n_elem, - __global uint* out, - const UWORD out_offset, - __local volatile uint* aux_mem, - const eT2 val) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 1; - - while (i + get_local_size(0) < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - const eT2 val2 = (eT2) X[X_offset + i + get_local_size(0)]; - - aux_mem[tid] &= (val1 != val); - aux_mem[tid] &= (val2 != val); - i += grid_size; - } - if (i < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - aux_mem[tid] &= (val1 != val); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out[out_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq.cl deleted file mode 100644 index 6990ce5..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq.cl +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_any_neq)(__global const eT1* X, // will be casted to eT2 before comparison - const UWORD X_offset, - const UWORD n_elem, - __global uint* out, - const UWORD out_offset, - __local volatile uint* aux_mem, - const eT2 val) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - const eT2 val2 = (eT2) X[X_offset + i + get_local_size(0)]; - - aux_mem[tid] |= (val1 != val); - aux_mem[tid] |= (val2 != val); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT2 val1 = (eT2) X[X_offset + i]; - aux_mem[tid] |= (val1 != val); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN(COOT_FN(or_subgroup_reduce_,SUBGROUP_SIZE_NAME),_u32)(aux_mem, tid); - } - - if (tid == 0) - { - out[out_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_colwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_colwise.cl deleted file mode 100644 index 3f6a7eb..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_colwise.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_any_neq_colwise)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* A, - const UWORD A_offset, - const eT2 val, - const UWORD A_n_rows, - const UWORD A_n_cols) - { - const UWORD col = get_global_id(0); - if(col < A_n_cols) - { - __global const eT1* colptr = &(A[ col*A_n_rows + A_offset ]); - UWORD result = 0; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < A_n_rows; ++i) - { - const eT2 val1 = (eT2) colptr[i]; - result |= (val1 != val); - if (result == 1) - break; - } - out[col + out_offset] = result; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_rowwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_rowwise.cl deleted file mode 100644 index aebf3d3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_rowwise.cl +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_any_neq_rowwise)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* A, - const UWORD A_offset, - const eT2 val, - const UWORD A_n_rows, - const UWORD A_n_cols) - { - const UWORD row = get_global_id(0); - if(row < A_n_rows) - { - UWORD result = 0; - for(UWORD i = 0; i < A_n_cols; ++i) - { - const eT2 val1 = (eT2) A[i * A_n_rows + row + A_offset]; - result |= (val1 != val); - if (result == 1) - break; - } - out[row + out_offset] = result; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_small.cl deleted file mode 100644 index 610b99b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_small.cl +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_any_neq_small)(__global const eT1* X, - const UWORD X_offset, - const UWORD n_elem, - __global uint* out, - const UWORD out_offset, - __local volatile uint* aux_mem, - const eT2 val) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - const eT2 val2 = (eT2) X[X_offset + i + get_local_size(0)]; - - aux_mem[tid] |= (val1 != val); - aux_mem[tid] |= (val2 != val); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT2 val1 = (eT2) X[X_offset + i]; - aux_mem[tid] |= (val1 != val); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out[out_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/repmat.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/repmat.cl deleted file mode 100644 index e5c1897..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/repmat.cl +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,repmat)(__global const eT1* in, - const UWORD in_offset, - __global eT2* out, - const UWORD out_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD new_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD offset = row + col * n_rows; - const eT2 element = (eT2) in[in_offset + offset]; - if( (row < n_rows) && (col < n_cols) ) - { - for (UWORD c_copy = 0; c_copy < copies_per_col; ++c_copy) - { - const UWORD col_offset = (col + n_cols * c_copy) * new_n_rows; - for (UWORD r_copy = 0; r_copy < copies_per_row; ++r_copy) - { - const UWORD copy_offset = col_offset + (row + n_rows * r_copy); - out[out_offset + copy_offset] = element; - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/strans.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/strans.cl deleted file mode 100644 index 2d7c667..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/strans.cl +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,strans)(__global eT2* out, - const UWORD out_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD in_n_rows, - const UWORD in_n_cols) - { - // For a non-inplace transpose, we can use a pretty naive approach. - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD in_total_offset = in_offset + row + col * in_n_rows; - const UWORD out_total_offset = out_offset + col + row * in_n_cols; - - if( (row < in_n_rows) && (col < in_n_cols) ) - { - const eT2 element = (eT2) in[in_total_offset]; - out[out_total_offset] = element; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_colwise_conv_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_colwise_conv_post.cl deleted file mode 100644 index dc4b82f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_colwise_conv_post.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,sum_colwise_conv_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - __global const eT1* colptr = &(src[src_offset + col * src_M_n_rows]); - eT1 acc = (eT1) (0); - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < n_rows; ++i) - { - acc += colptr[i]; - } - dest[dest_offset + col * dest_mem_incr] = (eT2) (acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_colwise_conv_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_colwise_conv_pre.cl deleted file mode 100644 index b15c12b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_colwise_conv_pre.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,sum_colwise_conv_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - __global const eT1* colptr = &(src[src_offset + col * src_M_n_rows]); - eT2 acc = (eT2) (0); - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < n_rows; ++i) - { - acc += (eT2) (colptr[i]); - } - dest[dest_offset + col * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_rowwise_conv_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_rowwise_conv_post.cl deleted file mode 100644 index fe2f867..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_rowwise_conv_post.cl +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,sum_rowwise_conv_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT1 acc = (eT1) (0); - for(UWORD i = 0; i < n_cols; ++i) - { - acc += src[src_offset + (i * src_M_n_rows) + row]; - } - dest[dest_offset + row * dest_mem_incr] = (eT2) (acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_rowwise_conv_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_rowwise_conv_pre.cl deleted file mode 100644 index 399cce3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_rowwise_conv_pre.cl +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,sum_rowwise_conv_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT2 acc = (eT2) (0); - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=0; i < n_cols; ++i) - { - acc += (eT2) (src[src_offset + (i * src_M_n_rows) + row]); - } - dest[dest_offset + row * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/symmatl.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/symmatl.cl deleted file mode 100644 index 8ab65f5..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/symmatl.cl +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,symmatl)(__global eT2* out, - const UWORD out_offset, - __global const eT1* A, - const UWORD A_offset, - const UWORD size) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < size && col < size && row >= col) - { - const eT2 val = (eT2) A[A_offset + row + size * col]; - - out[out_offset + row + size * col] = val; - out[out_offset + col + size * row] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/symmatu.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/symmatu.cl deleted file mode 100644 index 514096b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/symmatu.cl +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,symmatu)(__global eT2* out, - const UWORD out_offset, - __global const eT1* A, - const UWORD A_offset, - const UWORD size) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < size && col < size && col >= row) - { - const eT2 val = (eT2) A[A_offset + row + size * col]; - - out[out_offset + row + size * col] = val; - out[out_offset + col + size * row] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/zeroway/shuffle_large_compute_locs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/zeroway/shuffle_large_compute_locs.cl deleted file mode 100644 index a81414f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/zeroway/shuffle_large_compute_locs.cl +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// This performs the first part of the shuffle_vec kernel: it computes random -// locations for the output using the variable philox bijective shuffle, -// and then does the first step of the output compression (the upsweep of the -// shifted prefix sum). 
-__kernel -void -shuffle_large_compute_locs(__global UWORD* out_block_mem, - const UWORD n_elem, - const UWORD n_elem_pow2, - __global const UWORD* philox_key, - const UWORD num_bits, - __local volatile UWORD* aux_mem) - { - const UWORD tid = get_global_id(0); - const UWORD local_tid = get_local_id(0); - const UWORD local_size = get_local_size(0); - - // Get our bijective shuffle location. - const UWORD in_loc = var_philox(tid, philox_key, num_bits); - - // Fill aux_mem with the indicator of whether we are out of bounds. - // Then, we'll prefix-sum it. This will tell us where to put our result. - aux_mem[local_tid] = (in_loc < n_elem); - barrier(CLK_LOCAL_MEM_FENCE); - - // Now, prefix-sum the auxiliary memory. - // This allows us to do the shuffle-compaction step. - UWORD offset = 1; - for (UWORD s = local_size / 2; s > 0; s >>= 1) - { - if (local_tid < s) - { - const UWORD ai = offset * (2 * local_tid + 1) - 1; - const UWORD bi = offset * (2 * local_tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (local_tid == 0) - { - out_block_mem[get_group_id(0)] = aux_mem[local_size - 1]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/c_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/c_defs.cl similarity index 100% rename from inst/include/bandicoot_bits/ks/kernels/opencl/defs/c_defs.cl rename to inst/include/bandicoot_bits/ks/opencl/defs/c_defs.cl diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/d_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/d_defs.cl index eb58c38..a4801ac 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/d_defs.cl +++ b/inst/include/bandicoot_bits/ks/opencl/defs/d_defs.cl @@ -22,5 +22,9 @@ inline bool coot_isnan_double(const double x) { return isnan(x); } inline double coot_absdiff_double(const double x, const double y) { return fabs(x - y); } -inline double coot_conj_double(const double x) { return x; } -//inline cx_double coot_conj_cx_double(const 
cx_double x) { return cx_double(x.x, -x.y); } +inline double coot_conj_double(const double x) { return x; } + +inline double coot_plus_double(const double a, const double b) { return a + b; } +inline double coot_minus_double(const double a, const double b) { return a - b; } +inline double coot_mul_double(const double a, const double b) { return a * b; } +inline double coot_div_double(const double a, const double b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/f_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/f_defs.cl index ac35a12..be682a1 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/f_defs.cl +++ b/inst/include/bandicoot_bits/ks/opencl/defs/f_defs.cl @@ -22,5 +22,9 @@ inline bool coot_isnan_float(const float x) { return isnan(x); } inline float coot_absdiff_float(const float x, const float y) { return fabs(x - y); } -inline float coot_conj_float(const float x) { return x; } -//inline cx_float coot_conj_cx_float(const cx_float x) { return cx_float(x.x, -x.y); } +inline float coot_conj_float(const float x) { return x; } + +inline float coot_plus_float(const float a, const float b) { return a + b; } +inline float coot_minus_float(const float a, const float b) { return a - b; } +inline float coot_mul_float(const float a, const float b) { return a * b; } +inline float coot_div_float(const float a, const float b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/h_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/h_defs.cl index 8fb5ca3..f8215c5 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/h_defs.cl +++ b/inst/include/bandicoot_bits/ks/opencl/defs/h_defs.cl @@ -23,3 +23,8 @@ inline bool coot_isnan_half(const half x) { return isnan(x); } inline half coot_absdiff_half(const half x, const half y) { return fabs(x - y); } inline half coot_conj_half(const half x) { return x; } + +inline half coot_plus_half(const half a, const half b) { return a + b; } +inline half coot_minus_half(const half a, 
const half b) { return a - b; } +inline half coot_mul_half(const half a, const half b) { return a * b; } +inline half coot_div_half(const half a, const half b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/s16_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/s16_defs.cl index 9a8ad90..98898a3 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/s16_defs.cl +++ b/inst/include/bandicoot_bits/ks/opencl/defs/s16_defs.cl @@ -23,3 +23,8 @@ inline bool coot_isnan_short(const short x) { return false; } inline short coot_absdiff_short(const short x, const short y) { return abs(x - y); } inline short coot_conj_short(const short x) { return x; } + +inline short coot_plus_short(const short a, const short b) { return a + b; } +inline short coot_minus_short(const short a, const short b) { return a - b; } +inline short coot_mul_short(const short a, const short b) { return a * b; } +inline short coot_div_short(const short a, const short b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/s32_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/s32_defs.cl index 4f8fff7..9403ec6 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/s32_defs.cl +++ b/inst/include/bandicoot_bits/ks/opencl/defs/s32_defs.cl @@ -23,3 +23,8 @@ inline bool coot_isnan_int(const int x) { return false; } inline int coot_absdiff_int(const int x, const int y) { return abs(x - y); } inline int coot_conj_int(const int x) { return x; } + +inline int coot_plus_int(const int a, const int b) { return a + b; } +inline int coot_minus_int(const int a, const int b) { return a - b; } +inline int coot_mul_int(const int a, const int b) { return a * b; } +inline int coot_div_int(const int a, const int b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/s64_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/s64_defs.cl index 3b81dc4..93c9c96 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/s64_defs.cl +++ 
b/inst/include/bandicoot_bits/ks/opencl/defs/s64_defs.cl @@ -23,3 +23,8 @@ inline bool coot_isnan_long(const long x) { return false; } inline long coot_absdiff_long(const long x, const long y) { return abs(x - y); } inline long coot_conj_long(const long x) { return x; } + +inline long coot_plus_long(const long a, const long b) { return a + b; } +inline long coot_minus_long(const long a, const long b) { return a - b; } +inline long coot_mul_long(const long a, const long b) { return a * b; } +inline long coot_div_long(const long a, const long b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/s8_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/s8_defs.cl index 5979c5c..20fb6fb 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/s8_defs.cl +++ b/inst/include/bandicoot_bits/ks/opencl/defs/s8_defs.cl @@ -23,3 +23,8 @@ inline bool coot_isnan_char(const char x) { return false; } inline char coot_absdiff_char(const char x, const char y) { return abs(x - y); } inline char coot_conj_char(const char x) { return x; } + +inline char coot_plus_char(const char a, const char b) { return a + b; } +inline char coot_minus_char(const char a, const char b) { return a - b; } +inline char coot_mul_char(const char a, const char b) { return a * b; } +inline char coot_div_char(const char a, const char b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/u16_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/u16_defs.cl index 6bf74cc..5848c03 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/u16_defs.cl +++ b/inst/include/bandicoot_bits/ks/opencl/defs/u16_defs.cl @@ -23,3 +23,8 @@ inline bool coot_isnan_ushort(const ushort x) { return false; } inline ushort coot_absdiff_ushort(const ushort x, const ushort y) { return (x > y) ? 
(x - y) : (y - x); } inline ushort coot_conj_ushort(const ushort x) { return x; } + +inline ushort coot_plus_ushort(const ushort a, const ushort b) { return a + b; } +inline ushort coot_minus_ushort(const ushort a, const ushort b) { return a - b; } +inline ushort coot_mul_ushort(const ushort a, const ushort b) { return a * b; } +inline ushort coot_div_ushort(const ushort a, const ushort b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/u32_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/u32_defs.cl index cb61bca..e427814 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/u32_defs.cl +++ b/inst/include/bandicoot_bits/ks/opencl/defs/u32_defs.cl @@ -23,3 +23,8 @@ inline bool coot_isnan_uint(const uint x) { return false; } inline uint coot_absdiff_uint(const uint x, const uint y) { return (x > y) ? (x - y) : (y - x); } inline uint coot_conj_uint(const uint x) { return x; } + +inline uint coot_plus_uint(const uint a, const uint b) { return a + b; } +inline uint coot_minus_uint(const uint a, const uint b) { return a - b; } +inline uint coot_mul_uint(const uint a, const uint b) { return a * b; } +inline uint coot_div_uint(const uint a, const uint b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/u64_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/u64_defs.cl index 4455c1b..3e308b3 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/u64_defs.cl +++ b/inst/include/bandicoot_bits/ks/opencl/defs/u64_defs.cl @@ -23,3 +23,8 @@ inline bool coot_isnan_ulong(const ulong x) { return false; } inline ulong coot_absdiff_ulong(const ulong x, const ulong y) { return (x > y) ? 
(x - y) : (y - x); } inline ulong coot_conj_ulong(const ulong x) { return x; } + +inline ulong coot_plus_ulong(const ulong a, const ulong b) { return a + b; } +inline ulong coot_minus_ulong(const ulong a, const ulong b) { return a - b; } +inline ulong coot_mul_ulong(const ulong a, const ulong b) { return a * b; } +inline ulong coot_div_ulong(const ulong a, const ulong b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/u8_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/u8_defs.cl index bbb6d97..fda8d45 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/u8_defs.cl +++ b/inst/include/bandicoot_bits/ks/opencl/defs/u8_defs.cl @@ -23,3 +23,8 @@ inline bool coot_isnan_uchar(const uchar x) { return false; } inline uchar coot_absdiff_uchar(const uchar x, const uchar y) { return (x > y) ? (x - y) : (y - x); } inline uchar coot_conj_uchar(const uchar x) { return x; } + +inline uchar coot_plus_uchar(const uchar a, const uchar b) { return a + b; } +inline uchar coot_minus_uchar(const uchar a, const uchar b) { return a - b; } +inline uchar coot_mul_uchar(const uchar a, const uchar b) { return a * b; } +inline uchar coot_div_uchar(const uchar a, const uchar b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/z_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/z_defs.cl similarity index 100% rename from inst/include/bandicoot_bits/ks/kernels/opencl/defs/z_defs.cl rename to inst/include/bandicoot_bits/ks/opencl/defs/z_defs.cl diff --git a/inst/include/bandicoot_bits/ks/opencl/oneway/fill.cl b/inst/include/bandicoot_bits/ks/opencl/oneway/fill.cl deleted file mode 100644 index 7df6b7f..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/oneway/fill.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - -__kernel -void -COOT_FN(PREFIX,fill)(__global eT1* out, - const UWORD out_offset, - const eT1 val, - const UWORD n_rows, - const UWORD n_cols, - const UWORD M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD index = col * M_n_rows + row; - - if(row < n_rows && col < n_cols) - { - out[index + out_offset] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/oneway/fill_sve1.cl b/inst/include/bandicoot_bits/ks/opencl/oneway/fill_sve1.cl deleted file mode 100644 index 6ae09ba..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/oneway/fill_sve1.cl +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - -__kernel -void -COOT_FN(PREFIX,fill_sve1)(__global eT1* out, - const UWORD out_offset, - __global const UWORD* out_locs, - const UWORD out_locs_offset, - const eT1 val, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - out[out_locs[i + out_locs_offset] + out_offset] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/oneway/fill_sve2.cl b/inst/include/bandicoot_bits/ks/opencl/oneway/fill_sve2.cl deleted file mode 100644 index 3d2c81e..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/oneway/fill_sve2.cl +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - -__kernel -void -COOT_FN(PREFIX,fill_sve2)(__global eT1* out, - const UWORD out_offset, - __global const UWORD* out_row_locs, - const UWORD out_row_locs_offset, - __global const UWORD* out_col_locs, - const UWORD out_col_locs_offset, - const eT1 val, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD out_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD out_loc = out_offset + - ((out_row_locs == NULL) ? 
row : out_row_locs[row + out_row_locs_offset]) + - out_n_rows * ((out_col_locs == NULL) ? col : out_col_locs[col + out_col_locs_offset]); - - out[out_loc] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_atan2.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_atan2.cl deleted file mode 100644 index 2599aad..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_atan2.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_atan2)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset; - - const fp_eT3 a_val = (fp_eT3) src_A[src_A_index]; - const fp_eT3 b_val = (fp_eT3) src_B[src_B_index]; - dest[dest_index] = (eT3) atan2(a_val, b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_div_array.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_div_array.cl deleted file mode 100644 index 6e4e169..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_div_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_div_array)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset; - - const threeway_promoted_eT a_val = (threeway_promoted_eT) src_A[src_A_index]; - const threeway_promoted_eT b_val = (threeway_promoted_eT) src_B[src_B_index]; - dest[dest_index] = (eT3) (a_val / b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_div_array_cube.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_div_array_cube.cl deleted file mode 100644 index 1cfb814..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_div_array_cube.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_div_array_cube)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = (eT3) (src_A[src_A_index] / src_B[src_B_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_hypot.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_hypot.cl deleted file mode 100644 index 260ad94..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_hypot.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_hypot)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset; - - const fp_eT3 a_val = (fp_eT3) src_A[src_A_index]; - const fp_eT3 b_val = (fp_eT3) src_B[src_B_index]; - dest[dest_index] = (eT3) hypot(a_val, b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_max_array.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_max_array.cl deleted file mode 100644 index 1431168..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_max_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_max_array)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset; - - const threeway_promoted_eT a_val = (threeway_promoted_eT) src_A[src_A_index]; - const threeway_promoted_eT b_val = (threeway_promoted_eT) src_B[src_B_index]; - dest[dest_index] = (eT3) max(a_val, b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_min_array.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_min_array.cl deleted file mode 100644 index d6a13c0..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_min_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_min_array)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset; - - const threeway_promoted_eT a_val = (threeway_promoted_eT) src_A[src_A_index]; - const threeway_promoted_eT b_val = (threeway_promoted_eT) src_B[src_B_index]; - dest[dest_index] = (eT3) min(a_val, b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_minus_array.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_minus_array.cl deleted file mode 100644 index 026a56c..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_minus_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_array)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset; - - const threeway_promoted_eT a_val = (threeway_promoted_eT) src_A[src_A_index]; - const threeway_promoted_eT b_val = (threeway_promoted_eT) src_B[src_B_index]; - dest[dest_index] = (eT3) (a_val - b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_minus_array_cube.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_minus_array_cube.cl deleted file mode 100644 index ab5f752..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_minus_array_cube.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_array_cube)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = (eT3) (src_A[src_A_index] - src_B[src_B_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_mul_array.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_mul_array.cl deleted file mode 100644 index 78f8bd2..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_mul_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_mul_array)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset; - - const threeway_promoted_eT a_val = (threeway_promoted_eT) src_A[src_A_index]; - const threeway_promoted_eT b_val = (threeway_promoted_eT) src_B[src_B_index]; - dest[dest_index] = (eT3) (a_val * b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_mul_array_cube.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_mul_array_cube.cl deleted file mode 100644 index 66f2355..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_mul_array_cube.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_mul_array_cube)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = (eT3) (src_A[src_A_index] * src_B[src_B_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_plus_array.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_plus_array.cl deleted file mode 100644 index 115ece9..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_plus_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_plus_array)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset; - - const threeway_promoted_eT a_val = (threeway_promoted_eT) src_A[src_A_index]; - const threeway_promoted_eT b_val = (threeway_promoted_eT) src_B[src_B_index]; - dest[dest_index] = (eT3) (a_val + b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_plus_array_cube.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_plus_array_cube.cl deleted file mode 100644 index 1d51d11..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_plus_array_cube.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_plus_array_cube)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = (eT3) (src_A[src_A_index] + src_B[src_B_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/convert_type.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/convert_type.cl deleted file mode 100644 index 521a91d..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/convert_type.cl +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not 
use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,convert_type)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD src_index = src_offset + row + col * src_M_n_rows; - const UWORD dest_index = dest_offset + row + col * dest_M_n_rows; - - const eT1 in_val = src[src_index]; - dest[dest_index] = (eT2) (in_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/convert_type_cube.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/convert_type_cube.cl deleted file mode 100644 index a2f4a5e..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/convert_type_cube.cl +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,convert_type_cube)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) src_A; - (void) src_A_offset; - (void) src_A_M_n_rows; - (void) src_A_M_n_cols; - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD src_index = src_offset + row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = dest_offset + row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - const eT1 in_val = src[src_index]; - dest[dest_index] = (eT2) (in_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_abs.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_abs.cl deleted file mode 100644 index b1331f6..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_abs.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2021-2025 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_abs)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) ET1_ABS(src[src_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acos_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acos_post.cl deleted file mode 100644 index 4730add..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acos_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_acos_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) acos(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acos_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acos_pre.cl deleted file mode 100644 index 830b9a2..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acos_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_acos_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) acos(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acosh_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acosh_post.cl deleted file mode 100644 index 128330c..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acosh_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_acosh_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) acosh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acosh_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acosh_pre.cl deleted file mode 100644 index 3556df0..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acosh_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_acosh_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) acosh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asin_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asin_post.cl deleted file mode 100644 index de5bee3..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asin_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_asin_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) asin(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asin_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asin_pre.cl deleted file mode 100644 index 0d0f1a9..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asin_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_asin_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) asin(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asinh_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asinh_post.cl deleted file mode 100644 index 1a4e2dd..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asinh_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_asinh_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) asinh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asinh_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asinh_pre.cl deleted file mode 100644 index 36d1448..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asinh_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_asinh_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) asinh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atan_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atan_post.cl deleted file mode 100644 index 1a0668f..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atan_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_atan_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) atan(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atan_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atan_pre.cl deleted file mode 100644 index b218b7b..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atan_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_atan_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) atan(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atanh_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atanh_post.cl deleted file mode 100644 index 7b70889..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atanh_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_atanh_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) atanh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atanh_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atanh_pre.cl deleted file mode 100644 index b0015a2..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atanh_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_atanh_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) atanh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_ceil_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_ceil_post.cl deleted file mode 100644 index b8b71ce..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_ceil_post.cl +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_ceil_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - if (COOT_FN(coot_is_fp_,eT1)()) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) ceil(val); - } - else - { - dest[dest_index] = (eT2) src[src_index]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_ceil_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_ceil_pre.cl deleted file mode 100644 index 5089f10..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_ceil_pre.cl +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the 
License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_ceil_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - if (COOT_FN(coot_is_fp_,eT2)()) - { - const fp_eT2 val = (fp_eT2) src[src_index]; - dest[dest_index] = (eT2) ceil(val); - } - else - { - dest[dest_index] = (eT2) src[src_index]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cos_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cos_post.cl deleted file mode 100644 index 9fc13c0..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cos_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance 
with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_cos_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) cos(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cos_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cos_pre.cl deleted file mode 100644 index 8809cb8..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cos_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_cos_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) cos(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cosh_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cosh_post.cl deleted file mode 100644 index fd6d28e..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cosh_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_cosh_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) cosh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cosh_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cosh_pre.cl deleted file mode 100644 index a100520..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cosh_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_cosh_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) cosh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post.cl deleted file mode 100644 index 2c60180..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post.cl +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_div_scalar_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = ((eT2) (src[src_index] / val_pre)) / val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post_sve1.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post_sve1.cl deleted file mode 100644 index 0e13d62..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post_sve1.cl +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_div_scalar_post_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] = ((eT2) (src[src_locs[i + src_locs_offset] + src_offset] / val_pre)) / val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post_sve2.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post_sve2.cl deleted file mode 100644 index 856b8ba..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post_sve2.cl +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_div_scalar_post_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col + src_col_locs_offset]); - - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - dest[dest_loc] = ((eT2) (src[src_loc] / val_pre)) / val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre.cl deleted file mode 100644 index 886bafb..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre.cl +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_div_scalar_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - if (val_post == (eT2) (0)) - { - // if both are 0, we take it as val_pre == 0 and val_post unused - dest[dest_index] = (eT2) (val_pre / src[src_index]); - } - else if (val_pre == (eT1) (0) && val_post != (eT2) (0)) - { - dest[dest_index] = val_post / ((eT2) src[src_index]); - } - else - { - // if both are nonzero, we apply sequentially---be careful! 
- dest[dest_index] = val_post / ((eT2) (val_pre / src[src_index])); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre_sve1.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre_sve1.cl deleted file mode 100644 index d19a030..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre_sve1.cl +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_div_scalar_pre_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - if (val_post == (eT2) (0)) - { - // if both are 0, we take it as val_pre == 0 and val_post unused - dest[dest_locs[i + dest_locs_offset] + dest_offset] = (eT2) (val_pre / src[src_locs[i + src_locs_offset] + src_offset]); - } - else if (val_pre == (eT1) (0) && val_post != (eT2) (0)) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] = val_post / ((eT2) src[src_locs[i + src_locs_offset] + src_offset]); - } - else - { - // if both are nonzero, we apply sequentially---be careful! - dest[dest_locs[i + dest_locs_offset] + dest_offset] = val_post / ((eT2) (val_pre / src[src_locs[i + src_locs_offset] + src_offset])); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre_sve2.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre_sve2.cl deleted file mode 100644 index 444c8fc..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre_sve2.cl +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_div_scalar_pre_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col + src_col_locs_offset]); - - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? 
col : dest_col_locs[col + dest_col_locs_offset]); - - if (val_post == (eT2) (0)) - { - // If both are 0, we take it as val_pre == 0 and val_post unused - dest[dest_loc] = (eT2) (val_pre / src[src_loc]); - } - else if (val_pre == (eT1) (0) && val_post != (eT2) (0)) - { - dest[dest_loc] = val_post / ((eT2) src[src_loc]); - } - else - { - // If both are nonzero, we apply sequentially---be careful! - dest[dest_loc] = val_post / ((eT2) (val_pre / src[src_loc])); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erf_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erf_post.cl deleted file mode 100644 index b823806..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erf_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_erf_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) erf(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erf_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erf_pre.cl deleted file mode 100644 index 6369282..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erf_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_erf_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) erf(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erfc_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erfc_post.cl deleted file mode 100644 index 85bc476..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erfc_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_erfc_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) erfc(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erfc_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erfc_pre.cl deleted file mode 100644 index 0df332a..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erfc_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_erfc_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) erfc(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp10_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp10_post.cl deleted file mode 100644 index 012a4f8..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp10_post.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_exp10_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) ((eT1) exp10((fp_eT1) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp10_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp10_pre.cl deleted file mode 100644 index 4947908..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp10_pre.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_exp10_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) exp10((fp_eT2) ((eT2) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp2_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp2_post.cl deleted file mode 100644 index e2e730d..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp2_post.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_exp2_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) ((eT1) exp2((fp_eT1) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp2_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp2_pre.cl deleted file mode 100644 index eede48d..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp2_pre.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_exp2_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) exp2((fp_eT2) ((eT2) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp_post.cl deleted file mode 100644 index 433f2a5..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_exp_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) ((eT1) exp((fp_eT1) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp_pre.cl deleted file mode 100644 index 8d9c70f..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_exp_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) exp((fp_eT2) ((eT2) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_floor_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_floor_post.cl deleted file mode 100644 index 14b1b03..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_floor_post.cl +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_floor_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - if (COOT_FN(coot_is_fp_,eT1)()) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) floor(val); - } - else - { - dest[dest_index] = (eT2) src[src_index]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_floor_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_floor_pre.cl deleted file mode 100644 index 4aa7e36..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_floor_pre.cl +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_floor_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - if (COOT_FN(coot_is_fp_,eT2)()) - { - const fp_eT2 val = (fp_eT2) src[src_index]; - dest[dest_index] = (eT2) floor(val); - } - else - { - dest[dest_index] = (eT2) src[src_index]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_lgamma_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_lgamma_post.cl deleted file mode 100644 index b0b9050..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_lgamma_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_lgamma_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) lgamma(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_lgamma_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_lgamma_pre.cl deleted file mode 100644 index e899f12..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_lgamma_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_lgamma_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) lgamma(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log10_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log10_post.cl deleted file mode 100644 index 78a981e..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log10_post.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_log10_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) ((eT1) log10((fp_eT1) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log10_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log10_pre.cl deleted file mode 100644 index 15ed5f2..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log10_pre.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_log10_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) log10((fp_eT2) ((eT2) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log2_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log2_post.cl deleted file mode 100644 index 6b0eee0..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log2_post.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_log2_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) ((eT1) log2((fp_eT1) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log2_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log2_pre.cl deleted file mode 100644 index f92a105..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log2_pre.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_log2_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) log2((fp_eT2) ((eT2) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log_post.cl deleted file mode 100644 index 3580e96..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_log_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) ((eT1) log((fp_eT1) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log_pre.cl deleted file mode 100644 index 9d1223f..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_log_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) log((fp_eT2) ((eT2) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_max_array_cube.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_max_array_cube.cl deleted file mode 100644 index 0107550..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_max_array_cube.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_max_array_cube)(__global eT2* dest, - const UWORD dest_offset, - __global const eT2* src_A, - const UWORD src_A_offset, - __global const eT1* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = max(src_A[src_A_index], ((eT2) src_B[src_B_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_min_array_cube.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_min_array_cube.cl deleted file mode 100644 index 963b574..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_min_array_cube.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the 
"License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_min_array_cube)(__global eT2* dest, - const UWORD dest_offset, - __global const eT2* src_A, - const UWORD src_A_offset, - __global const eT1* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = min(src_A[src_A_index], ((eT2) src_B[src_B_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post.cl deleted file mode 100644 index b0cef78..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post.cl +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2017 Conrad 
Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_scalar_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = ((eT2) (src[src_index] - val_pre)) - val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post_sve1.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post_sve1.cl deleted file mode 100644 index dc79c2a..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post_sve1.cl +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) 
-// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_scalar_post_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] = ((eT2) (src[src_locs[i + src_locs_offset] + src_offset] - val_pre)) - val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post_sve2.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post_sve2.cl deleted file mode 100644 index 8ed24e8..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post_sve2.cl +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_scalar_post_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col + src_col_locs_offset]); - - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? 
col : dest_col_locs[col + dest_col_locs_offset]); - - dest[dest_loc] = ((eT2) (src[src_loc] - val_pre)) - val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post.cl deleted file mode 100644 index 0b2cd8a..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) val_post; - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) (val_pre - src[src_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post_sve1.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post_sve1.cl deleted file mode 100644 index cd55c0d..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post_sve1.cl +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_post_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - (void) val_post; - - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] = (eT2) (val_pre - src[src_locs[i + src_locs_offset] + src_offset]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post_sve2.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post_sve2.cl deleted file mode 100644 index 1326c28..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post_sve2.cl +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_post_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col + src_col_locs_offset]); - - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - dest[dest_loc] = (eT2) (val_pre - src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre.cl deleted file mode 100644 index e66edaf..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) val_pre; - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = val_post - ((eT2) (src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre_sve1.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre_sve1.cl deleted file mode 100644 index 0188d16..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre_sve1.cl +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_pre_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - (void) val_pre; - - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] = val_post - ((eT2) (src[src_locs[i + src_locs_offset] + src_offset])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre_sve2.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre_sve2.cl deleted file mode 100644 index ad1e83f..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre_sve2.cl +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_pre_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col + src_col_locs_offset]); - - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? 
col : dest_col_locs[col + dest_col_locs_offset]); - - dest[dest_loc] = val_post - ((eT2) (src[src_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mod_scalar.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mod_scalar.cl deleted file mode 100644 index a73ce37..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mod_scalar.cl +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_mod_scalar)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD src_index = row + col * src_M_n_rows + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset; - - if (row < n_rows && col < n_cols) - { - // For an integer type, the casts end up doing nothing. 
- uint_eT1 val = ((uint_eT1) src[src_index]) % ((uint_eT1) val_pre); - dest[dest_index] = (eT2) (((uint_eT2) val) % ((uint_eT2) val_post)); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar.cl deleted file mode 100644 index 818212e..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar.cl +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_mul_scalar)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = ((eT2) (src[src_index] * val_pre)) * val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar_sve1.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar_sve1.cl deleted file mode 100644 index 5b022a4..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar_sve1.cl +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_mul_scalar_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] = ((eT2) (src[src_locs[i + src_locs_offset] + src_offset] * val_pre)) * val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar_sve2.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar_sve2.cl deleted file mode 100644 index cbd9ca5..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar_sve2.cl +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_mul_scalar_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col + src_col_locs_offset]); - - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - dest[dest_loc] = ((eT2) (src[src_loc] * val_pre)) * val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_neg_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_neg_post.cl deleted file mode 100644 index 8d2710e..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_neg_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_neg_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = ((eT2) -src[src_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_neg_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_neg_pre.cl deleted file mode 100644 index abe43fe..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_neg_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_neg_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = -((eT2) src[src_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar.cl deleted file mode 100644 index 6f382ba..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar.cl +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_plus_scalar)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = ((eT2) (src[src_index] + val_pre)) + val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar_sve1.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar_sve1.cl deleted file mode 100644 index 3bec2dd..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar_sve1.cl +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_plus_scalar_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] = ((eT2) (src[src_locs[i + src_locs_offset] + src_offset] + val_pre)) + val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar_sve2.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar_sve2.cl deleted file mode 100644 index 6afdb65..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar_sve2.cl +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_plus_scalar_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col + src_col_locs_offset]); - - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - dest[dest_loc] = ((eT2) (src[src_loc] + val_pre)) + val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_pow_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_pow_post.cl deleted file mode 100644 index 9d98f9a..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_pow_post.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_pow_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) pow(val, (fp_eT1) val_pre); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_pow_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_pow_pre.cl deleted file mode 100644 index e26a831..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_pow_pre.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_pow_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) pow(val, (fp_eT2) val_post); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_round_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_round_post.cl deleted file mode 100644 index 7e4413e..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_round_post.cl +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_round_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - if (COOT_FN(coot_is_fp_,eT1)()) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) round(val); - } - else - { - dest[dest_index] = (eT2) src[src_index]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_round_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_round_pre.cl deleted file mode 100644 index a5d715c..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_round_pre.cl +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with 
the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_round_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - if (COOT_FN(coot_is_fp_,eT2)()) - { - const fp_eT2 val = (fp_eT2) src[src_index]; - dest[dest_index] = (eT2) round(val); - } - else - { - dest[dest_index] = (eT2) src[src_index]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sign_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sign_post.cl deleted file mode 100644 index 6721792..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sign_post.cl +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in 
compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_sign_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = (eT1) src[src_index]; - if (val > (eT1) 0) - { - dest[dest_index] = (eT2) 1; - } - else if (val == (eT1) 0) - { - dest[dest_index] = (eT2) 0; - } - else - { - dest[dest_index] = (eT2) -1; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sign_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sign_pre.cl deleted file mode 100644 index 6223859..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sign_pre.cl +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// 
you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_sign_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = (eT2) src[src_index]; - if (val > (eT2) 0) - { - dest[dest_index] = (eT2) 1; - } - else if (val == (eT2) 0) - { - dest[dest_index] = (eT2) 0; - } - else - { - dest[dest_index] = (eT2) -1; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sin_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sin_post.cl deleted file mode 100644 index 10a2f6b..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sin_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, 
Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_sin_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) sin(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sin_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sin_pre.cl deleted file mode 100644 index f16ca0f..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sin_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the 
License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_sin_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) sin(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinc_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinc_post.cl deleted file mode 100644 index d576c7d..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinc_post.cl +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_sinc_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = (eT1) src[src_index]; - // To imitate Armadillo correctly, we use double if the type is not floating point. - if (COOT_FN(coot_is_fp_,eT1)()) - { - const fp_eT1 tmp = val * M_PI; - dest[dest_index] = (tmp == (eT1) 0.0) ? (eT2) 1.0 : (eT2) (sin(tmp) / tmp); - } - else - { - const ARMA_FP_TYPE fp_val = (ARMA_FP_TYPE) val; - const ARMA_FP_TYPE tmp = fp_val * M_PI; - dest[dest_index] = (tmp == 0.0) ? 
(eT2) 1.0 : (eT2) (sin(tmp) / tmp); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinc_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinc_pre.cl deleted file mode 100644 index 707254d..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinc_pre.cl +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_sinc_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = (eT2) src[src_index]; - // To imitate Armadillo correctly, we use double if the type is not floating point. 
- if (COOT_FN(coot_is_fp_,eT2)()) - { - const fp_eT2 tmp = val * M_PI; - dest[dest_index] = (tmp == (eT2) 0.0) ? (eT2) 1.0 : (eT2) (sin(tmp) / tmp); - } - else - { - const ARMA_FP_TYPE fp_val = (ARMA_FP_TYPE) val; - const ARMA_FP_TYPE tmp = fp_val * M_PI; - dest[dest_index] = (tmp == 0.0) ? (eT2) 1.0 : (eT2) (sin(tmp) / tmp); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinh_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinh_post.cl deleted file mode 100644 index 46c297a..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinh_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_sinh_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) sinh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinh_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinh_pre.cl deleted file mode 100644 index f01a83d..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinh_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_sinh_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) sinh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sqrt_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sqrt_post.cl deleted file mode 100644 index 0951351..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sqrt_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_sqrt_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) ((eT1) sqrt((fp_eT1) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sqrt_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sqrt_pre.cl deleted file mode 100644 index 71ea604..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sqrt_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_sqrt_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) sqrt((fp_eT2) ((eT2) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_square_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_square_post.cl deleted file mode 100644 index 6def56f..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_square_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_square_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) (src[src_index] * src[src_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_square_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_square_pre.cl deleted file mode 100644 index edb6936..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_square_pre.cl +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_square_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = (eT2) src[src_index]; - dest[dest_index] = val * val; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tan_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tan_post.cl deleted file mode 100644 index f9e8d0b..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tan_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_tan_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) tan(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tan_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tan_pre.cl deleted file mode 100644 index ccba133..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tan_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_tan_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) tan(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tanh_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tanh_post.cl deleted file mode 100644 index 487e9cf..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tanh_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_tanh_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) tanh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tanh_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tanh_pre.cl deleted file mode 100644 index 29419b2..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tanh_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_tanh_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) tanh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_exp_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_exp_post.cl deleted file mode 100644 index 4a50a3a..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_exp_post.cl +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_trunc_exp_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - // To match Armadillo, we always use `double` as the intermediate type for any non-floating point type. 
- const eT1 val = src[src_index]; - if (COOT_FN(coot_is_fp_,eT1)()) - { - const fp_eT1 fp_val = (fp_eT1) val; - if (fp_val >= log(COOT_FN(coot_type_max_,fp_eT1)())) - { - dest[dest_index] = (eT2) ((eT1) COOT_FN(coot_type_max_,fp_eT1)()); - } - else - { - dest[dest_index] = (eT2) ((eT1) exp(fp_val)); - } - } - else - { - const ARMA_FP_TYPE fp_val = (ARMA_FP_TYPE) val; - if (fp_val >= log(ARMA_FP_MAX)) - { - dest[dest_index] = (eT2) ((eT1) ARMA_FP_MAX); - } - else - { - dest[dest_index] = (eT2) ((eT1) exp(fp_val)); - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_exp_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_exp_pre.cl deleted file mode 100644 index ffd470d..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_exp_pre.cl +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_trunc_exp_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - // To imitate Armadillo's behavior exactly, if the type is not floating-point, we convert to double. - const eT2 val = (eT2) src[src_index]; - if (COOT_FN(coot_is_fp_,eT2)()) - { - const fp_eT2 fp_val = (fp_eT2) val; - if (fp_val >= log(COOT_FN(coot_type_max_,fp_eT2)())) - { - dest[dest_index] = (eT2) COOT_FN(coot_type_max_,fp_eT2)(); - } - else - { - dest[dest_index] = (eT2) exp(fp_val); - } - } - else - { - const ARMA_FP_TYPE fp_val = (ARMA_FP_TYPE) val; - if (fp_val >= log(ARMA_FP_MAX)) - { - dest[dest_index] = (eT2) ARMA_FP_MAX; - } - else - { - dest[dest_index] = (eT2) exp(fp_val); - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_log_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_log_post.cl deleted file mode 100644 index 1bfc97f..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_log_post.cl +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in 
compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_trunc_log_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - if (COOT_FN(coot_is_fp_,eT1)()) - { - const fp_eT1 fp_val = (fp_eT1) val; - if (fp_val <= (fp_eT1) 0) - { - dest[dest_index] = (eT2) log(COOT_FN(coot_type_minpos_,fp_eT1)()); - } - else if (isinf(fp_val)) - { - dest[dest_index] = (eT2) log(COOT_FN(coot_type_max_,fp_eT1)()); - } - else - { - dest[dest_index] = (eT2) ((eT1) log(fp_val)); - } - } - else - { - const ARMA_FP_TYPE fp_val = (ARMA_FP_TYPE) val; - if (fp_val <= (ARMA_FP_TYPE) 0) - { - dest[dest_index] = (eT2) log(ARMA_FP_MIN); - } - else if (isinf(fp_val)) - { - dest[dest_index] = (eT2) log(ARMA_FP_MAX); - } - else - { - dest[dest_index] = (eT2) 
((eT1) log(fp_val)); - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_log_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_log_pre.cl deleted file mode 100644 index ac92d12..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_log_pre.cl +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_trunc_log_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - // To match Armadillo, we always use `double` as the intermediate type for any non-floating point type. 
- const eT2 val = (eT2) src[src_index]; - if (COOT_FN(coot_is_fp_,eT2)()) - { - const fp_eT2 fp_val = (fp_eT2) val; - if (fp_val <= (fp_eT2) 0) - { - dest[dest_index] = (eT2) log(COOT_FN(coot_type_minpos_,fp_eT2)()); - } - else if (isinf(fp_val)) - { - dest[dest_index] = (eT2) log(COOT_FN(coot_type_max_,fp_eT2)()); - } - else - { - dest[dest_index] = (eT2) log(fp_val); - } - } - else - { - const ARMA_FP_TYPE fp_val = (ARMA_FP_TYPE) val; - if (fp_val <= (ARMA_FP_TYPE) 0) - { - dest[dest_index] = (eT2) log(ARMA_FP_MIN); - } - else if (isinf(fp_val)) - { - dest[dest_index] = (eT2) log(ARMA_FP_MAX); - } - else - { - dest[dest_index] = (eT2) log(fp_val); - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_post.cl deleted file mode 100644 index c6d77ed..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_post.cl +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_trunc_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - if (COOT_FN(coot_is_fp_,eT1)()) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) trunc(val); - } - else - { - dest[dest_index] = (eT2) src[src_index]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_pre.cl deleted file mode 100644 index 95c2624..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_pre.cl +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_trunc_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - if (COOT_FN(coot_is_fp_,eT2)()) - { - const fp_eT2 val = (fp_eT2) src[src_index]; - dest[dest_index] = (eT2) trunc(val); - } - else - { - dest[dest_index] = (eT2) src[src_index]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/extract_sve1.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/extract_sve1.cl deleted file mode 100644 index 7a44609..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/extract_sve1.cl +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,extract_sve1)(__global eT2* out_mem, - const UWORD out_mem_offset, - __global const eT1* in_mem, - const UWORD in_mem_offset, - __global const UWORD* in_locs, - const UWORD in_locs_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - out_mem[i + out_mem_offset] = (eT2) in_mem[in_locs[i + in_locs_offset] + in_mem_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/extract_sve2.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/extract_sve2.cl deleted file mode 100644 index 908a292..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/extract_sve2.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,extract_sve2)(__global eT2* out_mem, - const UWORD out_mem_offset, - __global const eT1* in_mem, - const UWORD in_mem_offset, - __global const UWORD* in_row_locs, - const UWORD in_row_locs_offset, - __global const UWORD* in_col_locs, - const UWORD in_col_locs_offset, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD out_n_rows, - const UWORD in_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD in_loc = in_mem_offset + - ((in_row_locs == NULL) ? row : in_row_locs[row + in_row_locs_offset]) + - in_n_rows * ((in_col_locs == NULL) ? col : in_col_locs[col + in_col_locs_offset]); - - const UWORD out_loc = out_mem_offset + row + out_n_rows * col; - - out_mem[out_loc] = (eT2) in_mem[in_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_and_array.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_and_array.cl deleted file mode 100644 index 6e47aaf..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_and_array.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_and_array)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - __global const eT2* Y, - const UWORD Y_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT2 val2 = Y[Y_offset + i]; - out[out_offset + i] = (val1 && val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_eq_array.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_eq_array.cl deleted file mode 100644 index db6e3cd..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_eq_array.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_eq_array)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - __global const eT2* Y, - const UWORD Y_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT2 val2 = Y[Y_offset + i]; - out[out_offset + i] = (val1 == val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_eq_scalar.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_eq_scalar.cl deleted file mode 100644 index c03a8ee..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_eq_scalar.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_eq_scalar)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, // will be casted to eT2 before comparison - const UWORD X_offset, - const UWORD n_elem, - const eT2 val) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - out[out_offset + i] = (val1 == val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gt_array.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gt_array.cl deleted file mode 100644 index ce94343..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gt_array.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_gt_array)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - __global const eT2* Y, - const UWORD Y_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT2 val2 = Y[Y_offset + i]; - out[out_offset + i] = (val1 > val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gt_scalar.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gt_scalar.cl deleted file mode 100644 index 62f6e00..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gt_scalar.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_gt_scalar)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, // will be casted to eT2 before comparison - const UWORD X_offset, - const UWORD n_elem, - const eT2 val) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - out[out_offset + i] = (val1 > val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gteq_array.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gteq_array.cl deleted file mode 100644 index c045833..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gteq_array.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_gteq_array)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - __global const eT2* Y, - const UWORD Y_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT2 val2 = Y[Y_offset + i]; - out[out_offset + i] = (val1 >= val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gteq_scalar.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gteq_scalar.cl deleted file mode 100644 index d4b3642..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gteq_scalar.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_gteq_scalar)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, // will be casted to eT2 before comparison - const UWORD X_offset, - const UWORD n_elem, - const eT2 val) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - out[out_offset + i] = (val1 >= val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lt_array.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lt_array.cl deleted file mode 100644 index 96452ce..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lt_array.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_lt_array)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - __global const eT2* Y, - const UWORD Y_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT2 val2 = Y[Y_offset + i]; - out[out_offset + i] = (val1 < val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lt_scalar.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lt_scalar.cl deleted file mode 100644 index 5602adf..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lt_scalar.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_lt_scalar)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, // will be casted to eT2 before comparison - const UWORD X_offset, - const UWORD n_elem, - const eT2 val) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - out[out_offset + i] = (val1 < val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lteq_array.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lteq_array.cl deleted file mode 100644 index 6e1b121..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lteq_array.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_lteq_array)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - __global const eT2* Y, - const UWORD Y_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT2 val2 = Y[Y_offset + i]; - out[out_offset + i] = (val1 <= val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lteq_scalar.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lteq_scalar.cl deleted file mode 100644 index 555597f..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lteq_scalar.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_lteq_scalar)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, // will be casted to eT2 before comparison - const UWORD X_offset, - const UWORD n_elem, - const eT2 val) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - out[out_offset + i] = (val1 <= val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_neq_array.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_neq_array.cl deleted file mode 100644 index b3f4480..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_neq_array.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_neq_array)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - __global const eT2* Y, - const UWORD Y_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT2 val2 = Y[Y_offset + i]; - out[out_offset + i] = (val1 != val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_neq_scalar.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_neq_scalar.cl deleted file mode 100644 index 0a7c0db..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_neq_scalar.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_neq_scalar)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, // will be casted to eT2 before comparison - const UWORD X_offset, - const UWORD n_elem, - const eT2 val) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - out[out_offset + i] = (val1 != val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_or_array.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_or_array.cl deleted file mode 100644 index 1493265..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_or_array.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_or_array)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - __global const eT2* Y, - const UWORD Y_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT2 val2 = Y[Y_offset + i]; - out[out_offset + i] = (val1 || val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/replace.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/replace.cl deleted file mode 100644 index a5aed9d..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/replace.cl +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,replace)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_find, - const eT1 val_replace, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - if (COOT_FN(coot_isnan_,eT1)(val_find)) - { - // We are searching for a NaN so the check is a little different. - dest[dest_index] = (eT2) (COOT_FN(coot_isnan_,eT1)(val) ? val_replace : val); - } - else - { - // No special handling needed. - dest[dest_index] = (eT2) ((val == val_find) ? 
val_replace : val); - } - } - } From 14a19c5320bb1009d4946bf12def6c650ffa9de3 Mon Sep 17 00:00:00 2001 From: "james.balamuta@gmail.com" Date: Fri, 24 Apr 2026 00:06:03 -0500 Subject: [PATCH 27/31] Bump outdated GitHub Actions to latest majors --- .github/workflows/R-CMD-check.yaml | 2 +- .github/workflows/upstream-update.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 15538b6..2a6ea15 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -28,7 +28,7 @@ jobs: INTEL_OPENCL_URL: "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b6dccdb7-b503-41ea-bd4b-a78e9c2d8dd6/w_opencl_runtime_p_2025.1.0.972.exe" steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - uses: r-lib/actions/setup-pandoc@v2 diff --git a/.github/workflows/upstream-update.yml b/.github/workflows/upstream-update.yml index 1e8e047..c76c211 100644 --- a/.github/workflows/upstream-update.yml +++ b/.github/workflows/upstream-update.yml @@ -11,7 +11,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v6 - name: Set up GitLab API access id: setup @@ -160,7 +160,7 @@ jobs: - name: Create Pull Request if: steps.check-update.outputs.update_needed == 'true' - uses: peter-evans/create-pull-request@v7 + uses: peter-evans/create-pull-request@v8 with: token: ${{ secrets.GITHUB_TOKEN }} commit-message: "Update to Bandicoot release ${{ steps.get-version.outputs.version }}" From b97f60801772a7c305511083f4f289bcc24bbe47 Mon Sep 17 00:00:00 2001 From: "james.balamuta@gmail.com" Date: Fri, 24 Apr 2026 00:21:40 -0500 Subject: [PATCH 28/31] Silence upstream bandicoot warnings under MinGW/GCC 14 Adds -Wno-reorder and -Wno-address to PKG_CXXFLAGS on Windows to suppress cosmetic -Wreorder noise in opencl/runtime_bones.hpp and the -Waddress flood GCC 14 emits against bandicoot's compile-time string metaprogramming in 
kernel_gen/array_util.hpp. Both are upstream issues with no correctness impact. --- src/Makevars.win.in | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/Makevars.win.in b/src/Makevars.win.in index ccdda30..6831c49 100644 --- a/src/Makevars.win.in +++ b/src/Makevars.win.in @@ -6,7 +6,14 @@ PKG_CPPFLAGS = -I../inst/include -DCOOT_TARGET_OPENCL_VERSION=@OPENCL_TARGET_VERSION@ -DCOOT_KERNEL_SOURCE_DIR=\"@BANDICOOT_KERNELS_DIR@\" ## Compiler flags from configure.win -PKG_CXXFLAGS = @BANDICOOT_CXXFLAGS@ +## +## Silence two upstream bandicoot warning classes on MinGW/GCC 14: +## -Wno-reorder: field-init order mismatches in opencl/runtime_bones.hpp +## (cosmetic; upstream issue, no correctness impact) +## -Wno-address: GCC 14's -Waddress false-positives against bandicoot's +## compile-time string metaprogramming where `Class::len` +## is a static const member, not a function call +PKG_CXXFLAGS = @BANDICOOT_CXXFLAGS@ -Wno-reorder -Wno-address ## Linker flags from configure.win PKG_LIBS = @OPENMP_CXXFLAGS@ @BANDICOOT_LIBS@ From 23e7db7dc83385dc1a62782fe836c08869afd275 Mon Sep 17 00:00:00 2001 From: "james.balamuta@gmail.com" Date: Fri, 24 Apr 2026 00:43:18 -0500 Subject: [PATCH 29/31] Patch bundled bandicoot headers to fix GCC 14 warnings; drop Makevars suppressions Applies four local patches to the embedded bandicoot 4.0.0 headers to eliminate -Wreorder, -Wmaybe-uninitialized, -Waddress, and -Wunused-local-typedefs warnings at the source, then removes the -Wno-reorder/-Wno-address flags from Makevars.win.in that CRAN flagged as non-portable. 
Candidate upstream fixes: opencl/runtime_meat.hpp reorder val64/val32 mem-initializers to match declaration order (copy + move ctors) opencl/runtime_bones.hpp default-initialize adapt_uword members so the branch not taken by the primary ctor is no longer formally uninitialized kernel_gen/array_util.hpp replace decltype(T::len, void()) SFINAE check with decltype((void)T::len) to avoid GCC 14's -Waddress false-positives mtglue_mixed_meat.hpp drop unused in_eT1/in_eT2 typedefs in mtglue_mixed_times::apply --- inst/include/bandicoot_bits/kernel_gen/array_util.hpp | 2 +- inst/include/bandicoot_bits/mtglue_mixed_meat.hpp | 3 --- inst/include/bandicoot_bits/opencl/runtime_bones.hpp | 8 ++++---- inst/include/bandicoot_bits/opencl/runtime_meat.hpp | 4 ++-- src/Makevars.win.in | 9 +-------- 5 files changed, 8 insertions(+), 18 deletions(-) diff --git a/inst/include/bandicoot_bits/kernel_gen/array_util.hpp b/inst/include/bandicoot_bits/kernel_gen/array_util.hpp index d21e684..2efc1e1 100644 --- a/inst/include/bandicoot_bits/kernel_gen/array_util.hpp +++ b/inst/include/bandicoot_bits/kernel_gen/array_util.hpp @@ -75,7 +75,7 @@ struct has_len_member }; template -struct has_len_member +struct has_len_member { static const bool value = true; }; diff --git a/inst/include/bandicoot_bits/mtglue_mixed_meat.hpp b/inst/include/bandicoot_bits/mtglue_mixed_meat.hpp index c1e4328..e52f4cc 100644 --- a/inst/include/bandicoot_bits/mtglue_mixed_meat.hpp +++ b/inst/include/bandicoot_bits/mtglue_mixed_meat.hpp @@ -31,9 +31,6 @@ mtglue_mixed_times::apply(Mat& out, const mtGlue> tmp1(mtOp(X.A)); const partial_unwrap> tmp2(mtOp(X.B)); diff --git a/inst/include/bandicoot_bits/opencl/runtime_bones.hpp b/inst/include/bandicoot_bits/opencl/runtime_bones.hpp index 67b4845..4aad7b1 100644 --- a/inst/include/bandicoot_bits/opencl/runtime_bones.hpp +++ b/inst/include/bandicoot_bits/opencl/runtime_bones.hpp @@ -271,10 +271,10 @@ class runtime_t::adapt_uword { public: - coot_aligned size_t size; - 
coot_aligned void* addr; - coot_aligned u64 val64; - coot_aligned u32 val32; + coot_aligned size_t size = 0; + coot_aligned void* addr = nullptr; + coot_aligned u64 val64 = 0; + coot_aligned u32 val32 = 0; inline adapt_uword(const uword val = 0); // default value needed for allocating several at once diff --git a/inst/include/bandicoot_bits/opencl/runtime_meat.hpp b/inst/include/bandicoot_bits/opencl/runtime_meat.hpp index 1190a0f..df18955 100644 --- a/inst/include/bandicoot_bits/opencl/runtime_meat.hpp +++ b/inst/include/bandicoot_bits/opencl/runtime_meat.hpp @@ -1702,8 +1702,8 @@ runtime_t::adapt_uword::adapt_uword(const uword val) inline runtime_t::adapt_uword::adapt_uword(const runtime_t::adapt_uword& other) : size(other.size) - , val32(other.val32) , val64(other.val64) + , val32(other.val32) { if (other.addr == &other.val32) { @@ -1720,8 +1720,8 @@ runtime_t::adapt_uword::adapt_uword(const runtime_t::adapt_uword& other) inline runtime_t::adapt_uword::adapt_uword(runtime_t::adapt_uword&& other) : size(other.size) - , val32(other.val32) , val64(other.val64) + , val32(other.val32) { if (other.addr == &other.val32) { diff --git a/src/Makevars.win.in b/src/Makevars.win.in index 6831c49..ccdda30 100644 --- a/src/Makevars.win.in +++ b/src/Makevars.win.in @@ -6,14 +6,7 @@ PKG_CPPFLAGS = -I../inst/include -DCOOT_TARGET_OPENCL_VERSION=@OPENCL_TARGET_VERSION@ -DCOOT_KERNEL_SOURCE_DIR=\"@BANDICOOT_KERNELS_DIR@\" ## Compiler flags from configure.win -## -## Silence two upstream bandicoot warning classes on MinGW/GCC 14: -## -Wno-reorder: field-init order mismatches in opencl/runtime_bones.hpp -## (cosmetic; upstream issue, no correctness impact) -## -Wno-address: GCC 14's -Waddress false-positives against bandicoot's -## compile-time string metaprogramming where `Class::len` -## is a static const member, not a function call -PKG_CXXFLAGS = @BANDICOOT_CXXFLAGS@ -Wno-reorder -Wno-address +PKG_CXXFLAGS = @BANDICOOT_CXXFLAGS@ ## Linker flags from configure.win PKG_LIBS = 
@OPENMP_CXXFLAGS@ @BANDICOOT_LIBS@ From d999819a5ae830a800e07873359083c449dfe867 Mon Sep 17 00:00:00 2001 From: "james.balamuta@gmail.com" Date: Fri, 24 Apr 2026 01:01:08 -0500 Subject: [PATCH 30/31] Restore exec bit on configure/cleanup during Windows CI actions/checkout drops git's 100755 mode on NTFS, so R CMD build warns "did not have execute permissions: corrected" on every run. A chmod +x via MSYS bash restores the bit in a form R's tar picks up. --- .github/workflows/R-CMD-check.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 2a6ea15..9f91c7f 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -30,6 +30,14 @@ jobs: steps: - uses: actions/checkout@v6 + - name: Restore execute bit on shell scripts (Windows) + if: runner.os == 'Windows' + shell: bash + # NTFS drops the git-stored +x bit on checkout, so R CMD build + # otherwise warns "did not have execute permissions: corrected" + # every run. chmod via MSYS bash lands in a form R's tar sees. + run: chmod +x configure cleanup + - uses: r-lib/actions/setup-pandoc@v2 - uses: r-lib/actions/setup-r@v2 From f0df64a74a33ef9f907148724ac59fe3c2e525f8 Mon Sep 17 00:00:00 2001 From: "james.balamuta@gmail.com" Date: Fri, 24 Apr 2026 01:07:40 -0500 Subject: [PATCH 31/31] Re-enable macOS and Linux runners in R-CMD-check matrix --- .github/workflows/R-CMD-check.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 9f91c7f..bc8b6a0 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -18,8 +18,8 @@ jobs: fail-fast: false matrix: config: - #- {os: macos-latest, r: 'release'} - #- {os: ubuntu-latest, r: 'release'} + - {os: macos-latest, r: 'release'} + - {os: ubuntu-latest, r: 'release'} - {os: windows-latest, r: 'release'} env: