diff --git a/.Rbuildignore b/.Rbuildignore index 1d20737..b18f2ed 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -6,3 +6,5 @@ ^autom4te\.cache$ ^config\.log$ ^config\.status$ +^\.gitattributes$ +^install\.log$ diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..c6bc410 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,6 @@ +# Ensure shell scripts and configure files always use LF line endings +configure text eol=lf +configure.ac text eol=lf +configure.win text eol=lf +cleanup text eol=lf +*.sh text eol=lf diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 2ff3f6f..bc8b6a0 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -19,14 +19,24 @@ jobs: matrix: config: - {os: macos-latest, r: 'release'} - - {os: ubuntu-latest, r: 'release'} + - {os: ubuntu-latest, r: 'release'} + - {os: windows-latest, r: 'release'} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} R_KEEP_PKG_SOURCE: yes + INTEL_OPENCL_URL: "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b6dccdb7-b503-41ea-bd4b-a78e9c2d8dd6/w_opencl_runtime_p_2025.1.0.972.exe" steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 + + - name: Restore execute bit on shell scripts (Windows) + if: runner.os == 'Windows' + shell: bash + # NTFS drops the git-stored +x bit on checkout, so R CMD build + # otherwise warns "did not have execute permissions: corrected" + # every run. chmod via MSYS bash lands in a form R's tar sees. 
+ run: chmod +x configure cleanup - uses: r-lib/actions/setup-pandoc@v2 @@ -38,12 +48,77 @@ jobs: - name: Install clblast via Homebrew (macOS) if: runner.os == 'macOS' - run: brew install clblast clblas + run: brew install clblast clblas clinfo - name: Install clblast via apt (Ubuntu) if: runner.os == 'Linux' run: sudo apt-get update && sudo apt-get install -y opencl-headers ocl-icd-opencl-dev libclblast-dev + - name: Cache Intel OpenCL Runtime + if: runner.os == 'Windows' + id: cache-opencl-win + uses: actions/cache@v5 + with: + # Default install location is x86 ... + path: C:\Program Files (x86)\Common Files\Intel\Shared Libraries + # Key is derived from INTEL_OPENCL_URL, so bumping the runtime + # URL above automatically invalidates the cache. + key: intel-opencl-runtime-${{ env.INTEL_OPENCL_URL }} + + - name: Install OpenCL SDK (Windows) + if: runner.os == 'Windows' + shell: pwsh + run: | + vcpkg install opencl:x64-windows clblast:x64-windows + # Convert to forward slashes to avoid escaping issues in configure.win + $vcpkgPath = "$env:VCPKG_INSTALLATION_ROOT/installed/x64-windows" -replace '\\','/' + + # Register OpenCL location + echo "OPENCL_CPPFLAGS=-I$vcpkgPath/include" >> $env:GITHUB_ENV + echo "OPENCL_LIBS=-L$vcpkgPath/lib -lOpenCL" >> $env:GITHUB_ENV + + # Register CLBlast location + echo "CLBLAST_CPPFLAGS=-I$vcpkgPath/include" >> $env:GITHUB_ENV + echo "CLBLAST_LIBS=-L$vcpkgPath/lib -lclblast" >> $env:GITHUB_ENV + + # Add vcpkg bin to PATH so DLLs can be found at runtime + echo "$env:VCPKG_INSTALLATION_ROOT\installed\x64-windows\bin" >> $env:GITHUB_PATH + + - name: Install Intel CPU Runtime for OpenCL (Windows) + if: runner.os == 'Windows' && steps.cache-opencl-win.outputs.cache-hit != 'true' + shell: pwsh + run: | + # Download + curl -o opencl-installer.exe "${{ env.INTEL_OPENCL_URL }}" + + # Extract MSI from the self-extracting exe + $proc = Start-Process "./opencl-installer.exe" "-s -x -f extracted" -NoNewWindow -PassThru + $proc.WaitForExit() + + # 
Install via msiexec + $msi = Get-ChildItem ./extracted/*.msi | Select-Object -First 1 + # Write the MSI log outside the repo so it doesn't end up in the + # package tarball (R CMD check flags it as a non-standard top-level file). + $logPath = "$env:RUNNER_TEMP\intel-opencl-install.log" + $proc = Start-Process "msiexec" "/i `"$msi`" /qn /l*! `"$logPath`"" -NoNewWindow -PassThru + $proc.WaitForExit() + + if ($proc.ExitCode -ne 0) { + Get-Content $logPath + exit $proc.ExitCode + } + + Remove-Item -Recurse -Force extracted, opencl-installer.exe + + - name: Register Intel OpenCL ICD (Windows) + if: runner.os == 'Windows' + shell: pwsh + run: | + $dllPath = "C:\Program Files (x86)\Common Files\Intel\Shared Libraries\bin\OpenCL.dll" + REG ADD "HKLM\SOFTWARE\Khronos\OpenCL\Vendors" /v $dllPath /t REG_DWORD /d 0 /f + REG ADD "HKLM\SOFTWARE\Wow6432Node\Khronos\OpenCL\Vendors" /v $dllPath /t REG_DWORD /d 0 /f + Add-Content $env:GITHUB_PATH "C:\Program Files (x86)\Common Files\Intel\Shared Libraries\bin\" + - uses: r-lib/actions/setup-r-dependencies@v2 with: extra-packages: any::rcmdcheck diff --git a/.github/workflows/upstream-update.yml b/.github/workflows/upstream-update.yml index 236b0e7..c76c211 100644 --- a/.github/workflows/upstream-update.yml +++ b/.github/workflows/upstream-update.yml @@ -11,7 +11,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v6 - name: Set up GitLab API access id: setup @@ -96,6 +96,12 @@ jobs: if [ -d "$BANDICOOT_DIR/include" ]; then echo "Found include directory at $BANDICOOT_DIR/include" + # Wipe existing bandicoot_bits/ so we don't mix old and new content. + # Previously the mv below nested kernels/ inside a pre-existing ks/, + # producing ks/kernels/ paths that exceeded CRAN's 100-byte limit + # and left stale kernel sources behind. 
+ rm -rf inst/include/bandicoot_bits inst/include/bandicoot + # Create destination directory if it doesn't exist mkdir -p inst/include @@ -154,7 +160,7 @@ jobs: - name: Create Pull Request if: steps.check-update.outputs.update_needed == 'true' - uses: peter-evans/create-pull-request@v7 + uses: peter-evans/create-pull-request@v8 with: token: ${{ secrets.GITHUB_TOKEN }} commit-message: "Update to Bandicoot release ${{ steps.get-version.outputs.version }}" diff --git a/configure.win b/configure.win new file mode 100755 index 0000000..353e9b8 --- /dev/null +++ b/configure.win @@ -0,0 +1,180 @@ +#!/bin/sh +## +## RcppBandicoot configure.win +## +## Windows-specific configuration script +## Detects OpenCL and CLBlast from environment variables +## +## Copyright (C) 2023-2025 James Balamuta +## Licensed under GPL-2 or later +## + +echo "Configuring RcppBandicoot for Windows..." + +## Get R_HOME +: ${R_HOME=$(R RHOME)} +if test -z "${R_HOME}"; then + echo "ERROR: Could not determine R_HOME" + exit 1 +fi + +## Default values +BANDICOOT_CXXFLAGS="" +BANDICOOT_LIBS="" +OPENMP_CXXFLAGS='$(SHLIB_OPENMP_CXXFLAGS)' +OPENCL_TARGET_VERSION=300 + +## Kernel source directory (matches configure.ac logic) +BANDICOOT_KERNELS_DIR=$("${R_HOME}/bin/Rscript" -e 'cat(paste(head(.libPaths(),1), "RcppBandicoot", "include", "bandicoot_bits", "ks", "", sep="/"))') + +## Always disable CUDA on Windows (requires manual setup) +BANDICOOT_CXXFLAGS="${BANDICOOT_CXXFLAGS} -DCOOT_DONT_USE_CUDA" + +## OpenCL requires both the headers (OPENCL_CPPFLAGS) and the link flags +## (OPENCL_LIBS). Enabling the backend with only one set would compile but +## fail to link, so gate COOT_USE_OPENCL on having both. 
+if [ -n "${OPENCL_CPPFLAGS}" ] && [ -n "${OPENCL_LIBS}" ]; then
+    echo " OpenCL headers: found (via OPENCL_CPPFLAGS)"
+    echo " OpenCL library: found (via OPENCL_LIBS): ${OPENCL_LIBS}"
+    BANDICOOT_CXXFLAGS="${BANDICOOT_CXXFLAGS} -DCOOT_USE_OPENCL ${OPENCL_CPPFLAGS}"
+    BANDICOOT_LIBS="${BANDICOOT_LIBS} ${OPENCL_LIBS}"
+    HAVE_OPENCL=1
+else
+    echo " OpenCL: not configured (need both OPENCL_CPPFLAGS and OPENCL_LIBS)"
+    [ -z "${OPENCL_CPPFLAGS}" ] && echo " missing OPENCL_CPPFLAGS"
+    [ -z "${OPENCL_LIBS}" ] && echo " missing OPENCL_LIBS"
+    HAVE_OPENCL=0
+fi
+
+## CLBlast needs the OpenCL backend plus BOTH its headers and link flags; headers alone would compile but fail to link
+if [ "${HAVE_OPENCL}" = "1" ] && [ -n "${CLBLAST_CPPFLAGS}" ] && [ -n "${CLBLAST_LIBS}" ]; then
+    echo " CLBlast: found (via CLBLAST_CPPFLAGS)"
+    BANDICOOT_CXXFLAGS="${BANDICOOT_CXXFLAGS} -DCOOT_USE_CLBLAST ${CLBLAST_CPPFLAGS}"
+    HAVE_CLBLAST=1
+else
+    echo " CLBlast: not configured (needs OpenCL plus both CLBLAST_CPPFLAGS and CLBLAST_LIBS), disabling"
+    BANDICOOT_CXXFLAGS="${BANDICOOT_CXXFLAGS} -DCOOT_DONT_USE_CLBLAST"
+    HAVE_CLBLAST=0
+fi
+
+## Link against CLBlast only when the backend was enabled above, so a stray CLBLAST_LIBS cannot leak into the link line
+if [ "${HAVE_CLBLAST}" = "1" ]; then
+    echo " CLBlast library: ${CLBLAST_LIBS}"
+    BANDICOOT_LIBS="${BANDICOOT_LIBS} ${CLBLAST_LIBS}"
+fi
+
+## Always disable clBLAS (CLBlast is preferred)
+BANDICOOT_CXXFLAGS="${BANDICOOT_CXXFLAGS} -DCOOT_DONT_USE_CLBLAS"
+HAVE_CLBLAS=0
+
+## Prepend the OpenMP flags so they come before the backend defines
+BANDICOOT_CXXFLAGS="${OPENMP_CXXFLAGS} ${BANDICOOT_CXXFLAGS}"
+
+## Add R's LAPACK/BLAS (escaped so make, not the shell, expands them)
+BANDICOOT_LIBS="${BANDICOOT_LIBS} \$(LAPACK_LIBS) \$(BLAS_LIBS) \$(FLIBS)"
+
+## Check if we have at least OpenCL
+if [ "${HAVE_OPENCL}" = "0" ]; then
+    echo ""
+    echo "WARNING: No GPU backend detected!"
+    echo ""
+    echo "RcppBandicoot requires OpenCL headers to compile on Windows."
+ echo "Set the following environment variables before building:" + echo "" + echo " OPENCL_CPPFLAGS - Path to OpenCL headers (e.g., -IC:/OpenCL/include)" + echo " OPENCL_LIBS - OpenCL library flags (e.g., -LC:/OpenCL/lib -lOpenCL)" + echo " CLBLAST_CPPFLAGS - Path to CLBlast headers (optional)" + echo " CLBLAST_LIBS - CLBlast library flags (optional)" + echo "" +fi + +echo "" +echo "Configuration Summary:" +echo " OpenCL: ${HAVE_OPENCL}" +echo " CLBlast: ${HAVE_CLBLAST}" +echo " CUDA: disabled (Windows)" +echo " Kernels: ${BANDICOOT_KERNELS_DIR}" +echo "" + +## Generate Makevars.win from template +if [ -f "src/Makevars.win.in" ]; then + sed -e "s|@BANDICOOT_CXXFLAGS@|${BANDICOOT_CXXFLAGS}|g" \ + -e "s|@BANDICOOT_LIBS@|${BANDICOOT_LIBS}|g" \ + -e "s|@OPENMP_CXXFLAGS@|${OPENMP_CXXFLAGS}|g" \ + -e "s|@OPENCL_TARGET_VERSION@|${OPENCL_TARGET_VERSION}|g" \ + -e "s|@BANDICOOT_KERNELS_DIR@|${BANDICOOT_KERNELS_DIR}|g" \ + src/Makevars.win.in > src/Makevars.win + echo "Generated src/Makevars.win" +else + echo "ERROR: src/Makevars.win.in not found!" + exit 1 +fi + +## Verify the generated file +if [ -f "src/Makevars.win" ]; then + echo "Contents of src/Makevars.win:" + cat src/Makevars.win +else + echo "ERROR: Failed to generate src/Makevars.win!" 
+ exit 1 +fi + +## Generate R/flags.R from template +## Set variables for flags.R.in substitution +HAVE_CUDA=0 +HAVE_OPENMP=1 +DEFAULT_BACKEND="CL_BACKEND" +OPENCL_CXXFLAGS_FOR_R="" +OPENCL_LIBS_FOR_R="" +CUDA_CXXFLAGS="" +CUDA_LIBS="" +CLBLAST_CXXFLAGS_FOR_R="" +CLBLAST_LIBS_FOR_R="" +CLBLAS_CXXFLAGS="" +CLBLAS_LIBS="" +LAPACK_BLAS_LIBS="\$(LAPACK_LIBS) \$(BLAS_LIBS) \$(FLIBS)" +CLBLAST_PREFIX="" +CLBLAS_PREFIX="" +CUDA_HOME="" +SDKPATH="" + +## Set OpenCL flags for R if detected +if [ "${HAVE_OPENCL}" = "1" ]; then + OPENCL_CXXFLAGS_FOR_R="-DCOOT_USE_OPENCL ${OPENCL_CPPFLAGS}" + OPENCL_LIBS_FOR_R="${OPENCL_LIBS}" +fi + +## Set CLBlast flags for R if detected +if [ "${HAVE_CLBLAST}" = "1" ]; then + CLBLAST_CXXFLAGS_FOR_R="-DCOOT_USE_CLBLAST ${CLBLAST_CPPFLAGS}" + CLBLAST_LIBS_FOR_R="${CLBLAST_LIBS}" +fi + +if [ -f "R/flags.R.in" ]; then + sed -e "s|@BANDICOOT_CXXFLAGS@|${BANDICOOT_CXXFLAGS}|g" \ + -e "s|@BANDICOOT_LIBS@|${BANDICOOT_LIBS}|g" \ + -e "s|@OPENCL_CXXFLAGS@|${OPENCL_CXXFLAGS_FOR_R}|g" \ + -e "s|@OPENCL_LIBS@|${OPENCL_LIBS_FOR_R}|g" \ + -e "s|@CUDA_CXXFLAGS@|${CUDA_CXXFLAGS}|g" \ + -e "s|@CUDA_LIBS@|${CUDA_LIBS}|g" \ + -e "s|@CLBLAST_CXXFLAGS@|${CLBLAST_CXXFLAGS_FOR_R}|g" \ + -e "s|@CLBLAST_LIBS@|${CLBLAST_LIBS_FOR_R}|g" \ + -e "s|@CLBLAS_CXXFLAGS@|${CLBLAS_CXXFLAGS}|g" \ + -e "s|@CLBLAS_LIBS@|${CLBLAS_LIBS}|g" \ + -e "s|@LAPACK_BLAS_LIBS@|${LAPACK_BLAS_LIBS}|g" \ + -e "s|@HAVE_OPENCL@|${HAVE_OPENCL}|g" \ + -e "s|@HAVE_CUDA@|${HAVE_CUDA}|g" \ + -e "s|@HAVE_CLBLAST@|${HAVE_CLBLAST}|g" \ + -e "s|@HAVE_CLBLAS@|${HAVE_CLBLAS}|g" \ + -e "s|@HAVE_OPENMP@|${HAVE_OPENMP}|g" \ + -e "s|@DEFAULT_BACKEND@|${DEFAULT_BACKEND}|g" \ + -e "s|@CLBLAST_PREFIX@|${CLBLAST_PREFIX}|g" \ + -e "s|@CLBLAS_PREFIX@|${CLBLAS_PREFIX}|g" \ + -e "s|@CUDA_HOME@|${CUDA_HOME}|g" \ + -e "s|@SDKPATH@|${SDKPATH}|g" \ + R/flags.R.in > R/flags.R + echo "Generated R/flags.R" +else + echo "ERROR: R/flags.R.in not found!" 
+ exit 1 +fi diff --git a/inst/include/bandicoot_bits/kernel_gen/array_util.hpp b/inst/include/bandicoot_bits/kernel_gen/array_util.hpp index d21e684..2efc1e1 100644 --- a/inst/include/bandicoot_bits/kernel_gen/array_util.hpp +++ b/inst/include/bandicoot_bits/kernel_gen/array_util.hpp @@ -75,7 +75,7 @@ struct has_len_member }; template -struct has_len_member +struct has_len_member { static const bool value = true; }; diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/c_defs.cu b/inst/include/bandicoot_bits/ks/cuda/defs/c_defs.cu similarity index 100% rename from inst/include/bandicoot_bits/ks/kernels/cuda/defs/c_defs.cu rename to inst/include/bandicoot_bits/ks/cuda/defs/c_defs.cu diff --git a/inst/include/bandicoot_bits/ks/cuda/defs/cuda_prelims.cu b/inst/include/bandicoot_bits/ks/cuda/defs/cuda_prelims.cu index bcab5e7..269dbc2 100644 --- a/inst/include/bandicoot_bits/ks/cuda/defs/cuda_prelims.cu +++ b/inst/include/bandicoot_bits/ks/cuda/defs/cuda_prelims.cu @@ -12,11 +12,16 @@ // limitations under the License. // ------------------------------------------------------------------------ +#include + // These statically-compiled definitions are available in any Bandicoot kernel. 
#define uchar unsigned char #define ushort unsigned short #define uint unsigned int +#define cx_float cuFloatComplex +#define cx_double cuDoubleComplex + #define COOT_FN2(ARG1, ARG2) ARG1 ## ARG2 #define COOT_FN(ARG1, ARG2) COOT_FN2(ARG1, ARG2) diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/z_defs.cu b/inst/include/bandicoot_bits/ks/cuda/defs/z_defs.cu similarity index 100% rename from inst/include/bandicoot_bits/ks/kernels/cuda/defs/z_defs.cu rename to inst/include/bandicoot_bits/ks/cuda/defs/z_defs.cu diff --git a/inst/include/bandicoot_bits/ks/cuda/oneway/fill.cu b/inst/include/bandicoot_bits/ks/cuda/oneway/fill.cu deleted file mode 100644 index 8dc5535..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/oneway/fill.cu +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,fill)(eT1* out, - const eT1 val, - const UWORD n_rows, - const UWORD n_cols, - const UWORD M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD index = col * M_n_rows + row; - - if(row < n_rows && col < n_cols) - { - out[index] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/oneway/fill_sve1.cu b/inst/include/bandicoot_bits/ks/cuda/oneway/fill_sve1.cu deleted file mode 100644 index ff9b2c5..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/oneway/fill_sve1.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,fill_sve1)(eT1* out, - const UWORD* out_locs, - const eT1 val, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - out[out_locs[i]] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/oneway/fill_sve2.cu b/inst/include/bandicoot_bits/ks/cuda/oneway/fill_sve2.cu deleted file mode 100644 index c01ebd6..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/oneway/fill_sve2.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,fill_sve2)(eT1* out, - const UWORD* in_row_locs, - const UWORD* in_col_locs, - const eT1 val, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD loc = ((in_row_locs == NULL) ? row : in_row_locs[row]) + - n_rows * ((in_col_locs == NULL) ? 
col : in_col_locs[col]); - - out[loc] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_atan2.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_atan2.cu deleted file mode 100644 index da0a001..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_atan2.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_atan2)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD A_index = row + col * src_A_M_n_rows; - const UWORD B_index = row + col * src_B_M_n_rows; - const UWORD dest_index = row + col * dest_M_n_rows; - - const fp_eT3 a_val = TO_FP_ET3(src_A[A_index]); - const fp_eT3 b_val = TO_FP_ET3(src_B[B_index]); - dest[dest_index] = TO_ET3(atan2(a_val, b_val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_div_array.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_div_array.cu deleted file mode 100644 index 
ca20bb9..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_div_array.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_div_array)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD A_index = row + col * src_A_M_n_rows; - const UWORD B_index = row + col * src_B_M_n_rows; - const UWORD dest_index = row + col * dest_M_n_rows; - - const threeway_promoted_eT a_val = TO_THREEWAY_PROMOTED_ET(src_A[A_index]); - const threeway_promoted_eT b_val = TO_THREEWAY_PROMOTED_ET(src_B[B_index]); - dest[dest_index] = TO_ET3(a_val / b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_div_array_cube.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_div_array_cube.cu deleted file mode 100644 index 09cf58b..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_div_array_cube.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2025 Ryan Curtin 
(http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_div_array_cube)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD A_index = row + col * src_A_M_n_rows + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD B_index = row + col * src_B_M_n_rows + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = TO_ET3(src_A[A_index] / src_B[B_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_hypot.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_hypot.cu deleted file mode 100644 index 7bf819f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_hypot.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2022 Ryan Curtin 
(http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_hypot)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD A_index = row + col * src_A_M_n_rows; - const UWORD B_index = row + col * src_B_M_n_rows; - const UWORD dest_index = row + col * dest_M_n_rows; - - const fp_eT3 a_val = TO_FP_ET3(src_A[A_index]); - const fp_eT3 b_val = TO_FP_ET3(src_B[B_index]); - dest[dest_index] = TO_ET3(hypot(a_val, b_val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_max_array.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_max_array.cu deleted file mode 100644 index d47ea77..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_max_array.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_max_array)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD A_index = row + col * src_A_M_n_rows; - const UWORD B_index = row + col * src_B_M_n_rows; - const UWORD dest_index = row + col * dest_M_n_rows; - - const threeway_promoted_eT a_val = TO_THREEWAY_PROMOTED_ET(src_A[A_index]); - const threeway_promoted_eT b_val = TO_THREEWAY_PROMOTED_ET(src_B[B_index]); - dest[dest_index] = TO_ET3(max(a_val, b_val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_min_array.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_min_array.cu deleted file mode 100644 index dc08dd8..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_min_array.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_min_array)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD A_index = row + col * src_A_M_n_rows; - const UWORD B_index = row + col * src_B_M_n_rows; - const UWORD dest_index = row + col * dest_M_n_rows; - - const threeway_promoted_eT a_val = TO_THREEWAY_PROMOTED_ET(src_A[A_index]); - const threeway_promoted_eT b_val = TO_THREEWAY_PROMOTED_ET(src_B[B_index]); - dest[dest_index] = TO_ET3(min(a_val, b_val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_minus_array.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_minus_array.cu deleted file mode 100644 index 5a16392..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_minus_array.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_array)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD A_index = row + col * src_A_M_n_rows; - const UWORD B_index = row + col * src_B_M_n_rows; - const UWORD dest_index = row + col * dest_M_n_rows; - - const threeway_promoted_eT a_val = TO_THREEWAY_PROMOTED_ET(src_A[A_index]); - const threeway_promoted_eT b_val = TO_THREEWAY_PROMOTED_ET(src_B[B_index]); - dest[dest_index] = TO_ET3(a_val - b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_minus_array_cube.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_minus_array_cube.cu deleted file mode 100644 index 6da345d..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_minus_array_cube.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_array_cube)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD A_index = row + col * src_A_M_n_rows + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD B_index = row + col * src_B_M_n_rows + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = TO_ET3(src_A[A_index] - src_B[B_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_mul_array.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_mul_array.cu deleted file mode 100644 index f34c181..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_mul_array.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the 
License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_mul_array)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD A_index = row + col * src_A_M_n_rows; - const UWORD B_index = row + col * src_B_M_n_rows; - const UWORD dest_index = row + col * dest_M_n_rows; - - const threeway_promoted_eT a_val = TO_THREEWAY_PROMOTED_ET(src_A[A_index]); - const threeway_promoted_eT b_val = TO_THREEWAY_PROMOTED_ET(src_B[B_index]); - dest[dest_index] = TO_ET3(a_val * b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_mul_array_cube.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_mul_array_cube.cu deleted file mode 100644 index 2caf2a4..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_mul_array_cube.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_mul_array_cube)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD A_index = row + col * src_A_M_n_rows + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD B_index = row + col * src_B_M_n_rows + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = TO_ET3(src_A[A_index] * src_B[B_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_plus_array.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_plus_array.cu deleted file mode 100644 index ed1fc02..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_plus_array.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the 
License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_plus_array)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD A_index = row + col * src_A_M_n_rows; - const UWORD B_index = row + col * src_B_M_n_rows; - const UWORD dest_index = row + col * dest_M_n_rows; - - const threeway_promoted_eT a_val = TO_THREEWAY_PROMOTED_ET(src_A[A_index]); - const threeway_promoted_eT b_val = TO_THREEWAY_PROMOTED_ET(src_B[B_index]); - dest[dest_index] = TO_ET3(a_val + b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_plus_array_cube.cu b/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_plus_array_cube.cu deleted file mode 100644 index 3709c1f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/threeway/equ_array_plus_array_cube.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_plus_array_cube)(eT3* dest, - const eT1* src_A, - const eT2* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD A_index = row + col * src_A_M_n_rows + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD B_index = row + col * src_B_M_n_rows + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = TO_ET3(src_A[A_index] + src_B[B_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/convert_type.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/convert_type.cu deleted file mode 100644 index 38fa2c7..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/convert_type.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,convert_type)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD src_index = row + col * src_M_n_rows; - const UWORD dest_index = row + col * dest_M_n_rows; - - const eT1 in_val = src[src_index]; - dest[dest_index] = TO_ET2(in_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/convert_type_cube.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/convert_type_cube.cu deleted file mode 100644 index 823cec6..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/convert_type_cube.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,convert_type_cube)(eT2* dest, - const eT2* /* src_A */, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD /* src_A_M_n_rows */, - const UWORD /* src_A_M_n_cols */, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - const eT1 in_val = src[src_index]; - dest[dest_index] = TO_ET2(in_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_abs.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_abs.cu deleted file mode 100644 index 0e5c5ed..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_abs.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2021-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_abs)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(ET1_ABS(src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acos_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acos_post.cu deleted file mode 100644 index 5e243f0..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acos_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_acos_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(acos(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acos_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acos_pre.cu deleted file mode 100644 index e87b09f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acos_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_acos_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(acos(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acosh_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acosh_post.cu deleted file mode 100644 index 8dd3e67..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acosh_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_acosh_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(acosh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acosh_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acosh_pre.cu deleted file mode 100644 index fc3fe8b..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_acosh_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_acosh_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(acosh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asin_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asin_post.cu deleted file mode 100644 index 552b6b5..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asin_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_asin_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(asin(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asin_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asin_pre.cu deleted file mode 100644 index e9b3d00..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asin_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_asin_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(asin(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asinh_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asinh_post.cu deleted file mode 100644 index 42f88bd..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asinh_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_asinh_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(asinh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asinh_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asinh_pre.cu deleted file mode 100644 index a50b220..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_asinh_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_asinh_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(asinh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atan_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atan_post.cu deleted file mode 100644 index b6fddb9..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atan_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_atan_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(atan(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atan_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atan_pre.cu deleted file mode 100644 index 12157c2..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atan_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_atan_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(atan(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atanh_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atanh_post.cu deleted file mode 100644 index 2e9247a..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atanh_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_atanh_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(atanh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atanh_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atanh_pre.cu deleted file mode 100644 index de8247f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_atanh_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_atanh_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(atanh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_ceil_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_ceil_post.cu deleted file mode 100644 index 7c54467..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_ceil_post.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_ceil_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - if (coot_is_fp(val)) - { - dest[dest_index] = TO_ET2(ceil(TO_FP_ET1(val))); - } - else - { - dest[dest_index] = TO_ET2(val); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_ceil_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_ceil_pre.cu deleted file mode 100644 index 3c7948d..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_ceil_pre.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_ceil_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = TO_ET2(src[src_index]); - if (coot_is_fp(val)) - { - dest[dest_index] = ceil(TO_FP_ET2(val)); - } - else - { - dest[dest_index] = val; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cos_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cos_post.cu deleted file mode 100644 index fe72a17..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cos_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_cos_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(cos(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cos_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cos_pre.cu deleted file mode 100644 index 8de59ee..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cos_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_cos_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(cos(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cosh_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cosh_post.cu deleted file mode 100644 index debd529..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cosh_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_cosh_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(cosh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cosh_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cosh_pre.cu deleted file mode 100644 index 38cc5f3..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_cosh_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_cosh_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(cosh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post.cu deleted file mode 100644 index e155adf..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_div_scalar_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(src[src_index] / val_pre) / val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post_sve1.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post_sve1.cu deleted file mode 100644 index 1dec0d9..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post_sve1.cu +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_div_scalar_post_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] = TO_ET2(src[src_locs[i]] / val_pre) / val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post_sve2.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post_sve2.cu deleted file mode 100644 index 82931ae..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_post_sve2.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_div_scalar_post_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col]); - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - - dest[dest_loc] = TO_ET2(src[src_loc] / val_pre) / val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre.cu deleted file mode 100644 index d17e94b..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre.cu +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_div_scalar_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - // if both are 0, we take it as val_pre == 0 and val_post unused - if (val_post == TO_ET2(0)) - { - dest[dest_index] = TO_ET2(val_pre / src[src_index]); - } - else if (val_pre == TO_ET1(0) && val_post != TO_ET2(0)) - { - dest[dest_index] = val_post / (TO_ET2(src[src_index])); - } - else - { - // if both are nonzero, we apply sequentially----be careful! - dest[dest_index] = val_post / TO_ET2(val_pre / src[src_index]); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre_sve1.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre_sve1.cu deleted file mode 100644 index 1d345e8..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre_sve1.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_div_scalar_pre_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - // if both are 0, we take it as val_pre == 0 and val_post unused - if (val_post == TO_ET2(0)) - { - dest[dest_locs[i]] = TO_ET2(val_pre / src[src_locs[i]]); - } - else if (val_pre == TO_ET1(0) && val_post != TO_ET2(0)) - { - dest[dest_locs[i]] = val_post / (TO_ET2(src[src_locs[i]])); - } - else - { - // if both are nonzero, we apply sequentially----be careful! - dest[dest_locs[i]] = val_post / TO_ET2(val_pre / src[src_locs[i]]); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre_sve2.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre_sve2.cu deleted file mode 100644 index ee53258..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_div_scalar_pre_sve2.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_div_scalar_pre_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col]); - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - - // if both are 0, we take it as val_pre == 0 and val_post unused - if (val_post == TO_ET2(0)) - { - dest[dest_loc] = TO_ET2(val_pre / src[src_loc]); - } - else if (val_pre == TO_ET1(0) && val_post != TO_ET2(0)) - { - dest[dest_loc] = val_post / (TO_ET2(src[src_loc])); - } - else - { - // if both are nonzero, we apply sequentially----be careful! 
- dest[dest_loc] = val_post / TO_ET2(val_pre / src[src_loc]); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erf_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erf_post.cu deleted file mode 100644 index 2439868..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erf_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_erf_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - dest[dest_index] = TO_ET2(erf(TO_FP_ET1(val))); - } - } diff --git 
a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erf_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erf_pre.cu deleted file mode 100644 index 4257781..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erf_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_erf_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = TO_ET2(src[src_index]); - dest[dest_index] = TO_ET2(erf(TO_FP_ET2(val))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erfc_post.cu 
b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erfc_post.cu deleted file mode 100644 index 302c70f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erfc_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_erfc_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - dest[dest_index] = TO_ET2(erfc(TO_FP_ET1(val))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erfc_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erfc_pre.cu deleted file mode 100644 index 44d4ce9..0000000 --- 
a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_erfc_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_erfc_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = TO_ET2(src[src_index]); - dest[dest_index] = TO_ET2(erfc(TO_FP_ET2(val))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp10_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp10_post.cu deleted file mode 100644 index 8e74e87..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp10_post.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 
2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_exp10_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(TO_ET1(exp10(TO_FP_ET1(src[src_index])))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp10_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp10_pre.cu deleted file mode 100644 index ca1f614..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp10_pre.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use 
this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_exp10_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(exp10(TO_FP_ET2(src[src_index]))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp2_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp2_post.cu deleted file mode 100644 index b2c5926..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp2_post.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_exp2_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(TO_ET1(exp2(TO_FP_ET1(src[src_index])))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp2_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp2_pre.cu deleted file mode 100644 index eb98bf8..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp2_pre.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_exp2_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(exp2(TO_FP_ET2(src[src_index]))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp_post.cu deleted file mode 100644 index c34faa2..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp_post.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_exp_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(TO_ET1(exp(TO_FP_ET1(src[src_index])))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp_pre.cu deleted file mode 100644 index 9d4a38b..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_exp_pre.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_exp_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(exp(TO_FP_ET2(src[src_index]))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_floor_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_floor_post.cu deleted file mode 100644 index 197779d..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_floor_post.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_floor_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - if (coot_is_fp(val)) - { - dest[dest_index] = TO_ET2(floor(TO_FP_ET1(val))); - } - else - { - dest[dest_index] = TO_ET2(val); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_floor_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_floor_pre.cu deleted file mode 100644 index 2cc616d..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_floor_pre.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_floor_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = TO_ET2(src[src_index]); - if (coot_is_fp(val)) - { - dest[dest_index] = floor(TO_FP_ET2(val)); - } - else - { - dest[dest_index] = val; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_lgamma_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_lgamma_post.cu deleted file mode 100644 index 78b0896..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_lgamma_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_lgamma_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - dest[dest_index] = TO_ET2(lgamma(TO_FP_ET1(val))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_lgamma_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_lgamma_pre.cu deleted file mode 100644 index d7f9466..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_lgamma_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_lgamma_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = TO_ET2(src[src_index]); - dest[dest_index] = TO_ET2(lgamma(TO_FP_ET2(val))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log10_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log10_post.cu deleted file mode 100644 index 96cc43b..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log10_post.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_log10_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(TO_ET1(log10(TO_FP_ET1(src[src_index])))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log10_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log10_pre.cu deleted file mode 100644 index 93e0f9b..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log10_pre.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_log10_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(log10(TO_FP_ET2(src[src_index]))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log2_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log2_post.cu deleted file mode 100644 index bddb666..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log2_post.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_log2_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(TO_ET1(log2(TO_FP_ET1(src[src_index])))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log2_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log2_pre.cu deleted file mode 100644 index d374f14..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log2_pre.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_log2_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(log2(TO_FP_ET2(src[src_index]))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log_post.cu deleted file mode 100644 index cfcbbdb..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log_post.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_log_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(TO_ET1(log(TO_FP_ET1(src[src_index])))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log_pre.cu deleted file mode 100644 index e4b5ac5..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_log_pre.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_log_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(log(TO_FP_ET2(src[src_index]))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_max_array_cube.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_max_array_cube.cu deleted file mode 100644 index aa03d2a..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_max_array_cube.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_max_array_cube)(eT2* dest, - const eT2* src_A, - const eT1* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD A_index = row + col * src_A_M_n_rows + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD B_index = row + col * src_B_M_n_rows + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = max(src_A[A_index], (TO_ET2(src_B[B_index]))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_min_array_cube.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_min_array_cube.cu deleted file mode 100644 index 20d5383..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_min_array_cube.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance 
with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_min_array_cube)(eT2* dest, - const eT2* src_A, - const eT1* src_B, - // logical size of source and destination - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD A_index = row + col * src_A_M_n_rows + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD B_index = row + col * src_B_M_n_rows + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = min(src_A[A_index], (TO_ET2(src_B[B_index]))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post.cu deleted file mode 100644 index a254b3d..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use 
this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_scalar_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(src[src_index] - val_pre) - val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post_sve1.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post_sve1.cu deleted file mode 100644 index e99ed79..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post_sve1.cu +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_scalar_post_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] = TO_ET2(src[src_locs[i]] - val_pre) - val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post_sve2.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post_sve2.cu deleted file mode 100644 index 0a04d7f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_post_sve2.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_scalar_post_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col]); - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - - dest[dest_loc] = TO_ET2(src[src_loc] - val_pre) - val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post.cu deleted file mode 100644 index 5c03699..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) val_post; - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(val_pre - src[src_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post_sve1.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post_sve1.cu deleted file mode 100644 index 66b3897..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post_sve1.cu +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_post_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - (void) val_post; - - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] = TO_ET2(val_pre - src[src_locs[i]]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post_sve2.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post_sve2.cu deleted file mode 100644 index 7f2f5df..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_post_sve2.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_post_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col]); - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - - dest[dest_loc] = TO_ET2(val_pre - src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre.cu deleted file mode 100644 index b5ae1eb..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) val_pre; - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = val_post - (TO_ET2(src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre_sve1.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre_sve1.cu deleted file mode 100644 index 85a6cea..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre_sve1.cu +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_pre_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - (void) val_pre; - - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] = val_post - (TO_ET2(src[src_locs[i]])); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre_sve2.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre_sve2.cu deleted file mode 100644 index 8cd4772..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_minus_scalar_pre_pre_sve2.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_pre_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col]); - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - - dest[dest_loc] = val_post - (TO_ET2(src[src_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mod_scalar.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mod_scalar.cu deleted file mode 100644 index a90104d..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mod_scalar.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_mod_scalar)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - // For an integer type, the casts end up doing nothing. - uint_eT1 val = TO_UINT_ET1(src[src_index]) % TO_UINT_ET1(val_pre); - dest[dest_index] = TO_ET2((TO_UINT_ET2(val)) % TO_UINT_ET2(val_post)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar.cu deleted file mode 100644 index b868880..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_mul_scalar)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(src[src_index] * val_pre) * val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar_sve1.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar_sve1.cu deleted file mode 100644 index 15f1f43..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar_sve1.cu +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_mul_scalar_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] = TO_ET2(src[src_locs[i]] * val_pre) * val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar_sve2.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar_sve2.cu deleted file mode 100644 index 60060ff..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_mul_scalar_sve2.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_mul_scalar_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col]); - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - - dest[dest_loc] = TO_ET2(src[src_loc] * val_pre) * val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_neg_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_neg_post.cu deleted file mode 100644 index 95c2ad5..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_neg_post.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_neg_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(-src[src_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_neg_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_neg_pre.cu deleted file mode 100644 index edb8447..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_neg_pre.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_neg_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = -(TO_ET2(src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar.cu deleted file mode 100644 index bcac46d..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_plus_scalar)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(src[src_index] + val_pre) + val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar_sve1.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar_sve1.cu deleted file mode 100644 index 8c83f5d..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar_sve1.cu +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_plus_scalar_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] = TO_ET2(src[src_locs[i]] + val_pre) + val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar_sve2.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar_sve2.cu deleted file mode 100644 index 8c0a37d..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_plus_scalar_sve2.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_plus_scalar_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col]); - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - - dest[dest_loc] = TO_ET2(src[src_loc] + val_pre) + val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_pow_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_pow_post.cu deleted file mode 100644 index 05e841b..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_pow_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_pow_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(pow(val, TO_FP_ET1(val_pre))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_pow_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_pow_pre.cu deleted file mode 100644 index f33f4a3..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_pow_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_pow_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(pow(val, TO_FP_ET2(val_post))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_round_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_round_post.cu deleted file mode 100644 index b88361c..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_round_post.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_round_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - if (coot_is_fp(val)) - { - dest[dest_index] = TO_ET2(round(TO_FP_ET1(val))); - } - else - { - dest[dest_index] = TO_ET2(val); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_round_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_round_pre.cu deleted file mode 100644 index fb54509..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_round_pre.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_round_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = TO_ET2(src[src_index]); - if (coot_is_fp(val)) - { - dest[dest_index] = round(TO_FP_ET2(val)); - } - else - { - dest[dest_index] = val; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sign_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sign_post.cu deleted file mode 100644 index 438e23f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sign_post.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_sign_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - if (val > TO_ET1(0)) - { - dest[dest_index] = TO_ET2(1); - } - else if (val == TO_ET1(0)) - { - dest[dest_index] = TO_ET2(0); - } - else - { - dest[dest_index] = TO_ET2(-1); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sign_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sign_pre.cu deleted file mode 100644 index 4d43198..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sign_pre.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_sign_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = TO_ET2(src[src_index]); - if (val > TO_ET2(0)) - { - dest[dest_index] = TO_ET2(1); - } - else if (val == TO_ET2(0)) - { - dest[dest_index] = TO_ET2(0); - } - else - { - dest[dest_index] = TO_ET2(-1); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sin_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sin_post.cu deleted file mode 100644 index 8a1668f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sin_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_sin_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(sin(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sin_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sin_pre.cu deleted file mode 100644 index ae9bf58..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sin_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_sin_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(sin(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinc_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinc_post.cu deleted file mode 100644 index 1f74f05..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinc_post.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_sinc_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - // To imitate Armadillo correctly, we use double if the type is not floating point. - if (coot_is_fp(val)) - { - const fp_eT1 tmp = val * COOT_PI; - dest[dest_index] = (tmp == TO_ET1(0)) ? TO_ET2(1) : TO_ET2(sin(tmp) / tmp); - } - else - { - const double fp_val = (double) val; - const double tmp = fp_val * COOT_PI; - dest[dest_index] = (tmp == 0) ? 
TO_ET2(1) : TO_ET2(sin(tmp) / tmp); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinc_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinc_pre.cu deleted file mode 100644 index 340482c..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinc_pre.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_sinc_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = TO_ET2(src[src_index]); - // To imitate Armadillo correctly, we use double if the type is not floating point. 
- if (coot_is_fp(val)) - { - const fp_eT2 tmp = val * COOT_PI; - dest[dest_index] = (tmp == TO_ET2(0)) ? TO_ET2(1) : TO_ET2(sin(tmp) / tmp); - } - else - { - const double fp_val = (double) val; - const double tmp = fp_val * COOT_PI; - dest[dest_index] = (tmp == 0) ? TO_ET2(1) : TO_ET2(sin(tmp) / tmp); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinh_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinh_post.cu deleted file mode 100644 index 00ecefc..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinh_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_sinh_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(sinh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinh_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinh_pre.cu deleted file mode 100644 index dc343a3..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sinh_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_sinh_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(sinh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sqrt_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sqrt_post.cu deleted file mode 100644 index f684434..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sqrt_post.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_sqrt_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(TO_ET1(sqrt(TO_FP_ET1(src[src_index])))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sqrt_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sqrt_pre.cu deleted file mode 100644 index a198b01..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_sqrt_pre.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_sqrt_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(sqrt(TO_FP_ET2(TO_ET2(src[src_index])))); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_square_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_square_post.cu deleted file mode 100644 index a555c9e..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_square_post.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_square_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = TO_ET2(src[src_index] * src[src_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_square_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_square_pre.cu deleted file mode 100644 index d67c11f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_square_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2019-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_square_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = TO_ET2(src[src_index]); - dest[dest_index] = val * val; - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tan_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tan_post.cu deleted file mode 100644 index 6b82e6f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tan_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_tan_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(tan(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tan_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tan_pre.cu deleted file mode 100644 index 8156316..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tan_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_tan_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(tan(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tanh_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tanh_post.cu deleted file mode 100644 index 281b782..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tanh_post.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_tanh_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = TO_FP_ET1(src[src_index]); - dest[dest_index] = TO_ET2(tanh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tanh_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tanh_pre.cu deleted file mode 100644 index ed162b3..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_tanh_pre.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_tanh_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = TO_FP_ET2(TO_ET2(src[src_index])); - dest[dest_index] = TO_ET2(tanh(val)); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_exp_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_exp_post.cu deleted file mode 100644 index 142bf2d..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_exp_post.cu +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_trunc_exp_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - // To imitate Armadillo's behavior exactly, if the type is not floating-point, we convert to double. - const eT1 val = src[src_index]; - if (coot_is_fp(val)) - { - const fp_eT1 fp_val = TO_FP_ET1(val); - if (fp_val >= log(coot_type_max(TO_FP_ET1(0)))) - { - dest[dest_index] = TO_ET2(TO_ET1(coot_type_max(TO_FP_ET1(0)))); - } - else - { - dest[dest_index] = TO_ET2(TO_ET1(exp(fp_val))); - } - } - else - { - const double fp_val = (double) val; - if (fp_val >= log(DBL_MAX)) - { - dest[dest_index] = TO_ET2(TO_ET1(DBL_MAX)); - } - else - { - dest[dest_index] = TO_ET2(TO_ET1(exp(fp_val))); - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_exp_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_exp_pre.cu deleted file mode 100644 index 7754fed..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_exp_pre.cu +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_trunc_exp_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - // To imitate Armadillo's behavior exactly, if the type is not floating-point, we convert to double. 
- const eT2 val = TO_ET2(src[src_index]); - if (coot_is_fp(val)) - { - const fp_eT2 fp_val = TO_FP_ET2(val); - if (fp_val >= log(coot_type_max(TO_FP_ET2(0)))) - { - dest[dest_index] = TO_ET2(coot_type_max(TO_FP_ET2(0))); - } - else - { - dest[dest_index] = TO_ET2(exp(fp_val)); - } - } - else - { - const double fp_val = (double) val; - if (fp_val >= log(DBL_MAX)) - { - dest[dest_index] = TO_ET2(DBL_MAX); - } - else - { - dest[dest_index] = TO_ET2(exp(fp_val)); - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_log_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_log_post.cu deleted file mode 100644 index 92e2d76..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_log_post.cu +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_trunc_log_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - // To match Armadillo, we always use `double` as the intermediate type for any non-floating point type. - const eT1 val = src[src_index]; - if (coot_is_fp(val)) - { - const fp_eT1 fp_val = TO_FP_ET1(val); - if (fp_val <= TO_FP_ET1(0)) - { - dest[dest_index] = TO_ET2(log(coot_type_minpos(TO_FP_ET1(0)))); - } - else if (coot_isinf(fp_val)) - { - dest[dest_index] = TO_ET2(log(coot_type_max(TO_FP_ET1(0)))); - } - else - { - dest[dest_index] = TO_ET2(TO_ET1(log(fp_val))); - } - } - else - { - const double fp_val = (double) val; - if (fp_val <= TO_FP_ET1(0)) - { - dest[dest_index] = TO_ET2(log(DBL_MIN)); - } - else if (isinf(fp_val)) - { - dest[dest_index] = TO_ET2(log(DBL_MAX)); - } - else - { - dest[dest_index] = TO_ET2(TO_ET1(log(fp_val))); - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_log_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_log_pre.cu deleted file mode 100644 index 305ba01..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_log_pre.cu +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin 
(http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_trunc_log_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - // To match Armadillo, we always use `double` as the intermediate type for any non-floating point type. 
- const eT2 val = TO_ET2(src[src_index]); - if (coot_is_fp(val)) - { - const fp_eT2 fp_val = TO_FP_ET2(val); - if (fp_val <= TO_FP_ET2(0)) - { - dest[dest_index] = TO_ET2(log(coot_type_minpos(TO_FP_ET2(0)))); - } - else if (coot_isinf(fp_val)) - { - dest[dest_index] = TO_ET2(log(coot_type_max(TO_FP_ET2(0)))); - } - else - { - dest[dest_index] = TO_ET2(log(fp_val)); - } - } - else - { - const double fp_val = (double) val; - if (fp_val <= (double) 0) - { - dest[dest_index] = TO_ET2(log(DBL_MIN)); - } - else if (isinf(fp_val)) - { - dest[dest_index] = TO_ET2(log(DBL_MAX)); - } - else - { - dest[dest_index] = TO_ET2(log(fp_val)); - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_post.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_post.cu deleted file mode 100644 index 645333f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_post.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_trunc_post)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - if (coot_is_fp(val)) - { - dest[dest_index] = TO_ET2(trunc(TO_FP_ET1(val))); - } - else - { - dest[dest_index] = TO_ET2(val); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_pre.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_pre.cu deleted file mode 100644 index 85da488..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/equ_array_trunc_pre.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,equ_array_trunc_pre)(eT2* dest, - const eT1* src, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) (val_pre); - (void) (val_post); - - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = TO_ET2(src[src_index]); - if (coot_is_fp(val)) - { - dest[dest_index] = trunc(TO_FP_ET2(val)); - } - else - { - dest[dest_index] = val; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/extract_sve1.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/extract_sve1.cu deleted file mode 100644 index 4522d16..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/extract_sve1.cu +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,extract_sve1)(eT2* out_mem, - const eT1* in_mem, - const UWORD* in_locs, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - out_mem[i] = TO_ET2(in_mem[in_locs[i]]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/extract_sve2.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/extract_sve2.cu deleted file mode 100644 index 7e40c22..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/extract_sve2.cu +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,extract_sve2)(eT2* out_mem, - const eT1* in_mem, - const UWORD* in_row_locs, - const UWORD* in_col_locs, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD out_n_rows, - const UWORD in_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD in_loc = ((in_row_locs == NULL) ? 
row : in_row_locs[row]) + - in_n_rows * ((in_col_locs == NULL) ? col : in_col_locs[col]); - - out_mem[row + out_n_rows * col] = TO_ET2(in_mem[in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_and_array.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_and_array.cu deleted file mode 100644 index 5661582..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_and_array.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_and_array)(UWORD* out, - const eT1* X, - const eT2* Y, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val1 = X[i]; - const eT2 val2 = Y[i]; - out[i] = (val1 && val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_eq_array.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_eq_array.cu deleted file mode 100644 index 0f2008a..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_eq_array.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_eq_array)(UWORD* out, - const eT1* X, - const eT2* Y, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val1 = X[i]; - const eT2 val2 = Y[i]; - out[i] = (val1 == val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_eq_scalar.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_eq_scalar.cu deleted file mode 100644 index eb20434..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_eq_scalar.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_eq_scalar)(UWORD* out, - const eT1* X, // will be casted to eT2 before comparison - const UWORD n_elem, - const eT2 val) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT2 val1 = TO_ET2(X[i]); - out[i] = (val1 == val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gt_array.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gt_array.cu deleted file mode 100644 index 079aefa..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gt_array.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_gt_array)(UWORD* out, - const eT1* X, - const eT2* Y, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val1 = X[i]; - const eT2 val2 = Y[i]; - out[i] = (val1 > val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gt_scalar.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gt_scalar.cu deleted file mode 100644 index 9f2b065..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gt_scalar.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_gt_scalar)(UWORD* out, - const eT1* X, // will be casted to eT2 before comparison - const UWORD n_elem, - const eT2 val) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT2 val1 = TO_ET2(X[i]); - out[i] = (val1 > val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gteq_array.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gteq_array.cu deleted file mode 100644 index 1b06a32..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gteq_array.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_gteq_array)(UWORD* out, - const eT1* X, - const eT2* Y, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val1 = X[i]; - const eT2 val2 = Y[i]; - out[i] = (val1 >= val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gteq_scalar.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gteq_scalar.cu deleted file mode 100644 index f1f1b5c..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_gteq_scalar.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_gteq_scalar)(UWORD* out, - const eT1* X, // will be casted to eT2 before comparison - const UWORD n_elem, - const eT2 val) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT2 val1 = TO_ET2(X[i]); - out[i] = (val1 >= val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lt_array.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lt_array.cu deleted file mode 100644 index fd184a2..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lt_array.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_lt_array)(UWORD* out, - const eT1* X, - const eT2* Y, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val1 = X[i]; - const eT2 val2 = Y[i]; - out[i] = (val1 < val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lt_scalar.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lt_scalar.cu deleted file mode 100644 index e4d143f..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lt_scalar.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_lt_scalar)(UWORD* out, - const eT1* X, // will be casted to eT2 before comparison - const UWORD n_elem, - const eT2 val) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT2 val1 = TO_ET2(X[i]); - out[i] = (val1 < val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lteq_array.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lteq_array.cu deleted file mode 100644 index a1858ef..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lteq_array.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_lteq_array)(UWORD* out, - const eT1* X, - const eT2* Y, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val1 = X[i]; - const eT2 val2 = Y[i]; - out[i] = (val1 <= val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lteq_scalar.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lteq_scalar.cu deleted file mode 100644 index f713d26..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_lteq_scalar.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_lteq_scalar)(UWORD* out, - const eT1* X, // will be casted to eT2 before comparison - const UWORD n_elem, - const eT2 val) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT2 val1 = TO_ET2(X[i]); - out[i] = (val1 <= val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_neq_array.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_neq_array.cu deleted file mode 100644 index 6c80404..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_neq_array.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_neq_array)(UWORD* out, - const eT1* X, - const eT2* Y, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val1 = X[i]; - const eT2 val2 = Y[i]; - out[i] = (val1 != val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_neq_scalar.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_neq_scalar.cu deleted file mode 100644 index 49193cb..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_neq_scalar.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_neq_scalar)(UWORD* out, - const eT1* X, // will be casted to eT2 before comparison - const UWORD n_elem, - const eT2 val) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT2 val1 = TO_ET2(X[i]); - out[i] = (val1 != val); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_or_array.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/rel_or_array.cu deleted file mode 100644 index 9be1efd..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/rel_or_array.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_or_array)(UWORD* out, - const eT1* X, - const eT2* Y, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val1 = X[i]; - const eT2 val2 = Y[i]; - out[i] = (val1 || val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/cuda/twoway/replace.cu b/inst/include/bandicoot_bits/ks/cuda/twoway/replace.cu deleted file mode 100644 index deb438e..0000000 --- a/inst/include/bandicoot_bits/ks/cuda/twoway/replace.cu +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,replace)(eT2* dest, - eT1* src, - const eT1 val_find, - const eT1 val_replace, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD slice = blockIdx.z * blockDim.z + threadIdx.z; - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - if (coot_isnan(val_find)) - { - // We are searching for a NaN so the check is a little different. - dest[dest_index] = TO_ET2((coot_isnan(val)) ? val_replace : val); - } - else - { - // No special handling needed. - dest[dest_index] = TO_ET2((val == val_find) ? val_replace : val); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/cuda_prelims.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/cuda_prelims.cu deleted file mode 100644 index 269dbc2..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/cuda_prelims.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -#include - -// These statically-compiled definitions are available in any Bandicoot kernel. -#define uchar unsigned char -#define ushort unsigned short -#define uint unsigned int - -#define cx_float cuFloatComplex -#define cx_double cuDoubleComplex - -#define COOT_FN2(ARG1, ARG2) ARG1 ## ARG2 -#define COOT_FN(ARG1, ARG2) COOT_FN2(ARG1, ARG2) - -#define UWORD size_t - -// For older CUDA toolkit versions, we must manually make FP16 limit macros -// available. -#if CUDA_VERSION < 12020 - #define CUDART_INF_FP16 __ushort_as_half((unsigned short)0x7C00U) - #define CUDART_NAN_FP16 __ushort_as_half((unsigned short)0x7FFFU) - #define CUDART_MIN_DENORM_FP16 __ushort_as_half((unsigned short)0x0001U) - #define CUDART_MAX_NORMAL_FP16 __ushort_as_half((unsigned short)0x7BFFU) - #define CUDART_NEG_ZERO_FP16 __ushort_as_half((unsigned short)0x8000U) - #define CUDART_ZERO_FP16 __ushort_as_half((unsigned short)0x0000U) - #define CUDART_ONE_FP16 __ushort_as_half((unsigned short)0x3C00U) -#endif - -extern __shared__ char aux_shared_mem[]; // this may be used in some kernels diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/d_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/d_defs.cu deleted file mode 100644 index 5b4776e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/d_defs.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Utility functions for double elements. -__device__ inline bool coot_is_fp(const double) { return true; } -__device__ inline bool coot_is_signed(const double) { return true; } -__device__ inline double coot_type_min(const double) { return -DBL_MAX; } -__device__ inline double coot_type_minpos(const double) { return DBL_MIN; } -__device__ inline double coot_type_max(const double) { return DBL_MAX; } -__device__ inline bool coot_isnan(const double x) { return isnan(x); } -__device__ inline bool coot_isinf(const double x) { return isinf(x); } -__device__ inline bool coot_isfinite(const double x) { return isfinite(x); } - -// Conversion functions for double elements. 
-__device__ inline double coot_to_double(const uchar& x) { return (double) x; } -__device__ inline double coot_to_double(const char& x) { return (double) x; } -__device__ inline double coot_to_double(const ushort& x) { return (double) x; } -__device__ inline double coot_to_double(const short& x) { return (double) x; } -__device__ inline double coot_to_double(const uint& x) { return (double) x; } -__device__ inline double coot_to_double(const int& x) { return (double) x; } -__device__ inline double coot_to_double(const size_t& x) { return (double) x; } -__device__ inline double coot_to_double(const long& x) { return (double) x; } -#if defined(COOT_HAVE_FP16) -__device__ inline double coot_to_double(const __half& x) { return (double) __half2float(x); } -#endif -__device__ inline double coot_to_double(const float& x) { return (double) x; } -__device__ inline double coot_to_double(const double& x) { return (double) x; } - -// Utility mathematical functions. -__device__ inline double coot_absdiff(const double x, const double y) { return fabs(x - y); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/f_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/f_defs.cu deleted file mode 100644 index 73526e7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/f_defs.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// Utility functions for float elements. -__device__ inline bool coot_is_fp(const float) { return true; } -__device__ inline bool coot_is_signed(const float) { return true; } -__device__ inline float coot_type_min(const float) { return -FLT_MAX; } -__device__ inline float coot_type_minpos(const float) { return FLT_MIN; } -__device__ inline float coot_type_max(const float) { return FLT_MAX; } -__device__ inline bool coot_isnan(const float x) { return isnan(x); } -__device__ inline bool coot_isinf(const float x) { return isinf(x); } -__device__ inline bool coot_isfinite(const float x) { return isfinite(x); } - -// Conversion functions for float elements. -__device__ inline float coot_to_float(const uchar& x) { return (float) x; } -__device__ inline float coot_to_float(const char& x) { return (float) x; } -__device__ inline float coot_to_float(const ushort& x) { return (float) x; } -__device__ inline float coot_to_float(const short& x) { return (float) x; } -__device__ inline float coot_to_float(const uint& x) { return (float) x; } -__device__ inline float coot_to_float(const int& x) { return (float) x; } -__device__ inline float coot_to_float(const size_t& x) { return (float) x; } -__device__ inline float coot_to_float(const long& x) { return (float) x; } -#if defined(COOT_HAVE_FP16) -__device__ inline float coot_to_float(const __half& x) { return __half2float(x); } -#endif -__device__ inline float coot_to_float(const float& x) { return (float) x; } -__device__ inline float coot_to_float(const double& x) { return (float) x; } - -// Utility mathematical functions. 
-__device__ inline float coot_absdiff(const float x, const float y) { return fabs(x - y); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/h_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/h_defs.cu deleted file mode 100644 index e3b1ae9..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/h_defs.cu +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Utility functions for fp16 elements. -__device__ inline bool coot_is_fp(const __half) { return true; } -__device__ inline bool coot_is_signed(const __half) { return true; } -__device__ inline __half coot_type_min(const __half) { return -HALF_MAX; } -__device__ inline __half coot_type_minpos(const __half) { return HALF_MIN; } -__device__ inline __half coot_type_max(const __half) { return HALF_MAX; } -__device__ inline bool coot_isnan(const __half x) { return __hisnan(x); } -__device__ inline bool coot_isinf(const __half x) { return __hisinf(x); } -__device__ inline bool coot_isfinite(const __half x) { return !__hisnan(x) && !__hisinf(x); } - -// Conversion functions for fp16 elements. 
-#if CUDA_VERSION < 12020 -__device__ inline __half coot_to___half(const uchar& x) { return (__half) ((ushort) x); } -__device__ inline __half coot_to___half(const char& x) { return (__half) ((short) x); } -#else -__device__ inline __half coot_to___half(const uchar& x) { return (__half) x; } -__device__ inline __half coot_to___half(const char& x) { return (__half) x; } -#endif -__device__ inline __half coot_to___half(const ushort& x) { return (__half) x; } -__device__ inline __half coot_to___half(const short& x) { return (__half) x; } -__device__ inline __half coot_to___half(const uint& x) { return (__half) x; } -__device__ inline __half coot_to___half(const int& x) { return (__half) x; } -#if CUDA_VERSION < 12020 -__device__ inline __half coot_to___half(const size_t& x) { return (__half) ((unsigned long long) x); } -__device__ inline __half coot_to___half(const long& x) { return (__half) ((long long) x); } -#else -__device__ inline __half coot_to___half(const size_t& x) { return (__half) x; } -__device__ inline __half coot_to___half(const long& x) { return (__half) x; } -#endif -__device__ inline __half coot_to___half(const __half& x) { return (__half) x; } -__device__ inline __half coot_to___half(const float& x) { return __float2half(x); } -__device__ inline __half coot_to___half(const double& x) { return __float2half((double) x); } - -// CUDA FP16 support does not include some arithmetic operators that we need for volatile elements so we add them ourselves... 
-#if CUDA_VERSION < 12040 -__device__ inline volatile __half& operator+=(volatile __half& a, const volatile __half& b) { a = __hadd((__half) a, (__half) b); return a; } -__device__ inline volatile __half& operator-=(volatile __half& a, const volatile __half& b) { a = __hsub((__half) a, (__half) b); return a; } -__device__ inline volatile __half& operator*=(volatile __half& a, const volatile __half& b) { a = __hmul((__half) a, (__half) b); return a; } -__device__ inline volatile __half& operator/=(volatile __half& a, const volatile __half& b) { a = __hdiv((__half) a, (__half) b); return a; } -#else -__device__ inline volatile __half& operator+=(volatile __half& a, const volatile __half& b) { a = __hadd(a, b); return a; } -__device__ inline volatile __half& operator-=(volatile __half& a, const volatile __half& b) { a = __hsub(a, b); return a; } -__device__ inline volatile __half& operator*=(volatile __half& a, const volatile __half& b) { a = __hmul(a, b); return a; } -__device__ inline volatile __half& operator/=(volatile __half& a, const volatile __half& b) { a = __hdiv(a, b); return a; } -#endif -__device__ inline __half abs(const __half a) { return __habs(a); } -__device__ inline __half pow(const __half a, const __half b) { return hexp2(b * hlog2(a)); } -__device__ inline __half min(const __half a, const __half b) { return __hmin_nan(a, b); } -__device__ inline __half max(const __half a, const __half b) { return __hmax_nan(a, b); } - -// Utility mathematical functions. 
-__device__ inline __half coot_absdiff(const __half x, const __half y) { return fabs(x - y); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s16_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s16_defs.cu deleted file mode 100644 index 3d541f5..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s16_defs.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Utility functions for u16 elements. -__device__ inline bool coot_is_fp(const short) { return false; } -__device__ inline bool coot_is_signed(const short) { return true; } -__device__ inline short coot_type_min(const short) { return COOT_S16_MIN; } -__device__ inline short coot_type_minpos(const short) { return 1; } -__device__ inline short coot_type_max(const short) { return COOT_S16_MAX; } -__device__ inline bool coot_isnan(const short) { return false; } -__device__ inline bool coot_isinf(const short) { return false; } -__device__ inline bool coot_isfinite(const short) { return true; } - -// Conversion functions for u16 elements. 
-__device__ inline short coot_to_short(const uchar& x) { return (short) x; } -__device__ inline short coot_to_short(const char& x) { return (short) x; } -__device__ inline short coot_to_short(const ushort& x) { return (short) x; } -__device__ inline short coot_to_short(const short& x) { return (short) x; } -__device__ inline short coot_to_short(const uint& x) { return (short) x; } -__device__ inline short coot_to_short(const int& x) { return (short) x; } -__device__ inline short coot_to_short(const size_t& x) { return (short) x; } -__device__ inline short coot_to_short(const long& x) { return (short) x; } -#if defined(COOT_HAVE_FP16) -__device__ inline short coot_to_short(const __half& x) { return (short) x; } -#endif -__device__ inline short coot_to_short(const float& x) { return (short) x; } -__device__ inline short coot_to_short(const double& x) { return (short) x; } - -// Utility mathematical functions. -__device__ inline short coot_absdiff(const short x, const short y) { return abs(x - y); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s32_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s32_defs.cu deleted file mode 100644 index d5a2f1e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s32_defs.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// Utility functions for s32 elements. -__device__ inline bool coot_is_fp(const int) { return false; } -__device__ inline bool coot_is_signed(const int) { return true; } -__device__ inline int coot_type_min(const int) { return COOT_S32_MIN; } -__device__ inline int coot_type_minpos(const int) { return 1; } -__device__ inline int coot_type_max(const int) { return COOT_S32_MAX; } -__device__ inline bool coot_isnan(const int) { return false; } -__device__ inline bool coot_isinf(const int) { return false; } -__device__ inline bool coot_isfinite(const int) { return true; } - -// Conversion functions for s32 elements. -__device__ inline int coot_to_int(const uchar& x) { return (int) x; } -__device__ inline int coot_to_int(const char& x) { return (int) x; } -__device__ inline int coot_to_int(const ushort& x) { return (int) x; } -__device__ inline int coot_to_int(const short& x) { return (int) x; } -__device__ inline int coot_to_int(const uint& x) { return (int) x; } -__device__ inline int coot_to_int(const int& x) { return (int) x; } -__device__ inline int coot_to_int(const size_t& x) { return (int) x; } -__device__ inline int coot_to_int(const long& x) { return (int) x; } -#if defined(COOT_HAVE_FP16) -__device__ inline int coot_to_int(const __half& x) { return (int) x; } -#endif -__device__ inline int coot_to_int(const float& x) { return (int) x; } -__device__ inline int coot_to_int(const double& x) { return (int) x; } - -// Utility mathematical functions. 
-__device__ inline int coot_absdiff(const int x, const int y) { return abs(x - y); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s64_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s64_defs.cu deleted file mode 100644 index 1ceebc7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s64_defs.cu +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Utility functions for s64 elements. -__device__ inline bool coot_is_fp(const long) { return false; } -__device__ inline bool coot_is_signed(const long) { return true; } -__device__ inline long coot_type_min(const long) { return COOT_S64_MIN; } -__device__ inline long coot_type_minpos(const long) { return 1; } -__device__ inline long coot_type_max(const long) { return COOT_S64_MAX; } -__device__ inline bool coot_isnan(const long) { return false; } -__device__ inline bool coot_isinf(const long) { return false; } -__device__ inline bool coot_isfinite(const long) { return true; } - -// Conversion functions for s64 elements. 
-__device__ inline long coot_to_long(const uchar& x) { return (long) x; } -__device__ inline long coot_to_long(const char& x) { return (long) x; } -__device__ inline long coot_to_long(const ushort& x) { return (long) x; } -__device__ inline long coot_to_long(const short& x) { return (long) x; } -__device__ inline long coot_to_long(const uint& x) { return (long) x; } -__device__ inline long coot_to_long(const int& x) { return (long) x; } -__device__ inline long coot_to_long(const size_t& x) { return (long) x; } -__device__ inline long coot_to_long(const long& x) { return (long) x; } -#if defined(COOT_HAVE_FP16) -#if CUDA_VERSION < 12020 -__device__ inline long coot_to_long(const __half& x) { return (long) ((long long) x); } -#else -__device__ inline long coot_to_long(const __half& x) { return (long) x; } -#endif -#endif -__device__ inline long coot_to_long(const float& x) { return (long) x; } -__device__ inline long coot_to_long(const double& x) { return (long) x; } - -// Utility mathematical functions. -__device__ inline long coot_absdiff(const long x, const long y) { return abs(x - y); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s8_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s8_defs.cu deleted file mode 100644 index 3130839..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/s8_defs.cu +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Utility functions for s8 elements. -__device__ inline bool coot_is_fp(const char) { return false; } -__device__ inline bool coot_is_signed(const char) { return true; } -__device__ inline bool coot_type_min(const char) { return COOT_S8_MIN; } -__device__ inline bool coot_type_minpos(const char) { return 1; } -__device__ inline bool coot_type_max(const char) { return COOT_S8_MAX; } -__device__ inline bool coot_isnan(const char) { return false; } -__device__ inline bool coot_isinf(const char) { return false; } -__device__ inline bool coot_isfinite(const char) { return true; } - -// Conversion functions for s8 elements. -__device__ inline char coot_to_char(const uchar& x) { return (char) x; } -__device__ inline char coot_to_char(const char& x) { return (char) x; } -__device__ inline char coot_to_char(const ushort& x) { return (char) x; } -__device__ inline char coot_to_char(const short& x) { return (char) x; } -__device__ inline char coot_to_char(const uint& x) { return (char) x; } -__device__ inline char coot_to_char(const int& x) { return (char) x; } -__device__ inline char coot_to_char(const size_t& x) { return (char) x; } -__device__ inline char coot_to_char(const long& x) { return (char) x; } -#if defined(COOT_HAVE_FP16) -#if CUDA_VERSION < 12020 -__device__ inline char coot_to_char(const __half& x) { return (char) ((short) x); } -#else -__device__ inline char coot_to_char(const __half& x) { return (char) x; } -#endif -#endif -__device__ inline char coot_to_char(const float& x) { return (char) x; } -__device__ inline char coot_to_char(const double& x) { return (char) x; } - -// Utility mathematical functions. 
-__device__ inline char coot_absdiff(const char x, const char y) { return abs(x - y); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u16_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u16_defs.cu deleted file mode 100644 index 66d909f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u16_defs.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Utility functions for u16 elements. -__device__ inline bool coot_is_fp(const ushort) { return false; } -__device__ inline bool coot_is_signed(const ushort) { return false; } -__device__ inline ushort coot_type_min(const ushort) { return 0; } -__device__ inline ushort coot_type_minpos(const ushort) { return 1; } -__device__ inline ushort coot_type_max(const ushort) { return COOT_U16_MAX; } -__device__ inline bool coot_isnan(const ushort) { return false; } -__device__ inline bool coot_isinf(const ushort) { return false; } -__device__ inline bool coot_isfinite(const ushort) { return true; } - -// Conversion functions for u16 elements. 
-__device__ inline ushort coot_to_ushort(const uchar& x) { return (ushort) x; } -__device__ inline ushort coot_to_ushort(const char& x) { return (ushort) x; } -__device__ inline ushort coot_to_ushort(const ushort& x) { return (ushort) x; } -__device__ inline ushort coot_to_ushort(const short& x) { return (ushort) x; } -__device__ inline ushort coot_to_ushort(const uint& x) { return (ushort) x; } -__device__ inline ushort coot_to_ushort(const int& x) { return (ushort) x; } -__device__ inline ushort coot_to_ushort(const size_t& x) { return (ushort) x; } -__device__ inline ushort coot_to_ushort(const long& x) { return (ushort) x; } -#if defined(COOT_HAVE_FP16) -__device__ inline ushort coot_to_ushort(const __half& x) { return (ushort) x; } -#endif -__device__ inline ushort coot_to_ushort(const float& x) { return (ushort) x; } -__device__ inline ushort coot_to_ushort(const double& x) { return (ushort) x; } - -// Utility mathematical functions. -__device__ inline ushort coot_absdiff(const ushort x, const ushort y) { return (x > y) ? (x - y) : (y - x); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u32_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u32_defs.cu deleted file mode 100644 index c67d75c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u32_defs.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// Utility functions for u32 elements. -__device__ inline bool coot_is_fp(const uint) { return false; } -__device__ inline bool coot_is_signed(const uint) { return false; } -__device__ inline uint coot_type_min(const uint) { return 0; } -__device__ inline uint coot_type_minpos(const uint) { return 1; } -__device__ inline uint coot_type_max(const uint) { return COOT_U32_MAX; } -__device__ inline bool coot_isnan(const uint) { return false; } -__device__ inline bool coot_isinf(const uint) { return false; } -__device__ inline bool coot_isfinite(const uint) { return true; } - -// Conversion functions for u32 elements. -__device__ inline uint coot_to_uint(const uchar& x) { return (uint) x; } -__device__ inline uint coot_to_uint(const char& x) { return (uint) x; } -__device__ inline uint coot_to_uint(const ushort& x) { return (uint) x; } -__device__ inline uint coot_to_uint(const short& x) { return (uint) x; } -__device__ inline uint coot_to_uint(const uint& x) { return (uint) x; } -__device__ inline uint coot_to_uint(const int& x) { return (uint) x; } -__device__ inline uint coot_to_uint(const size_t& x) { return (uint) x; } -__device__ inline uint coot_to_uint(const long& x) { return (uint) x; } -#if defined(COOT_HAVE_FP16) -__device__ inline uint coot_to_uint(const __half& x) { return (uint) x; } -#endif -__device__ inline uint coot_to_uint(const float& x) { return (uint) x; } -__device__ inline uint coot_to_uint(const double& x) { return (uint) x; } - -// Utility mathematical functions. -__device__ inline uint coot_absdiff(const uint x, const uint y) { return (x > y) ? 
(x - y) : (y - x); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u64_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u64_defs.cu deleted file mode 100644 index 942e795..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u64_defs.cu +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Utility functions for u64 elements. -__device__ inline bool coot_is_fp(const size_t) { return false; } -__device__ inline bool coot_is_signed(const size_t) { return false; } -__device__ inline size_t coot_type_min(const size_t) { return 0; } -__device__ inline size_t coot_type_minpos(const size_t) { return 1; } -__device__ inline size_t coot_type_max(const size_t) { return COOT_U64_MAX; } -__device__ inline bool coot_isnan(const size_t) { return false; } -__device__ inline bool coot_isinf(const size_t) { return false; } -__device__ inline bool coot_isfinite(const size_t) { return true; } - -// Conversion functions for u64 elements. 
-__device__ inline size_t coot_to_size_t(const uchar& x) { return (size_t) x; } -__device__ inline size_t coot_to_size_t(const char& x) { return (size_t) x; } -__device__ inline size_t coot_to_size_t(const ushort& x) { return (size_t) x; } -__device__ inline size_t coot_to_size_t(const short& x) { return (size_t) x; } -__device__ inline size_t coot_to_size_t(const uint& x) { return (size_t) x; } -__device__ inline size_t coot_to_size_t(const int& x) { return (size_t) x; } -__device__ inline size_t coot_to_size_t(const size_t& x) { return (size_t) x; } -__device__ inline size_t coot_to_size_t(const long& x) { return (size_t) x; } -#if defined(COOT_HAVE_FP16) -#if CUDA_VERSION < 12020 -__device__ inline size_t coot_to_size_t(const __half& x) { return (size_t) ((unsigned long long) x); } -#else -__device__ inline size_t coot_to_size_t(const __half& x) { return (size_t) x; } -#endif -#endif -__device__ inline size_t coot_to_size_t(const float& x) { return (size_t) x; } -__device__ inline size_t coot_to_size_t(const double& x) { return (size_t) x; } - -// Utility mathematical functions. -__device__ inline size_t coot_absdiff(const size_t x, const size_t y) { return (x > y) ? (x - y) : (y - x); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u8_defs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u8_defs.cu deleted file mode 100644 index e2a2e0d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/defs/u8_defs.cu +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Utility functions for u8 elements. -__device__ inline bool coot_is_fp(const uchar) { return false; } -__device__ inline bool coot_is_signed(const uchar) { return false; } -__device__ inline uchar coot_type_min(const uchar) { return 0; } -__device__ inline uchar coot_type_minpos(const uchar) { return 1; } -__device__ inline uchar coot_type_max(const uchar) { return COOT_U8_MAX; } -__device__ inline bool coot_isnan(const uchar) { return false; } -__device__ inline bool coot_isinf(const uchar) { return false; } -__device__ inline bool coot_isfinite(const uchar) { return true; } - -// Conversion functions for u8 elements. -__device__ inline uchar coot_to_uchar(const uchar& x) { return (uchar) x; } -__device__ inline uchar coot_to_uchar(const char& x) { return (uchar) x; } -__device__ inline uchar coot_to_uchar(const ushort& x) { return (uchar) x; } -__device__ inline uchar coot_to_uchar(const short& x) { return (uchar) x; } -__device__ inline uchar coot_to_uchar(const uint& x) { return (uchar) x; } -__device__ inline uchar coot_to_uchar(const int& x) { return (uchar) x; } -__device__ inline uchar coot_to_uchar(const size_t& x) { return (uchar) x; } -__device__ inline uchar coot_to_uchar(const long& x) { return (uchar) x; } -#if defined(COOT_HAVE_FP16) -#if CUDA_VERSION < 12020 -__device__ inline uchar coot_to_uchar(const __half& x) { return (uchar) ((ushort) x); } -#else -__device__ inline uchar coot_to_uchar(const __half& x) { return (uchar) x; } -#endif -#endif -__device__ inline uchar coot_to_uchar(const float& x) { return (uchar) x; } -__device__ inline uchar coot_to_uchar(const double& x) { return (uchar) x; } - -// Utility mathematical functions. -__device__ inline uchar coot_absdiff(const uchar x, const uchar y) { return (x > y) ? 
(x - y) : (y - x); } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/accu_subgroup_reduce.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/deps/accu_subgroup_reduce.cu deleted file mode 100644 index 9169a5e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/accu_subgroup_reduce.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__device__ -void -COOT_FN(PREFIX,accu_subgroup_reduce)(volatile eT1* data, int tid) - { - data[tid] += data[tid + 32]; - data[tid] += data[tid + 16]; - data[tid] += data[tid + 8]; - data[tid] += data[tid + 4]; - data[tid] += data[tid + 2]; - data[tid] += data[tid + 1]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/and_subgroup_reduce_u32.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/deps/and_subgroup_reduce_u32.cu deleted file mode 100644 index c1b6c84..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/and_subgroup_reduce_u32.cu +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__device__ -void -and_subgroup_reduce_u32(volatile uint* data, int tid) - { - data[tid] &= data[tid + 32]; - data[tid] &= data[tid + 16]; - data[tid] &= data[tid + 8]; - data[tid] &= data[tid + 4]; - data[tid] &= data[tid + 2]; - data[tid] &= data[tid + 1]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/max_subgroup_reduce.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/deps/max_subgroup_reduce.cu deleted file mode 100644 index 96241b2..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/max_subgroup_reduce.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__device__ -void -COOT_FN(PREFIX,max_subgroup_reduce)(volatile eT1* data, int tid) - { - data[tid] = max(data[tid], data[tid + 32]); - data[tid] = max(data[tid], data[tid + 16]); - data[tid] = max(data[tid], data[tid + 8]); - data[tid] = max(data[tid], data[tid + 4]); - data[tid] = max(data[tid], data[tid + 2]); - data[tid] = max(data[tid], data[tid + 1]); - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/min_subgroup_reduce.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/deps/min_subgroup_reduce.cu deleted file mode 100644 index 93caecb..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/min_subgroup_reduce.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2021 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__device__ -void -COOT_FN(PREFIX,min_subgroup_reduce)(volatile eT1* data, int tid) - { - data[tid] = min(data[tid], data[tid + 32]); - data[tid] = min(data[tid], data[tid + 16]); - data[tid] = min(data[tid], data[tid + 8]); - data[tid] = min(data[tid], data[tid + 4]); - data[tid] = min(data[tid], data[tid + 2]); - data[tid] = min(data[tid], data[tid + 1]); - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/or_subgroup_reduce_u32.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/deps/or_subgroup_reduce_u32.cu deleted file mode 100644 index de00860..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/or_subgroup_reduce_u32.cu +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__device__ -void -or_subgroup_reduce_u32(volatile uint* data, int tid) - { - data[tid] |= data[tid + 32]; - data[tid] |= data[tid + 16]; - data[tid] |= data[tid + 8]; - data[tid] |= data[tid + 4]; - data[tid] |= data[tid + 2]; - data[tid] |= data[tid + 1]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/prod_subgroup_reduce.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/deps/prod_subgroup_reduce.cu deleted file mode 100644 index 65a1d6b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/prod_subgroup_reduce.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__device__ -void -COOT_FN(PREFIX,prod_subgroup_reduce)(volatile eT1* data, int tid) - { - data[tid] *= data[tid + 32]; - data[tid] *= data[tid + 16]; - data[tid] *= data[tid + 8]; - data[tid] *= data[tid + 4]; - data[tid] *= data[tid + 2]; - data[tid] *= data[tid + 1]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/var_philox.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/deps/var_philox.cu deleted file mode 100644 index 1135b5b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/deps/var_philox.cu +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// Implementations of the variable philox algorithm to generate random numbers. -// Adapted from Mitchell, Stokes, Frank, and Holmes (2022), Listing 1. - - - -inline __device__ UWORD var_philox(const UWORD val, const UWORD* keys, const unsigned char bits) - { - // via Salmon, Moraes, Dror, and Shaw (2011): "Parallel random numbers: as easy as 1, 2, 3". - static const UWORD M0 = 0xD2B74407B1CE6E93; - - // The right side is allowed to have the extra bits. 
- const unsigned char right_side_bits = (bits + 1) / 2; - const unsigned char left_side_bits = bits / 2; - const uint left_mask = (((uint) 1) << left_side_bits) - 1; - const uint right_mask = (((uint) 1) << right_side_bits) - 1; - - uint state0 = (uint) (val >> right_side_bits); - uint state1 = (uint) (val & right_mask); - - // 24 rounds is what is needed to pass all the RNG tests (see section 5 of the paper). - uint hi, lo; - for (unsigned char i = 0; i < 24; ++i) - { - - // 64-bit integer multiplication, split the results into two uints - UWORD hilo = M0 * state0; - hi = (hilo >> 32); - lo = (uint) hilo; - - lo = (lo << (right_side_bits - left_side_bits)) | (state1 >> left_side_bits); - - state0 = ((hi ^ keys[i]) ^ state1) & left_mask; - state1 = lo & right_mask; - } - - // Combine the sides for the result. - return UWORD((state0 << right_side_bits) | UWORD(state1)); - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu.cu deleted file mode 100644 index ca8413c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu.cu +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,accu)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - aux_mem[tid] += in_mem[i] + in_mem[i + blockDim.x]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] += in_mem[i]; - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,accu_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu_simple.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu_simple.cu deleted file mode 100644 index a666628..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu_simple.cu +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,accu_simple)(eT1* out, - const eT1* A, - const UWORD A_len) - { - const UWORD id = blockIdx.x * blockDim.x + threadIdx.x; - if(id == 0) - { - eT1 acc = TO_ET1(0); // runtime unrolling is not supported by CUDA - for(UWORD i = 0; i < A_len; ++i) - { - acc += A[i]; - } - - out[0] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu_small.cu deleted file mode 100644 index f196060..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/accu_small.cu +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,accu_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - aux_mem[tid] += in_mem[i] + in_mem[i + blockDim.x]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] += in_mem[i]; - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal.cu deleted file mode 100644 index 01b1cf6..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal.cu +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,approx_equal)(uint* out_mem, - const eT1* A_mem, - const UWORD A_M_n_rows, - const eT1* B_mem, - const UWORD B_M_n_rows, - const UWORD n_rows, - const UWORD n_elem, - const UWORD mode, - const eT1 abs_tol, - const eT1 rel_tol) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 1; - - while (i + blockDim.x < n_elem) - { - // A bit painful... - const UWORD row1 = i % n_rows; - const UWORD col1 = i / n_rows; - const UWORD row2 = (i + blockDim.x) % n_rows; - const UWORD col2 = (i + blockDim.x) / n_rows; - - const UWORD A_loc1 = row1 + col1 * A_M_n_rows; - const UWORD A_loc2 = row2 + col2 * A_M_n_rows; - const UWORD B_loc1 = row1 + col1 * B_M_n_rows; - const UWORD B_loc2 = row2 + col2 * B_M_n_rows; - - const eT1 A_val1 = A_mem[A_loc1]; - const eT1 B_val1 = B_mem[B_loc1]; - const eT1 A_val2 = A_mem[A_loc2]; - const eT1 B_val2 = B_mem[B_loc2]; - - if (coot_isnan(A_val1) || coot_isnan(B_val1) || coot_isnan(A_val2) || coot_isnan(B_val2)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff1 = coot_absdiff(A_val1, B_val1); - const eT1 absdiff2 = coot_absdiff(A_val2, B_val2); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff1 <= abs_tol); - aux_mem[tid] &= (absdiff2 <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val1 = max(ET1_ABS(A_val1), ET1_ABS(B_val1)); - const eT1 max_val2 = max(ET1_ABS(A_val2), ET1_ABS(B_val2)); - - if (max_val1 >= TO_ET1(1)) - { - aux_mem[tid] &= (absdiff1 <= rel_tol * max_val1); - aux_mem[tid] &= (absdiff2 <= rel_tol * max_val2); - } - else - { - aux_mem[tid] &= (absdiff1 / max_val1 <= rel_tol); - aux_mem[tid] &= (absdiff2 / max_val2 <= rel_tol); - } - } - - i += grid_size; - } - if (i < n_elem) - { - const UWORD row = i % n_rows; - const UWORD col = i / n_rows; - - const UWORD A_loc = row + col * A_M_n_rows; - const UWORD B_loc = row + col * B_M_n_rows; - - const eT1 A_val = A_mem[A_loc]; - const eT1 B_val = B_mem[B_loc]; - - if (coot_isnan(A_val) || coot_isnan(B_val)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff = coot_absdiff(A_val, B_val); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val = max(ET1_ABS(A_val), ET1_ABS(B_val)); - - if (max_val >= TO_ET1(1)) - { - aux_mem[tid] &= (absdiff <= rel_tol * max_val); - } - else - { - aux_mem[tid] &= (absdiff / max_val <= rel_tol); - } - } - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - and_subgroup_reduce_u32(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_cube.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_cube.cu deleted file mode 100644 index e01db1e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_cube.cu +++ /dev/null @@ -1,161 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,approx_equal_cube)(uint* out_mem, - const eT1* A_mem, - const UWORD A_M_n_rows, - const UWORD A_M_n_cols, - const eT1* B_mem, - const UWORD B_M_n_rows, - const UWORD B_M_n_cols, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_elem, - const UWORD mode, - const eT1 abs_tol, - const eT1 rel_tol) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - const UWORD n_elem_slice = n_rows * n_cols; - - aux_mem[tid] = 1; - - while (i + blockDim.x < n_elem) - { - // A bit painful... TODO: implement a more efficient non-modulo approach - const UWORD elem1 = i % n_elem_slice; - const UWORD slice1 = i / n_elem_slice; - const UWORD row1 = elem1 % n_rows; - const UWORD col1 = elem1 / n_rows; - - const UWORD elem2 = (i + blockDim.x) % n_elem_slice; - const UWORD slice2 = (i + blockDim.x) / n_elem_slice; - const UWORD row2 = elem2 % n_rows; - const UWORD col2 = elem2 / n_rows; - - const UWORD A_loc1 = row1 + col1 * A_M_n_rows + slice1 * A_M_n_rows * A_M_n_cols; - const UWORD A_loc2 = row2 + col2 * A_M_n_rows + slice2 * A_M_n_rows * A_M_n_cols; - const UWORD B_loc1 = row1 + col1 * B_M_n_rows + slice1 * B_M_n_rows * B_M_n_cols; - const UWORD B_loc2 = row2 + col2 * B_M_n_rows + slice2 * B_M_n_rows * B_M_n_cols; - - const eT1 A_val1 = A_mem[A_loc1]; - const eT1 B_val1 = B_mem[B_loc1]; - const eT1 A_val2 = A_mem[A_loc2]; - const eT1 B_val2 = B_mem[B_loc2]; - - if (coot_isnan(A_val1) || coot_isnan(B_val1) || coot_isnan(A_val2) || coot_isnan(B_val2)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff1 = coot_absdiff(A_val1, B_val1); - const eT1 absdiff2 = coot_absdiff(A_val2, B_val2); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff1 <= abs_tol); - aux_mem[tid] &= (absdiff2 <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val1 = max(ET1_ABS(A_val1), ET1_ABS(B_val1)); - const eT1 max_val2 = max(ET1_ABS(A_val2), ET1_ABS(B_val2)); - - if (max_val1 >= TO_ET1(1)) - { - aux_mem[tid] &= (absdiff1 <= rel_tol * max_val1); - aux_mem[tid] &= (absdiff2 <= rel_tol * max_val2); - } - else - { - aux_mem[tid] &= (absdiff1 / max_val1 <= rel_tol); - aux_mem[tid] &= (absdiff2 / max_val2 <= rel_tol); - } - } - - i += grid_size; - } - if (i < n_elem) - { - const UWORD elem = i % n_elem_slice; - const UWORD slice = i / n_elem_slice; - const UWORD row = elem % n_rows; - const UWORD col = elem / n_rows; - - const UWORD A_loc = row + col * A_M_n_rows + slice * A_M_n_rows * A_M_n_cols; - const UWORD B_loc = row + col * B_M_n_rows + slice * B_M_n_rows * B_M_n_cols; - - const eT1 A_val = A_mem[A_loc]; - const eT1 B_val = B_mem[B_loc]; - - if (coot_isnan(A_val) || coot_isnan(B_val)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff = coot_absdiff(A_val, B_val); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val = max(ET1_ABS(A_val), ET1_ABS(B_val)); - - if (max_val >= TO_ET1(1)) - { - aux_mem[tid] &= (absdiff <= rel_tol * max_val); - } - else - { - aux_mem[tid] &= (absdiff / max_val <= rel_tol); - } - } - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - and_subgroup_reduce_u32(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_cube_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_cube_small.cu deleted file mode 100644 index 72d3dde..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_cube_small.cu +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,approx_equal_cube_small)(uint* out_mem, - const eT1* A_mem, - const UWORD A_M_n_rows, - const UWORD A_M_n_cols, - const eT1* B_mem, - const UWORD B_M_n_rows, - const UWORD B_M_n_cols, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_elem, - const UWORD mode, - const eT1 abs_tol, - const eT1 rel_tol) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - const UWORD n_elem_slice = n_rows * n_cols; - - aux_mem[tid] = 1; - - while (i + blockDim.x < n_elem) - { - // A bit painful... TODO: implement a more efficient non-modulo approach - const UWORD elem1 = i % n_elem_slice; - const UWORD slice1 = i / n_elem_slice; - const UWORD row1 = elem1 % n_rows; - const UWORD col1 = elem1 / n_rows; - - const UWORD elem2 = (i + blockDim.x) % n_elem_slice; - const UWORD slice2 = (i + blockDim.x) / n_elem_slice; - const UWORD row2 = elem2 % n_rows; - const UWORD col2 = elem2 / n_rows; - - const UWORD A_loc1 = row1 + col1 * A_M_n_rows + slice1 * A_M_n_rows * A_M_n_cols; - const UWORD A_loc2 = row2 + col2 * A_M_n_rows + slice2 * A_M_n_rows * A_M_n_cols; - const UWORD B_loc1 = row1 + col1 * B_M_n_rows + slice1 * B_M_n_rows * B_M_n_cols; - const UWORD B_loc2 = row2 + col2 * B_M_n_rows + slice2 * B_M_n_rows * B_M_n_cols; - - const eT1 A_val1 = A_mem[A_loc1]; - const eT1 B_val1 = B_mem[B_loc1]; - const eT1 A_val2 = A_mem[A_loc2]; - const eT1 B_val2 = B_mem[B_loc2]; - - if (coot_isnan(A_val1) || coot_isnan(B_val1) || coot_isnan(A_val2) || coot_isnan(B_val2)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff1 = coot_absdiff(A_val1, B_val1); - const eT1 absdiff2 = coot_absdiff(A_val2, B_val2); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff1 <= abs_tol); - aux_mem[tid] &= (absdiff2 <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val1 = max(ET1_ABS(A_val1), ET1_ABS(B_val1)); - const eT1 max_val2 = max(ET1_ABS(A_val2), ET1_ABS(B_val2)); - - if (max_val1 >= TO_ET1(1)) - { - aux_mem[tid] &= (absdiff1 <= rel_tol * max_val1); - aux_mem[tid] &= (absdiff2 <= rel_tol * max_val2); - } - else - { - aux_mem[tid] &= (absdiff1 / max_val1 <= rel_tol); - aux_mem[tid] &= (absdiff2 / max_val2 <= rel_tol); - } - } - - i += grid_size; - } - if (i < n_elem) - { - const UWORD elem = i % n_elem_slice; - const UWORD slice = i / n_elem_slice; - const UWORD row = elem % n_rows; - const UWORD col = elem / n_rows; - - const UWORD A_loc = row + col * A_M_n_rows + slice * A_M_n_rows * A_M_n_cols; - const UWORD B_loc = row + col * B_M_n_rows + slice * B_M_n_rows * B_M_n_cols; - - const eT1 A_val = A_mem[A_loc]; - const eT1 B_val = B_mem[B_loc]; - - if (coot_isnan(A_val) || coot_isnan(B_val)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff = coot_absdiff(A_val, B_val); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val = max(ET1_ABS(A_val), ET1_ABS(B_val)); - - if (max_val >= TO_ET1(1)) - { - aux_mem[tid] &= (absdiff <= rel_tol * max_val); - } - else - { - aux_mem[tid] &= (absdiff / max_val <= rel_tol); - } - } - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_small.cu deleted file mode 100644 index e16e98b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/approx_equal_small.cu +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,approx_equal_small)(uint* out_mem, - const eT1* A_mem, - const UWORD A_M_n_rows, - const eT1* B_mem, - const UWORD B_M_n_rows, - const UWORD n_rows, - const UWORD n_elem, - const UWORD mode, - const eT1 abs_tol, - const eT1 rel_tol) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 1; - - while (i + blockDim.x < n_elem) - { - // A bit painful... - const UWORD row1 = i % n_rows; - const UWORD col1 = i / n_rows; - const UWORD row2 = (i + blockDim.x) % n_rows; - const UWORD col2 = (i + blockDim.x) / n_rows; - - const UWORD A_loc1 = row1 + col1 * A_M_n_rows; - const UWORD A_loc2 = row2 + col2 * A_M_n_rows; - const UWORD B_loc1 = row1 + col1 * B_M_n_rows; - const UWORD B_loc2 = row2 + col2 * B_M_n_rows; - - const eT1 A_val1 = A_mem[A_loc1]; - const eT1 B_val1 = B_mem[B_loc1]; - const eT1 A_val2 = A_mem[A_loc2]; - const eT1 B_val2 = B_mem[B_loc2]; - - if (coot_isnan(A_val1) || coot_isnan(B_val1) || coot_isnan(A_val2) || coot_isnan(B_val2)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff1 = coot_absdiff(A_val1, B_val1); - const eT1 absdiff2 = coot_absdiff(A_val2, B_val2); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff1 <= abs_tol); - aux_mem[tid] &= (absdiff2 <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val1 = max(ET1_ABS(A_val1), ET1_ABS(B_val1)); - const eT1 max_val2 = max(ET1_ABS(A_val2), ET1_ABS(B_val2)); - - if (max_val1 >= TO_ET1(1)) - { - aux_mem[tid] &= (absdiff1 <= rel_tol * max_val1); - aux_mem[tid] &= (absdiff2 <= rel_tol * max_val2); - } - else - { - aux_mem[tid] &= (absdiff1 / max_val1 <= rel_tol); - aux_mem[tid] &= (absdiff2 / max_val2 <= rel_tol); - } - } - - i += grid_size; - } - if (i < n_elem) - { - const UWORD row = i % n_rows; - const UWORD col = i / n_rows; - - const UWORD A_loc = row + col * A_M_n_rows; - const UWORD B_loc = row + col * B_M_n_rows; - - const eT1 A_val = A_mem[A_loc]; - const eT1 B_val = B_mem[B_loc]; - - if (coot_isnan(A_val) || coot_isnan(B_val)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff = coot_absdiff(A_val, B_val); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val = max(ET1_ABS(A_val), ET1_ABS(B_val)); - - if (max_val >= TO_ET1(1)) - { - aux_mem[tid] &= (absdiff <= rel_tol * max_val); - } - else - { - aux_mem[tid] &= (absdiff / max_val <= rel_tol); - } - } - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/count_nonzeros.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/count_nonzeros.cu deleted file mode 100644 index 76a44bc..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/count_nonzeros.cu +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,count_nonzeros)(const eT1* A, - UWORD* thread_counts, - const UWORD n_elem) - { - // We want to pass over the memory in A and count the number of nonzero elements. - // This will give us a count for each individual thread; we then want to prefix-sum this. - // This kernel is meant to be used as the first part of find(). 
- - UWORD* aux_mem = (UWORD*) aux_shared_mem; // should have size equal to num_threads + 1 - - const UWORD tid = threadIdx.x; - - const UWORD num_threads = blockDim.x; - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD local_count = 0; - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - if (A[i] != TO_ET1(0)) - { - ++local_count; - } - if (A[i + 1] != TO_ET1(0)) - { - ++local_count; - } - - i += 2; - } - if (i < end_elem) - { - if (A[i] != TO_ET1(0)) - { - ++local_count; - } - } - - // Aggregate the counts for all threads. - aux_mem[tid] = local_count; - __syncthreads(); - - // Up-sweep total sum into final element. - UWORD offset = 1; - - for (UWORD s = num_threads / 2; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - // Set the last element correctly. - thread_counts[num_threads] = aux_mem[num_threads - 1]; - aux_mem[num_threads - 1] = 0; - } - __syncthreads(); - - // Down-sweep to build prefix sum. 
- for (UWORD s = 1; s <= num_threads / 2; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - thread_counts[tid] = aux_mem[tid]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find.cu deleted file mode 100644 index e9fc1fa..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find.cu +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,find)(const eT1* A, - const UWORD* thread_counts, - UWORD* out, - const UWORD n_elem) - { - // Our goal is to fill `out` with the indices of nonzero values. - // Since the kernel is multithreaded, each thread will handle a different (contiguous) part of `A`. - // We expect that we already have the starting position for each thread in `thread_counts`. - // (It should have been filled with the `count_nonzeros` kernel.) 
- - const UWORD tid = threadIdx.x; - - const UWORD num_threads = blockDim.x; - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD out_index = thread_counts[tid]; - - UWORD i = start_elem; - - while (i + 1 < end_elem) - { - if (A[i] != TO_ET1(0)) - { - out[out_index++] = i; - } - if (A[i + 1] != TO_ET1(0)) - { - out[out_index++] = (i + 1); - } - - i += 2; - } - if (i < end_elem) - { - if (A[i] != TO_ET1(0)) - { - out[out_index++] = i; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find_first.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find_first.cu deleted file mode 100644 index 750610c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find_first.cu +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,find_first)(const eT1* A, - const UWORD* thread_counts, - UWORD* out, - const UWORD k, - const UWORD n_elem) - { - // Our goal is to fill `out` with the first `k` indices of nonzero values. - // It is assumed that `k != 0`; if `k` is `0`, use the `find` kernel instead. 
- // Since the kernel is multithreaded, each thread will handle a different (contiguous) part of `A`. - // We expect that we already have the starting position for each thread in `thread_counts`. - // (It should have been filled with the `count_nonzeros` kernel.) - - const UWORD tid = threadIdx.x; - - const UWORD num_threads = blockDim.x; - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD out_index = thread_counts[tid]; - - UWORD i = start_elem; - - // We only want to find the first k points. - if (out_index < k) - { - while (i + 1 < end_elem) - { - if (A[i] != TO_ET1(0) && out_index < k) - { - out[out_index++] = i; - } - if (A[i + 1] != TO_ET1(0) && out_index < k) - { - out[out_index++] = (i + 1); - } - - i += 2; - } - if (i < end_elem) - { - if (A[i] != TO_ET1(0) && out_index < k) - { - out[out_index++] = i; - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find_last.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find_last.cu deleted file mode 100644 index e19e0fc..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/find_last.cu +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,find_last)(const eT1* A, - const UWORD* thread_counts, - UWORD* out, - const UWORD m, - const UWORD n_elem) - { - // Our goal is to fill `out` with the last `k` indices of nonzero values. - // (Note that to match Armadillo's behavior, we want the last `k` indices in ascending order.) - // Instead of accepting `k` as a parameter, we instead accept `m = nnz - k`. - // This gives us the first index we should be putting an output value in. - // It is also assumed that `k != 0`; if `k` is `0`, use the `find` kernel instead. - - // Since the kernel is multithreaded, each thread will handle a different (contiguous) part of `A`. - // We expect that we already have the starting position for each thread in `thread_counts`. - // (It should have been filled with the `count_nonzeros` kernel.) - - const UWORD tid = threadIdx.x; - - const UWORD num_threads = blockDim.x; - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD out_index = thread_counts[tid]; - UWORD last_out_index = thread_counts[tid + 1]; - - UWORD i = start_elem; - - // We only want to find points with index `m` or higher. 
- if (last_out_index >= m) - { - while (i + 1 < end_elem) - { - if (A[i] != TO_ET1(0)) - { - if (out_index >= m) - { - out[out_index - m] = i; - } - - ++out_index; - } - if (A[i + 1] != TO_ET1(0)) - { - if (out_index >= m) - { - out[out_index - m] = (i + 1); - } - - ++out_index; - } - - i += 2; - } - - if (i < end_elem) - { - if (A[i] != TO_ET1(0)) - { - if (out_index >= m) - { - out[out_index - m] = i; - } - - ++out_index; - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max.cu deleted file mode 100644 index 7d7cf60..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max.cu +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__device__ -void -COOT_FN(PREFIX,index_max_subgroup_reduce)(volatile eT1* data, volatile UWORD* uword_data, int tid) - { - if ((const eT1) data[tid + 32] > (const eT1) data[tid]) - { - data[tid] = data[tid + 32]; - uword_data[tid] = uword_data[tid + 32]; - } - - if ((const eT1) data[tid + 16] > (const eT1) data[tid]) - { - data[tid] = data[tid + 16]; - uword_data[tid] = uword_data[tid + 16]; - } - - if ((const eT1) data[tid + 8] > (const eT1) data[tid]) - { - data[tid] = data[tid + 8]; - uword_data[tid] = uword_data[tid + 8]; - } - - if ((const eT1) data[tid + 4] > (const eT1) data[tid]) - { - data[tid] = data[tid + 4]; - uword_data[tid] = uword_data[tid + 4]; - } - - if ((const eT1) data[tid + 2] > (const eT1) data[tid]) - { - data[tid] = data[tid + 2]; - uword_data[tid] = uword_data[tid + 2]; - } - - if ((const eT1) data[tid + 1] > (const eT1) data[tid]) - { - data[tid] = data[tid + 1]; - uword_data[tid] = uword_data[tid + 1]; - } - } - - - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,index_max)(const eT1* in_mem, - const UWORD* in_uword_mem, - const UWORD use_uword_mem, - const UWORD n_elem, - eT1* out_mem, - UWORD* out_uword_mem, - const UWORD uword_aux_mem_start) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - UWORD* aux_uword_mem = (UWORD*) (aux_shared_mem + uword_aux_mem_start); - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = coot_type_min(TO_ET1(0)); - aux_uword_mem[tid] = SIZE_MAX; - - if (i < n_elem) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? 
in_uword_mem[i] : i); - } - if (i + blockDim.x < n_elem) - { - if (in_mem[i + blockDim.x] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[i + blockDim.x]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i + blockDim.x] : (i + blockDim.x)); - } - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - if (in_mem[i] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i] : i); - } - - if (in_mem[i + blockDim.x] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[i + blockDim.x]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i + blockDim.x] : (i + blockDim.x)); - } - - i += grid_size; - } - if (i < n_elem) - { - if (in_mem[i] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i] : i); - } - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - if (aux_mem[tid + s] > aux_mem[tid]) - { - aux_mem[tid] = aux_mem[tid + s]; - aux_uword_mem[tid] = aux_uword_mem[tid + s]; - } - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,index_max_subgroup_reduce)(aux_mem, aux_uword_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - out_uword_mem[blockIdx.x] = aux_uword_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_colwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_colwise.cu deleted file mode 100644 index 5c71089..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_colwise.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,index_max_colwise)(UWORD* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[col * src_M_n_rows]); - eT1 best_val = colptr[0]; - UWORD best_index = 0; - for (UWORD i = 1; i < n_rows; ++i) - { - if (colptr[i] > best_val) - { - best_val = colptr[i]; - best_index = i; - } - } - - dest[col * dest_mem_incr] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_cube_col.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_cube_col.cu deleted file mode 100644 index 68f4188..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_cube_col.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,index_max_cube_col)(UWORD* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD slice = blockIdx.y * blockDim.y + threadIdx.y; - - if(row < n_rows && slice < n_slices) - { - eT1 best_val = src[row + slice * n_rows * n_cols]; - UWORD best_index = 0; - for (UWORD i = 1; i < n_cols; ++i) - { - if (src[(i * n_rows) + row + slice * n_rows * n_cols] > best_val) - { - best_val = src[(i * n_rows) + row + slice * n_rows * n_cols]; - best_index = i; - } - } - - dest[row + slice * n_rows] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_rowwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_rowwise.cu deleted file mode 100644 index 3ae5cc2..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_rowwise.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,index_max_rowwise)(UWORD* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT1 best_val = src[row]; - UWORD best_index = 0; - for (UWORD i = 1; i < n_cols; ++i) - { - if (src[(i * src_M_n_rows) + row] > best_val) - { - best_val = src[(i * src_M_n_rows) + row]; - best_index = i; - } - } - - dest[row * dest_mem_incr] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_small.cu deleted file mode 100644 index 2c1ca36..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_max_small.cu +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,index_max_small)(const eT1* in_mem, - const UWORD* in_uword_mem, - const UWORD use_uword_mem, - const UWORD n_elem, - eT1* out_mem, - UWORD* out_uword_mem, - const UWORD uword_aux_mem_start) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - UWORD* aux_uword_mem = (UWORD*) (aux_shared_mem + uword_aux_mem_start); - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = coot_type_max(TO_ET1(0)); - aux_uword_mem[tid] = SIZE_MAX; - - if (i < n_elem) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i] : i); - } - if (i + blockDim.x < n_elem) - { - if (in_mem[i + blockDim.x] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[i + blockDim.x]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i + blockDim.x] : (i + blockDim.x)); - } - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - if (in_mem[i] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i] : i); - } - - if (in_mem[i + blockDim.x] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[i + blockDim.x]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i + blockDim.x] : (i + blockDim.x)); - } - - i += grid_size; - } - if (i < n_elem) - { - if (in_mem[i] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? 
in_uword_mem[i] : i); - } - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - if (aux_mem[tid + s] > aux_mem[tid]) - { - aux_mem[tid] = aux_mem[tid + s]; - aux_uword_mem[tid] = aux_uword_mem[tid + s]; - } - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - out_uword_mem[blockIdx.x] = aux_uword_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min.cu deleted file mode 100644 index ae377c2..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min.cu +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__device__ -void -COOT_FN(PREFIX,index_min_subgroup_reduce)(volatile eT1* data, volatile UWORD* uword_data, int tid) - { - if ((const eT1) data[tid + 32] < (const eT1) data[tid]) - { - data[tid] = data[tid + 32]; - uword_data[tid] = uword_data[tid + 32]; - } - - if ((const eT1) data[tid + 16] < (const eT1) data[tid]) - { - data[tid] = data[tid + 16]; - uword_data[tid] = uword_data[tid + 16]; - } - - if ((const eT1) data[tid + 8] < (const eT1) data[tid]) - { - data[tid] = data[tid + 8]; - uword_data[tid] = uword_data[tid + 8]; - } - - if ((const eT1) data[tid + 4] < (const eT1) data[tid]) - { - data[tid] = data[tid + 4]; - uword_data[tid] = uword_data[tid + 4]; - } - - if ((const eT1) data[tid + 2] < (const eT1) data[tid]) - { - data[tid] = data[tid + 2]; - uword_data[tid] = uword_data[tid + 2]; - } - - if ((const eT1) data[tid + 1] < (const eT1) data[tid]) - { - data[tid] = data[tid + 1]; - uword_data[tid] = uword_data[tid + 1]; - } - } - - - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,index_min)(const eT1* in_mem, - const UWORD* in_uword_mem, - const UWORD use_uword_mem, - const UWORD n_elem, - eT1* out_mem, - UWORD* out_uword_mem, - const UWORD uword_aux_mem_start) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - UWORD* aux_uword_mem = (UWORD*) (aux_shared_mem + uword_aux_mem_start); - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = coot_type_max(TO_ET1(0)); - aux_uword_mem[tid] = SIZE_MAX; - - if (i < n_elem) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? 
in_uword_mem[i] : i); - } - if (i + blockDim.x < n_elem) - { - if (in_mem[i + blockDim.x] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[i + blockDim.x]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i + blockDim.x] : (i + blockDim.x)); - } - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - if (in_mem[i] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i] : i); - } - - if (in_mem[i + blockDim.x] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[i + blockDim.x]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i + blockDim.x] : (i + blockDim.x)); - } - - i += grid_size; - } - if (i < n_elem) - { - if (in_mem[i] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i] : i); - } - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - if (aux_mem[tid + s] < aux_mem[tid]) - { - aux_mem[tid] = aux_mem[tid + s]; - aux_uword_mem[tid] = aux_uword_mem[tid + s]; - } - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,index_min_subgroup_reduce)(aux_mem, aux_uword_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - out_uword_mem[blockIdx.x] = aux_uword_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_colwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_colwise.cu deleted file mode 100644 index 262e1b1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_colwise.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,index_min_colwise)(UWORD* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[col * src_M_n_rows]); - eT1 best_val = colptr[0]; - UWORD best_index = 0; - for (UWORD i = 1; i < n_rows; ++i) - { - if (colptr[i] < best_val) - { - best_val = colptr[i]; - best_index = i; - } - } - - dest[col * dest_mem_incr] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_cube_col.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_cube_col.cu deleted file mode 100644 index 84eb56a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_cube_col.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,index_min_cube_col)(UWORD* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD slice = blockIdx.y * blockDim.y + threadIdx.y; - - if(row < n_rows && slice < n_slices) - { - eT1 best_val = src[row + slice * n_rows * n_cols]; - UWORD best_index = 0; - for (UWORD i = 1; i < n_cols; ++i) - { - if (src[(i * n_rows) + row + slice * n_rows * n_cols] < best_val) - { - best_val = src[(i * n_rows) + row + slice * n_rows * n_cols]; - best_index = i; - } - } - - dest[row + slice * n_rows] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_rowwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_rowwise.cu deleted file mode 100644 index b25c568..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_rowwise.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,index_min_rowwise)(UWORD* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT1 best_val = src[row]; - UWORD best_index = 0; - for (UWORD i = 1; i < n_cols; ++i) - { - if (src[(i * src_M_n_rows) + row] < best_val) - { - best_val = src[(i * src_M_n_rows) + row]; - best_index = i; - } - } - - dest[row * dest_mem_incr] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_small.cu deleted file mode 100644 index d1d0561..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/index_min_small.cu +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,index_min_small)(const eT1* in_mem, - const UWORD* in_uword_mem, - const UWORD use_uword_mem, - const UWORD n_elem, - eT1* out_mem, - UWORD* out_uword_mem, - const UWORD uword_aux_mem_start) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - UWORD* aux_uword_mem = (UWORD*) (aux_shared_mem + uword_aux_mem_start); - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = coot_type_max(TO_ET1(0)); - aux_uword_mem[tid] = SIZE_MAX; - - if (i < n_elem) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i] : i); - } - if (i + blockDim.x < n_elem) - { - if (in_mem[i + blockDim.x] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[i + blockDim.x]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i + blockDim.x] : (i + blockDim.x)); - } - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - if (in_mem[i] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i] : i); - } - - if (in_mem[i + blockDim.x] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[i + blockDim.x]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[i + blockDim.x] : (i + blockDim.x)); - } - - i += grid_size; - } - if (i < n_elem) - { - if (in_mem[i] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? 
in_uword_mem[i] : i); - } - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - if (aux_mem[tid + s] < aux_mem[tid]) - { - aux_mem[tid] = aux_mem[tid + s]; - aux_uword_mem[tid] = aux_uword_mem[tid + s]; - } - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - out_uword_mem[blockIdx.x] = aux_uword_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_philox_randn.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_philox_randn.cu deleted file mode 100644 index f27bba0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_philox_randn.cu +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This kernel is a placeholder---we don't use it with the CUDA backend. -// It does nothing. We use cuRand instead. - -__global__ -void -COOT_FN(PREFIX,inplace_philox_randn)(eT1* mem, - unsigned int* philox_state, - const UWORD n_elem, - const fp_eT1 mu, - const fp_eT1 sd) - { - // Do nothing! 
- } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_set_eye.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_set_eye.cu deleted file mode 100644 index 92d3c80..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_set_eye.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_set_eye)(eT1* out, - const UWORD n_rows, - const UWORD n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - if( (row < n_rows) && (col < n_cols) ) - { - const UWORD offset = row + col * n_rows; - out[offset] = (row == col) ? TO_ET1(1) : TO_ET1(0); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow32_randi.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow32_randi.cu deleted file mode 100644 index 5cee0a8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow32_randi.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This kernel is a placeholder---we don't use it with the CUDA backend. -// It does nothing. We use cuRand instead. - -__global__ -void -COOT_FN(PREFIX,inplace_xorwow32_randi)(eT1* mem, - uint* xorwow_state, - const UWORD n_elem, - const eT1 lo, - const uint_eT1 range, - const bool needs_modulo) - { - // Do nothing! - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow32_randu.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow32_randu.cu deleted file mode 100644 index 45c251f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow32_randu.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This kernel is a placeholder---we don't use it with the CUDA backend. -// It does nothing. We use cuRand instead. 
- -__global__ -void -COOT_FN(PREFIX,inplace_xorwow32_randu)(eT1* mem, - uint* xorwow_state, - const UWORD n_elem) - { - // Do nothing! - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow64_randi.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow64_randi.cu deleted file mode 100644 index 817ebff..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow64_randi.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This kernel is a placeholder---we don't use it with the CUDA backend. -// It does nothing. We use cuRand instead. - -__global__ -void -COOT_FN(PREFIX,inplace_xorwow64_randi)(eT1* mem, - ulong* xorwow_state, - const UWORD n_elem, - const eT1 lo, - const uint_eT1 range, - const bool needs_modulo) - { - // Do nothing! 
- } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow64_randu.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow64_randu.cu deleted file mode 100644 index d3b7318..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/inplace_xorwow64_randu.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This kernel is a placeholder---we don't use it with the CUDA backend. -// It does nothing. We use cuRand instead. - -__global__ -void -COOT_FN(PREFIX,inplace_xorwow64_randu)(eT1* mem, - ulong* xorwow_state, - const UWORD n_elem) - { - // Do nothing! - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/linspace.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/linspace.cu deleted file mode 100644 index 8d14282..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/linspace.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2021 Marcus Edel (http://www.kurg.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ -__global__ -void -COOT_FN(PREFIX,linspace)(eT1* out_mem, - const UWORD mem_incr, - const eT1 start, - const eT1 end, - const eT1 step, - const UWORD num) - { - UWORD idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num) - { - out_mem[idx * mem_incr] = TO_ET1(start + step * (TO_ET1(idx))); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/logspace.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/logspace.cu deleted file mode 100644 index c0314a6..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/logspace.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ -__global__ -void -COOT_FN(PREFIX,logspace)(eT1* out_mem, - const UWORD mem_incr, - const eT1 start, - const eT1 end, - const eT1 step, - const UWORD num) - { - UWORD idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num) - { - out_mem[idx * mem_incr] = TO_ET1(pow(TO_FP_ET1(10), TO_FP_ET1(start + step * (TO_ET1(idx))))); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/ltri_set_zero.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/ltri_set_zero.cu deleted file mode 100644 index a0e56d7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/ltri_set_zero.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,ltri_set_zero)(eT1* out, - const UWORD n_rows, - const UWORD n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD index = col * n_rows + row; - if ( (row < n_rows) && (col < n_cols) && (row > col) ) - { - out[index] = TO_ET1(0); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max.cu deleted file mode 100644 index c3190bd..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max.cu +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,max)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. 
- aux_mem[tid] = coot_type_min(TO_ET1(0)); - - if (i < n_elem) - { - aux_mem[tid] = in_mem[i]; - } - if (i + blockDim.x < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[i + blockDim.x]); - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[i]); - aux_mem[tid] = max(aux_mem[tid], in_mem[i + blockDim.x]); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[i]); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = max(aux_mem[tid], aux_mem[tid + s]); - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,max_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_abs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_abs.cu deleted file mode 100644 index ce87fbd..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_abs.cu +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,max_abs)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = coot_type_min(TO_ET1(0)); - - if (i < n_elem) - { - aux_mem[tid] = ET1_ABS(in_mem[i]); - } - if (i + blockDim.x < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], ET1_ABS(in_mem[i + blockDim.x])); - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], ET1_ABS(in_mem[i])); - aux_mem[tid] = max(aux_mem[tid], ET1_ABS(in_mem[i + blockDim.x])); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], ET1_ABS(in_mem[i])); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = max(aux_mem[tid], aux_mem[tid + s]); - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,max_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_abs_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_abs_small.cu deleted file mode 100644 index 9ce7022..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_abs_small.cu +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,max_abs_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = coot_type_min(TO_ET1(0)); - - if (i < n_elem) - { - aux_mem[tid] = ET1_ABS(in_mem[i]); - } - if (i + blockDim.x < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], ET1_ABS(in_mem[i + blockDim.x])); - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], ET1_ABS(in_mem[i])); - aux_mem[tid] = max(aux_mem[tid], ET1_ABS(in_mem[i + blockDim.x])); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], ET1_ABS(in_mem[i])); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = max(aux_mem[tid], aux_mem[tid + s]); - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_small.cu deleted file mode 100644 index 63b8f75..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/max_small.cu +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, 
Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,max_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = coot_type_min(TO_ET1(0)); - - if (i < n_elem) - { - aux_mem[tid] = in_mem[i]; - } - if (i + blockDim.x < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[i + blockDim.x]); - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[i]); - aux_mem[tid] = max(aux_mem[tid], in_mem[i + blockDim.x]); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[i]); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = max(aux_mem[tid], aux_mem[tid + s]); - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/min.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/min.cu deleted file mode 100644 index 63877c8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/min.cu +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2021 Ryan Curtin (http://www.ratml.org/) -// -// 
Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,min)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. 
- aux_mem[tid] = coot_type_max(TO_ET1(0)); - - if (i < n_elem) - { - aux_mem[tid] = in_mem[i]; - } - if (i + blockDim.x < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[i + blockDim.x]); - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[i]); - aux_mem[tid] = min(aux_mem[tid], in_mem[i + blockDim.x]); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[i]); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = min(aux_mem[tid], aux_mem[tid + s]); - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,min_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/min_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/min_small.cu deleted file mode 100644 index 4d44a67..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/min_small.cu +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright 2021 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,min_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = coot_type_max(TO_ET1(0)); - - if (i < n_elem) - { - aux_mem[tid] = in_mem[i]; - } - if (i + blockDim.x < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[i + blockDim.x]); - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[i]); - aux_mem[tid] = min(aux_mem[tid], in_mem[i + blockDim.x]); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[i]); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = min(aux_mem[tid], aux_mem[tid + s]); - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_colwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_colwise.cu deleted file mode 100644 index a6e3259..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_colwise.cu +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// multiply each column in `in` by the corresponding value in `A` -__global__ -void -COOT_FN(PREFIX,mul_colwise)(eT1* out, - const eT1* A, // expected to have length n_cols - const UWORD A_incr, - const eT1* in, - const eT1 alpha, // scalar to multiply - const UWORD n_rows, // size of `out` and `in` - const UWORD n_cols, - const UWORD in_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const UWORD in_offset = col * in_M_n_rows; - const UWORD out_offset = col * n_rows; - const eT1 val = alpha * A[col * A_incr]; - for (UWORD i = 0; i < n_rows; ++i) - { - out[i + out_offset] = val * in[i + in_offset]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_colwise_trans.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_colwise_trans.cu deleted file mode 100644 index 3448d85..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_colwise_trans.cu +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// multiply each column in `trans(in)` by the corresponding value in `A` -__global__ -void -COOT_FN(PREFIX,mul_colwise_trans)(eT1* out, - const eT1* A, // expected to have length n_cols - const UWORD A_incr, - const eT1* in, - const eT1 alpha, // scalar to multiply - const UWORD n_rows, // size of `out` - const UWORD n_cols, - const UWORD in_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1 val = alpha * A[col]; - for (UWORD i = 0; i < n_rows; ++i) - { - const UWORD in_offset = col + i * in_M_n_rows; - const UWORD out_offset = col * n_rows + i; - out[out_offset] = val * in[in_offset]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_rowwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_rowwise.cu deleted file mode 100644 index 7a85015..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_rowwise.cu +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// multiply each row in `in` by the corresponding value in `A` -__global__ -void -COOT_FN(PREFIX,mul_rowwise)(eT1* out, - const eT1* A, // expected to have length n_rows - const UWORD A_incr, - const eT1* in, - const eT1 alpha, // scalar to multiply - const UWORD n_rows, // size of `out` and `in` - const UWORD n_cols, - const UWORD in_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - const eT1 val = alpha * A[row * A_incr]; - for (UWORD i = 0; i < n_cols; ++i) - { - const UWORD out_offset = i * n_rows + row; - const UWORD in_offset = i * in_M_n_rows + row; - out[out_offset] = val * in[in_offset]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_rowwise_trans.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_rowwise_trans.cu deleted file mode 100644 index cc910ec..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/mul_rowwise_trans.cu +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// multiply each row in `trans(in)` by the corresponding value in `A` -__global__ -void -COOT_FN(PREFIX,mul_rowwise_trans)(eT1* out, - const eT1* A, // expected to have length n_rows - const UWORD A_incr, - const eT1* in, - const eT1 alpha, // scalar to multiply - const UWORD n_rows, // size of `out` - const UWORD n_cols, - const UWORD in_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - const eT1 val = alpha * A[row * A_incr]; - for (UWORD i = 0; i < n_cols; ++i) - { - const UWORD in_offset = i + row * in_M_n_rows; - const UWORD out_offset = i * n_rows + row; - out[out_offset] = val * in[in_offset]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/prod.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/prod.cu deleted file mode 100644 index 1e5ef8d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/prod.cu +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,prod)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 1; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - aux_mem[tid] *= in_mem[i] * in_mem[i + blockDim.x]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] *= in_mem[i]; - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] *= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,prod_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/prod_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/prod_small.cu deleted file mode 100644 index 8c52cdf..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/prod_small.cu +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,prod_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 1; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - aux_mem[tid] *= in_mem[i] * in_mem[i + blockDim.x]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] *= in_mem[i]; - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] *= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_asc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_asc.cu deleted file mode 100644 index e170f7e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_asc.cu +++ /dev/null @@ -1,281 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,radix_sort_asc)(eT1* A, - eT1* tmp_mem, - const UWORD n_elem) - { - UWORD* aux_mem = (UWORD*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - - const UWORD num_threads = blockDim.x; - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD local_counts[2]; - - eT1* unsorted_memptr = A; - eT1* sorted_memptr = tmp_mem; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = coot_is_signed(TO_ET1(0)) ? 8 * sizeof(eT1) - 1 : 8 * sizeof(eT1); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. - uint_eT1* memptr = reinterpret_cast(unsorted_memptr); - - local_counts[0] = 0; // holds the count of elements with bit value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> b]; - ++local_counts[(memptr[i + 1] & mask) >> b]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> b]; - } - - // Step 2: aggregate the counts for all threads. - aux_mem[tid ] = local_counts[0]; - aux_mem[tid + num_threads] = local_counts[1]; - __syncthreads(); - - // At the end of this, `tid` should be assigned two sections of memory to put its 0 and 1 points in. 
- // aux_mem[tid] should hold the first place to put a 0 point - // aux_mem[tid + num_threads] should hold the first place to put a 1 point - // More specifically: - // aux_mem[tid] := sum_{i = 0}^{tid - 1} aux_mem[i] - // aux_mem[tid + num_threads] := sum_{i = 0}^{tid + num_threads - 1} aux_mem[i] - // which means that this is just a prefix-sum operation on the full length of aux_mem. - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - __syncthreads(); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Step 3: move points into the correct place. - local_counts[0] = aux_mem[tid ]; // contains the first place we should put a 0 point - local_counts[1] = aux_mem[tid + num_threads]; // contains the first place we should put a 1 point - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b)]++; - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index] = val; - } - - // Now swap pointers. 
- eT1* tmp = unsorted_memptr; - unsorted_memptr = sorted_memptr; - sorted_memptr = tmp; - - __syncthreads(); - } - - // If the type is integral, we're now done---we don't have to handle a sign bit differently. - if (!coot_is_signed(TO_ET1(0))) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. - uint_eT1* memptr = reinterpret_cast(unsorted_memptr); - local_counts[0] = 0; - local_counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> last_bit]; - ++local_counts[(memptr[i + 1] & mask) >> last_bit]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> last_bit]; - } - - // local_counts[0] now holds the number of positive points; local_counts[1] holds the number of negative points - // swap these and perform a prefix sum, as with the rest of the bits - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - __syncthreads(); - - // Up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - __syncthreads(); - - // Down-sweep to build prefix sum. 
- for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Step 3: move points into the correct place. - // This is different for integral and floating point types. - if (coot_is_fp(TO_ET1(0))) - { - // Floating point implementation: - // For negative values, we have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. - local_counts[0] = aux_mem[tid + num_threads]; // contains the first place we should put a 0 point (we will move upwards) - local_counts[1] = aux_mem[num_threads] - aux_mem[tid]; // contains the first place we should put a 1 point (we will move downwards) - local_counts[1] = (local_counts[1] == 0) ? 0 : local_counts[1] - 1; // avoid underflow - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]; - const int offset1 = (bit_val1 == 1) ? -1 : 1; - local_counts[bit_val1] += offset1; // decrements for negative values, increments for positive values - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]; - const int offset2 = (bit_val2 == 1) ? 
-1 : 1; - local_counts[bit_val2] += offset2; // decrements for negative values, increments for positive values - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]; - const int offset = (bit_val == 1) ? -1 : 1; - local_counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_memptr[out_index] = val; - } - } - else - { - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the negative values ahead of the positive values. - local_counts[0] = aux_mem[tid + num_threads]; - local_counts[1] = aux_mem[tid]; - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]++; - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]++; - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]++; - sorted_memptr[out_index] = val; - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. 
- } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_colwise_asc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_colwise_asc.cu deleted file mode 100644 index b42eb8d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_colwise_asc.cu +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,radix_sort_colwise_asc)(eT1* A, - eT1* tmp_mem, - const UWORD A_n_rows, - const UWORD A_n_cols, - const UWORD A_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < A_n_cols) - { - eT1* unsorted_colptr = &A[col * A_M_n_rows]; - eT1* sorted_colptr = &tmp_mem[col * A_n_rows]; - - UWORD counts[2]; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = coot_is_signed(TO_ET1(0)) ? (8 * sizeof(eT1) - 1) : (8 * sizeof(eT1)); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Since we are sorting bitwise, we should treat the data as unsigned integers to make bitwise operations easy. 
- uint_eT1* colptr = reinterpret_cast(unsorted_colptr); - - counts[0] = 0; // holds the count of points with bit value 0 - counts[1] = 0; // holds the count of points with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - for (UWORD i = 0; i < A_n_rows; ++i) - { - ++counts[((colptr[i] & mask) >> b)]; - } - - counts[1] = counts[0]; // now holds the offset to put the next value at - counts[0] = 0; - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD out_index = counts[((colptr[i] & mask) >> b)]++; - sorted_colptr[out_index] = val; - } - - // swap pointers (unsorted is now sorted) - eT1* tmp = unsorted_colptr; - unsorted_colptr = sorted_colptr; - sorted_colptr = tmp; - } - - // If the type is unsigned, we're now done---we don't have to handle a sign bit differently. - if (!coot_is_signed(TO_ET1(0))) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. - uint_eT1* colptr = reinterpret_cast(unsorted_colptr); - counts[0] = 0; - counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - for (UWORD i = 0; i < A_n_rows; ++i) - { - ++counts[((colptr[i] & mask) >> last_bit)]; - } - // counts[0] now holds the number of positive points; counts[1] holds the number of negative points - - // This is different for integral and floating point types. - if (coot_is_fp(TO_ET1(0))) - { - // Floating point implementation: - // For negative values, we have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. 
- counts[0] = counts[1]; // now holds the offset to put the next positive value at - counts[1] = counts[0] - 1; // now holds the offset to put the next negative value at (we move backwards) - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD bit_val = ((colptr[i] & mask) >> last_bit); - const UWORD out_index = counts[bit_val]; - const int offset = (bit_val == 1) ? -1 : 1; - counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_colptr[out_index] = val; - } - } - else - { - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the negative values ahead of the positive values. - counts[0] = counts[1]; // now holds the offset to put the next positive value at - counts[1] = 0; // now holds the offset to put the next negative value at - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD bit_val = ((colptr[i] & mask) >> last_bit); - const UWORD out_index = counts[bit_val]++; - sorted_colptr[out_index] = val; - } - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_colwise_desc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_colwise_desc.cu deleted file mode 100644 index 6bd3959..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_colwise_desc.cu +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,radix_sort_colwise_desc)(eT1* A, - eT1* tmp_mem, - const UWORD A_n_rows, - const UWORD A_n_cols, - const UWORD A_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < A_n_cols) - { - eT1* unsorted_colptr = &A[col * A_M_n_rows]; - eT1* sorted_colptr = &tmp_mem[col * A_n_rows]; - - UWORD counts[2]; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = coot_is_signed(TO_ET1(0)) ? (8 * sizeof(eT1) - 1) : (8 * sizeof(eT1)); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Since we are sorting bitwise, we should treat the data as unsigned integers to make bitwise operations easy. - uint_eT1* colptr = reinterpret_cast(unsorted_colptr); - - counts[0] = 0; // holds the count of points with bit value 0 - counts[1] = 0; // holds the count of points with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - for (UWORD i = 0; i < A_n_rows; ++i) - { - ++counts[((colptr[i] & mask) >> b)]; - } - - // Since we are sorting in descending order, 1-valued points come first. 
- counts[0] = counts[1]; - counts[1] = 0; - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD out_index = counts[((colptr[i] & mask) >> b)]++; - sorted_colptr[out_index] = val; - } - - // swap pointers (unsorted is now sorted) - eT1* tmp = unsorted_colptr; - unsorted_colptr = sorted_colptr; - sorted_colptr = tmp; - } - - // If the type is unsigned, we're now done---we don't have to handle a sign bit differently. - if (!coot_is_signed(TO_ET1(0))) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. - uint_eT1* colptr = reinterpret_cast(unsorted_colptr); - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - // This is different for integral and floating point types. - if (coot_is_fp(TO_ET1(0))) - { - // Floating point implementation: - // For negative values, we have things sorted in reverse order, so we need to reverse that in our final swap pass. - counts[0] = 0; - counts[1] = A_n_rows - 1; // points to the last element - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD bit_val = ((colptr[i] & mask) >> last_bit); - const UWORD out_index = counts[bit_val]; - const int offset = (bit_val == 1) ? 
-1 : 1; - counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_colptr[out_index] = val; - } - } - else - { - counts[0] = 0; - counts[1] = 0; - - for (UWORD i = 0; i < A_n_rows; ++i) - { - ++counts[((colptr[i] & mask) >> last_bit)]; - } - // counts[0] now holds the number of positive points; counts[1] holds the number of negative points - - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the positive values ahead of the negative values. - counts[1] = counts[0]; // now holds the offset to put the next negative value at - counts[0] = 0; // now holds the offset to put the next positive value at - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD bit_val = ((colptr[i] & mask) >> last_bit); - const UWORD out_index = counts[bit_val]++; - sorted_colptr[out_index] = val; - } - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_desc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_desc.cu deleted file mode 100644 index bb1ec97..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_desc.cu +++ /dev/null @@ -1,281 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,radix_sort_desc)(eT1* A, - eT1* tmp_mem, - const UWORD n_elem) - { - UWORD* aux_mem = (UWORD*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - - const UWORD num_threads = blockDim.x; - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD local_counts[2]; - - eT1* unsorted_memptr = A; - eT1* sorted_memptr = tmp_mem; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = coot_is_signed(TO_ET1(0)) ? 8 * sizeof(eT1) - 1 : 8 * sizeof(eT1); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. - uint_eT1* memptr = reinterpret_cast(unsorted_memptr); - - local_counts[0] = 0; // holds the count of elements with bit value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> b]; - ++local_counts[(memptr[i + 1] & mask) >> b]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> b]; - } - - // Step 2: aggregate the counts for all threads. - // Since we want the largest values to come first, the first entry will be the count of 1s. - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - __syncthreads(); - - // At the end of this, `tid` should be assigned two sections of memory to put its 0 and 1 points in. 
- // aux_mem[tid] should hold the first place to put a 1 point - // aux_mem[tid + num_threads] should hold the first place to put a 0 point - // More specifically: - // aux_mem[tid] := sum_{i = 0}^{tid - 1} aux_mem[i] - // aux_mem[tid + num_threads] := sum_{i = 0}^{tid + num_threads - 1} aux_mem[i] - // which means that this is just a prefix-sum operation on the full length of aux_mem. - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - __syncthreads(); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Step 3: move points into the correct place. - local_counts[0] = aux_mem[tid + num_threads]; // contains the first place we should put a 0 point - local_counts[1] = aux_mem[tid ]; // contains the first place we should put a 1 point - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b)]++; - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index] = val; - } - - // Now swap pointers. 
- eT1* tmp = unsorted_memptr; - unsorted_memptr = sorted_memptr; - sorted_memptr = tmp; - - __syncthreads(); - } - - // If the type is integral, we're now done---we don't have to handle a sign bit differently. - if (!coot_is_signed(TO_ET1(0))) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 0-bit values before the 1-bit values (since we are sorting in descending order). - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. - uint_eT1* memptr = reinterpret_cast(unsorted_memptr); - local_counts[0] = 0; - local_counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> last_bit]; - ++local_counts[(memptr[i + 1] & mask) >> last_bit]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> last_bit]; - } - - // local_counts[0] now holds the number of positive points; local_counts[1] holds the number of negative points - // perform a prefix sum, as with the rest of the bits; positive points will come first - aux_mem[tid ] = local_counts[0]; - aux_mem[tid + num_threads] = local_counts[1]; - __syncthreads(); - - // Up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - __syncthreads(); - - // Down-sweep to build prefix sum. 
- for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Step 3: move points into the correct place. - // This is different for integral and floating point types. - if (coot_is_fp(TO_ET1(0))) - { - // Floating point implementation: - // For negative values, we already have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. - local_counts[0] = aux_mem[tid]; // contains the first place we should put a 0 point (we will move upwards) - local_counts[1] = n_elem - 1 - (aux_mem[num_threads + tid] - aux_mem[num_threads]); // contains the first place we should put a 1 point (we will move downwards) - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]; - const int offset1 = (bit_val1 == 1) ? -1 : 1; - local_counts[bit_val1] += offset1; // decrements for negative values, increments for positive values - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]; - const int offset2 = (bit_val2 == 1) ? -1 : 1; - local_counts[bit_val2] += offset2; // decrements for negative values, increments for positive values - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]; - const int offset = (bit_val == 1) ? 
-1 : 1; - local_counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_memptr[out_index] = val; - } - } - else - { - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the positive values ahead of the negative values. - local_counts[0] = aux_mem[tid]; - local_counts[1] = aux_mem[tid + num_threads]; - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]++; - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]++; - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]++; - sorted_memptr[out_index] = val; - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_asc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_asc.cu deleted file mode 100644 index e93347f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_asc.cu +++ /dev/null @@ -1,321 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,radix_sort_index_asc)(eT1* A, - UWORD* A_index, - eT1* tmp_mem, - UWORD* tmp_mem_index, - const UWORD n_elem) - { - UWORD* aux_mem = (UWORD*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - - const UWORD num_threads = blockDim.x; - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - // Fill A_index with [0, 1, ..., n_elem - 1]. - UWORD i = start_elem; - while (i + 1 < end_elem) - { - A_index[i] = i; - A_index[i + 1] = i + 1; - i += 2; - } - if (i < end_elem) - { - A_index[i] = i; - } - - __syncthreads(); - - UWORD local_counts[2]; - - eT1* unsorted_memptr = A; - UWORD* unsorted_index_memptr = A_index; - eT1* sorted_memptr = tmp_mem; - UWORD* sorted_index_memptr = tmp_mem_index; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = coot_is_signed(TO_ET1(0)) ? 8 * sizeof(eT1) - 1 : 8 * sizeof(eT1); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. 
- uint_eT1* memptr = reinterpret_cast(unsorted_memptr); - - local_counts[0] = 0; // holds the count of elements with bit value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> b]; - ++local_counts[(memptr[i + 1] & mask) >> b]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> b]; - } - - // Step 2: aggregate the counts for all threads. - aux_mem[tid ] = local_counts[0]; - aux_mem[tid + num_threads] = local_counts[1]; - __syncthreads(); - - // At the end of this, `tid` should be assigned two sections of memory to put its 0 and 1 points in. - // aux_mem[tid] should hold the first place to put a 0 point - // aux_mem[tid + num_threads] should hold the first place to put a 1 point - // More specifically: - // aux_mem[tid] := sum_{i = 0}^{tid - 1} aux_mem[i] - // aux_mem[tid + num_threads] := sum_{i = 0}^{tid + num_threads - 1} aux_mem[i] - // which means that this is just a prefix-sum operation on the full length of aux_mem. - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - __syncthreads(); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Step 3: move points into the correct place. 
- local_counts[0] = aux_mem[tid ]; // contains the first place we should put a 0 point - local_counts[1] = aux_mem[tid + num_threads]; // contains the first place we should put a 1 point - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b)]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - - // Now swap pointers. - eT1* tmp = unsorted_memptr; - unsorted_memptr = sorted_memptr; - sorted_memptr = tmp; - UWORD* tmp_index = unsorted_index_memptr; - unsorted_index_memptr = sorted_index_memptr; - sorted_index_memptr = tmp_index; - - __syncthreads(); - } - - // If the type is integral, we're now done---we don't have to handle a sign bit differently. - if (!coot_is_signed(TO_ET1(0))) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. 
- uint_eT1* memptr = reinterpret_cast(unsorted_memptr); - local_counts[0] = 0; - local_counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> last_bit]; - ++local_counts[(memptr[i + 1] & mask) >> last_bit]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> last_bit]; - } - - // local_counts[0] now holds the number of positive points; local_counts[1] holds the number of negative points - // swap these and perform a prefix sum, as with the rest of the bits - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - __syncthreads(); - - // Up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - __syncthreads(); - - // Down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Step 3: move points into the correct place. - // This is different for integral and floating point types. - if (coot_is_fp(TO_ET1(0))) - { - // Floating point implementation: - // For negative values, we have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. 
- local_counts[0] = aux_mem[tid + num_threads]; // contains the first place we should put a 0 point (we will move upwards) - local_counts[1] = aux_mem[num_threads] - aux_mem[tid]; // contains the first place we should put a 1 point (we will move downwards) - local_counts[1] = (local_counts[1] == 0) ? 0 : local_counts[1] - 1; // avoid underflow - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]; - const int offset1 = (bit_val1 == 1) ? -1 : 1; - local_counts[bit_val1] += offset1; // decrements for negative values, increments for positive values - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]; - const int offset2 = (bit_val2 == 1) ? -1 : 1; - local_counts[bit_val2] += offset2; // decrements for negative values, increments for positive values - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]; - const int offset = (bit_val == 1) ? -1 : 1; - local_counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - } - else - { - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the negative values ahead of the positive values. 
- local_counts[0] = aux_mem[tid + num_threads]; - local_counts[1] = aux_mem[tid]; - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_desc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_desc.cu deleted file mode 100644 index 90d475f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_desc.cu +++ /dev/null @@ -1,321 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,radix_sort_index_desc)(eT1* A, - UWORD* A_index, - eT1* tmp_mem, - UWORD* tmp_mem_index, - const UWORD n_elem) - { - UWORD* aux_mem = (UWORD*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - - const UWORD num_threads = blockDim.x; - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - // Fill A_index with [0, 1, ..., n_elem - 1]. - UWORD i = start_elem; - while (i + 1 < end_elem) - { - A_index[i] = i; - A_index[i + 1] = i + 1; - i += 2; - } - if (i < end_elem) - { - A_index[i] = i; - } - - __syncthreads(); - - UWORD local_counts[2]; - - eT1* unsorted_memptr = A; - UWORD* unsorted_index_memptr = A_index; - eT1* sorted_memptr = tmp_mem; - UWORD* sorted_index_memptr = tmp_mem_index; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = coot_is_signed(TO_ET1(0)) ? 8 * sizeof(eT1) - 1 : 8 * sizeof(eT1); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. 
- uint_eT1* memptr = reinterpret_cast(unsorted_memptr); - - local_counts[0] = 0; // holds the count of elements with bit value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> b]; - ++local_counts[(memptr[i + 1] & mask) >> b]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> b]; - } - - // Step 2: aggregate the counts for all threads. - // Since we want the largest values to come first, the first entry will be the count of 1s. - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - __syncthreads(); - - // At the end of this, `tid` should be assigned two sections of memory to put its 0 and 1 points in. - // aux_mem[tid] should hold the first place to put a 1 point - // aux_mem[tid + num_threads] should hold the first place to put a 0 point - // More specifically: - // aux_mem[tid] := sum_{i = 0}^{tid - 1} aux_mem[i] - // aux_mem[tid + num_threads] := sum_{i = 0}^{tid + num_threads - 1} aux_mem[i] - // which means that this is just a prefix-sum operation on the full length of aux_mem. - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - __syncthreads(); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Step 3: move points into the correct place. 
- local_counts[0] = aux_mem[tid + num_threads]; // contains the first place we should put a 0 point - local_counts[1] = aux_mem[tid ]; // contains the first place we should put a 1 point - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b)]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - - // Now swap pointers. - eT1* tmp = unsorted_memptr; - UWORD* tmp_index = unsorted_index_memptr; - unsorted_memptr = sorted_memptr; - unsorted_index_memptr = sorted_index_memptr; - sorted_memptr = tmp; - sorted_index_memptr = tmp_index; - - __syncthreads(); - } - - // If the type is integral, we're now done---we don't have to handle a sign bit differently. - if (!coot_is_signed(TO_ET1(0))) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 0-bit values before the 1-bit values (since we are sorting in descending order). - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. 
- uint_eT1* memptr = reinterpret_cast(unsorted_memptr); - local_counts[0] = 0; - local_counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> last_bit]; - ++local_counts[(memptr[i + 1] & mask) >> last_bit]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> last_bit]; - } - - // local_counts[0] now holds the number of positive points; local_counts[1] holds the number of negative points - // perform a prefix sum, as with the rest of the bits; positive points will come first - aux_mem[tid ] = local_counts[0]; - aux_mem[tid + num_threads] = local_counts[1]; - __syncthreads(); - - // Up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - __syncthreads(); - - // Down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Step 3: move points into the correct place. - // This is different for integral and floating point types. - if (coot_is_fp(TO_ET1(0))) - { - // Floating point implementation: - // For negative values, we already have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. 
- local_counts[0] = aux_mem[tid]; // contains the first place we should put a 0 point (we will move upwards) - local_counts[1] = n_elem - 1 - (aux_mem[num_threads + tid] - aux_mem[num_threads]); // contains the first place we should put a 1 point (we will move downwards) - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const eT1 index1 = unsorted_index_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]; - const int offset1 = (bit_val1 == 1) ? -1 : 1; - local_counts[bit_val1] += offset1; // decrements for negative values, increments for positive values - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const eT1 index2 = unsorted_index_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]; - const int offset2 = (bit_val2 == 1) ? -1 : 1; - local_counts[bit_val2] += offset2; // decrements for negative values, increments for positive values - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const eT1 index = unsorted_index_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]; - const int offset = (bit_val == 1) ? -1 : 1; - local_counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - } - else - { - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the positive values ahead of the negative values. 
- local_counts[0] = aux_mem[tid]; - local_counts[1] = aux_mem[tid + num_threads]; - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const eT1 index1 = unsorted_index_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const eT1 index2 = unsorted_index_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_multi_wg_shuffle.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_multi_wg_shuffle.cu deleted file mode 100644 index 147012f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_index_multi_wg_shuffle.cu +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,radix_sort_index_multi_wg_shuffle)(eT1* A, - UWORD* A_index, - eT1* out, - UWORD* out_index, - UWORD* counts, - const UWORD n_elem, - const UWORD sort_type, - const UWORD start_bit) - { - // This kernel is a placeholder and is not used by the CUDA backend. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_multi_wg_bit_count.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_multi_wg_bit_count.cu deleted file mode 100644 index 8602e1d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_multi_wg_bit_count.cu +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,radix_sort_multi_wg_bit_count)(eT1* A, - UWORD* counts, - const UWORD n_elem, - const UWORD sort_type, - const UWORD start_bit) - { - // This kernel is a placeholder and is not used by the CUDA backend. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_multi_wg_shuffle.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_multi_wg_shuffle.cu deleted file mode 100644 index 16de88b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_multi_wg_shuffle.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,radix_sort_multi_wg_shuffle)(eT1* A, - eT1* out, - UWORD* counts, - const UWORD n_elem, - const UWORD sort_type, - const UWORD start_bit) - { - // This kernel is a placeholder and is not used by the CUDA backend. 
- } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_rowwise_asc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_rowwise_asc.cu deleted file mode 100644 index 99995a3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_rowwise_asc.cu +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,radix_sort_rowwise_asc)(eT1* A, - eT1* tmp_mem, - const UWORD A_n_rows, - const UWORD A_n_cols, - const UWORD A_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < A_n_rows) - { - eT1* unsorted_rowptr = &A[row]; - eT1* sorted_rowptr = &tmp_mem[row]; - - UWORD unsorted_n_rows = A_M_n_rows; - UWORD sorted_n_rows = A_n_rows; - - UWORD counts[2]; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = coot_is_signed(TO_ET1(0)) ? (8 * sizeof(eT1) - 1) : (8 * sizeof(eT1)); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Since we are sorting bitwise, we should treat the data as unsigned integers to make bitwise operations easy. 
- uint_eT1* rowptr = reinterpret_cast(unsorted_rowptr); - - counts[0] = 0; // holds the count of points with bit value 0 - counts[1] = 0; // holds the count of points with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - for (UWORD i = 0; i < A_n_cols; ++i) - { - ++counts[(rowptr[i * unsorted_n_rows] & mask) >> b]; - } - - counts[1] = counts[0]; // now holds the offset to put the next value at - counts[0] = 0; - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD out_index = (counts[((rowptr[in_index] & mask) >> b)]++) * sorted_n_rows; - sorted_rowptr[out_index] = val; - } - - // swap pointers (unsorted is now sorted) - eT1* tmp = unsorted_rowptr; - unsorted_rowptr = sorted_rowptr; - sorted_rowptr = tmp; - - UWORD tmp2 = unsorted_n_rows; - unsorted_n_rows = sorted_n_rows; - sorted_n_rows = tmp2; - } - - // If the type is unsigned, we're now done---we don't have to handle a sign bit differently. - if (!coot_is_signed(TO_ET1(0))) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. 
- uint_eT1* rowptr = reinterpret_cast(unsorted_rowptr); - counts[0] = 0; - counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - for (UWORD i = 0; i < A_n_cols; ++i) - { - ++counts[(rowptr[i * unsorted_n_rows] & mask) >> last_bit]; - } - // counts[0] now holds the number of positive points; counts[1] holds the number of negative points - - if (coot_is_fp(TO_ET1(0))) - { - counts[0] = counts[1]; // now holds the offset to put the next positive value at - counts[1] = counts[0] - 1; // now holds the offset to put the next negative value at (we move backwards) - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD bit_val = ((rowptr[in_index] & mask) >> last_bit); - const UWORD out_index = counts[bit_val] * sorted_n_rows; - const int offset = (bit_val == 1) ? -1 : 1; - counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_rowptr[out_index] = val; - } - } - else - { - counts[0] = counts[1]; // now holds the offset to put the next positive value at - counts[1] = 0; // now holds the offset to put the next negative value at - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD bit_val = ((rowptr[in_index] & mask) >> last_bit); - const UWORD out_index = (counts[bit_val]++) * sorted_n_rows; - sorted_rowptr[out_index] = val; - } - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. 
- } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_rowwise_desc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_rowwise_desc.cu deleted file mode 100644 index 077738a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/radix_sort_rowwise_desc.cu +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,radix_sort_rowwise_desc)(eT1* A, - eT1* tmp_mem, - const UWORD A_n_rows, - const UWORD A_n_cols, - const UWORD A_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < A_n_rows) - { - eT1* unsorted_rowptr = &A[row]; - eT1* sorted_rowptr = &tmp_mem[row]; - - UWORD unsorted_n_rows = A_M_n_rows; - UWORD sorted_n_rows = A_n_rows; - - UWORD counts[2]; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = coot_is_signed(TO_ET1(0)) ? (8 * sizeof(eT1) - 1) : (8 * sizeof(eT1)); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Since we are sorting bitwise, we should treat the data as unsigned integers to make bitwise operations easy. 
- uint_eT1* rowptr = reinterpret_cast(unsorted_rowptr); - - counts[0] = 0; // holds the count of points with bit value 0 - counts[1] = 0; // holds the count of points with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - for (UWORD i = 0; i < A_n_cols; ++i) - { - ++counts[(rowptr[i * unsorted_n_rows] & mask) >> b]; - } - - // Since we are sorting in descending order, 1-valued points come first. - counts[0] = counts[1]; // now holds the offset to put the next value at - counts[1] = 0; - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD out_index = (counts[((rowptr[in_index] & mask) >> b)]++) * sorted_n_rows; - sorted_rowptr[out_index] = val; - } - - // swap pointers (unsorted is now sorted) - eT1* tmp = unsorted_rowptr; - unsorted_rowptr = sorted_rowptr; - sorted_rowptr = tmp; - - UWORD tmp2 = unsorted_n_rows; - unsorted_n_rows = sorted_n_rows; - sorted_n_rows = tmp2; - } - - // If the type is unsigned, we're now done---we don't have to handle a sign bit differently. - if (!coot_is_signed(TO_ET1(0))) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. 
- uint_eT1* rowptr = reinterpret_cast(unsorted_rowptr); - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - if (coot_is_fp(TO_ET1(0))) - { - counts[0] = 0; // now holds the offset to put the next positive value at - counts[1] = A_n_cols - 1; // now holds the offset to put the next negative value at (we move backwards) - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD bit_val = ((rowptr[in_index] & mask) >> last_bit); - const UWORD out_index = counts[bit_val] * sorted_n_rows; - const int offset = (bit_val == 1) ? -1 : 1; - counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_rowptr[out_index] = val; - } - } - else - { - counts[0] = 0; - counts[1] = 0; - - for (UWORD i = 0; i < A_n_cols; ++i) - { - ++counts[(rowptr[i * unsorted_n_rows] & mask) >> last_bit]; - } - // counts[0] now holds the number of positive points; counts[1] holds the number of negative points - - counts[1] = counts[0]; // now holds the offset to put the next negative value at - counts[0] = 0; // now holds the offset to put the next positive value at - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD bit_val = ((rowptr[in_index] & mask) >> last_bit); - const UWORD out_index = (counts[bit_val]++) * sorted_n_rows; - sorted_rowptr[out_index] = val; - } - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. 
- } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/regspace_desc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/regspace_desc.cu deleted file mode 100644 index 39956f9..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/regspace_desc.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ -__global__ -void -COOT_FN(PREFIX,regspace_desc)(eT1* out_mem, - const UWORD mem_incr, - const eT1 start, - const eT1 end, - const eT1 delta, - const UWORD num) - { - UWORD idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num) - { - out_mem[idx * mem_incr] = start - delta * TO_ET1(idx); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/reorder_cols.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/reorder_cols.cu deleted file mode 100644 index 83ac2a8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/reorder_cols.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,reorder_cols)(eT1* out_mem, - const eT1* in_mem, - const UWORD n_rows, - const UWORD* ordering, - const UWORD out_n_cols) - { - const UWORD out_col = blockIdx.x * blockDim.x + threadIdx.x; - if (out_col < out_n_cols) - { - const UWORD in_col = ordering[out_col]; - - eT1* out_colptr = out_mem + (out_col * n_rows); - const eT1* in_colptr = in_mem + (in_col * n_rows); - - for (UWORD i = 0; i < n_rows; ++i) - { - out_colptr[i] = in_colptr[i]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/rotate_180.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/rotate_180.cu deleted file mode 100644 index e1eca97..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/rotate_180.cu +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rotate_180)(eT1* out, - const eT1* in, - const UWORD n_rows, - const UWORD n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if(row < n_rows && col < n_cols) - { - const UWORD in_index = col * n_rows + row; - // out(i, j) = in(n_rows - i - 1, n_cols - j - 1) - // or - // out(n_rows - i - 1, n_cols - j - 1) = in(i, j) - const UWORD out_row = n_rows - row - 1; - const UWORD out_col = n_cols - col - 1; - const UWORD out_index = out_col * n_rows + out_row; - - out[out_index] = in[in_index]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_add_offset.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_add_offset.cu deleted file mode 100644 index b3883bb..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_add_offset.cu +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// This kernel adds block-specific offsets to blocks of local memory. -// Specifically, block i, which has t threads, adds offsets[i] to the range -// mem[i * (2 * t)] to mem[(i + 1) * (2 * t) - 1] (inclusive). 
-__global__ -void -COOT_FN(PREFIX,shifted_prefix_sum_add_offset)(eT1* mem, - const eT1* offsets, - const UWORD n_elem) - { - const UWORD local_tid = threadIdx.x; - const UWORD local_size = blockDim.x; // will be the same across all workgroups (by calling convention), and must be a power of 2 - const UWORD group_id = blockIdx.x; - - // This workgroup is responsible for mem[group_id * (2 * local_size)] to mem[(group_id + 1) * (2 * local_size) - 1]. - const UWORD group_offset = group_id * (2 * local_size); - const UWORD local_offset = 2 * local_tid; - const UWORD mem_offset = group_offset + local_offset; - - const eT1 offset = offsets[group_id]; - - const eT1 in_val1 = (mem_offset < n_elem) ? mem[mem_offset ] : TO_ET1(0); - const eT1 in_val2 = (mem_offset + 1 < n_elem) ? mem[mem_offset + 1] : TO_ET1(0); - - const eT1 out_val1 = in_val1 + offset; - const eT1 out_val2 = in_val2 + offset; - - // Copy results back to memory. - if (mem_offset + 1 < n_elem) - { - mem[mem_offset ] = out_val1; - mem[mem_offset + 1] = out_val2; - } - else if (mem_offset < n_elem) - { - mem[mem_offset ] = out_val1; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_small.cu deleted file mode 100644 index f3a090f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_small.cu +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// This kernel performs shifted prefix-sum on `mem` assuming that (2 * blockDim.x) <= n_elem. -// It's okay if n_elem is not a power of 2. -__global__ -void -COOT_FN(PREFIX,shifted_prefix_sum_small)(eT1* mem, - const UWORD n_elem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD local_tid = threadIdx.x; - const UWORD local_size = blockDim.x; // will be the same across all workgroups (by calling convention), and must be a power of 2 - const UWORD group_id = blockIdx.x; - - // Copy relevant memory to auxiliary memory. - const UWORD group_offset = group_id * (2 * local_size); - const UWORD mem_offset = group_offset + 2 * local_tid; - - aux_mem[mem_offset ] = (mem_offset < n_elem) ? mem[mem_offset ] : TO_ET1(0); - aux_mem[mem_offset + 1] = (mem_offset + 1 < n_elem) ? mem[mem_offset + 1] : TO_ET1(0); - - UWORD offset = 1; - for (UWORD s = local_size; s > 0; s >>= 1) - { - __syncthreads(); - if (local_tid < s) - { - const UWORD ai = group_offset + offset * (2 * local_tid + 1) - 1; - const UWORD bi = group_offset + offset * (2 * local_tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - } - - // Prepare for down-sweep by setting the last element to 0. - if (local_tid == 0) - { - aux_mem[2 * local_size - 1] = 0; - } - __syncthreads(); - - for (UWORD s = 1; s <= local_size; s *= 2) - { - offset >>= 1; - if (local_tid < s) - { - const UWORD ai = group_offset + offset * (2 * local_tid + 1) - 1; - const UWORD bi = group_offset + offset * (2 * local_tid + 2) - 1; - eT1 tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Copy results back to memory. 
- if (mem_offset + 1 < n_elem) - { - mem[mem_offset ] = aux_mem[mem_offset ]; - mem[mem_offset + 1] = aux_mem[mem_offset + 1]; - } - else if (mem_offset < n_elem) - { - mem[mem_offset ] = aux_mem[mem_offset ]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_subgroups.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_subgroups.cu deleted file mode 100644 index 1a7fed4..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shifted_prefix_sum_subgroups.cu +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// This kernel performs the shifted prefix-sum on each individual block. -// This is the same as just running a regular prefix-sum kernel, except that -// `out_mem[i]` will store the total sum of elements in block `i`. -// After running this, to finish prefix-sum on the entire memory, offsets for -// each workgroup need to be added. 
-__global__ -void -COOT_FN(PREFIX,shifted_prefix_sum_subgroups)(eT1* mem, - eT1* out_mem, - const UWORD n_elem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD local_tid = threadIdx.x; - const UWORD local_size = blockDim.x; // will be the same across all workgroups (by calling convention), and must be a power of 2 - const UWORD group_id = blockIdx.x; - - // Copy relevant memory to auxiliary memory. - // This workgroup is responsible for mem[group_id * (2 * local_size)] to mem[(group_id + 1) * (2 * local_size) - 1]. - const UWORD group_offset = group_id * (2 * local_size); - const UWORD local_offset = 2 * local_tid; - const UWORD mem_offset = group_offset + local_offset; - - aux_mem[local_offset ] = (mem_offset < n_elem) ? mem[mem_offset ] : TO_ET1(0); - aux_mem[local_offset + 1] = (mem_offset + 1 < n_elem) ? mem[mem_offset + 1] : TO_ET1(0); - - UWORD offset = 1; - for (UWORD s = local_size; s > 0; s >>= 1) - { - if (local_tid < s) - { - const UWORD ai = offset * (local_offset + 1) - 1; - const UWORD bi = offset * (local_offset + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (mem_offset + 1 < n_elem) - { - mem[mem_offset ] = aux_mem[local_offset ]; - mem[mem_offset + 1] = aux_mem[local_offset + 1]; - } - else if (mem_offset < n_elem) - { - mem[mem_offset ] = aux_mem[local_offset ]; - } - - if (local_tid == 0) - { - // Write the sum of the subarray to the output memory. - out_mem[group_id] = aux_mem[2 * local_size - 1]; - // Prepare for the downsweep. - aux_mem[2 * local_size - 1] = 0; - } - __syncthreads(); - - offset = local_size; - for (UWORD s = 1; s <= local_size; s *= 2) - { - if (local_tid < s) - { - const UWORD ai = offset * (local_offset + 1) - 1; - const UWORD bi = offset * (local_offset + 2) - 1; - eT1 tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - offset >>= 1; - __syncthreads(); - } - - // Copy results back to memory. 
- // The results here are the prefix-summed results for each individual - // workgroup. - if (mem_offset + 1 < n_elem) - { - mem[mem_offset ] = aux_mem[local_offset ]; - mem[mem_offset + 1] = aux_mem[local_offset + 1]; - } - else if (mem_offset < n_elem) - { - mem[mem_offset ] = aux_mem[local_offset ]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shuffle.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shuffle.cu deleted file mode 100644 index f116a93..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shuffle.cu +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright 2021 Marcus Edel (http://www.kurg.org/) -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,shuffle)(eT1* out, - const UWORD out_incr, /* how many eT1s to advance to get to the start of the next element to shuffle */ - const UWORD out_elem_stride, /* how many eT1s between each eT1 in each element */ - const eT1* in, - const UWORD in_incr, - const UWORD in_elem_stride, - const UWORD n_elem, - const UWORD elems_per_elem, /* how many eT1s in each element to shuffle */ - const UWORD n_elem_pow2, - const UWORD* philox_key, - const UWORD num_bits) - { - UWORD* aux_mem = (UWORD*) aux_shared_mem; - - const UWORD tid = threadIdx.x + blockIdx.x * blockDim.x; - - // Get our bijective shuffle location. 
- const UWORD in_loc = var_philox(tid, philox_key, num_bits); - - // Fill aux_mem with the indicator of whether we are out of bounds. - // Then, we'll prefix-sum it. This will tell us where to put our result. - aux_mem[tid] = (in_loc < n_elem); - __syncthreads(); - - // Now, prefix-sum the auxiliary memory. - // This allows us to do the shuffle-compaction step. - UWORD offset = 1; - for (UWORD s = n_elem_pow2 / 2; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[n_elem_pow2 - 1] = 0; - } - __syncthreads(); - - for (UWORD s = 1; s <= n_elem_pow2 / 2; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // With the prefix sum complete, we shuffle our result into position aux_mem[tid], but only if we are a thread with a "valid" output. - if (in_loc < n_elem) - { - const UWORD in_addr_offset = in_loc * in_incr; - const UWORD out_addr_offset = aux_mem[tid] * out_incr; - - for (UWORD i = 0; i < elems_per_elem; ++i) - { - out[out_addr_offset + (i * out_elem_stride)] = in[in_addr_offset + (i * in_elem_stride)]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shuffle_large.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shuffle_large.cu deleted file mode 100644 index 3a9cd30..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/shuffle_large.cu +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright 2021 Marcus Edel (http://www.kurg.org/) -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,shuffle_large)(eT1* out, - const UWORD out_incr, /* how many eT1s to advance to get to the start of the next element to shuffle */ - const UWORD out_elem_stride, /* how many eT1s between each eT1 in each element */ - const eT1* in, - const UWORD in_incr, - const UWORD in_elem_stride, - const UWORD* block_offsets, - const UWORD n_elem, - const UWORD elems_per_elem, /* how many eT1s in each element to shuffle */ - const UWORD n_elem_pow2, - const UWORD* philox_key, - const UWORD num_bits) - { - UWORD* aux_mem = (UWORD*) aux_shared_mem; - - const UWORD tid = threadIdx.x + blockIdx.x * blockDim.x; - const UWORD local_tid = threadIdx.x; - const UWORD local_size = blockDim.x; - - // Recompute our bijective shuffle location. - const UWORD in_loc = var_philox(tid, philox_key, num_bits); - - // We actually have to perform the up-sweep a second time, since we did not save the memory the first time. - aux_mem[local_tid] = (in_loc < n_elem); - __syncthreads(); - - // Now, prefix-sum the auxiliary memory for this block. - // This allows us to do the shuffle-compaction step. 
- UWORD offset = 1; - for (UWORD s = local_size / 2; s > 0; s >>= 1) - { - if (local_tid < s) - { - const UWORD ai = offset * (2 * local_tid + 1) - 1; - const UWORD bi = offset * (2 * local_tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (local_tid == 0) - { - aux_mem[local_size - 1] = 0; - } - __syncthreads(); - - for (UWORD s = 1; s <= local_size / 2; s *= 2) - { - offset >>= 1; - if (local_tid < s) - { - const UWORD ai = offset * (2 * local_tid + 1) - 1; - const UWORD bi = offset * (2 * local_tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // With the prefix sum complete, we shuffle our result into position aux_mem[tid], but only if we are a thread with a "valid" output. - if (in_loc < n_elem) - { - const UWORD in_addr_offset = in_loc * in_incr; - const UWORD out_addr_offset = (aux_mem[local_tid] + block_offsets[blockIdx.x]) * out_incr; - - for (UWORD i = 0; i < elems_per_elem; ++i) - { - out[out_addr_offset + (i * out_elem_stride)] = in[in_addr_offset + (i * in_elem_stride)]; - } - } - } - diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/stable_radix_sort_index_asc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/stable_radix_sort_index_asc.cu deleted file mode 100644 index d5a8e16..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/stable_radix_sort_index_asc.cu +++ /dev/null @@ -1,262 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,stable_radix_sort_index_asc)(eT1* A, - UWORD* A_index, - eT1* tmp_mem, - UWORD* tmp_mem_index, - const UWORD n_elem) - { - // The stable sort differs from the rest of our radix sorts in that we must avoid ever "reversing" point orders. - // We do this by adapting the regular radix sort to also consider the highest bit (the sign bit for signed types). - // This alleviates the need to ever unpack points in a reverse order, and so the sort is stable. - - UWORD* aux_mem = (UWORD*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - - const UWORD num_threads = blockDim.x; - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - // Fill tmp_mem_index with [0, 1, ..., n_elem - 1]. - UWORD i = start_elem; - while (i + 1 < end_elem) - { - tmp_mem_index[i] = i; - tmp_mem_index[i + 1] = i + 1; - i += 2; - } - if (i < end_elem) - { - tmp_mem_index[i] = i; - } - - __syncthreads(); - - // This is 4 instead of two because we need to account for the sign bit. - UWORD local_counts[4]; - - // We are doing an odd number of iterations, so set things up such that A_index will be holding the final results. - eT1* unsorted_memptr = A; - UWORD* unsorted_index_memptr = tmp_mem_index; - eT1* sorted_memptr = tmp_mem; - UWORD* sorted_index_memptr = A_index; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 sign_mask = (((uint_eT1) 1) << last_bit); - - for (UWORD b = 0; b < 8 * sizeof(eT1) - 1; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. 
- uint_eT1* memptr = reinterpret_cast(unsorted_memptr); - - local_counts[0] = 0; // holds the count of elements with sign 0 and bit value 0 - local_counts[1] = 0; // holds the count of elements with sign 0 and bit value 1 - local_counts[2] = 0; // holds the count of elements with sign 1 and bit value 0 - local_counts[3] = 0; // holds the count of elements with sign 1 and bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[((memptr[i ] & mask) >> b) + ((memptr[i ] & sign_mask) >> (last_bit - 1))]; - ++local_counts[((memptr[i + 1] & mask) >> b) + ((memptr[i + 1] & sign_mask) >> (last_bit - 1))]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]; - } - - // Step 2: aggregate the counts for all threads. - // There are a couple cases here to get things in an ascending order: - // * Floating point number: [11, 10, 00, 01] - // * Unsigned integer: [00, 01, 10, 11] - // * Signed integer: [10, 11, 00, 01] - // Note that the notation "11" indicates, e.g., a point whose sign is 1 and bit value in bit b is 1. - // For unsigned integers, we treat the top bit as a "sign" bit even though it's not---but we choose an ordering that's still correct. 
- - if (!coot_is_signed(TO_ET1(0))) - { - // Unsigned integer (00, 01, 10, 11) - aux_mem[tid ] = local_counts[0]; - aux_mem[tid + num_threads] = local_counts[1]; - aux_mem[tid + 2 * num_threads] = local_counts[2]; - aux_mem[tid + 3 * num_threads] = local_counts[3]; - } - else if (coot_is_fp(TO_ET1(0))) - { - // Floating point (11, 10, 00, 01) - aux_mem[tid ] = local_counts[3]; - aux_mem[tid + num_threads] = local_counts[2]; - aux_mem[tid + 2 * num_threads] = local_counts[0]; - aux_mem[tid + 3 * num_threads] = local_counts[1]; - } - else - { - // Signed integer (10, 11, 00, 01) - aux_mem[tid ] = local_counts[2]; - aux_mem[tid + num_threads] = local_counts[3]; - aux_mem[tid + 2 * num_threads] = local_counts[0]; - aux_mem[tid + 3 * num_threads] = local_counts[1]; - } - __syncthreads(); - - // Now, we must assign four sections of memory for `tid` to put its points in. - // We do this by a prefix-sum operation across all threads. - // At the end of this operation (at the beginning of Step 3): - // - // local_counts[0] indicates the first place to put a sign-0 bit-value-0 point - // local_counts[1] indicates the first place to put a sign-0 bit-value-1 point - // local_counts[2] indicates the first place to put a sign-1 bit-value-0 point - // local_counts[3] indicates the first place to put a sign-1 bit-value-1 point - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - - // Since we have auxiliary memory size of 4x the number of threads, we need to add an extra iteration where each thread handles two values. 
- const UWORD ai1 = offset * (2 * tid + 1) - 1; - const UWORD bi1 = offset * (2 * tid + 2) - 1; - aux_mem[bi1] += aux_mem[ai1]; - const UWORD ai2 = offset * (2 * (tid + num_threads) + 1) - 1; - const UWORD bi2 = offset * (2 * (tid + num_threads) + 2) - 1; - aux_mem[bi2] += aux_mem[ai2]; - offset *= 2; - __syncthreads(); - - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[4 * num_threads - 1] = 0; - } - __syncthreads(); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Since we have auxiliary memory size of 4x the number of threads, we need to add an extra iteration where each thread handles two values. - offset >>= 1; - const UWORD ai3 = offset * (2 * tid + 1) - 1; - const UWORD bi3 = offset * (2 * tid + 2) - 1; - UWORD tmp3 = aux_mem[ai3]; - aux_mem[ai3] = aux_mem[bi3]; - aux_mem[bi3] += tmp3; - - const UWORD ai4 = offset * (2 * (tid + num_threads) + 1) - 1; - const UWORD bi4 = offset * (2 * (tid + num_threads) + 2) - 1; - UWORD tmp4 = aux_mem[ai4]; - aux_mem[ai4] = aux_mem[bi4]; - aux_mem[bi4] += tmp4; - __syncthreads(); - - // Step 3: move points into the correct place. 
- // There are a couple cases here to get things in an ascending order: - // * Floating point number: [11, 10, 00, 01] - // * Unsigned integer: [00, 01, 10, 11] - // * Signed integer: [10, 11, 00, 01] - - if (!coot_is_signed(TO_ET1(0))) - { - // Unsigned integer (00, 01, 10, 11) - local_counts[0] = aux_mem[tid ]; - local_counts[1] = aux_mem[tid + num_threads]; - local_counts[2] = aux_mem[tid + 2 * num_threads]; - local_counts[3] = aux_mem[tid + 3 * num_threads]; - } - else if (coot_is_fp(TO_ET1(0))) - { - // Floating point (11, 10, 00, 01) - local_counts[0] = aux_mem[tid + 2 * num_threads]; - local_counts[1] = aux_mem[tid + 3 * num_threads]; - local_counts[2] = aux_mem[tid + num_threads]; - local_counts[3] = aux_mem[tid ]; - } - else - { - // Signed integer (10, 11, 00, 01) - local_counts[0] = aux_mem[tid + 2 * num_threads]; - local_counts[1] = aux_mem[tid + 3 * num_threads]; - local_counts[2] = aux_mem[tid ]; - local_counts[3] = aux_mem[tid + num_threads]; - } - - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b) + ((memptr[i + 1] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - - // Now swap pointers. 
- eT1* tmp = unsorted_memptr; - unsorted_memptr = sorted_memptr; - sorted_memptr = tmp; - UWORD* tmp_index = unsorted_index_memptr; - unsorted_index_memptr = sorted_index_memptr; - sorted_index_memptr = tmp_index; - - __syncthreads(); - } - - // Since we did an odd number of iterations, the result is stored in A_index. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/stable_radix_sort_index_desc.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/stable_radix_sort_index_desc.cu deleted file mode 100644 index fa7a134..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/stable_radix_sort_index_desc.cu +++ /dev/null @@ -1,256 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,stable_radix_sort_index_desc)(eT1* A, - UWORD* A_index, - eT1* tmp_mem, - UWORD* tmp_mem_index, - const UWORD n_elem) - { - // The stable sort differs from the rest of our radix sorts in that we must avoid ever "reversing" point orders. - // We do this by adapting the regular radix sort to also consider the highest bit (the sign bit for signed types). - // This alleviates the need to ever unpack points in a reverse order, and so the sort is stable. 
- - UWORD* aux_mem = (UWORD*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - - const UWORD num_threads = blockDim.x; - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - // Fill tmp_mem_index with [0, 1, ..., n_elem - 1]. - UWORD i = start_elem; - while (i + 1 < end_elem) - { - tmp_mem_index[i] = i; - tmp_mem_index[i + 1] = i + 1; - i += 2; - } - if (i < end_elem) - { - tmp_mem_index[i] = i; - } - - __syncthreads(); - - UWORD local_counts[4]; - - // We are doing an odd number of iterations, so set things up such that A_index will be holding the final results. - eT1* unsorted_memptr = A; - UWORD* unsorted_index_memptr = tmp_mem_index; - eT1* sorted_memptr = tmp_mem; - UWORD* sorted_index_memptr = A_index; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 sign_mask = (((uint_eT1) 1) << last_bit); - - for (UWORD b = 0; b < 8 * sizeof(eT1) - 1; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. 
- uint_eT1* memptr = reinterpret_cast(unsorted_memptr); - - local_counts[0] = 0; // holds the count of elements with bit value 0 and sign value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 and sign value 0 - local_counts[2] = 0; // holds the count of elements with bit value 0 and sign value 1 - local_counts[3] = 0; // holds the count of elements with bit value 1 and sign value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[((memptr[i ] & mask) >> b) + ((memptr[i ] & sign_mask) >> (last_bit - 1))]; - ++local_counts[((memptr[i + 1] & mask) >> b) + ((memptr[i + 1] & sign_mask) >> (last_bit - 1))]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]; - } - - // Step 2: aggregate the counts for all threads. - // There are a couple cases here to get things in a descending order: - // * Floating point number: [01, 00, 10, 11] - // * Unsigned integer: [11, 10, 01, 00] - // * Signed integer: [01, 00, 11, 10] - // Note that the notation "11" indicates, e.g., a point whose sign is 1 and bit value in bit b is 1. - // For unsigned integers, we treat the top bit as a "sign" bit even though it's not---but we choose an ordering that's still correct. 
- - if (!coot_is_signed(TO_ET1(0))) - { - // Unsigned integer (11, 10, 01, 00) - aux_mem[tid ] = local_counts[3]; - aux_mem[tid + num_threads] = local_counts[2]; - aux_mem[tid + 2 * num_threads] = local_counts[1]; - aux_mem[tid + 3 * num_threads] = local_counts[0]; - } - else if (coot_is_fp(TO_ET1(0))) - { - // Floating point (01, 00, 10, 11) - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - aux_mem[tid + 2 * num_threads] = local_counts[2]; - aux_mem[tid + 3 * num_threads] = local_counts[3]; - } - else - { - // Signed integer (01, 00, 11, 10) - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - aux_mem[tid + 2 * num_threads] = local_counts[3]; - aux_mem[tid + 3 * num_threads] = local_counts[2]; - } - __syncthreads(); - - // Now, we must assign four sections of memory for `tid` to put its points in. - // We do this by a prefix-sum operation across all threads. - // At the end of this operation (at the beginning of Step 3): - // - // local_counts[0] indicates the first place to put a sign-0 bit-value-0 point - // local_counts[1] indicates the first place to put a sign-0 bit-value-1 point - // local_counts[2] indicates the first place to put a sign-1 bit-value-0 point - // local_counts[3] indicates the first place to put a sign-1 bit-value-1 point - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - - // Since we have auxiliary memory size of 4x the number of threads, we need to add an extra iteration where each thread handles two values. 
- const UWORD ai1 = offset * (2 * tid + 1) - 1; - const UWORD bi1 = offset * (2 * tid + 2) - 1; - aux_mem[bi1] += aux_mem[ai1]; - const UWORD ai2 = offset * (2 * (tid + num_threads) + 1) - 1; - const UWORD bi2 = offset * (2 * (tid + num_threads) + 2) - 1; - aux_mem[bi2] += aux_mem[ai2]; - offset *= 2; - __syncthreads(); - - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (tid == 0) - { - aux_mem[4 * num_threads - 1] = 0; - } - __syncthreads(); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - __syncthreads(); - } - - // Since we have auxiliary memory size of 4x the number of threads, we need to add an extra iteration where each thread handles two values. - offset >>= 1; - const UWORD ai3 = offset * (2 * tid + 1) - 1; - const UWORD bi3 = offset * (2 * tid + 2) - 1; - UWORD tmp3 = aux_mem[ai3]; - aux_mem[ai3] = aux_mem[bi3]; - aux_mem[bi3] += tmp3; - - const UWORD ai4 = offset * (2 * (tid + num_threads) + 1) - 1; - const UWORD bi4 = offset * (2 * (tid + num_threads) + 2) - 1; - UWORD tmp4 = aux_mem[ai4]; - aux_mem[ai4] = aux_mem[bi4]; - aux_mem[bi4] += tmp4; - __syncthreads(); - - // Step 3: move points into the correct place. 
- if (!coot_is_signed(TO_ET1(0))) - { - // Unsigned integer (11, 10, 01, 00) - local_counts[0] = aux_mem[tid + 3 * num_threads]; - local_counts[1] = aux_mem[tid + 2 * num_threads]; - local_counts[2] = aux_mem[tid + num_threads]; - local_counts[3] = aux_mem[tid ]; - } - else if (coot_is_fp(TO_ET1(0))) - { - // Floating point (01, 00, 10, 11) - local_counts[0] = aux_mem[tid + num_threads]; - local_counts[1] = aux_mem[tid ]; - local_counts[2] = aux_mem[tid + 2 * num_threads]; - local_counts[3] = aux_mem[tid + 3 * num_threads]; - } - else - { - // Signed integer (01, 00, 11, 10) - local_counts[0] = aux_mem[tid + num_threads]; - local_counts[1] = aux_mem[tid ]; - local_counts[2] = aux_mem[tid + 3 * num_threads]; - local_counts[3] = aux_mem[tid + 2 * num_threads]; - } - - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b) + ((memptr[i + 1] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - - // Now swap pointers. 
- eT1* tmp = unsorted_memptr; - UWORD* tmp_index = unsorted_index_memptr; - unsorted_memptr = sorted_memptr; - unsorted_index_memptr = sorted_index_memptr; - sorted_memptr = tmp; - sorted_index_memptr = tmp_index; - - __syncthreads(); - } - - // Since we did an odd number of iterations, the result is stored in A_index. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/submat_var.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/submat_var.cu deleted file mode 100644 index 7414178..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/submat_var.cu +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,submat_var)(const eT1* in_mem, - const UWORD n_elem, // number of elements in subview - eT1* out_mem, - const eT1 mean_val, - const UWORD in_n_rows, - const UWORD start_row, - const UWORD start_col, - const UWORD sub_n_rows, - const UWORD sub_n_cols) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const UWORD col1 = (i ) / sub_n_rows; - const UWORD col2 = (i + blockDim.x) / sub_n_rows; - const UWORD row1 = (i ) % sub_n_rows; - const UWORD row2 = (i + blockDim.x) % sub_n_rows; - const UWORD index1 = (col1 + start_col) * in_n_rows + (row1 + start_row); - const UWORD index2 = (col2 + start_col) * in_n_rows + (row2 + start_row); - - const eT1 val1 = (in_mem[index1] - mean_val); - const eT1 val2 = (in_mem[index2] - mean_val); - aux_mem[tid] += (val1 * val1) + (val2 * val2); - i += grid_size; - } - if (i < n_elem) - { - const UWORD col = i / sub_n_rows; - const UWORD row = i % sub_n_rows; - const UWORD index = (col + start_col) * in_n_rows + (row + start_row); - - const eT1 val = (in_mem[index] - mean_val); - aux_mem[tid] += (val * val); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,accu_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/submat_var_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/submat_var_small.cu deleted file mode 100644 index 0503006..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/submat_var_small.cu +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2023 
Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,submat_var_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem, - const eT1 mean_val, - const UWORD in_n_rows, - const UWORD start_row, - const UWORD start_col, - const UWORD sub_n_rows, - const UWORD sub_n_cols) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const UWORD col1 = (i ) / sub_n_rows; - const UWORD col2 = (i + blockDim.x) / sub_n_rows; - const UWORD row1 = (i ) % sub_n_rows; - const UWORD row2 = (i + blockDim.x) % sub_n_rows; - const UWORD index1 = (col1 + start_col) * in_n_rows + (row1 + start_row); - const UWORD index2 = (col2 + start_col) * in_n_rows + (row2 + start_row); - - const eT1 val1 = (in_mem[index1] - mean_val); - const eT1 val2 = (in_mem[index2] - mean_val); - aux_mem[tid] += (val1 * val1) + (val2 * val2); - i += grid_size; - } - if (i < n_elem) - { - const UWORD col = i / sub_n_rows; - const UWORD row = i % sub_n_rows; - const UWORD index = (col + start_col) * in_n_rows + (row + start_row); - - const eT1 val = (in_mem[index] - mean_val); - aux_mem[tid] += (val * val); - } - - for (UWORD s = 
blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/symmatl_inplace.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/symmatl_inplace.cu deleted file mode 100644 index f4907eb..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/symmatl_inplace.cu +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,symmatl_inplace)(eT1* out, - const UWORD size) // matrix is expected to be square - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < size && col < size && row > col) - { - const eT1 val = out[row + size * col]; - - // only need to copy to the upper triangle for the in-place version - out[col + size * row] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/symmatu_inplace.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/symmatu_inplace.cu deleted file mode 100644 index e16d8ca..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/symmatu_inplace.cu +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,symmatu_inplace)(eT1* out, - const UWORD size) // matrix is expected to be square - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < size && col < size && col > row) - { - const eT1 val = out[row + size * col]; - - // only need to copy to the lower triangle for the in-place version - out[col + size * row] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/trace.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/trace.cu deleted file mode 100644 index 8dc31a7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/trace.cu +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,trace)(eT1* out, - const eT1* A, - const UWORD n_rows, - const UWORD N) - { - const UWORD id = blockIdx.x * blockDim.x + threadIdx.x; - if(id == 0) - { - eT1 acc = TO_ET1(0); - // runtime unrolling is not supported by CUDA - for(UWORD i=0; i 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,accu_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_colwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_colwise.cu deleted file mode 100644 index aad8c98..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_colwise.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,var_colwise)(eT1* dest, - const eT1* src, - const eT1* src_means, - const UWORD n_rows, - const UWORD n_cols, - const UWORD norm_correction, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows, - const UWORD src_means_mem_incr) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[col * src_M_n_rows]); - const eT1 mean_val = src_means[col * src_means_mem_incr]; - eT1 acc = TO_ET1(0); - for (UWORD i = 0; i < n_rows; ++i) - { - eT1 val = (colptr[i] - mean_val); - acc += (val * val); - } - - dest[col * dest_mem_incr] = (acc / TO_ET1(n_rows - norm_correction)); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_rowwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_rowwise.cu deleted file mode 100644 index 5c0f08a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_rowwise.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,var_rowwise)(eT1* dest, - const eT1* src, - const eT1* src_means, - const UWORD n_rows, - const UWORD n_cols, - const UWORD norm_correction, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows, - const UWORD src_means_mem_incr) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT1 acc = TO_ET1(0); - const eT1 mean_val = src_means[row]; - for (UWORD i = 0; i < n_cols; ++i) - { - const eT1 val = (src[i * src_M_n_rows + row] - mean_val); - acc += (val * val); - } - - dest[row * dest_mem_incr] = (acc / TO_ET1(n_cols - norm_correction)); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_small.cu deleted file mode 100644 index 790a779..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway/var_small.cu +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,var_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem, - const eT1 mean_val) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 val1 = (in_mem[i] - mean_val); - const eT1 val2 = (in_mem[i + blockDim.x] - mean_val); - aux_mem[tid] += (val1 * val1) + (val2 * val2); - i += grid_size; - } - if (i < n_elem) - { - const eT1 val = (in_mem[i] - mean_val); - aux_mem[tid] += (val * val); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/and_reduce.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/and_reduce.cu deleted file mode 100644 index 3e9a140..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/and_reduce.cu +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__device__ -void -COOT_FN(PREFIX,and_subgroup_reduce)(volatile eT1* data, int tid) - { - data[tid] &= data[tid + 32]; - data[tid] &= data[tid + 16]; - data[tid] &= data[tid + 8]; - data[tid] &= data[tid + 4]; - data[tid] &= data[tid + 2]; - data[tid] &= data[tid + 1]; - } - - - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,and_reduce)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - - aux_mem[tid] = ~((eT1) 0); // all bits to 1 - - while (i + blockDim.x < n_elem) - { - aux_mem[tid] &= in_mem[i]; - aux_mem[tid] &= in_mem[i + blockDim.x]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] &= in_mem[i]; - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,and_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/and_reduce_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/and_reduce_small.cu deleted file mode 100644 index 7c5b1da..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/and_reduce_small.cu +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2021 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,and_reduce_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = ~((eT1) 0); // all bits to 1 - - while (i + blockDim.x < n_elem) - { - aux_mem[tid] &= in_mem[i]; - aux_mem[tid] &= in_mem[i + blockDim.x]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] &= in_mem[i]; - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/ipiv_det.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/ipiv_det.cu deleted file mode 100644 index fb5f7da..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/ipiv_det.cu +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Forward declaration of one-way kernel that we need. -__device__ void COOT_FN(PREFIX,prod_subgroup_reduce)(volatile eT1* data, int tid); - -// this kernel is technically incorrect if the size is not a factor of 2! -// Compute the determinant of a permutation matrix as given by getrf(). -__global__ -void -COOT_FN(PREFIX,ipiv_det)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - - aux_mem[tid] = (eT1) 1; - - while (i + blockDim.x < n_elem) - { - const eT1 val1 = ((in_mem[i] - 1) == i) ? 1 : -1; - aux_mem[tid] *= val1; - const eT1 val2 = ((in_mem[i + blockDim.x] - 1) == (i + blockDim.x)) ? 1 : -1; - aux_mem[tid] *= val2; - i += grid_size; - } - if (i < n_elem) - { - const eT1 val = ((in_mem[i] - 1) == i) ? 
1 : -1; - aux_mem[tid] *= val; - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] *= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,prod_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/ipiv_det_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/ipiv_det_small.cu deleted file mode 100644 index d9d3c1f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/ipiv_det_small.cu +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Compute the determinant of a permutation matrix as given by getrf(). -__global__ -void -COOT_FN(PREFIX,ipiv_det_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = (eT1) 1; - - while (i + blockDim.x < n_elem) - { - const eT1 val1 = ((in_mem[i] - 1) == i) ? 
1 : -1; - aux_mem[tid] *= val1; - const eT1 val2 = ((in_mem[i + blockDim.x] - 1) == (i + blockDim.x)) ? 1 : -1; - aux_mem[tid] *= val2; - i += grid_size; - } - if (i < n_elem) - { - const eT1 val = ((in_mem[i] - 1) == i) ? 1 : -1; - aux_mem[tid] *= val; - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] *= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/or_reduce.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/or_reduce.cu deleted file mode 100644 index ff63de1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/or_reduce.cu +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__device__ -void -COOT_FN(PREFIX,or_subgroup_reduce)(volatile eT1* data, int tid) - { - data[tid] |= data[tid + 32]; - data[tid] |= data[tid + 16]; - data[tid] |= data[tid + 8]; - data[tid] |= data[tid + 4]; - data[tid] |= data[tid + 2]; - data[tid] |= data[tid + 1]; - } - - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,or_reduce)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - - aux_mem[tid] = (eT1) 0; - - while (i + blockDim.x < n_elem) - { - aux_mem[tid] |= in_mem[i]; - aux_mem[tid] |= in_mem[i + blockDim.x]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] |= in_mem[i]; - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,or_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/or_reduce_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/or_reduce_small.cu deleted file mode 100644 index 9a8290c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_integral/or_reduce_small.cu +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,or_reduce_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = (eT1) 0; - - while (i + blockDim.x < n_elem) - { - aux_mem[tid] |= in_mem[i]; - aux_mem[tid] |= in_mem[i + blockDim.x]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] |= in_mem[i]; - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/diag_prod.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/diag_prod.cu deleted file mode 100644 index dd0cac4..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/diag_prod.cu +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Forward declaration of one-way kernel that we need. 
-__device__ void COOT_FN(PREFIX,prod_subgroup_reduce)(volatile eT1* data, int tid); - -// this kernel is technically incorrect if the size is not a factor of 2! -// Compute the product of the elements on the diagonal of a matrix. -__global__ -void -COOT_FN(PREFIX,diag_prod)(const eT1* in_mem, - const UWORD n_rows, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 1; - - while (i + blockDim.x < n_rows) - { - // copy to local shared memory - const UWORD index1 = i * n_rows + i; - const eT1 v1 = in_mem[index1]; - const UWORD index2 = (i + blockDim.x) * n_rows + (i + blockDim.x); - const eT1 v2 = in_mem[index2]; - aux_mem[tid] *= v1 * v2; - i += grid_size; - } - if (i < n_rows) - { - const UWORD index = i * n_rows + i; - const eT1 v = in_mem[index]; - aux_mem[tid] *= v; - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] *= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - // Since we are just accumulating, we can use the accu_subgroup_reduce utility function. - COOT_FN(PREFIX,prod_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/diag_prod_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/diag_prod_small.cu deleted file mode 100644 index 448de93..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/diag_prod_small.cu +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// compute the product of the diagonal of a matrix -__global__ -void -COOT_FN(PREFIX,diag_prod_small)(const eT1* in_mem, - const UWORD n_rows, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 1; - - while (i + blockDim.x < n_rows) - { - // copy to local shared memory - const UWORD index1 = i * n_rows + i; - const eT1 v1 = in_mem[index1]; - const UWORD index2 = (i + blockDim.x) * n_rows + (i + blockDim.x); - const eT1 v2 = in_mem[index2]; - aux_mem[tid] *= v1 * v2; - i += grid_size; - } - if (i < n_rows) - { - const UWORD index = i * n_rows + i; - const eT1 v = in_mem[index]; - aux_mem[tid] *= v; - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] *= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/extract_cx.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/extract_cx.cu deleted file mode 100644 index 3957225..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/extract_cx.cu +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Extract real or imaginary elements from a complex matrix into a real matrix. -// This kernel is a bit of a hack until we have actual complex matrix support! -__global__ -void -COOT_FN(PREFIX,extract_cx)(const eT1* in_mem, - eT1* out_mem, - const UWORD real_or_imag, - const UWORD n_rows, - const UWORD n_cols, - const UWORD in_M_n_rows, - const UWORD out_M_n_rows) - { - // If real_or_imag is 0, we extract the real part. If 1, we extract the - // imaginary part. - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD in_index = 2 * (col * in_M_n_rows + row) + real_or_imag; - const UWORD out_index = col * out_M_n_rows + row; - - if (col < n_cols && row < n_rows) - { - out_mem[out_index] = in_mem[in_index]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_l.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_l.cu deleted file mode 100644 index cebe438..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_l.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This extracts L from U, and sets the lower diagonal of U to 0. -__global__ -void -COOT_FN(PREFIX,lu_extract_l)(eT1* L, - eT1* U, - const eT1* in, - const UWORD n_rows, - const UWORD n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - // Note that neither U nor L must be square. - // L has size n_rows x min(n_rows, n_cols). - // U has size min(n_rows, n_cols) x n_cols. - const UWORD min_rows_cols = min(n_rows, n_cols); - - const UWORD in_index = row + n_rows * col; // this is also L_out_index - const UWORD U_out_index = row + min_rows_cols * col; - - if ((row < n_rows) && (col < min_rows_cols)) - { - L[in_index] = (row > col) ? in[in_index] : ((row == col) ? 1 : 0); - } - - if ((row < min_rows_cols) && (col < n_cols)) - { - U[U_out_index] = (row > col) ? 0 : in[in_index]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_p.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_p.cu deleted file mode 100644 index 090462a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_p.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,lu_extract_p)(eT1* P, - const UWORD* ipiv2, - const UWORD n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - - if (row < n_rows) - { - const UWORD index = row + ipiv2[row] * n_rows; - P[index] = (UWORD) 1; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_pivoted_l.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_pivoted_l.cu deleted file mode 100644 index 5e50481..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/lu_extract_pivoted_l.cu +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This extracts L from U, and sets the lower diagonal of U to 0. 
-__global__ -void -COOT_FN(PREFIX,lu_extract_pivoted_l)(eT1* L, - eT1* U, - const eT1* in, - const UWORD n_rows, - const UWORD n_cols, - const UWORD* ipiv) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - // Note that neither U nor L must be square. - // L has size n_rows x min(n_rows, n_cols). - // U has size min(n_rows, n_cols) x n_cols. - const UWORD min_rows_cols = min(n_rows, n_cols); - - const UWORD in_index = row + n_rows * col; // this is also L_out_index - const UWORD U_out_index = row + min_rows_cols * col; - - // We are extracted a permuted version of L. - // Instead of extracting row i of U as row i of L, - // we extract row i of U as row ipiv[i] of L. - const UWORD L_out_index = ipiv[row] + n_rows * col; - - if ((row < n_rows) && (col < min_rows_cols)) - { - L[L_out_index] = (row > col) ? in[in_index] : ((row == col) ? 1 : 0); - } - - if ((row < min_rows_cols) && (col < n_cols)) - { - U[U_out_index] = (row > col) ? 0 : in[in_index]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_inf.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_inf.cu deleted file mode 100644 index c1af8f5..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_inf.cu +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,rel_any_inf)(const eT1* X, - const UWORD n_elem, - uint* out, - const eT1 val /* ignored */) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - const eT1 val1 = X[i]; - const eT1 val2 = X[i + blockDim.x]; - - aux_mem[tid] |= coot_isinf(val1); - aux_mem[tid] |= coot_isinf(val2); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[i]; - - aux_mem[tid] |= coot_isinf(val1); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - or_subgroup_reduce_u32(aux_mem, tid); - } - - if (tid == 0) - { - out[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_inf_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_inf_small.cu deleted file mode 100644 index 0c91941..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_inf_small.cu +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_any_inf_small)(const eT1* X, - const UWORD n_elem, - uint* out, - const eT1 val /* ignored */) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 val1 = X[i]; - const eT1 val2 = X[i + blockDim.x]; - - aux_mem[tid] |= coot_isinf(val1); - aux_mem[tid] |= coot_isinf(val2); - if (aux_mem[tid] == 1) - break; - - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[i]; - - aux_mem[tid] |= coot_isinf(val1); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nan.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nan.cu deleted file mode 100644 index a5e82fc..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nan.cu +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,rel_any_nan)(const eT1* X, - const UWORD n_elem, - uint* out, - const eT1 val /* ignored */) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - const eT1 val1 = X[i]; - const eT1 val2 = X[i + blockDim.x]; - - aux_mem[tid] |= coot_isnan(val1); - aux_mem[tid] |= coot_isnan(val2); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[i]; - - aux_mem[tid] |= coot_isnan(val1); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - or_subgroup_reduce_u32(aux_mem, tid); - } - - if (tid == 0) - { - out[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nan_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nan_small.cu deleted file mode 100644 index 5e39330..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nan_small.cu +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the 
Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_any_nan_small)(const eT1* X, - const UWORD n_elem, - uint* out, - const eT1 val /* ignored */) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 val1 = X[i]; - const eT1 val2 = X[i + blockDim.x]; - - aux_mem[tid] |= coot_isnan(val1); - aux_mem[tid] |= coot_isnan(val2); - if (aux_mem[tid] == 1) - break; - - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[i]; - - aux_mem[tid] |= coot_isnan(val1); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nonfinite.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nonfinite.cu deleted file mode 100644 index 93847ba..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nonfinite.cu +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use 
this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,rel_any_nonfinite)(const eT1* X, - const UWORD n_elem, - uint* out, - const eT1 val /* ignored */) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - const eT1 val1 = X[i]; - const eT1 val2 = X[i + blockDim.x]; - - aux_mem[tid] |= !coot_isfinite(val1); - aux_mem[tid] |= !coot_isfinite(val2); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[i]; - - aux_mem[tid] |= !coot_isfinite(val1); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - or_subgroup_reduce_u32(aux_mem, tid); - } - - if (tid == 0) - { - out[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nonfinite_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nonfinite_small.cu deleted file mode 100644 index b6e7af8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_any_nonfinite_small.cu +++ /dev/null @@ -1,63 +0,0 @@ 
-// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_any_nonfinite_small)(const eT1* X, - const UWORD n_elem, - uint* out, - const eT1 val /* ignored */) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 val1 = X[i]; - const eT1 val2 = X[i + blockDim.x]; - - aux_mem[tid] |= !coot_isfinite(val1); - aux_mem[tid] |= !coot_isfinite(val2); - if (aux_mem[tid] == 1) - break; - - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[i]; - - aux_mem[tid] |= !coot_isfinite(val1); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isfinite.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isfinite.cu deleted file mode 100644 index 19d2241..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isfinite.cu +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// 
Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_isfinite)(UWORD* out, - const eT1* X, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val = (eT1) X[i]; - out[i] = coot_isfinite(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isnan.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isnan.cu deleted file mode 100644 index d7b5dde..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isnan.cu +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_isnan)(UWORD* out, - const eT1* X, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val1 = (eT1) X[i]; - out[i] = coot_isnan(val1); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isnonfinite.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isnonfinite.cu deleted file mode 100644 index f1fa746..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/rel_isnonfinite.cu +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_isnonfinite)(UWORD* out, - const eT1* X, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_elem) - { - const eT1 val = (eT1) X[i]; - out[i] = !coot_isfinite(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_1.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_1.cu deleted file mode 100644 index 59b5818..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_1.cu +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Forward declaration of one-way kernel that we need. -__device__ void COOT_FN(PREFIX,accu_subgroup_reduce)(volatile eT1* data, int tid); - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,vec_norm_1)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 v1 = abs(in_mem[i]); - const eT1 v2 = abs(in_mem[i + blockDim.x]); - aux_mem[tid] += v1 + v2; - i += grid_size; - } - if (i < n_elem) - { - const eT1 v = abs(in_mem[i]); - aux_mem[tid] += v; - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - // Since we are just accumulating, we can use the accu_subgroup_reduce utility function. - COOT_FN(PREFIX,accu_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_1_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_1_small.cu deleted file mode 100644 index 0f4c579..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_1_small.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,vec_norm_1_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 v1 = abs(in_mem[i]); - const eT1 v2 = abs(in_mem[i + blockDim.x]); - aux_mem[tid] += v1 + v2; - i += grid_size; - } - if (i < n_elem) - { - const eT1 v = abs(in_mem[i]); - aux_mem[tid] += v; - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2.cu deleted file mode 100644 index 54ef65e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2.cu +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,vec_norm_2)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 v1 = in_mem[i] * in_mem[i]; - const eT1 v2 = in_mem[i + blockDim.x] * in_mem[i + blockDim.x]; - aux_mem[tid] += v1 + v2; - i += grid_size; - } - if (i < n_elem) - { - const eT1 v = in_mem[i] * in_mem[i]; - aux_mem[tid] += v; - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - // Since we are just accumulating, we can use the accu_subgroup_reduce utility function. - COOT_FN(PREFIX,accu_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_robust.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_robust.cu deleted file mode 100644 index 7684dc8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_robust.cu +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,vec_norm_2_robust)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem, - const eT1 max_val) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 v1 = (in_mem[i] / max_val); - const eT1 v2 = (in_mem[i + blockDim.x] / max_val); - aux_mem[tid] += (v1 * v1) + (v2 * v2); - i += grid_size; - } - if (i < n_elem) - { - const eT1 v = in_mem[i] / max_val; - aux_mem[tid] += (v * v); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - // Since we are just accumulating, we can use the accu_subgroup_reduce utility function. - COOT_FN(PREFIX,accu_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_robust_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_robust_small.cu deleted file mode 100644 index 672296d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_robust_small.cu +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,vec_norm_2_robust_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem, - const eT1 max_val) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 v1 = (in_mem[i] / max_val); - const eT1 v2 = (in_mem[i + blockDim.x] / max_val); - aux_mem[tid] += (v1 * v1) + (v2 * v2); - i += grid_size; - } - if (i < n_elem) - { - const eT1 v = (in_mem[i] / max_val); - aux_mem[tid] += (v * v); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_small.cu deleted file mode 100644 index 4584680..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_2_small.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,vec_norm_2_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 v1 = in_mem[i] * in_mem[i]; - const eT1 v2 = in_mem[i + blockDim.x] * in_mem[i + blockDim.x]; - aux_mem[tid] += v1 + v2; - i += grid_size; - } - if (i < n_elem) - { - const eT1 v = in_mem[i] * in_mem[i]; - aux_mem[tid] += v; - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_k.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_k.cu deleted file mode 100644 index f594ce3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_k.cu +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,vec_norm_k)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem, - const UWORD k) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 v1 = pow(in_mem[i], (eT1) k); - const eT1 v2 = pow(in_mem[i + blockDim.x], (eT1) k); - aux_mem[tid] += v1 + v2; - i += grid_size; - } - if (i < n_elem) - { - const eT1 v = pow(in_mem[i], (eT1) k); - aux_mem[tid] += v; - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - // Since we are just accumulating, we can use the accu_subgroup_reduce utility function. 
- COOT_FN(PREFIX,accu_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_k_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_k_small.cu deleted file mode 100644 index fb49e4c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_k_small.cu +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,vec_norm_k_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem, - const UWORD k) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 v1 = pow(in_mem[i], (eT1) k); - const eT1 v2 = pow(in_mem[i + blockDim.x], (eT1) k); - aux_mem[tid] += v1 + v2; - i += grid_size; - } - if (i < n_elem) - { - const eT1 v = pow(in_mem[i], (eT1) k); - aux_mem[tid] += v; - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_min.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_min.cu deleted file mode 100644 index 49684a9..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_min.cu +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Forward declaration of one-way kernel that we need. 
-__device__ void COOT_FN(PREFIX,min_subgroup_reduce)(volatile eT1* data, int tid); - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,vec_norm_min)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = coot_type_max((eT1) 0); - - if (i < n_elem) - { - aux_mem[tid] = abs(in_mem[i]); - } - if (i + blockDim.x < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], abs(in_mem[i + blockDim.x])); - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 v1 = abs(in_mem[i]); - const eT1 v2 = abs(in_mem[i + blockDim.x]); - const eT1 v3 = min(v1, v2); - aux_mem[tid] = min(aux_mem[tid], v3); - i += grid_size; - } - if (i < n_elem) - { - const eT1 v = abs(in_mem[i]); - aux_mem[tid] = min(aux_mem[tid], v); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = min(aux_mem[tid], aux_mem[tid + s]); - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - // Since we are just accumulating, we can use the min_subgroup_reduce utility function. 
- COOT_FN(PREFIX,min_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_min_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_min_small.cu deleted file mode 100644 index aca3f19..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/oneway_real/vec_norm_min_small.cu +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,vec_norm_min_small)(const eT1* in_mem, - const UWORD n_elem, - eT1* out_mem) - { - eT1* aux_mem = (eT1*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. 
- aux_mem[tid] = coot_type_max((eT1) 0); - - if (i < n_elem) - { - aux_mem[tid] = abs(in_mem[i]); - } - if (i + blockDim.x < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], abs(in_mem[i + blockDim.x])); - } - i += grid_size; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT1 v1 = abs(in_mem[i]); - const eT1 v2 = abs(in_mem[i + blockDim.x]); - const eT1 v3 = min(v1, v2); - aux_mem[tid] = min(aux_mem[tid], v3); - i += grid_size; - } - if (i < n_elem) - { - const eT1 v = abs(in_mem[i]); - aux_mem[tid] = min(aux_mem[tid], v); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = min(aux_mem[tid], aux_mem[tid + s]); - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_div_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_div_post.cu deleted file mode 100644 index 976cd25..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_div_post.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_div_post)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_loc] = out_src[out_src_loc] / (TO_ET2(in[in_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_div_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_div_pre.cu deleted file mode 100644 index b215f9f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_div_pre.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_div_pre)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_loc] = (TO_ET2(in[in_loc])) / out_src[out_src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_minus_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_minus_post.cu deleted file mode 100644 index 779de93..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_minus_post.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_minus_post)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_loc] = out_src[out_src_loc] - (TO_ET2(in[in_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_minus_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_minus_pre.cu deleted file mode 100644 index c5038bf..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_minus_pre.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_minus_pre)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_loc] = (TO_ET2(in[in_loc])) - out_src[out_src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_plus.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_plus.cu deleted file mode 100644 index bebefac..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_plus.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_plus)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_loc] = out_src[out_src_loc] + (TO_ET2(in[in_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_schur.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_schur.cu deleted file mode 100644 index 8b5070b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_schur.cu +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_schur)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_loc] = out_src[out_src_loc] * (TO_ET2(in[in_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_set.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_set.cu deleted file mode 100644 index 624c16d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_set.cu +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_set)(eT2* out, - const eT2* /* out_src */, // old values are unused - const eT1* in, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD /* out_src_M_n_rows */, - const UWORD in_M_n_rows) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_loc] = TO_ET2(in[in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_div_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_div_post.cu deleted file mode 100644 index a4f9206..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_div_post.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_subset_div_post)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD* indices, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? - indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_loc] = out_src[out_src_loc] / (TO_ET2(in[in_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_div_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_div_pre.cu deleted file mode 100644 index d41d0ea..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_div_pre.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_subset_div_pre)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD* indices, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? 
- indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_loc] = (TO_ET2(in[in_loc])) / out_src[out_src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_minus_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_minus_post.cu deleted file mode 100644 index a1951c2..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_minus_post.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_subset_minus_post)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD* indices, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? - indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_loc] = out_src[out_src_loc] - (TO_ET2(in[in_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_minus_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_minus_pre.cu deleted file mode 100644 index 6478147..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_minus_pre.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_subset_minus_pre)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD* indices, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? 
- indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_loc] = (TO_ET2(in[in_loc])) - out_src[out_src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_plus.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_plus.cu deleted file mode 100644 index fc8d542..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_plus.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_subset_plus)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD* indices, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? 
- out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? - indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_loc] = out_src[out_src_loc] + (TO_ET2(in[in_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_schur.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_schur.cu deleted file mode 100644 index c2e4095..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_schur.cu +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_subset_schur)(eT2* out, - const eT2* out_src, - const eT1* in, - const UWORD* indices, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? - indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_loc] = out_src[out_src_loc] * (TO_ET2(in[in_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_set.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_set.cu deleted file mode 100644 index b1d6dd0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/broadcast_subset_set.cu +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,broadcast_subset_set)(eT2* out, - const eT2* /* out_src */, // unused - const eT1* in, - const UWORD* indices, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD /* out_src_M_n_rows */, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD out_col = blockIdx.y * blockDim.y + threadIdx.y; - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_loc] = (TO_ET2(in[in_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/clamp.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/clamp.cu deleted file mode 100644 index 428aa03..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/clamp.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2021 Marcus Edel (http://www.kurg.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ -__global__ -void -COOT_FN(PREFIX,clamp)(eT2* dest, - const eT1* src, - const eT1 min_val, - const eT1 max_val, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD src_index = row + col * src_M_n_rows; - const UWORD dest_index = row + col * dest_M_n_rows; - - const eT1 clamped_val = max(min_val, min(max_val, src[src_index])); - dest[dest_index] = TO_ET2(clamped_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/cross.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/cross.cu deleted file mode 100644 index 6cc7a96..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/cross.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,cross)(eT2* out, - const eT1* A, - const eT1* B) // A and B should have 3 elements - { - const UWORD idx = blockIdx.x * blockDim.x + threadIdx.x; - - if (idx < 3) - { - const UWORD a1_index = ((idx + 1) % 3); - const UWORD a2_index = ((idx + 2) % 3); - - const UWORD b1_index = ((idx + 2) % 3); - const UWORD b2_index = ((idx + 1) % 3); - - const eT1 val = (A[a1_index] * B[b1_index]) - (A[a2_index] * B[b2_index]); - out[idx] = TO_ET2(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/dot.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/dot.cu deleted file mode 100644 index c0f1af3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/dot.cu +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__device__ -void -COOT_FN(PREFIX,dot_subgroup_reduce)(volatile twoway_promoted_eT* data, int tid) - { - data[tid] += data[tid + 32]; - data[tid] += data[tid + 16]; - data[tid] += data[tid + 8]; - data[tid] += data[tid + 4]; - data[tid] += data[tid + 2]; - data[tid] += data[tid + 1]; - } - - - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,dot)(twoway_promoted_eT* out_mem, - const eT1* A, - const eT2* B, - const UWORD n_elem) - { - twoway_promoted_eT* aux_mem = (twoway_promoted_eT*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - const twoway_promoted_eT A_i1 = TO_TWOWAY_PROMOTED_ET(A[i]); - const twoway_promoted_eT B_i1 = TO_TWOWAY_PROMOTED_ET(B[i]); - - const twoway_promoted_eT A_i2 = TO_TWOWAY_PROMOTED_ET(A[i + blockDim.x]); - const twoway_promoted_eT B_i2 = TO_TWOWAY_PROMOTED_ET(B[i + blockDim.x]); - - aux_mem[tid] += (A_i1 * B_i1) + (A_i2 * B_i2); // copy to local shared memory - i += grid_size; - } - if (i < n_elem) - { - const twoway_promoted_eT A_i1 = TO_TWOWAY_PROMOTED_ET(A[i]); - const twoway_promoted_eT B_i1 = TO_TWOWAY_PROMOTED_ET(B[i]); - - aux_mem[tid] += (A_i1 * B_i1); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - COOT_FN(PREFIX,dot_subgroup_reduce)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/dot_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/dot_small.cu deleted file mode 100644 index 27735f6..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/dot_small.cu +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,dot_small)(twoway_promoted_eT* out_mem, - const eT1* A, - const eT2* B, - const UWORD n_elem) - { - twoway_promoted_eT* aux_mem = (twoway_promoted_eT*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - const twoway_promoted_eT A_i1 = TO_TWOWAY_PROMOTED_ET(A[i]); - const twoway_promoted_eT B_i1 = TO_TWOWAY_PROMOTED_ET(B[i]); - - const twoway_promoted_eT A_i2 = TO_TWOWAY_PROMOTED_ET(A[i + blockDim.x]); - const twoway_promoted_eT B_i2 = TO_TWOWAY_PROMOTED_ET(B[i + blockDim.x]); - - // copy to local shared memory - aux_mem[tid] += (A_i1 * B_i1) + (A_i2 * B_i2); - i += grid_size; - } - if (i < n_elem) - { - const twoway_promoted_eT A_i1 = TO_TWOWAY_PROMOTED_ET(A[i]); - const twoway_promoted_eT B_i1 = TO_TWOWAY_PROMOTED_ET(B[i]); - - aux_mem[tid] += (A_i1 * B_i1); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/htrans.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/htrans.cu deleted file mode 100644 index 4dc7d45..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/htrans.cu +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2022 Ryan Curtin 
(http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// TODO: once we have complex support, this will need to be amended -__global__ -void -COOT_FN(PREFIX,htrans)(eT2* out, - const eT1* in, - const UWORD in_n_rows, - const UWORD in_n_cols) - { - // For a non-inplace transpose, we can use a pretty naive approach. - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD in_offset = row + col * in_n_rows; - const UWORD out_offset = col + row * in_n_cols; - - if( (row < in_n_rows) && (col < in_n_cols) ) - { - const eT2 element = TO_ET2(in[in_offset]); - out[out_offset] = element; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_div_array.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_div_array.cu deleted file mode 100644 index 6cab7e1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_div_array.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve1_div_array)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] /= TO_ET2(src[i]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_div_sve1.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_div_sve1.cu deleted file mode 100644 index 4b5524b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_div_sve1.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve1_div_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] /= TO_ET2(src[src_locs[i]]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_eq_array.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_eq_array.cu deleted file mode 100644 index db7051c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_eq_array.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve1_eq_array)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] = TO_ET2(src[i]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_eq_sve1.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_eq_sve1.cu deleted file mode 100644 index 8e45725..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_eq_sve1.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve1_eq_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] = TO_ET2(src[src_locs[i]]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_minus_array.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_minus_array.cu deleted file mode 100644 index 4447bf4..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_minus_array.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve1_minus_array)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] -= TO_ET2(src[i]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_minus_sve1.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_minus_sve1.cu deleted file mode 100644 index aa3141b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_minus_sve1.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve1_minus_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] -= TO_ET2(src[src_locs[i]]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_mul_array.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_mul_array.cu deleted file mode 100644 index 97efc6e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_mul_array.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve1_mul_array)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] *= TO_ET2(src[i]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_mul_sve1.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_mul_sve1.cu deleted file mode 100644 index 6bb3583..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_mul_sve1.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve1_mul_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] *= TO_ET2(src[src_locs[i]]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_plus_array.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_plus_array.cu deleted file mode 100644 index f2aa01c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_plus_array.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve1_plus_array)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] += TO_ET2(src[i]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_plus_sve1.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_plus_sve1.cu deleted file mode 100644 index 6f7dd53..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve1_plus_sve1.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve1_plus_sve1)(eT2* dest, - const UWORD* dest_locs, - const eT1* src, - const UWORD* src_locs, - const UWORD n_elem) - { - const UWORD i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i < n_elem) - { - dest[dest_locs[i]] += TO_ET2(src[src_locs[i]]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_div_array.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_div_array.cu deleted file mode 100644 index 427ec81..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_div_array.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve2_div_array)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - (dest_n_rows * ((dest_col_locs == NULL) ? 
col : dest_col_locs[col])); - const UWORD src_loc = row + col * n_rows; - - dest[dest_loc] /= TO_ET2(src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_div_sve2.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_div_sve2.cu deleted file mode 100644 index ea73f4f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_div_sve2.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve2_div_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? 
col : src_col_locs[col]); - - dest[dest_loc] /= TO_ET2(src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_eq_array.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_eq_array.cu deleted file mode 100644 index c19ee2d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_eq_array.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve2_eq_array)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - (dest_n_rows * ((dest_col_locs == NULL) ? 
col : dest_col_locs[col])); - const UWORD src_loc = row + col * n_rows; - - dest[dest_loc] = TO_ET2(src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_eq_sve2.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_eq_sve2.cu deleted file mode 100644 index 293bb28..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_eq_sve2.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve2_eq_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? 
col : src_col_locs[col]); - - dest[dest_loc] = TO_ET2(src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_minus_array.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_minus_array.cu deleted file mode 100644 index a8c1655..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_minus_array.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve2_minus_array)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - (dest_n_rows * ((dest_col_locs == NULL) ? 
col : dest_col_locs[col])); - const UWORD src_loc = row + col * n_rows; - - dest[dest_loc] -= TO_ET2(src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_minus_sve2.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_minus_sve2.cu deleted file mode 100644 index 94988f1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_minus_sve2.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve2_minus_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? 
col : src_col_locs[col]); - - dest[dest_loc] -= TO_ET2(src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_mul_array.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_mul_array.cu deleted file mode 100644 index fa47196..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_mul_array.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve2_mul_array)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - (dest_n_rows * ((dest_col_locs == NULL) ? 
col : dest_col_locs[col])); - const UWORD src_loc = row + col * n_rows; - - dest[dest_loc] *= TO_ET2(src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_mul_sve2.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_mul_sve2.cu deleted file mode 100644 index 52eca01..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_mul_sve2.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve2_mul_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? 
col : src_col_locs[col]); - - dest[dest_loc] *= TO_ET2(src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_plus_array.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_plus_array.cu deleted file mode 100644 index c7830d2..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_plus_array.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve2_plus_array)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - (dest_n_rows * ((dest_col_locs == NULL) ? 
col : dest_col_locs[col])); - const UWORD src_loc = row + col * n_rows; - - dest[dest_loc] += TO_ET2(src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_plus_sve2.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_plus_sve2.cu deleted file mode 100644 index 22c23f8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/inplace_sve2_plus_sve2.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,inplace_sve2_plus_sve2)(eT2* dest, - const UWORD* dest_row_locs, - const UWORD* dest_col_locs, - const eT1* src, - const UWORD* src_row_locs, - const UWORD* src_col_locs, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = ((dest_row_locs == NULL) ? row : dest_row_locs[row]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col]); - const UWORD src_loc = ((src_row_locs == NULL) ? row : src_row_locs[row]) + - src_n_rows * ((src_col_locs == NULL) ? 
col : src_col_locs[col]); - - dest[dest_loc] += TO_ET2(src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_colwise_conv_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_colwise_conv_post.cu deleted file mode 100644 index 0348de6..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_colwise_conv_post.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,max_colwise_conv_post)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[col * src_M_n_rows]); - eT1 acc = colptr[0]; - for (UWORD i = 1; i < n_rows; ++i) - { - acc = max(acc, colptr[i]); - } - - dest[col * dest_mem_incr] = TO_ET2(acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_colwise_conv_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_colwise_conv_pre.cu deleted file mode 100644 index fa94bc3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_colwise_conv_pre.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,max_colwise_conv_pre)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[col * src_M_n_rows]); - eT2 acc = TO_ET2(colptr[0]); - for (UWORD i = 1; i < n_rows; ++i) - { - acc = max(acc, TO_ET2(colptr[i])); - } - - dest[col * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_cube_col_conv_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_cube_col_conv_post.cu deleted file mode 100644 index f6dd6f3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_cube_col_conv_post.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,max_cube_col_conv_post)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD slice = blockIdx.y * blockDim.y + threadIdx.y; - - if(row < n_rows && slice < n_slices) - { - eT1 acc = src[row + slice * n_rows * n_cols]; - for (UWORD i = 1; i < n_cols; ++i) - { - acc = max(acc, src[(i * n_rows) + row + slice * n_rows * n_cols]); - } - - dest[row + slice * n_rows] = TO_ET2(acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_cube_col_conv_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_cube_col_conv_pre.cu deleted file mode 100644 index 7b2f2ce..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_cube_col_conv_pre.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,max_cube_col_conv_pre)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD slice = blockIdx.y * blockDim.y + threadIdx.y; - - if(row < n_rows && slice < n_slices) - { - eT2 acc = TO_ET2(src[row + slice * n_rows * n_cols]); - for (UWORD i = 1; i < n_cols; ++i) - { - acc = max(acc, TO_ET2(src[(i * n_rows) + row + slice * n_rows * n_cols])); - } - - dest[row + slice * n_rows] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_rowwise_conv_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_rowwise_conv_post.cu deleted file mode 100644 index b456f62..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_rowwise_conv_post.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,max_rowwise_conv_post)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT1 acc = src[row]; - for (UWORD i = 1; i < n_cols; ++i) - { - acc = max(acc, src[(i * src_M_n_rows) + row]); - } - - dest[row * dest_mem_incr] = TO_ET2(acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_rowwise_conv_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_rowwise_conv_pre.cu deleted file mode 100644 index 28b6c20..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/max_rowwise_conv_pre.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,max_rowwise_conv_pre)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT2 acc = TO_ET2(src[row]); - for (UWORD i = 1; i < n_cols; ++i) - { - acc = max(acc, TO_ET2(src[(i * src_M_n_rows) + row])); - } - - dest[row * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_colwise_conv_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_colwise_conv_post.cu deleted file mode 100644 index e12c7a5..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_colwise_conv_post.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,mean_colwise_conv_post)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[ col * src_M_n_rows ]); - eT1 acc = TO_ET1(0); - for (UWORD i = 0; i < n_rows; ++i) - { - acc += colptr[i]; - } - - dest[col * dest_mem_incr] = TO_ET2(acc / TO_ET1(n_rows)); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_colwise_conv_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_colwise_conv_pre.cu deleted file mode 100644 index f0c17ee..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_colwise_conv_pre.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,mean_colwise_conv_pre)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[ col * src_M_n_rows ]); - eT2 acc = TO_ET2(0); - for (UWORD i = 0; i < n_rows; ++i) - { - acc += TO_ET2(colptr[i]); - } - - dest[col * dest_mem_incr] = (acc / TO_ET2(n_rows)); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_rowwise_conv_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_rowwise_conv_post.cu deleted file mode 100644 index 21b41d8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_rowwise_conv_post.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,mean_rowwise_conv_post)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT1 acc = TO_ET1(0); - for (UWORD i = 0; i < n_cols; ++i) - { - acc += src[i * src_M_n_rows + row]; - } - - dest[row * dest_mem_incr] = TO_ET2(acc / TO_ET1(n_cols)); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_rowwise_conv_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_rowwise_conv_pre.cu deleted file mode 100644 index c3edf00..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/mean_rowwise_conv_pre.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,mean_rowwise_conv_pre)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT2 acc = TO_ET2(0); - for (UWORD i = 0; i < n_cols; ++i) - { - acc += TO_ET2(src[i * src_M_n_rows + row]); - } - - dest[row * dest_mem_incr] = (acc / TO_ET2(n_cols)); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_colwise_conv_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_colwise_conv_post.cu deleted file mode 100644 index 026597c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_colwise_conv_post.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,min_colwise_conv_post)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[col * src_M_n_rows]); - eT1 acc = colptr[0]; - for (UWORD i = 1; i < n_rows; ++i) - { - acc = min(acc, colptr[i]); - } - - dest[col * dest_mem_incr] = TO_ET2(acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_colwise_conv_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_colwise_conv_pre.cu deleted file mode 100644 index a5f7086..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_colwise_conv_pre.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,min_colwise_conv_pre)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[col * src_M_n_rows]); - eT2 acc = TO_ET2(colptr[0]); - for (UWORD i = 1; i < n_rows; ++i) - { - acc = min(acc, TO_ET2(colptr[i])); - } - - dest[col * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_cube_col_conv_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_cube_col_conv_post.cu deleted file mode 100644 index 161a7c3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_cube_col_conv_post.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,min_cube_col_conv_post)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD slice = blockIdx.y * blockDim.y + threadIdx.y; - - if(row < n_rows && slice < n_slices) - { - eT1 acc = src[row + slice * n_rows * n_cols]; - for (UWORD i = 1; i < n_cols; ++i) - { - acc = min(acc, src[(i * n_rows) + row + slice * n_rows * n_cols]); - } - - dest[row + slice * n_rows] = TO_ET2(acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_cube_col_conv_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_cube_col_conv_pre.cu deleted file mode 100644 index d62aea0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_cube_col_conv_pre.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,min_cube_col_conv_pre)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD slice = blockIdx.y * blockDim.y + threadIdx.y; - - if(row < n_rows && slice < n_slices) - { - eT2 acc = TO_ET2(src[row + slice * n_rows * n_cols]); - for (UWORD i = 1; i < n_cols; ++i) - { - acc = min(acc, TO_ET2(src[(i * n_rows) + row + slice * n_rows * n_cols])); - } - - dest[row + slice * n_rows] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_rowwise_conv_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_rowwise_conv_post.cu deleted file mode 100644 index 8e125af..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_rowwise_conv_post.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,min_rowwise_conv_post)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT1 acc = src[row]; - for (UWORD i = 1; i < n_cols; ++i) - { - acc = min(acc, src[(i * src_M_n_rows) + row]); - } - - dest[row * dest_mem_incr] = TO_ET2(acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_rowwise_conv_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_rowwise_conv_pre.cu deleted file mode 100644 index aadf89a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/min_rowwise_conv_pre.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,min_rowwise_conv_pre)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT2 acc = TO_ET2(src[row]); - for (UWORD i = 1; i < n_cols; ++i) - { - acc = min(acc, TO_ET2(src[(i * src_M_n_rows) + row])); - } - - dest[row * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq.cu deleted file mode 100644 index b6390cd..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq.cu +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// this kernel is technically incorrect if the size is not a factor of 2! 
-__global__ -void -COOT_FN(PREFIX,rel_all_neq)(const eT1* X, // will be casted to eT2 before comparison - const UWORD n_elem, - uint* out, - const eT2 val) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 1; - - while (i + blockDim.x < n_elem) - { - const eT2 val1 = TO_ET2(X[i]); - const eT2 val2 = TO_ET2(X[i + blockDim.x]); - - aux_mem[tid] &= (val1 != val); - aux_mem[tid] &= (val2 != val); - i += grid_size; - } - if (i < n_elem) - { - const eT2 val1 = TO_ET2(X[i]); - - aux_mem[tid] &= (val1 != val); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - and_subgroup_reduce_u32(aux_mem, tid); - } - - if (tid == 0) - { - out[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_colwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_colwise.cu deleted file mode 100644 index 510e92f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_colwise.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_all_neq_colwise)(UWORD* out, - const eT1* A, - const eT2 val, - const UWORD A_n_rows, - const UWORD A_n_cols) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < A_n_cols) - { - const eT1* colptr = &(A[ col*A_n_rows ]); - UWORD result = 1; - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT2 val1 = TO_ET2(colptr[i]); - result &= (val1 != val); - } - - out[col] = result; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_rowwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_rowwise.cu deleted file mode 100644 index cd04aaf..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_rowwise.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_all_neq_rowwise)(UWORD* out, - const eT1* A, - const eT2 val, - const UWORD A_n_rows, - const UWORD A_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < A_n_rows) - { - UWORD result = 1; - for (UWORD i = 0; i < A_n_cols; ++i) - { - const eT2 val1 = TO_ET2(A[i * A_n_rows + row]); - result &= (val1 != val); - } - - out[row] = result; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_small.cu deleted file mode 100644 index 396e49e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_all_neq_small.cu +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_all_neq_small)(const eT1* X, // will be casted to eT2 before comparison - const UWORD n_elem, - uint* out, - const eT2 val) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 1; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT2 val1 = TO_ET2(X[i]); - const eT2 val2 = TO_ET2(X[i + blockDim.x]); - - aux_mem[tid] &= (val1 != val); - aux_mem[tid] &= (val2 != val); - i += grid_size; - } - if (i < n_elem) - { - const eT2 val1 = TO_ET2(X[i]); - - aux_mem[tid] &= (val1 != val); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq.cu deleted file mode 100644 index ebb2e8f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq.cu +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -// this kernel is technically incorrect if the size is not a factor of 2! -__global__ -void -COOT_FN(PREFIX,rel_any_neq)(const eT1* X, // will be casted to eT2 before comparison - const UWORD n_elem, - uint* out, - const eT2 val) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - const eT2 val1 = TO_ET2(X[i]); - const eT2 val2 = TO_ET2(X[i + blockDim.x]); - - aux_mem[tid] |= (val1 != val); - aux_mem[tid] |= (val2 != val); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT2 val1 = TO_ET2(X[i]); - - aux_mem[tid] |= (val1 != val); - } - __syncthreads(); - - for (UWORD s = blockDim.x / 2; s > 32; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - __syncthreads(); - } - - if (tid < 32) // unroll last warp's worth of work - { - or_subgroup_reduce_u32(aux_mem, tid); - } - - if (tid == 0) - { - out[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_colwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_colwise.cu deleted file mode 100644 index 49694df..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_colwise.cu +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_any_neq_colwise)(UWORD* out, - const eT1* A, - const eT2 val, - const UWORD A_n_rows, - const UWORD A_n_cols) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < A_n_cols) - { - const eT1* colptr = &(A[ col*A_n_rows ]); - UWORD result = 0; - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT2 val1 = TO_ET2(colptr[i]); - result |= (val1 != val); - if (result == 1) - break; - } - - out[col] = result; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_rowwise.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_rowwise.cu deleted file mode 100644 index 8656a3b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_rowwise.cu +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_any_neq_rowwise)(UWORD* out, - const eT1* A, - const eT2 val, - const UWORD A_n_rows, - const UWORD A_n_cols) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < A_n_rows) - { - UWORD result = 0; - for (UWORD i = 0; i < A_n_cols; ++i) - { - const eT2 val1 = TO_ET2(A[i * A_n_rows + row]); - result |= (val1 != val); - if (result == 1) - break; - } - - out[row] = result; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_small.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_small.cu deleted file mode 100644 index e79e394..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/rel_any_neq_small.cu +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,rel_any_neq_small)(const eT1* X, // will be casted to eT2 before comparison - const UWORD n_elem, - uint* out, - const eT2 val) - { - uint* aux_mem = (uint*) aux_shared_mem; - - const UWORD tid = threadIdx.x; - UWORD i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - const UWORD grid_size = blockDim.x * 2 * gridDim.x; - - aux_mem[tid] = 0; - - while (i + blockDim.x < n_elem) - { - // copy to local shared memory - const eT2 val1 = TO_ET2(X[i]); - const eT2 val2 = TO_ET2(X[i + blockDim.x]); - - aux_mem[tid] |= (val1 != val); - aux_mem[tid] |= (val2 != val); - if (aux_mem[tid] == 1) - break; - - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT2 val1 = TO_ET2(X[i]); - - aux_mem[tid] |= (val1 != val); - } - - for (UWORD s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out[blockIdx.x] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/strans.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/strans.cu deleted file mode 100644 index 0e9dc7b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/strans.cu +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,strans)(eT2* out, - const eT1* in, - const UWORD in_n_rows, - const UWORD in_n_cols) - { - // For a non-inplace transpose, we can use a pretty naive approach. - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - const UWORD in_offset = row + col * in_n_rows; - const UWORD out_offset = col + row * in_n_cols; - - if( (row < in_n_rows) && (col < in_n_cols) ) - { - const eT2 element = TO_ET2(in[in_offset]); - out[out_offset] = element; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_colwise_conv_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_colwise_conv_post.cu deleted file mode 100644 index d82bf19..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_colwise_conv_post.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,sum_colwise_conv_post)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[col * src_M_n_rows]); - eT1 acc = TO_ET1(0); - for (UWORD i = 0; i < n_rows; ++i) - { - acc += colptr[i]; - } - - dest[col * dest_mem_incr] = TO_ET2(acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_colwise_conv_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_colwise_conv_pre.cu deleted file mode 100644 index bc9816b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_colwise_conv_pre.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,sum_colwise_conv_pre)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = blockIdx.x * blockDim.x + threadIdx.x; - if(col < n_cols) - { - const eT1* colptr = &(src[col * src_M_n_rows]); - eT2 acc = TO_ET2(0); - for (UWORD i = 0; i < n_rows; ++i) - { - acc += TO_ET2(colptr[i]); - } - - dest[col * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_rowwise_conv_post.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_rowwise_conv_post.cu deleted file mode 100644 index 26b257e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_rowwise_conv_post.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,sum_rowwise_conv_post)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT1 acc = TO_ET1(0); - for (UWORD i = 0; i < n_cols; ++i) - { - acc += src[(i * src_M_n_rows) + row]; - } - - dest[row * dest_mem_incr] = TO_ET2(acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_rowwise_conv_pre.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_rowwise_conv_pre.cu deleted file mode 100644 index 49425dd..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/sum_rowwise_conv_pre.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__global__ -void -COOT_FN(PREFIX,sum_rowwise_conv_pre)(eT2* dest, - const eT1* src, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - if(row < n_rows) - { - eT2 acc = TO_ET2(0); - for (UWORD i = 0; i < n_cols; ++i) - { - acc += TO_ET2(src[(i * src_M_n_rows) + row]); - } - - dest[row * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/symmatl.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/symmatl.cu deleted file mode 100644 index b0ec3ce..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/symmatl.cu +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,symmatl)(eT2* out, - const eT1* A, - const UWORD size) // matrix is expected to be square - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < size && col < size && row >= col) - { - const eT2 val = TO_ET2(A[row + size * col]); - - out[col + size * row] = val; - out[row + size * col] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/symmatu.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/symmatu.cu deleted file mode 100644 index 34dfbd6..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/twoway/symmatu.cu +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__global__ -void -COOT_FN(PREFIX,symmatu)(eT2* out, - const eT1* A, - const UWORD size) // matrix is expected to be square - { - const UWORD row = blockIdx.x * blockDim.x + threadIdx.x; - const UWORD col = blockIdx.y * blockDim.y + threadIdx.y; - - if (row < size && col < size && col >= row) - { - const eT2 val = TO_ET2(A[row + size * col]); - - out[row + size * col] = val; - out[col + size * row] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/cuda/zeroway/shuffle_large_compute_locs.cu b/inst/include/bandicoot_bits/ks/kernels/cuda/zeroway/shuffle_large_compute_locs.cu deleted file mode 100644 index 9a78caa..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/cuda/zeroway/shuffle_large_compute_locs.cu +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2021 Marcus Edel (http://www.kurg.org/) -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// This performs the first part of the shuffle_vec kernel: it computes random -// locations for the output using the variable philox bijective shuffle, -// and then does the first step of the output compression (the upsweep of the -// shifted prefix sum). 
-__global__ -void -shuffle_large_compute_locs(UWORD* out_block_mem, - const UWORD n_elem, - const UWORD n_elem_pow2, - const UWORD* philox_key, - const UWORD num_bits) - { - UWORD* aux_mem = (UWORD*) aux_shared_mem; - - const UWORD tid = threadIdx.x + blockIdx.x * blockDim.x; - const UWORD local_tid = threadIdx.x; - const UWORD local_size = blockDim.x; - - // Get our bijective shuffle location. - const UWORD in_loc = var_philox(tid, philox_key, num_bits); - - // Fill aux_mem with the indicator of whether we are out of bounds. - // Then, we'll prefix-sum it. This will tell us where to put our result. - aux_mem[local_tid] = (in_loc < n_elem); - __syncthreads(); - - // Now, prefix-sum the auxiliary memory. - // This allows us to do the shuffle-compaction step. - UWORD offset = 1; - for (UWORD s = local_size / 2; s > 0; s >>= 1) - { - if (local_tid < s) - { - const UWORD ai = offset * (2 * local_tid + 1) - 1; - const UWORD bi = offset * (2 * local_tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - __syncthreads(); - } - - if (local_tid == 0) - { - out_block_mem[blockIdx.x] = aux_mem[local_size - 1]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/d_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/d_defs.cl deleted file mode 100644 index a4801ac..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/d_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -inline double coot_type_min_double() { return -DBL_MAX; } -inline double coot_type_minpos_double() { return DBL_MIN; } -inline double coot_type_max_double() { return DBL_MAX; } - -inline bool coot_is_fp_double() { return true; } -inline bool coot_is_signed_double() { return true; } -inline bool coot_isnan_double(const double x) { return isnan(x); } - -inline double coot_absdiff_double(const double x, const double y) { return fabs(x - y); } - -inline double coot_conj_double(const double x) { return x; } - -inline double coot_plus_double(const double a, const double b) { return a + b; } -inline double coot_minus_double(const double a, const double b) { return a - b; } -inline double coot_mul_double(const double a, const double b) { return a * b; } -inline double coot_div_double(const double a, const double b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/f_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/f_defs.cl deleted file mode 100644 index be682a1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/f_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -inline float coot_type_min_float() { return -FLT_MAX; } -inline float coot_type_minpos_float() { return FLT_MIN; } -inline float coot_type_max_float() { return FLT_MAX; } - -inline bool coot_is_fp_float() { return true; } -inline bool coot_is_signed_float() { return true; } -inline bool coot_isnan_float(const float x) { return isnan(x); } - -inline float coot_absdiff_float(const float x, const float y) { return fabs(x - y); } - -inline float coot_conj_float(const float x) { return x; } - -inline float coot_plus_float(const float a, const float b) { return a + b; } -inline float coot_minus_float(const float a, const float b) { return a - b; } -inline float coot_mul_float(const float a, const float b) { return a * b; } -inline float coot_div_float(const float a, const float b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/h_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/h_defs.cl deleted file mode 100644 index f8215c5..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/h_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -inline half coot_type_min_half() { return -HALF_MAX; } -inline half coot_type_minpos_half() { return HALF_MIN; } -inline half coot_type_max_half() { return HALF_MAX; } - -inline bool coot_is_fp_half() { return true; } -inline bool coot_is_signed_half() { return true; } -inline bool coot_isnan_half(const half x) { return isnan(x); } - -inline half coot_absdiff_half(const half x, const half y) { return fabs(x - y); } - -inline half coot_conj_half(const half x) { return x; } - -inline half coot_plus_half(const half a, const half b) { return a + b; } -inline half coot_minus_half(const half a, const half b) { return a - b; } -inline half coot_mul_half(const half a, const half b) { return a * b; } -inline half coot_div_half(const half a, const half b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/opencl_prelims.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/opencl_prelims.cl deleted file mode 100644 index 93be9b6..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/opencl_prelims.cl +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// These statically-compiled definitions are available in any Bandicoot kernel. 
-typedef float2 cx_float; -#ifdef COOT_HAVE_FP64 -typedef double2 cx_double; -#endif - -#define COOT_FN2(ARG1,ARG2) ARG1 ## ARG2 -#define COOT_FN(ARG1,ARG2) COOT_FN2(ARG1,ARG2) - -#define COOT_FN_3_2(ARG1,ARG2,ARG3) ARG1 ## ARG2 ## ARG3 -#define COOT_FN_3(ARG1,ARG2,ARG3) COOT_FN_3_2(ARG1,ARG2,ARG3) - -// Sometimes we need to approximate Armadillo functionality that uses -// double---but double may not be available. So we do our best... -#ifdef COOT_HAVE_FP64 - #define ARMA_FP_TYPE double - #define ARMA_FP_MAX DBL_MAX - #define ARMA_FP_MIN DBL_MIN -#else - #define ARMA_FP_TYPE float - #define ARMA_FP_MAX FLT_MAX - #define ARMA_FP_MIN FLT_MIN -#endif diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s16_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s16_defs.cl deleted file mode 100644 index 98898a3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s16_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -inline short coot_type_min_short() { return COOT_S16_MIN; } -inline short coot_type_minpos_short() { return 1; } -inline short coot_type_max_short() { return COOT_S16_MAX; } - -inline bool coot_is_fp_short() { return false; } -inline bool coot_is_signed_short() { return true; } -inline bool coot_isnan_short(const short x) { return false; } - -inline short coot_absdiff_short(const short x, const short y) { return abs(x - y); } - -inline short coot_conj_short(const short x) { return x; } - -inline short coot_plus_short(const short a, const short b) { return a + b; } -inline short coot_minus_short(const short a, const short b) { return a - b; } -inline short coot_mul_short(const short a, const short b) { return a * b; } -inline short coot_div_short(const short a, const short b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s32_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s32_defs.cl deleted file mode 100644 index 9403ec6..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s32_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -inline int coot_type_min_int() { return COOT_S32_MIN; } -inline int coot_type_minpos_int() { return 1; } -inline int coot_type_max_int() { return COOT_S32_MAX; } - -inline bool coot_is_fp_int() { return false; } -inline bool coot_is_signed_int() { return true; } -inline bool coot_isnan_int(const int x) { return false; } - -inline int coot_absdiff_int(const int x, const int y) { return abs(x - y); } - -inline int coot_conj_int(const int x) { return x; } - -inline int coot_plus_int(const int a, const int b) { return a + b; } -inline int coot_minus_int(const int a, const int b) { return a - b; } -inline int coot_mul_int(const int a, const int b) { return a * b; } -inline int coot_div_int(const int a, const int b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s64_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s64_defs.cl deleted file mode 100644 index 93c9c96..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s64_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -inline long coot_type_min_long() { return COOT_S64_MIN; } -inline long coot_type_minpos_long() { return 1; } -inline long coot_type_max_long() { return COOT_S64_MAX; } - -inline bool coot_is_fp_long() { return false; } -inline bool coot_is_signed_long() { return true; } -inline bool coot_isnan_long(const long x) { return false; } - -inline long coot_absdiff_long(const long x, const long y) { return abs(x - y); } - -inline long coot_conj_long(const long x) { return x; } - -inline long coot_plus_long(const long a, const long b) { return a + b; } -inline long coot_minus_long(const long a, const long b) { return a - b; } -inline long coot_mul_long(const long a, const long b) { return a * b; } -inline long coot_div_long(const long a, const long b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s8_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s8_defs.cl deleted file mode 100644 index 20fb6fb..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/s8_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -inline char coot_type_min_char() { return COOT_S8_MIN; } -inline char coot_type_minpos_char() { return 1; } -inline char coot_type_max_char() { return COOT_S8_MAX; } - -inline bool coot_is_fp_char() { return false; } -inline bool coot_is_signed_char() { return true; } -inline bool coot_isnan_char(const char x) { return false; } - -inline char coot_absdiff_char(const char x, const char y) { return abs(x - y); } - -inline char coot_conj_char(const char x) { return x; } - -inline char coot_plus_char(const char a, const char b) { return a + b; } -inline char coot_minus_char(const char a, const char b) { return a - b; } -inline char coot_mul_char(const char a, const char b) { return a * b; } -inline char coot_div_char(const char a, const char b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u16_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u16_defs.cl deleted file mode 100644 index 5848c03..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u16_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -inline ushort coot_type_min_ushort() { return 0; } -inline ushort coot_type_minpos_ushort() { return 1; } -inline ushort coot_type_max_ushort() { return COOT_U16_MAX; } - -inline bool coot_is_fp_ushort() { return false; } -inline bool coot_is_signed_ushort() { return false; } -inline bool coot_isnan_ushort(const ushort x) { return false; } - -inline ushort coot_absdiff_ushort(const ushort x, const ushort y) { return (x > y) ? (x - y) : (y - x); } - -inline ushort coot_conj_ushort(const ushort x) { return x; } - -inline ushort coot_plus_ushort(const ushort a, const ushort b) { return a + b; } -inline ushort coot_minus_ushort(const ushort a, const ushort b) { return a - b; } -inline ushort coot_mul_ushort(const ushort a, const ushort b) { return a * b; } -inline ushort coot_div_ushort(const ushort a, const ushort b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u32_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u32_defs.cl deleted file mode 100644 index e427814..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u32_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -inline uint coot_type_min_uint() { return 0; } -inline uint coot_type_minpos_uint() { return 1; } -inline uint coot_type_max_uint() { return COOT_U32_MAX; } - -inline bool coot_is_fp_uint() { return false; } -inline bool coot_is_signed_uint() { return false; } -inline bool coot_isnan_uint(const uint x) { return false; } - -inline uint coot_absdiff_uint(const uint x, const uint y) { return (x > y) ? (x - y) : (y - x); } - -inline uint coot_conj_uint(const uint x) { return x; } - -inline uint coot_plus_uint(const uint a, const uint b) { return a + b; } -inline uint coot_minus_uint(const uint a, const uint b) { return a - b; } -inline uint coot_mul_uint(const uint a, const uint b) { return a * b; } -inline uint coot_div_uint(const uint a, const uint b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u64_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u64_defs.cl deleted file mode 100644 index 3e308b3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u64_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -inline ulong coot_type_min_ulong() { return 0; } -inline ulong coot_type_minpos_ulong() { return 1; } -inline ulong coot_type_max_ulong() { return COOT_U64_MAX; } - -inline bool coot_is_fp_ulong() { return false; } -inline bool coot_is_signed_ulong() { return false; } -inline bool coot_isnan_ulong(const ulong x) { return false; } - -inline ulong coot_absdiff_ulong(const ulong x, const ulong y) { return (x > y) ? (x - y) : (y - x); } - -inline ulong coot_conj_ulong(const ulong x) { return x; } - -inline ulong coot_plus_ulong(const ulong a, const ulong b) { return a + b; } -inline ulong coot_minus_ulong(const ulong a, const ulong b) { return a - b; } -inline ulong coot_mul_ulong(const ulong a, const ulong b) { return a * b; } -inline ulong coot_div_ulong(const ulong a, const ulong b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u8_defs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u8_defs.cl deleted file mode 100644 index fda8d45..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/u8_defs.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -inline uchar coot_type_min_uchar() { return 0; } -inline uchar coot_type_minpos_uchar() { return 1; } -inline uchar coot_type_max_uchar() { return COOT_U8_MAX; } - -inline bool coot_is_fp_uchar() { return false; } -inline bool coot_is_signed_uchar() { return false; } -inline bool coot_isnan_uchar(const uchar x) { return false; } - -inline uchar coot_absdiff_uchar(const uchar x, const uchar y) { return (x > y) ? (x - y) : (y - x); } - -inline uchar coot_conj_uchar(const uchar x) { return x; } - -inline uchar coot_plus_uchar(const uchar a, const uchar b) { return a + b; } -inline uchar coot_minus_uchar(const uchar a, const uchar b) { return a - b; } -inline uchar coot_mul_uchar(const uchar a, const uchar b) { return a * b; } -inline uchar coot_div_uchar(const uchar a, const uchar b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/accu_subgroup_reduce.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/deps/accu_subgroup_reduce.cl deleted file mode 100644 index 2063b30..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/accu_subgroup_reduce.cl +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -void -COOT_FN(PREFIX,accu_subgroup_reduce_other)(__local volatile eT1* data, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - data[tid] += data[tid + i]; - - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -COOT_FN(PREFIX,accu_subgroup_reduce_8)(__local volatile eT1* data, UWORD tid) - { - data[tid] += data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,accu_subgroup_reduce_16)(__local volatile eT1* data, UWORD tid) - { - data[tid] += data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,accu_subgroup_reduce_32)(__local volatile eT1* data, UWORD tid) - { - data[tid] += data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,accu_subgroup_reduce_64)(__local volatile eT1* data, UWORD tid) - { - data[tid] += data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 4]; - 
SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,accu_subgroup_reduce_128)(__local volatile eT1* data, UWORD tid) - { - data[tid] += data[tid + 128]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 1]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/and_subgroup_reduce_u32.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/deps/and_subgroup_reduce_u32.cl deleted file mode 100644 index 2b532e2..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/and_subgroup_reduce_u32.cl +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -void -and_subgroup_reduce_other_u32(__local volatile uint* data, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - data[tid] &= data[tid + i]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -and_subgroup_reduce_8_u32(__local volatile uint* data, UWORD tid) - { - data[tid] &= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 1]; - } - - - -void -and_subgroup_reduce_16_u32(__local volatile uint* data, UWORD tid) - { - data[tid] &= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 1]; - } - - - -void -and_subgroup_reduce_32_u32(__local volatile uint* data, UWORD tid) - { - data[tid] &= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 1]; - } - - - -void -and_subgroup_reduce_64_u32(__local volatile uint* data, UWORD tid) - { - data[tid] &= data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 2]; - 
SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 1]; - } - - - -void -and_subgroup_reduce_128_u32(__local volatile uint* data, UWORD tid) - { - data[tid] &= data[tid + 128]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 1]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/max_subgroup_reduce.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/deps/max_subgroup_reduce.cl deleted file mode 100644 index c4d855a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/max_subgroup_reduce.cl +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright 2021 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -void -COOT_FN(PREFIX,max_subgroup_reduce_other)(__local volatile eT1* data, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - data[tid] = max(data[tid], data[tid + i]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -COOT_FN(PREFIX,max_subgroup_reduce_8)(__local volatile eT1* data, UWORD tid) - { - data[tid] = max(data[tid], data[tid + 8]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 4]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 2]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 1]); - } - - - -void -COOT_FN(PREFIX,max_subgroup_reduce_16)(__local volatile eT1* data, UWORD tid) - { - data[tid] = max(data[tid], data[tid + 16]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 8]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 4]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 2]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 1]); - } - - - -void -COOT_FN(PREFIX,max_subgroup_reduce_32)(__local volatile eT1* data, UWORD tid) - { - data[tid] = max(data[tid], data[tid + 32]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 16]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 8]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 4]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 2]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 1]); - } - - - -void -COOT_FN(PREFIX,max_subgroup_reduce_64)(__local volatile eT1* data, UWORD tid) - { - data[tid] = max(data[tid], data[tid + 64]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = 
max(data[tid], data[tid + 32]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 16]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 8]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 4]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 2]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 1]); - } - - - -void -COOT_FN(PREFIX,max_subgroup_reduce_128)(__local volatile eT1* data, UWORD tid) - { - data[tid] = max(data[tid], data[tid + 128]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 64]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 32]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 16]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 8]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 4]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 2]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = max(data[tid], data[tid + 1]); - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/min_subgroup_reduce.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/deps/min_subgroup_reduce.cl deleted file mode 100644 index 3cab836..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/min_subgroup_reduce.cl +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright 2021 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -void -COOT_FN(PREFIX,min_subgroup_reduce_other)(__local volatile eT1* data, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - data[tid] = min(data[tid], data[tid + i]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -COOT_FN(PREFIX,min_subgroup_reduce_8)(__local volatile eT1* data, UWORD tid) - { - data[tid] = min(data[tid], data[tid + 8]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 4]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 2]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 1]); - } - - - -void -COOT_FN(PREFIX,min_subgroup_reduce_16)(__local volatile eT1* data, UWORD tid) - { - data[tid] = min(data[tid], data[tid + 16]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 8]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 4]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 2]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 1]); - } - - - -void -COOT_FN(PREFIX,min_subgroup_reduce_32)(__local volatile eT1* data, UWORD tid) - { - data[tid] = min(data[tid], data[tid + 32]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 16]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 8]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - 
data[tid] = min(data[tid], data[tid + 4]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 2]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 1]); - } - - - -void -COOT_FN(PREFIX,min_subgroup_reduce_64)(__local volatile eT1* data, UWORD tid) - { - data[tid] = min(data[tid], data[tid + 64]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 32]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 16]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 8]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 4]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 2]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 1]); - } - - - -void COOT_FN(PREFIX,min_subgroup_reduce_128)(__local volatile eT1* data, UWORD tid) - { - data[tid] = min(data[tid], data[tid + 128]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 64]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 32]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 16]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 8]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 4]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 2]); - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] = min(data[tid], data[tid + 1]); - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/or_subgroup_reduce_u32.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/deps/or_subgroup_reduce_u32.cl deleted file mode 100644 index 98a8d5a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/or_subgroup_reduce_u32.cl +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright 2023 Ryan Curtin 
(https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -void -or_subgroup_reduce_other_u32(__local volatile uint* data, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - data[tid] |= data[tid + i]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -or_subgroup_reduce_8_u32(__local volatile uint* data, UWORD tid) - { - data[tid] |= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 1]; - } - - - -void -or_subgroup_reduce_16_u32(__local volatile uint* data, UWORD tid) - { - data[tid] |= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 1]; - } - - - -void -or_subgroup_reduce_32_u32(__local volatile uint* data, UWORD tid) - { - data[tid] |= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - 
data[tid] |= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 1]; - } - - - -void -or_subgroup_reduce_64_u32(__local volatile uint* data, UWORD tid) - { - data[tid] |= data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 1]; - } - - - -void -or_subgroup_reduce_128_u32(__local volatile uint* data, UWORD tid) - { - data[tid] |= data[tid + 128]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 1]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/prod_subgroup_reduce.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/deps/prod_subgroup_reduce.cl deleted file mode 100644 index cd8a056..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/prod_subgroup_reduce.cl +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -void -COOT_FN(PREFIX,prod_subgroup_reduce_other)(__local volatile eT1* data, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - data[tid] *= data[tid + i]; - - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -COOT_FN(PREFIX,prod_subgroup_reduce_8)(__local volatile eT1* data, UWORD tid) - { - data[tid] *= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,prod_subgroup_reduce_16)(__local volatile eT1* data, UWORD tid) - { - data[tid] *= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,prod_subgroup_reduce_32)(__local volatile eT1* data, UWORD tid) - { - data[tid] *= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 1]; - } - - - -void 
-COOT_FN(PREFIX,prod_subgroup_reduce_64)(__local volatile eT1* data, UWORD tid) - { - data[tid] *= data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,prod_subgroup_reduce_128)(__local volatile eT1* data, UWORD tid) - { - data[tid] *= data[tid + 128]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] *= data[tid + 1]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/var_philox.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/deps/var_philox.cl deleted file mode 100644 index 5135aa2..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/deps/var_philox.cl +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// Implementations of the variable philox algorithm to generate random numbers. -// Adapted from Mitchell, Stokes, Frank, and Holmes (2022), Listing 1. - - - -inline -UWORD -var_philox(const UWORD val, const __global UWORD* keys, const unsigned char bits) - { - // via Salmon, Moraes, Dror, and Shaw (2011): "Parallel random numbers: as easy as 1, 2, 3". - const UWORD M0 = 0xD2B74407B1CE6E93; - - // The right side is allowed to have the extra bits. - const unsigned char right_side_bits = (bits + 1) / 2; - const unsigned char left_side_bits = bits / 2; - const uint left_mask = (((uint) 1) << left_side_bits) - 1; - const uint right_mask = (((uint) 1) << right_side_bits) - 1; - - uint state0 = (uint) (val >> right_side_bits); - uint state1 = (uint) (val & right_mask); - - // 24 rounds is what is needed to pass all the RNG tests (see section 5 of the paper). - uint hi, lo; - for (unsigned char i = 0; i < 24; ++i) - { - - // 64-bit integer multiplication, split the results into two uints - UWORD hilo = M0 * state0; - hi = (hilo >> 32); - lo = (uint) hilo; - - lo = (lo << (right_side_bits - left_side_bits)) | (state1 >> left_side_bits); - - state0 = ((hi ^ keys[i]) ^ state1) & left_mask; - state1 = lo & right_mask; - } - - // Combine the sides for the result. 
- return ((UWORD) (state0 << right_side_bits)) | ((UWORD) state1); - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_inf_lower.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_inf_lower.cl deleted file mode 100644 index 8bb4e4f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_inf_lower.cl +++ /dev/null @@ -1,307 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. 
-// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// Computes row sums dwork[i] = sum( abs( A(i,:) )), i=0:n-1, for || A ||_inf, -// where n is any size and A is stored lower. -// Has ceil( n / inf_bs ) blocks of (inf_bs x 4) threads each (inf_bs=32). -// z precision uses > 16 KB shared memory, so requires Fermi (arch >= 200). 
- - - -__kernel -void -COOT_FN(PREFIX,lansy_inf_lower) - ( - const UWORD n, - const __global eT1* A, - const UWORD A_offset, - const UWORD lda, - __global eT1* dwork, - const UWORD dwork_offset, - const UWORD n_full_block, - const UWORD n_mod_bs - ) - { - A += A_offset; - dwork += dwork_offset; - - UWORD tx = get_local_id(0); - UWORD ty = get_local_id(1); - - UWORD diag = get_group_id(0) * MAGMABLAS_LANSY_INF_BS; - UWORD ind = get_group_id(0) * MAGMABLAS_LANSY_INF_BS + tx; - - eT1 res = 0.; - - __local eT1 la[MAGMABLAS_LANSY_INF_BS][MAGMABLAS_LANSY_INF_BS + 1]; - - if ( get_group_id(0) < n_full_block ) - { - // ------------------------------ - // All full block rows - A += ind; - A += ty * lda; - - // ---------- - // loop over all blocks left of the diagonal block - for(UWORD i=0; i < diag; i += MAGMABLAS_LANSY_INF_BS ) - { - // 32x4 threads cooperatively load 32x32 block - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < MAGMABLAS_LANSY_INF_BS; j += 4) - { - la[tx][ty+j] = A[j*lda]; - } - A += lda * MAGMABLAS_LANSY_INF_BS; - barrier( CLK_LOCAL_MEM_FENCE ); - - // compute 4 partial sums of each row, i.e., - // for ty=0: res = sum( la[tx, 0: 7] ) - // for ty=1: res = sum( la[tx, 8:15] ) - // for ty=2: res = sum( la[tx,16:23] ) - // for ty=3: res = sum( la[tx,24:31] ) - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=ty*8; j < ty*8 + 8; j++) - { - res += ET1_ABS( la[tx][j] ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - } - - // ---------- - // load diagonal block - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < MAGMABLAS_LANSY_INF_BS; j += 4) - { - la[tx][ty+j] = A[j*lda]; - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // copy lower triangle to upper triangle, and - // make diagonal real (zero imaginary part) - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD i=ty*8; i < ty*8 + 8; i++) - { - if ( i < tx ) - { - 
la[i][tx] = la[tx][i]; - } - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // partial row sums - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=ty*8; j < ty*8 + 8; j++) - { - res += ET1_ABS( la[tx][j] ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // ---------- - // loop over all 32x32 blocks below diagonal block - A += MAGMABLAS_LANSY_INF_BS; - for(UWORD i=diag + MAGMABLAS_LANSY_INF_BS; i < n - n_mod_bs; i += MAGMABLAS_LANSY_INF_BS ) - { - // load block (transposed) - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < MAGMABLAS_LANSY_INF_BS; j += 4) - { - la[ty+j][tx] = A[j*lda]; - } - A += MAGMABLAS_LANSY_INF_BS; - barrier( CLK_LOCAL_MEM_FENCE ); - - // partial row sums - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=ty*8; j < ty*8 + 8; j++) - { - res += ET1_ABS( la[tx][j] ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - } - - // ---------- - // last partial block, which is (n_mod_bs by inf_bs) - if ( n_mod_bs > 0 ) - { - // load block (transposed), with zeros for rows outside matrix - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < MAGMABLAS_LANSY_INF_BS; j += 4) - { - if ( tx < n_mod_bs ) - { - la[ty+j][tx] = A[j*lda]; - } - else - { - la[ty+j][tx] = (eT1) 0; - } - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // partial row sums - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=ty*8; j < ty*8 + 8; j++) - { - res += ET1_ABS( la[tx][j] ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - } - - // ---------- - // 32x4 threads store partial sums into shared memory - la[tx][ty] = res; - barrier( CLK_LOCAL_MEM_FENCE ); - - // first column of 32x1 threads computes final sum of each row - if ( ty == 0 ) - { - res = res + la[tx][1] + la[tx][2] + la[tx][3]; - dwork[ind] = res; - } - } - else - { - // ------------------------------ - // Last, partial block row - // Threads past end of matrix (i.e., 
ind >= n) are redundantly assigned - // the last row (n-1). At the end, those results are ignored -- only - // results for ind < n are saved into dwork. - if ( tx < n_mod_bs ) - { - A += ind; - } - else - { - A += (get_group_id(0) * MAGMABLAS_LANSY_INF_BS + n_mod_bs - 1); // redundantly do last row - } - A += ty * lda; - - // ---------- - // loop over all blocks left of the diagonal block - // each is (n_mod_bs by inf_bs) - for(UWORD i=0; i < diag; i += MAGMABLAS_LANSY_INF_BS ) - { - // load block - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < MAGMABLAS_LANSY_INF_BS; j += 4) - { - la[tx][ty+j] = A[j*lda]; - } - A += lda * MAGMABLAS_LANSY_INF_BS; - barrier( CLK_LOCAL_MEM_FENCE ); - - // partial row sums - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < 8; j++) - { - res += ET1_ABS( la[tx][j+ty*8] ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - } - - // ---------- - // partial diagonal block - if ( ty == 0 && tx < n_mod_bs ) - { - // sum rows left of diagonal - for(UWORD j=0; j < tx; j++) - { - res += ET1_ABS( *A ); - A += lda; - } - // sum diagonal (ignoring imaginary part) - res += ET1_ABS( *A ); - A += 1; - // sum column below diagonal - for(UWORD j=tx+1; j < n_mod_bs; j++) - { - res += ET1_ABS( *A ); - A += 1; - } - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // ---------- - // 32x4 threads store partial sums into shared memory - la[tx][ty]= res; - barrier( CLK_LOCAL_MEM_FENCE ); - - // first column of 32x1 threads computes final sum of each row - // rows outside matrix are ignored - if ( ty == 0 && tx < n_mod_bs ) - { - res = res + la[tx][1] + la[tx][2] + la[tx][3]; - dwork[ind] = res; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_inf_upper.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_inf_upper.cl deleted file mode 100644 index 3ffc03c..0000000 --- 
a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_inf_upper.cl +++ /dev/null @@ -1,313 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. 
-// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// Computes row sums dwork[i] = sum( abs( A(i,:) )), i=0:n-1, for || A ||_inf, -// where n is any size and A is stored upper. -// Has ceil( n / inf_bs ) blocks of (inf_bs x 4) threads each (inf_bs=32). -// z precision uses > 16 KB shared memory, so requires Fermi (arch >= 200). -// The upper implementation is similar to lower, but processes blocks -// in the transposed order: -// lower goes from left over to diagonal, then down to bottom; -// upper goes from top down to diagonal, then over to right. -// Differences are noted with # in comments. 
- - - -__kernel -void -COOT_FN(PREFIX,lansy_inf_upper) - ( - const UWORD n, - const __global eT1* A, - const UWORD A_offset, - const UWORD lda, - __global eT1* dwork, - const UWORD dwork_offset, - const UWORD n_full_block, - const UWORD n_mod_bs - ) - { - A += A_offset; - dwork += dwork_offset; - - UWORD tx = get_local_id(0); - UWORD ty = get_local_id(1); - - UWORD diag = get_group_id(0) * MAGMABLAS_LANSY_INF_BS; - UWORD ind = get_group_id(0) * MAGMABLAS_LANSY_INF_BS + tx; - - eT1 res = 0.; - - __local eT1 la[MAGMABLAS_LANSY_INF_BS][MAGMABLAS_LANSY_INF_BS + 1]; - - if ( get_group_id(0) < n_full_block ) - { - // ------------------------------ - // All full block #columns - A += get_group_id(0) * MAGMABLAS_LANSY_INF_BS * lda + tx; //# - A += ty * lda; - - // ---------- - // loop over all blocks #above the diagonal block - for(UWORD i=0; i < diag; i += MAGMABLAS_LANSY_INF_BS ) - { - // 32x4 threads cooperatively load 32x32 block (#transposed) - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < MAGMABLAS_LANSY_INF_BS; j += 4) - { - la[ty+j][tx] = A[j*lda]; //# - } - A += MAGMABLAS_LANSY_INF_BS; //# - barrier( CLK_LOCAL_MEM_FENCE ); - - // compute 4 partial sums of each row, i.e., - // for ty=0: res = sum( la[tx, 0: 7] ) - // for ty=1: res = sum( la[tx, 8:15] ) - // for ty=2: res = sum( la[tx,16:23] ) - // for ty=3: res = sum( la[tx,24:31] ) - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=ty*8; j < ty*8 + 8; j++) - { - res += ET1_ABS( la[tx][j] ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - } - - // ---------- - // load diagonal block - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < MAGMABLAS_LANSY_INF_BS; j += 4) - { - la[tx][ty+j] = A[j*lda]; - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // copy #upper triangle to #lower triangle, and - // make diagonal real (zero imaginary part) - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - 
#endif - for(UWORD i=ty*8; i < ty*8 + 8; i++) - { - if ( i > tx ) - { //# - la[i][tx] = la[tx][i]; - } - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // partial row sums - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=ty*8; j < ty*8 + 8; j++) - { - res += ET1_ABS( la[tx][j] ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // ---------- - // loop over all 32x32 blocks #right of diagonal block - A += MAGMABLAS_LANSY_INF_BS * lda; //# - for(UWORD i=diag + MAGMABLAS_LANSY_INF_BS; i < n - n_mod_bs; i += MAGMABLAS_LANSY_INF_BS ) - { - // load block (#non-transposed) - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < MAGMABLAS_LANSY_INF_BS; j += 4) - { - la[tx][ty+j] = A[j*lda]; //# - } - A += MAGMABLAS_LANSY_INF_BS * lda; //# - barrier( CLK_LOCAL_MEM_FENCE ); - - // partial row sums - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=ty*8; j < ty*8 + 8; j++) - { - res += ET1_ABS( la[tx][j] ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - } - - // ---------- - // last partial block, which is #(inf_bs by n_mod_bs) - if ( n_mod_bs > 0 ) - { - // load block (#non-transposed), with zeros for #cols outside matrix - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < MAGMABLAS_LANSY_INF_BS; j += 4) - { - if ( ty+j < n_mod_bs ) - { //# - la[tx][ty+j] = A[j*lda]; //# - } - else - { - la[tx][ty+j] = (eT1) 0; //# - } - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // partial row sums - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=ty*8; j < ty*8 + 8; j++) - { - res += ET1_ABS( la[tx][j] ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - } - - // ---------- - // 32x4 threads store partial sums into shared memory - la[tx][ty] = res; - barrier( CLK_LOCAL_MEM_FENCE ); - - // first column of 32x1 threads computes final sum of each row - if ( ty == 0 ) - { - res = res + la[tx][1] + la[tx][2] + la[tx][3]; - dwork[ind] = 
res; - } - } - else - { - // ------------------------------ - // Last, partial block #column - // Instead of assigning threads ind >= n to the last row (n-1), as in Lower, - // Upper simply adjusts loop bounds to avoid loading columns outside the matrix. - // Again, at the end, those results are ignored -- only - // results for ind < n are saved into dwork. - A += get_group_id(0) * MAGMABLAS_LANSY_INF_BS * lda + tx; //# - A += ty * lda; - - // ---------- - // loop over all blocks #above the diagonal block - // each is #(inf_bs by n_mod_bs) - for(UWORD i=0; i < diag; i += MAGMABLAS_LANSY_INF_BS ) - { - // load block (#transposed), #ignoring columns outside matrix - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < MAGMABLAS_LANSY_INF_BS; j += 4) - { - if ( ty+j < n_mod_bs ) - { - la[ty+j][tx] = A[j*lda]; - } - } - A += MAGMABLAS_LANSY_INF_BS; //# - barrier( CLK_LOCAL_MEM_FENCE ); - - // partial row sums - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint(8))) - #endif - for(UWORD j=0; j < 8; j++) - { - res += ET1_ABS( la[tx][j+ty*8] ); - } - barrier( CLK_LOCAL_MEM_FENCE ); - } - - // ---------- - // partial diagonal block - if ( ty == 0 && tx < n_mod_bs ) - { - // #transpose pointer within diagonal block - // #i.e., from A = A(tx,ty), transpose to A = A(ty,tx). 
- A = A - tx - ty*lda + tx*lda + ty; - - // sum #column above diagonal - for(UWORD j=0; j < tx; j++) - { - res += ET1_ABS( *A ); - A += 1; //# - } - // sum diagonal (ignoring imaginary part) - res += ET1_ABS( *A ); - A += lda; //# - // sum #row right of diagonal - for(UWORD j=tx+1; j < n_mod_bs; j++) - { - res += ET1_ABS( *A ); - A += lda; //# - } - } - barrier( CLK_LOCAL_MEM_FENCE ); - - // ---------- - // 32x4 threads store partial sums into shared memory - la[tx][ty]= res; - barrier( CLK_LOCAL_MEM_FENCE ); - - // first column of 32x1 threads computes final sum of each row - // rows outside matrix are ignored - if ( ty == 0 && tx < n_mod_bs ) - { - res = res + la[tx][1] + la[tx][2] + la[tx][3]; - dwork[ind] = res; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_max_lower.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_max_lower.cl deleted file mode 100644 index 7df958f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_max_lower.cl +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). 
-// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. 
- - - -// Computes dwork[i] = max( abs( A(i,0:i) )), i=0:n-1, for ||A||_max, where A is stored lower - - - -__kernel -void -COOT_FN(PREFIX,lansy_max_lower) - ( - const UWORD n, - const __global eT1* A, - const UWORD A_offset, - const UWORD lda, - __global eT1* dwork, - const UWORD dwork_offset - ) - { - A += A_offset; - dwork += dwork_offset; - - int ind = get_group_id(0) * MAGMABLAS_LANSY_MAX_BS + get_local_id(0); - eT1 res = 0; - - if (ind < n) - { - A += ind; - for(int j=0; j < ind; ++j) - { - res = fmax( res, ET1_ABS( *A )); - A += lda; - } - // diagonal element (ignoring imaginary part) - res = fmax( res, ET1_ABS( *A )); - dwork[ind] = res; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_max_upper.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_max_upper.cl deleted file mode 100644 index 7b34f54..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lansy_max_upper.cl +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). 
-// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// Computes dwork[i] = max( abs( A(i,0:i) )), i=0:n-1, for ||A||_max, where A is stored upper. 
- - - -__kernel -void -COOT_FN(PREFIX,lansy_max_upper) - ( - const UWORD n, - const __global eT1* A, - const UWORD A_offset, - const UWORD lda, - __global eT1* dwork, - const UWORD dwork_offset - ) - { - A += A_offset; - dwork += dwork_offset; - - int ind = get_group_id(0) * MAGMABLAS_LANSY_MAX_BS + get_local_id(0); - eT1 res = 0; - - if (ind < n) - { - A += ind; - A += (n-1)*lda; - for(int j=n-1; j > ind; j--) - { - res = fmax( res, ET1_ABS( *A )); - A -= lda; - } - // diagonal element (ignoring imaginary part) - res = fmax( res, ET1_ABS( *A )); - dwork[ind] = res; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_full.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_full.cl deleted file mode 100644 index 85b3ff0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_full.cl +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). 
-// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// Multiply A by `mul`. 
- -__kernel -void -COOT_FN(PREFIX,lascl_full) - ( - const UWORD m, - const UWORD n, - const eT1 mul, - __global eT1* A, - const UWORD A_offset, - const UWORD lda - ) - { - UWORD ind = get_group_id(0) * MAGMABLAS_LASCL_NB + get_local_id(0); - - A += A_offset + ind; - if (ind < m) - { - for (UWORD j=0; j < n; j++ ) - { - A[j*lda] *= mul; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_lower.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_lower.cl deleted file mode 100644 index 148a649..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_lower.cl +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. 
-// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// Multiply A by `mul`. A is lower triangular. - -__kernel -void -COOT_FN(PREFIX,lascl_lower) - ( - const UWORD m, - const UWORD n, - const eT1 mul, - __global eT1* A, - const UWORD A_offset, - const UWORD lda - ) - { - UWORD ind = get_group_id(0) * MAGMABLAS_LASCL_NB + get_local_id(0); - - UWORD break_d = (ind < n) ? 
ind : n-1; - - A += A_offset + ind; - if (ind < m) - { - for (UWORD j=0; j <= break_d; j++ ) - { - A[j*lda] *= mul; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_upper.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_upper.cl deleted file mode 100644 index 3c5d514..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/lascl_upper.cl +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. 
-// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// Multiply A by `mul`. A is upper triangular. - -__kernel -void -COOT_FN(PREFIX,lascl_upper) - ( - const UWORD m, - const UWORD n, - const eT1 mul, - __global eT1* A, - const UWORD A_offset, - const UWORD lda - ) - { - UWORD ind = get_group_id(0) * MAGMABLAS_LASCL_NB + get_local_id(0); - - A += A_offset + ind; - if (ind < m) - { - for (UWORD j=n-1; j >= ind; j--) - { - A[j*lda] *= mul; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_band_lower.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_band_lower.cl deleted file mode 100644 index 4663b4f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_band_lower.cl +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. 
In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// GPU kernel for setting the k-1 sub-diagonals to OFFDIAG -// and the main diagonal to DIAG. -// Divides matrix into min( ceil(m/nb), ceil(n/nb) ) block-columns, -// with k threads in each block. -// Each thread iterates across one diagonal. -// Thread 0 does the main diagonal, thread 1 the first sub-diagonal, etc. - - - -__kernel -void -COOT_FN(PREFIX,laset_band_lower)(const UWORD m, - const UWORD n, - const eT1 offdiag, - const eT1 diag, - __global eT1* A, - const UWORD A_offset, - const UWORD lda) - { - UWORD ibx = get_group_id(0) * MAGMABLAS_LASET_BAND_NB; - UWORD ind = ibx + get_local_id(0); - - A += A_offset + ind + ibx * lda; - - eT1 value = offdiag; - if (get_local_id(0) == 0) - { - value = diag; - } - - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for (int j=0; j < MAGMABLAS_LASET_BAND_NB; j++) - { - if (ibx + j < n && ind + j < m) - { - A[j * (lda + 1)] = value; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_band_upper.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_band_upper.cl deleted file mode 100644 index 96af81c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_band_upper.cl +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the 
License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. 
In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// GPU kernel for setting the k-1 sub-diagonals to OFFDIAG -// and the main diagonal to DIAG. -// Divides matrix into min( ceil(m/nb), ceil(n/nb) ) block-columns, -// with k threads in each block. -// Each thread iterates across one diagonal. -// Thread 0 does the main diagonal, thread 1 the first sub-diagonal, etc. - - - -__kernel -void -COOT_FN(PREFIX,laset_band_upper)(const UWORD m, - const UWORD n, - const eT1 offdiag, - const eT1 diag, - __global eT1* A, - const UWORD A_offset, - const UWORD lda) - { - int k = get_local_size(0); - int ibx = get_group_id(0) * MAGMABLAS_LASET_BAND_NB; - int ind = ibx + get_local_id(0) - k + 1; - - A += A_offset + ind + ibx * lda; - - eT1 value = offdiag; - if (get_local_id(0) == k - 1) - { - value = diag; - } - - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for (int j = 0; j < MAGMABLAS_LASET_BAND_NB; j++) - { - if (ibx + j < n && ind + j >= 0 && ind + j < m) - { - A[j * (lda + 1)] = value; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_full.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_full.cl deleted file mode 100644 index 5701d59..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_full.cl +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file 
except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. 
In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks. -// Each block has BLK_X threads. -// Each thread loops across one row, updating BLK_Y entries. -// -// Code similar to lacpy, lag2s, lag2z, geadd. - -__kernel -void -COOT_FN(PREFIX,laset_full)(const UWORD m, - const UWORD n, - const eT1 offdiag, - const eT1 diag, - __global eT1* A, - const UWORD A_offset, - const UWORD lda) - { - A += A_offset; - - UWORD ind = get_group_id(0) * MAGMABLAS_BLK_X + get_local_id(0); - UWORD iby = get_group_id(1) * MAGMABLAS_BLK_Y; - /* check if full block-column && (below diag || above diag || offdiag == diag) */ - bool full = (iby + MAGMABLAS_BLK_Y <= n && (ind >= iby + MAGMABLAS_BLK_Y || ind + MAGMABLAS_BLK_X <= iby || ( offdiag == diag ))); - /* do only rows inside matrix */ - if (ind < m) - { - A += ind + iby * lda; - if (full) - { - // full block-column, off-diagonal block or offdiag == diag - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(int j=0; j < MAGMABLAS_BLK_Y; ++j) - { - A[j * lda] = offdiag; - } - } - else - { - // either partial block-column or diagonal block - for (int j=0; j < MAGMABLAS_BLK_Y && iby+j < n; ++j) - { - if (iby + j == ind) - A[j * lda] = diag; - else - A[j * lda] = offdiag; - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_lower.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_lower.cl deleted file mode 100644 
index bb5b3b2..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_lower.cl +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. 
-// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// Similar to laset_full, but updates only the diagonal and below. -// Blocks that are fully above the diagonal exit immediately. -// -// Code similar to lacpy, zlat2c, clat2z. - -__kernel -void -COOT_FN(PREFIX,laset_lower)(const UWORD m, - const UWORD n, - const eT1 offdiag, - const eT1 diag, - __global eT1* A, - const UWORD A_offset, - const UWORD lda) - { - A += A_offset; - - UWORD ind = get_group_id(0) * MAGMABLAS_BLK_X + get_local_id(0); - UWORD iby = get_group_id(1) * MAGMABLAS_BLK_Y; - /* check if full block-column && (below diag) */ - bool full = (iby + MAGMABLAS_BLK_Y <= n && (ind >= iby + MAGMABLAS_BLK_Y)); - /* do only rows inside matrix, and blocks not above diag */ - if (ind < m && ind + MAGMABLAS_BLK_X > iby) - { - A += ind + iby*lda; - if (full) - { - // full block-column, off-diagonal block - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for (int j=0; j < MAGMABLAS_BLK_Y; ++j) - { - A[j * lda] = offdiag; - } - } - else - { - // either partial block-column or diagonal block - for (int j=0; j < MAGMABLAS_BLK_Y && iby+j < n; ++j) - { - if (iby + j == ind) - A[j * lda] = diag; - else if (ind > iby + j) - A[j*lda] = offdiag; - } - 
} - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_upper.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_upper.cl deleted file mode 100644 index 02ed04d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laset_upper.cl +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. 
-// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// Similar to laset_full, but updates only the diagonal and above. -// Blocks that are fully below the diagonal exit immediately. -// -// Code similar to lacpy, zlat2c, clat2z. 
- -__kernel -void -COOT_FN(PREFIX,laset_upper)(const UWORD m, - const UWORD n, - const eT1 offdiag, - const eT1 diag, - __global eT1* A, - const UWORD A_offset, - const UWORD lda) - { - A += A_offset; - - UWORD ind = get_group_id(0) * MAGMABLAS_BLK_X + get_local_id(0); - UWORD iby = get_group_id(1) * MAGMABLAS_BLK_Y; - /* check if full block-column && (above diag) */ - bool full = (iby + MAGMABLAS_BLK_Y <= n && (ind + MAGMABLAS_BLK_X <= iby)); - /* do only rows inside matrix, and blocks not below diag */ - if (ind < m && ind < iby + MAGMABLAS_BLK_Y) - { - A += ind + iby*lda; - if (full) - { - // full block-column, off-diagonal block - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(int j=0; j < MAGMABLAS_BLK_Y; ++j) - { - A[j * lda] = offdiag; - } - } - else - { - // either partial block-column or diagonal block - for (int j=0; j < MAGMABLAS_BLK_Y && iby+j < n; ++j) - { - if (iby + j == ind) - A[j*lda] = diag; - else if (ind < iby + j) - A[j*lda] = offdiag; - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laswp.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laswp.cl deleted file mode 100644 index a7793b0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/laswp.cl +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// Matrix A is stored row-wise in dAT. 
-// Divide matrix A into block-columns of NTHREADS columns each. -// Each GPU block processes one block-column of A. -// Each thread goes down a column of A, -// swapping rows according to pivots stored in params. -__kernel -void -COOT_FN(PREFIX,laswp)(int n, - __global eT1* dAT, - unsigned long dAT_offset, - int ldda, - magmablas_laswp_params_t params) - { - dAT += dAT_offset; - - int tid = get_local_id(0) + get_local_size(0)*get_group_id(0); - if ( tid < n ) - { - dAT += tid; - __global eT1* A1 = dAT; - - for( int i1 = 0; i1 < params.npivots; ++i1 ) - { - int i2 = params.ipiv[i1]; - __global eT1* A2 = dAT + i2*ldda; - eT1 temp = *A1; - *A1 = *A2; - *A2 = temp; - A1 += ldda; // A1 = dA + i1*ldx - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_inplace_even_magma.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_inplace_even_magma.cl deleted file mode 100644 index 3917cec..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_inplace_even_magma.cl +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). 
-// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// grid is ((n/nb) + 1) x (n/nb)/2, where n/nb is even. -// lower indicates blocks in strictly lower triangle of grid, excluding diagonal. 
-// lower blocks shift up by one to cover left side of matrix including diagonal. -// upper blocks swap block indices (x,y) and shift by grid width -// to cover right side of matrix. -// [ A00 A01 ] [ A10 . | . . ] -// [ A10 A11 ] [ A20 A21 | . . ] -// grid [ A20 A21 ] covers matrix as [ A30 A31 | A00 . ] -// [ A30 A31 ] [ A40 A41 | A01 A11 ] -// [ A40 A41 ] -// -// Each block is NB x NB threads. -// For non-diagonal block A, block B is symmetric block. -// Thread (i,j) loads A(i,j) into sA(j,i) and B(i,j) into sB(j,i), i.e., transposed, -// syncs, then saves sA(i,j) to B(i,j) and sB(i,j) to A(i,j). -// Threads outside the matrix do not touch memory. - -__kernel -void -COOT_FN(PREFIX,transpose_inplace_even_magma)(const UWORD n, - __global eT1* matrix, - const UWORD matrix_offset, - const UWORD lda) - { - matrix += matrix_offset; - - __local eT1 sA[MAGMABLAS_TRANS_INPLACE_NB][MAGMABLAS_TRANS_INPLACE_NB + 1]; - __local eT1 sB[MAGMABLAS_TRANS_INPLACE_NB][MAGMABLAS_TRANS_INPLACE_NB + 1]; - - UWORD i = get_local_id(0); - UWORD j = get_local_id(1); - - bool lower = (get_group_id(0) > get_group_id(1)); - UWORD ii = (lower ? (get_group_id(0) - 1) : (get_group_id(1) + get_num_groups(1))); - UWORD jj = (lower ? 
(get_group_id(1) ) : (get_group_id(0) + get_num_groups(1))); - - ii *= MAGMABLAS_TRANS_INPLACE_NB; - jj *= MAGMABLAS_TRANS_INPLACE_NB; - - __global eT1* A = matrix + (ii + i) + (jj + j) * lda; - if (ii == jj) - { - if (ii + i < n && jj + j < n) - { - sA[j][i] = *A; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (ii + i < n && jj + j < n) - { - *A = sA[i][j]; - } - } - else - { - __global eT1* B = matrix + (jj + i) + (ii + j) * lda; - if (ii + i < n && jj + j < n) - { - sA[j][i] = *A; - } - if (jj + i < n && ii + j < n) - { - sB[j][i] = *B; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (ii + i < n && jj + j < n) - { - *A = sB[i][j]; - } - if (jj + i < n && ii + j < n) - { - *B = sA[i][j]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_inplace_odd_magma.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_inplace_odd_magma.cl deleted file mode 100644 index 923a887..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_inplace_odd_magma.cl +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). 
-// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// grid is (n/nb) x ((n/nb)/2 + 1), where n/nb is odd. -// lower indicates blocks in lower triangle of grid, including diagonal. -// lower blocks cover left side of matrix, including diagonal. 
-// upper blocks swap block indices (x,y) and shift by grid width (or width-1) -// to cover right side of matrix. -// [ A00 A01 A02 ] [ A00 . . | . . ] -// [ A10 A11 A12 ] [ A10 A11 . | . . ] -// grid [ A20 A21 A22 ] covers matrix as [ A20 A21 A22 | . . ] -// [ A30 A31 A32 ] [ A30 A31 A32 | A01 . ] -// [ A40 A41 A42 ] [ A40 A41 A42 | A02 A12 ] -// -// See transpose_inplace_even_magma for description of threads. - -__kernel -void -COOT_FN(PREFIX,transpose_inplace_odd_magma)(const UWORD n, - __global eT1* matrix, - const UWORD matrix_offset, - const UWORD lda) - { - matrix += matrix_offset; - - __local eT1 sA[MAGMABLAS_TRANS_INPLACE_NB][MAGMABLAS_TRANS_INPLACE_NB + 1]; - __local eT1 sB[MAGMABLAS_TRANS_INPLACE_NB][MAGMABLAS_TRANS_INPLACE_NB + 1]; - - UWORD i = get_local_id(0); - UWORD j = get_local_id(1); - - bool lower = (get_group_id(0) >= get_group_id(1)); - UWORD ii = (lower ? get_group_id(0) : (get_group_id(1) + get_num_groups(1) - 1)); - UWORD jj = (lower ? get_group_id(1) : (get_group_id(0) + get_num_groups(1) )); - - ii *= MAGMABLAS_TRANS_INPLACE_NB; - jj *= MAGMABLAS_TRANS_INPLACE_NB; - - __global eT1* A = matrix + (ii + i) + (jj + j) * lda; - if (ii == jj) - { - if (ii + i < n && jj + j < n) - { - sA[j][i] = *A; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (ii + i < n && jj + j < n) - { - *A = sA[i][j]; - } - } - else - { - __global eT1* B = matrix + (jj + i) + (ii + j) * lda; - if (ii + i < n && jj + j < n) - { - sA[j][i] = *A; - } - if (jj + i < n && ii + j < n) - { - sB[j][i] = *B; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (ii + i < n && jj + j < n) - { - *A = sB[i][j]; - } - if (jj + i < n && ii + j < n) - { - *B = sA[i][j]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_magma.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_magma.cl deleted file mode 100644 index 1288197..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/magma_real/transpose_magma.cl +++ /dev/null @@ -1,138 +0,0 @@ 
-// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This file contains source code adapted from -// clMAGMA 1.3 (2014-11-14). -// clMAGMA 1.3 is distributed under a 3-clause BSD license as follows: -// -// -- Innovative Computing Laboratory -// -- Electrical Engineering and Computer Science Department -// -- University of Tennessee -// -- (C) Copyright 2009-2015 -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of Tennessee, Knoxville nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. 
-// -// This software is provided by the copyright holders and contributors -// ``as is'' and any express or implied warranties, including, but not -// limited to, the implied warranties of merchantability and fitness for -// a particular purpose are disclaimed. In no event shall the copyright -// holders or contributors be liable for any direct, indirect, incidental, -// special, exemplary, or consequential damages (including, but not -// limited to, procurement of substitute goods or services; loss of use, -// data, or profits; or business interruption) however caused and on any -// theory of liability, whether in contract, strict liability, or tort -// (including negligence or otherwise) arising in any way out of the use -// of this software, even if advised of the possibility of such damage. - - - -// tile M-by-N matrix with ceil(M/NB) by ceil(N/NB) tiles sized NB-by-NB. -// uses NX-by-NY threads, where NB/NX, NB/NY, NX/NY evenly. -// subtile each NB-by-NB tile with (NB/NX) subtiles sized NX-by-NB -// for each subtile -// load NX-by-NB subtile transposed from A into sA, as (NB/NY) blocks sized NX-by-NY -// save NB-by-NX subtile from sA into AT, as (NB/NX)*(NX/NY) blocks sized NX-by-NY -// A += NX -// AT += NX*ldat - -__kernel -void -COOT_FN(PREFIX,transpose_magma)(const UWORD m, - const UWORD n, - __global const eT1* A, - const UWORD A_offset, - const UWORD lda, - __global eT1* AT, - const UWORD AT_offset, - const UWORD ldat) - { - A += A_offset; - AT += AT_offset; - - __local eT1 sA[MAGMABLAS_TRANS_NB][MAGMABLAS_TRANS_NX+1]; - - UWORD tx = get_local_id(0); - UWORD ty = get_local_id(1); - UWORD ibx = get_group_id(0) * MAGMABLAS_TRANS_NB; - UWORD iby = get_group_id(1) * MAGMABLAS_TRANS_NB; - UWORD i, j; - - A += ibx + tx + (iby + ty) * lda; - AT += iby + tx + (ibx + ty) * ldat; - - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for (int tile = 0; tile < MAGMABLAS_TRANS_NB / MAGMABLAS_TRANS_NX; ++tile) - { - // load NX-by-NB subtile 
transposed from A into sA - i = ibx + tx + tile * MAGMABLAS_TRANS_NX; - j = iby + ty; - if (i < m) - { - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for (int j2=0; j2 < MAGMABLAS_TRANS_NB; j2 += MAGMABLAS_TRANS_NY) - { - if (j + j2 < n) - { - sA[ty + j2][tx] = A[j2*lda]; - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // save NB-by-NX subtile from sA into AT - i = iby + tx; - j = ibx + ty + tile * MAGMABLAS_TRANS_NX; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for (int i2 = 0; i2 < MAGMABLAS_TRANS_NB; i2 += MAGMABLAS_TRANS_NX) - { - if (i + i2 < n) - { - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for (int j2 = 0; j2 < MAGMABLAS_TRANS_NX; j2 += MAGMABLAS_TRANS_NY) - { - if (j + j2 < m) - { - AT[i2 + j2 * ldat] = sA[tx + i2][ty + j2]; - } - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // move to next subtile - A += MAGMABLAS_TRANS_NX; - AT += MAGMABLAS_TRANS_NX * ldat; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu.cl deleted file mode 100644 index e1ecc4e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu.cl +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,accu)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] += in_mem[in_mem_offset + i] + in_mem[in_mem_offset + i + get_local_size(0)]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] += in_mem[in_mem_offset + i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,accu_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu_simple.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu_simple.cl deleted file mode 100644 index 57b00b0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu_simple.cl +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,accu_simple)(__global eT1* out, - __global const eT1* A, - const UWORD A_len) - { - const UWORD id = get_global_id(0); - if(id == 0) - { - eT1 acc = (eT1)(0); - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < A_len; ++i) - { - acc += A[i]; - } - out[0] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu_small.cl deleted file mode 100644 index 52a1e77..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/accu_small.cl +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,accu_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] += in_mem[in_mem_offset + i] + in_mem[in_mem_offset + i + get_local_size(0)]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] += in_mem[in_mem_offset + i]; - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal.cl deleted file mode 100644 index fe4649a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal.cl +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,approx_equal)(__global uint* out_mem, - __global const eT1* A_mem, - const UWORD A_offset, - const UWORD A_M_n_rows, - __global const eT1* B_mem, - const UWORD B_offset, - const UWORD B_M_n_rows, - const UWORD n_rows, - const UWORD n_elem, - __local volatile uint* aux_mem, - const UWORD mode, - const eT1 abs_tol, - const eT1 rel_tol) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 1; - - while (i + get_local_size(0) < n_elem) - { - // A bit painful... - const UWORD row1 = i % n_rows; - const UWORD col1 = i / n_rows; - const UWORD row2 = (i + get_local_size(0)) % n_rows; - const UWORD col2 = (i + get_local_size(0)) / n_rows; - - const UWORD A_loc1 = A_offset + row1 + col1 * A_M_n_rows; - const UWORD A_loc2 = A_offset + row2 + col2 * A_M_n_rows; - const UWORD B_loc1 = B_offset + row1 + col1 * B_M_n_rows; - const UWORD B_loc2 = B_offset + row2 + col2 * B_M_n_rows; - - const eT1 A_val1 = A_mem[A_loc1]; - const eT1 B_val1 = B_mem[B_loc1]; - const eT1 A_val2 = A_mem[A_loc2]; - const eT1 B_val2 = B_mem[B_loc2]; - - if (COOT_FN(coot_isnan_,eT1)(A_val1) || COOT_FN(coot_isnan_,eT1)(B_val1) || COOT_FN(coot_isnan_,eT1)(A_val2) || COOT_FN(coot_isnan_,eT1)(B_val2)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff1 = COOT_FN(coot_absdiff_,eT1)(A_val1, B_val1); - const eT1 absdiff2 = COOT_FN(coot_absdiff_,eT1)(A_val2, B_val2); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff1 <= abs_tol); - aux_mem[tid] &= (absdiff2 <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val1 = max(ET1_ABS(A_val1), ET1_ABS(B_val1)); - const eT1 max_val2 = max(ET1_ABS(A_val2), ET1_ABS(B_val2)); - - if (max_val1 >= (eT1) 1) - { - aux_mem[tid] &= (absdiff1 <= rel_tol * max_val1); - aux_mem[tid] &= (absdiff2 <= rel_tol * max_val2); - } - else - { - aux_mem[tid] &= (absdiff1 / max_val1 <= rel_tol); - aux_mem[tid] &= (absdiff2 / max_val2 <= rel_tol); - } - } - - i += grid_size; - } - if (i < n_elem) - { - const UWORD row = i % n_rows; - const UWORD col = i / n_rows; - - const UWORD A_loc = A_offset + row + col * A_M_n_rows; - const UWORD B_loc = B_offset + row + col * B_M_n_rows; - - const eT1 A_val = A_mem[A_loc]; - const eT1 B_val = B_mem[B_loc]; - - if (COOT_FN(coot_isnan_,eT1)(A_val) || COOT_FN(coot_isnan_,eT1)(B_val)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff = COOT_FN(coot_absdiff_,eT1)(A_val, B_val); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val = max(ET1_ABS(A_val), ET1_ABS(B_val)); - - if (max_val >= (eT1) 1) - { - aux_mem[tid] &= (absdiff <= rel_tol * max_val); - } - else - { - aux_mem[tid] &= (absdiff / max_val <= rel_tol); - } - } - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN(COOT_FN(and_subgroup_reduce_,SUBGROUP_SIZE_NAME),_u32)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_cube.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_cube.cl deleted file mode 100644 index 5252aac..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_cube.cl +++ /dev/null @@ -1,159 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,approx_equal_cube)(__global uint* out_mem, - __global const eT1* A_mem, - const UWORD A_offset, - const UWORD A_M_n_rows, - const UWORD A_M_n_cols, - __global const eT1* B_mem, - const UWORD B_offset, - const UWORD B_M_n_rows, - const UWORD B_M_n_cols, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_elem, - __local volatile uint* aux_mem, - const UWORD mode, - const eT1 abs_tol, - const eT1 rel_tol) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - const UWORD n_elem_slice = n_rows * n_cols; - - aux_mem[tid] = 1; - - while (i + get_local_size(0) < n_elem) - { - // A bit painful... TODO: implement a more efficient non-modulo approach - const UWORD elem1 = i % n_elem_slice; - const UWORD slice1 = i / n_elem_slice; - const UWORD row1 = elem1 % n_rows; - const UWORD col1 = elem1 / n_rows; - - const UWORD elem2 = (i + get_local_size(0)) % n_elem_slice; - const UWORD slice2 = (i + get_local_size(0)) / n_elem_slice; - const UWORD row2 = elem2 % n_rows; - const UWORD col2 = elem2 / n_rows; - - const UWORD A_loc1 = A_offset + row1 + col1 * A_M_n_rows + slice1 * A_M_n_rows * A_M_n_cols; - const UWORD A_loc2 = A_offset + row2 + col2 * A_M_n_rows + slice2 * A_M_n_rows * A_M_n_cols; - const UWORD B_loc1 = B_offset + row1 + col1 * B_M_n_rows + slice1 * B_M_n_rows * B_M_n_cols; - const UWORD B_loc2 = B_offset + row2 + col2 * B_M_n_rows + slice2 * B_M_n_rows * B_M_n_cols; - - const eT1 A_val1 = A_mem[A_loc1]; - const eT1 B_val1 = B_mem[B_loc1]; - const eT1 A_val2 = A_mem[A_loc2]; - const eT1 B_val2 = B_mem[B_loc2]; - - if (COOT_FN(coot_isnan_,eT1)(A_val1) || COOT_FN(coot_isnan_,eT1)(B_val1) || COOT_FN(coot_isnan_,eT1)(A_val2) || COOT_FN(coot_isnan_,eT1)(B_val2)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff1 = COOT_FN(coot_absdiff_,eT1)(A_val1, B_val1); - const eT1 absdiff2 = COOT_FN(coot_absdiff_,eT1)(A_val2, B_val2); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff1 <= abs_tol); - aux_mem[tid] &= (absdiff2 <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val1 = max(ET1_ABS(A_val1), ET1_ABS(B_val1)); - const eT1 max_val2 = max(ET1_ABS(A_val2), ET1_ABS(B_val2)); - - if (max_val1 >= (eT1) 1) - { - aux_mem[tid] &= (absdiff1 <= rel_tol * max_val1); - aux_mem[tid] &= (absdiff2 <= rel_tol * max_val2); - } - else - { - aux_mem[tid] &= (absdiff1 / max_val1 <= rel_tol); - aux_mem[tid] &= (absdiff2 / max_val2 <= rel_tol); - } - } - - i += grid_size; - } - if (i < n_elem) - { - const UWORD elem = i % n_elem_slice; - const UWORD slice = i / n_elem_slice; - const UWORD row = elem % n_rows; - const UWORD col = elem / n_rows; - - const UWORD A_loc = A_offset + row + col * A_M_n_rows + slice * A_M_n_rows * A_M_n_cols; - const UWORD B_loc = B_offset + row + col * B_M_n_rows + slice * B_M_n_rows * B_M_n_cols; - - const eT1 A_val = A_mem[A_loc]; - const eT1 B_val = B_mem[B_loc]; - - if (COOT_FN(coot_isnan_,eT1)(A_val) || COOT_FN(coot_isnan_,eT1)(B_val)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff = COOT_FN(coot_absdiff_,eT1)(A_val, B_val); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val = max(ET1_ABS(A_val), ET1_ABS(B_val)); - - if (max_val >= (eT1) 1) - { - aux_mem[tid] &= (absdiff <= rel_tol * max_val); - } - else - { - aux_mem[tid] &= (absdiff / max_val <= rel_tol); - } - } - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN(COOT_FN(and_subgroup_reduce_,SUBGROUP_SIZE_NAME),_u32)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_cube_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_cube_small.cl deleted file mode 100644 index 5e2492a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_cube_small.cl +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,approx_equal_cube_small)(__global uint* out_mem, - __global const eT1* A_mem, - const UWORD A_offset, - const UWORD A_M_n_rows, - const UWORD A_M_n_cols, - __global const eT1* B_mem, - const UWORD B_offset, - const UWORD B_M_n_rows, - const UWORD B_M_n_cols, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_elem, - __local volatile uint* aux_mem, - const UWORD mode, - const eT1 abs_tol, - const eT1 rel_tol) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - const UWORD n_elem_slice = n_rows * n_cols; - - aux_mem[tid] = 1; - - while (i + get_local_size(0) < n_elem) - { - // A bit painful... TODO: implement a more efficient non-modulo approach - const UWORD elem1 = i % n_elem_slice; - const UWORD slice1 = i / n_elem_slice; - const UWORD row1 = elem1 % n_rows; - const UWORD col1 = elem1 / n_rows; - - const UWORD elem2 = (i + get_local_size(0)) % n_elem_slice; - const UWORD slice2 = (i + get_local_size(0)) / n_elem_slice; - const UWORD row2 = elem2 % n_rows; - const UWORD col2 = elem2 / n_rows; - - const UWORD A_loc1 = A_offset + row1 + col1 * A_M_n_rows + slice1 * A_M_n_rows * A_M_n_cols; - const UWORD A_loc2 = A_offset + row2 + col2 * A_M_n_rows + slice2 * A_M_n_rows * A_M_n_cols; - const UWORD B_loc1 = B_offset + row1 + col1 * B_M_n_rows + slice1 * B_M_n_rows * B_M_n_cols; - const UWORD B_loc2 = B_offset + row2 + col2 * B_M_n_rows + slice2 * B_M_n_rows * B_M_n_cols; - - const eT1 A_val1 = A_mem[A_loc1]; - const eT1 B_val1 = B_mem[B_loc1]; - const eT1 A_val2 = A_mem[A_loc2]; - const eT1 B_val2 = B_mem[B_loc2]; - - if (COOT_FN(coot_isnan_,eT1)(A_val1) || COOT_FN(coot_isnan_,eT1)(B_val1) || COOT_FN(coot_isnan_,eT1)(A_val2) || COOT_FN(coot_isnan_,eT1)(B_val2)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff1 = COOT_FN(coot_absdiff_,eT1)(A_val1, B_val1); - const eT1 absdiff2 = COOT_FN(coot_absdiff_,eT1)(A_val2, B_val2); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff1 <= abs_tol); - aux_mem[tid] &= (absdiff2 <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val1 = max(ET1_ABS(A_val1), ET1_ABS(B_val1)); - const eT1 max_val2 = max(ET1_ABS(A_val2), ET1_ABS(B_val2)); - - if (max_val1 >= (eT1) 1) - { - aux_mem[tid] &= (absdiff1 <= rel_tol * max_val1); - aux_mem[tid] &= (absdiff2 <= rel_tol * max_val2); - } - else - { - aux_mem[tid] &= (absdiff1 / max_val1 <= rel_tol); - aux_mem[tid] &= (absdiff2 / max_val2 <= rel_tol); - } - } - - i += grid_size; - } - if (i < n_elem) - { - const UWORD elem = i % n_elem_slice; - const UWORD slice = i / n_elem_slice; - const UWORD row = elem % n_rows; - const UWORD col = elem / n_rows; - - const UWORD A_loc = A_offset + row + col * A_M_n_rows + slice * A_M_n_rows * A_M_n_cols; - const UWORD B_loc = B_offset + row + col * B_M_n_rows + slice * B_M_n_rows * B_M_n_cols; - - const eT1 A_val = A_mem[A_loc]; - const eT1 B_val = B_mem[B_loc]; - - if (COOT_FN(coot_isnan_,eT1)(A_val) || COOT_FN(coot_isnan_,eT1)(B_val)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff = COOT_FN(coot_absdiff_,eT1)(A_val, B_val); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val = max(ET1_ABS(A_val), ET1_ABS(B_val)); - - if (max_val >= (eT1) 1) - { - aux_mem[tid] &= (absdiff <= rel_tol * max_val); - } - else - { - aux_mem[tid] &= (absdiff / max_val <= rel_tol); - } - } - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_small.cl deleted file mode 100644 index 5deec2f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/approx_equal_small.cl +++ /dev/null @@ -1,142 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,approx_equal_small)(__global uint* out_mem, - __global const eT1* A_mem, - const UWORD A_offset, - const UWORD A_M_n_rows, - __global const eT1* B_mem, - const UWORD B_offset, - const UWORD B_M_n_rows, - const UWORD n_rows, - const UWORD n_elem, - __local volatile uint* aux_mem, - const UWORD mode, - const eT1 abs_tol, - const eT1 rel_tol) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 1; - - while (i + get_local_size(0) < n_elem) - { - // A bit painful... - const UWORD row1 = i % n_rows; - const UWORD col1 = i / n_rows; - const UWORD row2 = (i + get_local_size(0)) % n_rows; - const UWORD col2 = (i + get_local_size(0)) / n_rows; - - const UWORD A_loc1 = A_offset + row1 + col1 * A_M_n_rows; - const UWORD A_loc2 = A_offset + row2 + col2 * A_M_n_rows; - const UWORD B_loc1 = B_offset + row1 + col1 * B_M_n_rows; - const UWORD B_loc2 = B_offset + row2 + col2 * B_M_n_rows; - - const eT1 A_val1 = A_mem[A_loc1]; - const eT1 B_val1 = B_mem[B_loc1]; - const eT1 A_val2 = A_mem[A_loc2]; - const eT1 B_val2 = B_mem[B_loc2]; - - if (COOT_FN(coot_isnan_,eT1)(A_val1) || COOT_FN(coot_isnan_,eT1)(B_val1) || COOT_FN(coot_isnan_,eT1)(A_val2) || COOT_FN(coot_isnan_,eT1)(B_val2)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff1 = COOT_FN(coot_absdiff_,eT1)(A_val1, B_val1); - const eT1 absdiff2 = COOT_FN(coot_absdiff_,eT1)(A_val2, B_val2); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff1 <= abs_tol); - aux_mem[tid] &= (absdiff2 <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val1 = max(ET1_ABS(A_val1), ET1_ABS(B_val1)); - const eT1 max_val2 = max(ET1_ABS(A_val2), ET1_ABS(B_val2)); - - if (max_val1 >= (eT1) 1) - { - aux_mem[tid] &= (absdiff1 <= rel_tol * max_val1); - aux_mem[tid] &= (absdiff2 <= rel_tol * max_val2); - } - else - { - aux_mem[tid] &= (absdiff1 / max_val1 <= rel_tol); - aux_mem[tid] &= (absdiff2 / max_val2 <= rel_tol); - } - } - - i += grid_size; - } - if (i < n_elem) - { - const UWORD row = i % n_rows; - const UWORD col = i / n_rows; - - const UWORD A_loc = A_offset + row + col * A_M_n_rows; - const UWORD B_loc = B_offset + row + col * B_M_n_rows; - - const eT1 A_val = A_mem[A_loc]; - const eT1 B_val = B_mem[B_loc]; - - if (COOT_FN(coot_isnan_,eT1)(A_val) || COOT_FN(coot_isnan_,eT1)(B_val)) - { - // Not approximately equal. 
- aux_mem[tid] &= 0; - } - - const eT1 absdiff = COOT_FN(coot_absdiff_,eT1)(A_val, B_val); - - if ((mode & 1) == 1) // absolute - { - aux_mem[tid] &= (absdiff <= abs_tol); - } - - if ((mode & 2) == 2) // relative - { - const eT1 max_val = max(ET1_ABS(A_val), ET1_ABS(B_val)); - - if (max_val >= (eT1) 1) - { - aux_mem[tid] &= (absdiff <= rel_tol * max_val); - } - else - { - aux_mem[tid] &= (absdiff / max_val <= rel_tol); - } - } - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/count_nonzeros.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/count_nonzeros.cl deleted file mode 100644 index e981dec..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/count_nonzeros.cl +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,count_nonzeros)(__global const eT1* A, - const UWORD A_offset, - __global UWORD* thread_counts, - const UWORD n_elem, - __local volatile UWORD* aux_mem) - { - // We want to pass over the memory in A and count the number of nonzero elements. 
- // This will give us a count for each individual thread; we then want to prefix-sum this. - // This kernel is meant to be used as the first part of find(). - - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD local_count = 0; - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - if (A[A_offset + i] != (eT1) 0) - { - ++local_count; - } - if (A[A_offset + i + 1] != (eT1) 0) - { - ++local_count; - } - - i += 2; - } - if (i < end_elem) - { - if (A[A_offset + i] != (eT1) 0) - { - ++local_count; - } - } - - // Aggregate the counts for all threads. - aux_mem[tid] = local_count; - barrier(CLK_LOCAL_MEM_FENCE); - - // Up-sweep total sum into final element. - UWORD offset = 1; - - for (UWORD s = num_threads / 2; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - // Set the last element correctly. - thread_counts[num_threads] = aux_mem[num_threads - 1]; - aux_mem[num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Down-sweep to build prefix sum. 
- for (UWORD s = 1; s <= num_threads / 2; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - thread_counts[tid] = aux_mem[tid]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find.cl deleted file mode 100644 index ab2f0b4..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find.cl +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,find)(__global const eT1* A, - const UWORD A_offset, - __global const UWORD* thread_counts, - __global UWORD* out, - const UWORD out_offset, - const UWORD n_elem) - { - // Our goal is to fill `out` with the indices of nonzero values. - // Since the kernel is multithreaded, each thread will handle a different (contiguous) part of `A`. - // We expect that we already have the starting position for each thread in `thread_counts`. - // (It should have been filled with the `count_nonzeros` kernel.) 
- - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD out_index = thread_counts[tid]; - - UWORD i = start_elem; - - while (i + 1 < end_elem) - { - if (A[A_offset + i] != (eT1) 0) - { - out[out_offset + out_index++] = i; - } - if (A[A_offset + i + 1] != (eT1) 0) - { - out[out_offset + out_index++] = (i + 1); - } - - i += 2; - } - if (i < end_elem) - { - if (A[A_offset + i] != (eT1) 0) - { - out[out_offset + out_index++] = i; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find_first.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find_first.cl deleted file mode 100644 index c0638d1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find_first.cl +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,find_first)(__global const eT1* A, - const UWORD A_offset, - __global const UWORD* thread_counts, - __global UWORD* out, - const UWORD out_offset, - const UWORD k, - const UWORD n_elem) - { - // Our goal is to fill `out` with the first `k` indices of nonzero values. - // It is assumed that `k != 0`; if `k` is `0`, use the `find` kernel instead. - // Since the kernel is multithreaded, each thread will handle a different (contiguous) part of `A`. - // We expect that we already have the starting position for each thread in `thread_counts`. - // (It should have been filled with the `count_nonzeros` kernel.) - - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD out_index = thread_counts[tid]; - - UWORD i = start_elem; - - // We only want to find the first k points. 
- if (out_index < k) - { - while (i + 1 < end_elem) - { - if (A[A_offset + i] != (eT1) 0 && out_index < k) - { - out[out_offset + out_index++] = i; - } - if (A[A_offset + i + 1] != (eT1) 0 && out_index < k) - { - out[out_offset + out_index++] = (i + 1); - } - - i += 2; - } - if (i < end_elem) - { - if (A[A_offset + i] != (eT1) 0 && out_index < k) - { - out[out_offset + out_index++] = i; - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find_last.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find_last.cl deleted file mode 100644 index 3dee54b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/find_last.cl +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,find_last)(__global const eT1* A, - const UWORD A_offset, - __global const UWORD* thread_counts, - __global UWORD* out, - const UWORD out_offset, - const UWORD m, - const UWORD n_elem) - { - // Our goal is to fill `out` with the last `k` indices of nonzero values. - // (Note that to match Armadillo's behavior, we want the last `k` indices in ascending order.) - // Instead of accepting `k` as a parameter, we instead accept `m = nnz - k`. - // This gives us the first index we should be putting an output value in. 
- // It is also assumed that `k != 0`; if `k` is `0`, use the `find` kernel instead. - - // Since the kernel is multithreaded, each thread will handle a different (contiguous) part of `A`. - // We expect that we already have the starting position for each thread in `thread_counts`. - // (It should have been filled with the `count_nonzeros` kernel.) - - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD out_index = thread_counts[tid]; - UWORD last_out_index = thread_counts[tid + 1]; - - UWORD i = start_elem; - - // We only want to find points with index `m` or higher. - if (last_out_index >= m) - { - while (i + 1 < end_elem) - { - if (A[A_offset + i] != (eT1) 0) - { - if (out_index >= m) - { - out[out_offset + out_index - m] = i; - } - - ++out_index; - } - if (A[A_offset + i + 1] != (eT1) 0) - { - if (out_index >= m) - { - out[out_offset + out_index - m] = (i + 1); - } - - ++out_index; - } - - i += 2; - } - - if (i < end_elem) - { - if (A[A_offset + i] != (eT1) 0) - { - if (out_index >= m) - { - out[out_offset + out_index - m] = i; - } - - ++out_index; - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max.cl deleted file mode 100644 index dd60fc3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max.cl +++ /dev/null @@ -1,333 +0,0 @@ -// Copyright 2024 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -void -COOT_FN(PREFIX,index_max_subgroup_reduce_other)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - { - if (data[tid + i] > data[tid]) - { - data[tid] = data[tid + i]; - data_uword[tid] = data_uword[tid + i]; - } - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -COOT_FN(PREFIX,index_max_subgroup_reduce_8)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - if (data[tid + 8] > data[tid]) - { - data[tid] = data[tid + 8]; - data_uword[tid] = data_uword[tid + 8]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 4] > data[tid]) - { - data[tid] = data[tid + 4]; - data_uword[tid] = data_uword[tid + 4]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 2] > data[tid]) - { - data[tid] = data[tid + 2]; - data_uword[tid] = data_uword[tid + 2]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 1] > data[tid]) - { - data[tid] = data[tid + 1]; - data_uword[tid] = data_uword[tid + 1]; - } - } - - - -void -COOT_FN(PREFIX,index_max_subgroup_reduce_16)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - if (data[tid + 16] > data[tid]) - { - data[tid] = data[tid + 16]; - data_uword[tid] = data_uword[tid + 16]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 8] > data[tid]) - { - data[tid] = data[tid + 8]; - data_uword[tid] = data_uword[tid + 8]; - } - 
SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 4] > data[tid]) - { - data[tid] = data[tid + 4]; - data_uword[tid] = data_uword[tid + 4]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 2] > data[tid]) - { - data[tid] = data[tid + 2]; - data_uword[tid] = data_uword[tid + 2]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 1] > data[tid]) - { - data[tid] = data[tid + 1]; - data_uword[tid] = data_uword[tid + 1]; - } - } - - - -void -COOT_FN(PREFIX,index_max_subgroup_reduce_32)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - if (data[tid + 32] > data[tid]) - { - data[tid] = data[tid + 32]; - data_uword[tid] = data_uword[tid + 32]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 16] > data[tid]) - { - data[tid] = data[tid + 16]; - data_uword[tid] = data_uword[tid + 16]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 8] > data[tid]) - { - data[tid] = data[tid + 8]; - data_uword[tid] = data_uword[tid + 8]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 4] > data[tid]) - { - data[tid] = data[tid + 4]; - data_uword[tid] = data_uword[tid + 4]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 2] > data[tid]) - { - data[tid] = data[tid + 2]; - data_uword[tid] = data_uword[tid + 2]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 1] > data[tid]) - { - data[tid] = data[tid + 1]; - data_uword[tid] = data_uword[tid + 1]; - } - } - - - -void -COOT_FN(PREFIX,index_max_subgroup_reduce_64)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - if (data[tid + 64] > data[tid]) - { - data[tid] = data[tid + 64]; - data_uword[tid] = data_uword[tid + 64]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 32] > data[tid]) - { - data[tid] = data[tid + 32]; - data_uword[tid] = data_uword[tid + 32]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 16] > data[tid]) - { - data[tid] = data[tid + 16]; - 
data_uword[tid] = data_uword[tid + 16]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 8] > data[tid]) - { - data[tid] = data[tid + 8]; - data_uword[tid] = data_uword[tid + 8]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 4] > data[tid]) - { - data[tid] = data[tid + 4]; - data_uword[tid] = data_uword[tid + 4]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 2] > data[tid]) - { - data[tid] = data[tid + 2]; - data_uword[tid] = data_uword[tid + 2]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 1] > data[tid]) - { - data[tid] = data[tid + 1]; - data_uword[tid] = data_uword[tid + 1]; - } - } - - - -void COOT_FN(PREFIX,index_max_subgroup_reduce_128)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - if (data[tid + 128] > data[tid]) - { - data[tid] = data[tid + 128]; - data_uword[tid] = data_uword[tid + 128]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 64] > data[tid]) - { - data[tid] = data[tid + 64]; - data_uword[tid] = data_uword[tid + 64]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 32] > data[tid]) - { - data[tid] = data[tid + 32]; - data_uword[tid] = data_uword[tid + 32]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 16] > data[tid]) - { - data[tid] = data[tid + 16]; - data_uword[tid] = data_uword[tid + 16]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 8] > data[tid]) - { - data[tid] = data[tid + 8]; - data_uword[tid] = data_uword[tid + 8]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 4] > data[tid]) - { - data[tid] = data[tid + 4]; - data_uword[tid] = data_uword[tid + 4]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 2] > data[tid]) - { - data[tid] = data[tid + 2]; - data_uword[tid] = data_uword[tid + 2]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 1] > data[tid]) - { - data[tid] = data[tid + 1]; - data_uword[tid] = data_uword[tid + 1]; - } - } - - - 
-__kernel -void -COOT_FN(PREFIX,index_max)(__global const eT1* in_mem, - const UWORD in_mem_offset, - __global const UWORD* in_uword_mem, - const UWORD in_uword_mem_offset, - const UWORD use_uword_mem, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __global UWORD* out_uword_mem, - const UWORD out_uword_mem_offset, - __local volatile eT1* aux_mem, - __local volatile UWORD* aux_uword_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = COOT_FN(coot_type_max_,eT1)(); - aux_uword_mem[tid] = COOT_UWORD_MAX; - - if (i < n_elem) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i] : i); - } - if (i + get_local_size(0) < n_elem) - { - if (in_mem[in_mem_offset + i + get_local_size(0)] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i + get_local_size(0)]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i + get_local_size(0)] : (i + get_local_size(0))); - } - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - if (in_mem[in_mem_offset + i] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i] : i); - } - - if (in_mem[in_mem_offset + i + get_local_size(0)] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i + get_local_size(0)]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i + get_local_size(0)] : (i + get_local_size(0))); - } - - i += grid_size; - } - if (i < n_elem) - { - if (in_mem[in_mem_offset + i] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? 
in_uword_mem[in_uword_mem_offset + i] : i); - } - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - if (aux_mem[tid + s] > aux_mem[tid]) - { - aux_mem[tid] = aux_mem[tid + s]; - aux_uword_mem[tid] = aux_uword_mem[tid + s]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,index_max_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, aux_uword_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - out_uword_mem[out_uword_mem_offset + get_group_id(0)] = aux_uword_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_colwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_colwise.cl deleted file mode 100644 index b72ec16..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_colwise.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,index_max_colwise)(__global UWORD* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - __global const eT1* colptr = &(src[src_offset + col * src_M_n_rows]); - - eT1 best_val = colptr[0]; - UWORD best_index = 0; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 1; i < n_rows; ++i) - { - if (colptr[i] > best_val) - { - best_val = colptr[i]; - best_index = i; - } - } - dest[dest_offset + col * dest_mem_incr] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_cube_col.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_cube_col.cl deleted file mode 100644 index 57f7ac7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_cube_col.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,index_max_cube_col)(__global UWORD* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = get_global_id(0); - const UWORD slice = get_global_id(1); - - if(row < n_rows && slice < n_slices) - { - eT1 best_val = src[src_offset + row + slice * n_rows * n_cols]; - UWORD best_index = 0; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=1; i < n_cols; ++i) - { - if (src[src_offset + (i * n_rows) + row + slice * n_rows * n_cols] > best_val) - { - best_val = src[src_offset + (i * n_rows) + row + slice * n_rows * n_cols]; - best_index = i; - } - } - dest[dest_offset + row + slice * n_rows] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_rowwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_rowwise.cl deleted file mode 100644 index 87a8206..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_rowwise.cl +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,index_max_rowwise)(__global UWORD* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT1 best_val = src[src_offset + row]; - UWORD best_index = 0; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=1; i < n_cols; ++i) - { - if (src[src_offset + (i * src_M_n_rows) + row] > best_val) - { - best_val = src[src_offset + (i * src_M_n_rows) + row]; - best_index = i; - } - } - dest[dest_offset + row * dest_mem_incr] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_small.cl deleted file mode 100644 index a36ba38..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_max_small.cl +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright 2024 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,index_max_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - __global const UWORD* in_uword_mem, - const UWORD in_uword_mem_offset, - const UWORD use_uword_mem, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __global UWORD* out_uword_mem, - const UWORD out_uword_mem_offset, - __local volatile eT1* aux_mem, - __local volatile UWORD* aux_uword_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = COOT_FN(coot_type_max_,eT1)(); - aux_uword_mem[tid] = COOT_UWORD_MAX; - - if (i < n_elem) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i] : i); - } - if (i + get_local_size(0) < n_elem) - { - if (in_mem[in_mem_offset + i + get_local_size(0)] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i + get_local_size(0)]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i + get_local_size(0)] : (i + get_local_size(0))); - } - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - if (in_mem[in_mem_offset + i] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i] : i); - } - - if (in_mem[in_mem_offset + i + get_local_size(0)] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i + get_local_size(0)]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? 
in_uword_mem[in_uword_mem_offset + i + get_local_size(0)] : (i + get_local_size(0))); - } - - i += grid_size; - } - if (i < n_elem) - { - if (in_mem[in_mem_offset + i] > aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i] : i); - } - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - if (aux_mem[tid + s] > aux_mem[tid]) - { - aux_mem[tid] = aux_mem[tid + s]; - aux_uword_mem[tid] = aux_uword_mem[tid + s]; - } - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - out_uword_mem[out_uword_mem_offset + get_group_id(0)] = aux_uword_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min.cl deleted file mode 100644 index baba221..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min.cl +++ /dev/null @@ -1,333 +0,0 @@ -// Copyright 2024 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -void -COOT_FN(PREFIX,index_min_subgroup_reduce_other)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - { - if (data[tid + i] < data[tid]) - { - data[tid] = data[tid + i]; - data_uword[tid] = data_uword[tid + i]; - } - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -COOT_FN(PREFIX,index_min_subgroup_reduce_8)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - if (data[tid + 8] < data[tid]) - { - data[tid] = data[tid + 8]; - data_uword[tid] = data_uword[tid + 8]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 4] < data[tid]) - { - data[tid] = data[tid + 4]; - data_uword[tid] = data_uword[tid + 4]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 2] < data[tid]) - { - data[tid] = data[tid + 2]; - data_uword[tid] = data_uword[tid + 2]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 1] < data[tid]) - { - data[tid] = data[tid + 1]; - data_uword[tid] = data_uword[tid + 1]; - } - } - - - -void -COOT_FN(PREFIX,index_min_subgroup_reduce_16)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - if (data[tid + 16] < data[tid]) - { - data[tid] = data[tid + 16]; - data_uword[tid] = data_uword[tid + 16]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 8] < data[tid]) - { - data[tid] = data[tid + 8]; - data_uword[tid] = data_uword[tid + 8]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 4] < data[tid]) - { - data[tid] = data[tid + 4]; - data_uword[tid] = data_uword[tid + 4]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 2] < data[tid]) - { - data[tid] = data[tid + 2]; - data_uword[tid] = data_uword[tid + 2]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 1] < data[tid]) - { - data[tid] = data[tid + 1]; - data_uword[tid] = 
data_uword[tid + 1]; - } - } - - - -void -COOT_FN(PREFIX,index_min_subgroup_reduce_32)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - if (data[tid + 32] < data[tid]) - { - data[tid] = data[tid + 32]; - data_uword[tid] = data_uword[tid + 32]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 16] < data[tid]) - { - data[tid] = data[tid + 16]; - data_uword[tid] = data_uword[tid + 16]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 8] < data[tid]) - { - data[tid] = data[tid + 8]; - data_uword[tid] = data_uword[tid + 8]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 4] < data[tid]) - { - data[tid] = data[tid + 4]; - data_uword[tid] = data_uword[tid + 4]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 2] < data[tid]) - { - data[tid] = data[tid + 2]; - data_uword[tid] = data_uword[tid + 2]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 1] < data[tid]) - { - data[tid] = data[tid + 1]; - data_uword[tid] = data_uword[tid + 1]; - } - } - - - -void -COOT_FN(PREFIX,index_min_subgroup_reduce_64)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - if (data[tid + 64] < data[tid]) - { - data[tid] = data[tid + 64]; - data_uword[tid] = data_uword[tid + 64]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 32] < data[tid]) - { - data[tid] = data[tid + 32]; - data_uword[tid] = data_uword[tid + 32]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 16] < data[tid]) - { - data[tid] = data[tid + 16]; - data_uword[tid] = data_uword[tid + 16]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 8] < data[tid]) - { - data[tid] = data[tid + 8]; - data_uword[tid] = data_uword[tid + 8]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 4] < data[tid]) - { - data[tid] = data[tid + 4]; - data_uword[tid] = data_uword[tid + 4]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 2] < data[tid]) - { - 
data[tid] = data[tid + 2]; - data_uword[tid] = data_uword[tid + 2]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 1] < data[tid]) - { - data[tid] = data[tid + 1]; - data_uword[tid] = data_uword[tid + 1]; - } - } - - - -void COOT_FN(PREFIX,index_min_subgroup_reduce_128)(__local volatile eT1* data, __local volatile UWORD* data_uword, UWORD tid) - { - if (data[tid + 128] < data[tid]) - { - data[tid] = data[tid + 128]; - data_uword[tid] = data_uword[tid + 128]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 64] < data[tid]) - { - data[tid] = data[tid + 64]; - data_uword[tid] = data_uword[tid + 64]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 32] < data[tid]) - { - data[tid] = data[tid + 32]; - data_uword[tid] = data_uword[tid + 32]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 16] < data[tid]) - { - data[tid] = data[tid + 16]; - data_uword[tid] = data_uword[tid + 16]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 8] < data[tid]) - { - data[tid] = data[tid + 8]; - data_uword[tid] = data_uword[tid + 8]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 4] < data[tid]) - { - data[tid] = data[tid + 4]; - data_uword[tid] = data_uword[tid + 4]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 2] < data[tid]) - { - data[tid] = data[tid + 2]; - data_uword[tid] = data_uword[tid + 2]; - } - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - if (data[tid + 1] < data[tid]) - { - data[tid] = data[tid + 1]; - data_uword[tid] = data_uword[tid + 1]; - } - } - - - -__kernel -void -COOT_FN(PREFIX,index_min)(__global const eT1* in_mem, - const UWORD in_mem_offset, - __global const UWORD* in_uword_mem, - const UWORD in_uword_mem_offset, - const UWORD use_uword_mem, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __global UWORD* out_uword_mem, - const UWORD out_uword_mem_offset, - __local volatile eT1* aux_mem, - __local volatile UWORD* aux_uword_mem) - { - const 
UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = COOT_FN(coot_type_min_,eT1)(); - aux_uword_mem[tid] = (UWORD) 0; - - if (i < n_elem) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i] : i); - } - if (i + get_local_size(0) < n_elem) - { - if (in_mem[in_mem_offset + i + get_local_size(0)] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i + get_local_size(0)]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i + get_local_size(0)] : (i + get_local_size(0))); - } - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - if (in_mem[in_mem_offset + i] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i] : i); - } - - if (in_mem[in_mem_offset + i + get_local_size(0)] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i + get_local_size(0)]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i + get_local_size(0)] : (i + get_local_size(0))); - } - - i += grid_size; - } - if (i < n_elem) - { - if (in_mem[in_mem_offset + i] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? 
in_uword_mem[in_uword_mem_offset + i] : i); - } - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - if (aux_mem[tid + s] < aux_mem[tid]) - { - aux_mem[tid] = aux_mem[tid + s]; - aux_uword_mem[tid] = aux_uword_mem[tid + s]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,index_min_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, aux_uword_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - out_uword_mem[out_uword_mem_offset + get_group_id(0)] = aux_uword_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_colwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_colwise.cl deleted file mode 100644 index d793178..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_colwise.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,index_min_colwise)(__global UWORD* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - __global const eT1* colptr = &(src[src_offset + col * src_M_n_rows]); - - eT1 best_val = colptr[0]; - UWORD best_index = 0; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 1; i < n_rows; ++i) - { - if (colptr[i] < best_val) - { - best_val = colptr[i]; - best_index = i; - } - } - dest[dest_offset + col * dest_mem_incr] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_cube_col.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_cube_col.cl deleted file mode 100644 index 1f93e2d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_cube_col.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,index_min_cube_col)(__global UWORD* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = get_global_id(0); - const UWORD slice = get_global_id(1); - - if(row < n_rows && slice < n_slices) - { - eT1 best_val = src[src_offset + row + slice * n_rows * n_cols]; - UWORD best_index = 0; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=1; i < n_cols; ++i) - { - if (src[src_offset + (i * n_rows) + row + slice * n_rows * n_cols] < best_val) - { - best_val = src[src_offset + (i * n_rows) + row + slice * n_rows * n_cols]; - best_index = i; - } - } - dest[dest_offset + row + slice * n_rows] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_rowwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_rowwise.cl deleted file mode 100644 index 7f0bbe0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_rowwise.cl +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,index_min_rowwise)(__global UWORD* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT1 best_val = src[src_offset + row]; - UWORD best_index = 0; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=1; i < n_cols; ++i) - { - if (src[src_offset + (i * src_M_n_rows) + row] < best_val) - { - best_val = src[src_offset + (i * src_M_n_rows) + row]; - best_index = i; - } - } - dest[dest_offset + row * dest_mem_incr] = best_index; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_small.cl deleted file mode 100644 index f746c58..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/index_min_small.cl +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright 2024 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,index_min_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - __global const UWORD* in_uword_mem, - const UWORD in_uword_mem_offset, - const UWORD use_uword_mem, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __global UWORD* out_uword_mem, - const UWORD out_uword_mem_offset, - __local volatile eT1* aux_mem, - __local volatile UWORD* aux_uword_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = COOT_FN(coot_type_min_,eT1)(); - aux_uword_mem[tid] = (UWORD) 0; - - if (i < n_elem) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i] : i); - } - if (i + get_local_size(0) < n_elem) - { - if (in_mem[in_mem_offset + i + get_local_size(0)] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i + get_local_size(0)]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i + get_local_size(0)] : (i + get_local_size(0))); - } - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - if (in_mem[in_mem_offset + i] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i] : i); - } - - if (in_mem[in_mem_offset + i + get_local_size(0)] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i + get_local_size(0)]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? 
in_uword_mem[in_uword_mem_offset + i + get_local_size(0)] : (i + get_local_size(0))); - } - - i += grid_size; - } - if (i < n_elem) - { - if (in_mem[in_mem_offset + i] < aux_mem[tid]) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - aux_uword_mem[tid] = ((use_uword_mem == 1) ? in_uword_mem[in_uword_mem_offset + i] : i); - } - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - if (aux_mem[tid + s] < aux_mem[tid]) - { - aux_mem[tid] = aux_mem[tid + s]; - aux_uword_mem[tid] = aux_uword_mem[tid + s]; - } - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - out_uword_mem[out_uword_mem_offset + get_group_id(0)] = aux_uword_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_philox_randn.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_philox_randn.cl deleted file mode 100644 index dbf60a3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_philox_randn.cl +++ /dev/null @@ -1,223 +0,0 @@ -// Copyright 2021 Marcus Edel (http://www.kurg.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -// philox_4x32_10, specific to generating u32s - - - -inline -void -philox_4x32_10_single_round(uint* counter, uint* key) - { - uint hi0 = mul_hi((uint) 0xD2511F53, counter[0]); - uint hi1 = mul_hi((uint) 0xCD9E8D57, counter[2]); - uint lo0 = 0xD2511F53 * counter[0]; - uint lo1 = 0xCD9E8D57 * counter[2]; - - counter[0] = hi1 ^ counter[1] ^ key[0]; - counter[1] = lo1; - counter[2] = hi0 ^ counter[3] ^ key[1]; - counter[3] = lo0; - } - - - -inline -void -philox_4x32_10_p_step(uint* philox_state) - { - if (++philox_state[0]) - return; - if (++philox_state[1]) - return; - if (++philox_state[2]) - return; - ++philox_state[3]; - } - - - -inline -void -philox_4x32_10_rng(uint* philox_state) - { - // 4 uint counter: philox_state[0:3] - // 2 uint key: philox_state[4:5] - - // apply P (increment state) - philox_4x32_10_p_step(philox_state); - - // apply S-box 10 times - for (UWORD i = 0; i < 9; ++i) - { - philox_4x32_10_single_round(philox_state, philox_state + 4); - philox_state[4] += 0x9E3779B9; - philox_state[5] += 0xBB67AE85; - } - philox_4x32_10_single_round(philox_state, philox_state + 4); - } - - -// -// Convenience functions to get random numbers of 32-bit or 64-bit width out of the Philox 4x32-10 generator. -// The aux memory is only used for ulongs (64-bit width). -// - -inline -void -philox_4x32_10_rng_uchar(uint* philox_state, uint* aux) - { - // This generates more than we need, but that's okay. - philox_4x32_10_rng(philox_state); - } - - - -inline -void -philox_4x32_10_rng_ushort(uint* philox_state, uint* aux) - { - // This generates more than we need, but that's okay. - philox_4x32_10_rng(philox_state); - } - - - -inline -void -philox_4x32_10_rng_uint(uint* philox_state, uint* aux) - { - philox_4x32_10_rng(philox_state); - } - - - -inline -void -philox_4x32_10_rng_ulong(uint* philox_state, uint* aux) - { - philox_4x32_10_rng(philox_state); - // Save 4x32 bits of random data. 
- aux[0] = philox_state[0]; - aux[1] = philox_state[1]; - aux[2] = philox_state[2]; - aux[3] = philox_state[3]; - // Generate the next 4x32 bits of random data. - philox_4x32_10_rng(philox_state); - } - - - -inline -uchar -philox_get_elem_uchar(uint* philox_state, uint* aux, const UWORD i) - { - return ((uchar*) philox_state)[i]; - } - - - -inline -ushort -philox_get_elem_ushort(uint* philox_state, uint* aux, const UWORD i) - { - return ((ushort*) philox_state)[i]; - } - - - -inline -uint -philox_get_elem_uint(uint* philox_state, uint* aux, const UWORD i) - { - return philox_state[i]; - } - - - -inline -ulong -philox_get_elem_ulong(uint* philox_state, uint* aux, const UWORD i) - { - if (i <= 1) - return ((ulong*) philox_state)[i]; - else - return ((ulong*) aux)[i - 2]; - } - - - -__kernel -void -COOT_FN(PREFIX,inplace_philox_randn)(__global eT1* mem, - const UWORD mem_offset, - __global uint* philox_state, - const UWORD n, - const fp_eT1 mu, - const fp_eT1 sd) - { - const UWORD tid = get_global_id(0); - const UWORD num_threads = get_global_size(0); - UWORD i = tid; - - // Copy RNG state to local memory. - uint local_philox_state[6]; - local_philox_state[0] = philox_state[6 * tid ]; - local_philox_state[1] = philox_state[6 * tid + 1]; - local_philox_state[2] = philox_state[6 * tid + 2]; - local_philox_state[3] = philox_state[6 * tid + 3]; - local_philox_state[4] = philox_state[6 * tid + 4]; - local_philox_state[5] = philox_state[6 * tid + 5]; - - // Only used if we are generating 64-bit types. - uint aux_mem[4]; - - while (i < n) - { - COOT_FN(philox_4x32_10_rng_,uint_eT1)(local_philox_state, aux_mem); - - // Perform the Box-Muller transformation to transform [0, 1] samples to N(0, 1). 
- fp_eT1 sqrt_inner = -2 * log(COOT_FN(philox_get_elem_,uint_eT1)(local_philox_state, aux_mem, 0) / (fp_eT1) COOT_FN(coot_type_max_,uint_eT1)()); - if (isnan(sqrt_inner) || isinf(sqrt_inner)) - sqrt_inner = (fp_eT1) 0; - fp_eT1 trig_inner = 2 * M_PI * (COOT_FN(philox_get_elem_,uint_eT1)(local_philox_state, aux_mem, 1) / (fp_eT1) COOT_FN(coot_type_max_,uint_eT1)()); - - mem[mem_offset + i] = (eT1) ((sqrt(sqrt_inner) * cos(trig_inner)) * sd + mu); - i += num_threads; - if (i < n) - mem[mem_offset + i] = (eT1) ((sqrt(sqrt_inner) * sin(trig_inner)) * sd + mu); - i += num_threads; - - sqrt_inner = -2 * log(COOT_FN(philox_get_elem_,uint_eT1)(local_philox_state, aux_mem, 2) / (fp_eT1) COOT_FN(coot_type_max_,uint_eT1)()); - if (isnan(sqrt_inner) || isinf(sqrt_inner)) - sqrt_inner = (fp_eT1) 0; - trig_inner = 2 * M_PI * (COOT_FN(philox_get_elem_,uint_eT1)(local_philox_state, aux_mem, 3) / (fp_eT1) COOT_FN(coot_type_max_,uint_eT1)()); - - if (i < n) - mem[mem_offset + i] = (eT1) ((sqrt(sqrt_inner) * cos(trig_inner)) * sd + mu); - i += num_threads; - if (i < n) - mem[mem_offset + i] = (eT1) ((sqrt(sqrt_inner) * sin(trig_inner)) * sd + mu); - } - - // Restore RNG state. 
- philox_state[6 * tid ] = local_philox_state[0]; - philox_state[6 * tid + 1] = local_philox_state[1]; - philox_state[6 * tid + 2] = local_philox_state[2]; - philox_state[6 * tid + 3] = local_philox_state[3]; - philox_state[6 * tid + 4] = local_philox_state[4]; - philox_state[6 * tid + 5] = local_philox_state[5]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_set_eye.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_set_eye.cl deleted file mode 100644 index 7a17dec..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_set_eye.cl +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_set_eye)(__global eT1* out, - const UWORD out_offset, - const UWORD n_rows, - const UWORD n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - if( (row < n_rows) && (col < n_cols) ) - { - const UWORD offset = row + col*n_rows + out_offset; - out[offset] = (row == col) ? 
(eT1)(1) : (eT1)(0); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow32_randi.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow32_randi.cl deleted file mode 100644 index 7516946..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow32_randi.cl +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// See algorithm "xorwow" from page 5 of "Xorshift RNGs" by George Marsaglia. -inline -uint -xorwow32_rng_uint(uint* xorwow_state) - { - // xorwow_state[0] through xorwow_state[4] represent the 5 state integers, - // and xorwow_state[5] holds the counter. - uint t = xorwow_state[4] ^ (xorwow_state[4] >> 2); - - xorwow_state[4] = xorwow_state[3]; - xorwow_state[3] = xorwow_state[2]; - xorwow_state[2] = xorwow_state[1]; - xorwow_state[1] = xorwow_state[0]; - xorwow_state[0] ^= (xorwow_state[0] << 4) ^ (t ^ (t << 1)); - - // Following Saito and Matsumoto (2012), we use a larger constant for d so that the higher bits flip more often. - // We ignore their conclusion that XORWOW has problems (it's fast!). 
- xorwow_state[5] += 268183997; - return xorwow_state[0] + xorwow_state[5]; - } - - - -__kernel -void -COOT_FN(PREFIX,inplace_xorwow32_randi)(__global eT1* mem, - const UWORD mem_offset, - __global uint* xorwow_state, - const UWORD n, - const eT1 lo, - const uint_eT1 range, - const char needs_modulo) - { - const UWORD tid = get_global_id(0); - const UWORD num_threads = get_global_size(0); - UWORD i = tid; - - // Copy RNG state to local memory. - uint local_xorwow_state[6]; - local_xorwow_state[0] = xorwow_state[6 * tid ]; - local_xorwow_state[1] = xorwow_state[6 * tid + 1]; - local_xorwow_state[2] = xorwow_state[6 * tid + 2]; - local_xorwow_state[3] = xorwow_state[6 * tid + 3]; - local_xorwow_state[4] = xorwow_state[6 * tid + 4]; - local_xorwow_state[5] = xorwow_state[6 * tid + 5]; - - while (i < n) - { - // This generates a number in [0, uint_eT1_max]. - uint_eT1 t = (uint_eT1) xorwow32_rng_uint(local_xorwow_state); - // Modulo down to the range [0, (hi - lo)], if needed. - if (needs_modulo == 1) - t %= (range + 1); - // Cast back to the correct type, and add lo to get the correct range. - mem[mem_offset + i] = ((eT1) t) + lo; - i += num_threads; - } - - // Return updated RNG state to global memory. 
- xorwow_state[6 * tid ] = local_xorwow_state[0]; - xorwow_state[6 * tid + 1] = local_xorwow_state[1]; - xorwow_state[6 * tid + 2] = local_xorwow_state[2]; - xorwow_state[6 * tid + 3] = local_xorwow_state[3]; - xorwow_state[6 * tid + 4] = local_xorwow_state[4]; - xorwow_state[6 * tid + 5] = local_xorwow_state[5]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow32_randu.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow32_randu.cl deleted file mode 100644 index 24192b2..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow32_randu.cl +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -//~ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -//~ -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// See algorithm "xorwow" from page 5 of "Xorshift RNGs" by George Marsaglia. -inline -uint -xorwow32_rng_uint(uint* xorwow_state) - { - // xorwow_state[0] through xorwow_state[4] represent the 5 state integers, - // and xorwow_state[5] holds the counter. - uint t = xorwow_state[4] ^ (xorwow_state[4] >> 2); - - xorwow_state[4] = xorwow_state[3]; - xorwow_state[3] = xorwow_state[2]; - xorwow_state[2] = xorwow_state[1]; - xorwow_state[1] = xorwow_state[0]; - xorwow_state[0] ^= (xorwow_state[0] << 4) ^ (t ^ (t << 1)); - - // Following Saito and Matsumoto (2012), we use a larger constant for d so that the higher bits flip more often. 
- // We ignore their conclusion that XORWOW has problems (it's fast!). - xorwow_state[5] += 268183997; - return xorwow_state[0] + xorwow_state[5]; - } - - - -__kernel -void -COOT_FN(PREFIX,inplace_xorwow32_randu)(__global eT1* mem, - const UWORD mem_offset, - __global uint* xorwow_state, - const UWORD n) - { - const UWORD tid = get_global_id(0); - const UWORD num_threads = get_global_size(0); - UWORD i = tid; - - // Copy RNG state to local memory. - uint local_xorwow_state[6]; - local_xorwow_state[0] = xorwow_state[6 * tid ]; - local_xorwow_state[1] = xorwow_state[6 * tid + 1]; - local_xorwow_state[2] = xorwow_state[6 * tid + 2]; - local_xorwow_state[3] = xorwow_state[6 * tid + 3]; - local_xorwow_state[4] = xorwow_state[6 * tid + 4]; - local_xorwow_state[5] = xorwow_state[6 * tid + 5]; - - while (i < n) - { - uint_eT1 t = (uint_eT1) xorwow32_rng_uint(local_xorwow_state); - // Now normalize to [0, 1] and compute the output. - mem[mem_offset + i] = (eT1) (t / (fp_eT1) COOT_FN(coot_type_max_,uint_eT1)()); - i += num_threads; - } - - // Return updated RNG state to global memory. - xorwow_state[6 * tid ] = local_xorwow_state[0]; - xorwow_state[6 * tid + 1] = local_xorwow_state[1]; - xorwow_state[6 * tid + 2] = local_xorwow_state[2]; - xorwow_state[6 * tid + 3] = local_xorwow_state[3]; - xorwow_state[6 * tid + 4] = local_xorwow_state[4]; - xorwow_state[6 * tid + 5] = local_xorwow_state[5]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow64_randi.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow64_randi.cl deleted file mode 100644 index 03d3817..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow64_randi.cl +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// See algorithm "xorwow" from page 5 of "Xorshift RNGs" by George Marsaglia. -inline -ulong -xorwow64_rng_ulong(ulong* xorwow_state) - { - // xorwow_state[0] through xorwow_state[4] represent the 5 state integers, - // and xorwow_state[5] holds the counter. - ulong t = xorwow_state[4] ^ (xorwow_state[4] >> 2); - - xorwow_state[4] = xorwow_state[3]; - xorwow_state[3] = xorwow_state[2]; - xorwow_state[2] = xorwow_state[1]; - xorwow_state[1] = xorwow_state[0]; - xorwow_state[0] ^= (xorwow_state[0] << 4) ^ (t ^ (t << 1)); - - // Following Saito and Matsumoto (2012), we use a larger constant for d so that the higher bits flip more often. - // We ignore their conclusion that XORWOW has problems (it's fast!). - xorwow_state[5] += 2274084621458550325; - return xorwow_state[0] + xorwow_state[5]; - } - - - -__kernel -void -COOT_FN(PREFIX,inplace_xorwow64_randi)(__global eT1* mem, - const UWORD mem_offset, - __global ulong* xorwow_state, - const UWORD n, - const eT1 lo, - const uint_eT1 range, - const char needs_modulo) - { - const UWORD tid = get_global_id(0); - const UWORD num_threads = get_global_size(0); - UWORD i = tid; - - // Copy RNG state to local memory. 
- ulong local_xorwow_state[6]; - local_xorwow_state[0] = xorwow_state[6 * tid ]; - local_xorwow_state[1] = xorwow_state[6 * tid + 1]; - local_xorwow_state[2] = xorwow_state[6 * tid + 2]; - local_xorwow_state[3] = xorwow_state[6 * tid + 3]; - local_xorwow_state[4] = xorwow_state[6 * tid + 4]; - local_xorwow_state[5] = xorwow_state[6 * tid + 5]; - - while (i < n) - { - // This generates a number in [0, uint_eT1_max]. - uint_eT1 t = (uint_eT1) xorwow64_rng_ulong(local_xorwow_state); - // Modulo down to the range [0, (hi - lo)], if needed. - if (needs_modulo == 1) - t %= (range + 1); - // Cast back to the correct type, and add lo to get the correct range. - mem[mem_offset + i] = ((eT1) t) + lo; - i += num_threads; - } - - // Return updated RNG state to global memory. - xorwow_state[6 * tid ] = local_xorwow_state[0]; - xorwow_state[6 * tid + 1] = local_xorwow_state[1]; - xorwow_state[6 * tid + 2] = local_xorwow_state[2]; - xorwow_state[6 * tid + 3] = local_xorwow_state[3]; - xorwow_state[6 * tid + 4] = local_xorwow_state[4]; - xorwow_state[6 * tid + 5] = local_xorwow_state[5]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow64_randu.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow64_randu.cl deleted file mode 100644 index 40450ae..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/inplace_xorwow64_randu.cl +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -//~ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -//~ -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// See algorithm "xorwow" from page 5 of "Xorshift RNGs" by George Marsaglia. -inline -ulong -xorwow64_rng_ulong(ulong* xorwow_state) - { - // xorwow_state[0] through xorwow_state[4] represent the 5 state integers, - // and xorwow_state[5] holds the counter. - ulong t = xorwow_state[4] ^ (xorwow_state[4] >> 2); - - xorwow_state[4] = xorwow_state[3]; - xorwow_state[3] = xorwow_state[2]; - xorwow_state[2] = xorwow_state[1]; - xorwow_state[1] = xorwow_state[0]; - xorwow_state[0] ^= (xorwow_state[0] << 4) ^ (t ^ (t << 1)); - - // Following Saito and Matsumoto (2012), we use a larger constant for d so that the higher bits flip more often. - // We ignore their conclusion that XORWOW has problems (it's fast!). - xorwow_state[5] += 2274084621458550325; - return xorwow_state[0] + xorwow_state[5]; - } - - - -__kernel -void -COOT_FN(PREFIX,inplace_xorwow64_randu)(__global eT1* mem, - const UWORD mem_offset, - __global ulong* xorwow_state, - const UWORD n) - { - const UWORD tid = get_global_id(0); - const UWORD num_threads = get_global_size(0); - UWORD i = tid; - - // Copy RNG state to local memory. - ulong local_xorwow_state[6]; - local_xorwow_state[0] = xorwow_state[6 * tid ]; - local_xorwow_state[1] = xorwow_state[6 * tid + 1]; - local_xorwow_state[2] = xorwow_state[6 * tid + 2]; - local_xorwow_state[3] = xorwow_state[6 * tid + 3]; - local_xorwow_state[4] = xorwow_state[6 * tid + 4]; - local_xorwow_state[5] = xorwow_state[6 * tid + 5]; - - while (i < n) - { - uint_eT1 t = (uint_eT1) xorwow64_rng_ulong(local_xorwow_state); - // Now normalize to [0, 1] and compute the output. - mem[mem_offset + i] = (eT1) (t / (fp_eT1) COOT_FN(coot_type_max_,uint_eT1)()); - i += num_threads; - } - - // Return updated RNG state to global memory. 
- xorwow_state[6 * tid ] = local_xorwow_state[0]; - xorwow_state[6 * tid + 1] = local_xorwow_state[1]; - xorwow_state[6 * tid + 2] = local_xorwow_state[2]; - xorwow_state[6 * tid + 3] = local_xorwow_state[3]; - xorwow_state[6 * tid + 4] = local_xorwow_state[4]; - xorwow_state[6 * tid + 5] = local_xorwow_state[5]; - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/linspace.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/linspace.cl deleted file mode 100644 index 884218e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/linspace.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,linspace)(__global eT1* out_mem, - const UWORD out_mem_offset, - const UWORD mem_incr, - const eT1 start, - const eT1 end, - const eT1 step, - const UWORD num) - { - const UWORD idx = get_global_id(0); - if (idx < num) - { - out_mem[out_mem_offset + idx * mem_incr] = (eT1) (start + step * idx); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/logspace.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/logspace.cl deleted file mode 100644 index ec65f5f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/logspace.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,logspace)(__global eT1* out_mem, - const UWORD out_mem_offset, - const UWORD mem_incr, - const eT1 start, - const eT1 end, - const eT1 step, - const UWORD num) - { - const UWORD idx = get_global_id(0); - if (idx < num) - { - out_mem[out_mem_offset + idx * mem_incr] = (eT1) pow((fp_eT1) 10, (fp_eT1) (start + step * idx)); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/ltri_set_zero.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/ltri_set_zero.cl deleted file mode 100644 index 1890e0b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/ltri_set_zero.cl +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,ltri_set_zero)(__global eT1* out, - const UWORD out_offset, - const UWORD n_rows, - const UWORD n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD index = row + n_rows * col; - if( (row < n_rows) && (col < n_cols) && (row > col) ) - { - out[index + out_offset] = (eT1)(0); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max.cl deleted file mode 100644 index e500ad7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max.cl +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2021 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,max)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. 
- aux_mem[tid] = COOT_FN(coot_type_max_,eT1)(); - - if (i < n_elem) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - } - if (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[in_mem_offset + i + get_local_size(0)]); - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[in_mem_offset + i]); - aux_mem[tid] = max(aux_mem[tid], in_mem[in_mem_offset + i + get_local_size(0)]); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[in_mem_offset + i]); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = max(aux_mem[tid], aux_mem[tid + s]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,max_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_abs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_abs.cl deleted file mode 100644 index 28a667e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_abs.cl +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2021 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,max_abs)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = COOT_FN(coot_type_max_,eT1)(); - - if (i < n_elem) - { - aux_mem[tid] = ET1_ABS(in_mem[in_mem_offset + i]); - } - if (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], (eT1) ET1_ABS(in_mem[in_mem_offset + i + get_local_size(0)])); - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], (eT1) ET1_ABS(in_mem[in_mem_offset + i])); - aux_mem[tid] = max(aux_mem[tid], (eT1) ET1_ABS(in_mem[in_mem_offset + i + get_local_size(0)])); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], (eT1) ET1_ABS(in_mem[in_mem_offset + i])); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = max(aux_mem[tid], aux_mem[tid + s]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,max_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_abs_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_abs_small.cl deleted file mode 100644 index c7a531c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_abs_small.cl +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2021 Ryan Curtin (https://www.ratml.org/) -// -// 
Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,max_abs_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. 
- aux_mem[tid] = COOT_FN(coot_type_max_,eT1)(); - - if (i < n_elem) - { - aux_mem[tid] = ET1_ABS(in_mem[in_mem_offset + i]); - } - if (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], (eT1) ET1_ABS(in_mem[in_mem_offset + i + get_local_size(0)])); - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], (eT1) ET1_ABS(in_mem[in_mem_offset + i])); - aux_mem[tid] = max(aux_mem[tid], (eT1) ET1_ABS(in_mem[in_mem_offset + i + get_local_size(0)])); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], (eT1) ET1_ABS(in_mem[in_mem_offset + i])); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] = max(aux_mem[tid], aux_mem[tid + s]); - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_small.cl deleted file mode 100644 index b4872e1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/max_small.cl +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2021 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,max_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = COOT_FN(coot_type_max_,eT1)(); - - if (i < n_elem) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - } - if (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[in_mem_offset + i + get_local_size(0)]); - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[in_mem_offset + i]); - aux_mem[tid] = max(aux_mem[tid], in_mem[in_mem_offset + i + get_local_size(0)]); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = max(aux_mem[tid], in_mem[in_mem_offset + i]); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] = max(aux_mem[tid], aux_mem[tid + s]); - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/min.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/min.cl deleted file mode 100644 index 156e598..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/min.cl +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2021 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,min)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = COOT_FN(coot_type_min_,eT1)(); - - if (i < n_elem) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - } - if (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[in_mem_offset + i + get_local_size(0)]); - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[in_mem_offset + i]); - aux_mem[tid] = min(aux_mem[tid], in_mem[in_mem_offset + i + get_local_size(0)]); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[in_mem_offset + i]); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = min(aux_mem[tid], aux_mem[tid + s]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,min_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git 
a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/min_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/min_small.cl deleted file mode 100644 index 91a6208..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/min_small.cl +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2021 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,min_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. 
- aux_mem[tid] = COOT_FN(coot_type_min_,eT1)(); - - if (i < n_elem) - { - aux_mem[tid] = in_mem[in_mem_offset + i]; - } - if (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[in_mem_offset + i + get_local_size(0)]); - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[in_mem_offset + i]); - aux_mem[tid] = min(aux_mem[tid], in_mem[in_mem_offset + i + get_local_size(0)]); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], in_mem[in_mem_offset + i]); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] = min(aux_mem[tid], aux_mem[tid + s]); - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_colwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_colwise.cl deleted file mode 100644 index 391f7f7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_colwise.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// multiply each column in `in` by the corresponding value in `A` -__kernel -void -COOT_FN(PREFIX,mul_colwise)(__global eT1* out, - const UWORD out_offset, - __global const eT1* A, // expected to have length n_cols - const UWORD A_offset, - const UWORD A_incr, - __global const eT1* in, - const UWORD in_offset, - const eT1 alpha, - const UWORD n_rows, - const UWORD n_cols, - const UWORD in_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - const UWORD in_elem_offset = col * in_M_n_rows; - const UWORD out_elem_offset = col * n_rows; - const eT1 val = alpha * A[A_offset + col * A_incr]; - - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < n_rows; ++i) - { - out[out_offset + i + out_elem_offset] = val * in[in_offset + i + in_elem_offset]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_colwise_trans.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_colwise_trans.cl deleted file mode 100644 index 26a4713..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_colwise_trans.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// multiply each column in `trans(in)` by the corresponding value in `A` -__kernel -void -COOT_FN(PREFIX,mul_colwise_trans)(__global eT1* out, - const UWORD out_offset, - __global const eT1* A, // expected to have length n_cols - const UWORD A_offset, - const UWORD A_incr, - __global const eT1* in, - const UWORD in_offset, - const eT1 alpha, - const UWORD n_rows, - const UWORD n_cols, - const UWORD in_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - const eT1 val = alpha * A[A_offset + col * A_incr]; - - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < n_rows; ++i) - { - const UWORD in_elem_offset = col + i * in_M_n_rows; - const UWORD out_elem_offset = col * n_rows + i; - out[out_offset + out_elem_offset] = val * in[in_offset + in_elem_offset]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_rowwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_rowwise.cl deleted file mode 100644 index c3d6ba7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_rowwise.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// multiply each row in `in` by the corresponding value in `A` -__kernel -void -COOT_FN(PREFIX,mul_rowwise)(__global eT1* out, - const UWORD out_offset, - __global const eT1* A, // expected to have length n_rows - const UWORD A_offset, - const UWORD A_incr, - __global const eT1* in, - const UWORD in_offset, - const eT1 alpha, - const UWORD n_rows, - const UWORD n_cols, - const UWORD in_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - const eT1 val = alpha * A[A_offset + row * A_incr]; - - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < n_cols; ++i) - { - const UWORD in_elem_offset = i * in_M_n_rows + row; - const UWORD out_elem_offset = i * in_M_n_rows + row; - out[out_offset + out_elem_offset] = val * in[in_offset + in_elem_offset]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_rowwise_trans.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_rowwise_trans.cl deleted file mode 100644 index 83371d9..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/mul_rowwise_trans.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// multiply each row in `trans(in)` by the corresponding value in `A` -__kernel -void -COOT_FN(PREFIX,mul_rowwise_trans)(__global eT1* out, - const UWORD out_offset, - __global const eT1* A, // expected to have length n_rows - const UWORD A_offset, - const UWORD A_incr, - __global const eT1* in, - const UWORD in_offset, - const eT1 alpha, - const UWORD n_rows, - const UWORD n_cols, - const UWORD in_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - const eT1 val = alpha * A[A_offset + row * A_incr]; - - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < n_cols; ++i) - { - const UWORD in_elem_offset = i + row * in_M_n_rows; - const UWORD out_elem_offset = i * n_rows + row; - out[out_offset + out_elem_offset] = val * in[in_offset + in_elem_offset]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/prod.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/prod.cl deleted file mode 100644 index e39bbf4..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/prod.cl +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,prod)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 1; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] *= in_mem[in_mem_offset + i] * in_mem[in_mem_offset + i + get_local_size(0)]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] *= in_mem[in_mem_offset + i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] *= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,prod_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/prod_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/prod_small.cl deleted file mode 100644 index 5a75ee8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/prod_small.cl +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,prod_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 1; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] *= in_mem[in_mem_offset + i] * in_mem[in_mem_offset + i + get_local_size(0)]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] *= in_mem[in_mem_offset + i]; - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] *= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_asc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_asc.cl deleted file mode 100644 index bd51347..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_asc.cl +++ /dev/null @@ -1,280 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_asc)(__global eT1* A, - const UWORD A_offset, - __global eT1* tmp_mem, - const UWORD n_elem, - __local volatile UWORD* aux_mem) - { - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD local_counts[2]; - - __global eT1* unsorted_memptr = A + A_offset; - __global eT1* sorted_memptr = tmp_mem; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = COOT_FN(coot_is_signed_,eT1)() ? 8 * sizeof(eT1) - 1 : 8 * sizeof(eT1); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. - __global uint_eT1* memptr = (__global uint_eT1*) unsorted_memptr; - - local_counts[0] = 0; // holds the count of elements with bit value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> b]; - ++local_counts[(memptr[i + 1] & mask) >> b]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> b]; - } - // Step 2: aggregate the counts for all threads. - aux_mem[tid ] = local_counts[0]; - aux_mem[tid + num_threads] = local_counts[1]; - barrier(CLK_LOCAL_MEM_FENCE); - - // At the end of this, `tid` should be assigned two sections of memory to put its 0 and 1 points in. 
- // aux_mem[tid] should hold the first place to put a 0 point - // aux_mem[tid + num_threads] should hold the first place to put a 1 point - // More specifically: - // aux_mem[tid] := sum_{i = 0}^{tid - 1} aux_mem[i] - // aux_mem[tid + num_threads] := sum_{i = 0}^{tid + num_threads - 1} aux_mem[i] - // which means that this is just a prefix-sum operation on the full length of aux_mem. - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Step 3: move points into the correct place. - local_counts[0] = aux_mem[tid ]; // contains the first place we should put a 0 point - local_counts[1] = aux_mem[tid + num_threads]; // contains the first place we should put a 1 point - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b)]++; - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index] = val; - } - - // Now swap pointers. 
- __global eT1* tmp = unsorted_memptr; - unsorted_memptr = sorted_memptr; - sorted_memptr = tmp; - - barrier(CLK_GLOBAL_MEM_FENCE); - } - - // If the type is integral, we're now done---we don't have to handle a sign bit differently. - if (!COOT_FN(coot_is_signed_,eT1)()) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. - __global uint_eT1* memptr = (__global uint_eT1*) unsorted_memptr; - local_counts[0] = 0; - local_counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> last_bit]; - ++local_counts[(memptr[i + 1] & mask) >> last_bit]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> last_bit]; - } - - // local_counts[0] now holds the number of positive points; local_counts[1] holds the number of negative points - // swap these and perform a prefix sum, as with the rest of the bits - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - barrier(CLK_LOCAL_MEM_FENCE); - - // Up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Down-sweep to build prefix sum. 
- for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Step 3: move points into the correct place. - // This is different for integral and floating point types. - if (COOT_FN(coot_is_fp_,eT1)()) - { - // Floating point implementation: - // For negative values, we have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. - local_counts[0] = aux_mem[tid + num_threads]; // contains the first place we should put a 0 point (we will move upwards) - local_counts[1] = aux_mem[num_threads] - aux_mem[tid]; // contains the first place we should put a 1 point (we will move downwards) - local_counts[1] = (local_counts[1] == 0) ? 0 : local_counts[1] - 1; // avoid underflow - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]; - const int offset1 = (bit_val1 == 1) ? -1 : 1; - local_counts[bit_val1] += offset1; // decrements for negative values, increments for positive values - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]; - const int offset2 = (bit_val2 == 1) ? 
-1 : 1; - local_counts[bit_val2] += offset2; // decrements for negative values, increments for positive values - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]; - const int offset = (bit_val == 1) ? -1 : 1; - local_counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_memptr[out_index] = val; - } - } - else - { - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the negative values ahead of the positive values. - local_counts[0] = aux_mem[tid + num_threads]; - local_counts[1] = aux_mem[tid]; - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]++; - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]++; - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]++; - sorted_memptr[out_index] = val; - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. 
- } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_colwise_asc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_colwise_asc.cl deleted file mode 100644 index 4dd8b55..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_colwise_asc.cl +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_colwise_asc)(__global eT1* A, - const UWORD A_offset, - __global eT1* tmp_mem, - const UWORD A_n_rows, - const UWORD A_n_cols, - const UWORD A_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < A_n_cols) - { - __global eT1* unsorted_colptr = &A[A_offset + col * A_M_n_rows]; - __global eT1* sorted_colptr = &tmp_mem[ col * A_n_rows ]; - - UWORD counts[2]; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = COOT_FN(coot_is_signed_,eT1)() ? (8 * sizeof(eT1) - 1) : (8 * sizeof(eT1)); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Since we are sorting bitwise, we should treat the data as unsigned integers to make bitwise operations easy. 
- __global uint_eT1* colptr = (__global uint_eT1*) unsorted_colptr; - - counts[0] = 0; // holds the count of points with bit value 0 - counts[1] = 0; // holds the count of points with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - for (UWORD i = 0; i < A_n_rows; ++i) - { - ++counts[((colptr[i] & mask) >> b)]; - } - - counts[1] = counts[0]; // now holds the offset to put the next value at - counts[0] = 0; - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD out_index = counts[((colptr[i] & mask) >> b)]++; - sorted_colptr[out_index] = val; - } - - // swap pointers (unsorted is now sorted) - __global eT1* tmp = unsorted_colptr; - unsorted_colptr = sorted_colptr; - sorted_colptr = tmp; - } - - // If the type is unsigned, we're now done---we don't have to handle a sign bit differently. - if (!COOT_FN(coot_is_signed_,eT1)()) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. - __global uint_eT1* colptr = (__global uint_eT1*) unsorted_colptr; - counts[0] = 0; - counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - for (UWORD i = 0; i < A_n_rows; ++i) - { - ++counts[((colptr[i] & mask) >> last_bit)]; - } - // counts[0] now holds the number of positive points; counts[1] holds the number of negative points - - // This is different for integral and floating point types. - if (COOT_FN(coot_is_fp_,eT1)()) - { - // Floating point implementation: - // For negative values, we have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. 
- counts[0] = counts[1]; // now holds the offset to put the next positive value at - counts[1] = counts[0] - 1; // now holds the offset to put the next negative value at (we move backwards) - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD bit_val = ((colptr[i] & mask) >> last_bit); - const UWORD out_index = counts[bit_val]; - const int offset = (bit_val == 1) ? -1 : 1; - counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_colptr[out_index] = val; - } - } - else - { - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the negative values ahead of the positive values. - counts[0] = counts[1]; // now holds the offset to put the next positive value at - counts[1] = 0; // now holds the offset to put the next negative value at - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD bit_val = ((colptr[i] & mask) >> last_bit); - const UWORD out_index = counts[bit_val]++; - sorted_colptr[out_index] = val; - } - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_colwise_desc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_colwise_desc.cl deleted file mode 100644 index e84408d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_colwise_desc.cl +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_colwise_desc)(__global eT1* A, - const UWORD A_offset, - __global eT1* tmp_mem, - const UWORD A_n_rows, - const UWORD A_n_cols, - const UWORD A_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < A_n_cols) - { - __global eT1* unsorted_colptr = &A[A_offset + col * A_M_n_rows]; - __global eT1* sorted_colptr = &tmp_mem[ col * A_n_rows ]; - - UWORD counts[2]; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = COOT_FN(coot_is_signed_,eT1)() ? (8 * sizeof(eT1) - 1) : (8 * sizeof(eT1)); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Since we are sorting bitwise, we should treat the data as unsigned integers to make bitwise operations easy. 
- __global uint_eT1* colptr = (__global uint_eT1*) unsorted_colptr; - - counts[0] = 0; // holds the count of points with bit value 0 - counts[1] = 0; // holds the count of points with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - for (UWORD i = 0; i < A_n_rows; ++i) - { - ++counts[((colptr[i] & mask) >> b)]; - } - - counts[0] = counts[1]; // now holds the offset to put the next value at - counts[1] = 0; - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD out_index = counts[((colptr[i] & mask) >> b)]++; - sorted_colptr[out_index] = val; - } - - // swap pointers (unsorted is now sorted) - __global eT1* tmp = unsorted_colptr; - unsorted_colptr = sorted_colptr; - sorted_colptr = tmp; - } - - // If the type is unsigned, we're now done---we don't have to handle a sign bit differently. - if (!COOT_FN(coot_is_signed_,eT1)()) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. - __global uint_eT1* colptr = (__global uint_eT1*) unsorted_colptr; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - // This is different for integral and floating point types. - if (COOT_FN(coot_is_fp_,eT1)()) - { - // Floating point implementation: - // For negative values, we have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. 
- counts[0] = 0; // now holds the offset to put the next positive value at - counts[1] = A_n_rows - 1; // now holds the offset to put the next negative value at (we move backwards) - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD bit_val = ((colptr[i] & mask) >> last_bit); - const UWORD out_index = counts[bit_val]; - const int offset = (bit_val == 1) ? -1 : 1; - counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_colptr[out_index] = val; - } - } - else - { - counts[0] = 0; - counts[1] = 0; - - for (UWORD i = 0; i < A_n_rows; ++i) - { - ++counts[((colptr[i] & mask) >> last_bit)]; - } - // counts[0] now holds the number of positive points; counts[1] holds the number of negative points - - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the negative values ahead of the positive values. - counts[1] = counts[0]; // now holds the offset to put the next negative value at - counts[0] = 0; // now holds the offset to put the next positive value at - - for (UWORD i = 0; i < A_n_rows; ++i) - { - const eT1 val = unsorted_colptr[i]; - const UWORD bit_val = ((colptr[i] & mask) >> last_bit); - const UWORD out_index = counts[bit_val]++; - sorted_colptr[out_index] = val; - } - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_desc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_desc.cl deleted file mode 100644 index c76e7bc..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_desc.cl +++ /dev/null @@ -1,279 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_desc)(__global eT1* A, - const UWORD A_offset, - __global eT1* tmp_mem, - const UWORD n_elem, - __local volatile UWORD* aux_mem) - { - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - UWORD local_counts[2]; - - __global eT1* unsorted_memptr = A + A_offset; - __global eT1* sorted_memptr = tmp_mem; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = COOT_FN(coot_is_signed_,eT1)() ? 8 * sizeof(eT1) - 1 : 8 * sizeof(eT1); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. - __global uint_eT1* memptr = (__global uint_eT1*) unsorted_memptr; - - local_counts[0] = 0; // holds the count of elements with bit value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> b]; - ++local_counts[(memptr[i + 1] & mask) >> b]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> b]; - } - // Step 2: aggregate the counts for all threads. 
- aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - barrier(CLK_LOCAL_MEM_FENCE); - - // At the end of this, `tid` should be assigned two sections of memory to put its 0 and 1 points in. - // aux_mem[tid] should hold the first place to put a 0 point - // aux_mem[tid + num_threads] should hold the first place to put a 1 point - // More specifically: - // aux_mem[tid] := sum_{i = 0}^{tid - 1} aux_mem[i] - // aux_mem[tid + num_threads] := sum_{i = 0}^{tid + num_threads - 1} aux_mem[i] - // which means that this is just a prefix-sum operation on the full length of aux_mem. - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Step 3: move points into the correct place. 
- local_counts[0] = aux_mem[tid + num_threads]; // contains the first place we should put a 0 point - local_counts[1] = aux_mem[tid ]; // contains the first place we should put a 1 point - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b)]++; - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index] = val; - } - - // Now swap pointers. - __global eT1* tmp = unsorted_memptr; - unsorted_memptr = sorted_memptr; - sorted_memptr = tmp; - - barrier(CLK_GLOBAL_MEM_FENCE); - } - - // If the type is integral, we're now done---we don't have to handle a sign bit differently. - if (!COOT_FN(coot_is_signed_,eT1)()) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. 
- __global uint_eT1* memptr = (__global uint_eT1*) unsorted_memptr; - local_counts[0] = 0; - local_counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> last_bit]; - ++local_counts[(memptr[i + 1] & mask) >> last_bit]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> last_bit]; - } - - // local_counts[0] now holds the number of positive points; local_counts[1] holds the number of negative points - // perform a prefix sum, as with the rest of the bits - aux_mem[tid ] = local_counts[0]; - aux_mem[tid + num_threads] = local_counts[1]; - barrier(CLK_LOCAL_MEM_FENCE); - - // Up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Step 3: move points into the correct place. - // This is different for integral and floating point types. - if (COOT_FN(coot_is_fp_,eT1)()) - { - // Floating point implementation: - // For negative values, we have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. 
- local_counts[0] = aux_mem[tid]; // contains the first place we should put a 0 point (we will move upwards) - local_counts[1] = n_elem - 1 - (aux_mem[num_threads + tid] - aux_mem[num_threads]); // contains the first place we should put a 1 point (we will move downwards) - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]; - const int offset1 = (bit_val1 == 1) ? -1 : 1; - local_counts[bit_val1] += offset1; // decrements for negative values, increments for positive values - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]; - const int offset2 = (bit_val2 == 1) ? -1 : 1; - local_counts[bit_val2] += offset2; // decrements for negative values, increments for positive values - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]; - const int offset = (bit_val == 1) ? -1 : 1; - local_counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_memptr[out_index] = val; - } - } - else - { - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the negative values ahead of the positive values. 
- local_counts[0] = aux_mem[tid]; - local_counts[1] = aux_mem[tid + num_threads]; - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]++; - sorted_memptr[out_index1] = val1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]++; - sorted_memptr[out_index2] = val2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]++; - sorted_memptr[out_index] = val; - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_asc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_asc.cl deleted file mode 100644 index db77609..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_asc.cl +++ /dev/null @@ -1,321 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_index_asc)(__global eT1* A, - const UWORD A_offset, - __global UWORD* A_index, - const UWORD A_index_offset, - __global eT1* tmp_mem, - __global UWORD* tmp_mem_index, - const UWORD n_elem, - __local volatile UWORD* aux_mem) - { - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - // Fill A_index with [0, 1, ..., n_elem - 1]. - UWORD i = start_elem; - while (i + 1 < end_elem) - { - A_index[A_index_offset + i] = i; - A_index[A_index_offset + i + 1] = i + 1; - i += 2; - } - if (i < end_elem) - { - A_index[A_index_offset + i] = i; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - UWORD local_counts[2]; - - __global eT1* unsorted_memptr = A + A_offset; - __global UWORD* unsorted_index_memptr = A_index + A_index_offset; - __global eT1* sorted_memptr = tmp_mem; - __global UWORD* sorted_index_memptr = tmp_mem_index; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = COOT_FN(coot_is_signed_,eT1)() ? 8 * sizeof(eT1) - 1 : 8 * sizeof(eT1); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. 
- __global uint_eT1* memptr = (__global uint_eT1*) unsorted_memptr; - - local_counts[0] = 0; // holds the count of elements with bit value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> b]; - ++local_counts[(memptr[i + 1] & mask) >> b]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> b]; - } - // Step 2: aggregate the counts for all threads. - aux_mem[tid ] = local_counts[0]; - aux_mem[tid + num_threads] = local_counts[1]; - barrier(CLK_LOCAL_MEM_FENCE); - - // At the end of this, `tid` should be assigned two sections of memory to put its 0 and 1 points in. - // aux_mem[tid] should hold the first place to put a 0 point - // aux_mem[tid + num_threads] should hold the first place to put a 1 point - // More specifically: - // aux_mem[tid] := sum_{i = 0}^{tid - 1} aux_mem[i] - // aux_mem[tid + num_threads] := sum_{i = 0}^{tid + num_threads - 1} aux_mem[i] - // which means that this is just a prefix-sum operation on the full length of aux_mem. - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Step 3: move points into the correct place. 
- local_counts[0] = aux_mem[tid ]; // contains the first place we should put a 0 point - local_counts[1] = aux_mem[tid + num_threads]; // contains the first place we should put a 1 point - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b)]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - - // Now swap pointers. - __global eT1* tmp = unsorted_memptr; - __global UWORD* tmp_index = unsorted_index_memptr; - unsorted_memptr = sorted_memptr; - unsorted_index_memptr = sorted_index_memptr; - sorted_memptr = tmp; - sorted_index_memptr = tmp_index; - - barrier(CLK_GLOBAL_MEM_FENCE); - } - - // If the type is integral, we're now done---we don't have to handle a sign bit differently. - if (!COOT_FN(coot_is_signed_,eT1)()) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. 
- __global uint_eT1* memptr = (__global uint_eT1*) unsorted_memptr; - local_counts[0] = 0; - local_counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> last_bit]; - ++local_counts[(memptr[i + 1] & mask) >> last_bit]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> last_bit]; - } - - // local_counts[0] now holds the number of positive points; local_counts[1] holds the number of negative points - // swap these and perform a prefix sum, as with the rest of the bits - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - barrier(CLK_LOCAL_MEM_FENCE); - - // Up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Step 3: move points into the correct place. - // This is different for integral and floating point types. - if (COOT_FN(coot_is_fp_,eT1)()) - { - // Floating point implementation: - // For negative values, we have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. 
- local_counts[0] = aux_mem[tid + num_threads]; // contains the first place we should put a 0 point (we will move upwards) - local_counts[1] = aux_mem[num_threads] - aux_mem[tid]; // contains the first place we should put a 1 point (we will move downwards) - local_counts[1] = (local_counts[1] == 0) ? 0 : local_counts[1] - 1; // avoid underflow - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]; - const int offset1 = (bit_val1 == 1) ? -1 : 1; - local_counts[bit_val1] += offset1; // decrements for negative values, increments for positive values - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]; - const int offset2 = (bit_val2 == 1) ? -1 : 1; - local_counts[bit_val2] += offset2; // decrements for negative values, increments for positive values - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]; - const int offset = (bit_val == 1) ? -1 : 1; - local_counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - } - else - { - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the negative values ahead of the positive values. 
- local_counts[0] = aux_mem[tid + num_threads]; - local_counts[1] = aux_mem[tid]; - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_desc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_desc.cl deleted file mode 100644 index a67aa7e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_desc.cl +++ /dev/null @@ -1,320 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_index_desc)(__global eT1* A, - const UWORD A_offset, - __global UWORD* A_index, - const UWORD A_index_offset, - __global eT1* tmp_mem, - __global UWORD* tmp_mem_index, - const UWORD n_elem, - __local volatile UWORD* aux_mem) - { - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - // Fill A_index with [0, 1, ..., n_elem - 1]. - UWORD i = start_elem; - while (i + 1 < end_elem) - { - A_index[A_index_offset + i] = i; - A_index[A_index_offset + i + 1] = i + 1; - i += 2; - } - if (i < end_elem) - { - A_index[A_index_offset + i] = i; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - UWORD local_counts[2]; - - __global eT1* unsorted_memptr = A + A_offset; - __global UWORD* unsorted_index_memptr = A_index + A_index_offset; - __global eT1* sorted_memptr = tmp_mem; - __global UWORD* sorted_index_memptr = tmp_mem_index; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = COOT_FN(coot_is_signed_,eT1)() ? 8 * sizeof(eT1) - 1 : 8 * sizeof(eT1); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. 
- __global uint_eT1* memptr = (__global uint_eT1*) unsorted_memptr; - - local_counts[0] = 0; // holds the count of elements with bit value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> b]; - ++local_counts[(memptr[i + 1] & mask) >> b]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> b]; - } - // Step 2: aggregate the counts for all threads. - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - barrier(CLK_LOCAL_MEM_FENCE); - - // At the end of this, `tid` should be assigned two sections of memory to put its 0 and 1 points in. - // aux_mem[tid] should hold the first place to put a 0 point - // aux_mem[tid + num_threads] should hold the first place to put a 1 point - // More specifically: - // aux_mem[tid] := sum_{i = 0}^{tid - 1} aux_mem[i] - // aux_mem[tid + num_threads] := sum_{i = 0}^{tid + num_threads - 1} aux_mem[i] - // which means that this is just a prefix-sum operation on the full length of aux_mem. - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Step 3: move points into the correct place. 
- local_counts[0] = aux_mem[tid + num_threads]; // contains the first place we should put a 0 point - local_counts[1] = aux_mem[tid ]; // contains the first place we should put a 1 point - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b)]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b)]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - - // Now swap pointers. - __global eT1* tmp = unsorted_memptr; - __global UWORD* tmp_index = unsorted_index_memptr; - unsorted_memptr = sorted_memptr; - unsorted_index_memptr = sorted_index_memptr; - sorted_memptr = tmp; - sorted_index_memptr = tmp_index; - - barrier(CLK_GLOBAL_MEM_FENCE); - } - - // If the type is integral, we're now done---we don't have to handle a sign bit differently. - if (!COOT_FN(coot_is_signed_,eT1)()) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. 
- __global uint_eT1* memptr = (__global uint_eT1*) unsorted_memptr; - local_counts[0] = 0; - local_counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(memptr[i ] & mask) >> last_bit]; - ++local_counts[(memptr[i + 1] & mask) >> last_bit]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(memptr[i] & mask) >> last_bit]; - } - - // local_counts[0] now holds the number of positive points; local_counts[1] holds the number of negative points - // perform a prefix sum, as with the rest of the bits - aux_mem[tid ] = local_counts[0]; - aux_mem[tid + num_threads] = local_counts[1]; - barrier(CLK_LOCAL_MEM_FENCE); - - // Up-sweep total sum into final element. - UWORD offset = 1; - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[2 * num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Step 3: move points into the correct place. - // This is different for integral and floating point types. - if (COOT_FN(coot_is_fp_,eT1)()) - { - // Floating point implementation: - // For negative values, we have things sorted in reverse order, so we need to reverse that in our final swap pass. - // That means that thread 0's negative points go into the last slots, and the last thread's negative points go into the first slots. 
- local_counts[0] = aux_mem[tid]; // contains the first place we should put a 0 point (we will move upwards) - local_counts[1] = n_elem - 1 - (aux_mem[num_threads + tid] - aux_mem[num_threads]); // contains the first place we should put a 1 point (we will move downwards) - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]; - const int offset1 = (bit_val1 == 1) ? -1 : 1; - local_counts[bit_val1] += offset1; // decrements for negative values, increments for positive values - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]; - const int offset2 = (bit_val2 == 1) ? -1 : 1; - local_counts[bit_val2] += offset2; // decrements for negative values, increments for positive values - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]; - const int offset = (bit_val == 1) ? -1 : 1; - local_counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - } - else - { - // Signed integral implementation: - // Here, we have values in the right order, we just need to put the negative values ahead of the positive values. 
- local_counts[0] = aux_mem[tid]; - local_counts[1] = aux_mem[tid + num_threads]; - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD bit_val1 = ((memptr[i] & mask) >> last_bit); - const UWORD out_index1 = local_counts[bit_val1]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD bit_val2 = ((memptr[i + 1] & mask) >> last_bit); - const UWORD out_index2 = local_counts[bit_val2]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD bit_val = ((memptr[i] & mask) >> last_bit); - const UWORD out_index = local_counts[bit_val]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_multi_wg_shuffle.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_multi_wg_shuffle.cl deleted file mode 100644 index c841442..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_index_multi_wg_shuffle.cl +++ /dev/null @@ -1,160 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_index_multi_wg_shuffle)(__global eT1* A, - const UWORD A_offset, - __global UWORD* A_index, - const UWORD A_index_offset, - __global eT1* out, - const UWORD out_offset, - __global UWORD* out_index, - const UWORD out_index_offset, - __global UWORD* counts, - const UWORD counts_offset, - const UWORD n_elem, - const UWORD sort_type, - const UWORD start_bit) - { - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - const UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - __global uint_eT1* uA = (__global uint_eT1*) A; - const uint_eT1 mask = (((uint_eT1) 3) << start_bit); - - int upper_bit_shift = 1; - UWORD local_offsets[4]; - if (sort_type == 0) - { - // for an ascending sort, the offsets are ordered for bit values 00/01/10/11 - local_offsets[0] = counts[counts_offset + tid ]; // first place we should put a 00 point - local_offsets[1] = counts[counts_offset + tid + num_threads]; // first place we should put a 01 point - local_offsets[2] = counts[counts_offset + tid + 2 * num_threads]; // first place we should put a 10 point - local_offsets[3] = counts[counts_offset + tid + 3 * num_threads]; // first place we should put a 11 point - } - else if (sort_type == 1) - { - // for a descending sort, the offsets are ordered for 
bit values 11/10/01/00 - local_offsets[0] = counts[counts_offset + tid + 3 * num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[counts_offset + tid + 2 * num_threads]; // first place we should put a 01 point - local_offsets[2] = counts[counts_offset + tid + num_threads]; // first place we should put a 10 point - local_offsets[3] = counts[counts_offset + tid ]; // first place we should put a 11 point - } - else if (sort_type == 2) - { - // for the last bits of a signed integer in an ascending sort, the offsets are ordered for bit values (10/11/00/01) - local_offsets[0] = counts[counts_offset + tid + 2 * num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[counts_offset + tid + 3 * num_threads]; // first place we should put a 01 point - local_offsets[2] = counts[counts_offset + tid ]; // first place we should put a 10 point - local_offsets[3] = counts[counts_offset + tid + num_threads]; // first place we should put a 11 point - } - else if (sort_type == 3) - { - // for the last bits of a signed integer in a descending sort, the offsets are ordered for bit values (01/00/11/10) - local_offsets[0] = counts[counts_offset + tid + num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[counts_offset + tid ]; // first place we should put a 01 point - local_offsets[2] = counts[counts_offset + tid + 3 * num_threads]; // first place we should put a 10 point - local_offsets[3] = counts[counts_offset + tid + 2 * num_threads]; // first place we should put a 11 point - } - else if (sort_type == 4) - { - // for the last bits of a floating-point number in an ascending sort, the offsets are ordered for bit values (11/10/00/01) - // and, the negative values are ordered in a descending order, so we have to reverse them - local_offsets[0] = counts[counts_offset + tid + 2 * num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[counts_offset + tid + 3 * num_threads]; // first 
place we should put a 01 point - local_offsets[2] = counts[counts_offset + 2 * num_threads] - counts[counts_offset + tid + num_threads]; // first place we should put a 10 point - local_offsets[3] = counts[counts_offset + num_threads] - counts[counts_offset + tid ]; // first place we should put a 11 point - - // avoid underflow - local_offsets[2] = (local_offsets[2] == 0) ? 0 : local_offsets[2] - 1; - local_offsets[3] = (local_offsets[3] == 0) ? 0 : local_offsets[3] - 1; - - upper_bit_shift = -1; // sort negative values backwards - } - else if (sort_type == 5) - { - // for the last bits of a floating-point number in a descending sort, the offsets are ordered for bit values (01/00/10/11) - // and, the negative values are ordered in a descending order, so we have to reverse them - local_offsets[0] = counts[counts_offset + tid + num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[counts_offset + tid ]; // first place we should put a 01 point - local_offsets[2] = counts[counts_offset + 3 * num_threads] - (counts[counts_offset + tid + 2 * num_threads] - counts[counts_offset + 2 * num_threads]); // first place we should put a 10 point - local_offsets[3] = n_elem - (counts[counts_offset + tid + 3 * num_threads] - counts[counts_offset + 3 * num_threads]); // first place we should put a 11 point - - // avoid underflow - local_offsets[2] = (local_offsets[2] == 0) ? 0 : local_offsets[2] - 1; - local_offsets[3] = (local_offsets[3] == 0) ? 
0 : local_offsets[3] - 1; - - upper_bit_shift = -1; // sort negative values backwards - } - else if (sort_type == 6) - { - // for the last bits of a floating-point number in a stable ascending sort, - // the offsets are ordered for bit values (11/10/00/01) - // but, we do not need to reverse any values - local_offsets[0] = counts[tid + 2 * num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[tid + 3 * num_threads]; // first place we should put a 01 point - local_offsets[2] = counts[tid + num_threads]; // first place we should put a 10 point - local_offsets[3] = counts[tid ]; // first place we should put a 11 point - } - else if (sort_type == 7) - { - // for the last bits of a floating-point number in a stable descending sort, - // the offsets are ordered for bit values (01/00/10/11) - // but, we do not need to reverse any values - local_offsets[0] = counts[tid + num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[tid ]; // first place we should put a 01 point - local_offsets[2] = counts[tid + 2 * num_threads]; // first place we should put a 10 point - local_offsets[3] = counts[tid + 3 * num_threads]; // first place we should put a 11 point - } - - // Move all points that this thread is responsible for into the correct place. - UWORD i = start_elem; - while (i + 1 < end_elem) - { - const uint_eT1 val1 = uA[A_offset + i ]; - const uint_eT1 val2 = uA[A_offset + i + 1]; - - const UWORD loc1 = ((val1 & mask) >> start_bit); - const UWORD loc2 = ((val2 & mask) >> start_bit); - - const UWORD out_index1 = local_offsets[loc1]; - local_offsets[loc1] += ((loc1 >= 2) ? upper_bit_shift : 1); - const UWORD out_index2 = local_offsets[loc2]; - local_offsets[loc2] += ((loc2 >= 2) ? 
upper_bit_shift : 1); - - out[out_offset + out_index1] = A[A_offset + i]; - out_index[out_index_offset + out_index1] = A_index[A_index_offset + i]; - - out[out_offset + out_index2] = A[A_offset + i + 1]; - out_index[out_offset + out_index2] = A_index[A_index_offset + i + 1]; - - i += 2; - } - if (i < end_elem) - { - const uint_eT1 val = uA[A_offset + i]; - const UWORD loc = ((val & mask) >> start_bit); - const UWORD out_index1 = local_offsets[loc]; - local_offsets[loc] += ((loc >= 2) ? upper_bit_shift : 1); - out[out_offset + out_index1] = A[A_offset + i]; - out_index[out_index_offset + out_index1] = A_index[A_index_offset + i]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_multi_wg_bit_count.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_multi_wg_bit_count.cl deleted file mode 100644 index a9cf02c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_multi_wg_bit_count.cl +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_multi_wg_bit_count)(__global eT1* A, - const UWORD A_offset, - __global UWORD* counts, - const UWORD counts_offset, - const UWORD n_elem, - const UWORD sort_type, - const UWORD start_bit) - { - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - const UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - const uint_eT1 mask = (((uint_eT1) 3) << start_bit); - - __global uint_eT1* uA = (__global uint_eT1*) A; // so that we can mask elements of A bitwise - - UWORD local_counts[4] = { 0, 0, 0, 0 }; - - // Count the number of elements with each bit value (00/01/10/11) that belong - // to this thread. - - UWORD i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[(uA[A_offset + i ] & mask) >> start_bit]; - ++local_counts[(uA[A_offset + i + 1] & mask) >> start_bit]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[(uA[A_offset + i] & mask) >> start_bit]; - } - - // Save results to the right place for later processing. - if (sort_type == 0) - { - counts[counts_offset + tid ] = local_counts[0]; - counts[counts_offset + tid + num_threads] = local_counts[1]; - counts[counts_offset + tid + 2 * num_threads] = local_counts[2]; - counts[counts_offset + tid + 3 * num_threads] = local_counts[3]; - } - else if (sort_type == 1) - { - // If sort_type == 1 (descending), we want to store the results in the bit - // order 11/10/01/00, instead of the order of local_counts (00/01/10/11). 
- counts[counts_offset + tid ] = local_counts[3]; - counts[counts_offset + tid + num_threads] = local_counts[2]; - counts[counts_offset + tid + 2 * num_threads] = local_counts[1]; - counts[counts_offset + tid + 3 * num_threads] = local_counts[0]; - } - else if (sort_type == 2) - { - // If sort_type == 2 (highest two bits of a signed integer, ascending), we - // want to store the results in the bit order 10/11/00/01 - counts[counts_offset + tid ] = local_counts[2]; - counts[counts_offset + tid + num_threads] = local_counts[3]; - counts[counts_offset + tid + 2 * num_threads] = local_counts[0]; - counts[counts_offset + tid + 3 * num_threads] = local_counts[1]; - } - else if (sort_type == 3) - { - // If sort_type == 3 (highest two bits of a signed integer, descending), we - // want to store the results in the bit order 01/00/11/10 - counts[counts_offset + tid ] = local_counts[1]; - counts[counts_offset + tid + num_threads] = local_counts[0]; - counts[counts_offset + tid + 2 * num_threads] = local_counts[3]; - counts[counts_offset + tid + 3 * num_threads] = local_counts[2]; - } - else if (sort_type == 4 || sort_type == 6) - { - // If sort_type == 4 or 6 (highest two bits of floating-point number, ascending), - // we want to store the results in the bit order 11/10/00/01 - counts[counts_offset + tid ] = local_counts[3]; - counts[counts_offset + tid + num_threads] = local_counts[2]; - counts[counts_offset + tid + 2 * num_threads] = local_counts[0]; - counts[counts_offset + tid + 3 * num_threads] = local_counts[1]; - } - else if (sort_type == 5 || sort_type == 7) - { - // If sort_type == 5 or 7 (highest two bits of floating-point number, - // descending), we want to store the results in the bit order 01/00/10/11 - counts[counts_offset + tid ] = local_counts[1]; - counts[counts_offset + tid + num_threads] = local_counts[0]; - counts[counts_offset + tid + 2 * num_threads] = local_counts[2]; - counts[counts_offset + tid + 3 * num_threads] = local_counts[3]; - } - } diff --git 
a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_multi_wg_shuffle.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_multi_wg_shuffle.cl deleted file mode 100644 index a211364..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_multi_wg_shuffle.cl +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_multi_wg_shuffle)(__global eT1* A, - const UWORD A_offset, - __global eT1* out, - const UWORD out_offset, - __global UWORD* counts, - const UWORD n_elem, - const UWORD sort_type, - const UWORD start_bit) - { - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - const UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - __global uint_eT1* uA = (__global uint_eT1*) A; - const uint_eT1 mask = (((uint_eT1) 3) << start_bit); - - int upper_bit_shift = 1; - UWORD local_offsets[4]; - if (sort_type == 0) - { - // for an ascending sort, the offsets are ordered for bit values 00/01/10/11 - local_offsets[0] = counts[tid ]; // first place we should put a 00 point - local_offsets[1] = counts[tid + 
num_threads]; // first place we should put a 01 point - local_offsets[2] = counts[tid + 2 * num_threads]; // first place we should put a 10 point - local_offsets[3] = counts[tid + 3 * num_threads]; // first place we should put a 11 point - } - else if (sort_type == 1) - { - // for a descending sort, the offsets are ordered for bit values 11/10/01/00 - local_offsets[0] = counts[tid + 3 * num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[tid + 2 * num_threads]; // first place we should put a 01 point - local_offsets[2] = counts[tid + num_threads]; // first place we should put a 10 point - local_offsets[3] = counts[tid ]; // first place we should put a 11 point - } - else if (sort_type == 2) - { - // for the last bits of a signed integer in an ascending sort, the offsets are ordered for bit values (10/11/00/01) - local_offsets[0] = counts[tid + 2 * num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[tid + 3 * num_threads]; // first place we should put a 01 point - local_offsets[2] = counts[tid ]; // first place we should put a 10 point - local_offsets[3] = counts[tid + num_threads]; // first place we should put a 11 point - } - else if (sort_type == 3) - { - // for the last bits of a signed integer in a descending sort, the offsets are ordered for bit values (01/00/11/10) - local_offsets[0] = counts[tid + num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[tid ]; // first place we should put a 01 point - local_offsets[2] = counts[tid + 3 * num_threads]; // first place we should put a 10 point - local_offsets[3] = counts[tid + 2 * num_threads]; // first place we should put a 11 point - } - else if (sort_type == 4) - { - // for the last bits of a floating-point number in an ascending sort, the offsets are ordered for bit values (11/10/00/01) - // and, the negative values are ordered in a descending order, so we have to reverse them - local_offsets[0] = counts[tid + 2 * 
num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[tid + 3 * num_threads]; // first place we should put a 01 point - local_offsets[2] = counts[2 * num_threads] - counts[tid + num_threads]; // first place we should put a 10 point - local_offsets[3] = counts[ num_threads] - counts[tid ]; // first place we should put a 11 point - - // avoid underflow - local_offsets[2] = (local_offsets[2] == 0) ? 0 : local_offsets[2] - 1; - local_offsets[3] = (local_offsets[3] == 0) ? 0 : local_offsets[3] - 1; - - upper_bit_shift = -1; // sort negative values backwards - } - else if (sort_type == 5) - { - // for the last bits of a floating-point number in a descending sort, the offsets are ordered for bit values (01/00/10/11) - // and, the negative values are ordered in a descending order, so we have to reverse them - local_offsets[0] = counts[tid + num_threads]; // first place we should put a 00 point - local_offsets[1] = counts[tid ]; // first place we should put a 01 point - local_offsets[2] = counts[3 * num_threads] - (counts[tid + 2 * num_threads] - counts[2 * num_threads]); // first place we should put a 10 point - local_offsets[3] = n_elem - (counts[tid + 3 * num_threads] - counts[3 * num_threads]); // first place we should put a 11 point - - // avoid underflow - local_offsets[2] = (local_offsets[2] == 0) ? 0 : local_offsets[2] - 1; - local_offsets[3] = (local_offsets[3] == 0) ? 0 : local_offsets[3] - 1; - - upper_bit_shift = -1; // sort negative values backwards - } - - // Move all points that this thread is responsible for into the correct place. - UWORD i = start_elem; - while (i + 1 < end_elem) - { - const uint_eT1 val1 = uA[A_offset + i ]; - const uint_eT1 val2 = uA[A_offset + i + 1]; - - const UWORD loc1 = ((val1 & mask) >> start_bit); - const UWORD loc2 = ((val2 & mask) >> start_bit); - - const UWORD out_index1 = local_offsets[loc1]; - local_offsets[loc1] += ((loc1 >= 2) ? 
upper_bit_shift : 1); - const UWORD out_index2 = local_offsets[loc2]; - local_offsets[loc2] += ((loc2 >= 2) ? upper_bit_shift : 1); - - out[out_offset + out_index1] = A[A_offset + i]; - out[out_offset + out_index2] = A[A_offset + i + 1]; - - i += 2; - } - if (i < end_elem) - { - const uint_eT1 val = uA[A_offset + i]; - const UWORD loc = ((val & mask) >> start_bit); - const UWORD out_index = local_offsets[loc]; - local_offsets[loc] += ((loc >= 2) ? upper_bit_shift : 1); - out[out_offset + out_index] = A[A_offset + i]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_rowwise_asc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_rowwise_asc.cl deleted file mode 100644 index 804aec0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_rowwise_asc.cl +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_rowwise_asc)(__global eT1* A, - const UWORD A_offset, - __global eT1* tmp_mem, - const UWORD A_n_rows, - const UWORD A_n_cols, - const UWORD A_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < A_n_rows) - { - __global eT1* unsorted_rowptr = &A[A_offset + row]; - __global eT1* sorted_rowptr = &tmp_mem[ row]; - - UWORD unsorted_n_rows = A_M_n_rows; - UWORD sorted_n_rows = A_n_rows; - - UWORD counts[2]; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = COOT_FN(coot_is_signed_,eT1)() ? (8 * sizeof(eT1) - 1) : (8 * sizeof(eT1)); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Since we are sorting bitwise, we should treat the data as unsigned integers to make bitwise operations easy. - __global uint_eT1* rowptr = (__global uint_eT1*) unsorted_rowptr; - - counts[0] = 0; // holds the count of points with bit value 0 - counts[1] = 0; // holds the count of points with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - for (UWORD i = 0; i < A_n_cols; ++i) - { - ++counts[(rowptr[i * unsorted_n_rows] & mask) >> b]; - } - - counts[1] = counts[0]; // now holds the offset to put the next value at - counts[0] = 0; - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD out_index = (counts[((rowptr[in_index] & mask) >> b)]++) * sorted_n_rows; - sorted_rowptr[out_index] = val; - } - - // swap pointers (unsorted is now sorted) - __global eT1* tmp = unsorted_rowptr; - unsorted_rowptr = sorted_rowptr; - sorted_rowptr = tmp; - - UWORD tmp2 = unsorted_n_rows; - unsorted_n_rows = sorted_n_rows; - sorted_n_rows = tmp2; - } - - // If the type is unsigned, we're now done---we don't have to handle a sign bit differently. - if (!COOT_FN(coot_is_signed_,eT1)()) - { - return; - } - - // Only signed types get here. 
- // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. - __global uint_eT1* rowptr = (__global uint_eT1*) unsorted_rowptr; - counts[0] = 0; - counts[1] = 0; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - for (UWORD i = 0; i < A_n_cols; ++i) - { - ++counts[(rowptr[i * unsorted_n_rows] & mask) >> last_bit]; - } - // counts[0] now holds the number of positive points; counts[1] holds the number of negative points - - if (COOT_FN(coot_is_fp_,eT1)()) - { - counts[0] = counts[1]; // now holds the offset to put the next positive value at - counts[1] = counts[0] - 1; // now holds the offset to put the next negative value at (we move backwards) - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD bit_val = ((rowptr[in_index] & mask) >> last_bit); - const UWORD out_index = counts[bit_val] * sorted_n_rows; - const int offset = (bit_val == 1) ? -1 : 1; - counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_rowptr[out_index] = val; - } - } - else - { - counts[0] = counts[1]; // now holds the offset to put the next positive value at - counts[1] = 0; // now holds the offset to put the next negative value at - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD bit_val = ((rowptr[in_index] & mask) >> last_bit); - const UWORD out_index = (counts[bit_val]++) * sorted_n_rows; - sorted_rowptr[out_index] = val; - } - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. 
- } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_rowwise_desc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_rowwise_desc.cl deleted file mode 100644 index 626409f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/radix_sort_rowwise_desc.cl +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,radix_sort_rowwise_desc)(__global eT1* A, - const UWORD A_offset, - __global eT1* tmp_mem, - const UWORD A_n_rows, - const UWORD A_n_cols, - const UWORD A_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < A_n_rows) - { - __global eT1* unsorted_rowptr = &A[A_offset + row]; - __global eT1* sorted_rowptr = &tmp_mem[ row]; - - UWORD unsorted_n_rows = A_M_n_rows; - UWORD sorted_n_rows = A_n_rows; - - UWORD counts[2]; - - // If the type is unsigned, all the work will be done the same way. - const UWORD max_bit = COOT_FN(coot_is_signed_,eT1)() ? (8 * sizeof(eT1) - 1) : (8 * sizeof(eT1)); - - for (UWORD b = 0; b < max_bit; ++b) - { - // Since we are sorting bitwise, we should treat the data as unsigned integers to make bitwise operations easy. 
- __global uint_eT1* rowptr = (__global uint_eT1*) unsorted_rowptr; - - counts[0] = 0; // holds the count of points with bit value 0 - counts[1] = 0; // holds the count of points with bit value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - for (UWORD i = 0; i < A_n_cols; ++i) - { - ++counts[(rowptr[i * unsorted_n_rows] & mask) >> b]; - } - - counts[0] = counts[1]; // now holds the offset to put the next value at - counts[1] = 0; - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD out_index = (counts[((rowptr[in_index] & mask) >> b)]++) * sorted_n_rows; - sorted_rowptr[out_index] = val; - } - - // swap pointers (unsorted is now sorted) - __global eT1* tmp = unsorted_rowptr; - unsorted_rowptr = sorted_rowptr; - sorted_rowptr = tmp; - - UWORD tmp2 = unsorted_n_rows; - unsorted_n_rows = sorted_n_rows; - sorted_n_rows = tmp2; - } - - // If the type is unsigned, we're now done---we don't have to handle a sign bit differently. - if (!COOT_FN(coot_is_signed_,eT1)()) - { - return; - } - - // Only signed types get here. - // In both cases, we have to put the 1-bit values before the 0-bit values. - // But, for floating point signed types, we need to reverse the order of the 1-bit points. - // So, we need a slightly different implementation for both cases. 
- __global uint_eT1* rowptr = (__global uint_eT1*) unsorted_rowptr; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 mask = (((uint_eT1) 1) << last_bit); - - if (COOT_FN(coot_is_fp_,eT1)()) - { - counts[0] = 0; // now holds the offset to put the next positive value at - counts[1] = A_n_cols - 1; // now holds the offset to put the next negative value at (we move backwards) - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD bit_val = ((rowptr[in_index] & mask) >> last_bit); - const UWORD out_index = counts[bit_val] * sorted_n_rows; - const int offset = (bit_val == 1) ? -1 : 1; - counts[bit_val] += offset; // decrements for negative values, increments for positive values - sorted_rowptr[out_index] = val; - } - } - else - { - counts[0] = 0; - counts[1] = 0; - - for (UWORD i = 0; i < A_n_cols; ++i) - { - ++counts[(rowptr[i * unsorted_n_rows] & mask) >> last_bit]; - } - // counts[0] now holds the number of positive points; counts[1] holds the number of negative points - - counts[1] = counts[0]; // now holds the offset to put the next positive value at - counts[0] = 0; // now holds the offset to put the next negative value at - - for (UWORD i = 0; i < A_n_cols; ++i) - { - const UWORD in_index = i * unsorted_n_rows; - const eT1 val = unsorted_rowptr[in_index]; - const UWORD bit_val = ((rowptr[in_index] & mask) >> last_bit); - const UWORD out_index = (counts[bit_val]++) * sorted_n_rows; - sorted_rowptr[out_index] = val; - } - } - } - - // Since there are an even number of bits in every data type (or... well... I am going to assume that!), the sorted result is now in A. 
- } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/regspace_desc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/regspace_desc.cl deleted file mode 100644 index b3da4e3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/regspace_desc.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,regspace_desc)(__global eT1* out_mem, - const UWORD out_mem_offset, - const UWORD mem_incr, - const eT1 start, - const eT1 end, - const eT1 step, - const UWORD num) - { - const UWORD idx = get_global_id(0); - if (idx < num) - { - out_mem[out_mem_offset + idx * mem_incr] = (eT1) (start - step * idx); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/reorder_cols.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/reorder_cols.cl deleted file mode 100644 index a375123..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/reorder_cols.cl +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,reorder_cols)(__global eT1* out_mem, - const UWORD out_mem_offset, - __global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_rows, - __global const UWORD* ordering, - const UWORD out_n_cols) - { - const UWORD out_col = get_global_id(0); - if (out_col < out_n_cols) - { - const UWORD in_col = ordering[out_col]; - - __global eT1* out_colptr = out_mem + out_mem_offset + (out_col * n_rows); - const __global eT1* in_colptr = in_mem + in_mem_offset + (in_col * n_rows); - - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for (UWORD i = 0; i < n_rows; ++i) - { - out_colptr[i] = in_colptr[i]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/rotate_180.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/rotate_180.cl deleted file mode 100644 index ca571e1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/rotate_180.cl +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rotate_180)(__global eT1* out, - const UWORD out_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD n_rows, - const UWORD n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if( (row < n_rows) && (col < n_cols) ) - { - const UWORD in_index = in_offset + col * n_rows + row; - // out(i, j) = in(n_rows - i - 1, n_cols - j - 1) - // or - // out(n_rows - i - 1, n_cols - j - 1) = in(i, j) - const UWORD out_row = n_rows - row - 1; - const UWORD out_col = n_cols - col - 1; - const UWORD out_index = out_offset + out_col * n_rows + out_row; - - out[out_index] = in[in_index]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_add_offset.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_add_offset.cl deleted file mode 100644 index 32b0b71..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_add_offset.cl +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// This kernel adds workgroup-specific offsets to blocks of local memory. 
-// Specifically, workgroup i, which has t threads, adds offsets[i] to -// the range mem[i * (2 * t)] to mem[(i + 1) * (2 * t) - 1] (inclusive). -__kernel -void -COOT_FN(PREFIX,shifted_prefix_sum_add_offset)(__global eT1* mem, - const UWORD global_mem_offset, - __global const eT1* offsets, - const UWORD n_elem, - __local volatile eT1* aux_mem) - { - const UWORD local_tid = get_local_id(0); - const UWORD local_size = get_local_size(0); // will be the same across all workgroups (by calling convention), and must be a power of 2 - const UWORD group_id = get_group_id(0); - - // This workgroup is responsible for mem[group_id * (2 * local_size)] to mem[(group_id + 1) * (2 * local_size) - 1]. - const UWORD group_offset = group_id * (2 * local_size); - const UWORD local_offset = 2 * local_tid; - const UWORD mem_offset = group_offset + local_offset; - - const eT1 offset = offsets[group_id]; - - const eT1 in_val1 = (mem_offset < n_elem) ? mem[global_mem_offset + mem_offset ] : (eT1) 0; - const eT1 in_val2 = (mem_offset + 1 < n_elem) ? mem[global_mem_offset + mem_offset + 1] : (eT1) 0; - - const eT1 out_val1 = in_val1 + offset; - const eT1 out_val2 = in_val2 + offset; - - // Copy results back to memory. - if (mem_offset + 1 < n_elem) - { - mem[global_mem_offset + mem_offset ] = out_val1; - mem[global_mem_offset + mem_offset + 1] = out_val2; - } - else if (mem_offset < n_elem) - { - mem[global_mem_offset + mem_offset ] = out_val1; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_small.cl deleted file mode 100644 index de2e5b7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_small.cl +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// This kernel performs shifted prefix-sum on `mem` assuming that (2 * local group size) <= n_elem. -// It's okay if n_elem is not a power of 2. -__kernel -void -COOT_FN(PREFIX,shifted_prefix_sum_small)(__global eT1* mem, - const UWORD global_mem_offset, - const UWORD n_elem, - __local volatile eT1* aux_mem) - { - const UWORD local_tid = get_local_id(0); - const UWORD local_size = get_local_size(0); // will be the same across all workgroups (by calling convention), and must be a power of 2 - const UWORD group_id = get_group_id(0); - - // Copy relevant memory to auxiliary memory. - // This workgroup is responsible for mem[group_id * (2 * local_size)] to mem[(group_id + 1) * (2 * local_size) - 1]. - const UWORD group_offset = group_id * (2 * local_size); - const UWORD mem_offset = group_offset + 2 * local_tid; - - aux_mem[mem_offset ] = (mem_offset < n_elem) ? mem[global_mem_offset + mem_offset ] : (eT1) 0; - aux_mem[mem_offset + 1] = (mem_offset + 1 < n_elem) ? mem[global_mem_offset + mem_offset + 1] : (eT1) 0; - - UWORD offset = 1; - for (UWORD s = local_size; s > 0; s >>= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - if (local_tid < s) - { - const UWORD ai = group_offset + offset * (2 * local_tid + 1) - 1; - const UWORD bi = group_offset + offset * (2 * local_tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - } - - // Prepare for down-sweep by setting the last element to 0. 
- if (local_tid == 0) - { - aux_mem[2 * local_size - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = 1; s <= local_size; s *= 2) - { - offset >>= 1; - if (local_tid < s) - { - const UWORD ai = group_offset + offset * (2 * local_tid + 1) - 1; - const UWORD bi = group_offset + offset * (2 * local_tid + 2) - 1; - eT1 tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Copy results back to memory. - if (mem_offset + 1 < n_elem) - { - mem[global_mem_offset + mem_offset ] = aux_mem[mem_offset ]; - mem[global_mem_offset + mem_offset + 1] = aux_mem[mem_offset + 1]; - } - else if (mem_offset < n_elem) - { - mem[global_mem_offset + mem_offset ] = aux_mem[mem_offset ]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_subgroups.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_subgroups.cl deleted file mode 100644 index 7ee6b1b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shifted_prefix_sum_subgroups.cl +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// This kernel performs the shifted prefix-sum on each individual workgroup. 
-// This is the same as just running a regular prefix-sum kernel, except that -// `out_mem[i]` will store the total sum of elements in workgroup `i`. -// After running this, to finish prefix-sum on the entire memory, offsets for -// each workgroup need to be added. -__kernel -void -COOT_FN(PREFIX,shifted_prefix_sum_subgroups)(__global eT1* mem, - const UWORD global_mem_offset, - __global eT1* out_mem, - const UWORD n_elem, - __local volatile eT1* aux_mem) - { - const UWORD local_tid = get_local_id(0); - const UWORD local_size = get_local_size(0); // will be the same across all workgroups (by calling convention), and must be a power of 2 - const UWORD group_id = get_group_id(0); - - // Copy relevant memory to auxiliary memory. - // This workgroup is responsible for mem[group_id * (2 * local_size)] to mem[(group_id + 1) * (2 * local_size) - 1]. - const UWORD group_offset = group_id * (2 * local_size); - const UWORD local_offset = 2 * local_tid; - const UWORD mem_offset = group_offset + local_offset; - - aux_mem[local_offset ] = (mem_offset < n_elem) ? mem[global_mem_offset + mem_offset ] : (eT1) 0; - aux_mem[local_offset + 1] = (mem_offset + 1 < n_elem) ? mem[global_mem_offset + mem_offset + 1] : (eT1) 0; - - UWORD offset = 1; - for (UWORD s = local_size; s > 0; s >>= 1) - { - if (local_tid < s) - { - const UWORD ai = offset * (local_offset + 1) - 1; - const UWORD bi = offset * (local_offset + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (mem_offset + 1 < n_elem) - { - mem[global_mem_offset + mem_offset ] = aux_mem[local_offset ]; - mem[global_mem_offset + mem_offset + 1] = aux_mem[local_offset + 1]; - } - else if (mem_offset < n_elem) - { - mem[global_mem_offset + mem_offset ] = aux_mem[local_offset ]; - } - - if (local_tid == 0) - { - // Write the sum of the subarray to the output memory. - out_mem[group_id] = aux_mem[2 * local_size - 1]; - // Prepare for the downsweep. 
- aux_mem[2 * local_size - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - offset = local_size; - for (UWORD s = 1; s <= local_size; s *= 2) - { - if (local_tid < s) - { - const UWORD ai = offset * (local_offset + 1) - 1; - const UWORD bi = offset * (local_offset + 2) - 1; - eT1 tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - offset >>= 1; - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Copy results back to memory. - // The results here are the prefix-summed results for each individual - // workgroup. - if (mem_offset + 1 < n_elem) - { - mem[global_mem_offset + mem_offset ] = aux_mem[local_offset ]; - mem[global_mem_offset + mem_offset + 1] = aux_mem[local_offset + 1]; - } - else if (mem_offset < n_elem) - { - mem[global_mem_offset + mem_offset ] = aux_mem[local_offset ]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shuffle.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shuffle.cl deleted file mode 100644 index 982298f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shuffle.cl +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,shuffle)(__global eT1* out, - const UWORD out_offset, - const UWORD out_incr, /* how many eT1s to advance to get to the start of the next element to shuffle */ - const UWORD out_elem_stride, /* how many eT1s between each eT1 in each element */ - __global const eT1* in, - const UWORD in_offset, - const UWORD in_incr, - const UWORD in_elem_stride, - const UWORD n_elem, - const UWORD elems_per_elem, - const UWORD n_elem_pow2, - __global const UWORD* philox_key, - const UWORD num_bits, - __local volatile UWORD* aux_mem) - { - const UWORD tid = get_global_id(0); - - // Get our bijective shuffle location. - const UWORD in_loc = var_philox(tid, philox_key, num_bits); - - // Fill aux_mem with the indicator of whether we are out of bounds. - // Then, we'll prefix-sum it. This will tell us where to put our result. - aux_mem[tid] = (in_loc < n_elem); - - // Now, prefix-sum the auxiliary memory. - // This allows us to do the shuffle-compaction step. - UWORD offset = 1; - for (UWORD s = n_elem_pow2 / 2; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[n_elem_pow2 - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = 1; s <= n_elem_pow2 / 2; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // With the prefix sum complete, we shuffle our result into position aux_mem[tid], but only if we are a thread with a "valid" output. 
- if (in_loc < n_elem) - { - const UWORD in_addr_offset = in_offset + in_loc * in_incr; - const UWORD out_addr_offset = out_offset + aux_mem[tid] * out_incr; - - for (UWORD i = 0; i < elems_per_elem; ++i) - { - out[out_addr_offset + (i * out_elem_stride)] = in[in_addr_offset + (i * in_elem_stride)]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shuffle_large.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shuffle_large.cl deleted file mode 100644 index 4d77236..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/shuffle_large.cl +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,shuffle_large)(__global eT1* out, - const UWORD out_offset, - const UWORD out_incr, /* how many eT1s to advance to get to the start of the next element to shuffle */ - const UWORD out_elem_stride, /* how many eT1s between each eT1 in each element */ - __global const eT1* in, - const UWORD in_offset, - const UWORD in_incr, - const UWORD in_elem_stride, - __global const UWORD* block_offsets, - const UWORD n_elem, - const UWORD elems_per_elem, - const UWORD n_elem_pow2, - __global const UWORD* philox_key, - const UWORD num_bits, - __local volatile UWORD* aux_mem) - { - const UWORD tid = get_global_id(0); - const UWORD local_tid = get_local_id(0); - const UWORD local_size = get_local_size(0); - - // Recompute our bijective shuffle location. - const UWORD in_loc = var_philox(tid, philox_key, num_bits); - - // We actually have to perform the up-sweep a second time, since we did not save the memory the first time. - aux_mem[local_tid] = (in_loc < n_elem); - barrier(CLK_LOCAL_MEM_FENCE); - - // Now, prefix-sum the auxiliary memory for this block. - // This allows us to do the shuffle-compaction step. 
- UWORD offset = 1; - for (UWORD s = local_size / 2; s > 0; s >>= 1) - { - if (local_tid < s) - { - const UWORD ai = offset * (2 * local_tid + 1) - 1; - const UWORD bi = offset * (2 * local_tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (local_tid == 0) - { - aux_mem[local_size - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = 1; s <= local_size / 2; s *= 2) - { - offset >>= 1; - if (local_tid < s) - { - const UWORD ai = offset * (2 * local_tid + 1) - 1; - const UWORD bi = offset * (2 * local_tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // With the prefix sum complete, we shuffle our result into position aux_mem[tid], but only if we are a thread with a "valid" output. - if (in_loc < n_elem) - { - const UWORD in_addr_offset = in_offset + in_loc * in_incr; - const UWORD out_addr_offset = out_offset + (aux_mem[local_tid] + block_offsets[get_group_id(0)]) * out_incr; - - for (UWORD i = 0; i < elems_per_elem; ++i) - { - out[out_addr_offset + (i * out_elem_stride)] = in[in_addr_offset + (i * in_elem_stride)]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/stable_radix_sort_index_asc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/stable_radix_sort_index_asc.cl deleted file mode 100644 index 0f56d64..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/stable_radix_sort_index_asc.cl +++ /dev/null @@ -1,260 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,stable_radix_sort_index_asc)(__global eT1* A, - const UWORD A_offset, - __global UWORD* A_index, - const UWORD A_index_offset, - __global eT1* tmp_mem, - __global UWORD* tmp_mem_index, - const UWORD n_elem, - __local volatile UWORD* aux_mem) - { - // The stable sort differs from the rest of our radix sorts in that we must avoid ever "reversing" point orders. - // We do this by adapting the regular radix sort to also consider the highest bit (the sign bit for signed types). - // This alleviates the need to ever unpack points in a reverse order, and so the sort is stable. - - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - // Fill tmp_mem_index with [0, 1, ..., n_elem - 1]. 
- UWORD i = start_elem; - while (i + 1 < end_elem) - { - tmp_mem_index[i] = i; - tmp_mem_index[i + 1] = i + 1; - i += 2; - } - if (i < end_elem) - { - tmp_mem_index[i] = i; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - UWORD local_counts[4]; - - __global eT1* unsorted_memptr = A + A_offset; - __global UWORD* unsorted_index_memptr = tmp_mem_index; - __global eT1* sorted_memptr = tmp_mem; - __global UWORD* sorted_index_memptr = A_index + A_index_offset; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 sign_mask = (((uint_eT1) 1) << last_bit); - - for (UWORD b = 0; b < 8 * sizeof(eT1) - 1; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. - __global uint_eT1* memptr = (__global uint_eT1*) unsorted_memptr; - - local_counts[0] = 0; // holds the count of elements with bit value 0 and sign value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 and sign value 0 - local_counts[2] = 0; // holds the count of elements with bit value 0 and sign value 1 - local_counts[3] = 0; // holds the count of elements with bit value 1 and sign value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[((memptr[i ] & mask) >> b) + ((memptr[i ] & sign_mask) >> (last_bit - 1))]; - ++local_counts[((memptr[i + 1] & mask) >> b) + ((memptr[i + 1] & sign_mask) >> (last_bit - 1))]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]; - } - - // Step 2: aggregate the counts for all threads. - // There are a couple cases here to get things in an ascending order: - // * Floating point number: [11, 10, 00, 01] - // * Unsigned integer: [00, 01, 10, 11] - // * Signed integer: [10, 11, 00, 01] - // Note that the notation "11" indicates, e.g., a point whose sign is 1 and bit value in bit b is 1. 
- // For unsigned integers, we treat the top bit as a "sign" bit even though it's not---but we choose an ordering that's still correct. - if (!COOT_FN(coot_is_signed_,eT1)()) - { - // Unsigned integer (00, 01, 10, 11) - aux_mem[tid ] = local_counts[0]; - aux_mem[tid + num_threads] = local_counts[1]; - aux_mem[tid + 2 * num_threads] = local_counts[2]; - aux_mem[tid + 3 * num_threads] = local_counts[3]; - } - else if (COOT_FN(coot_is_fp_,eT1)()) - { - // Floating-point (11, 10, 00, 01) - aux_mem[tid ] = local_counts[3]; - aux_mem[tid + num_threads] = local_counts[2]; - aux_mem[tid + 2 * num_threads] = local_counts[0]; - aux_mem[tid + 3 * num_threads] = local_counts[1]; - } - else - { - // Signed integer (10, 11, 00, 01) - aux_mem[tid ] = local_counts[2]; - aux_mem[tid + num_threads] = local_counts[3]; - aux_mem[tid + 2 * num_threads] = local_counts[0]; - aux_mem[tid + 3 * num_threads] = local_counts[1]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // Now, we must assign four sections of memory for `tid` to put its points in. - // We do this by a prefix-sum operation across all threads. - // At the end of this operation (at the beginning of Step 3): - // - // local_counts[0] indicates the first place to put a sign-0 bit-value-0 point - // local_counts[1] indicates the first place to put a sign-0 bit-value-1 point - // local_counts[2] indicates the first place to put a sign-1 bit-value-0 point - // local_counts[3] indicates the first place to put a sign-1 bit-value-1 point - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - - // Since we have auxiliary memory size of 4x the number of threads, we need to add an extra iteration where each thread handles two values. 
- const UWORD ai1 = offset * (2 * tid + 1) - 1; - const UWORD bi1 = offset * (2 * tid + 2) - 1; - aux_mem[bi1] += aux_mem[ai1]; - const UWORD ai2 = offset * (2 * (tid + num_threads) + 1) - 1; - const UWORD bi2 = offset * (2 * (tid + num_threads) + 2) - 1; - aux_mem[bi2] += aux_mem[ai2]; - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[4 * num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Since we have auxiliary memory size of 4x the number of threads, we need to add an extra iteration where each thread handles two values. - offset >>= 1; - const UWORD ai3 = offset * (2 * tid + 1) - 1; - const UWORD bi3 = offset * (2 * tid + 2) - 1; - UWORD tmp3 = aux_mem[ai3]; - aux_mem[ai3] = aux_mem[bi3]; - aux_mem[bi3] += tmp3; - - const UWORD ai4 = offset * (2 * (tid + num_threads) + 1) - 1; - const UWORD bi4 = offset * (2 * (tid + num_threads) + 2) - 1; - UWORD tmp4 = aux_mem[ai4]; - aux_mem[ai4] = aux_mem[bi4]; - aux_mem[bi4] += tmp4; - barrier(CLK_LOCAL_MEM_FENCE); - - // Step 3: move points into the correct place. 
- // There are a couple cases here to get things in an ascending order: - // * Floating point number: [11, 10, 00, 01] - // * Unsigned integer: [00, 01, 10, 11] - // * Signed integer: [10, 11, 00, 01] - if (!COOT_FN(coot_is_signed_,eT1)()) - { - // Unsigned integer (00, 01, 10, 11) - local_counts[0] = aux_mem[tid ]; - local_counts[1] = aux_mem[tid + num_threads]; - local_counts[2] = aux_mem[tid + 2 * num_threads]; - local_counts[3] = aux_mem[tid + 3 * num_threads]; - } - else if (COOT_FN(coot_is_fp_,eT1)()) - { - // Floating-point (11, 10, 00, 01) - local_counts[0] = aux_mem[tid + 2 * num_threads]; - local_counts[1] = aux_mem[tid + 3 * num_threads]; - local_counts[2] = aux_mem[tid + num_threads]; - local_counts[3] = aux_mem[tid ]; - } - else - { - // Signed integer (10, 11, 00, 01) - local_counts[0] = aux_mem[tid + 2 * num_threads]; - local_counts[1] = aux_mem[tid + 3 * num_threads]; - local_counts[2] = aux_mem[tid ]; - local_counts[3] = aux_mem[tid + num_threads]; - } - - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b) + ((memptr[i + 1] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - - // Now swap pointers. 
- __global eT1* tmp = unsorted_memptr; - __global UWORD* tmp_index = unsorted_index_memptr; - unsorted_memptr = sorted_memptr; - unsorted_index_memptr = sorted_index_memptr; - sorted_memptr = tmp; - sorted_index_memptr = tmp_index; - - barrier(CLK_GLOBAL_MEM_FENCE); - } - - // Since we did an odd number of iterations, the result is stored in A_index. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/stable_radix_sort_index_desc.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/stable_radix_sort_index_desc.cl deleted file mode 100644 index 713574c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/stable_radix_sort_index_desc.cl +++ /dev/null @@ -1,260 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,stable_radix_sort_index_desc)(__global eT1* A, - const UWORD A_offset, - __global UWORD* A_index, - const UWORD A_index_offset, - __global eT1* tmp_mem, - __global UWORD* tmp_mem_index, - const UWORD n_elem, - __local volatile UWORD* aux_mem) - { - // The stable sort differs from the rest of our radix sorts in that we must avoid ever "reversing" point orders. - // We do this by adapting the regular radix sort to also consider the highest bit (the sign bit for signed types). 
- // This alleviates the need to ever unpack points in a reverse order, and so the sort is stable. - - const UWORD tid = get_global_id(0); - - const UWORD num_threads = get_global_size(0); - const UWORD elems_per_thread = (n_elem + num_threads - 1) / num_threads; // this is ceil(n_elem / num_threads) - const UWORD start_elem = tid * elems_per_thread; - UWORD end_elem = min((tid + 1) * elems_per_thread, n_elem); - - // Fill tmp_mem_index with [0, 1, ..., n_elem - 1]. - UWORD i = start_elem; - while (i + 1 < end_elem) - { - tmp_mem_index[i] = i; - tmp_mem_index[i + 1] = i + 1; - i += 2; - } - if (i < end_elem) - { - tmp_mem_index[i] = i; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - UWORD local_counts[4]; - - __global eT1* unsorted_memptr = A + A_offset; - __global UWORD* unsorted_index_memptr = tmp_mem_index; - __global eT1* sorted_memptr = tmp_mem; - __global UWORD* sorted_index_memptr = A_index + A_index_offset; - - const UWORD last_bit = 8 * sizeof(eT1) - 1; - uint_eT1 sign_mask = (((uint_eT1) 1) << last_bit); - - for (UWORD b = 0; b < 8 * sizeof(eT1) - 1; ++b) - { - // Step 1: count the number of elements with each bit value that belong to this thread. 
- __global uint_eT1* memptr = (__global uint_eT1*) unsorted_memptr; - - local_counts[0] = 0; // holds the count of elements with bit value 0 - local_counts[1] = 0; // holds the count of elements with bit value 1 - local_counts[2] = 0; // holds the count of elements with bit value 0 and sign value 1 - local_counts[3] = 0; // holds the count of elements with bit value 1 and sign value 1 - - uint_eT1 mask = (((uint_eT1) 1) << b); - - i = start_elem; - while (i + 1 < end_elem) - { - ++local_counts[((memptr[i ] & mask) >> b) + ((memptr[i ] & sign_mask) >> (last_bit - 1))]; - ++local_counts[((memptr[i + 1] & mask) >> b) + ((memptr[i + 1] & sign_mask) >> (last_bit - 1))]; - i += 2; - } - if (i < end_elem) - { - ++local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]; - } - - // Step 2: aggregate the counts for all threads. - // There are a couple cases here to get things in a descending order: - // * Floating point number: [01, 00, 10, 11] - // * Unsigned integer: [11, 10, 01, 00] - // * Signed integer: [01, 00, 11, 10] - // Note that the notation "11" indicates, e.g., a point whose sign is 1 and bit value in bit b is 1. - // For unsigned integers, we treat the top bit as a "sign" bit even though it's not---but we choose an ordering that's still correct. 
- if (!COOT_FN(coot_is_signed_,eT1)()) - { - // Unsigned integer (11, 10, 01, 00) - aux_mem[tid ] = local_counts[3]; - aux_mem[tid + num_threads] = local_counts[2]; - aux_mem[tid + 2 * num_threads] = local_counts[1]; - aux_mem[tid + 3 * num_threads] = local_counts[0]; - } - else if (COOT_FN(coot_is_fp_,eT1)()) - { - // Floating-point (01, 00, 10, 11) - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - aux_mem[tid + 2 * num_threads] = local_counts[2]; - aux_mem[tid + 3 * num_threads] = local_counts[3]; - } - else - { - // Signed integer (01, 00, 11, 10) - aux_mem[tid ] = local_counts[1]; - aux_mem[tid + num_threads] = local_counts[0]; - aux_mem[tid + 2 * num_threads] = local_counts[3]; - aux_mem[tid + 3 * num_threads] = local_counts[2]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // Now, we must assign four sections of memory for `tid` to put its points in. - // We do this by a prefix-sum operation across all threads. - // At the end of this operation (at the beginning of Step 3): - // - // local_counts[0] indicates the first place to put a sign-0 bit-value-0 point - // local_counts[1] indicates the first place to put a sign-0 bit-value-1 point - // local_counts[2] indicates the first place to put a sign-1 bit-value-0 point - // local_counts[3] indicates the first place to put a sign-1 bit-value-1 point - - // Step 2a: up-sweep total sum into final element. - UWORD offset = 1; - - // Since we have auxiliary memory size of 4x the number of threads, we need to add an extra iteration where each thread handles two values. 
- const UWORD ai1 = offset * (2 * tid + 1) - 1; - const UWORD bi1 = offset * (2 * tid + 2) - 1; - aux_mem[bi1] += aux_mem[ai1]; - const UWORD ai2 = offset * (2 * (tid + num_threads) + 1) - 1; - const UWORD bi2 = offset * (2 * (tid + num_threads) + 2) - 1; - aux_mem[bi2] += aux_mem[ai2]; - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = num_threads; s > 0; s >>= 1) - { - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid == 0) - { - aux_mem[4 * num_threads - 1] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Step 2b: down-sweep to build prefix sum. - for (UWORD s = 1; s <= num_threads; s *= 2) - { - offset >>= 1; - if (tid < s) - { - const UWORD ai = offset * (2 * tid + 1) - 1; - const UWORD bi = offset * (2 * tid + 2) - 1; - UWORD tmp = aux_mem[ai]; - aux_mem[ai] = aux_mem[bi]; - aux_mem[bi] += tmp; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Since we have auxiliary memory size of 4x the number of threads, we need to add an extra iteration where each thread handles two values. - offset >>= 1; - const UWORD ai3 = offset * (2 * tid + 1) - 1; - const UWORD bi3 = offset * (2 * tid + 2) - 1; - UWORD tmp3 = aux_mem[ai3]; - aux_mem[ai3] = aux_mem[bi3]; - aux_mem[bi3] += tmp3; - - const UWORD ai4 = offset * (2 * (tid + num_threads) + 1) - 1; - const UWORD bi4 = offset * (2 * (tid + num_threads) + 2) - 1; - UWORD tmp4 = aux_mem[ai4]; - aux_mem[ai4] = aux_mem[bi4]; - aux_mem[bi4] += tmp4; - barrier(CLK_LOCAL_MEM_FENCE); - - // Step 3: move points into the correct place. 
- // There are a couple cases here to get things in a descending order: - // * Floating point number: [01, 00, 10, 11] - // * Unsigned integer: [11, 10, 01, 00] - // * Signed integer: [01, 00, 11, 10] - if (!COOT_FN(coot_is_signed_,eT1)()) - { - // Unsigned integer (11, 10, 01, 00) - local_counts[0] = aux_mem[tid + 3 * num_threads]; - local_counts[1] = aux_mem[tid + 2 * num_threads]; - local_counts[2] = aux_mem[tid + num_threads]; - local_counts[3] = aux_mem[tid ]; - } - else if (COOT_FN(coot_is_fp_,eT1)()) - { - // Floating-point (01, 00, 10, 11) - local_counts[0] = aux_mem[tid + num_threads]; - local_counts[1] = aux_mem[tid ]; - local_counts[2] = aux_mem[tid + 2 * num_threads]; - local_counts[3] = aux_mem[tid + 3 * num_threads]; - } - else - { - // Signed integer (01, 00, 11, 10) - local_counts[0] = aux_mem[tid + num_threads]; - local_counts[1] = aux_mem[tid ]; - local_counts[2] = aux_mem[tid + 3 * num_threads]; - local_counts[3] = aux_mem[tid + 2 * num_threads]; - } - - i = start_elem; - while (i + 1 < end_elem) - { - const eT1 val1 = unsorted_memptr[i]; - const UWORD index1 = unsorted_index_memptr[i]; - const UWORD out_index1 = local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index1] = val1; - sorted_index_memptr[out_index1] = index1; - - const eT1 val2 = unsorted_memptr[i + 1]; - const UWORD index2 = unsorted_index_memptr[i + 1]; - const UWORD out_index2 = local_counts[((memptr[i + 1] & mask) >> b) + ((memptr[i + 1] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index2] = val2; - sorted_index_memptr[out_index2] = index2; - - i += 2; - } - if (i < end_elem) - { - const eT1 val = unsorted_memptr[i]; - const UWORD index = unsorted_index_memptr[i]; - const UWORD out_index = local_counts[((memptr[i] & mask) >> b) + ((memptr[i] & sign_mask) >> (last_bit - 1))]++; - sorted_memptr[out_index] = val; - sorted_index_memptr[out_index] = index; - } - - // Now swap pointers. 
- __global eT1* tmp = unsorted_memptr; - __global UWORD* tmp_index = unsorted_index_memptr; - unsorted_memptr = sorted_memptr; - unsorted_index_memptr = sorted_index_memptr; - sorted_memptr = tmp; - sorted_index_memptr = tmp_index; - - barrier(CLK_GLOBAL_MEM_FENCE); - } - - // Since we did an odd number of iterations, the result is stored in A_index. - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/submat_var.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/submat_var.cl deleted file mode 100644 index 2ad0fa8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/submat_var.cl +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,submat_var)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem, - const eT1 mean_val, - const UWORD in_n_rows, - const UWORD start_row, - const UWORD start_col, - const UWORD sub_n_rows, - const UWORD sub_n_cols) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const UWORD col1 = (i ) / sub_n_rows; - const UWORD col2 = (i + get_local_size(0)) / sub_n_rows; - const UWORD row1 = (i ) % sub_n_rows; - const UWORD row2 = (i + get_local_size(0)) % sub_n_rows; - const UWORD index1 = (col1 + start_col) * in_n_rows + (row1 + start_row); - const UWORD index2 = (col2 + start_col) * in_n_rows + (row2 + start_row); - - const eT1 val1 = (in_mem[in_mem_offset + index1] - mean_val); - const eT1 val2 = (in_mem[in_mem_offset + index2] - mean_val); - aux_mem[tid] += (val1 * val1) + (val2 * val2); - i += grid_size; - } - if (i < n_elem) - { - const UWORD col = i / sub_n_rows; - const UWORD row = i % sub_n_rows; - const UWORD index = (col + start_col) * in_n_rows + (row + start_row); - - const eT1 val = (in_mem[in_mem_offset + index] - mean_val); - aux_mem[tid] += (val * val); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,accu_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/submat_var_small.cl 
b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/submat_var_small.cl deleted file mode 100644 index d450bf5..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/submat_var_small.cl +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,submat_var_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem, - const eT1 mean_val, - const UWORD in_n_rows, - const UWORD start_row, - const UWORD start_col, - const UWORD sub_n_rows, - const UWORD sub_n_cols) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const UWORD col1 = (i ) / sub_n_rows; - const UWORD col2 = (i + get_local_size(0)) / sub_n_rows; - const UWORD row1 = (i ) % sub_n_rows; - const UWORD row2 = (i + get_local_size(0)) % sub_n_rows; - const UWORD index1 = (col1 + start_col) * in_n_rows + (row1 + start_row); - const UWORD index2 = (col2 + start_col) * in_n_rows + (row2 + start_row); - - const eT1 val1 = (in_mem[in_mem_offset + index1] - mean_val); - const eT1 val2 = (in_mem[in_mem_offset + 
index2] - mean_val); - aux_mem[tid] += (val1 * val1) + (val2 * val2); - i += grid_size; - } - if (i < n_elem) - { - const UWORD col = i / sub_n_rows; - const UWORD row = i % sub_n_rows; - const UWORD index = (col + start_col) * in_n_rows + (row + start_row); - - const eT1 val = (in_mem[in_mem_offset + index] - mean_val); - aux_mem[tid] += (val * val); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/symmatl_inplace.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/symmatl_inplace.cl deleted file mode 100644 index bd49f1e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/symmatl_inplace.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,symmatl_inplace)(__global eT1* out, - const UWORD out_offset, - const UWORD size) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < size && col < size && row > col) - { - const eT1 val = out[out_offset + row + size * col]; - - // only need to copy to the upper triangle for the in-place version - out[out_offset + col + size * row] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/symmatu_inplace.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/symmatu_inplace.cl deleted file mode 100644 index 35e6566..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/symmatu_inplace.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,symmatu_inplace)(__global eT1* out, - const UWORD out_offset, - const UWORD size) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < size && col < size && col > row) - { - const eT1 val = out[out_offset + row + size * col]; - - // only need to copy to the lower triangle for the in-place version - out[out_offset + col + size * row] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/trace.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/trace.cl deleted file mode 100644 index e1dcf4a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/trace.cl +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - -__kernel -void -COOT_FN(PREFIX,trace)(__global eT1* out, - __global const eT1* A, - const UWORD A_offset, - const UWORD n_rows, - const UWORD N) - { - const UWORD id = get_global_id(0); - if(id == 0) - { - eT1 acc = (eT1)(0); - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=0; i SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,accu_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_colwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_colwise.cl deleted file mode 100644 index 3e03607..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_colwise.cl +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,var_colwise)(__global eT1* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - __global const eT1* src_means, - const UWORD src_means_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD norm_correction, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows, - const UWORD src_means_mem_incr) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - const __global eT1* colptr = &(src[src_offset + col * src_M_n_rows]); - const eT1 mean_val = src_means[src_means_offset + col * src_means_mem_incr]; - eT1 acc = (eT1) (0); - for (UWORD i = 0; i < n_rows; ++i) - { - eT1 val = (colptr[i] - mean_val); - acc += (val * val); - } - - dest[dest_offset + col * dest_mem_incr] = (acc / (eT1) (n_rows - norm_correction)); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_rowwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_rowwise.cl deleted file mode 100644 index c79b49f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_rowwise.cl +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,var_rowwise)(__global eT1* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - __global const eT1* src_means, - const UWORD src_means_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD norm_correction, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows, - const UWORD src_means_mem_incr) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT1 acc = (eT1)(0); - const eT1 mean_val = src_means[src_means_offset + row * src_means_mem_incr]; - for (UWORD i = 0; i < n_cols; ++i) - { - const eT1 val = (src[src_offset + (i * src_M_n_rows) + row] - mean_val); - acc += (val * val); - } - - dest[dest_offset + row * dest_mem_incr] = (acc / (eT1) (n_cols - norm_correction)); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_small.cl deleted file mode 100644 index 55710ee..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway/var_small.cl +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,var_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem, - const eT1 mean_val) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 val1 = (in_mem[in_mem_offset + i] - mean_val); - const eT1 val2 = (in_mem[in_mem_offset + i + get_local_size(0)] - mean_val); - aux_mem[tid] += (val1 * val1) + (val2 * val2); - i += grid_size; - } - if (i < n_elem) - { - const eT1 val = (in_mem[in_mem_offset + i] - mean_val); - aux_mem[tid] += (val * val); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/and_reduce.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/and_reduce.cl deleted file mode 100644 index 251e9e6..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/and_reduce.cl +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -void -COOT_FN(PREFIX,and_subgroup_reduce_other)(__local volatile eT1* data, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - data[tid] &= data[tid + i]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -COOT_FN(PREFIX,and_subgroup_reduce_8)(__local volatile eT1* data, UWORD tid) - { - data[tid] &=data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &=data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &=data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &=data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,and_subgroup_reduce_16)(__local volatile eT1* data, UWORD tid) - { - data[tid] &= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,and_subgroup_reduce_32)(__local volatile eT1* data, UWORD tid) - { - data[tid] &= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,and_subgroup_reduce_64)(__local volatile eT1* data, UWORD tid) - { - data[tid] &= data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid 
+ 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 1]; - } - - - -void COOT_FN(PREFIX,and_subgroup_reduce_128)(__local volatile eT1* data, UWORD tid) - { - data[tid] &= data[tid + 128]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] &= data[tid + 1]; - } - - - -__kernel -void -COOT_FN(PREFIX,and_reduce)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. 
- aux_mem[tid] = ~((eT1) 0); - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] &= in_mem[in_mem_offset + i]; - aux_mem[tid] &= in_mem[in_mem_offset + i + get_local_size(0)]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] &= in_mem[in_mem_offset + i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,and_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/and_reduce_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/and_reduce_small.cl deleted file mode 100644 index 20df12a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/and_reduce_small.cl +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,and_reduce_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = ~((eT1) 0); - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] &= in_mem[in_mem_offset + i]; - aux_mem[tid] &= in_mem[in_mem_offset + i + get_local_size(0)]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] &= in_mem[in_mem_offset + i]; - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/ipiv_det.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/ipiv_det.cl deleted file mode 100644 index f5842ff..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/ipiv_det.cl +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,ipiv_det)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - // This kernel is not used by the OpenCL backend, so we leave it empty! - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/ipiv_det_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/ipiv_det_small.cl deleted file mode 100644 index 210b7f9..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/ipiv_det_small.cl +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,ipiv_det_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - // This kernel is not used by the OpenCL backend, so we leave it empty! 
- } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/or_reduce.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/or_reduce.cl deleted file mode 100644 index 53d3c75..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/or_reduce.cl +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -void -COOT_FN(PREFIX,or_subgroup_reduce_other)(__local volatile eT1* data, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - data[tid] |= data[tid + i]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -COOT_FN(PREFIX,or_subgroup_reduce_8)(__local volatile eT1* data, UWORD tid) - { - data[tid] |= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,or_subgroup_reduce_16)(__local volatile eT1* data, UWORD tid) - { - data[tid] |= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] 
|= data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,or_subgroup_reduce_32)(__local volatile eT1* data, UWORD tid) - { - data[tid] |= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,or_subgroup_reduce_64)(__local volatile eT1* data, UWORD tid) - { - data[tid] |= data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 1]; - } - - - -void COOT_FN(PREFIX,or_subgroup_reduce_128)(__local volatile eT1* data, UWORD tid) - { - data[tid] |= data[tid + 128]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] |= data[tid + 1]; - } - - - -__kernel -void -COOT_FN(PREFIX,or_reduce)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) 
* 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = (eT1) 0; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] |= in_mem[in_mem_offset + i]; - aux_mem[tid] |= in_mem[in_mem_offset + i + get_local_size(0)]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] |= in_mem[in_mem_offset + i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,or_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/or_reduce_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/or_reduce_small.cl deleted file mode 100644 index 041662f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_integral/or_reduce_small.cl +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,or_reduce_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = (eT1) 0; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] |= in_mem[in_mem_offset + i]; - aux_mem[tid] |= in_mem[in_mem_offset + i + get_local_size(0)]; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] |= in_mem[in_mem_offset + i]; - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/diag_prod.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/diag_prod.cl deleted file mode 100644 index 792e738..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/diag_prod.cl +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// Forward declarations we may need. -void COOT_FN(PREFIX,prod_subgroup_reduce_other)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,prod_subgroup_reduce_8)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,prod_subgroup_reduce_16)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,prod_subgroup_reduce_32)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,prod_subgroup_reduce_64)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,prod_subgroup_reduce_128)(__local volatile eT1* data, UWORD tid); - - - -// Compute the product of the elements on the diagonal of a matrix. -__kernel -void -COOT_FN(PREFIX,diag_prod)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_rows, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 1; - - while (i + get_local_size(0) < n_rows) - { - const UWORD index1 = i * n_rows + i; - const eT1 v1 = in_mem[in_mem_offset + index1]; - const UWORD index2 = (i + get_local_size(0)) * n_rows + (i + get_local_size(0)); - const eT1 v2 = in_mem[in_mem_offset + index2]; - aux_mem[tid] *= v1 * v2; - i += grid_size; - } - if (i < n_rows) - { - const UWORD index = i * n_rows + i; - const eT1 v = in_mem[in_mem_offset + index]; - aux_mem[tid] *= v; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] *= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,prod_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid 
== 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/diag_prod_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/diag_prod_small.cl deleted file mode 100644 index 4d1067c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/diag_prod_small.cl +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Compute the product of the elements on the diagonal of a matrix. 
-__kernel -void -COOT_FN(PREFIX,diag_prod_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_rows, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 1; - - while (i + get_local_size(0) < n_rows) - { - const UWORD index1 = i * n_rows + i; - const eT1 v1 = in_mem[in_mem_offset + index1]; - const UWORD index2 = (i + get_local_size(0)) * n_rows + (i + get_local_size(0)); - const eT1 v2 = in_mem[in_mem_offset + index2]; - aux_mem[tid] *= v1 * v2; - i += grid_size; - } - if (i < n_rows) - { - const UWORD index = i * n_rows + i; - const eT1 v = in_mem[in_mem_offset + index]; - aux_mem[tid] *= v; - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] *= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/extract_cx.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/extract_cx.cl deleted file mode 100644 index a343916..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/extract_cx.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Extract real or imaginary elements from a complex matrix into a real matrix. -// This kernel is a bit of a hack until we have actual complex matrix support! -__kernel -void -COOT_FN(PREFIX,extract_cx)(__global const eT1* in_mem, - const UWORD in_mem_offset, - __global eT1* out_mem, - const UWORD out_mem_offset, - const UWORD real_or_imag, - const UWORD n_rows, - const UWORD n_cols, - const UWORD in_M_n_rows, - const UWORD out_M_n_rows) - { - // If real_or_imag is 0, we extract the real part. If 1, we extract the - // imaginary part. - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD in_index = 2 * (col * in_M_n_rows + row) + real_or_imag; - const UWORD out_index = col * out_M_n_rows + row; - - if (col < n_cols && row < n_rows) - { - out_mem[out_mem_offset + out_index] = in_mem[in_mem_offset + in_index]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_l.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_l.cl deleted file mode 100644 index 69f4e85..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_l.cl +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -// This extracts L and U from in, and sets the lower diagonal of U to 0. -// It's okay if U == in, if n_rows <= n_cols. -__kernel -void -COOT_FN(PREFIX,lu_extract_l)(__global eT1* L, - const UWORD L_offset, - __global eT1* U, - const UWORD U_offset, - const __global eT1* in, - const UWORD in_offset, - const UWORD n_rows, - const UWORD n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - // Note that neither U nor L must be square. - // L has size n_rows x min(n_rows, n_cols). - // U has size min(n_rows, n_cols) x n_cols. - const UWORD min_rows_cols = min(n_rows, n_cols); - - const UWORD in_index = row + n_rows * col; // this is also L_out_index - const UWORD U_out_index = row + min_rows_cols * col; - - if ((row < n_rows) && (col < min_rows_cols)) - { - L[L_offset + in_index] = (row > col) ? in[in_offset + in_index] : ((row == col) ? 1 : 0); - } - - if ((row < min_rows_cols) && (col < n_cols)) - { - U[U_offset + U_out_index] = (row > col) ? 0 : in[in_offset + in_index]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_p.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_p.cl deleted file mode 100644 index c1a4c2b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_p.cl +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,lu_extract_p)(__global eT1* P, - const UWORD P_offset, - __global const UWORD* ipiv2, - const UWORD n_rows) - { - const UWORD row = get_global_id(0); - - if (row < n_rows) - { - const UWORD index = row + ipiv2[row] * n_rows; - P[P_offset + index] = (eT1) 1; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_pivoted_l.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_pivoted_l.cl deleted file mode 100644 index c81a122..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/lu_extract_pivoted_l.cl +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// This extracts L from U, and sets the lower diagonal of U to 0. -__kernel -void -COOT_FN(PREFIX,lu_extract_pivoted_l)(__global eT1* L, - const UWORD L_offset, - __global eT1* U, - const UWORD U_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD n_rows, - const UWORD n_cols, - __global const UWORD* ipiv) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - // Note that neither U nor L must be square. 
- // L has size n_rows x min(n_rows, n_cols). - // U has size min(n_rows, n_cols) x n_cols. - const UWORD min_rows_cols = min(n_rows, n_cols); - - const UWORD in_index = row + n_rows * col; // this is also L_out_index - const UWORD U_out_index = row + min_rows_cols * col; - - // We are extracted a permuted version of L. - // Instead of extracting row i of U as row i of L, - // we extract row i of U as row ipiv[i] of L. - const UWORD L_out_index = ipiv[row] + n_rows * col; - - if ((row < n_rows) && (col < min_rows_cols)) - { - L[L_offset + L_out_index] = (row > col) ? in[in_offset + in_index] : ((row == col) ? 1 : 0); - } - - if ((row < min_rows_cols) && (col < n_cols)) - { - U[U_offset + U_out_index] = (row > col) ? 0 : in[in_offset + in_index]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_inf.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_inf.cl deleted file mode 100644 index 8c2a9a3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_inf.cl +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_any_inf)(__global const eT1* X, - const UWORD X_offset, - const UWORD n_elem, - __global uint* out, - const UWORD out_offset, - __local volatile uint* aux_mem, - const eT1 val /* ignored */) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT1 val2 = X[X_offset + i + get_local_size(0)]; - - aux_mem[tid] |= isinf(val1); - aux_mem[tid] |= isinf(val2); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[X_offset + i]; - aux_mem[tid] |= isinf(val1); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN(COOT_FN(or_subgroup_reduce_,SUBGROUP_SIZE_NAME),_u32)(aux_mem, tid); - } - - if (tid == 0) - { - out[out_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_inf_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_inf_small.cl deleted file mode 100644 index 7baa213..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_inf_small.cl +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_any_inf_small)(__global const eT1* X, - const UWORD X_offset, - const UWORD n_elem, - __global uint* out, - const UWORD out_offset, - __local volatile uint* aux_mem, - const eT1 val /* ignored */) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT1 val2 = X[X_offset + i + get_local_size(0)]; - - aux_mem[tid] |= isinf(val1); - aux_mem[tid] |= isinf(val2); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[X_offset + i]; - aux_mem[tid] |= isinf(val1); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out[out_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nan.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nan.cl deleted file mode 100644 index 640d8ad..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nan.cl +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// 
you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_any_nan)(__global const eT1* X, - const UWORD X_offset, - const UWORD n_elem, - __global uint* out, - const UWORD out_offset, - __local volatile uint* aux_mem, - const eT1 val /* ignored */) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT1 val2 = X[X_offset + i + get_local_size(0)]; - - aux_mem[tid] |= isnan(val1); - aux_mem[tid] |= isnan(val2); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[X_offset + i]; - aux_mem[tid] |= isnan(val1); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN(COOT_FN(or_subgroup_reduce_,SUBGROUP_SIZE_NAME),_u32)(aux_mem, tid); - } - - if (tid == 0) - { - out[out_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nan_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nan_small.cl deleted file mode 100644 index 6ccadc9..0000000 --- 
a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nan_small.cl +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_any_nan_small)(__global const eT1* X, - const UWORD X_offset, - const UWORD n_elem, - __global uint* out, - const UWORD out_offset, - __local volatile uint* aux_mem, - const eT1 val /* ignored */) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT1 val2 = X[X_offset + i + get_local_size(0)]; - - aux_mem[tid] |= isnan(val1); - aux_mem[tid] |= isnan(val2); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[X_offset + i]; - aux_mem[tid] |= isnan(val1); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out[out_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nonfinite.cl 
b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nonfinite.cl deleted file mode 100644 index 4588da3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nonfinite.cl +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_any_nonfinite)(__global const eT1* X, - const UWORD X_offset, - const UWORD n_elem, - __global uint* out, - const UWORD out_offset, - __local volatile uint* aux_mem, - const eT1 val /* ignored */) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT1 val2 = X[X_offset + i + get_local_size(0)]; - - aux_mem[tid] |= isnan(val1) | isinf(val1); - aux_mem[tid] |= isnan(val2) | isinf(val2); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[X_offset + i]; - aux_mem[tid] |= isnan(val1) | isinf(val1); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < 
SUBGROUP_SIZE) - { - COOT_FN(COOT_FN(or_subgroup_reduce_,SUBGROUP_SIZE_NAME),_u32)(aux_mem, tid); - } - - if (tid == 0) - { - out[out_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nonfinite_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nonfinite_small.cl deleted file mode 100644 index 48c7b86..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_any_nonfinite_small.cl +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_any_nonfinite_small)(__global const eT1* X, - const UWORD X_offset, - const UWORD n_elem, - __global uint* out, - const UWORD out_offset, - __local volatile uint* aux_mem, - const eT1 val /* ignored */) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT1 val2 = X[X_offset + i + get_local_size(0)]; - - aux_mem[tid] |= isnan(val1) | isinf(val1); - aux_mem[tid] |= isnan(val2) | isinf(val2); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT1 val1 = X[X_offset + i]; - aux_mem[tid] |= isnan(val1) | isinf(val1); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out[out_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isfinite.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isfinite.cl deleted file mode 100644 index 64b470b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isfinite.cl +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_isfinite)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - if (i < n_elem) - { - const eT1 val = (eT1) X[X_offset + i]; - out[out_offset + i] = isfinite(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isnan.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isnan.cl deleted file mode 100644 index 437fcb1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isnan.cl +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_isnan)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - if (i < n_elem) - { - const eT1 val1 = (eT1) X[X_offset + i]; - out[out_offset + i] = isnan(val1); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isnonfinite.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isnonfinite.cl deleted file mode 100644 index 582fbe4..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/rel_isnonfinite.cl +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_isnonfinite)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - if (i < n_elem) - { - const eT1 val = (eT1) X[X_offset + i]; - out[out_offset + i] = !isfinite(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_1.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_1.cl deleted file mode 100644 index 1177ea0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_1.cl +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// Forward declarations we may need. 
-void COOT_FN(PREFIX,accu_subgroup_reduce_other)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,accu_subgroup_reduce_8)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,accu_subgroup_reduce_16)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,accu_subgroup_reduce_32)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,accu_subgroup_reduce_64)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,accu_subgroup_reduce_128)(__local volatile eT1* data, UWORD tid); - - - -__kernel -void -COOT_FN(PREFIX,vec_norm_1)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] += ET1_ABS(in_mem[in_mem_offset + i]) + ET1_ABS(in_mem[in_mem_offset + i + get_local_size(0)]); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] += ET1_ABS(in_mem[in_mem_offset + i]); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,accu_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_1_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_1_small.cl deleted file mode 100644 index b0a175d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_1_small.cl +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed 
under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,vec_norm_1_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] += ET1_ABS(in_mem[in_mem_offset + i]) + ET1_ABS(in_mem[in_mem_offset + i + get_local_size(0)]); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] += ET1_ABS(in_mem[in_mem_offset + i]); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2.cl deleted file mode 100644 index 4587b92..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2.cl +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you 
may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,vec_norm_2)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 v1 = (in_mem[in_mem_offset + i] * in_mem[in_mem_offset + i]); - const eT1 v2 = (in_mem[in_mem_offset + i + get_local_size(0)] * in_mem[in_mem_offset + i + get_local_size(0)]); - aux_mem[tid] += v1 + v2; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] += (in_mem[in_mem_offset + i] * in_mem[in_mem_offset + i]); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,accu_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_robust.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_robust.cl deleted file mode 100644 index 20ef94e..0000000 --- 
a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_robust.cl +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,vec_norm_2_robust)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem, - const eT1 max_val) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 v1 = (in_mem[in_mem_offset + i] / max_val); - const eT1 v2 = (in_mem[in_mem_offset + i + get_local_size(0)] / max_val); - aux_mem[tid] += (v1 * v1) + (v2 * v2); - i += grid_size; - } - if (i < n_elem) - { - const eT1 v1 = (in_mem[in_mem_offset + i] / max_val); - aux_mem[tid] += (v1 * v1); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,accu_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } 
diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_robust_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_robust_small.cl deleted file mode 100644 index 11de83d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_robust_small.cl +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,vec_norm_2_robust_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem, - const eT1 max_val) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 v1 = (in_mem[in_mem_offset + i] / max_val); - const eT1 v2 = (in_mem[in_mem_offset + i + get_local_size(0)] / max_val); - aux_mem[tid] += (v1 * v1) + (v2 * v2); - i += grid_size; - } - if (i < n_elem) - { - const eT1 v1 = (in_mem[in_mem_offset + i] / max_val); - aux_mem[tid] += (v1 + v1); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] += 
aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_small.cl deleted file mode 100644 index 7ea93a0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_2_small.cl +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,vec_norm_2_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 v1 = (in_mem[in_mem_offset + i] * in_mem[in_mem_offset + i]); - const eT1 v2 = (in_mem[in_mem_offset + i + get_local_size(0)] * in_mem[in_mem_offset + i + get_local_size(0)]); - aux_mem[tid] += v1 + v2; - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] += (in_mem[in_mem_offset + i] * in_mem[in_mem_offset + i]); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_k.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_k.cl deleted file mode 100644 index 54bd953..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_k.cl +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,vec_norm_k)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem, - const UWORD k) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 v1 = pow(in_mem[in_mem_offset + i], (eT1) k); - const eT1 v2 = pow(in_mem[in_mem_offset + i + get_local_size(0)], (eT1) k); - aux_mem[tid] += v1 + v2; - i += grid_size; - } - if (i < n_elem) - { - const eT1 v1 = pow(in_mem[in_mem_offset + i], (eT1) k); - aux_mem[tid] += v1; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,accu_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_k_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_k_small.cl deleted file mode 100644 index b7e0a3c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_k_small.cl +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,vec_norm_k_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem, - const UWORD k) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT1 v1 = pow(in_mem[in_mem_offset + i], (eT1) k); - const eT1 v2 = pow(in_mem[in_mem_offset + i + get_local_size(0)], (eT1) k); - aux_mem[tid] += v1 + v2; - i += grid_size; - } - if (i < n_elem) - { - const eT1 v1 = pow(in_mem[in_mem_offset + i], (eT1) k); - aux_mem[tid] += v1; - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_min.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_min.cl deleted file mode 100644 index 888fcb8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_min.cl +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file 
except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -// Forward declarations we may need. -void COOT_FN(PREFIX,min_subgroup_reduce_other)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,min_subgroup_reduce_8)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,min_subgroup_reduce_16)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,min_subgroup_reduce_32)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,min_subgroup_reduce_64)(__local volatile eT1* data, UWORD tid); -void COOT_FN(PREFIX,min_subgroup_reduce_128)(__local volatile eT1* data, UWORD tid); - - - -__kernel -void -COOT_FN(PREFIX,vec_norm_min)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. 
- aux_mem[tid] = COOT_FN(coot_type_min_,eT1)(); - - if (i < n_elem) - { - aux_mem[tid] = ET1_ABS(in_mem[in_mem_offset + i]); - } - if (i + get_local_size(0) < n_elem) - { - const eT1 v = ET1_ABS(in_mem[in_mem_offset + i + get_local_size(0)]); - aux_mem[tid] = min(aux_mem[tid], v); - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - const eT1 v = min(ET1_ABS(in_mem[in_mem_offset + i]), ET1_ABS(in_mem[in_mem_offset + i + get_local_size(0)])); - aux_mem[tid] = min(aux_mem[tid], v); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], ET1_ABS(in_mem[in_mem_offset + i])); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] = min(aux_mem[tid], aux_mem[tid + s]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN_3(PREFIX,min_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_min_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_min_small.cl deleted file mode 100644 index 1639e1f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/oneway_real/vec_norm_min_small.cl +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,vec_norm_min_small)(__global const eT1* in_mem, - const UWORD in_mem_offset, - const UWORD n_elem, - __global eT1* out_mem, - const UWORD out_mem_offset, - __local volatile eT1* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - // Make sure all auxiliary memory is initialized to something that won't - // screw up the final reduce. - aux_mem[tid] = COOT_FN(coot_type_min_,eT1)(); - - if (i < n_elem) - { - aux_mem[tid] = ET1_ABS(in_mem[in_mem_offset + i]); - } - if (i + get_local_size(0) < n_elem) - { - const eT1 v = ET1_ABS(in_mem[in_mem_offset + i + get_local_size(0)]); - aux_mem[tid] = min(aux_mem[tid], v); - } - i += grid_size; - - while (i + get_local_size(0) < n_elem) - { - const eT1 v = min(ET1_ABS(in_mem[in_mem_offset + i]), ET1_ABS(in_mem[in_mem_offset + i + get_local_size(0)])); - aux_mem[tid] = min(aux_mem[tid], v); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] = min(aux_mem[tid], ET1_ABS(in_mem[in_mem_offset + i])); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] = min(aux_mem[tid], aux_mem[tid + s]); - } - } - - if (tid == 0) - { - out_mem[out_mem_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_div_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_div_post.cl deleted file mode 100644 index 7c371bd..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_div_post.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache 
License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_div_post)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_offset + out_loc] = out_src[out_src_offset + out_src_loc] / ((eT2) in[in_offset + in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_div_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_div_pre.cl deleted file mode 100644 index 718d1ef..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_div_pre.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the 
"License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_div_pre)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_offset + out_loc] = ((eT2) in[in_offset + in_loc]) / out_src[out_src_offset + out_src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_minus_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_minus_post.cl deleted file mode 100644 index 243d5ea..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_minus_post.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may 
not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_minus_post)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_offset + out_loc] = out_src[out_src_offset + out_src_loc] - ((eT2) in[in_offset + in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_minus_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_minus_pre.cl deleted file mode 100644 index 50fd41e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_minus_pre.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file 
except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_minus_pre)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_offset + out_loc] = ((eT2) in[in_offset + in_loc]) - out_src[out_src_offset + out_src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_plus.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_plus.cl deleted file mode 100644 index dcb6e45..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_plus.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_plus)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_offset + out_loc] = out_src[out_src_offset + out_src_loc] + ((eT2) in[in_offset + in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_schur.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_schur.cl deleted file mode 100644 index 1fd0953..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_schur.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_schur)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD out_src_loc = out_col * out_src_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_offset + out_loc] = out_src[out_src_offset + out_src_loc] * ((eT2) in[in_offset + in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_set.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_set.cl deleted file mode 100644 index 18189a1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_set.cl +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_set)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = out_col * out_M_n_rows + out_row; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_offset + out_loc] = ((eT2) in[in_offset + in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_div_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_div_post.cl deleted file mode 100644 index 054f937..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_div_post.cl +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_subset_div_post)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - __global const UWORD* indices, - const UWORD indices_offset, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? 
- indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_offset + out_loc] = out_src[out_src_offset + out_src_loc] / ((eT2) in[in_offset + in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_div_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_div_pre.cl deleted file mode 100644 index 9b782e9..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_div_pre.cl +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_subset_div_pre)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - __global const UWORD* indices, - const UWORD indices_offset, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? - indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_offset + out_loc] = ((eT2) in[in_offset + in_loc]) / out_src[out_src_offset + out_src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_minus_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_minus_post.cl deleted file mode 100644 index 436ac08..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_minus_post.cl +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_subset_minus_post)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - __global const UWORD* indices, - const UWORD indices_offset, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? 
- indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_offset + out_loc] = out_src[out_src_offset + out_src_loc] - ((eT2) in[in_offset + in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_minus_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_minus_pre.cl deleted file mode 100644 index 58a03a7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_minus_pre.cl +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_subset_minus_pre)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - __global const UWORD* indices, - const UWORD indices_offset, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? - indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_offset + out_loc] = ((eT2) in[in_offset + in_loc]) - out_src[out_src_offset + out_src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_plus.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_plus.cl deleted file mode 100644 index 624293b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_plus.cl +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_subset_plus)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - __global const UWORD* indices, - const UWORD indices_offset, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? 
- indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_offset + out_loc] = out_src[out_src_offset + out_src_loc] + ((eT2) in[in_offset + in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_schur.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_schur.cl deleted file mode 100644 index 701cff1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_schur.cl +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_subset_schur)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - __global const UWORD* indices, - const UWORD indices_offset, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD in_loc = in_col * in_M_n_rows + in_row; - const UWORD out_loc = (mode >= 2) ? - out_col * out_M_n_rows + out_row : - (mode == 0) ? - indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD out_src_loc = (mode == 0 || mode == 2) ? - indices[out_col * indices_incr] * out_src_M_n_rows + out_row : - out_col * out_src_M_n_rows + indices[out_row * indices_incr]; - - out[out_offset + out_loc] = out_src[out_src_offset + out_src_loc] * ((eT2) in[in_offset + in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_set.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_set.cl deleted file mode 100644 index fa3ea4f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/broadcast_subset_set.cl +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,broadcast_subset_set)(__global eT2* out, - const UWORD out_offset, - __global const eT2* out_src, - const UWORD out_src_offset, - __global const eT1* in, - const UWORD in_offset, - __global const UWORD* indices, - const UWORD indices_offset, - const UWORD mode, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD out_M_n_rows, - const UWORD out_src_M_n_rows, - const UWORD in_M_n_rows, - const UWORD indices_incr) - { - const UWORD out_row = get_global_id(0); - const UWORD out_col = get_global_id(1); - - if ((out_row < n_rows * copies_per_row) && (out_col < n_cols * copies_per_col)) - { - const UWORD in_row = out_row % n_rows; - const UWORD in_col = out_col % n_cols; - - const UWORD out_loc = (mode == 0) ? 
- indices[out_col * indices_incr] * out_M_n_rows + out_row : - out_col * out_M_n_rows + indices[out_row * indices_incr]; - const UWORD in_loc = in_col * in_M_n_rows + in_row; - - out[out_offset + out_loc] = ((eT2) in[in_offset + in_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/clamp.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/clamp.cl deleted file mode 100644 index 32c075e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/clamp.cl +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,clamp)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 min_val, - const eT1 max_val, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD src_index = src_offset + row + col * src_M_n_rows; - const UWORD dest_index = dest_offset + row + col * dest_M_n_rows; - - const eT1 clamped_val = max(min_val, min(max_val, src[src_index])); - dest[dest_index] = (eT2) clamped_val; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/cross.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/cross.cl deleted file mode 100644 index e9db1d0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/cross.cl +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,cross)(__global eT2* out, - const UWORD out_offset, - __global const eT1* A, - const UWORD A_offset, - __global const eT1* B, - const UWORD B_offset) // A and B should have 3 elements - { - const UWORD idx = get_global_id(0); - - if (idx < 3) - { - const UWORD a1_index = ((idx + 1) % 3) + A_offset; - const UWORD a2_index = ((idx + 2) % 3) + A_offset; - - const UWORD b1_index = ((idx + 2) % 3) + B_offset; - const UWORD b2_index = ((idx + 1) % 3) + B_offset; - - const eT1 val = (A[a1_index] * B[b1_index]) - (A[a2_index] * B[b2_index]); - out[idx + out_offset] = (eT2) val; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/dot.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/dot.cl deleted file mode 100644 index 3d43e2b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/dot.cl +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -void -COOT_FN(PREFIX,dot_subgroup_reduce_other)(__local volatile twoway_promoted_eT* data, UWORD tid) - { - for(UWORD i = SUBGROUP_SIZE; i > 0; i >>= 1) - { - if (tid < i) - data[tid] += data[tid + i]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - } - } - - - -void -COOT_FN(PREFIX,dot_subgroup_reduce_8)(__local volatile twoway_promoted_eT* data, UWORD tid) - { - data[tid] += data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,dot_subgroup_reduce_16)(__local volatile twoway_promoted_eT* data, UWORD tid) - { - data[tid] += data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,dot_subgroup_reduce_32)(__local volatile twoway_promoted_eT* data, UWORD tid) - { - data[tid] += data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,dot_subgroup_reduce_64)(__local volatile twoway_promoted_eT* data, UWORD tid) - { - data[tid] += data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 8]; - 
SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 1]; - } - - - -void -COOT_FN(PREFIX,dot_subgroup_reduce_128)(__local volatile twoway_promoted_eT* data, UWORD tid) - { - data[tid] += data[tid + 128]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 64]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 32]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 16]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 8]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 4]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 2]; - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - data[tid] += data[tid + 1]; - } - - - -__kernel -void -COOT_FN(PREFIX,dot)(__global twoway_promoted_eT* out_mem, - __global const eT1* A, - const UWORD A_offset, - __global const eT2* B, - const UWORD B_offset, - const UWORD n_elem, - __local volatile twoway_promoted_eT* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] += (((twoway_promoted_eT) A[A_offset + i]) * ((twoway_promoted_eT) B[B_offset + i])) + - (((twoway_promoted_eT) A[A_offset + i + get_local_size(0)]) * ((twoway_promoted_eT) B[B_offset + i + get_local_size(0)])); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] += (((twoway_promoted_eT) A[A_offset + i]) * ((twoway_promoted_eT) B[B_offset + i])); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - 
COOT_FN_3(PREFIX,dot_subgroup_reduce_,SUBGROUP_SIZE_NAME)(aux_mem, tid); - } - - if (tid == 0) - { - out_mem[get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/dot_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/dot_small.cl deleted file mode 100644 index 551ce4f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/dot_small.cl +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,dot_small)(__global twoway_promoted_eT* out_mem, - __global const eT1* A, - const UWORD A_offset, - __global const eT2* B, - const UWORD B_offset, - const UWORD n_elem, - __local volatile twoway_promoted_eT* aux_mem) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - aux_mem[tid] += (((twoway_promoted_eT) A[A_offset + i]) * ((twoway_promoted_eT) B[B_offset + i])) + - (((twoway_promoted_eT) A[A_offset + i + get_local_size(0)]) * ((twoway_promoted_eT) B[B_offset + i + get_local_size(0)])); - i += grid_size; - } - if (i < n_elem) - { - aux_mem[tid] += (((twoway_promoted_eT) A[A_offset + i]) * ((twoway_promoted_eT) B[B_offset + i])); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] += aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out_mem[get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/htrans.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/htrans.cl deleted file mode 100644 index d5c5403..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/htrans.cl +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,htrans)(__global eT2* out, - const UWORD out_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD in_n_rows, - const UWORD in_n_cols) - { - // For a non-inplace transpose, we can use a pretty naive approach. - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD in_total_offset = in_offset + row + col * in_n_rows; - const UWORD out_total_offset = out_offset + col + row * in_n_cols; - - if( (row < in_n_rows) && (col < in_n_cols) ) - { - const eT2 element = (eT2) in[in_total_offset]; - out[out_total_offset] = COOT_FN(coot_conj_,eT2)(element); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_div_array.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_div_array.cl deleted file mode 100644 index cd8c236..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_div_array.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve1_div_array)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] /= src[i + src_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_div_sve1.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_div_sve1.cl deleted file mode 100644 index 1f9f216..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_div_sve1.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve1_div_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] /= src[src_locs[i + src_locs_offset] + src_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_eq_array.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_eq_array.cl deleted file mode 100644 index 5e08dd9..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_eq_array.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve1_eq_array)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] = src[i + src_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_eq_sve1.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_eq_sve1.cl deleted file mode 100644 index 7878625..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_eq_sve1.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve1_eq_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] = src[src_locs[i + src_locs_offset] + src_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_minus_array.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_minus_array.cl deleted file mode 100644 index ea1ecc8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_minus_array.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve1_minus_array)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] -= src[i + src_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_minus_sve1.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_minus_sve1.cl deleted file mode 100644 index c54f258..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_minus_sve1.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve1_minus_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] -= src[src_locs[i + src_locs_offset] + src_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_mul_array.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_mul_array.cl deleted file mode 100644 index ea6440f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_mul_array.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve1_mul_array)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] *= src[i + src_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_mul_sve1.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_mul_sve1.cl deleted file mode 100644 index eeeff6a..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_mul_sve1.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve1_mul_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] *= src[src_locs[i + src_locs_offset] + src_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_plus_array.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_plus_array.cl deleted file mode 100644 index b750e57..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_plus_array.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve1_plus_array)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] += src[i + src_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_plus_sve1.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_plus_sve1.cl deleted file mode 100644 index 882139d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve1_plus_sve1.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve1_plus_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] += src[src_locs[i + src_locs_offset] + src_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_div_array.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_div_array.cl deleted file mode 100644 index 131acf0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_div_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve2_div_array)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - const UWORD src_loc = row + col * n_rows + src_offset; - - dest[dest_loc] /= (eT2) src[src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_div_sve2.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_div_sve2.cl deleted file mode 100644 index 061e82f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_div_sve2.cl +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve2_div_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col + src_col_locs_offset]); - - dest[dest_loc] /= (eT2) src[src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_eq_array.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_eq_array.cl deleted file mode 100644 index 9924db0..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_eq_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve2_eq_array)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - const UWORD src_loc = row + col * n_rows + src_offset; - - dest[dest_loc] = (eT2) src[src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_eq_sve2.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_eq_sve2.cl deleted file mode 100644 index 65961b8..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_eq_sve2.cl +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve2_eq_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? 
col : src_col_locs[col + src_col_locs_offset]); - - dest[dest_loc] = (eT2) src[src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_minus_array.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_minus_array.cl deleted file mode 100644 index 774ddf7..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_minus_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve2_minus_array)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? 
col : dest_col_locs[col + dest_col_locs_offset]); - - const UWORD src_loc = row + col * n_rows + src_offset; - - dest[dest_loc] -= (eT2) src[src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_minus_sve2.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_minus_sve2.cl deleted file mode 100644 index 721674e..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_minus_sve2.cl +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve2_minus_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? 
row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col + src_col_locs_offset]); - - dest[dest_loc] -= (eT2) src[src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_mul_array.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_mul_array.cl deleted file mode 100644 index c7c3108..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_mul_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve2_mul_array)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? 
row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - const UWORD src_loc = row + col * n_rows + src_offset; - - dest[dest_loc] *= (eT2) src[src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_mul_sve2.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_mul_sve2.cl deleted file mode 100644 index 294b915..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_mul_sve2.cl +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve2_mul_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col + src_col_locs_offset]); - - dest[dest_loc] *= (eT2) src[src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_plus_array.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_plus_array.cl deleted file mode 100644 index 4def038..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_plus_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve2_plus_array)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - const UWORD src_loc = row + col * n_rows + src_offset; - - dest[dest_loc] += (eT2) src[src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_plus_sve2.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_plus_sve2.cl deleted file mode 100644 index d373a22..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/inplace_sve2_plus_sve2.cl +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,inplace_sve2_plus_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? 
col : src_col_locs[col + src_col_locs_offset]); - - dest[dest_loc] += (eT2) src[src_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_colwise_conv_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_colwise_conv_post.cl deleted file mode 100644 index 4688349..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_colwise_conv_post.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,max_colwise_conv_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - __global const eT1* colptr = &(src[src_offset + col * src_M_n_rows]); - eT1 acc = (eT1) colptr[0]; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 1; i < n_rows; ++i) - { - acc = max(acc, colptr[i]); - } - dest[dest_offset + col * dest_mem_incr] = (eT2) (acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_colwise_conv_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_colwise_conv_pre.cl deleted file mode 100644 index d0c9c67..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_colwise_conv_pre.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,max_colwise_conv_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - __global const eT1* colptr = &(src[src_offset + col * src_M_n_rows]); - eT2 acc = (eT2) colptr[0]; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 1; i < n_rows; ++i) - { - acc = max(acc, (eT2) (colptr[i])); - } - dest[dest_offset + col * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_cube_col_conv_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_cube_col_conv_post.cl deleted file mode 100644 index 0064a8c..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_cube_col_conv_post.cl +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,max_cube_col_conv_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = get_global_id(0); - const UWORD slice = get_global_id(1); - - if(row < n_rows && slice < n_slices) - { - eT1 acc = (eT1) src[src_offset + row + slice * n_rows * n_cols]; - for(UWORD i = 1; i < n_cols; ++i) - { - acc = max(acc, src[src_offset + (i * n_rows) + row + slice * n_rows * n_cols]); - } - dest[dest_offset + row + slice * n_rows] = (eT2) (acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_cube_col_conv_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_cube_col_conv_pre.cl deleted file mode 100644 index 46ff643..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_cube_col_conv_pre.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,max_cube_col_conv_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = get_global_id(0); - const UWORD slice = get_global_id(1); - - if(row < n_rows && slice < n_slices) - { - eT2 acc = (eT2) src[src_offset + row + slice * n_rows * n_cols]; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=1; i < n_cols; ++i) - { - acc = max(acc, (eT2) (src[src_offset + (i * n_rows) + row + slice * n_rows * n_cols])); - } - dest[dest_offset + row + slice * n_rows] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_rowwise_conv_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_rowwise_conv_post.cl deleted file mode 100644 index 9771bb4..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_rowwise_conv_post.cl +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,max_rowwise_conv_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT1 acc = (eT1) src[src_offset + row]; - for(UWORD i = 1; i < n_cols; ++i) - { - acc = max(acc, src[src_offset + (i * n_rows) + row]); - } - dest[dest_offset + row * dest_mem_incr] = (eT2) (acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_rowwise_conv_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_rowwise_conv_pre.cl deleted file mode 100644 index f16c906..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/max_rowwise_conv_pre.cl +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,max_rowwise_conv_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT2 acc = (eT2) src[src_offset + row]; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=1; i < n_cols; ++i) - { - acc = max(acc, (eT2) (src[src_offset + (i * src_M_n_rows) + row])); - } - dest[dest_offset + row * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_colwise_conv_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_colwise_conv_post.cl deleted file mode 100644 index 878ff93..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_colwise_conv_post.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,mean_colwise_conv_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - __global const eT1* colptr = &(src[src_offset + col * src_M_n_rows]); - eT1 acc = (eT1) (0); - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < n_rows; ++i) - { - acc += colptr[i]; - } - dest[dest_offset + col * dest_mem_incr] = (eT2) (acc / (eT1) n_rows); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_colwise_conv_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_colwise_conv_pre.cl deleted file mode 100644 index a820b6d..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_colwise_conv_pre.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,mean_colwise_conv_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - __global const eT1* colptr = &(src[col * src_M_n_rows + src_offset]); - eT2 acc = (eT2) (0); - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < n_rows; ++i) - { - acc += (eT2) (colptr[i]); - } - dest[dest_offset + col * dest_mem_incr] = (acc / (eT2) n_rows); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_rowwise_conv_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_rowwise_conv_post.cl deleted file mode 100644 index eb63132..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_rowwise_conv_post.cl +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,mean_rowwise_conv_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT1 acc = (eT1) (0); - for(UWORD i = 0; i < n_cols; ++i) - { - acc += src[src_offset + (i * src_M_n_rows) + row]; - } - dest[dest_offset + row * dest_mem_incr] = (eT2) (acc / (eT1) n_cols); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_rowwise_conv_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_rowwise_conv_pre.cl deleted file mode 100644 index a158168..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/mean_rowwise_conv_pre.cl +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,mean_rowwise_conv_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT2 acc = (eT2) (0); - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=0; i < n_cols; ++i) - { - acc += (eT2) (src[src_offset + (i * src_M_n_rows) + row]); - } - dest[dest_offset + row * dest_mem_incr] = (acc / (eT2) n_cols); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_colwise_conv_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_colwise_conv_post.cl deleted file mode 100644 index 67cd08f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_colwise_conv_post.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,min_colwise_conv_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - __global const eT1* colptr = &(src[src_offset + col * src_M_n_rows]); - eT1 acc = (eT1) colptr[0]; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 1; i < n_rows; ++i) - { - acc = min(acc, colptr[i]); - } - dest[dest_offset + col * dest_mem_incr] = (eT2) (acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_colwise_conv_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_colwise_conv_pre.cl deleted file mode 100644 index b1b4873..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_colwise_conv_pre.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,min_colwise_conv_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - __global const eT1* colptr = &(src[src_offset + col * src_M_n_rows]); - eT2 acc = (eT2) colptr[0]; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 1; i < n_rows; ++i) - { - acc = min(acc, (eT2) (colptr[i])); - } - dest[dest_offset + col * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_cube_col_conv_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_cube_col_conv_post.cl deleted file mode 100644 index 1cb69c5..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_cube_col_conv_post.cl +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,min_cube_col_conv_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = get_global_id(0); - const UWORD slice = get_global_id(1); - - if(row < n_rows && slice < n_slices) - { - eT1 acc = (eT1) src[src_offset + row + slice * n_rows * n_cols]; - for(UWORD i = 1; i < n_cols; ++i) - { - acc = min(acc, src[src_offset + (i * n_rows) + row + slice * n_rows * n_cols]); - } - dest[dest_offset + row + slice * n_rows] = (eT2) (acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_cube_col_conv_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_cube_col_conv_pre.cl deleted file mode 100644 index 4d9cfc1..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_cube_col_conv_pre.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,min_cube_col_conv_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices) - { - const UWORD row = get_global_id(0); - const UWORD slice = get_global_id(1); - - if(row < n_rows && slice < n_slices) - { - eT2 acc = (eT2) src[src_offset + row + slice * n_rows * n_cols]; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=1; i < n_cols; ++i) - { - acc = min(acc, (eT2) (src[src_offset + (i * n_rows) + row + slice * n_rows * n_cols])); - } - dest[dest_offset + row + slice * n_rows] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_rowwise_conv_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_rowwise_conv_post.cl deleted file mode 100644 index 5a325aa..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_rowwise_conv_post.cl +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,min_rowwise_conv_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT1 acc = (eT1) src[src_offset + row]; - for(UWORD i = 1; i < n_cols; ++i) - { - acc = min(acc, src[src_offset + (i * src_M_n_rows) + row]); - } - dest[dest_offset + row * dest_mem_incr] = (eT2) (acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_rowwise_conv_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_rowwise_conv_pre.cl deleted file mode 100644 index 01f9055..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/min_rowwise_conv_pre.cl +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,min_rowwise_conv_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT2 acc = (eT2) src[src_offset + row]; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=1; i < n_cols; ++i) - { - acc = min(acc, (eT2) (src[src_offset + (i * src_M_n_rows) + row])); - } - dest[dest_offset + row * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq.cl deleted file mode 100644 index 39e09ce..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq.cl +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_all_neq)(__global const eT1* X, // will be casted to eT2 before comparison - const UWORD X_offset, - const UWORD n_elem, - __global uint* out, - const UWORD out_offset, - __local volatile uint* aux_mem, - const eT2 val) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 1; - - while (i + get_local_size(0) < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - const eT2 val2 = (eT2) X[X_offset + i + get_local_size(0)]; - - aux_mem[tid] &= (val1 != val); - aux_mem[tid] &= (val2 != val); - i += grid_size; - } - if (i < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - aux_mem[tid] &= (val1 != val); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN(COOT_FN(and_subgroup_reduce_,SUBGROUP_SIZE_NAME),_u32)(aux_mem, tid); - } - - if (tid == 0) - { - out[out_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_colwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_colwise.cl deleted file mode 100644 index fc5a016..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_colwise.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_all_neq_colwise)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* A, - const UWORD A_offset, - const eT2 val, - const UWORD A_n_rows, - const UWORD A_n_cols) - { - const UWORD col = get_global_id(0); - if(col < A_n_cols) - { - __global const eT1* colptr = &(A[ col*A_n_rows + A_offset ]); - UWORD result = 1; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < A_n_rows; ++i) - { - const eT2 val1 = (eT2) colptr[i]; - result &= (val1 != val); - } - out[col + out_offset] = result; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_rowwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_rowwise.cl deleted file mode 100644 index b83fec4..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_rowwise.cl +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_all_neq_rowwise)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* A, - const UWORD A_offset, - const eT2 val, - const UWORD A_n_rows, - const UWORD A_n_cols) - { - const UWORD row = get_global_id(0); - if(row < A_n_rows) - { - UWORD result = 1; - for(UWORD i = 0; i < A_n_cols; ++i) - { - const eT2 val1 = (eT2) A[i * A_n_rows + row + A_offset]; - result &= (val1 != val); - } - out[row + out_offset] = result; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_small.cl deleted file mode 100644 index 4158239..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_all_neq_small.cl +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_all_neq_small)(__global const eT1* X, - const UWORD X_offset, - const UWORD n_elem, - __global uint* out, - const UWORD out_offset, - __local volatile uint* aux_mem, - const eT2 val) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 1; - - while (i + get_local_size(0) < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - const eT2 val2 = (eT2) X[X_offset + i + get_local_size(0)]; - - aux_mem[tid] &= (val1 != val); - aux_mem[tid] &= (val2 != val); - i += grid_size; - } - if (i < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - aux_mem[tid] &= (val1 != val); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] &= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out[out_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq.cl deleted file mode 100644 index 6990ce5..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq.cl +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_any_neq)(__global const eT1* X, // will be casted to eT2 before comparison - const UWORD X_offset, - const UWORD n_elem, - __global uint* out, - const UWORD out_offset, - __local volatile uint* aux_mem, - const eT2 val) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - const eT2 val2 = (eT2) X[X_offset + i + get_local_size(0)]; - - aux_mem[tid] |= (val1 != val); - aux_mem[tid] |= (val2 != val); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT2 val1 = (eT2) X[X_offset + i]; - aux_mem[tid] |= (val1 != val); - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (UWORD s = get_local_size(0) / 2; s > SUBGROUP_SIZE; s >>= 1) - { - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid < SUBGROUP_SIZE) - { - COOT_FN(COOT_FN(or_subgroup_reduce_,SUBGROUP_SIZE_NAME),_u32)(aux_mem, tid); - } - - if (tid == 0) - { - out[out_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_colwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_colwise.cl deleted file mode 100644 index 3f6a7eb..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_colwise.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_any_neq_colwise)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* A, - const UWORD A_offset, - const eT2 val, - const UWORD A_n_rows, - const UWORD A_n_cols) - { - const UWORD col = get_global_id(0); - if(col < A_n_cols) - { - __global const eT1* colptr = &(A[ col*A_n_rows + A_offset ]); - UWORD result = 0; - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < A_n_rows; ++i) - { - const eT2 val1 = (eT2) colptr[i]; - result |= (val1 != val); - if (result == 1) - break; - } - out[col + out_offset] = result; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_rowwise.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_rowwise.cl deleted file mode 100644 index aebf3d3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_rowwise.cl +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_any_neq_rowwise)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* A, - const UWORD A_offset, - const eT2 val, - const UWORD A_n_rows, - const UWORD A_n_cols) - { - const UWORD row = get_global_id(0); - if(row < A_n_rows) - { - UWORD result = 0; - for(UWORD i = 0; i < A_n_cols; ++i) - { - const eT2 val1 = (eT2) A[i * A_n_rows + row + A_offset]; - result |= (val1 != val); - if (result == 1) - break; - } - out[row + out_offset] = result; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_small.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_small.cl deleted file mode 100644 index 610b99b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/rel_any_neq_small.cl +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_any_neq_small)(__global const eT1* X, - const UWORD X_offset, - const UWORD n_elem, - __global uint* out, - const UWORD out_offset, - __local volatile uint* aux_mem, - const eT2 val) - { - const UWORD tid = get_local_id(0); - UWORD i = get_group_id(0) * (get_local_size(0) * 2) + tid; - const UWORD grid_size = get_local_size(0) * 2 * get_num_groups(0); - - aux_mem[tid] = 0; - - while (i + get_local_size(0) < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - const eT2 val2 = (eT2) X[X_offset + i + get_local_size(0)]; - - aux_mem[tid] |= (val1 != val); - aux_mem[tid] |= (val2 != val); - if (aux_mem[tid] == 1) - break; - i += grid_size; - } - - if (i < n_elem && aux_mem[tid] == 0) - { - const eT2 val1 = (eT2) X[X_offset + i]; - aux_mem[tid] |= (val1 != val); - } - - for (UWORD s = get_local_size(0) / 2; s > 0; s >>= 1) - { - SUBGROUP_BARRIER(CLK_LOCAL_MEM_FENCE); - - if (tid < s) - { - aux_mem[tid] |= aux_mem[tid + s]; - } - } - - if (tid == 0) - { - out[out_offset + get_group_id(0)] = aux_mem[0]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/repmat.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/repmat.cl deleted file mode 100644 index e5c1897..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/repmat.cl +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,repmat)(__global const eT1* in, - const UWORD in_offset, - __global eT2* out, - const UWORD out_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD copies_per_row, - const UWORD copies_per_col, - const UWORD new_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD offset = row + col * n_rows; - const eT2 element = (eT2) in[in_offset + offset]; - if( (row < n_rows) && (col < n_cols) ) - { - for (UWORD c_copy = 0; c_copy < copies_per_col; ++c_copy) - { - const UWORD col_offset = (col + n_cols * c_copy) * new_n_rows; - for (UWORD r_copy = 0; r_copy < copies_per_row; ++r_copy) - { - const UWORD copy_offset = col_offset + (row + n_rows * r_copy); - out[out_offset + copy_offset] = element; - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/strans.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/strans.cl deleted file mode 100644 index 2d7c667..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/strans.cl +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,strans)(__global eT2* out, - const UWORD out_offset, - __global const eT1* in, - const UWORD in_offset, - const UWORD in_n_rows, - const UWORD in_n_cols) - { - // For a non-inplace transpose, we can use a pretty naive approach. - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD in_total_offset = in_offset + row + col * in_n_rows; - const UWORD out_total_offset = out_offset + col + row * in_n_cols; - - if( (row < in_n_rows) && (col < in_n_cols) ) - { - const eT2 element = (eT2) in[in_total_offset]; - out[out_total_offset] = element; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_colwise_conv_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_colwise_conv_post.cl deleted file mode 100644 index dc4b82f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_colwise_conv_post.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,sum_colwise_conv_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - __global const eT1* colptr = &(src[src_offset + col * src_M_n_rows]); - eT1 acc = (eT1) (0); - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < n_rows; ++i) - { - acc += colptr[i]; - } - dest[dest_offset + col * dest_mem_incr] = (eT2) (acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_colwise_conv_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_colwise_conv_pre.cl deleted file mode 100644 index b15c12b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_colwise_conv_pre.cl +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,sum_colwise_conv_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD col = get_global_id(0); - if(col < n_cols) - { - __global const eT1* colptr = &(src[src_offset + col * src_M_n_rows]); - eT2 acc = (eT2) (0); - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i = 0; i < n_rows; ++i) - { - acc += (eT2) (colptr[i]); - } - dest[dest_offset + col * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_rowwise_conv_post.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_rowwise_conv_post.cl deleted file mode 100644 index fe2f867..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_rowwise_conv_post.cl +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,sum_rowwise_conv_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT1 acc = (eT1) (0); - for(UWORD i = 0; i < n_cols; ++i) - { - acc += src[src_offset + (i * src_M_n_rows) + row]; - } - dest[dest_offset + row * dest_mem_incr] = (eT2) (acc); - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_rowwise_conv_pre.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_rowwise_conv_pre.cl deleted file mode 100644 index 399cce3..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/sum_rowwise_conv_pre.cl +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,sum_rowwise_conv_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_mem_incr, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - if(row < n_rows) - { - eT2 acc = (eT2) (0); - #ifdef CL_VERSION_2_0 - __attribute__((opencl_unroll_hint)) - #endif - for(UWORD i=0; i < n_cols; ++i) - { - acc += (eT2) (src[src_offset + (i * src_M_n_rows) + row]); - } - dest[dest_offset + row * dest_mem_incr] = acc; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/symmatl.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/symmatl.cl deleted file mode 100644 index 8ab65f5..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/symmatl.cl +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,symmatl)(__global eT2* out, - const UWORD out_offset, - __global const eT1* A, - const UWORD A_offset, - const UWORD size) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < size && col < size && row >= col) - { - const eT2 val = (eT2) A[A_offset + row + size * col]; - - out[out_offset + row + size * col] = val; - out[out_offset + col + size * row] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/symmatu.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/symmatu.cl deleted file mode 100644 index 514096b..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/twoway/symmatu.cl +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2023 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,symmatu)(__global eT2* out, - const UWORD out_offset, - __global const eT1* A, - const UWORD A_offset, - const UWORD size) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < size && col < size && col >= row) - { - const eT2 val = (eT2) A[A_offset + row + size * col]; - - out[out_offset + row + size * col] = val; - out[out_offset + col + size * row] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/zeroway/shuffle_large_compute_locs.cl b/inst/include/bandicoot_bits/ks/kernels/opencl/zeroway/shuffle_large_compute_locs.cl deleted file mode 100644 index a81414f..0000000 --- a/inst/include/bandicoot_bits/ks/kernels/opencl/zeroway/shuffle_large_compute_locs.cl +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2024 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -// This performs the first part of the shuffle_vec kernel: it computes random -// locations for the output using the variable philox bijective shuffle, -// and then does the first step of the output compression (the upsweep of the -// shifted prefix sum). 
-__kernel -void -shuffle_large_compute_locs(__global UWORD* out_block_mem, - const UWORD n_elem, - const UWORD n_elem_pow2, - __global const UWORD* philox_key, - const UWORD num_bits, - __local volatile UWORD* aux_mem) - { - const UWORD tid = get_global_id(0); - const UWORD local_tid = get_local_id(0); - const UWORD local_size = get_local_size(0); - - // Get our bijective shuffle location. - const UWORD in_loc = var_philox(tid, philox_key, num_bits); - - // Fill aux_mem with the indicator of whether we are out of bounds. - // Then, we'll prefix-sum it. This will tell us where to put our result. - aux_mem[local_tid] = (in_loc < n_elem); - barrier(CLK_LOCAL_MEM_FENCE); - - // Now, prefix-sum the auxiliary memory. - // This allows us to do the shuffle-compaction step. - UWORD offset = 1; - for (UWORD s = local_size / 2; s > 0; s >>= 1) - { - if (local_tid < s) - { - const UWORD ai = offset * (2 * local_tid + 1) - 1; - const UWORD bi = offset * (2 * local_tid + 2) - 1; - aux_mem[bi] += aux_mem[ai]; - } - offset *= 2; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (local_tid == 0) - { - out_block_mem[get_group_id(0)] = aux_mem[local_size - 1]; - } - } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/c_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/c_defs.cl similarity index 100% rename from inst/include/bandicoot_bits/ks/kernels/opencl/defs/c_defs.cl rename to inst/include/bandicoot_bits/ks/opencl/defs/c_defs.cl diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/d_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/d_defs.cl index eb58c38..a4801ac 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/d_defs.cl +++ b/inst/include/bandicoot_bits/ks/opencl/defs/d_defs.cl @@ -22,5 +22,9 @@ inline bool coot_isnan_double(const double x) { return isnan(x); } inline double coot_absdiff_double(const double x, const double y) { return fabs(x - y); } -inline double coot_conj_double(const double x) { return x; } -//inline cx_double coot_conj_cx_double(const 
cx_double x) { return cx_double(x.x, -x.y); } +inline double coot_conj_double(const double x) { return x; } + +inline double coot_plus_double(const double a, const double b) { return a + b; } +inline double coot_minus_double(const double a, const double b) { return a - b; } +inline double coot_mul_double(const double a, const double b) { return a * b; } +inline double coot_div_double(const double a, const double b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/f_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/f_defs.cl index ac35a12..be682a1 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/f_defs.cl +++ b/inst/include/bandicoot_bits/ks/opencl/defs/f_defs.cl @@ -22,5 +22,9 @@ inline bool coot_isnan_float(const float x) { return isnan(x); } inline float coot_absdiff_float(const float x, const float y) { return fabs(x - y); } -inline float coot_conj_float(const float x) { return x; } -//inline cx_float coot_conj_cx_float(const cx_float x) { return cx_float(x.x, -x.y); } +inline float coot_conj_float(const float x) { return x; } + +inline float coot_plus_float(const float a, const float b) { return a + b; } +inline float coot_minus_float(const float a, const float b) { return a - b; } +inline float coot_mul_float(const float a, const float b) { return a * b; } +inline float coot_div_float(const float a, const float b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/h_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/h_defs.cl index 8fb5ca3..f8215c5 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/h_defs.cl +++ b/inst/include/bandicoot_bits/ks/opencl/defs/h_defs.cl @@ -23,3 +23,8 @@ inline bool coot_isnan_half(const half x) { return isnan(x); } inline half coot_absdiff_half(const half x, const half y) { return fabs(x - y); } inline half coot_conj_half(const half x) { return x; } + +inline half coot_plus_half(const half a, const half b) { return a + b; } +inline half coot_minus_half(const half a, 
const half b) { return a - b; } +inline half coot_mul_half(const half a, const half b) { return a * b; } +inline half coot_div_half(const half a, const half b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/s16_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/s16_defs.cl index 9a8ad90..98898a3 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/s16_defs.cl +++ b/inst/include/bandicoot_bits/ks/opencl/defs/s16_defs.cl @@ -23,3 +23,8 @@ inline bool coot_isnan_short(const short x) { return false; } inline short coot_absdiff_short(const short x, const short y) { return abs(x - y); } inline short coot_conj_short(const short x) { return x; } + +inline short coot_plus_short(const short a, const short b) { return a + b; } +inline short coot_minus_short(const short a, const short b) { return a - b; } +inline short coot_mul_short(const short a, const short b) { return a * b; } +inline short coot_div_short(const short a, const short b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/s32_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/s32_defs.cl index 4f8fff7..9403ec6 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/s32_defs.cl +++ b/inst/include/bandicoot_bits/ks/opencl/defs/s32_defs.cl @@ -23,3 +23,8 @@ inline bool coot_isnan_int(const int x) { return false; } inline int coot_absdiff_int(const int x, const int y) { return abs(x - y); } inline int coot_conj_int(const int x) { return x; } + +inline int coot_plus_int(const int a, const int b) { return a + b; } +inline int coot_minus_int(const int a, const int b) { return a - b; } +inline int coot_mul_int(const int a, const int b) { return a * b; } +inline int coot_div_int(const int a, const int b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/s64_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/s64_defs.cl index 3b81dc4..93c9c96 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/s64_defs.cl +++ 
b/inst/include/bandicoot_bits/ks/opencl/defs/s64_defs.cl @@ -23,3 +23,8 @@ inline bool coot_isnan_long(const long x) { return false; } inline long coot_absdiff_long(const long x, const long y) { return abs(x - y); } inline long coot_conj_long(const long x) { return x; } + +inline long coot_plus_long(const long a, const long b) { return a + b; } +inline long coot_minus_long(const long a, const long b) { return a - b; } +inline long coot_mul_long(const long a, const long b) { return a * b; } +inline long coot_div_long(const long a, const long b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/s8_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/s8_defs.cl index 5979c5c..20fb6fb 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/s8_defs.cl +++ b/inst/include/bandicoot_bits/ks/opencl/defs/s8_defs.cl @@ -23,3 +23,8 @@ inline bool coot_isnan_char(const char x) { return false; } inline char coot_absdiff_char(const char x, const char y) { return abs(x - y); } inline char coot_conj_char(const char x) { return x; } + +inline char coot_plus_char(const char a, const char b) { return a + b; } +inline char coot_minus_char(const char a, const char b) { return a - b; } +inline char coot_mul_char(const char a, const char b) { return a * b; } +inline char coot_div_char(const char a, const char b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/u16_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/u16_defs.cl index 6bf74cc..5848c03 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/u16_defs.cl +++ b/inst/include/bandicoot_bits/ks/opencl/defs/u16_defs.cl @@ -23,3 +23,8 @@ inline bool coot_isnan_ushort(const ushort x) { return false; } inline ushort coot_absdiff_ushort(const ushort x, const ushort y) { return (x > y) ? 
(x - y) : (y - x); } inline ushort coot_conj_ushort(const ushort x) { return x; } + +inline ushort coot_plus_ushort(const ushort a, const ushort b) { return a + b; } +inline ushort coot_minus_ushort(const ushort a, const ushort b) { return a - b; } +inline ushort coot_mul_ushort(const ushort a, const ushort b) { return a * b; } +inline ushort coot_div_ushort(const ushort a, const ushort b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/u32_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/u32_defs.cl index cb61bca..e427814 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/u32_defs.cl +++ b/inst/include/bandicoot_bits/ks/opencl/defs/u32_defs.cl @@ -23,3 +23,8 @@ inline bool coot_isnan_uint(const uint x) { return false; } inline uint coot_absdiff_uint(const uint x, const uint y) { return (x > y) ? (x - y) : (y - x); } inline uint coot_conj_uint(const uint x) { return x; } + +inline uint coot_plus_uint(const uint a, const uint b) { return a + b; } +inline uint coot_minus_uint(const uint a, const uint b) { return a - b; } +inline uint coot_mul_uint(const uint a, const uint b) { return a * b; } +inline uint coot_div_uint(const uint a, const uint b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/u64_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/u64_defs.cl index 4455c1b..3e308b3 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/u64_defs.cl +++ b/inst/include/bandicoot_bits/ks/opencl/defs/u64_defs.cl @@ -23,3 +23,8 @@ inline bool coot_isnan_ulong(const ulong x) { return false; } inline ulong coot_absdiff_ulong(const ulong x, const ulong y) { return (x > y) ? 
(x - y) : (y - x); } inline ulong coot_conj_ulong(const ulong x) { return x; } + +inline ulong coot_plus_ulong(const ulong a, const ulong b) { return a + b; } +inline ulong coot_minus_ulong(const ulong a, const ulong b) { return a - b; } +inline ulong coot_mul_ulong(const ulong a, const ulong b) { return a * b; } +inline ulong coot_div_ulong(const ulong a, const ulong b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/opencl/defs/u8_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/u8_defs.cl index bbb6d97..fda8d45 100644 --- a/inst/include/bandicoot_bits/ks/opencl/defs/u8_defs.cl +++ b/inst/include/bandicoot_bits/ks/opencl/defs/u8_defs.cl @@ -23,3 +23,8 @@ inline bool coot_isnan_uchar(const uchar x) { return false; } inline uchar coot_absdiff_uchar(const uchar x, const uchar y) { return (x > y) ? (x - y) : (y - x); } inline uchar coot_conj_uchar(const uchar x) { return x; } + +inline uchar coot_plus_uchar(const uchar a, const uchar b) { return a + b; } +inline uchar coot_minus_uchar(const uchar a, const uchar b) { return a - b; } +inline uchar coot_mul_uchar(const uchar a, const uchar b) { return a * b; } +inline uchar coot_div_uchar(const uchar a, const uchar b) { return a / b; } diff --git a/inst/include/bandicoot_bits/ks/kernels/opencl/defs/z_defs.cl b/inst/include/bandicoot_bits/ks/opencl/defs/z_defs.cl similarity index 100% rename from inst/include/bandicoot_bits/ks/kernels/opencl/defs/z_defs.cl rename to inst/include/bandicoot_bits/ks/opencl/defs/z_defs.cl diff --git a/inst/include/bandicoot_bits/ks/opencl/oneway/fill.cl b/inst/include/bandicoot_bits/ks/opencl/oneway/fill.cl deleted file mode 100644 index 7df6b7f..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/oneway/fill.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - -__kernel -void -COOT_FN(PREFIX,fill)(__global eT1* out, - const UWORD out_offset, - const eT1 val, - const UWORD n_rows, - const UWORD n_cols, - const UWORD M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD index = col * M_n_rows + row; - - if(row < n_rows && col < n_cols) - { - out[index + out_offset] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/oneway/fill_sve1.cl b/inst/include/bandicoot_bits/ks/opencl/oneway/fill_sve1.cl deleted file mode 100644 index 6ae09ba..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/oneway/fill_sve1.cl +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - -__kernel -void -COOT_FN(PREFIX,fill_sve1)(__global eT1* out, - const UWORD out_offset, - __global const UWORD* out_locs, - const UWORD out_locs_offset, - const eT1 val, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - out[out_locs[i + out_locs_offset] + out_offset] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/oneway/fill_sve2.cl b/inst/include/bandicoot_bits/ks/opencl/oneway/fill_sve2.cl deleted file mode 100644 index 3d2c81e..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/oneway/fill_sve2.cl +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - -__kernel -void -COOT_FN(PREFIX,fill_sve2)(__global eT1* out, - const UWORD out_offset, - __global const UWORD* out_row_locs, - const UWORD out_row_locs_offset, - __global const UWORD* out_col_locs, - const UWORD out_col_locs_offset, - const eT1 val, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD out_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD out_loc = out_offset + - ((out_row_locs == NULL) ? 
row : out_row_locs[row + out_row_locs_offset]) + - out_n_rows * ((out_col_locs == NULL) ? col : out_col_locs[col + out_col_locs_offset]); - - out[out_loc] = val; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_atan2.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_atan2.cl deleted file mode 100644 index 2599aad..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_atan2.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_atan2)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset; - - const fp_eT3 a_val = (fp_eT3) src_A[src_A_index]; - const fp_eT3 b_val = (fp_eT3) src_B[src_B_index]; - dest[dest_index] = (eT3) atan2(a_val, b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_div_array.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_div_array.cl deleted file mode 100644 index 6e4e169..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_div_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_div_array)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset; - - const threeway_promoted_eT a_val = (threeway_promoted_eT) src_A[src_A_index]; - const threeway_promoted_eT b_val = (threeway_promoted_eT) src_B[src_B_index]; - dest[dest_index] = (eT3) (a_val / b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_div_array_cube.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_div_array_cube.cl deleted file mode 100644 index 1cfb814..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_div_array_cube.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_div_array_cube)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = (eT3) (src_A[src_A_index] / src_B[src_B_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_hypot.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_hypot.cl deleted file mode 100644 index 260ad94..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_hypot.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_hypot)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset; - - const fp_eT3 a_val = (fp_eT3) src_A[src_A_index]; - const fp_eT3 b_val = (fp_eT3) src_B[src_B_index]; - dest[dest_index] = (eT3) hypot(a_val, b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_max_array.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_max_array.cl deleted file mode 100644 index 1431168..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_max_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_max_array)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset; - - const threeway_promoted_eT a_val = (threeway_promoted_eT) src_A[src_A_index]; - const threeway_promoted_eT b_val = (threeway_promoted_eT) src_B[src_B_index]; - dest[dest_index] = (eT3) max(a_val, b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_min_array.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_min_array.cl deleted file mode 100644 index d6a13c0..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_min_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_min_array)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset; - - const threeway_promoted_eT a_val = (threeway_promoted_eT) src_A[src_A_index]; - const threeway_promoted_eT b_val = (threeway_promoted_eT) src_B[src_B_index]; - dest[dest_index] = (eT3) min(a_val, b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_minus_array.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_minus_array.cl deleted file mode 100644 index 026a56c..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_minus_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_array)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset; - - const threeway_promoted_eT a_val = (threeway_promoted_eT) src_A[src_A_index]; - const threeway_promoted_eT b_val = (threeway_promoted_eT) src_B[src_B_index]; - dest[dest_index] = (eT3) (a_val - b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_minus_array_cube.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_minus_array_cube.cl deleted file mode 100644 index ab5f752..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_minus_array_cube.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_array_cube)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = (eT3) (src_A[src_A_index] - src_B[src_B_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_mul_array.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_mul_array.cl deleted file mode 100644 index 78f8bd2..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_mul_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_mul_array)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset; - - const threeway_promoted_eT a_val = (threeway_promoted_eT) src_A[src_A_index]; - const threeway_promoted_eT b_val = (threeway_promoted_eT) src_B[src_B_index]; - dest[dest_index] = (eT3) (a_val * b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_mul_array_cube.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_mul_array_cube.cl deleted file mode 100644 index 66f2355..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_mul_array_cube.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_mul_array_cube)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = (eT3) (src_A[src_A_index] * src_B[src_B_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_plus_array.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_plus_array.cl deleted file mode 100644 index 115ece9..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_plus_array.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_plus_array)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_A_M_n_rows, - const UWORD src_B_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset; - - const threeway_promoted_eT a_val = (threeway_promoted_eT) src_A[src_A_index]; - const threeway_promoted_eT b_val = (threeway_promoted_eT) src_B[src_B_index]; - dest[dest_index] = (eT3) (a_val + b_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_plus_array_cube.cl b/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_plus_array_cube.cl deleted file mode 100644 index 1d51d11..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/threeway/equ_array_plus_array_cube.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_plus_array_cube)(__global eT3* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT2* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = (eT3) (src_A[src_A_index] + src_B[src_B_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/convert_type.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/convert_type.cl deleted file mode 100644 index 521a91d..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/convert_type.cl +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not 
use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,convert_type)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_rows && col < n_cols) - { - const UWORD src_index = src_offset + row + col * src_M_n_rows; - const UWORD dest_index = dest_offset + row + col * dest_M_n_rows; - - const eT1 in_val = src[src_index]; - dest[dest_index] = (eT2) (in_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/convert_type_cube.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/convert_type_cube.cl deleted file mode 100644 index a2f4a5e..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/convert_type_cube.cl +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,convert_type_cube)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src_A, - const UWORD src_A_offset, - __global const eT1* src, - const UWORD src_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) src_A; - (void) src_A_offset; - (void) src_A_M_n_rows; - (void) src_A_M_n_cols; - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD src_index = src_offset + row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols; - const UWORD dest_index = dest_offset + row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols; - - const eT1 in_val = src[src_index]; - dest[dest_index] = (eT2) (in_val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_abs.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_abs.cl deleted file mode 100644 index b1331f6..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_abs.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2021-2025 Ryan Curtin (https://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_abs)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) ET1_ABS(src[src_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acos_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acos_post.cl deleted file mode 100644 index 4730add..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acos_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_acos_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) acos(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acos_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acos_pre.cl deleted file mode 100644 index 830b9a2..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acos_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_acos_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) acos(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acosh_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acosh_post.cl deleted file mode 100644 index 128330c..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acosh_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_acosh_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) acosh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acosh_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acosh_pre.cl deleted file mode 100644 index 3556df0..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_acosh_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_acosh_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) acosh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asin_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asin_post.cl deleted file mode 100644 index de5bee3..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asin_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_asin_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) asin(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asin_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asin_pre.cl deleted file mode 100644 index 0d0f1a9..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asin_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_asin_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) asin(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asinh_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asinh_post.cl deleted file mode 100644 index 1a4e2dd..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asinh_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_asinh_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) asinh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asinh_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asinh_pre.cl deleted file mode 100644 index 36d1448..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_asinh_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_asinh_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) asinh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atan_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atan_post.cl deleted file mode 100644 index 1a0668f..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atan_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_atan_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) atan(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atan_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atan_pre.cl deleted file mode 100644 index b218b7b..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atan_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_atan_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) atan(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atanh_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atanh_post.cl deleted file mode 100644 index 7b70889..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atanh_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_atanh_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) atanh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atanh_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atanh_pre.cl deleted file mode 100644 index b0015a2..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_atanh_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_atanh_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) atanh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_ceil_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_ceil_post.cl deleted file mode 100644 index b8b71ce..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_ceil_post.cl +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_ceil_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - if (COOT_FN(coot_is_fp_,eT1)()) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) ceil(val); - } - else - { - dest[dest_index] = (eT2) src[src_index]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_ceil_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_ceil_pre.cl deleted file mode 100644 index 5089f10..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_ceil_pre.cl +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the 
License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_ceil_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - if (COOT_FN(coot_is_fp_,eT2)()) - { - const fp_eT2 val = (fp_eT2) src[src_index]; - dest[dest_index] = (eT2) ceil(val); - } - else - { - dest[dest_index] = (eT2) src[src_index]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cos_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cos_post.cl deleted file mode 100644 index 9fc13c0..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cos_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance 
with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_cos_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) cos(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cos_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cos_pre.cl deleted file mode 100644 index 8809cb8..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cos_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_cos_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) cos(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cosh_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cosh_post.cl deleted file mode 100644 index fd6d28e..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cosh_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_cosh_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) cosh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cosh_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cosh_pre.cl deleted file mode 100644 index a100520..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_cosh_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_cosh_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) cosh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post.cl deleted file mode 100644 index 2c60180..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post.cl +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_div_scalar_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = ((eT2) (src[src_index] / val_pre)) / val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post_sve1.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post_sve1.cl deleted file mode 100644 index 0e13d62..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post_sve1.cl +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_div_scalar_post_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] = ((eT2) (src[src_locs[i + src_locs_offset] + src_offset] / val_pre)) / val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post_sve2.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post_sve2.cl deleted file mode 100644 index 856b8ba..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_post_sve2.cl +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_div_scalar_post_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col + src_col_locs_offset]); - - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - dest[dest_loc] = ((eT2) (src[src_loc] / val_pre)) / val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre.cl deleted file mode 100644 index 886bafb..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre.cl +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_div_scalar_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - if (val_post == (eT2) (0)) - { - // if both are 0, we take it as val_pre == 0 and val_post unused - dest[dest_index] = (eT2) (val_pre / src[src_index]); - } - else if (val_pre == (eT1) (0) && val_post != (eT2) (0)) - { - dest[dest_index] = val_post / ((eT2) src[src_index]); - } - else - { - // if both are nonzero, we apply sequentially---be careful! 
- dest[dest_index] = val_post / ((eT2) (val_pre / src[src_index])); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre_sve1.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre_sve1.cl deleted file mode 100644 index d19a030..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre_sve1.cl +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_div_scalar_pre_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - if (val_post == (eT2) (0)) - { - // if both are 0, we take it as val_pre == 0 and val_post unused - dest[dest_locs[i + dest_locs_offset] + dest_offset] = (eT2) (val_pre / src[src_locs[i + src_locs_offset] + src_offset]); - } - else if (val_pre == (eT1) (0) && val_post != (eT2) (0)) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] = val_post / ((eT2) src[src_locs[i + src_locs_offset] + src_offset]); - } - else - { - // if both are nonzero, we apply sequentially---be careful! - dest[dest_locs[i + dest_locs_offset] + dest_offset] = val_post / ((eT2) (val_pre / src[src_locs[i + src_locs_offset] + src_offset])); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre_sve2.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre_sve2.cl deleted file mode 100644 index 444c8fc..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_div_scalar_pre_sve2.cl +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_div_scalar_pre_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col + src_col_locs_offset]); - - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? 
col : dest_col_locs[col + dest_col_locs_offset]); - - if (val_post == (eT2) (0)) - { - // If both are 0, we take it as val_pre == 0 and val_post unused - dest[dest_loc] = (eT2) (val_pre / src[src_loc]); - } - else if (val_pre == (eT1) (0) && val_post != (eT2) (0)) - { - dest[dest_loc] = val_post / ((eT2) src[src_loc]); - } - else - { - // If both are nonzero, we apply sequentially---be careful! - dest[dest_loc] = val_post / ((eT2) (val_pre / src[src_loc])); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erf_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erf_post.cl deleted file mode 100644 index b823806..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erf_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_erf_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) erf(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erf_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erf_pre.cl deleted file mode 100644 index 6369282..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erf_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_erf_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) erf(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erfc_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erfc_post.cl deleted file mode 100644 index 85bc476..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erfc_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_erfc_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) erfc(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erfc_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erfc_pre.cl deleted file mode 100644 index 0df332a..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_erfc_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_erfc_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) erfc(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp10_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp10_post.cl deleted file mode 100644 index 012a4f8..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp10_post.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_exp10_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) ((eT1) exp10((fp_eT1) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp10_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp10_pre.cl deleted file mode 100644 index 4947908..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp10_pre.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_exp10_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) exp10((fp_eT2) ((eT2) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp2_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp2_post.cl deleted file mode 100644 index e2e730d..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp2_post.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_exp2_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) ((eT1) exp2((fp_eT1) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp2_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp2_pre.cl deleted file mode 100644 index eede48d..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp2_pre.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_exp2_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) exp2((fp_eT2) ((eT2) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp_post.cl deleted file mode 100644 index 433f2a5..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_exp_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) ((eT1) exp((fp_eT1) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp_pre.cl deleted file mode 100644 index 8d9c70f..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_exp_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_exp_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) exp((fp_eT2) ((eT2) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_floor_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_floor_post.cl deleted file mode 100644 index 14b1b03..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_floor_post.cl +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_floor_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - if (COOT_FN(coot_is_fp_,eT1)()) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) floor(val); - } - else - { - dest[dest_index] = (eT2) src[src_index]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_floor_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_floor_pre.cl deleted file mode 100644 index 4aa7e36..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_floor_pre.cl +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_floor_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - if (COOT_FN(coot_is_fp_,eT2)()) - { - const fp_eT2 val = (fp_eT2) src[src_index]; - dest[dest_index] = (eT2) floor(val); - } - else - { - dest[dest_index] = (eT2) src[src_index]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_lgamma_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_lgamma_post.cl deleted file mode 100644 index b0b9050..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_lgamma_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_lgamma_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) lgamma(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_lgamma_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_lgamma_pre.cl deleted file mode 100644 index e899f12..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_lgamma_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_lgamma_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) lgamma(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log10_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log10_post.cl deleted file mode 100644 index 78a981e..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log10_post.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_log10_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) ((eT1) log10((fp_eT1) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log10_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log10_pre.cl deleted file mode 100644 index 15ed5f2..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log10_pre.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_log10_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) log10((fp_eT2) ((eT2) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log2_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log2_post.cl deleted file mode 100644 index 6b0eee0..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log2_post.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_log2_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) ((eT1) log2((fp_eT1) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log2_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log2_pre.cl deleted file mode 100644 index f92a105..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log2_pre.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_log2_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) log2((fp_eT2) ((eT2) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log_post.cl deleted file mode 100644 index 3580e96..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_log_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) ((eT1) log((fp_eT1) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log_pre.cl deleted file mode 100644 index 9d1223f..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_log_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_log_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) log((fp_eT2) ((eT2) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_max_array_cube.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_max_array_cube.cl deleted file mode 100644 index 0107550..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_max_array_cube.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_max_array_cube)(__global eT2* dest, - const UWORD dest_offset, - __global const eT2* src_A, - const UWORD src_A_offset, - __global const eT1* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = max(src_A[src_A_index], ((eT2) src_B[src_B_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_min_array_cube.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_min_array_cube.cl deleted file mode 100644 index 963b574..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_min_array_cube.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the 
"License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_min_array_cube)(__global eT2* dest, - const UWORD dest_offset, - __global const eT2* src_A, - const UWORD src_A_offset, - __global const eT1* src_B, - const UWORD src_B_offset, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_A_M_n_rows, - const UWORD src_A_M_n_cols, - const UWORD src_B_M_n_rows, - const UWORD src_B_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const UWORD src_A_index = row + col * src_A_M_n_rows + src_A_offset + slice * src_A_M_n_rows * src_A_M_n_cols; - const UWORD src_B_index = row + col * src_B_M_n_rows + src_B_offset + slice * src_B_M_n_rows * src_B_M_n_cols; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset + slice * dest_M_n_rows * dest_M_n_cols; - - dest[dest_index] = min(src_A[src_A_index], ((eT2) src_B[src_B_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post.cl deleted file mode 100644 index b0cef78..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post.cl +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2017 Conrad 
Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_scalar_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = ((eT2) (src[src_index] - val_pre)) - val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post_sve1.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post_sve1.cl deleted file mode 100644 index dc79c2a..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post_sve1.cl +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) 
-// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_scalar_post_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] = ((eT2) (src[src_locs[i + src_locs_offset] + src_offset] - val_pre)) - val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post_sve2.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post_sve2.cl deleted file mode 100644 index 8ed24e8..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_post_sve2.cl +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_scalar_post_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col + src_col_locs_offset]); - - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? 
col : dest_col_locs[col + dest_col_locs_offset]); - - dest[dest_loc] = ((eT2) (src[src_loc] - val_pre)) - val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post.cl deleted file mode 100644 index 0b2cd8a..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) val_post; - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) (val_pre - src[src_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post_sve1.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post_sve1.cl deleted file mode 100644 index cd55c0d..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post_sve1.cl +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_post_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - (void) val_post; - - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] = (eT2) (val_pre - src[src_locs[i + src_locs_offset] + src_offset]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post_sve2.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post_sve2.cl deleted file mode 100644 index 1326c28..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_post_sve2.cl +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_post_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col + src_col_locs_offset]); - - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - dest[dest_loc] = (eT2) (val_pre - src[src_loc]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre.cl deleted file mode 100644 index e66edaf..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void) val_pre; - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = val_post - ((eT2) (src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre_sve1.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre_sve1.cl deleted file mode 100644 index 0188d16..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre_sve1.cl +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_pre_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - (void) val_pre; - - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] = val_post - ((eT2) (src[src_locs[i + src_locs_offset] + src_offset])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre_sve2.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre_sve2.cl deleted file mode 100644 index ad1e83f..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_minus_scalar_pre_pre_sve2.cl +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_minus_scalar_pre_pre_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col + src_col_locs_offset]); - - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? 
col : dest_col_locs[col + dest_col_locs_offset]); - - dest[dest_loc] = val_post - ((eT2) (src[src_loc])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mod_scalar.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mod_scalar.cl deleted file mode 100644 index a73ce37..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mod_scalar.cl +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_mod_scalar)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD dest_M_n_rows, - const UWORD src_M_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD src_index = row + col * src_M_n_rows + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + dest_offset; - - if (row < n_rows && col < n_cols) - { - // For an integer type, the casts end up doing nothing. 
- uint_eT1 val = ((uint_eT1) src[src_index]) % ((uint_eT1) val_pre); - dest[dest_index] = (eT2) (((uint_eT2) val) % ((uint_eT2) val_post)); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar.cl deleted file mode 100644 index 818212e..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar.cl +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_mul_scalar)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = ((eT2) (src[src_index] * val_pre)) * val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar_sve1.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar_sve1.cl deleted file mode 100644 index 5b022a4..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar_sve1.cl +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_mul_scalar_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] = ((eT2) (src[src_locs[i + src_locs_offset] + src_offset] * val_pre)) * val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar_sve2.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar_sve2.cl deleted file mode 100644 index cbd9ca5..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_mul_scalar_sve2.cl +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_mul_scalar_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col + src_col_locs_offset]); - - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - dest[dest_loc] = ((eT2) (src[src_loc] * val_pre)) * val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_neg_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_neg_post.cl deleted file mode 100644 index 8d2710e..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_neg_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_neg_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = ((eT2) -src[src_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_neg_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_neg_pre.cl deleted file mode 100644 index abe43fe..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_neg_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_neg_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = -((eT2) src[src_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar.cl deleted file mode 100644 index 6f382ba..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar.cl +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_plus_scalar)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = ((eT2) (src[src_index] + val_pre)) + val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar_sve1.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar_sve1.cl deleted file mode 100644 index 3bec2dd..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar_sve1.cl +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_plus_scalar_sve1)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_locs, - const UWORD dest_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_locs, - const UWORD src_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - dest[dest_locs[i + dest_locs_offset] + dest_offset] = ((eT2) (src[src_locs[i + src_locs_offset] + src_offset] + val_pre)) + val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar_sve2.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar_sve2.cl deleted file mode 100644 index 6afdb65..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_plus_scalar_sve2.cl +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_plus_scalar_sve2)(__global eT2* dest, - const UWORD dest_offset, - __global const UWORD* dest_row_locs, - const UWORD dest_row_locs_offset, - __global const UWORD* dest_col_locs, - const UWORD dest_col_locs_offset, - __global const eT1* src, - const UWORD src_offset, - __global const UWORD* src_row_locs, - const UWORD src_row_locs_offset, - __global const UWORD* src_col_locs, - const UWORD src_col_locs_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD dest_n_rows, - const UWORD src_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD src_loc = src_offset + - ((src_row_locs == NULL) ? row : src_row_locs[row + src_row_locs_offset]) + - src_n_rows * ((src_col_locs == NULL) ? col : src_col_locs[col + src_col_locs_offset]); - - const UWORD dest_loc = dest_offset + - ((dest_row_locs == NULL) ? row : dest_row_locs[row + dest_row_locs_offset]) + - dest_n_rows * ((dest_col_locs == NULL) ? col : dest_col_locs[col + dest_col_locs_offset]); - - dest[dest_loc] = ((eT2) (src[src_loc] + val_pre)) + val_post; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_pow_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_pow_post.cl deleted file mode 100644 index 9d98f9a..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_pow_post.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_pow_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) pow(val, (fp_eT1) val_pre); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_pow_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_pow_pre.cl deleted file mode 100644 index e26a831..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_pow_pre.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_pow_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) pow(val, (fp_eT2) val_post); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_round_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_round_post.cl deleted file mode 100644 index 7e4413e..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_round_post.cl +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_round_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - if (COOT_FN(coot_is_fp_,eT1)()) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) round(val); - } - else - { - dest[dest_index] = (eT2) src[src_index]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_round_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_round_pre.cl deleted file mode 100644 index a5d715c..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_round_pre.cl +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with 
the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_round_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - if (COOT_FN(coot_is_fp_,eT2)()) - { - const fp_eT2 val = (fp_eT2) src[src_index]; - dest[dest_index] = (eT2) round(val); - } - else - { - dest[dest_index] = (eT2) src[src_index]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sign_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sign_post.cl deleted file mode 100644 index 6721792..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sign_post.cl +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in 
compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_sign_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = (eT1) src[src_index]; - if (val > (eT1) 0) - { - dest[dest_index] = (eT2) 1; - } - else if (val == (eT1) 0) - { - dest[dest_index] = (eT2) 0; - } - else - { - dest[dest_index] = (eT2) -1; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sign_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sign_pre.cl deleted file mode 100644 index 6223859..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sign_pre.cl +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// 
you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_sign_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = (eT2) src[src_index]; - if (val > (eT2) 0) - { - dest[dest_index] = (eT2) 1; - } - else if (val == (eT2) 0) - { - dest[dest_index] = (eT2) 0; - } - else - { - dest[dest_index] = (eT2) -1; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sin_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sin_post.cl deleted file mode 100644 index 10a2f6b..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sin_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, 
Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_sin_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) sin(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sin_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sin_pre.cl deleted file mode 100644 index f16ca0f..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sin_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the 
License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_sin_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) sin(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinc_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinc_post.cl deleted file mode 100644 index d576c7d..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinc_post.cl +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_sinc_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = (eT1) src[src_index]; - // To imitate Armadillo correctly, we use double if the type is not floating point. - if (COOT_FN(coot_is_fp_,eT1)()) - { - const fp_eT1 tmp = val * M_PI; - dest[dest_index] = (tmp == (eT1) 0.0) ? (eT2) 1.0 : (eT2) (sin(tmp) / tmp); - } - else - { - const ARMA_FP_TYPE fp_val = (ARMA_FP_TYPE) val; - const ARMA_FP_TYPE tmp = fp_val * M_PI; - dest[dest_index] = (tmp == 0.0) ? 
(eT2) 1.0 : (eT2) (sin(tmp) / tmp); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinc_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinc_pre.cl deleted file mode 100644 index 707254d..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinc_pre.cl +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_sinc_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = (eT2) src[src_index]; - // To imitate Armadillo correctly, we use double if the type is not floating point. 
- if (COOT_FN(coot_is_fp_,eT2)()) - { - const fp_eT2 tmp = val * M_PI; - dest[dest_index] = (tmp == (eT2) 0.0) ? (eT2) 1.0 : (eT2) (sin(tmp) / tmp); - } - else - { - const ARMA_FP_TYPE fp_val = (ARMA_FP_TYPE) val; - const ARMA_FP_TYPE tmp = fp_val * M_PI; - dest[dest_index] = (tmp == 0.0) ? (eT2) 1.0 : (eT2) (sin(tmp) / tmp); - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinh_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinh_post.cl deleted file mode 100644 index 46c297a..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinh_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_sinh_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) sinh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinh_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinh_pre.cl deleted file mode 100644 index f01a83d..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sinh_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_sinh_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) sinh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sqrt_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sqrt_post.cl deleted file mode 100644 index 0951351..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sqrt_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_sqrt_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) ((eT1) sqrt((fp_eT1) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sqrt_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sqrt_pre.cl deleted file mode 100644 index 71ea604..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_sqrt_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_sqrt_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) sqrt((fp_eT2) ((eT2) src[src_index])); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_square_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_square_post.cl deleted file mode 100644 index 6def56f..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_square_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_square_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - dest[dest_index] = (eT2) (src[src_index] * src[src_index]); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_square_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_square_pre.cl deleted file mode 100644 index edb6936..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_square_pre.cl +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_square_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT2 val = (eT2) src[src_index]; - dest[dest_index] = val * val; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tan_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tan_post.cl deleted file mode 100644 index f9e8d0b..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tan_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_tan_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) tan(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tan_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tan_pre.cl deleted file mode 100644 index ccba133..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tan_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_tan_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) tan(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tanh_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tanh_post.cl deleted file mode 100644 index 487e9cf..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tanh_post.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_tanh_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) tanh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tanh_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tanh_pre.cl deleted file mode 100644 index 29419b2..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_tanh_pre.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_tanh_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const fp_eT2 val = (fp_eT2) (eT2) src[src_index]; - dest[dest_index] = (eT2) tanh(val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_exp_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_exp_post.cl deleted file mode 100644 index 4a50a3a..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_exp_post.cl +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_trunc_exp_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - // To match Armadillo, we always use `double` as the intermediate type for any non-floating point type. 
- const eT1 val = src[src_index]; - if (COOT_FN(coot_is_fp_,eT1)()) - { - const fp_eT1 fp_val = (fp_eT1) val; - if (fp_val >= log(COOT_FN(coot_type_max_,fp_eT1)())) - { - dest[dest_index] = (eT2) ((eT1) COOT_FN(coot_type_max_,fp_eT1)()); - } - else - { - dest[dest_index] = (eT2) ((eT1) exp(fp_val)); - } - } - else - { - const ARMA_FP_TYPE fp_val = (ARMA_FP_TYPE) val; - if (fp_val >= log(ARMA_FP_MAX)) - { - dest[dest_index] = (eT2) ((eT1) ARMA_FP_MAX); - } - else - { - dest[dest_index] = (eT2) ((eT1) exp(fp_val)); - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_exp_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_exp_pre.cl deleted file mode 100644 index ffd470d..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_exp_pre.cl +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_trunc_exp_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - // To imitate Armadillo's behavior exactly, if the type is not floating-point, we convert to double. - const eT2 val = (eT2) src[src_index]; - if (COOT_FN(coot_is_fp_,eT2)()) - { - const fp_eT2 fp_val = (fp_eT2) val; - if (fp_val >= log(COOT_FN(coot_type_max_,fp_eT2)())) - { - dest[dest_index] = (eT2) COOT_FN(coot_type_max_,fp_eT2)(); - } - else - { - dest[dest_index] = (eT2) exp(fp_val); - } - } - else - { - const ARMA_FP_TYPE fp_val = (ARMA_FP_TYPE) val; - if (fp_val >= log(ARMA_FP_MAX)) - { - dest[dest_index] = (eT2) ARMA_FP_MAX; - } - else - { - dest[dest_index] = (eT2) exp(fp_val); - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_log_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_log_post.cl deleted file mode 100644 index 1bfc97f..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_log_post.cl +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in 
compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_trunc_log_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - if (COOT_FN(coot_is_fp_,eT1)()) - { - const fp_eT1 fp_val = (fp_eT1) val; - if (fp_val <= (fp_eT1) 0) - { - dest[dest_index] = (eT2) log(COOT_FN(coot_type_minpos_,fp_eT1)()); - } - else if (isinf(fp_val)) - { - dest[dest_index] = (eT2) log(COOT_FN(coot_type_max_,fp_eT1)()); - } - else - { - dest[dest_index] = (eT2) ((eT1) log(fp_val)); - } - } - else - { - const ARMA_FP_TYPE fp_val = (ARMA_FP_TYPE) val; - if (fp_val <= (ARMA_FP_TYPE) 0) - { - dest[dest_index] = (eT2) log(ARMA_FP_MIN); - } - else if (isinf(fp_val)) - { - dest[dest_index] = (eT2) log(ARMA_FP_MAX); - } - else - { - dest[dest_index] = (eT2) 
((eT1) log(fp_val)); - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_log_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_log_pre.cl deleted file mode 100644 index ac92d12..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_log_pre.cl +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_trunc_log_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - // To match Armadillo, we always use `double` as the intermediate type for any non-floating point type. 
- const eT2 val = (eT2) src[src_index]; - if (COOT_FN(coot_is_fp_,eT2)()) - { - const fp_eT2 fp_val = (fp_eT2) val; - if (fp_val <= (fp_eT2) 0) - { - dest[dest_index] = (eT2) log(COOT_FN(coot_type_minpos_,fp_eT2)()); - } - else if (isinf(fp_val)) - { - dest[dest_index] = (eT2) log(COOT_FN(coot_type_max_,fp_eT2)()); - } - else - { - dest[dest_index] = (eT2) log(fp_val); - } - } - else - { - const ARMA_FP_TYPE fp_val = (ARMA_FP_TYPE) val; - if (fp_val <= (ARMA_FP_TYPE) 0) - { - dest[dest_index] = (eT2) log(ARMA_FP_MIN); - } - else if (isinf(fp_val)) - { - dest[dest_index] = (eT2) log(ARMA_FP_MAX); - } - else - { - dest[dest_index] = (eT2) log(fp_val); - } - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_post.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_post.cl deleted file mode 100644 index c6d77ed..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_post.cl +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_trunc_post)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - if (COOT_FN(coot_is_fp_,eT1)()) - { - const fp_eT1 val = (fp_eT1) src[src_index]; - dest[dest_index] = (eT2) trunc(val); - } - else - { - dest[dest_index] = (eT2) src[src_index]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_pre.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_pre.cl deleted file mode 100644 index 95c2624..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/equ_array_trunc_pre.cl +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2022-2025 Ryan Curtin (http://www.ratml.org/) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,equ_array_trunc_pre)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_pre, - const eT2 val_post, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - (void)(val_pre); - (void)(val_post); - - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - if (COOT_FN(coot_is_fp_,eT2)()) - { - const fp_eT2 val = (fp_eT2) src[src_index]; - dest[dest_index] = (eT2) trunc(val); - } - else - { - dest[dest_index] = (eT2) src[src_index]; - } - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/extract_sve1.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/extract_sve1.cl deleted file mode 100644 index 7a44609..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/extract_sve1.cl +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,extract_sve1)(__global eT2* out_mem, - const UWORD out_mem_offset, - __global const eT1* in_mem, - const UWORD in_mem_offset, - __global const UWORD* in_locs, - const UWORD in_locs_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - out_mem[i + out_mem_offset] = (eT2) in_mem[in_locs[i + in_locs_offset] + in_mem_offset]; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/extract_sve2.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/extract_sve2.cl deleted file mode 100644 index 908a292..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/extract_sve2.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2017 Conrad Sanderson (http://conradsanderson.id.au) -// Copyright 2025 Ryan Curtin (http://ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - - - -__kernel -void -COOT_FN(PREFIX,extract_sve2)(__global eT2* out_mem, - const UWORD out_mem_offset, - __global const eT1* in_mem, - const UWORD in_mem_offset, - __global const UWORD* in_row_locs, - const UWORD in_row_locs_offset, - __global const UWORD* in_col_locs, - const UWORD in_col_locs_offset, - const UWORD n_row_elems, - const UWORD n_col_elems, - const UWORD out_n_rows, - const UWORD in_n_rows) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - - if (row < n_row_elems && col < n_col_elems) - { - const UWORD in_loc = in_mem_offset + - ((in_row_locs == NULL) ? row : in_row_locs[row + in_row_locs_offset]) + - in_n_rows * ((in_col_locs == NULL) ? col : in_col_locs[col + in_col_locs_offset]); - - const UWORD out_loc = out_mem_offset + row + out_n_rows * col; - - out_mem[out_loc] = (eT2) in_mem[in_loc]; - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_and_array.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_and_array.cl deleted file mode 100644 index 6e47aaf..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_and_array.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_and_array)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - __global const eT2* Y, - const UWORD Y_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT2 val2 = Y[Y_offset + i]; - out[out_offset + i] = (val1 && val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_eq_array.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_eq_array.cl deleted file mode 100644 index db6e3cd..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_eq_array.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_eq_array)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - __global const eT2* Y, - const UWORD Y_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT2 val2 = Y[Y_offset + i]; - out[out_offset + i] = (val1 == val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_eq_scalar.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_eq_scalar.cl deleted file mode 100644 index c03a8ee..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_eq_scalar.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_eq_scalar)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, // will be casted to eT2 before comparison - const UWORD X_offset, - const UWORD n_elem, - const eT2 val) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - out[out_offset + i] = (val1 == val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gt_array.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gt_array.cl deleted file mode 100644 index ce94343..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gt_array.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_gt_array)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - __global const eT2* Y, - const UWORD Y_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT2 val2 = Y[Y_offset + i]; - out[out_offset + i] = (val1 > val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gt_scalar.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gt_scalar.cl deleted file mode 100644 index 62f6e00..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gt_scalar.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_gt_scalar)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, // will be casted to eT2 before comparison - const UWORD X_offset, - const UWORD n_elem, - const eT2 val) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - out[out_offset + i] = (val1 > val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gteq_array.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gteq_array.cl deleted file mode 100644 index c045833..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gteq_array.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_gteq_array)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - __global const eT2* Y, - const UWORD Y_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT2 val2 = Y[Y_offset + i]; - out[out_offset + i] = (val1 >= val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gteq_scalar.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gteq_scalar.cl deleted file mode 100644 index d4b3642..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_gteq_scalar.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_gteq_scalar)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, // will be casted to eT2 before comparison - const UWORD X_offset, - const UWORD n_elem, - const eT2 val) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - out[out_offset + i] = (val1 >= val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lt_array.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lt_array.cl deleted file mode 100644 index 96452ce..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lt_array.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_lt_array)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - __global const eT2* Y, - const UWORD Y_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT2 val2 = Y[Y_offset + i]; - out[out_offset + i] = (val1 < val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lt_scalar.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lt_scalar.cl deleted file mode 100644 index 5602adf..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lt_scalar.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_lt_scalar)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, // will be casted to eT2 before comparison - const UWORD X_offset, - const UWORD n_elem, - const eT2 val) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - out[out_offset + i] = (val1 < val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lteq_array.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lteq_array.cl deleted file mode 100644 index 6e1b121..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lteq_array.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_lteq_array)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - __global const eT2* Y, - const UWORD Y_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT2 val2 = Y[Y_offset + i]; - out[out_offset + i] = (val1 <= val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lteq_scalar.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lteq_scalar.cl deleted file mode 100644 index 555597f..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_lteq_scalar.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_lteq_scalar)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, // will be casted to eT2 before comparison - const UWORD X_offset, - const UWORD n_elem, - const eT2 val) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - out[out_offset + i] = (val1 <= val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_neq_array.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_neq_array.cl deleted file mode 100644 index b3f4480..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_neq_array.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_neq_array)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - __global const eT2* Y, - const UWORD Y_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT2 val2 = Y[Y_offset + i]; - out[out_offset + i] = (val1 != val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_neq_scalar.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_neq_scalar.cl deleted file mode 100644 index 0a7c0db..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_neq_scalar.cl +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_neq_scalar)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, // will be casted to eT2 before comparison - const UWORD X_offset, - const UWORD n_elem, - const eT2 val) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT2 val1 = (eT2) X[X_offset + i]; - out[out_offset + i] = (val1 != val); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_or_array.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/rel_or_array.cl deleted file mode 100644 index 1493265..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/rel_or_array.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,rel_or_array)(__global UWORD* out, - const UWORD out_offset, - __global const eT1* X, - const UWORD X_offset, - __global const eT2* Y, - const UWORD Y_offset, - const UWORD n_elem) - { - const UWORD i = get_global_id(0); - - if (i < n_elem) - { - const eT1 val1 = X[X_offset + i]; - const eT2 val2 = Y[Y_offset + i]; - out[out_offset + i] = (val1 || val2); - } - } diff --git a/inst/include/bandicoot_bits/ks/opencl/twoway/replace.cl b/inst/include/bandicoot_bits/ks/opencl/twoway/replace.cl deleted file mode 100644 index a5aed9d..0000000 --- a/inst/include/bandicoot_bits/ks/opencl/twoway/replace.cl +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2023-2025 Ryan Curtin (http://www.ratml.org) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ------------------------------------------------------------------------ - -__kernel -void -COOT_FN(PREFIX,replace)(__global eT2* dest, - const UWORD dest_offset, - __global const eT1* src, - const UWORD src_offset, - const eT1 val_find, - const eT1 val_replace, - const UWORD n_rows, - const UWORD n_cols, - const UWORD n_slices, - const UWORD dest_M_n_rows, - const UWORD dest_M_n_cols, - const UWORD src_M_n_rows, - const UWORD src_M_n_cols) - { - const UWORD row = get_global_id(0); - const UWORD col = get_global_id(1); - const UWORD slice = get_global_id(2); - - const UWORD src_index = row + col * src_M_n_rows + slice * src_M_n_rows * src_M_n_cols + src_offset; - const UWORD dest_index = row + col * dest_M_n_rows + slice * dest_M_n_rows * dest_M_n_cols + dest_offset; - - if (row < n_rows && col < n_cols && slice < n_slices) - { - const eT1 val = src[src_index]; - if (COOT_FN(coot_isnan_,eT1)(val_find)) - { - // We are searching for a NaN so the check is a little different. - dest[dest_index] = (eT2) (COOT_FN(coot_isnan_,eT1)(val) ? val_replace : val); - } - else - { - // No special handling needed. - dest[dest_index] = (eT2) ((val == val_find) ? 
val_replace : val); - } - } - } diff --git a/inst/include/bandicoot_bits/mtglue_mixed_meat.hpp b/inst/include/bandicoot_bits/mtglue_mixed_meat.hpp index c1e4328..e52f4cc 100644 --- a/inst/include/bandicoot_bits/mtglue_mixed_meat.hpp +++ b/inst/include/bandicoot_bits/mtglue_mixed_meat.hpp @@ -31,9 +31,6 @@ mtglue_mixed_times::apply(Mat& out, const mtGlue> tmp1(mtOp(X.A)); const partial_unwrap> tmp2(mtOp(X.B)); diff --git a/inst/include/bandicoot_bits/opencl/runtime_bones.hpp b/inst/include/bandicoot_bits/opencl/runtime_bones.hpp index 67b4845..4aad7b1 100644 --- a/inst/include/bandicoot_bits/opencl/runtime_bones.hpp +++ b/inst/include/bandicoot_bits/opencl/runtime_bones.hpp @@ -271,10 +271,10 @@ class runtime_t::adapt_uword { public: - coot_aligned size_t size; - coot_aligned void* addr; - coot_aligned u64 val64; - coot_aligned u32 val32; + coot_aligned size_t size = 0; + coot_aligned void* addr = nullptr; + coot_aligned u64 val64 = 0; + coot_aligned u32 val32 = 0; inline adapt_uword(const uword val = 0); // default value needed for allocating several at once diff --git a/inst/include/bandicoot_bits/opencl/runtime_meat.hpp b/inst/include/bandicoot_bits/opencl/runtime_meat.hpp index 1190a0f..df18955 100644 --- a/inst/include/bandicoot_bits/opencl/runtime_meat.hpp +++ b/inst/include/bandicoot_bits/opencl/runtime_meat.hpp @@ -1702,8 +1702,8 @@ runtime_t::adapt_uword::adapt_uword(const uword val) inline runtime_t::adapt_uword::adapt_uword(const runtime_t::adapt_uword& other) : size(other.size) - , val32(other.val32) , val64(other.val64) + , val32(other.val32) { if (other.addr == &other.val32) { @@ -1720,8 +1720,8 @@ runtime_t::adapt_uword::adapt_uword(const runtime_t::adapt_uword& other) inline runtime_t::adapt_uword::adapt_uword(runtime_t::adapt_uword&& other) : size(other.size) - , val32(other.val32) , val64(other.val64) + , val32(other.val32) { if (other.addr == &other.val32) { diff --git a/src/.gitignore b/src/.gitignore index 22034c4..5948b32 100644 --- 
a/src/.gitignore +++ b/src/.gitignore @@ -1,3 +1,5 @@ *.o *.so *.dll +Makevars +Makevars.win diff --git a/src/Makevars.win b/src/Makevars.win deleted file mode 100644 index 667e3f0..0000000 --- a/src/Makevars.win +++ /dev/null @@ -1,11 +0,0 @@ -## RcppBandicoot Makevars.win -## -## Windows-specific Makevars -## GPU backend configuration must be done manually on Windows - -PKG_CPPFLAGS = -I../inst/include - -## Warning: GPU backends may not be properly configured -## Users may need to manually specify OpenCL or CUDA paths -PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -Wno-missing-braces -PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) diff --git a/src/Makevars.win.in b/src/Makevars.win.in new file mode 100644 index 0000000..ccdda30 --- /dev/null +++ b/src/Makevars.win.in @@ -0,0 +1,12 @@ +## RcppBandicoot Makevars.win.in +## +## This file is processed by configure.win to generate Makevars.win +## It includes GPU backend configuration (OpenCL, CLBlast) for Windows + +PKG_CPPFLAGS = -I../inst/include -DCOOT_TARGET_OPENCL_VERSION=@OPENCL_TARGET_VERSION@ -DCOOT_KERNEL_SOURCE_DIR=\"@BANDICOOT_KERNELS_DIR@\" + +## Compiler flags from configure.win +PKG_CXXFLAGS = @BANDICOOT_CXXFLAGS@ + +## Linker flags from configure.win +PKG_LIBS = @OPENMP_CXXFLAGS@ @BANDICOOT_LIBS@